[Midnightbsd-cvs] src [10164] trunk/sys/cddl/contrib/opensolaris/uts: sync with freebsd
laffer1 at midnightbsd.org
Fri Jun 1 18:46:42 EDT 2018
Revision: 10164
http://svnweb.midnightbsd.org/src/?rev=10164
Author: laffer1
Date: 2018-06-01 18:46:41 -0400 (Fri, 01 Jun 2018)
Log Message:
-----------
sync with freebsd
Modified Paths:
--------------
trunk/sys/cddl/contrib/opensolaris/uts/common/Makefile.files
trunk/sys/cddl/contrib/opensolaris/uts/common/ctf/ctf_mod.c
trunk/sys/cddl/contrib/opensolaris/uts/common/ctf/ctf_subr.c
trunk/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c
trunk/sys/cddl/contrib/opensolaris/uts/common/dtrace/fasttrap.c
trunk/sys/cddl/contrib/opensolaris/uts/common/dtrace/lockstat.c
trunk/sys/cddl/contrib/opensolaris/uts/common/dtrace/profile.c
trunk/sys/cddl/contrib/opensolaris/uts/common/dtrace/sdt_subr.c
trunk/sys/cddl/contrib/opensolaris/uts/common/dtrace/systrace.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_userhold.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lz4.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/rrwlock.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bpobj.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bptree.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/ddt.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_zfetch.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deadlist.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_destroy.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_userhold.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/rrwlock.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa_impl.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_boot.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/trim_map.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/unique.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_file.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfeature.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_fuid.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_onexit.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_rlock.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_sa.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_stat.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zrlock.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/uberblock.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_debug.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_onexit.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zrlock.c
trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
trunk/sys/cddl/contrib/opensolaris/uts/common/os/callb.c
trunk/sys/cddl/contrib/opensolaris/uts/common/os/fm.c
trunk/sys/cddl/contrib/opensolaris/uts/common/os/list.c
trunk/sys/cddl/contrib/opensolaris/uts/common/os/nvpair_alloc_system.c
trunk/sys/cddl/contrib/opensolaris/uts/common/sys/acl.h
trunk/sys/cddl/contrib/opensolaris/uts/common/sys/acl_impl.h
trunk/sys/cddl/contrib/opensolaris/uts/common/sys/asm_linkage.h
trunk/sys/cddl/contrib/opensolaris/uts/common/sys/avl.h
trunk/sys/cddl/contrib/opensolaris/uts/common/sys/avl_impl.h
trunk/sys/cddl/contrib/opensolaris/uts/common/sys/bitmap.h
trunk/sys/cddl/contrib/opensolaris/uts/common/sys/callb.h
trunk/sys/cddl/contrib/opensolaris/uts/common/sys/ccompile.h
trunk/sys/cddl/contrib/opensolaris/uts/common/sys/cmn_err.h
trunk/sys/cddl/contrib/opensolaris/uts/common/sys/compress.h
trunk/sys/cddl/contrib/opensolaris/uts/common/sys/cpupart.h
trunk/sys/cddl/contrib/opensolaris/uts/common/sys/cpuvar.h
trunk/sys/cddl/contrib/opensolaris/uts/common/sys/cred.h
trunk/sys/cddl/contrib/opensolaris/uts/common/sys/ctf.h
trunk/sys/cddl/contrib/opensolaris/uts/common/sys/ctf_api.h
trunk/sys/cddl/contrib/opensolaris/uts/common/sys/debug.h
trunk/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace.h
trunk/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace_impl.h
trunk/sys/cddl/contrib/opensolaris/uts/common/sys/errorq.h
trunk/sys/cddl/contrib/opensolaris/uts/common/sys/extdirent.h
trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fasttrap.h
trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fasttrap_impl.h
trunk/sys/cddl/contrib/opensolaris/uts/common/sys/feature_tests.h
trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fm/fs/zfs.h
trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fm/protocol.h
trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fm/util.h
trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h
trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zut.h
trunk/sys/cddl/contrib/opensolaris/uts/common/sys/gfs.h
trunk/sys/cddl/contrib/opensolaris/uts/common/sys/idmap.h
trunk/sys/cddl/contrib/opensolaris/uts/common/sys/isa_defs.h
trunk/sys/cddl/contrib/opensolaris/uts/common/sys/list.h
trunk/sys/cddl/contrib/opensolaris/uts/common/sys/list_impl.h
trunk/sys/cddl/contrib/opensolaris/uts/common/sys/note.h
trunk/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair.h
trunk/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair_impl.h
trunk/sys/cddl/contrib/opensolaris/uts/common/sys/processor.h
trunk/sys/cddl/contrib/opensolaris/uts/common/sys/procset.h
trunk/sys/cddl/contrib/opensolaris/uts/common/sys/synch.h
trunk/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/dev.h
trunk/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/eventdefs.h
trunk/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent.h
trunk/sys/cddl/contrib/opensolaris/uts/common/sys/sysmacros.h
trunk/sys/cddl/contrib/opensolaris/uts/common/sys/taskq.h
trunk/sys/cddl/contrib/opensolaris/uts/common/sys/u8_textprep.h
trunk/sys/cddl/contrib/opensolaris/uts/common/sys/u8_textprep_data.h
trunk/sys/cddl/contrib/opensolaris/uts/common/sys/vnode.h
trunk/sys/cddl/contrib/opensolaris/uts/common/sys/zmod.h
trunk/sys/cddl/contrib/opensolaris/uts/common/zmod/opensolaris_crc32.c
trunk/sys/cddl/contrib/opensolaris/uts/common/zmod/zmod.c
trunk/sys/cddl/contrib/opensolaris/uts/intel/dtrace/fasttrap_isa.c
trunk/sys/cddl/contrib/opensolaris/uts/sparc/dtrace/fasttrap_isa.c
Added Paths:
-----------
trunk/sys/cddl/contrib/opensolaris/uts/mips/
trunk/sys/cddl/contrib/opensolaris/uts/mips/dtrace/
trunk/sys/cddl/contrib/opensolaris/uts/mips/dtrace/fasttrap_isa.c
trunk/sys/cddl/contrib/opensolaris/uts/mips/sys/
trunk/sys/cddl/contrib/opensolaris/uts/mips/sys/fasttrap_isa.h
trunk/sys/cddl/contrib/opensolaris/uts/powerpc/
trunk/sys/cddl/contrib/opensolaris/uts/powerpc/dtrace/
trunk/sys/cddl/contrib/opensolaris/uts/powerpc/dtrace/fasttrap_isa.c
trunk/sys/cddl/contrib/opensolaris/uts/powerpc/sys/
trunk/sys/cddl/contrib/opensolaris/uts/powerpc/sys/fasttrap_isa.h
Property Changed:
----------------
trunk/sys/cddl/contrib/opensolaris/uts/common/Makefile.files
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/Makefile.files
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/Makefile.files 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/Makefile.files 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+# $MidnightBSD$
#
# CDDL HEADER START
#
@@ -21,7 +22,10 @@
#
# Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
-# Copyright (c) 2012 by Delphix. All rights reserved.
+# Copyright (c) 2012 Nexenta Systems, Inc. All rights reserved.
+# Copyright (c) 2012 Nexenta Systems, Inc. All rights reserved.
+# Copyright (c) 2012 Joyent, Inc. All rights reserved.
+# Copyright (c) 2011, 2014 by Delphix. All rights reserved.
# Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
#
#
@@ -32,8 +36,10 @@
ZFS_COMMON_OBJS += \
arc.o \
bplist.o \
+ blkptr.o \
bpobj.o \
bptree.o \
+ bqueue.o \
dbuf.o \
ddt.o \
ddt_zap.o \
@@ -46,6 +52,7 @@
dmu_tx.o \
dnode.o \
dnode_sync.o \
+ dsl_bookmark.o \
dsl_dir.o \
dsl_dataset.o \
dsl_deadlist.o \
@@ -62,6 +69,8 @@
lz4.o \
lzjb.o \
metaslab.o \
+ multilist.o \
+ range_tree.o \
refcount.o \
rrwlock.o \
sa.o \
@@ -72,6 +81,7 @@
spa_history.o \
spa_misc.o \
space_map.o \
+ space_reftree.o \
txg.o \
uberblock.o \
unique.o \
@@ -126,6 +136,3 @@
zfs_vfsops.o \
zfs_vnops.o \
zvol.o
-
-ZUT_OBJS += \
- zut.o
Property changes on: trunk/sys/cddl/contrib/opensolaris/uts/common/Makefile.files
___________________________________________________________________
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/ctf/ctf_mod.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/ctf/ctf_mod.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/ctf/ctf_mod.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/ctf/ctf_subr.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/ctf/ctf_subr.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/ctf/ctf_subr.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -18,16 +19,15 @@
*
* CDDL HEADER END
*
- * $FreeBSD: release/9.2.0/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c 250484 2013-05-10 21:12:55Z pfg $
+ * $FreeBSD: stable/10/sys/cddl/contrib/opensolaris/uts/common/dtrace/dtrace.c 314667 2017-03-04 13:03:31Z avg $
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
/*
* DTrace - Dynamic Tracing for Solaris
*
@@ -68,7 +68,7 @@
* on capital-f functions.
*/
#include <sys/errno.h>
-#if !defined(sun)
+#ifndef illumos
#include <sys/time.h>
#endif
#include <sys/stat.h>
@@ -75,13 +75,13 @@
#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/systm.h>
-#if defined(sun)
+#ifdef illumos
#include <sys/ddi.h>
#include <sys/sunddi.h>
#endif
#include <sys/cpuvar.h>
#include <sys/kmem.h>
-#if defined(sun)
+#ifdef illumos
#include <sys/strsubr.h>
#endif
#include <sys/sysmacros.h>
@@ -88,22 +88,22 @@
#include <sys/dtrace_impl.h>
#include <sys/atomic.h>
#include <sys/cmn_err.h>
-#if defined(sun)
+#ifdef illumos
#include <sys/mutex_impl.h>
#include <sys/rwlock_impl.h>
#endif
#include <sys/ctf_api.h>
-#if defined(sun)
+#ifdef illumos
#include <sys/panic.h>
#include <sys/priv_impl.h>
#endif
#include <sys/policy.h>
-#if defined(sun)
+#ifdef illumos
#include <sys/cred_impl.h>
#include <sys/procfs_isa.h>
#endif
#include <sys/taskq.h>
-#if defined(sun)
+#ifdef illumos
#include <sys/mkdev.h>
#include <sys/kdi.h>
#endif
@@ -110,11 +110,13 @@
#include <sys/zone.h>
#include <sys/socket.h>
#include <netinet/in.h>
+#include "strtolctype.h"
/* FreeBSD includes: */
-#if !defined(sun)
+#ifndef illumos
#include <sys/callout.h>
#include <sys/ctype.h>
+#include <sys/eventhandler.h>
#include <sys/limits.h>
#include <sys/kdb.h>
#include <sys/kernel.h>
@@ -153,8 +155,8 @@
int dtrace_destructive_disallow = 0;
dtrace_optval_t dtrace_nonroot_maxsize = (16 * 1024 * 1024);
size_t dtrace_difo_maxsize = (256 * 1024);
-dtrace_optval_t dtrace_dof_maxsize = (256 * 1024);
-size_t dtrace_global_maxsize = (16 * 1024);
+dtrace_optval_t dtrace_dof_maxsize = (8 * 1024 * 1024);
+size_t dtrace_statvar_maxsize = (16 * 1024);
size_t dtrace_actions_max = (16 * 1024);
size_t dtrace_retain_max = 1024;
dtrace_optval_t dtrace_helper_actions_max = 128;
@@ -175,7 +177,7 @@
dtrace_optval_t dtrace_jstackframes_default = 50;
dtrace_optval_t dtrace_jstackstrsize_default = 512;
int dtrace_msgdsize_max = 128;
-hrtime_t dtrace_chill_max = 500 * (NANOSEC / MILLISEC); /* 500 ms */
+hrtime_t dtrace_chill_max = MSEC2NSEC(500); /* 500 ms */
hrtime_t dtrace_chill_interval = NANOSEC; /* 1000 ms */
int dtrace_devdepth_max = 32;
int dtrace_err_verbose;
@@ -183,6 +185,9 @@
hrtime_t dtrace_deadman_timeout = (hrtime_t)10 * NANOSEC;
hrtime_t dtrace_deadman_user = (hrtime_t)30 * NANOSEC;
hrtime_t dtrace_unregister_defunct_reap = (hrtime_t)60 * NANOSEC;
+#ifndef illumos
+int dtrace_memstr_max = 4096;
+#endif
/*
* DTrace External Variables
@@ -198,10 +203,10 @@
/*
* DTrace Internal Variables
*/
-#if defined(sun)
+#ifdef illumos
static dev_info_t *dtrace_devi; /* device info */
#endif
-#if defined(sun)
+#ifdef illumos
static vmem_t *dtrace_arena; /* probe ID arena */
static vmem_t *dtrace_minor; /* minor number arena */
#else
@@ -214,7 +219,8 @@
static dtrace_meta_t *dtrace_meta_pid; /* user-land meta provider */
static int dtrace_opens; /* number of opens */
static int dtrace_helpers; /* number of helpers */
-#if defined(sun)
+static int dtrace_getf; /* number of unpriv getf()s */
+#ifdef illumos
static void *dtrace_softstate; /* softstate pointer */
#endif
static dtrace_hash_t *dtrace_bymod; /* probes hashed by module */
@@ -231,14 +237,18 @@
static dtrace_genid_t dtrace_probegen; /* current probe generation */
static dtrace_helpers_t *dtrace_deferred_pid; /* deferred helper list */
static dtrace_enabling_t *dtrace_retained; /* list of retained enablings */
+static dtrace_genid_t dtrace_retained_gen; /* current retained enab gen */
static dtrace_dynvar_t dtrace_dynhash_sink; /* end of dynamic hash chains */
-#if !defined(sun)
+static int dtrace_dynvar_failclean; /* dynvars failed to clean */
+#ifndef illumos
static struct mtx dtrace_unr_mtx;
MTX_SYSINIT(dtrace_unr_mtx, &dtrace_unr_mtx, "Unique resource identifier", MTX_DEF);
int dtrace_in_probe; /* non-zero if executing a probe */
-#if defined(__i386__) || defined(__amd64__)
+#if defined(__i386__) || defined(__amd64__) || defined(__mips__) || defined(__powerpc__)
uintptr_t dtrace_in_probe_addr; /* Address of invop when already in probe */
#endif
+static eventhandler_tag dtrace_kld_load_tag;
+static eventhandler_tag dtrace_kld_unload_try_tag;
#endif
/*
@@ -275,10 +285,8 @@
static kmutex_t dtrace_provider_lock; /* provider state lock */
static kmutex_t dtrace_meta_lock; /* meta-provider state lock */
-#if !defined(sun)
+#ifndef illumos
/* XXX FreeBSD hacks. */
-static kmutex_t mod_lock;
-
#define cr_suid cr_svuid
#define cr_sgid cr_svgid
#define ipaddr_t in_addr_t
@@ -298,10 +306,11 @@
#define PRIV_PROC_ZONE (1 << 5)
#define PRIV_ALL ~0
-SYSCTL_NODE(_debug, OID_AUTO, dtrace, CTLFLAG_RD, 0, "DTrace Information");
+SYSCTL_DECL(_debug_dtrace);
+SYSCTL_DECL(_kern_dtrace);
#endif
-#if defined(sun)
+#ifdef illumos
#define curcpu CPU->cpu_id
#endif
@@ -343,18 +352,23 @@
/*
* DTrace Helper Tracing Variables
+ *
+ * These variables should be set dynamically to enable helper tracing. The
+ * only variables that should be set are dtrace_helptrace_enable (which should
+ * be set to a non-zero value to allocate helper tracing buffers on the next
+ * open of /dev/dtrace) and dtrace_helptrace_disable (which should be set to a
+ * non-zero value to deallocate helper tracing buffers on the next close of
+ * /dev/dtrace). When (and only when) helper tracing is disabled, the
+ * buffer size may also be set via dtrace_helptrace_bufsize.
*/
-uint32_t dtrace_helptrace_next = 0;
-uint32_t dtrace_helptrace_nlocals;
-char *dtrace_helptrace_buffer;
-int dtrace_helptrace_bufsize = 512 * 1024;
+int dtrace_helptrace_enable = 0;
+int dtrace_helptrace_disable = 0;
+int dtrace_helptrace_bufsize = 16 * 1024 * 1024;
+uint32_t dtrace_helptrace_nlocals;
+static dtrace_helptrace_t *dtrace_helptrace_buffer;
+static uint32_t dtrace_helptrace_next = 0;
+static int dtrace_helptrace_wrapped = 0;
-#ifdef DEBUG
-int dtrace_helptrace_enabled = 1;
-#else
-int dtrace_helptrace_enabled = 0;
-#endif
-
/*
* DTrace Error Hashing
*
@@ -411,7 +425,7 @@
* no way for a global variable key signature to match a thread-local key
* signature.
*/
-#if defined(sun)
+#ifdef illumos
#define DTRACE_TLS_THRKEY(where) { \
uint_t intr = 0; \
uint_t actv = CPU->cpu_intr_actv >> (LOCK_LEVEL + 1); \
@@ -462,8 +476,8 @@
* disallow all negative sizes. Ranges of size 0 are allowed.
*/
#define DTRACE_INRANGE(testaddr, testsz, baseaddr, basesz) \
- ((testaddr) - (baseaddr) < (basesz) && \
- (testaddr) + (testsz) - (baseaddr) <= (basesz) && \
+ ((testaddr) - (uintptr_t)(baseaddr) < (basesz) && \
+ (testaddr) + (testsz) - (uintptr_t)(baseaddr) <= (basesz) && \
(testaddr) + (testsz) >= (testaddr))
/*
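
The DTRACE_INRANGE change in the hunk above only adds uintptr_t casts on baseaddr, but the macro is worth a second look: it relies on unsigned wraparound to reject test addresses below the base, and on the final term to reject testaddr + testsz overflow. A minimal userland sketch of the same check (IN_RANGE and the test values are illustrative, not part of this commit):

#include <stdint.h>
#include <stdio.h>

/*
 * Userland analogue of DTRACE_INRANGE: unsigned wraparound rejects
 * addresses below base, the second term rejects ranges that run past
 * the end, and the third term rejects testaddr + testsz overflow.
 * Zero-length ranges inside the base region are accepted.
 */
#define IN_RANGE(testaddr, testsz, baseaddr, basesz)                    \
        ((testaddr) - (uintptr_t)(baseaddr) < (basesz) &&               \
        (testaddr) + (testsz) - (uintptr_t)(baseaddr) <= (basesz) &&    \
        (testaddr) + (testsz) >= (testaddr))

int
main(void)
{
        char buf[256];
        uintptr_t base = (uintptr_t)buf;

        printf("%d\n", IN_RANGE(base + 16, 32, buf, sizeof (buf)));       /* 1: inside */
        printf("%d\n", IN_RANGE(base - 8, 4, buf, sizeof (buf)));         /* 0: below base */
        printf("%d\n", IN_RANGE(base + 250, 16, buf, sizeof (buf)));      /* 0: runs past the end */
        printf("%d\n", IN_RANGE(base + 16, SIZE_MAX, buf, sizeof (buf))); /* 0: testaddr + testsz wraps */
        return (0);
}
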
@@ -572,6 +586,8 @@
dtrace_dynvar_t *dtrace_dynvar(dtrace_dstate_t *, uint_t, dtrace_key_t *,
size_t, dtrace_dynvar_op_t, dtrace_mstate_t *, dtrace_vstate_t *);
uintptr_t dtrace_dif_varstr(uintptr_t, dtrace_state_t *, dtrace_mstate_t *);
+static int dtrace_priv_proc(dtrace_state_t *);
+static void dtrace_getf_barrier(void);
/*
* DTrace Probe Context Functions
@@ -596,7 +612,11 @@
va_list alist;
va_start(alist, format);
+#ifdef __FreeBSD__
+ vpanic(format, alist);
+#else
dtrace_vpanic(format, alist);
+#endif
va_end(alist);
}
@@ -680,13 +700,33 @@
dtrace_statvar_t **svars, int nsvars)
{
int i;
+ size_t maxglobalsize, maxlocalsize;
+ if (nsvars == 0)
+ return (0);
+
+ maxglobalsize = dtrace_statvar_maxsize;
+ maxlocalsize = (maxglobalsize + sizeof (uint64_t)) * NCPU;
+
for (i = 0; i < nsvars; i++) {
dtrace_statvar_t *svar = svars[i];
+ uint8_t scope;
+ size_t size;
- if (svar == NULL || svar->dtsv_size == 0)
+ if (svar == NULL || (size = svar->dtsv_size) == 0)
continue;
+ scope = svar->dtsv_var.dtdv_scope;
+
+ /*
+ * We verify that our size is valid in the spirit of providing
+ * defense in depth: we want to prevent attackers from using
+ * DTrace to escalate an orthogonal kernel heap corruption bug
+ * into the ability to store to arbitrary locations in memory.
+ */
+ VERIFY((scope == DIFV_SCOPE_GLOBAL && size < maxglobalsize) ||
+ (scope == DIFV_SCOPE_LOCAL && size < maxlocalsize));
+
if (DTRACE_INRANGE(addr, sz, svar->dtsv_data, svar->dtsv_size))
return (1);
}
@@ -716,7 +756,7 @@
* up both thread-local variables and any global dynamically-allocated
* variables.
*/
- if (DTRACE_INRANGE(addr, sz, (uintptr_t)vstate->dtvs_dynvars.dtds_base,
+ if (DTRACE_INRANGE(addr, sz, vstate->dtvs_dynvars.dtds_base,
vstate->dtvs_dynvars.dtds_size)) {
dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
uintptr_t base = (uintptr_t)dstate->dtds_base +
@@ -783,6 +823,7 @@
dtrace_vstate_t *vstate)
{
volatile uintptr_t *illval = &cpu_core[curcpu].cpuc_dtrace_illval;
+ file_t *fp;
/*
* If we hold the privilege to read from kernel memory, then
@@ -800,10 +841,104 @@
/*
* We're allowed to read from our own string table.
*/
- if (DTRACE_INRANGE(addr, sz, (uintptr_t)mstate->dtms_difo->dtdo_strtab,
+ if (DTRACE_INRANGE(addr, sz, mstate->dtms_difo->dtdo_strtab,
mstate->dtms_difo->dtdo_strlen))
return (1);
+ if (vstate->dtvs_state != NULL &&
+ dtrace_priv_proc(vstate->dtvs_state)) {
+ proc_t *p;
+
+ /*
+ * When we have privileges to the current process, there are
+ * several context-related kernel structures that are safe to
+ * read, even absent the privilege to read from kernel memory.
+ * These reads are safe because these structures contain only
+ * state that (1) we're permitted to read, (2) is harmless or
+ * (3) contains pointers to additional kernel state that we're
+ * not permitted to read (and as such, do not present an
+ * opportunity for privilege escalation). Finally (and
+ * critically), because of the nature of their relation with
+ * the current thread context, the memory associated with these
+ * structures cannot change over the duration of probe context,
+ * and it is therefore impossible for this memory to be
+ * deallocated and reallocated as something else while it's
+ * being operated upon.
+ */
+ if (DTRACE_INRANGE(addr, sz, curthread, sizeof (kthread_t)))
+ return (1);
+
+ if ((p = curthread->t_procp) != NULL && DTRACE_INRANGE(addr,
+ sz, curthread->t_procp, sizeof (proc_t))) {
+ return (1);
+ }
+
+ if (curthread->t_cred != NULL && DTRACE_INRANGE(addr, sz,
+ curthread->t_cred, sizeof (cred_t))) {
+ return (1);
+ }
+
+#ifdef illumos
+ if (p != NULL && p->p_pidp != NULL && DTRACE_INRANGE(addr, sz,
+ &(p->p_pidp->pid_id), sizeof (pid_t))) {
+ return (1);
+ }
+
+ if (curthread->t_cpu != NULL && DTRACE_INRANGE(addr, sz,
+ curthread->t_cpu, offsetof(cpu_t, cpu_pause_thread))) {
+ return (1);
+ }
+#endif
+ }
+
+ if ((fp = mstate->dtms_getf) != NULL) {
+ uintptr_t psz = sizeof (void *);
+ vnode_t *vp;
+ vnodeops_t *op;
+
+ /*
+ * When getf() returns a file_t, the enabling is implicitly
+ * granted the (transient) right to read the returned file_t
+ * as well as the v_path and v_op->vnop_name of the underlying
+ * vnode. These accesses are allowed after a successful
+ * getf() because the members that they refer to cannot change
+ * once set -- and the barrier logic in the kernel's closef()
+ * path assures that the file_t and its referenced vode_t
+ * cannot themselves be stale (that is, it impossible for
+ * either dtms_getf itself or its f_vnode member to reference
+ * freed memory).
+ */
+ if (DTRACE_INRANGE(addr, sz, fp, sizeof (file_t)))
+ return (1);
+
+ if ((vp = fp->f_vnode) != NULL) {
+#ifdef illumos
+ if (DTRACE_INRANGE(addr, sz, &vp->v_path, psz))
+ return (1);
+ if (vp->v_path != NULL && DTRACE_INRANGE(addr, sz,
+ vp->v_path, strlen(vp->v_path) + 1)) {
+ return (1);
+ }
+#endif
+
+ if (DTRACE_INRANGE(addr, sz, &vp->v_op, psz))
+ return (1);
+
+#ifdef illumos
+ if ((op = vp->v_op) != NULL &&
+ DTRACE_INRANGE(addr, sz, &op->vnop_name, psz)) {
+ return (1);
+ }
+
+ if (op != NULL && op->vnop_name != NULL &&
+ DTRACE_INRANGE(addr, sz, op->vnop_name,
+ strlen(op->vnop_name) + 1)) {
+ return (1);
+ }
+#endif
+ }
+ }
+
DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV);
*illval = addr;
return (0);
@@ -863,6 +998,58 @@
}
/*
+ * Convert a string to a signed integer using safe loads.
+ *
+ * NOTE: This function uses various macros from strtolctype.h to manipulate
+ * digit values, etc -- these have all been checked to ensure they make
+ * no additional function calls.
+ */
+static int64_t
+dtrace_strtoll(char *input, int base, size_t limit)
+{
+ uintptr_t pos = (uintptr_t)input;
+ int64_t val = 0;
+ int x;
+ boolean_t neg = B_FALSE;
+ char c, cc, ccc;
+ uintptr_t end = pos + limit;
+
+ /*
+ * Consume any whitespace preceding digits.
+ */
+ while ((c = dtrace_load8(pos)) == ' ' || c == '\t')
+ pos++;
+
+ /*
+ * Handle an explicit sign if one is present.
+ */
+ if (c == '-' || c == '+') {
+ if (c == '-')
+ neg = B_TRUE;
+ c = dtrace_load8(++pos);
+ }
+
+ /*
+ * Check for an explicit hexadecimal prefix ("0x" or "0X") and skip it
+ * if present.
+ */
+ if (base == 16 && c == '0' && ((cc = dtrace_load8(pos + 1)) == 'x' ||
+ cc == 'X') && isxdigit(ccc = dtrace_load8(pos + 2))) {
+ pos += 2;
+ c = ccc;
+ }
+
+ /*
+ * Read in contiguous digits until the first non-digit character.
+ */
+ for (; pos < end && c != '\0' && lisalnum(c) && (x = DIGIT(c)) < base;
+ c = dtrace_load8(++pos))
+ val = val * base + x;
+
+ return (neg ? -val : val);
+}
+
+/*
* Compare two strings using safe loads.
*/
static int
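
dtrace_strtoll() above follows the familiar strtoll() parsing steps -- skip whitespace, take an optional sign, honor a 0x/0X prefix for base 16, accumulate digits -- but it reads every byte through dtrace_load8() and never looks more than limit bytes ahead, so it is safe to call from probe context on possibly unterminated input. A rough userland equivalent of the same steps (bounded_strtoll and digit_value are made-up names, not kernel code):

#include <ctype.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Value of a digit character in bases up to 36. */
static int
digit_value(int c)
{
        return (isdigit(c) ? c - '0' : tolower(c) - 'a' + 10);
}

/*
 * Same steps as dtrace_strtoll(): skip whitespace, take an optional
 * sign, skip a 0x/0X prefix when base is 16, then accumulate digits,
 * never reading more than 'limit' bytes of digits.
 */
static int64_t
bounded_strtoll(const char *s, int base, size_t limit)
{
        const char *pos = s, *end = s + limit;
        int64_t val = 0;
        int neg = 0, c, x;

        while ((c = (unsigned char)*pos) == ' ' || c == '\t')
                pos++;

        if (c == '-' || c == '+') {
                neg = (c == '-');
                c = (unsigned char)*++pos;
        }

        if (base == 16 && c == '0' && (pos[1] == 'x' || pos[1] == 'X') &&
            isxdigit((unsigned char)pos[2])) {
                pos += 2;
                c = (unsigned char)*pos;
        }

        for (; pos < end && c != '\0' && isalnum(c) &&
            (x = digit_value(c)) < base; c = (unsigned char)*++pos)
                val = val * base + x;

        return (neg ? -val : val);
}

int
main(void)
{
        printf("%lld\n", (long long)bounded_strtoll("  -42", 10, 32)); /* -42 */
        printf("%lld\n", (long long)bounded_strtoll("0x1f", 16, 32));  /* 31 */
        return (0);
}
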
@@ -1174,7 +1361,7 @@
static int
dtrace_priv_proc_common_zone(dtrace_state_t *state)
{
-#if defined(sun)
+#ifdef illumos
cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
/*
@@ -1183,8 +1370,7 @@
*/
ASSERT(s_cr != NULL);
- if ((cr = CRED()) != NULL &&
- s_cr->cr_zone == cr->cr_zone)
+ if ((cr = CRED()) != NULL && s_cr->cr_zone == cr->cr_zone)
return (1);
return (0);
@@ -1284,6 +1470,115 @@
}
/*
+ * Determine if the dte_cond of the specified ECB allows for processing of
+ * the current probe to continue. Note that this routine may allow continued
+ * processing, but with access(es) stripped from the mstate's dtms_access
+ * field.
+ */
+static int
+dtrace_priv_probe(dtrace_state_t *state, dtrace_mstate_t *mstate,
+ dtrace_ecb_t *ecb)
+{
+ dtrace_probe_t *probe = ecb->dte_probe;
+ dtrace_provider_t *prov = probe->dtpr_provider;
+ dtrace_pops_t *pops = &prov->dtpv_pops;
+ int mode = DTRACE_MODE_NOPRIV_DROP;
+
+ ASSERT(ecb->dte_cond);
+
+#ifdef illumos
+ if (pops->dtps_mode != NULL) {
+ mode = pops->dtps_mode(prov->dtpv_arg,
+ probe->dtpr_id, probe->dtpr_arg);
+
+ ASSERT((mode & DTRACE_MODE_USER) ||
+ (mode & DTRACE_MODE_KERNEL));
+ ASSERT((mode & DTRACE_MODE_NOPRIV_RESTRICT) ||
+ (mode & DTRACE_MODE_NOPRIV_DROP));
+ }
+
+ /*
+ * If the dte_cond bits indicate that this consumer is only allowed to
+ * see user-mode firings of this probe, call the provider's dtps_mode()
+ * entry point to check that the probe was fired while in a user
+ * context. If that's not the case, use the policy specified by the
+ * provider to determine if we drop the probe or merely restrict
+ * operation.
+ */
+ if (ecb->dte_cond & DTRACE_COND_USERMODE) {
+ ASSERT(mode != DTRACE_MODE_NOPRIV_DROP);
+
+ if (!(mode & DTRACE_MODE_USER)) {
+ if (mode & DTRACE_MODE_NOPRIV_DROP)
+ return (0);
+
+ mstate->dtms_access &= ~DTRACE_ACCESS_ARGS;
+ }
+ }
+#endif
+
+ /*
+ * This is more subtle than it looks. We have to be absolutely certain
+ * that CRED() isn't going to change out from under us so it's only
+ * legit to examine that structure if we're in constrained situations.
+ * Currently, the only times we'll this check is if a non-super-user
+ * has enabled the profile or syscall providers -- providers that
+ * allow visibility of all processes. For the profile case, the check
+ * above will ensure that we're examining a user context.
+ */
+ if (ecb->dte_cond & DTRACE_COND_OWNER) {
+ cred_t *cr;
+ cred_t *s_cr = state->dts_cred.dcr_cred;
+ proc_t *proc;
+
+ ASSERT(s_cr != NULL);
+
+ if ((cr = CRED()) == NULL ||
+ s_cr->cr_uid != cr->cr_uid ||
+ s_cr->cr_uid != cr->cr_ruid ||
+ s_cr->cr_uid != cr->cr_suid ||
+ s_cr->cr_gid != cr->cr_gid ||
+ s_cr->cr_gid != cr->cr_rgid ||
+ s_cr->cr_gid != cr->cr_sgid ||
+ (proc = ttoproc(curthread)) == NULL ||
+ (proc->p_flag & SNOCD)) {
+ if (mode & DTRACE_MODE_NOPRIV_DROP)
+ return (0);
+
+#ifdef illumos
+ mstate->dtms_access &= ~DTRACE_ACCESS_PROC;
+#endif
+ }
+ }
+
+#ifdef illumos
+ /*
+ * If our dte_cond is set to DTRACE_COND_ZONEOWNER and we are not
+ * in our zone, check to see if our mode policy is to restrict rather
+ * than to drop; if to restrict, strip away both DTRACE_ACCESS_PROC
+ * and DTRACE_ACCESS_ARGS
+ */
+ if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) {
+ cred_t *cr;
+ cred_t *s_cr = state->dts_cred.dcr_cred;
+
+ ASSERT(s_cr != NULL);
+
+ if ((cr = CRED()) == NULL ||
+ s_cr->cr_zone->zone_id != cr->cr_zone->zone_id) {
+ if (mode & DTRACE_MODE_NOPRIV_DROP)
+ return (0);
+
+ mstate->dtms_access &=
+ ~(DTRACE_ACCESS_PROC | DTRACE_ACCESS_ARGS);
+ }
+ }
+#endif
+
+ return (1);
+}
+
+/*
* Note: not called from probe context. This function is called
* asynchronously (and at a regular interval) from outside of probe context to
* clean the dirty dynamic variable lists on all CPUs. Dynamic variable
@@ -1294,13 +1589,13 @@
{
dtrace_dynvar_t *dirty;
dtrace_dstate_percpu_t *dcpu;
- int i, work = 0;
+ dtrace_dynvar_t **rinsep;
+ int i, j, work = 0;
for (i = 0; i < NCPU; i++) {
dcpu = &dstate->dtds_percpu[i];
+ rinsep = &dcpu->dtdsc_rinsing;
- ASSERT(dcpu->dtdsc_rinsing == NULL);
-
/*
* If the dirty list is NULL, there is no dirty work to do.
*/
@@ -1307,15 +1602,63 @@
if (dcpu->dtdsc_dirty == NULL)
continue;
- /*
- * If the clean list is non-NULL, then we're not going to do
- * any work for this CPU -- it means that there has not been
- * a dtrace_dynvar() allocation on this CPU (or from this CPU)
- * since the last time we cleaned house.
- */
- if (dcpu->dtdsc_clean != NULL)
+ if (dcpu->dtdsc_rinsing != NULL) {
+ /*
+ * If the rinsing list is non-NULL, then it is because
+ * this CPU was selected to accept another CPU's
+ * dirty list -- and since that time, dirty buffers
+ * have accumulated. This is a highly unlikely
+ * condition, but we choose to ignore the dirty
+ * buffers -- they'll be picked up a future cleanse.
+ */
continue;
+ }
+ if (dcpu->dtdsc_clean != NULL) {
+ /*
+ * If the clean list is non-NULL, then we're in a
+ * situation where a CPU has done deallocations (we
+ * have a non-NULL dirty list) but no allocations (we
+ * also have a non-NULL clean list). We can't simply
+ * move the dirty list into the clean list on this
+ * CPU, yet we also don't want to allow this condition
+ * to persist, lest a short clean list prevent a
+ * massive dirty list from being cleaned (which in
+ * turn could lead to otherwise avoidable dynamic
+ * drops). To deal with this, we look for some CPU
+ * with a NULL clean list, NULL dirty list, and NULL
+ * rinsing list -- and then we borrow this CPU to
+ * rinse our dirty list.
+ */
+ for (j = 0; j < NCPU; j++) {
+ dtrace_dstate_percpu_t *rinser;
+
+ rinser = &dstate->dtds_percpu[j];
+
+ if (rinser->dtdsc_rinsing != NULL)
+ continue;
+
+ if (rinser->dtdsc_dirty != NULL)
+ continue;
+
+ if (rinser->dtdsc_clean != NULL)
+ continue;
+
+ rinsep = &rinser->dtdsc_rinsing;
+ break;
+ }
+
+ if (j == NCPU) {
+ /*
+ * We were unable to find another CPU that
+ * could accept this dirty list -- we are
+ * therefore unable to clean it now.
+ */
+ dtrace_dynvar_failclean++;
+ continue;
+ }
+ }
+
work = 1;
/*
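
The rewritten cleaner above no longer asserts that the local rinsing slot is empty; when a CPU has a dirty list but its clean list is still occupied, it looks for another CPU whose dirty, clean and rinsing lists are all NULL and parks the dirty list on that CPU's rinsing slot instead, bumping dtrace_dynvar_failclean when no such CPU exists. The essential move is an atomic detach of the dirty list followed by publishing it through whichever rinsing pointer was chosen. A simplified sketch of that detach-and-publish step using C11 atomics (the structure and function names below are invented for illustration):

#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

typedef struct dynvar {
        struct dynvar *next;
        int key;
} dynvar_t;

typedef struct percpu {
        _Atomic(dynvar_t *) dirty;   /* freed entries, produced in probe context */
        dynvar_t *rinsing;           /* detached list waiting out a grace period */
        dynvar_t *clean;             /* entries ready for reallocation */
} percpu_t;

/*
 * Detach the dirty list with a compare-and-swap (probes may still be
 * prepending to it concurrently) and publish it through whichever
 * rinsing slot the caller picked -- the local one, or a borrowed one.
 */
static void
rinse(percpu_t *dcpu, dynvar_t **rinsep)
{
        dynvar_t *dirty;

        do {
                dirty = atomic_load(&dcpu->dirty);
                if (dirty == NULL)
                        return;
        } while (!atomic_compare_exchange_weak(&dcpu->dirty, &dirty, NULL));

        *rinsep = dirty;
}

int
main(void)
{
        percpu_t cpu0 = { 0 }, cpu1 = { 0 };
        dynvar_t a = { NULL, 1 }, b = { &a, 2 };

        atomic_store(&cpu0.dirty, &b);
        rinse(&cpu0, &cpu1.rinsing);  /* cpu0's clean list was busy, so borrow cpu1's slot */
        printf("borrowed rinsing head key = %d\n", cpu1.rinsing->key);
        return (0);
}
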
@@ -1331,7 +1674,7 @@
* on a hash chain, either the dirty list or the
* rinsing list for some CPU must be non-NULL.)
*/
- dcpu->dtdsc_rinsing = dirty;
+ *rinsep = dirty;
dtrace_membar_producer();
} while (dtrace_casptr(&dcpu->dtdsc_dirty,
dirty, NULL) != dirty);
@@ -1762,7 +2105,7 @@
ASSERT(clean->dtdv_hashval == DTRACE_DYNHASH_FREE);
/*
- * Now we'll move the clean list to the free list.
+ * Now we'll move the clean list to our free list.
* It's impossible for this to fail: the only way
* the free list can be updated is through this
* code path, and only one CPU can own the clean list.
@@ -1775,6 +2118,7 @@
* owners of the clean lists out before resetting
* the clean lists.
*/
+ dcpu = &dstate->dtds_percpu[me];
rval = dtrace_casptr(&dcpu->dtdsc_free, NULL, clean);
ASSERT(rval == NULL);
goto retry;
@@ -2345,9 +2689,10 @@
{
dtrace_speculation_t *spec;
dtrace_buffer_t *src, *dest;
- uintptr_t daddr, saddr, dlimit;
+ uintptr_t daddr, saddr, dlimit, slimit;
dtrace_speculation_state_t current, new = 0;
intptr_t offs;
+ uint64_t timestamp;
if (which == 0)
return;
@@ -2423,7 +2768,37 @@
}
/*
- * We have the space; copy the buffer across. (Note that this is a
+ * We have sufficient space to copy the speculative buffer into the
+ * primary buffer. First, modify the speculative buffer, filling
+ * in the timestamp of all entries with the current time. The data
+ * must have the commit() time rather than the time it was traced,
+ * so that all entries in the primary buffer are in timestamp order.
+ */
+ timestamp = dtrace_gethrtime();
+ saddr = (uintptr_t)src->dtb_tomax;
+ slimit = saddr + src->dtb_offset;
+ while (saddr < slimit) {
+ size_t size;
+ dtrace_rechdr_t *dtrh = (dtrace_rechdr_t *)saddr;
+
+ if (dtrh->dtrh_epid == DTRACE_EPIDNONE) {
+ saddr += sizeof (dtrace_epid_t);
+ continue;
+ }
+ ASSERT3U(dtrh->dtrh_epid, <=, state->dts_necbs);
+ size = state->dts_ecbs[dtrh->dtrh_epid - 1]->dte_size;
+
+ ASSERT3U(saddr + size, <=, slimit);
+ ASSERT3U(size, >=, sizeof (dtrace_rechdr_t));
+ ASSERT3U(DTRACE_RECORD_LOAD_TIMESTAMP(dtrh), ==, UINT64_MAX);
+
+ DTRACE_RECORD_STORE_TIMESTAMP(dtrh, timestamp);
+
+ saddr += size;
+ }
+
+ /*
+ * Copy the buffer across. (Note that this is a
* highly subobtimal bcopy(); in the unlikely event that this becomes
* a serious performance issue, a high-performance DTrace-specific
* bcopy() should obviously be invented.)
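
The loop added above walks the speculative buffer record by record -- skipping bare-EPID padding words and looking up each record's size from its enabled probe -- and rewrites every record header's timestamp with the commit-time clock, so that once the data lands in the principal buffer it still sorts by time. A toy version of that walk over a fabricated record layout (the ids, sizes and field offsets below are invented and do not match dtrace_rechdr_t):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/*
 * Fabricated layout for illustration only: each record starts with a
 * 4-byte id; id 0 is bare padding, any other id selects a fixed total
 * size from rec_size[] and carries an 8-byte timestamp right after the
 * id, which the "commit" pass overwrites with the commit-time clock.
 */
static const size_t rec_size[] = { 0, 24, 40 };   /* indexed by id */

static void
stamp_commit_time(unsigned char *buf, size_t used, uint64_t now)
{
        unsigned char *p = buf, *limit = buf + used;

        while (p < limit) {
                uint32_t id;

                memcpy(&id, p, sizeof (id));
                if (id == 0) {                    /* padding word: skip it */
                        p += sizeof (id);
                        continue;
                }
                memcpy(p + sizeof (id), &now, sizeof (now));
                p += rec_size[id];
        }
}

int
main(void)
{
        unsigned char buf[64] = { 0 };
        uint32_t id = 1;
        uint64_t t;

        memcpy(buf, &id, sizeof (id));            /* one 24-byte record */
        stamp_commit_time(buf, 24, 123456789);
        memcpy(&t, buf + sizeof (id), sizeof (t));
        printf("stamped timestamp = %llu\n", (unsigned long long)t);
        return (0);
}
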
@@ -2836,7 +3211,7 @@
return (mstate->dtms_arg[ndx]);
-#if defined(sun)
+#ifdef illumos
case DIF_VAR_UREGS: {
klwp_t *lwp;
@@ -2870,7 +3245,7 @@
#endif
case DIF_VAR_CURTHREAD:
- if (!dtrace_priv_kernel(state))
+ if (!dtrace_priv_proc(state))
return (0);
return ((uint64_t)(uintptr_t)curthread);
@@ -2892,7 +3267,7 @@
}
return (mstate->dtms_walltimestamp);
-#if defined(sun)
+#ifdef illumos
case DIF_VAR_IPL:
if (!dtrace_priv_kernel(state))
return (0);
@@ -3029,7 +3404,7 @@
if (!dtrace_priv_proc(state))
return (0);
-#if defined(sun)
+#ifdef illumos
/*
* Note that we are assuming that an unanchored probe is
* always due to a high-level interrupt. (And we're assuming
@@ -3055,7 +3430,7 @@
if (!dtrace_priv_proc(state))
return (0);
-#if defined(sun)
+#ifdef illumos
/*
* See comment in DIF_VAR_PID.
*/
@@ -3070,11 +3445,14 @@
*/
return ((uint64_t)curthread->t_procp->p_ppid);
#else
- return ((uint64_t)curproc->p_pptr->p_pid);
+ if (curproc->p_pid == proc0.p_pid)
+ return (curproc->p_pid);
+ else
+ return (curproc->p_pptr->p_pid);
#endif
case DIF_VAR_TID:
-#if defined(sun)
+#ifdef illumos
/*
* See comment in DIF_VAR_PID.
*/
@@ -3095,7 +3473,7 @@
}
case DIF_VAR_EXECNAME:
-#if defined(sun)
+#ifdef illumos
if (!dtrace_priv_proc(state))
return (0);
@@ -3120,7 +3498,7 @@
#endif
case DIF_VAR_ZONENAME:
-#if defined(sun)
+#ifdef illumos
if (!dtrace_priv_proc(state))
return (0);
@@ -3147,13 +3525,12 @@
if (!dtrace_priv_proc(state))
return (0);
-#if defined(sun)
+#ifdef illumos
/*
* See comment in DIF_VAR_PID.
*/
if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
return ((uint64_t)p0.p_cred->cr_uid);
-#endif
/*
* It is always safe to dereference one's own t_procp pointer:
@@ -3165,18 +3542,20 @@
* credential, since this is never NULL after process birth.
*/
return ((uint64_t)curthread->t_procp->p_cred->cr_uid);
+#else
+ return ((uint64_t)curthread->td_ucred->cr_uid);
+#endif
case DIF_VAR_GID:
if (!dtrace_priv_proc(state))
return (0);
-#if defined(sun)
+#ifdef illumos
/*
* See comment in DIF_VAR_PID.
*/
if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
return ((uint64_t)p0.p_cred->cr_gid);
-#endif
/*
* It is always safe to dereference one's own t_procp pointer:
@@ -3188,9 +3567,12 @@
* credential, since this is never NULL after process birth.
*/
return ((uint64_t)curthread->t_procp->p_cred->cr_gid);
+#else
+ return ((uint64_t)curthread->td_ucred->cr_gid);
+#endif
case DIF_VAR_ERRNO: {
-#if defined(sun)
+#ifdef illumos
klwp_t *lwp;
if (!dtrace_priv_proc(state))
return (0);
@@ -3215,7 +3597,7 @@
return (curthread->td_errno);
#endif
}
-#if !defined(sun)
+#ifndef illumos
case DIF_VAR_CPU: {
return curcpu;
}
@@ -3226,7 +3608,464 @@
}
}
+
+typedef enum dtrace_json_state {
+ DTRACE_JSON_REST = 1,
+ DTRACE_JSON_OBJECT,
+ DTRACE_JSON_STRING,
+ DTRACE_JSON_STRING_ESCAPE,
+ DTRACE_JSON_STRING_ESCAPE_UNICODE,
+ DTRACE_JSON_COLON,
+ DTRACE_JSON_COMMA,
+ DTRACE_JSON_VALUE,
+ DTRACE_JSON_IDENTIFIER,
+ DTRACE_JSON_NUMBER,
+ DTRACE_JSON_NUMBER_FRAC,
+ DTRACE_JSON_NUMBER_EXP,
+ DTRACE_JSON_COLLECT_OBJECT
+} dtrace_json_state_t;
+
/*
+ * This function possesses just enough knowledge about JSON to extract a single
+ * value from a JSON string and store it in the scratch buffer. It is able
+ * to extract nested object values, and members of arrays by index.
+ *
+ * elemlist is a list of JSON keys, stored as packed NUL-terminated strings, to
+ * be looked up as we descend into the object tree. e.g.
+ *
+ * foo[0].bar.baz[32] --> "foo" NUL "0" NUL "bar" NUL "baz" NUL "32" NUL
+ * with nelems = 5.
+ *
+ * The run time of this function must be bounded above by strsize to limit the
+ * amount of work done in probe context. As such, it is implemented as a
+ * simple state machine, reading one character at a time using safe loads
+ * until we find the requested element, hit a parsing error or run off the
+ * end of the object or string.
+ *
+ * As there is no way for a subroutine to return an error without interrupting
+ * clause execution, we simply return NULL in the event of a missing key or any
+ * other error condition. Each NULL return in this function is commented with
+ * the error condition it represents -- parsing or otherwise.
+ *
+ * The set of states for the state machine closely matches the JSON
+ * specification (http://json.org/). Briefly:
+ *
+ * DTRACE_JSON_REST:
+ * Skip whitespace until we find either a top-level Object, moving
+ * to DTRACE_JSON_OBJECT; or an Array, moving to DTRACE_JSON_VALUE.
+ *
+ * DTRACE_JSON_OBJECT:
+ * Locate the next key String in an Object. Sets a flag to denote
+ * the next String as a key string and moves to DTRACE_JSON_STRING.
+ *
+ * DTRACE_JSON_COLON:
+ * Skip whitespace until we find the colon that separates key Strings
+ * from their values. Once found, move to DTRACE_JSON_VALUE.
+ *
+ * DTRACE_JSON_VALUE:
+ * Detects the type of the next value (String, Number, Identifier, Object
+ * or Array) and routes to the states that process that type. Here we also
+ * deal with the element selector list if we are requested to traverse down
+ * into the object tree.
+ *
+ * DTRACE_JSON_COMMA:
+ * Skip whitespace until we find the comma that separates key-value pairs
+ * in Objects (returning to DTRACE_JSON_OBJECT) or values in Arrays
+ * (similarly DTRACE_JSON_VALUE). All following literal value processing
+ * states return to this state at the end of their value, unless otherwise
+ * noted.
+ *
+ * DTRACE_JSON_NUMBER, DTRACE_JSON_NUMBER_FRAC, DTRACE_JSON_NUMBER_EXP:
+ * Processes a Number literal from the JSON, including any exponent
+ * component that may be present. Numbers are returned as strings, which
+ * may be passed to strtoll() if an integer is required.
+ *
+ * DTRACE_JSON_IDENTIFIER:
+ * Processes a "true", "false" or "null" literal in the JSON.
+ *
+ * DTRACE_JSON_STRING, DTRACE_JSON_STRING_ESCAPE,
+ * DTRACE_JSON_STRING_ESCAPE_UNICODE:
+ * Processes a String literal from the JSON, whether the String denotes
+ * a key, a value or part of a larger Object. Handles all escape sequences
+ * present in the specification, including four-digit unicode characters,
+ * but merely includes the escape sequence without converting it to the
+ * actual escaped character. If the String is flagged as a key, we
+ * move to DTRACE_JSON_COLON rather than DTRACE_JSON_COMMA.
+ *
+ * DTRACE_JSON_COLLECT_OBJECT:
+ * This state collects an entire Object (or Array), correctly handling
+ * embedded strings. If the full element selector list matches this nested
+ * object, we return the Object in full as a string. If not, we use this
+ * state to skip to the next value at this level and continue processing.
+ *
+ * NOTE: This function uses various macros from strtolctype.h to manipulate
+ * digit values, etc -- these have all been checked to ensure they make
+ * no additional function calls.
+ */
+static char *
+dtrace_json(uint64_t size, uintptr_t json, char *elemlist, int nelems,
+ char *dest)
+{
+ dtrace_json_state_t state = DTRACE_JSON_REST;
+ int64_t array_elem = INT64_MIN;
+ int64_t array_pos = 0;
+ uint8_t escape_unicount = 0;
+ boolean_t string_is_key = B_FALSE;
+ boolean_t collect_object = B_FALSE;
+ boolean_t found_key = B_FALSE;
+ boolean_t in_array = B_FALSE;
+ uint32_t braces = 0, brackets = 0;
+ char *elem = elemlist;
+ char *dd = dest;
+ uintptr_t cur;
+
+ for (cur = json; cur < json + size; cur++) {
+ char cc = dtrace_load8(cur);
+ if (cc == '\0')
+ return (NULL);
+
+ switch (state) {
+ case DTRACE_JSON_REST:
+ if (isspace(cc))
+ break;
+
+ if (cc == '{') {
+ state = DTRACE_JSON_OBJECT;
+ break;
+ }
+
+ if (cc == '[') {
+ in_array = B_TRUE;
+ array_pos = 0;
+ array_elem = dtrace_strtoll(elem, 10, size);
+ found_key = array_elem == 0 ? B_TRUE : B_FALSE;
+ state = DTRACE_JSON_VALUE;
+ break;
+ }
+
+ /*
+ * ERROR: expected to find a top-level object or array.
+ */
+ return (NULL);
+ case DTRACE_JSON_OBJECT:
+ if (isspace(cc))
+ break;
+
+ if (cc == '"') {
+ state = DTRACE_JSON_STRING;
+ string_is_key = B_TRUE;
+ break;
+ }
+
+ /*
+ * ERROR: either the object did not start with a key
+ * string, or we've run off the end of the object
+ * without finding the requested key.
+ */
+ return (NULL);
+ case DTRACE_JSON_STRING:
+ if (cc == '\\') {
+ *dd++ = '\\';
+ state = DTRACE_JSON_STRING_ESCAPE;
+ break;
+ }
+
+ if (cc == '"') {
+ if (collect_object) {
+ /*
+ * We don't reset the dest here, as
+ * the string is part of a larger
+ * object being collected.
+ */
+ *dd++ = cc;
+ collect_object = B_FALSE;
+ state = DTRACE_JSON_COLLECT_OBJECT;
+ break;
+ }
+ *dd = '\0';
+ dd = dest; /* reset string buffer */
+ if (string_is_key) {
+ if (dtrace_strncmp(dest, elem,
+ size) == 0)
+ found_key = B_TRUE;
+ } else if (found_key) {
+ if (nelems > 1) {
+ /*
+ * We expected an object, not
+ * this string.
+ */
+ return (NULL);
+ }
+ return (dest);
+ }
+ state = string_is_key ? DTRACE_JSON_COLON :
+ DTRACE_JSON_COMMA;
+ string_is_key = B_FALSE;
+ break;
+ }
+
+ *dd++ = cc;
+ break;
+ case DTRACE_JSON_STRING_ESCAPE:
+ *dd++ = cc;
+ if (cc == 'u') {
+ escape_unicount = 0;
+ state = DTRACE_JSON_STRING_ESCAPE_UNICODE;
+ } else {
+ state = DTRACE_JSON_STRING;
+ }
+ break;
+ case DTRACE_JSON_STRING_ESCAPE_UNICODE:
+ if (!isxdigit(cc)) {
+ /*
+ * ERROR: invalid unicode escape, expected
+ * four valid hexidecimal digits.
+ */
+ return (NULL);
+ }
+
+ *dd++ = cc;
+ if (++escape_unicount == 4)
+ state = DTRACE_JSON_STRING;
+ break;
+ case DTRACE_JSON_COLON:
+ if (isspace(cc))
+ break;
+
+ if (cc == ':') {
+ state = DTRACE_JSON_VALUE;
+ break;
+ }
+
+ /*
+ * ERROR: expected a colon.
+ */
+ return (NULL);
+ case DTRACE_JSON_COMMA:
+ if (isspace(cc))
+ break;
+
+ if (cc == ',') {
+ if (in_array) {
+ state = DTRACE_JSON_VALUE;
+ if (++array_pos == array_elem)
+ found_key = B_TRUE;
+ } else {
+ state = DTRACE_JSON_OBJECT;
+ }
+ break;
+ }
+
+ /*
+ * ERROR: either we hit an unexpected character, or
+ * we reached the end of the object or array without
+ * finding the requested key.
+ */
+ return (NULL);
+ case DTRACE_JSON_IDENTIFIER:
+ if (islower(cc)) {
+ *dd++ = cc;
+ break;
+ }
+
+ *dd = '\0';
+ dd = dest; /* reset string buffer */
+
+ if (dtrace_strncmp(dest, "true", 5) == 0 ||
+ dtrace_strncmp(dest, "false", 6) == 0 ||
+ dtrace_strncmp(dest, "null", 5) == 0) {
+ if (found_key) {
+ if (nelems > 1) {
+ /*
+ * ERROR: We expected an object,
+ * not this identifier.
+ */
+ return (NULL);
+ }
+ return (dest);
+ } else {
+ cur--;
+ state = DTRACE_JSON_COMMA;
+ break;
+ }
+ }
+
+ /*
+ * ERROR: we did not recognise the identifier as one
+ * of those in the JSON specification.
+ */
+ return (NULL);
+ case DTRACE_JSON_NUMBER:
+ if (cc == '.') {
+ *dd++ = cc;
+ state = DTRACE_JSON_NUMBER_FRAC;
+ break;
+ }
+
+ if (cc == 'x' || cc == 'X') {
+ /*
+ * ERROR: specification explicitly excludes
+ * hexidecimal or octal numbers.
+ */
+ return (NULL);
+ }
+
+ /* FALLTHRU */
+ case DTRACE_JSON_NUMBER_FRAC:
+ if (cc == 'e' || cc == 'E') {
+ *dd++ = cc;
+ state = DTRACE_JSON_NUMBER_EXP;
+ break;
+ }
+
+ if (cc == '+' || cc == '-') {
+ /*
+ * ERROR: expect sign as part of exponent only.
+ */
+ return (NULL);
+ }
+ /* FALLTHRU */
+ case DTRACE_JSON_NUMBER_EXP:
+ if (isdigit(cc) || cc == '+' || cc == '-') {
+ *dd++ = cc;
+ break;
+ }
+
+ *dd = '\0';
+ dd = dest; /* reset string buffer */
+ if (found_key) {
+ if (nelems > 1) {
+ /*
+ * ERROR: We expected an object, not
+ * this number.
+ */
+ return (NULL);
+ }
+ return (dest);
+ }
+
+ cur--;
+ state = DTRACE_JSON_COMMA;
+ break;
+ case DTRACE_JSON_VALUE:
+ if (isspace(cc))
+ break;
+
+ if (cc == '{' || cc == '[') {
+ if (nelems > 1 && found_key) {
+ in_array = cc == '[' ? B_TRUE : B_FALSE;
+ /*
+ * If our element selector directs us
+ * to descend into this nested object,
+ * then move to the next selector
+ * element in the list and restart the
+ * state machine.
+ */
+ while (*elem != '\0')
+ elem++;
+ elem++; /* skip the inter-element NUL */
+ nelems--;
+ dd = dest;
+ if (in_array) {
+ state = DTRACE_JSON_VALUE;
+ array_pos = 0;
+ array_elem = dtrace_strtoll(
+ elem, 10, size);
+ found_key = array_elem == 0 ?
+ B_TRUE : B_FALSE;
+ } else {
+ found_key = B_FALSE;
+ state = DTRACE_JSON_OBJECT;
+ }
+ break;
+ }
+
+ /*
+ * Otherwise, we wish to either skip this
+ * nested object or return it in full.
+ */
+ if (cc == '[')
+ brackets = 1;
+ else
+ braces = 1;
+ *dd++ = cc;
+ state = DTRACE_JSON_COLLECT_OBJECT;
+ break;
+ }
+
+ if (cc == '"') {
+ state = DTRACE_JSON_STRING;
+ break;
+ }
+
+ if (islower(cc)) {
+ /*
+ * Here we deal with true, false and null.
+ */
+ *dd++ = cc;
+ state = DTRACE_JSON_IDENTIFIER;
+ break;
+ }
+
+ if (cc == '-' || isdigit(cc)) {
+ *dd++ = cc;
+ state = DTRACE_JSON_NUMBER;
+ break;
+ }
+
+ /*
+ * ERROR: unexpected character at start of value.
+ */
+ return (NULL);
+ case DTRACE_JSON_COLLECT_OBJECT:
+ if (cc == '\0')
+ /*
+ * ERROR: unexpected end of input.
+ */
+ return (NULL);
+
+ *dd++ = cc;
+ if (cc == '"') {
+ collect_object = B_TRUE;
+ state = DTRACE_JSON_STRING;
+ break;
+ }
+
+ if (cc == ']') {
+ if (brackets-- == 0) {
+ /*
+ * ERROR: unbalanced brackets.
+ */
+ return (NULL);
+ }
+ } else if (cc == '}') {
+ if (braces-- == 0) {
+ /*
+ * ERROR: unbalanced braces.
+ */
+ return (NULL);
+ }
+ } else if (cc == '{') {
+ braces++;
+ } else if (cc == '[') {
+ brackets++;
+ }
+
+ if (brackets == 0 && braces == 0) {
+ if (found_key) {
+ *dd = '\0';
+ return (dest);
+ }
+ dd = dest; /* reset string buffer */
+ state = DTRACE_JSON_COMMA;
+ }
+ break;
+ }
+ }
+ return (NULL);
+}
+
+/*
* Emulate the execution of DTrace ID subroutines invoked by the call opcode.
* Notice that we don't bother validating the proper number of arguments or
* their types in the tuple stack. This isn't needed because all argument
@@ -3242,7 +4081,7 @@
volatile uintptr_t *illval = &cpu_core[curcpu].cpuc_dtrace_illval;
dtrace_vstate_t *vstate = &state->dts_vstate;
-#if defined(sun)
+#ifdef illumos
union {
mutex_impl_t mi;
uint64_t mx;
@@ -3265,7 +4104,7 @@
regs[rd] = (dtrace_gethrtime() * 2416 + 374441) % 1771875;
break;
-#if defined(sun)
+#ifdef illumos
case DIF_SUBR_MUTEX_OWNED:
if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
mstate, vstate)) {
@@ -3353,7 +4192,7 @@
regs[rd] = _RW_ISWRITER(&r.ri);
break;
-#else
+#else /* !illumos */
case DIF_SUBR_MUTEX_OWNED:
if (!dtrace_canload(tupregs[0].dttk_value,
sizeof (struct lock_object), mstate, vstate)) {
@@ -3432,7 +4271,7 @@
regs[rd] = LOCK_CLASS(l.li)->lc_owner(l.li, &lowner) &&
lowner != NULL;
break;
-#endif /* ! defined(sun) */
+#endif /* illumos */
case DIF_SUBR_BCOPY: {
/*
@@ -3542,7 +4381,7 @@
break;
}
-#if defined(sun)
+#ifdef illumos
case DIF_SUBR_MSGSIZE:
case DIF_SUBR_MSGDSIZE: {
uintptr_t baddr = tupregs[0].dttk_value, daddr;
@@ -3610,7 +4449,7 @@
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
for (p = curthread->t_procp; p != NULL; p = p->p_parent) {
-#if defined(sun)
+#ifdef illumos
if (p->p_pidp->pid_id == pid) {
#else
if (p->p_pid == pid) {
@@ -3637,7 +4476,8 @@
if (!dtrace_destructive_disallow &&
dtrace_priv_proc_control(state) &&
- !dtrace_istoxic(kaddr, size)) {
+ !dtrace_istoxic(kaddr, size) &&
+ dtrace_canload(kaddr, size, mstate, vstate)) {
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
dtrace_copyout(kaddr, uaddr, size, flags);
DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
@@ -3652,7 +4492,8 @@
if (!dtrace_destructive_disallow &&
dtrace_priv_proc_control(state) &&
- !dtrace_istoxic(kaddr, size)) {
+ !dtrace_istoxic(kaddr, size) &&
+ dtrace_strcanload(kaddr, size, mstate, vstate)) {
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
dtrace_copyoutstr(kaddr, uaddr, size, flags);
DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
@@ -3977,7 +4818,7 @@
int64_t index = (int64_t)tupregs[1].dttk_value;
int64_t remaining = (int64_t)tupregs[2].dttk_value;
size_t len = dtrace_strlen((char *)s, size);
- int64_t i = 0;
+ int64_t i;
if (!dtrace_canload(s, len + 1, mstate, vstate)) {
regs[rd] = 0;
@@ -4022,6 +4863,65 @@
break;
}
+ case DIF_SUBR_JSON: {
+ uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
+ uintptr_t json = tupregs[0].dttk_value;
+ size_t jsonlen = dtrace_strlen((char *)json, size);
+ uintptr_t elem = tupregs[1].dttk_value;
+ size_t elemlen = dtrace_strlen((char *)elem, size);
+
+ char *dest = (char *)mstate->dtms_scratch_ptr;
+ char *elemlist = (char *)mstate->dtms_scratch_ptr + jsonlen + 1;
+ char *ee = elemlist;
+ int nelems = 1;
+ uintptr_t cur;
+
+ if (!dtrace_canload(json, jsonlen + 1, mstate, vstate) ||
+ !dtrace_canload(elem, elemlen + 1, mstate, vstate)) {
+ regs[rd] = 0;
+ break;
+ }
+
+ if (!DTRACE_INSCRATCH(mstate, jsonlen + 1 + elemlen + 1)) {
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
+ regs[rd] = 0;
+ break;
+ }
+
+ /*
+ * Read the element selector and split it up into a packed list
+ * of strings.
+ */
+ for (cur = elem; cur < elem + elemlen; cur++) {
+ char cc = dtrace_load8(cur);
+
+ if (cur == elem && cc == '[') {
+ /*
+ * If the first element selector key is
+ * actually an array index then ignore the
+ * bracket.
+ */
+ continue;
+ }
+
+ if (cc == ']')
+ continue;
+
+ if (cc == '.' || cc == '[') {
+ nelems++;
+ cc = '\0';
+ }
+
+ *ee++ = cc;
+ }
+ *ee++ = '\0';
+
+ if ((regs[rd] = (uintptr_t)dtrace_json(size, json, elemlist,
+ nelems, dest)) != 0)
+ mstate->dtms_scratch_ptr += jsonlen + 1;
+ break;
+ }
+
case DIF_SUBR_TOUPPER:
case DIF_SUBR_TOLOWER: {
uintptr_t s = tupregs[0].dttk_value;
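The DIF_SUBR_JSON case added in the hunk above splits the element selector (for example "foo.bar[1].baz") into a NUL-packed list of keys in scratch space before handing it to dtrace_json(). Below is a small userland sketch of just that splitting loop, not the kernel code itself; the function and variable names are made up for illustration.

#include <stdio.h>
#include <string.h>

/* Split a selector like "foo.bar[1].baz" into NUL-separated keys. */
static int
split_selector(const char *elem, char *out, size_t outlen)
{
    size_t i, o = 0;
    int nelems = 1;

    for (i = 0; elem[i] != '\0' && o + 1 < outlen; i++) {
        char cc = elem[i];

        if (i == 0 && cc == '[')
            continue;    /* leading array index: ignore the bracket */
        if (cc == ']')
            continue;
        if (cc == '.' || cc == '[') {
            nelems++;    /* key boundary */
            cc = '\0';
        }
        out[o++] = cc;
    }
    out[o] = '\0';
    return (nelems);
}

int
main(void)
{
    char packed[64];
    const char *p = packed;
    int i, n = split_selector("foo.bar[1].baz", packed, sizeof (packed));

    for (i = 0; i < n; i++) {    /* prints: foo bar 1 baz (4 keys) */
        printf("%s%s", i ? " " : "", p);
        p += strlen(p) + 1;
    }
    printf(" (%d keys)\n", n);
    return (0);
}

Running it prints the four packed keys foo, bar, 1 and baz, which is the shape of the element list dtrace_json() then walks.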
@@ -4069,7 +4969,7 @@
break;
}
-#if defined(sun)
+#ifdef illumos
case DIF_SUBR_GETMAJOR:
#ifdef _LP64
regs[rd] = (tupregs[0].dttk_value >> NBITSMINOR64) & MAXMAJ64;
@@ -4331,6 +5231,28 @@
break;
}
+ case DIF_SUBR_STRTOLL: {
+ uintptr_t s = tupregs[0].dttk_value;
+ uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
+ int base = 10;
+
+ if (nargs > 1) {
+ if ((base = tupregs[1].dttk_value) <= 1 ||
+ base > ('z' - 'a' + 1) + ('9' - '0' + 1)) {
+ *flags |= CPU_DTRACE_ILLOP;
+ break;
+ }
+ }
+
+ if (!dtrace_strcanload(s, size, mstate, vstate)) {
+ regs[rd] = INT64_MIN;
+ break;
+ }
+
+ regs[rd] = dtrace_strtoll((char *)s, base, size);
+ break;
+ }
+
case DIF_SUBR_LLTOSTR: {
int64_t i = (int64_t)tupregs[0].dttk_value;
uint64_t val, digit;
@@ -4540,11 +5462,32 @@
break;
}
+ case DIF_SUBR_GETF: {
+ uintptr_t fd = tupregs[0].dttk_value;
+ struct filedesc *fdp;
+ file_t *fp;
+
+ if (!dtrace_priv_proc(state)) {
+ regs[rd] = 0;
+ break;
+ }
+ fdp = curproc->p_fd;
+ FILEDESC_SLOCK(fdp);
+ fp = fget_locked(fdp, fd);
+ mstate->dtms_getf = fp;
+ regs[rd] = (uintptr_t)fp;
+ FILEDESC_SUNLOCK(fdp);
+ break;
+ }
+
case DIF_SUBR_CLEANPATH: {
char *dest = (char *)mstate->dtms_scratch_ptr, c;
uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
uintptr_t src = tupregs[0].dttk_value;
int i = 0, j = 0;
+#ifdef illumos
+ zone_t *z;
+#endif
if (!dtrace_strcanload(src, size, mstate, vstate)) {
regs[rd] = 0;
@@ -4643,6 +5586,25 @@
} while (c != '\0');
dest[j] = '\0';
+
+#ifdef illumos
+ if (mstate->dtms_getf != NULL &&
+ !(mstate->dtms_access & DTRACE_ACCESS_KERNEL) &&
+ (z = state->dts_cred.dcr_cred->cr_zone) != kcred->cr_zone) {
+ /*
+ * If we've done a getf() as a part of this ECB and we
+ * don't have kernel access (and we're not in the global
+ * zone), check if the path we cleaned up begins with
+ * the zone's root path, and trim it off if so. Note
+ * that this is an output cleanliness issue, not a
+ * security issue: knowing one's zone root path does
+ * not enable privilege escalation.
+ */
+ if (strstr(dest, z->zone_rootpath) == dest)
+ dest += strlen(z->zone_rootpath) - 1;
+ }
+#endif
+
regs[rd] = (uintptr_t)dest;
mstate->dtms_scratch_ptr += size;
break;
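The illumos-only block added above trims the zone's root path off the front of the cleaned-up path when the consumer has used getf() without kernel visibility. The "- 1" keeps the prefix's trailing '/' as the new leading '/'; the userland sketch below assumes that convention (a zone root path ending in '/') and uses hypothetical names.

#include <stdio.h>
#include <string.h>

/* Trim a zone root prefix (assumed to end in '/') from a cleaned path. */
static const char *
trim_zone_root(const char *path, const char *zroot)
{
    if (strstr(path, zroot) == path)
        return (path + strlen(zroot) - 1);    /* keep a leading '/' */
    return (path);
}

int
main(void)
{
    /* prints: /etc/passwd */
    printf("%s\n", trim_zone_root("/zones/web/root/etc/passwd",
        "/zones/web/root/"));
    return (0);
}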
@@ -4748,7 +5710,7 @@
tryzero = -1;
numzero = 1;
for (i = 0; i < sizeof (struct in6_addr); i++) {
-#if defined(sun)
+#ifdef illumos
if (ip6._S6_un._S6_u8[i] == 0 &&
#else
if (ip6.__u6_addr.__u6_addr8[i] == 0 &&
@@ -4759,7 +5721,7 @@
}
if (tryzero != -1 &&
-#if defined(sun)
+#ifdef illumos
(ip6._S6_un._S6_u8[i] != 0 ||
#else
(ip6.__u6_addr.__u6_addr8[i] != 0 ||
@@ -4775,7 +5737,7 @@
numzero = i - i % 2 - tryzero;
tryzero = -1;
-#if defined(sun)
+#ifdef illumos
if (ip6._S6_un._S6_u8[i] == 0 &&
#else
if (ip6.__u6_addr.__u6_addr8[i] == 0 &&
@@ -4796,7 +5758,7 @@
i >= DTRACE_V4MAPPED_OFFSET; i--) {
ASSERT(end >= base);
-#if defined(sun)
+#ifdef illumos
val = ip6._S6_un._S6_u8[i];
#else
val = ip6.__u6_addr.__u6_addr8[i];
@@ -4841,7 +5803,7 @@
if (i < 14 && i != firstzero - 2)
*end-- = ':';
-#if defined(sun)
+#ifdef illumos
val = (ip6._S6_un._S6_u8[i] << 8) +
ip6._S6_un._S6_u8[i + 1];
#else
@@ -4887,6 +5849,45 @@
break;
}
+#ifndef illumos
+ case DIF_SUBR_MEMSTR: {
+ char *str = (char *)mstate->dtms_scratch_ptr;
+ uintptr_t mem = tupregs[0].dttk_value;
+ char c = tupregs[1].dttk_value;
+ size_t size = tupregs[2].dttk_value;
+ uint8_t n;
+ int i;
+
+ regs[rd] = 0;
+
+ if (size == 0)
+ break;
+
+ if (!dtrace_canload(mem, size - 1, mstate, vstate))
+ break;
+
+ if (!DTRACE_INSCRATCH(mstate, size)) {
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
+ break;
+ }
+
+ if (dtrace_memstr_max != 0 && size > dtrace_memstr_max) {
+ *flags |= CPU_DTRACE_ILLOP;
+ break;
+ }
+
+ for (i = 0; i < size - 1; i++) {
+ n = dtrace_load8(mem++);
+ str[i] = (n == 0) ? c : n;
+ }
+ str[size - 1] = 0;
+
+ regs[rd] = (uintptr_t)str;
+ mstate->dtms_scratch_ptr += size;
+ break;
+ }
+#endif
+
case DIF_SUBR_TYPEREF: {
uintptr_t size = 4 * sizeof(uintptr_t);
uintptr_t *typeref = (uintptr_t *) P2ROUNDUP(mstate->dtms_scratch_ptr, sizeof(uintptr_t));
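The FreeBSD-only memstr() subroutine added in this hunk renders a size-byte memory region as a string by replacing embedded NUL bytes with a caller-supplied separator; the kernel version additionally validates the source with dtrace_canload(), requires scratch space, and caps the size at dtrace_memstr_max. A minimal userland sketch of the conversion itself, with hypothetical names:

#include <stdio.h>
#include <stdlib.h>

/* Render a byte region as a string, replacing embedded NULs with 'sep'. */
static char *
memstr(const void *mem, char sep, size_t size)
{
    const unsigned char *src = mem;
    char *str;
    size_t i;

    if (size == 0 || (str = malloc(size)) == NULL)
        return (NULL);
    for (i = 0; i < size - 1; i++)
        str[i] = (src[i] == '\0') ? sep : src[i];
    str[size - 1] = '\0';
    return (str);
}

int
main(void)
{
    char argbuf[] = "ls\0-l\0/tmp";    /* an argv-style packed buffer */
    char *s = memstr(argbuf, ' ', sizeof (argbuf));

    if (s != NULL)
        printf("%s\n", s);    /* prints: ls -l /tmp */
    free(s);
    return (0);
}

A typical consumer-side use is rendering a process's packed argument strings as a single readable command line.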
@@ -5077,102 +6078,95 @@
pc = DIF_INSTR_LABEL(instr);
break;
case DIF_OP_RLDSB:
- if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) {
- *flags |= CPU_DTRACE_KPRIV;
- *illval = regs[r1];
+ if (!dtrace_canload(regs[r1], 1, mstate, vstate))
break;
- }
/*FALLTHROUGH*/
case DIF_OP_LDSB:
regs[rd] = (int8_t)dtrace_load8(regs[r1]);
break;
case DIF_OP_RLDSH:
- if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) {
- *flags |= CPU_DTRACE_KPRIV;
- *illval = regs[r1];
+ if (!dtrace_canload(regs[r1], 2, mstate, vstate))
break;
- }
/*FALLTHROUGH*/
case DIF_OP_LDSH:
regs[rd] = (int16_t)dtrace_load16(regs[r1]);
break;
case DIF_OP_RLDSW:
- if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) {
- *flags |= CPU_DTRACE_KPRIV;
- *illval = regs[r1];
+ if (!dtrace_canload(regs[r1], 4, mstate, vstate))
break;
- }
/*FALLTHROUGH*/
case DIF_OP_LDSW:
regs[rd] = (int32_t)dtrace_load32(regs[r1]);
break;
case DIF_OP_RLDUB:
- if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) {
- *flags |= CPU_DTRACE_KPRIV;
- *illval = regs[r1];
+ if (!dtrace_canload(regs[r1], 1, mstate, vstate))
break;
- }
/*FALLTHROUGH*/
case DIF_OP_LDUB:
regs[rd] = dtrace_load8(regs[r1]);
break;
case DIF_OP_RLDUH:
- if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) {
- *flags |= CPU_DTRACE_KPRIV;
- *illval = regs[r1];
+ if (!dtrace_canload(regs[r1], 2, mstate, vstate))
break;
- }
/*FALLTHROUGH*/
case DIF_OP_LDUH:
regs[rd] = dtrace_load16(regs[r1]);
break;
case DIF_OP_RLDUW:
- if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) {
- *flags |= CPU_DTRACE_KPRIV;
- *illval = regs[r1];
+ if (!dtrace_canload(regs[r1], 4, mstate, vstate))
break;
- }
/*FALLTHROUGH*/
case DIF_OP_LDUW:
regs[rd] = dtrace_load32(regs[r1]);
break;
case DIF_OP_RLDX:
- if (!dtrace_canstore(regs[r1], 8, mstate, vstate)) {
- *flags |= CPU_DTRACE_KPRIV;
- *illval = regs[r1];
+ if (!dtrace_canload(regs[r1], 8, mstate, vstate))
break;
- }
/*FALLTHROUGH*/
case DIF_OP_LDX:
regs[rd] = dtrace_load64(regs[r1]);
break;
case DIF_OP_ULDSB:
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
regs[rd] = (int8_t)
dtrace_fuword8((void *)(uintptr_t)regs[r1]);
+ DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
break;
case DIF_OP_ULDSH:
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
regs[rd] = (int16_t)
dtrace_fuword16((void *)(uintptr_t)regs[r1]);
+ DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
break;
case DIF_OP_ULDSW:
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
regs[rd] = (int32_t)
dtrace_fuword32((void *)(uintptr_t)regs[r1]);
+ DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
break;
case DIF_OP_ULDUB:
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
regs[rd] =
dtrace_fuword8((void *)(uintptr_t)regs[r1]);
+ DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
break;
case DIF_OP_ULDUH:
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
regs[rd] =
dtrace_fuword16((void *)(uintptr_t)regs[r1]);
+ DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
break;
case DIF_OP_ULDUW:
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
regs[rd] =
dtrace_fuword32((void *)(uintptr_t)regs[r1]);
+ DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
break;
case DIF_OP_ULDX:
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
regs[rd] =
dtrace_fuword64((void *)(uintptr_t)regs[r1]);
+ DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
break;
case DIF_OP_RET:
rval = regs[rd];
@@ -5487,6 +6481,11 @@
regs[r2] ? regs[r2] :
dtrace_strsize_default) + 1;
} else {
+ if (regs[r2] > LONG_MAX) {
+ *flags |= CPU_DTRACE_ILLOP;
+ break;
+ }
+
tupregs[ttop].dttk_size = regs[r2];
}
@@ -5755,7 +6754,7 @@
c[i++] = ')';
c[i] = '\0';
-#if defined(sun)
+#ifdef illumos
debug_enter(c);
#else
kdb_enter(KDB_WHY_DTRACE, "breakpoint action");
@@ -5802,7 +6801,7 @@
return;
}
-#if defined(sun)
+#ifdef illumos
/*
* raise() has a queue depth of 1 -- we ignore all subsequent
* invocations of the raise() action.
@@ -5826,7 +6825,7 @@
if (dtrace_destructive_disallow)
return;
-#if defined(sun)
+#ifdef illumos
if (!curthread->t_dtrace_stop) {
curthread->t_dtrace_stop = 1;
curthread->t_sig_check = 1;
@@ -5845,7 +6844,7 @@
{
hrtime_t now;
volatile uint16_t *flags;
-#if defined(sun)
+#ifdef illumos
cpu_t *cpu = CPU;
#else
cpu_t *cpu = &solaris_cpu[curcpu];
@@ -5854,7 +6853,7 @@
if (dtrace_destructive_disallow)
return;
- flags = (volatile uint16_t *)&cpu_core[cpu->cpu_id].cpuc_dtrace_flags;
+ flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags;
now = dtrace_gethrtime();
@@ -6002,6 +7001,63 @@
mstate->dtms_scratch_ptr = old;
}
+static void
+dtrace_store_by_ref(dtrace_difo_t *dp, caddr_t tomax, size_t size,
+ size_t *valoffsp, uint64_t *valp, uint64_t end, int intuple, int dtkind)
+{
+ volatile uint16_t *flags;
+ uint64_t val = *valp;
+ size_t valoffs = *valoffsp;
+
+ flags = (volatile uint16_t *)&cpu_core[curcpu].cpuc_dtrace_flags;
+ ASSERT(dtkind == DIF_TF_BYREF || dtkind == DIF_TF_BYUREF);
+
+ /*
+ * If this is a string, we're going to only load until we find the zero
+ * byte -- after which we'll store zero bytes.
+ */
+ if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
+ char c = '\0' + 1;
+ size_t s;
+
+ for (s = 0; s < size; s++) {
+ if (c != '\0' && dtkind == DIF_TF_BYREF) {
+ c = dtrace_load8(val++);
+ } else if (c != '\0' && dtkind == DIF_TF_BYUREF) {
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+ c = dtrace_fuword8((void *)(uintptr_t)val++);
+ DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
+ if (*flags & CPU_DTRACE_FAULT)
+ break;
+ }
+
+ DTRACE_STORE(uint8_t, tomax, valoffs++, c);
+
+ if (c == '\0' && intuple)
+ break;
+ }
+ } else {
+ uint8_t c;
+ while (valoffs < end) {
+ if (dtkind == DIF_TF_BYREF) {
+ c = dtrace_load8(val++);
+ } else if (dtkind == DIF_TF_BYUREF) {
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+ c = dtrace_fuword8((void *)(uintptr_t)val++);
+ DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
+ if (*flags & CPU_DTRACE_FAULT)
+ break;
+ }
+
+ DTRACE_STORE(uint8_t, tomax,
+ valoffs++, c);
+ }
+ }
+
+ *valp = val;
+ *valoffsp = valoffs;
+}
+
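A note on the string branch of the new dtrace_store_by_ref(): it stops loading once it sees the terminating NUL but keeps storing zero bytes so the record stays fixed-size, except when the value is a tuple key, in which case it stops right after the terminator. A userland sketch of just that copy shape (the BYUREF user-memory fault handling is omitted; names are illustrative):

#include <stdio.h>

/* Copy a string into a fixed-size record slot, or stop early for tuples. */
static size_t
store_string(char *dst, const char *src, size_t size, int intuple)
{
    char c = '\0' + 1;    /* anything non-NUL to prime the loop */
    size_t s;

    for (s = 0; s < size; s++) {
        if (c != '\0')
            c = src[s];
        dst[s] = c;
        if (c == '\0' && intuple)
            return (s + 1);
    }
    return (size);
}

int
main(void)
{
    char buf[8];

    /* fixed-size record: stores "hi" plus six zero bytes (8 total) */
    printf("%zu\n", store_string(buf, "hi", sizeof (buf), 0));
    /* tuple key: stops right after the terminator (3 bytes stored) */
    printf("%zu\n", store_string(buf, "hi", sizeof (buf), 1));
    return (0);
}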
/*
* If you're looking for the epicenter of DTrace, you just found it. This
* is the function called by the provider to fire a probe -- from which all
@@ -6026,7 +7082,7 @@
if (panicstr != NULL)
return;
-#if defined(sun)
+#ifdef illumos
/*
* Kick out immediately if this CPU is still being born (in which case
* curthread will be set to -1) or the current thread can't allow
@@ -6051,7 +7107,7 @@
return;
}
-#if defined(sun)
+#ifdef illumos
if (panic_quiesce) {
#else
if (panicstr != NULL) {
@@ -6063,7 +7119,8 @@
return;
}
- now = dtrace_gethrtime();
+ now = mstate.dtms_timestamp = dtrace_gethrtime();
+ mstate.dtms_present |= DTRACE_MSTATE_TIMESTAMP;
vtime = dtrace_vtime_references != 0;
if (vtime && curthread->t_dtrace_start)
@@ -6104,6 +7161,8 @@
uint64_t val = 0;
mstate.dtms_present = DTRACE_MSTATE_ARGS | DTRACE_MSTATE_PROBE;
+ mstate.dtms_getf = NULL;
+
*flags &= ~CPU_DTRACE_ERROR;
if (prov == dtrace_provider) {
@@ -6155,7 +7214,7 @@
probe->dtpr_id, probe->dtpr_arg) == 0)
continue;
-#if defined(sun)
+#ifdef illumos
/*
* This is more subtle than it looks. We have to be
* absolutely certain that CRED() isn't going to
@@ -6206,7 +7265,7 @@
if (now - state->dts_alive > dtrace_deadman_timeout) {
/*
* We seem to be dead. Unless we (a) have kernel
- * destructive permissions (b) have expicitly enabled
+ * destructive permissions (b) have explicitly enabled
* destructive actions and (c) destructive actions have
* not been disabled, we're going to transition into
* the KILLED state, from which no further processing
@@ -6234,8 +7293,18 @@
tomax = buf->dtb_tomax;
ASSERT(tomax != NULL);
- if (ecb->dte_size != 0)
- DTRACE_STORE(uint32_t, tomax, offs, ecb->dte_epid);
+ if (ecb->dte_size != 0) {
+ dtrace_rechdr_t dtrh;
+ if (!(mstate.dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
+ mstate.dtms_timestamp = dtrace_gethrtime();
+ mstate.dtms_present |= DTRACE_MSTATE_TIMESTAMP;
+ }
+ ASSERT3U(ecb->dte_size, >=, sizeof (dtrace_rechdr_t));
+ dtrh.dtrh_epid = ecb->dte_epid;
+ DTRACE_RECORD_STORE_TIMESTAMP(&dtrh,
+ mstate.dtms_timestamp);
+ *((dtrace_rechdr_t *)(tomax + offs)) = dtrh;
+ }
mstate.dtms_epid = ecb->dte_epid;
mstate.dtms_present |= DTRACE_MSTATE_EPID;
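The hunk above replaces the bare EPID store with a dtrace_rechdr_t written through DTRACE_RECORD_STORE_TIMESTAMP(), so every record now carries the probe firing time as well as its EPID. The sketch below only illustrates how a 64-bit hrtime can be packed into two 32-bit halves so the header keeps 4-byte alignment; the real header and macros live in dtrace.h, and the field names here are assumptions for illustration.

#include <stdint.h>
#include <stdio.h>

/* Illustrative record header: a 32-bit EPID plus a split 64-bit timestamp. */
typedef struct rechdr {
    uint32_t dtrh_epid;
    uint32_t dtrh_timestamp_hi;
    uint32_t dtrh_timestamp_lo;
} rechdr_t;

#define RECORD_STORE_TIMESTAMP(dtrh, t) do {            \
    (dtrh)->dtrh_timestamp_lo = (uint32_t)(t);          \
    (dtrh)->dtrh_timestamp_hi = (uint32_t)((t) >> 32);  \
} while (0)

#define RECORD_LOAD_TIMESTAMP(dtrh)                     \
    ((uint64_t)(dtrh)->dtrh_timestamp_hi << 32 |        \
    (dtrh)->dtrh_timestamp_lo)

int
main(void)
{
    rechdr_t h = { .dtrh_epid = 7 };
    uint64_t now = 123456789012345ULL;

    RECORD_STORE_TIMESTAMP(&h, now);
    printf("epid %u ts %llu\n", h.dtrh_epid,
        (unsigned long long)RECORD_LOAD_TIMESTAMP(&h));
    return (0);
}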
@@ -6247,7 +7316,7 @@
if (pred != NULL) {
dtrace_difo_t *dp = pred->dtp_difo;
- int rval;
+ uint64_t rval;
rval = dtrace_dif_emulate(dp, &mstate, vstate, state);
@@ -6382,7 +7451,9 @@
continue;
switch (act->dta_kind) {
- case DTRACEACT_SPECULATE:
+ case DTRACEACT_SPECULATE: {
+ dtrace_rechdr_t *dtrh;
+
ASSERT(buf == &state->dts_buffer[cpuid]);
buf = dtrace_speculation_buffer(state,
cpuid, val);
@@ -6404,10 +7475,23 @@
tomax = buf->dtb_tomax;
ASSERT(tomax != NULL);
- if (ecb->dte_size != 0)
- DTRACE_STORE(uint32_t, tomax, offs,
- ecb->dte_epid);
+ if (ecb->dte_size == 0)
+ continue;
+
+ ASSERT3U(ecb->dte_size, >=,
+ sizeof (dtrace_rechdr_t));
+ dtrh = ((void *)(tomax + offs));
+ dtrh->dtrh_epid = ecb->dte_epid;
+ /*
+ * When the speculation is committed, all of
+ * the records in the speculative buffer will
+ * have their timestamps set to the commit
+ * time. Until then, it is set to a sentinel
+ * value, for debugability.
+ */
+ DTRACE_RECORD_STORE_TIMESTAMP(dtrh, UINT64_MAX);
continue;
+ }
case DTRACEACT_PRINTM: {
/* The DIF returns a 'memref'. */
@@ -6555,7 +7639,7 @@
case DTRACEACT_USYM:
case DTRACEACT_UMOD:
case DTRACEACT_UADDR: {
-#if defined(sun)
+#ifdef illumos
struct pid *pid = curthread->t_procp->p_pidp;
#endif
@@ -6563,7 +7647,7 @@
continue;
DTRACE_STORE(uint64_t, tomax,
-#if defined(sun)
+#ifdef illumos
valoffs, (uint64_t)pid->pid_id);
#else
valoffs, (uint64_t) curproc->p_pid);
@@ -6612,7 +7696,8 @@
ASSERT(0);
}
- if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF) {
+ if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF ||
+ dp->dtdo_rtype.dtdt_flags & DIF_TF_BYUREF) {
uintptr_t end = valoffs + size;
if (tracememsize != 0 &&
@@ -6621,40 +7706,15 @@
tracememsize = 0;
}
- if (!dtrace_vcanload((void *)(uintptr_t)val,
+ if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF &&
+ !dtrace_vcanload((void *)(uintptr_t)val,
&dp->dtdo_rtype, &mstate, vstate))
continue;
- /*
- * If this is a string, we're going to only
- * load until we find the zero byte -- after
- * which we'll store zero bytes.
- */
- if (dp->dtdo_rtype.dtdt_kind ==
- DIF_TYPE_STRING) {
- char c = '\0' + 1;
- int intuple = act->dta_intuple;
- size_t s;
-
- for (s = 0; s < size; s++) {
- if (c != '\0')
- c = dtrace_load8(val++);
-
- DTRACE_STORE(uint8_t, tomax,
- valoffs++, c);
-
- if (c == '\0' && intuple)
- break;
- }
-
- continue;
- }
-
- while (valoffs < end) {
- DTRACE_STORE(uint8_t, tomax, valoffs++,
- dtrace_load8(val++));
- }
-
+ dtrace_store_by_ref(dp, tomax, size, &valoffs,
+ &val, end, act->dta_intuple,
+ dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF ?
+ DIF_TF_BYREF: DIF_TF_BYUREF);
continue;
}
@@ -7012,7 +8072,7 @@
{
uint32_t priv;
-#if defined(sun)
+#ifdef illumos
if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
/*
* For DTRACE_PRIV_ALL, the uid and zoneid don't matter.
@@ -7606,7 +8666,7 @@
* already held.
*/
ASSERT(old == dtrace_provider);
-#if defined(sun)
+#ifdef illumos
ASSERT(dtrace_devi != NULL);
#endif
ASSERT(MUTEX_HELD(&dtrace_provider_lock));
@@ -7621,7 +8681,9 @@
}
} else {
mutex_enter(&dtrace_provider_lock);
+#ifdef illumos
mutex_enter(&mod_lock);
+#endif
mutex_enter(&dtrace_lock);
}
@@ -7635,7 +8697,9 @@
dtrace_anon.dta_state->dts_necbs > 0))) {
if (!self) {
mutex_exit(&dtrace_lock);
+#ifdef illumos
mutex_exit(&mod_lock);
+#endif
mutex_exit(&dtrace_provider_lock);
}
return (EBUSY);
@@ -7669,7 +8733,9 @@
if (!self) {
mutex_exit(&dtrace_lock);
+#ifdef illumos
mutex_exit(&mod_lock);
+#endif
mutex_exit(&dtrace_provider_lock);
}
@@ -7723,7 +8789,7 @@
kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
-#if defined(sun)
+#ifdef illumos
vmem_free(dtrace_arena, (void *)(uintptr_t)(probe->dtpr_id), 1);
#else
free_unr(dtrace_arena, probe->dtpr_id);
@@ -7732,7 +8798,7 @@
}
if ((prev = dtrace_provider) == old) {
-#if defined(sun)
+#ifdef illumos
ASSERT(self || dtrace_devi == NULL);
ASSERT(old->dtpv_next == NULL || dtrace_devi == NULL);
#endif
@@ -7751,7 +8817,9 @@
if (!self) {
mutex_exit(&dtrace_lock);
+#ifdef illumos
mutex_exit(&mod_lock);
+#endif
mutex_exit(&dtrace_provider_lock);
}
@@ -7842,7 +8910,7 @@
kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
kmem_free(probe, sizeof (dtrace_probe_t));
-#if defined(sun)
+#ifdef illumos
vmem_free(dtrace_arena, (void *)((uintptr_t)i + 1), 1);
#else
free_unr(dtrace_arena, i + 1);
@@ -7882,7 +8950,7 @@
mutex_enter(&dtrace_lock);
}
-#if defined(sun)
+#ifdef illumos
id = (dtrace_id_t)(uintptr_t)vmem_alloc(dtrace_arena, 1,
VM_BESTFIT | VM_SLEEP);
#else
@@ -8035,19 +9103,6 @@
(void) strncpy(pdp->dtpd_name, prp->dtpr_name, DTRACE_NAMELEN - 1);
}
-#if !defined(sun)
-static int
-dtrace_probe_provide_cb(linker_file_t lf, void *arg)
-{
- dtrace_provider_t *prv = (dtrace_provider_t *) arg;
-
- prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, lf);
-
- return(0);
-}
-#endif
-
-
/*
* Called to indicate that a probe -- or probes -- should be provided by a
* specfied provider. If the specified description is NULL, the provider will
@@ -8066,7 +9121,7 @@
static void
dtrace_probe_provide(dtrace_probedesc_t *desc, dtrace_provider_t *prv)
{
-#if defined(sun)
+#ifdef illumos
modctl_t *ctl;
#endif
int all = 0;
@@ -8084,6 +9139,7 @@
*/
prv->dtpv_pops.dtps_provide(prv->dtpv_arg, desc);
+#ifdef illumos
/*
* Now call the per-module provide operation. We will grab
* mod_lock to prevent the list from being modified. Note
@@ -8092,7 +9148,6 @@
*/
mutex_enter(&mod_lock);
-#if defined(sun)
ctl = &modules;
do {
if (ctl->mod_busy || ctl->mod_mp == NULL)
@@ -8101,15 +9156,13 @@
prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
} while ((ctl = ctl->mod_next) != &modules);
-#else
- (void) linker_file_foreach(dtrace_probe_provide_cb, prv);
-#endif
mutex_exit(&mod_lock);
+#endif
} while (all && (prv = prv->dtpv_next) != NULL);
}
-#if defined(sun)
+#ifdef illumos
/*
* Iterate over each probe, and call the Framework-to-Provider API function
* denoted by offs.
@@ -8270,6 +9323,10 @@
probe = (dof_probe_t *)(uintptr_t)(daddr +
prb_sec->dofs_offset + i * prb_sec->dofs_entsize);
+ /* See the check in dtrace_helper_provider_validate(). */
+ if (strlen(strtab + probe->dofpr_func) >= DTRACE_FUNCNAMELEN)
+ continue;
+
dhpb.dthpb_mod = dhp->dofhp_mod;
dhpb.dthpb_func = strtab + probe->dofpr_func;
dhpb.dthpb_name = strtab + probe->dofpr_name;
@@ -8759,6 +9816,20 @@
subr == DIF_SUBR_COPYOUTSTR) {
dp->dtdo_destructive = 1;
}
+
+ if (subr == DIF_SUBR_GETF) {
+ /*
+ * If we have a getf() we need to record that
+ * in our state. Note that our state can be
+ * NULL if this is a helper -- but in that
+ * case, the call to getf() is itself illegal,
+ * and will be caught (slightly later) when
+ * the helper is validated.
+ */
+ if (vstate->dtvs_state != NULL)
+ vstate->dtvs_state->dts_getf++;
+ }
+
break;
case DIF_OP_PUSHTR:
if (type != DIF_TYPE_STRING && type != DIF_TYPE_CTF)
@@ -8788,7 +9859,7 @@
"expected 'ret' as last DIF instruction\n");
}
- if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF)) {
+ if (!(dp->dtdo_rtype.dtdt_flags & (DIF_TF_BYREF | DIF_TF_BYUREF))) {
/*
* If we're not returning by reference, the size must be either
* 0 or the size of one of the base types.
@@ -8802,7 +9873,7 @@
break;
default:
- err += efunc(dp->dtdo_len - 1, "bad return size");
+ err += efunc(dp->dtdo_len - 1, "bad return size\n");
}
}
@@ -8876,9 +9947,10 @@
break;
}
- if (v->dtdv_scope == DIFV_SCOPE_GLOBAL &&
- vt->dtdt_size > dtrace_global_maxsize) {
- err += efunc(i, "oversized by-ref global\n");
+ if ((v->dtdv_scope == DIFV_SCOPE_GLOBAL ||
+ v->dtdv_scope == DIFV_SCOPE_LOCAL) &&
+ vt->dtdt_size > dtrace_statvar_maxsize) {
+ err += efunc(i, "oversized by-ref static\n");
break;
}
}
@@ -9038,7 +10110,9 @@
subr == DIF_SUBR_INET_NTOA ||
subr == DIF_SUBR_INET_NTOA6 ||
subr == DIF_SUBR_INET_NTOP ||
+ subr == DIF_SUBR_JSON ||
subr == DIF_SUBR_LLTOSTR ||
+ subr == DIF_SUBR_STRTOLL ||
subr == DIF_SUBR_RINDEX ||
subr == DIF_SUBR_STRCHR ||
subr == DIF_SUBR_STRJOIN ||
@@ -9051,6 +10125,9 @@
subr == DIF_SUBR_NTOHL ||
subr == DIF_SUBR_NTOHLL ||
subr == DIF_SUBR_MEMREF ||
+#ifndef illumos
+ subr == DIF_SUBR_MEMSTR ||
+#endif
subr == DIF_SUBR_TYPEREF)
break;
@@ -9217,6 +10294,9 @@
if (srd == 0)
return;
+ if (sval > LONG_MAX)
+ return;
+
tupregs[ttop++].dttk_size = sval;
}
@@ -9278,6 +10358,19 @@
*/
size = P2ROUNDUP(size, sizeof (uint64_t));
+ /*
+ * Before setting the chunk size, check that we're not going
+ * to set it to a negative value...
+ */
+ if (size > LONG_MAX)
+ return;
+
+ /*
+ * ...and make certain that we didn't badly overflow.
+ */
+ if (size < ksize || size < sizeof (dtrace_dynvar_t))
+ return;
+
if (size > vstate->dtvs_dynvars.dtds_chunksize)
vstate->dtvs_dynvars.dtds_chunksize = size;
}
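The two new checks above guard the P2ROUNDUP() of the dynamic-variable key size: rounding up to an 8-byte boundary can wrap near the top of the address space, and a wrapped or over-large result must not become the chunk size (the kernel also insists on at least sizeof (dtrace_dynvar_t)). A standalone sketch of the same guard, with the macro definition reproduced from sys/sysmacros.h:

#include <limits.h>
#include <stdint.h>
#include <stdio.h>

#define P2ROUNDUP(x, align)    (-(-(x) & -(align)))

/* Return 0 if rounding 'ksize' up to 8 bytes overflowed or went "negative". */
static size_t
safe_chunksize(size_t ksize)
{
    size_t size = P2ROUNDUP(ksize, sizeof (uint64_t));

    if (size > LONG_MAX)    /* would be negative as a signed length */
        return (0);
    if (size < ksize)       /* P2ROUNDUP wrapped past zero */
        return (0);
    return (size);
}

int
main(void)
{
    printf("%zu\n", safe_chunksize(100));            /* 104 */
    printf("%zu\n", safe_chunksize(SIZE_MAX - 2));   /* 0: wrapped */
    return (0);
}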
@@ -9686,7 +10779,7 @@
{
dtrace_actdesc_t *act;
-#if defined(sun)
+#ifdef illumos
ASSERT(!DTRACEACT_ISPRINTFLIKE(kind) || (arg != NULL &&
arg >= KERNELBASE) || (arg == NULL && kind == DTRACEACT_PRINTA));
#endif
@@ -9725,7 +10818,7 @@
if (DTRACEACT_ISPRINTFLIKE(kind)) {
char *str = (char *)(uintptr_t)act->dtad_arg;
-#if defined(sun)
+#ifdef illumos
ASSERT((str != NULL && (uintptr_t)str >= KERNELBASE) ||
(str == NULL && act->dtad_kind == DTRACEACT_PRINTA));
#endif
@@ -9754,9 +10847,9 @@
/*
* The default size is the size of the default action: recording
- * the epid.
+ * the header.
*/
- ecb->dte_size = ecb->dte_needed = sizeof (dtrace_epid_t);
+ ecb->dte_size = ecb->dte_needed = sizeof (dtrace_rechdr_t);
ecb->dte_alignment = sizeof (dtrace_epid_t);
epid = state->dts_epid++;
@@ -9854,122 +10947,89 @@
static void
dtrace_ecb_resize(dtrace_ecb_t *ecb)
{
- uint32_t maxalign = sizeof (dtrace_epid_t);
- uint32_t align = sizeof (uint8_t), offs, diff;
dtrace_action_t *act;
- int wastuple = 0;
+ uint32_t curneeded = UINT32_MAX;
uint32_t aggbase = UINT32_MAX;
- dtrace_state_t *state = ecb->dte_state;
/*
- * If we record anything, we always record the epid. (And we always
- * record it first.)
+ * If we record anything, we always record the dtrace_rechdr_t. (And
+ * we always record it first.)
*/
- offs = sizeof (dtrace_epid_t);
- ecb->dte_size = ecb->dte_needed = sizeof (dtrace_epid_t);
+ ecb->dte_size = sizeof (dtrace_rechdr_t);
+ ecb->dte_alignment = sizeof (dtrace_epid_t);
for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
dtrace_recdesc_t *rec = &act->dta_rec;
+ ASSERT(rec->dtrd_size > 0 || rec->dtrd_alignment == 1);
- if ((align = rec->dtrd_alignment) > maxalign)
- maxalign = align;
+ ecb->dte_alignment = MAX(ecb->dte_alignment,
+ rec->dtrd_alignment);
- if (!wastuple && act->dta_intuple) {
- /*
- * This is the first record in a tuple. Align the
- * offset to be at offset 4 in an 8-byte aligned
- * block.
- */
- diff = offs + sizeof (dtrace_aggid_t);
-
- if ((diff = (diff & (sizeof (uint64_t) - 1))))
- offs += sizeof (uint64_t) - diff;
-
- aggbase = offs - sizeof (dtrace_aggid_t);
- ASSERT(!(aggbase & (sizeof (uint64_t) - 1)));
- }
-
- /*LINTED*/
- if (rec->dtrd_size != 0 && (diff = (offs & (align - 1)))) {
- /*
- * The current offset is not properly aligned; align it.
- */
- offs += align - diff;
- }
-
- rec->dtrd_offset = offs;
-
- if (offs + rec->dtrd_size > ecb->dte_needed) {
- ecb->dte_needed = offs + rec->dtrd_size;
-
- if (ecb->dte_needed > state->dts_needed)
- state->dts_needed = ecb->dte_needed;
- }
-
if (DTRACEACT_ISAGG(act->dta_kind)) {
dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
- dtrace_action_t *first = agg->dtag_first, *prev;
- ASSERT(rec->dtrd_size != 0 && first != NULL);
- ASSERT(wastuple);
+ ASSERT(rec->dtrd_size != 0);
+ ASSERT(agg->dtag_first != NULL);
+ ASSERT(act->dta_prev->dta_intuple);
ASSERT(aggbase != UINT32_MAX);
+ ASSERT(curneeded != UINT32_MAX);
agg->dtag_base = aggbase;
- while ((prev = first->dta_prev) != NULL &&
- DTRACEACT_ISAGG(prev->dta_kind)) {
- agg = (dtrace_aggregation_t *)prev;
- first = agg->dtag_first;
- }
+ curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);
+ rec->dtrd_offset = curneeded;
+ curneeded += rec->dtrd_size;
+ ecb->dte_needed = MAX(ecb->dte_needed, curneeded);
- if (prev != NULL) {
- offs = prev->dta_rec.dtrd_offset +
- prev->dta_rec.dtrd_size;
- } else {
- offs = sizeof (dtrace_epid_t);
+ aggbase = UINT32_MAX;
+ curneeded = UINT32_MAX;
+ } else if (act->dta_intuple) {
+ if (curneeded == UINT32_MAX) {
+ /*
+ * This is the first record in a tuple. Align
+ * curneeded to be at offset 4 in an 8-byte
+ * aligned block.
+ */
+ ASSERT(act->dta_prev == NULL ||
+ !act->dta_prev->dta_intuple);
+ ASSERT3U(aggbase, ==, UINT32_MAX);
+ curneeded = P2PHASEUP(ecb->dte_size,
+ sizeof (uint64_t), sizeof (dtrace_aggid_t));
+
+ aggbase = curneeded - sizeof (dtrace_aggid_t);
+ ASSERT(IS_P2ALIGNED(aggbase,
+ sizeof (uint64_t)));
}
- wastuple = 0;
+ curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);
+ rec->dtrd_offset = curneeded;
+ curneeded += rec->dtrd_size;
} else {
- if (!act->dta_intuple)
- ecb->dte_size = offs + rec->dtrd_size;
+ /* tuples must be followed by an aggregation */
+ ASSERT(act->dta_prev == NULL ||
+ !act->dta_prev->dta_intuple);
- offs += rec->dtrd_size;
+ ecb->dte_size = P2ROUNDUP(ecb->dte_size,
+ rec->dtrd_alignment);
+ rec->dtrd_offset = ecb->dte_size;
+ ecb->dte_size += rec->dtrd_size;
+ ecb->dte_needed = MAX(ecb->dte_needed, ecb->dte_size);
}
-
- wastuple = act->dta_intuple;
}
if ((act = ecb->dte_action) != NULL &&
!(act->dta_kind == DTRACEACT_SPECULATE && act->dta_next == NULL) &&
- ecb->dte_size == sizeof (dtrace_epid_t)) {
+ ecb->dte_size == sizeof (dtrace_rechdr_t)) {
/*
- * If the size is still sizeof (dtrace_epid_t), then all
+ * If the size is still sizeof (dtrace_rechdr_t), then all
* actions store no data; set the size to 0.
*/
- ecb->dte_alignment = maxalign;
ecb->dte_size = 0;
-
- /*
- * If the needed space is still sizeof (dtrace_epid_t), then
- * all actions need no additional space; set the needed
- * size to 0.
- */
- if (ecb->dte_needed == sizeof (dtrace_epid_t))
- ecb->dte_needed = 0;
-
- return;
}
- /*
- * Set our alignment, and make sure that the dte_size and dte_needed
- * are aligned to the size of an EPID.
- */
- ecb->dte_alignment = maxalign;
- ecb->dte_size = (ecb->dte_size + (sizeof (dtrace_epid_t) - 1)) &
- ~(sizeof (dtrace_epid_t) - 1);
- ecb->dte_needed = (ecb->dte_needed + (sizeof (dtrace_epid_t) - 1)) &
- ~(sizeof (dtrace_epid_t) - 1);
- ASSERT(ecb->dte_size <= ecb->dte_needed);
+ ecb->dte_size = P2ROUNDUP(ecb->dte_size, sizeof (dtrace_epid_t));
+ ecb->dte_needed = P2ROUNDUP(ecb->dte_needed, (sizeof (dtrace_epid_t)));
+ ecb->dte_state->dts_needed = MAX(ecb->dte_state->dts_needed,
+ ecb->dte_needed);
}
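For readers untangling the rewritten dtrace_ecb_resize(): the old hand-rolled alignment arithmetic is replaced by P2ROUNDUP() and P2PHASEUP(). A tuple key record is placed 4 bytes into an 8-byte aligned block so that the 4-byte aggregation ID immediately before it lands on the 8-byte boundary. The worked example below reproduces the macro definitions (as in sys/sysmacros.h) to show the arithmetic; the concrete numbers are illustrative, not taken from a real ECB.

#include <stdio.h>

/* Power-of-two helpers, definitions as in <sys/sysmacros.h>. */
#define P2ROUNDUP(x, align)    (-(-(x) & -(align)))
#define P2PHASEUP(x, align, phase) \
    ((phase) - (((phase) - (x)) & -(align)))

int
main(void)
{
    /*
     * Start a tuple 4 bytes into an 8-byte aligned block; the 4-byte
     * aggregation ID then sits on the preceding 8-byte boundary.
     */
    int curneeded = P2PHASEUP(16, 8, 4);    /* 20, and 20 mod 8 == 4 */
    int aggbase = curneeded - 4;            /* 16, 8-byte aligned */

    printf("curneeded=%d aggbase=%d\n", curneeded, aggbase);
    printf("P2PHASEUP(12,8,4)=%d\n", (int)P2PHASEUP(12, 8, 4));  /* 12 */
    printf("P2ROUNDUP(13,8)=%d\n", (int)P2ROUNDUP(13, 8));       /* 16 */
    return (0);
}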
static dtrace_action_t *
@@ -10117,7 +11177,7 @@
/*
* We need to allocate an id for this aggregation.
*/
-#if defined(sun)
+#ifdef illumos
aggid = (dtrace_aggid_t)(uintptr_t)vmem_alloc(state->dts_aggid_arena, 1,
VM_BESTFIT | VM_SLEEP);
#else
@@ -10171,7 +11231,7 @@
dtrace_aggid_t aggid = agg->dtag_id;
ASSERT(DTRACEACT_ISAGG(act->dta_kind));
-#if defined(sun)
+#ifdef illumos
vmem_free(state->dts_aggid_arena, (void *)(uintptr_t)aggid, 1);
#else
free_unr(state->dts_aggid_arena, aggid);
@@ -10240,7 +11300,7 @@
format = 0;
} else {
ASSERT(arg != 0);
-#if defined(sun)
+#ifdef illumos
ASSERT(arg > KERNELBASE);
#endif
format = dtrace_format_add(state,
@@ -10349,7 +11409,7 @@
break;
case DTRACEACT_SPECULATE:
- if (ecb->dte_size > sizeof (dtrace_epid_t))
+ if (ecb->dte_size > sizeof (dtrace_rechdr_t))
return (EINVAL);
if (dp == NULL)
@@ -10470,7 +11530,7 @@
ecb->dte_action = NULL;
ecb->dte_action_last = NULL;
- ecb->dte_size = sizeof (dtrace_epid_t);
+ ecb->dte_size = 0;
}
static void
@@ -10739,12 +11799,13 @@
caddr_t tomax = buf->dtb_tomax;
caddr_t xamot = buf->dtb_xamot;
dtrace_icookie_t cookie;
- hrtime_t now = dtrace_gethrtime();
+ hrtime_t now;
ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
ASSERT(!(buf->dtb_flags & DTRACEBUF_RING));
cookie = dtrace_interrupt_disable();
+ now = dtrace_gethrtime();
buf->dtb_tomax = xamot;
buf->dtb_xamot = tomax;
buf->dtb_xamot_drops = buf->dtb_drops;
@@ -10789,17 +11850,20 @@
static int
dtrace_buffer_alloc(dtrace_buffer_t *bufs, size_t size, int flags,
- processorid_t cpu)
+ processorid_t cpu, int *factor)
{
-#if defined(sun)
+#ifdef illumos
cpu_t *cp;
#endif
dtrace_buffer_t *buf;
+ int allocated = 0, desired = 0;
-#if defined(sun)
+#ifdef illumos
ASSERT(MUTEX_HELD(&cpu_lock));
ASSERT(MUTEX_HELD(&dtrace_lock));
+ *factor = 1;
+
if (size > dtrace_nonroot_maxsize &&
!PRIV_POLICY_CHOICE(CRED(), PRIV_ALL, B_FALSE))
return (EFBIG);
@@ -10823,7 +11887,8 @@
ASSERT(buf->dtb_xamot == NULL);
- if ((buf->dtb_tomax = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
+ if ((buf->dtb_tomax = kmem_zalloc(size,
+ KM_NOSLEEP | KM_NORMALPRI)) == NULL)
goto err;
buf->dtb_size = size;
@@ -10834,7 +11899,8 @@
if (flags & DTRACEBUF_NOSWITCH)
continue;
- if ((buf->dtb_xamot = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
+ if ((buf->dtb_xamot = kmem_zalloc(size,
+ KM_NOSLEEP | KM_NORMALPRI)) == NULL)
goto err;
} while ((cp = cp->cpu_next) != cpu_list);
@@ -10848,16 +11914,19 @@
continue;
buf = &bufs[cp->cpu_id];
+ desired += 2;
if (buf->dtb_xamot != NULL) {
ASSERT(buf->dtb_tomax != NULL);
ASSERT(buf->dtb_size == size);
kmem_free(buf->dtb_xamot, size);
+ allocated++;
}
if (buf->dtb_tomax != NULL) {
ASSERT(buf->dtb_size == size);
kmem_free(buf->dtb_tomax, size);
+ allocated++;
}
buf->dtb_tomax = NULL;
@@ -10864,12 +11933,11 @@
buf->dtb_xamot = NULL;
buf->dtb_size = 0;
} while ((cp = cp->cpu_next) != cpu_list);
-
- return (ENOMEM);
#else
int i;
-#if defined(__amd64__)
+ *factor = 1;
+#if defined(__amd64__) || defined(__mips__) || defined(__powerpc__)
/*
* FreeBSD isn't good at limiting the amount of memory we
* ask to malloc, so let's place a limit here before trying
@@ -10876,7 +11944,7 @@
* to do something that might well end in tears at bedtime.
*/
if (size > physmem * PAGE_SIZE / (128 * (mp_maxid + 1)))
- return(ENOMEM);
+ return (ENOMEM);
#endif
ASSERT(MUTEX_HELD(&dtrace_lock));
@@ -10898,7 +11966,8 @@
ASSERT(buf->dtb_xamot == NULL);
- if ((buf->dtb_tomax = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
+ if ((buf->dtb_tomax = kmem_zalloc(size,
+ KM_NOSLEEP | KM_NORMALPRI)) == NULL)
goto err;
buf->dtb_size = size;
@@ -10909,7 +11978,8 @@
if (flags & DTRACEBUF_NOSWITCH)
continue;
- if ((buf->dtb_xamot = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
+ if ((buf->dtb_xamot = kmem_zalloc(size,
+ KM_NOSLEEP | KM_NORMALPRI)) == NULL)
goto err;
}
@@ -10925,16 +11995,19 @@
continue;
buf = &bufs[i];
+ desired += 2;
if (buf->dtb_xamot != NULL) {
ASSERT(buf->dtb_tomax != NULL);
ASSERT(buf->dtb_size == size);
kmem_free(buf->dtb_xamot, size);
+ allocated++;
}
if (buf->dtb_tomax != NULL) {
ASSERT(buf->dtb_size == size);
kmem_free(buf->dtb_tomax, size);
+ allocated++;
}
buf->dtb_tomax = NULL;
@@ -10942,9 +12015,10 @@
buf->dtb_size = 0;
}
+#endif
+ *factor = desired / (allocated > 0 ? allocated : 1);
return (ENOMEM);
-#endif
}
/*
@@ -11110,7 +12184,7 @@
if (epid == DTRACE_EPIDNONE) {
size = sizeof (uint32_t);
} else {
- ASSERT(epid <= state->dts_necbs);
+ ASSERT3U(epid, <=, state->dts_necbs);
ASSERT(state->dts_ecbs[epid - 1] != NULL);
size = state->dts_ecbs[epid - 1]->dte_size;
@@ -11434,6 +12508,7 @@
ASSERT(enab->dten_vstate->dtvs_state != NULL);
ASSERT(enab->dten_vstate->dtvs_state->dts_nretained > 0);
enab->dten_vstate->dtvs_state->dts_nretained--;
+ dtrace_retained_gen++;
}
if (enab->dten_prev == NULL) {
@@ -11476,6 +12551,7 @@
return (ENOSPC);
state->dts_nretained++;
+ dtrace_retained_gen++;
if (dtrace_retained == NULL) {
dtrace_retained = enab;
@@ -11653,10 +12729,11 @@
* block pending our completion.
*/
for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
-#if defined(sun)
+#ifdef illumos
cred_t *cr = enab->dten_vstate->dtvs_state->dts_cred.dcr_cred;
- if (INGLOBALZONE(curproc) || getzoneid() == crgetzoneid(cr))
+ if (INGLOBALZONE(curproc) ||
+ cr != NULL && getzoneid() == crgetzoneid(cr))
#endif
(void) dtrace_enabling_match(enab, NULL);
}
@@ -11717,6 +12794,7 @@
{
int i, all = 0;
dtrace_probedesc_t desc;
+ dtrace_genid_t gen;
ASSERT(MUTEX_HELD(&dtrace_lock));
ASSERT(MUTEX_HELD(&dtrace_provider_lock));
@@ -11727,15 +12805,25 @@
}
do {
- dtrace_enabling_t *enab = dtrace_retained;
+ dtrace_enabling_t *enab;
void *parg = prv->dtpv_arg;
- for (; enab != NULL; enab = enab->dten_next) {
+retry:
+ gen = dtrace_retained_gen;
+ for (enab = dtrace_retained; enab != NULL;
+ enab = enab->dten_next) {
for (i = 0; i < enab->dten_ndesc; i++) {
desc = enab->dten_desc[i]->dted_probe;
mutex_exit(&dtrace_lock);
prv->dtpv_pops.dtps_provide(parg, &desc);
mutex_enter(&dtrace_lock);
+ /*
+ * Process the retained enablings again if
+ * they have changed while we weren't holding
+ * dtrace_lock.
+ */
+ if (gen != dtrace_retained_gen)
+ goto retry;
}
}
} while (all && (prv = prv->dtpv_next) != NULL);
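The retry logic added above snapshots dtrace_retained_gen, drops dtrace_lock around the provider's dtps_provide() callback, and restarts the walk if the generation changed, since the retained list can be modified while the lock is not held (both places that change it now bump the counter). The following is a small single-threaded model of that pattern with made-up names; it demonstrates only the control flow, not the locking.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* A single-threaded model of the generation-counter retry pattern. */
struct node { struct node *next; int data; };

static struct node *retained_list;
static uint64_t retained_gen;    /* bumped whenever the list changes */

static void
add_node(int data)
{
    struct node *n = malloc(sizeof (*n));

    if (n == NULL)
        abort();
    n->data = data;
    n->next = retained_list;
    retained_list = n;
    retained_gen++;
}

static void
callback(int data)
{
    static int changed;

    printf("provide for %d\n", data);
    if (data == 2 && !changed) {
        changed = 1;
        add_node(3);    /* simulates a change made while unlocked */
    }
}

static void
walk_retained(void)
{
    struct node *n;
    uint64_t gen;

retry:
    gen = retained_gen;
    for (n = retained_list; n != NULL; n = n->next) {
        /* In the kernel, dtrace_lock is dropped around this call. */
        callback(n->data);
        if (gen != retained_gen)
            goto retry;    /* the list changed underneath us */
    }
}

int
main(void)
{
    add_node(1);
    add_node(2);
    walk_retained();
    return (0);
}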
@@ -11936,7 +13024,8 @@
dof = kmem_alloc(hdr.dofh_loadsz, KM_SLEEP);
- if (copyin((void *)uarg, dof, hdr.dofh_loadsz) != 0) {
+ if (copyin((void *)uarg, dof, hdr.dofh_loadsz) != 0 ||
+ dof->dofh_loadsz != hdr.dofh_loadsz) {
kmem_free(dof, hdr.dofh_loadsz);
*errp = EFAULT;
return (NULL);
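The extra condition above closes a double-fetch hole in dtrace_dof_copyin(): the DOF header is read once to size the allocation and then again as part of the full copyin, and a racing user thread could change dofh_loadsz in between, so the size embedded in the copied object is re-checked against the one used for the allocation. A userland sketch of the pattern, with hypothetical names and memcpy() standing in for copyin():

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

struct hdr { uint64_t loadsz; /* ... */ };

/* copyin() stand-in for this sketch; the kernel copies from user space. */
static int
copyin_stub(const void *uaddr, void *kaddr, size_t len)
{
    memcpy(kaddr, uaddr, len);
    return (0);
}

/*
 * Double-fetch guard: the header is read twice (once to size the buffer,
 * once as part of the full object), so re-check the size after the second
 * read in case the source memory changed in between.
 */
static void *
copyin_obj(const void *uaddr, size_t maxsz)
{
    struct hdr hdr;
    void *obj;

    if (copyin_stub(uaddr, &hdr, sizeof (hdr)) != 0)
        return (NULL);
    if (hdr.loadsz < sizeof (hdr) || hdr.loadsz > maxsz)
        return (NULL);
    if ((obj = malloc(hdr.loadsz)) == NULL)
        return (NULL);
    if (copyin_stub(uaddr, obj, hdr.loadsz) != 0 ||
        ((struct hdr *)obj)->loadsz != hdr.loadsz) {
        free(obj);    /* size changed under us: reject */
        return (NULL);
    }
    return (obj);
}

int
main(void)
{
    struct { struct hdr h; char body[24]; } u = { .h = { .loadsz = 32 } };
    void *o = copyin_obj(&u, 4096);

    free(o);
    return (o != NULL ? 0 : 1);
}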
@@ -11945,7 +13034,7 @@
return (dof);
}
-#if !defined(sun)
+#ifndef illumos
static __inline uchar_t
dtrace_dof_char(char c) {
switch (c) {
@@ -11988,7 +13077,7 @@
unsigned int len, i;
dof_hdr_t *dof;
-#if defined(sun)
+#ifdef illumos
/*
* Unfortunately, array of values in .conf files are always (and
* only) interpreted to be integer arrays. We must read our DOF
@@ -12746,10 +13835,17 @@
}
}
+ if (DOF_SEC_ISLOADABLE(sec->dofs_type) &&
+ !(sec->dofs_flags & DOF_SECF_LOAD)) {
+ dtrace_dof_error(dof, "loadable section with load "
+ "flag unset");
+ return (-1);
+ }
+
if (!(sec->dofs_flags & DOF_SECF_LOAD))
continue; /* just ignore non-loadable sections */
- if (sec->dofs_align & (sec->dofs_align - 1)) {
+ if (!ISP2(sec->dofs_align)) {
dtrace_dof_error(dof, "bad section alignment");
return (-1);
}
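The alignment test now uses ISP2() instead of the open-coded x & (x - 1) expression; both forms accept zero and any power of two. A tiny illustration, with the macro definition as in sys/sysmacros.h:

#include <stdio.h>

/* ISP2(): true for zero and for any power of two. */
#define ISP2(x)    (((x) & ((x) - 1)) == 0)

int
main(void)
{
    unsigned v;

    for (v = 0; v <= 9; v++)    /* 0 1 2 4 8 pass; 3 5 6 7 9 fail */
        printf("%u: %s\n", v, ISP2(v) ? "ok" : "bad section alignment");
    return (0);
}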
@@ -12894,10 +13990,12 @@
if ((dstate->dtds_chunksize = chunksize) == 0)
dstate->dtds_chunksize = DTRACE_DYNVAR_CHUNKSIZE;
+ VERIFY(dstate->dtds_chunksize < LONG_MAX);
+
if (size < (min = dstate->dtds_chunksize + sizeof (dtrace_dynhash_t)))
size = min;
- if ((base = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
+ if ((base = kmem_zalloc(size, KM_NOSLEEP | KM_NORMALPRI)) == NULL)
return (ENOMEM);
dstate->dtds_size = size;
@@ -12934,10 +14032,13 @@
((uintptr_t)base + hashsize * sizeof (dtrace_dynhash_t));
limit = (uintptr_t)base + size;
+ VERIFY((uintptr_t)start < limit);
+ VERIFY((uintptr_t)start >= (uintptr_t)base);
+
maxper = (limit - (uintptr_t)start) / NCPU;
maxper = (maxper / dstate->dtds_chunksize) * dstate->dtds_chunksize;
-#if !defined(sun)
+#ifndef illumos
CPU_FOREACH(i) {
#else
for (i = 0; i < NCPU; i++) {
@@ -12959,7 +14060,7 @@
start = (dtrace_dynvar_t *)limit;
}
- ASSERT(limit <= (uintptr_t)base + size);
+ VERIFY(limit <= (uintptr_t)base + size);
for (;;) {
next = (dtrace_dynvar_t *)((uintptr_t)dvar +
@@ -12968,6 +14069,8 @@
if ((uintptr_t)next + dstate->dtds_chunksize >= limit)
break;
+ VERIFY((uintptr_t)dvar >= (uintptr_t)base &&
+ (uintptr_t)dvar <= (uintptr_t)base + size);
dvar->dtdv_next = next;
dvar = next;
}
@@ -13017,7 +14120,7 @@
}
}
-#if defined(sun)
+#ifdef illumos
static void
dtrace_state_clean(dtrace_state_t *state)
{
@@ -13054,7 +14157,7 @@
dtrace_membar_producer();
state->dts_alive = now;
}
-#else
+#else /* !illumos */
static void
dtrace_state_clean(void *arg)
{
@@ -13103,16 +14206,16 @@
callout_reset(&state->dts_deadman, hz * dtrace_deadman_interval / NANOSEC,
dtrace_state_deadman, state);
}
-#endif
+#endif /* illumos */
static dtrace_state_t *
-#if defined(sun)
+#ifdef illumos
dtrace_state_create(dev_t *devp, cred_t *cr)
#else
dtrace_state_create(struct cdev *dev)
#endif
{
-#if defined(sun)
+#ifdef illumos
minor_t minor;
major_t major;
#else
@@ -13127,7 +14230,7 @@
ASSERT(MUTEX_HELD(&dtrace_lock));
ASSERT(MUTEX_HELD(&cpu_lock));
-#if defined(sun)
+#ifdef illumos
minor = (minor_t)(uintptr_t)vmem_alloc(dtrace_minor, 1,
VM_BESTFIT | VM_SLEEP);
@@ -13141,7 +14244,7 @@
if (dev != NULL) {
cr = dev->si_cred;
m = dev2unit(dev);
- }
+ }
/* Allocate memory for the state. */
state = kmem_zalloc(sizeof(dtrace_state_t), KM_SLEEP);
@@ -13150,7 +14253,7 @@
state->dts_epid = DTRACE_EPIDNONE + 1;
(void) snprintf(c, sizeof (c), "dtrace_aggid_%d", m);
-#if defined(sun)
+#ifdef illumos
state->dts_aggid_arena = vmem_create(c, (void *)1, UINT32_MAX, 1,
NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
@@ -13178,12 +14281,12 @@
state->dts_buffer = kmem_zalloc(bufsize, KM_SLEEP);
state->dts_aggbuffer = kmem_zalloc(bufsize, KM_SLEEP);
-#if defined(sun)
+#ifdef illumos
state->dts_cleaner = CYCLIC_NONE;
state->dts_deadman = CYCLIC_NONE;
#else
- callout_init(&state->dts_cleaner, CALLOUT_MPSAFE);
- callout_init(&state->dts_deadman, CALLOUT_MPSAFE);
+ callout_init(&state->dts_cleaner, 1);
+ callout_init(&state->dts_deadman, 1);
#endif
state->dts_vstate.dtvs_state = state;
@@ -13268,7 +14371,7 @@
* we can do destructive things to processes which
* have altered credentials.
*/
-#if defined(sun)
+#ifdef illumos
if (priv_isequalset(priv_getset(cr, PRIV_EFFECTIVE),
cr->cr_zone->zone_privset)) {
state->dts_cred.dcr_action |=
@@ -13310,7 +14413,7 @@
state->dts_cred.dcr_action |=
DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
-#if defined(sun)
+#ifdef illumos
/*
* If we have all privs in whatever zone this is,
* we can do destructive things to processes which
@@ -13349,7 +14452,7 @@
{
dtrace_optval_t *opt = state->dts_options, size;
processorid_t cpu = 0;;
- int flags = 0, rval;
+ int flags = 0, rval, factor, divisor = 1;
ASSERT(MUTEX_HELD(&dtrace_lock));
ASSERT(MUTEX_HELD(&cpu_lock));
@@ -13379,7 +14482,7 @@
flags |= DTRACEBUF_INACTIVE;
}
- for (size = opt[which]; size >= sizeof (uint64_t); size >>= 1) {
+ for (size = opt[which]; size >= sizeof (uint64_t); size /= divisor) {
/*
* The size must be 8-byte aligned. If the size is not 8-byte
* aligned, drop it down by the difference.
@@ -13397,7 +14500,7 @@
return (E2BIG);
}
- rval = dtrace_buffer_alloc(buf, size, flags, cpu);
+ rval = dtrace_buffer_alloc(buf, size, flags, cpu, &factor);
if (rval != ENOMEM) {
opt[which] = size;
@@ -13406,6 +14509,9 @@
if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
return (rval);
+
+ for (divisor = 2; divisor < factor; divisor <<= 1)
+ continue;
}
return (ENOMEM);
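dtrace_buffer_alloc() now reports a failure "factor", roughly the ratio of per-CPU buffers wanted to buffers it actually managed to allocate before giving up, and the caller turns that into a power-of-two divisor (at least two, not smaller than the factor) so the next attempt shrinks the request enough to be plausible rather than always halving. The sketch below models that retry shape with a hypothetical allocator and made-up numbers; it is not the kernel code.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/*
 * Hypothetical allocator: pretend only 'budget' bytes are available in
 * total and we want two buffers per "CPU", as dtrace_buffer_alloc() does.
 */
static int
try_alloc(size_t size, int ncpu, size_t budget, int *factor)
{
    int allocated = 0, desired = 2 * ncpu;

    while (allocated < desired && budget >= size) {
        budget -= size;
        allocated++;
    }
    if (allocated == desired) {
        *factor = 1;
        return (0);
    }
    *factor = desired / (allocated > 0 ? allocated : 1);
    return (-1);    /* ENOMEM analogue */
}

int
main(void)
{
    size_t size = 1 << 20;    /* start by asking for 1MB per buffer */
    int factor, divisor = 1;

    while (try_alloc(size, 8, 4 << 20, &factor) != 0) {
        /* shrink by the smallest power of two not below the factor */
        for (divisor = 2; divisor < factor; divisor <<= 1)
            continue;
        size /= divisor;
        printf("retrying with %zu-byte buffers (factor %d)\n",
            size, factor);
    }
    printf("ok: %zu-byte buffers\n", size);
    return (0);
}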
@@ -13466,7 +14572,7 @@
dtrace_optval_t *opt = state->dts_options, sz, nspec;
dtrace_speculation_t *spec;
dtrace_buffer_t *buf;
-#if defined(sun)
+#ifdef illumos
cyc_handler_t hdlr;
cyc_time_t when;
#endif
@@ -13507,7 +14613,8 @@
goto out;
}
- spec = kmem_zalloc(nspec * sizeof (dtrace_speculation_t), KM_NOSLEEP);
+ spec = kmem_zalloc(nspec * sizeof (dtrace_speculation_t),
+ KM_NOSLEEP | KM_NORMALPRI);
if (spec == NULL) {
rval = ENOMEM;
@@ -13518,7 +14625,8 @@
state->dts_nspeculations = (int)nspec;
for (i = 0; i < nspec; i++) {
- if ((buf = kmem_zalloc(bufsize, KM_NOSLEEP)) == NULL) {
+ if ((buf = kmem_zalloc(bufsize,
+ KM_NOSLEEP | KM_NORMALPRI)) == NULL) {
rval = ENOMEM;
goto err;
}
@@ -13648,7 +14756,7 @@
opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
state->dts_alive = state->dts_laststatus = dtrace_gethrtime();
-#if defined(sun)
+#ifdef illumos
hdlr.cyh_func = (cyc_func_t)dtrace_state_clean;
hdlr.cyh_arg = state;
hdlr.cyh_level = CY_LOW_LEVEL;
@@ -13675,6 +14783,24 @@
state->dts_activity = DTRACE_ACTIVITY_WARMUP;
+#ifdef illumos
+ if (state->dts_getf != 0 &&
+ !(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)) {
+ /*
+ * We don't have kernel privs but we have at least one call
+ * to getf(); we need to bump our zone's count, and (if
+ * this is the first enabling to have an unprivileged call
+ * to getf()) we need to hook into closef().
+ */
+ state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf++;
+
+ if (dtrace_getf++ == 0) {
+ ASSERT(dtrace_closef == NULL);
+ dtrace_closef = dtrace_getf_barrier;
+ }
+ }
+#endif
+
/*
* Now it's time to actually fire the BEGIN probe. We need to disable
* interrupts here both to record the CPU on which we fired the BEGIN
@@ -13791,6 +14917,26 @@
state->dts_activity = DTRACE_ACTIVITY_STOPPED;
dtrace_sync();
+#ifdef illumos
+ if (state->dts_getf != 0 &&
+ !(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)) {
+ /*
+ * We don't have kernel privs but we have at least one call
+ * to getf(); we need to lower our zone's count, and (if
+ * this is the last enabling to have an unprivileged call
+ * to getf()) we need to clear the closef() hook.
+ */
+ ASSERT(state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf > 0);
+ ASSERT(dtrace_closef == dtrace_getf_barrier);
+ ASSERT(dtrace_getf > 0);
+
+ state->dts_cred.dcr_cred->cr_zone->zone_dtrace_getf--;
+
+ if (--dtrace_getf == 0)
+ dtrace_closef = NULL;
+ }
+#endif
+
return (0);
}
@@ -13854,7 +15000,7 @@
{
dtrace_ecb_t *ecb;
dtrace_vstate_t *vstate = &state->dts_vstate;
-#if defined(sun)
+#ifdef illumos
minor_t minor = getminor(state->dts_dev);
#endif
int i, bufsize = NCPU * sizeof (dtrace_buffer_t);
@@ -13932,7 +15078,7 @@
for (i = 0; i < nspec; i++)
dtrace_buffer_free(spec[i].dtsp_buffer);
-#if defined(sun)
+#ifdef illumos
if (state->dts_cleaner != CYCLIC_NONE)
cyclic_remove(state->dts_cleaner);
@@ -13972,7 +15118,7 @@
dtrace_format_destroy(state);
if (state->dts_aggid_arena != NULL) {
-#if defined(sun)
+#ifdef illumos
vmem_destroy(state->dts_aggid_arena);
#else
delete_unrhdr(state->dts_aggid_arena);
@@ -13979,7 +15125,7 @@
#endif
state->dts_aggid_arena = NULL;
}
-#if defined(sun)
+#ifdef illumos
ddi_soft_state_free(dtrace_softstate, minor);
vmem_free(dtrace_minor, (void *)(uintptr_t)minor, 1);
#endif
@@ -14031,7 +15177,7 @@
break;
}
-#if defined(sun)
+#ifdef illumos
/*
* We want to create anonymous state, so we need to transition
* the kernel debugger to indicate that DTrace is active. If
@@ -14050,7 +15196,7 @@
* If we haven't allocated an anonymous state, we'll do so now.
*/
if ((state = dtrace_anon.dta_state) == NULL) {
-#if defined(sun)
+#ifdef illumos
state = dtrace_state_create(NULL, NULL);
#else
state = dtrace_state_create(NULL);
@@ -14123,10 +15269,10 @@
dtrace_mstate_t *mstate, dtrace_vstate_t *vstate, int where)
{
uint32_t size, next, nnext, i;
- dtrace_helptrace_t *ent;
+ dtrace_helptrace_t *ent, *buffer;
uint16_t flags = cpu_core[curcpu].cpuc_dtrace_flags;
- if (!dtrace_helptrace_enabled)
+ if ((buffer = dtrace_helptrace_buffer) == NULL)
return;
ASSERT(vstate->dtvs_nlocals <= dtrace_helptrace_nlocals);
@@ -14154,10 +15300,12 @@
/*
* We have our slot; fill it in.
*/
- if (nnext == size)
+ if (nnext == size) {
+ dtrace_helptrace_wrapped++;
next = 0;
+ }
- ent = (dtrace_helptrace_t *)&dtrace_helptrace_buffer[next];
+ ent = (dtrace_helptrace_t *)((uintptr_t)buffer + next);
ent->dtht_helper = helper;
ent->dtht_where = where;
ent->dtht_nlocals = vstate->dtvs_nlocals;
@@ -14191,7 +15339,7 @@
dtrace_helper_action_t *helper;
dtrace_vstate_t *vstate;
dtrace_difo_t *pred;
- int i, trace = dtrace_helptrace_enabled;
+ int i, trace = dtrace_helptrace_buffer != NULL;
ASSERT(which >= 0 && which < DTRACE_NHELPER_ACTIONS);
@@ -14552,8 +15700,8 @@
* Check to make sure this isn't a duplicate.
*/
for (i = 0; i < help->dthps_nprovs; i++) {
- if (dofhp->dofhp_addr ==
- help->dthps_provs[i]->dthp_prov.dofhp_addr)
+ if (dofhp->dofhp_dof ==
+ help->dthps_provs[i]->dthp_prov.dofhp_dof)
return (EALREADY);
}
@@ -14713,7 +15861,13 @@
if (strlen(strtab + probe->dofpr_func) >= DTRACE_FUNCNAMELEN) {
dtrace_dof_error(dof, "function name too long");
- return (-1);
+ /*
+ * Keep going if the function name is too long.
+ * Unlike provider and probe names, we cannot reasonably
+ * impose restrictions on function names, since they're
+ * a property of the code being instrumented. We will
+ * skip this probe in dtrace_helper_provide_one().
+ */
}
if (probe->dofpr_name >= str_sec->dofs_size ||
@@ -14939,7 +16093,7 @@
return (help);
}
-#if defined(sun)
+#ifdef illumos
static
#endif
void
@@ -14947,7 +16101,7 @@
{
dtrace_helpers_t *help;
dtrace_vstate_t *vstate;
-#if defined(sun)
+#ifdef illumos
proc_t *p = curproc;
#endif
int i;
@@ -15036,7 +16190,7 @@
mutex_exit(&dtrace_lock);
}
-#if defined(sun)
+#ifdef illumos
static
#endif
void
@@ -15121,7 +16275,6 @@
dtrace_helper_provider_register(to, newhelp, NULL);
}
-#if defined(sun)
/*
* DTrace Hook Functions
*/
@@ -15131,9 +16284,13 @@
dtrace_provider_t *prv;
mutex_enter(&dtrace_provider_lock);
+#ifdef illumos
mutex_enter(&mod_lock);
+#endif
+#ifdef illumos
ASSERT(ctl->mod_busy);
+#endif
/*
* We're going to call each providers per-module provide operation
@@ -15142,7 +16299,9 @@
for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next)
prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
+#ifdef illumos
mutex_exit(&mod_lock);
+#endif
mutex_exit(&dtrace_provider_lock);
/*
@@ -15179,17 +16338,48 @@
}
static void
+#ifdef illumos
dtrace_module_unloaded(modctl_t *ctl)
+#else
+dtrace_module_unloaded(modctl_t *ctl, int *error)
+#endif
{
dtrace_probe_t template, *probe, *first, *next;
dtrace_provider_t *prov;
+#ifndef illumos
+ char modname[DTRACE_MODNAMELEN];
+ size_t len;
+#endif
+#ifdef illumos
template.dtpr_mod = ctl->mod_modname;
+#else
+ /* Handle the fact that ctl->filename may end in ".ko". */
+ strlcpy(modname, ctl->filename, sizeof(modname));
+ len = strlen(ctl->filename);
+ if (len > 3 && strcmp(modname + len - 3, ".ko") == 0)
+ modname[len - 3] = '\0';
+ template.dtpr_mod = modname;
+#endif
mutex_enter(&dtrace_provider_lock);
+#ifdef illumos
mutex_enter(&mod_lock);
+#endif
mutex_enter(&dtrace_lock);
+#ifndef illumos
+ if (ctl->nenabled > 0) {
+ /* Don't allow unloads if a probe is enabled. */
+ mutex_exit(&dtrace_provider_lock);
+ mutex_exit(&dtrace_lock);
+ *error = -1;
+ printf(
+ "kldunload: attempt to unload module that has DTrace probes enabled\n");
+ return;
+ }
+#endif
+
if (dtrace_bymod == NULL) {
/*
* The DTrace module is loaded (obviously) but not attached;
@@ -15196,7 +16386,9 @@
* we don't have any work to do.
*/
mutex_exit(&dtrace_provider_lock);
+#ifdef illumos
mutex_exit(&mod_lock);
+#endif
mutex_exit(&dtrace_lock);
return;
}
@@ -15205,7 +16397,9 @@
probe != NULL; probe = probe->dtpr_nextmod) {
if (probe->dtpr_ecb != NULL) {
mutex_exit(&dtrace_provider_lock);
+#ifdef illumos
mutex_exit(&mod_lock);
+#endif
mutex_exit(&dtrace_lock);
/*
@@ -15219,8 +16413,13 @@
* probe, either.
*/
if (dtrace_err_verbose) {
+#ifdef illumos
cmn_err(CE_WARN, "unloaded module '%s' had "
"enabled probes", ctl->mod_modname);
+#else
+ cmn_err(CE_WARN, "unloaded module '%s' had "
+ "enabled probes", modname);
+#endif
}
return;
@@ -15263,16 +16462,42 @@
kmem_free(probe->dtpr_mod, strlen(probe->dtpr_mod) + 1);
kmem_free(probe->dtpr_func, strlen(probe->dtpr_func) + 1);
kmem_free(probe->dtpr_name, strlen(probe->dtpr_name) + 1);
+#ifdef illumos
vmem_free(dtrace_arena, (void *)(uintptr_t)probe->dtpr_id, 1);
+#else
+ free_unr(dtrace_arena, probe->dtpr_id);
+#endif
kmem_free(probe, sizeof (dtrace_probe_t));
}
mutex_exit(&dtrace_lock);
+#ifdef illumos
mutex_exit(&mod_lock);
+#endif
mutex_exit(&dtrace_provider_lock);
}
+#ifndef illumos
static void
+dtrace_kld_load(void *arg __unused, linker_file_t lf)
+{
+
+ dtrace_module_loaded(lf);
+}
+
+static void
+dtrace_kld_unload_try(void *arg __unused, linker_file_t lf, int *error)
+{
+
+ if (*error != 0)
+ /* We already have an error, so don't do anything. */
+ return;
+ dtrace_module_unloaded(lf, error);
+}
+#endif
+
+#ifdef illumos
+static void
dtrace_suspend(void)
{
dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_suspend));
@@ -15344,7 +16569,7 @@
return (0);
}
-#if defined(sun)
+#ifdef illumos
static void
dtrace_cpu_setup_initial(processorid_t cpu)
{
@@ -15389,10 +16614,29 @@
dtrace_toxranges++;
}
+static void
+dtrace_getf_barrier()
+{
+#ifdef illumos
+ /*
+ * When we have unprivileged (that is, non-DTRACE_CRV_KERNEL) enablings
+ * that contain calls to getf(), this routine will be called on every
+ * closef() before either the underlying vnode is released or the
+ * file_t itself is freed. By the time we are here, it is essential
+ * that the file_t can no longer be accessed from a call to getf()
+ * in probe context -- that assures that a dtrace_sync() can be used
+ * to clear out any enablings referring to the old structures.
+ */
+ if (curthread->t_procp->p_zone->zone_dtrace_getf != 0 ||
+ kcred->cr_zone->zone_dtrace_getf != 0)
+ dtrace_sync();
+#endif
+}
+
/*
* DTrace Driver Cookbook Functions
*/
-#if defined(sun)
+#ifdef illumos
/*ARGSUSED*/
static int
dtrace_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
@@ -15504,17 +16748,6 @@
mutex_exit(&cpu_lock);
/*
- * If DTrace helper tracing is enabled, we need to allocate the
- * trace buffer and initialize the values.
- */
- if (dtrace_helptrace_enabled) {
- ASSERT(dtrace_helptrace_buffer == NULL);
- dtrace_helptrace_buffer =
- kmem_zalloc(dtrace_helptrace_bufsize, KM_SLEEP);
- dtrace_helptrace_next = 0;
- }
-
- /*
* If there are already providers, we must ask them to provide their
* probes, and then match any anonymous enabling against them. Note
* that there should be no other retained enablings at this time:
@@ -15559,20 +16792,15 @@
return (DDI_SUCCESS);
}
-#endif
+#endif /* illumos */
-#if !defined(sun)
-#if __FreeBSD_version >= 800039
-static void
-dtrace_dtr(void *data __unused)
-{
-}
+#ifndef illumos
+static void dtrace_dtr(void *);
#endif
-#endif
/*ARGSUSED*/
static int
-#if defined(sun)
+#ifdef illumos
dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
#else
dtrace_open(struct cdev *dev, int oflags, int devtype, struct thread *td)
@@ -15583,7 +16811,7 @@
uid_t uid;
zoneid_t zoneid;
-#if defined(sun)
+#ifdef illumos
if (getminor(*devp) == DTRACEMNRN_HELPER)
return (0);
@@ -15591,30 +16819,11 @@
* If this wasn't an open with the "helper" minor, then it must be
* the "dtrace" minor.
*/
- ASSERT(getminor(*devp) == DTRACEMNRN_DTRACE);
+ if (getminor(*devp) == DTRACEMNRN_DTRACE)
+ return (ENXIO);
#else
cred_t *cred_p = NULL;
-
-#if __FreeBSD_version < 800039
- /*
- * The first minor device is the one that is cloned so there is
- * nothing more to do here.
- */
- if (dev2unit(dev) == 0)
- return 0;
-
- /*
- * Devices are cloned, so if the DTrace state has already
- * been allocated, that means this device belongs to a
- * different client. Each client should open '/dev/dtrace'
- * to get a cloned device.
- */
- if (dev->si_drv1 != NULL)
- return (EBUSY);
-#endif
-
cred_p = dev->si_cred;
-#endif
/*
* If no DTRACE_PRIV_* bits are set in the credential, then the
@@ -15622,12 +16831,7 @@
*/
dtrace_cred2priv(cred_p, &priv, &uid, &zoneid);
if (priv == DTRACE_PRIV_NONE) {
-#if !defined(sun)
-#if __FreeBSD_version < 800039
- /* Destroy the cloned device. */
- destroy_dev(dev);
#endif
-#endif
return (EACCES);
}
@@ -15644,7 +16848,7 @@
dtrace_opens++;
dtrace_membar_producer();
-#if defined(sun)
+#ifdef illumos
/*
* If the kernel debugger is active (that is, if the kernel debugger
* modified text in some way), we won't allow the open.
@@ -15656,36 +16860,34 @@
return (EBUSY);
}
+ if (dtrace_helptrace_enable && dtrace_helptrace_buffer == NULL) {
+ /*
+ * If DTrace helper tracing is enabled, we need to allocate the
+ * trace buffer and initialize the values.
+ */
+ dtrace_helptrace_buffer =
+ kmem_zalloc(dtrace_helptrace_bufsize, KM_SLEEP);
+ dtrace_helptrace_next = 0;
+ dtrace_helptrace_wrapped = 0;
+ dtrace_helptrace_enable = 0;
+ }
+
state = dtrace_state_create(devp, cred_p);
#else
state = dtrace_state_create(dev);
-#if __FreeBSD_version < 800039
- dev->si_drv1 = state;
-#else
devfs_set_cdevpriv(state, dtrace_dtr);
#endif
- /* This code actually belongs in dtrace_attach() */
- if (dtrace_opens == 1)
- dtrace_taskq = taskq_create("dtrace_taskq", 1, maxclsyspri,
- 1, INT_MAX, 0);
-#endif
mutex_exit(&cpu_lock);
if (state == NULL) {
-#if defined(sun)
- if (--dtrace_opens == 0)
+#ifdef illumos
+ if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL)
(void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
#else
--dtrace_opens;
#endif
mutex_exit(&dtrace_lock);
-#if !defined(sun)
-#if __FreeBSD_version < 800039
- /* Destroy the cloned device. */
- destroy_dev(dev);
-#endif
-#endif
return (EAGAIN);
}
@@ -15695,82 +16897,92 @@
}
/*ARGSUSED*/
+#ifdef illumos
static int
-#if defined(sun)
dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
#else
-dtrace_close(struct cdev *dev, int flags, int fmt __unused, struct thread *td)
+static void
+dtrace_dtr(void *data)
#endif
{
-#if defined(sun)
+#ifdef illumos
minor_t minor = getminor(dev);
dtrace_state_t *state;
+#endif
+ dtrace_helptrace_t *buf = NULL;
+#ifdef illumos
if (minor == DTRACEMNRN_HELPER)
return (0);
state = ddi_get_soft_state(dtrace_softstate, minor);
#else
-#if __FreeBSD_version < 800039
- dtrace_state_t *state = dev->si_drv1;
+ dtrace_state_t *state = data;
+#endif
- /* Check if this is not a cloned device. */
- if (dev2unit(dev) == 0)
- return (0);
+ mutex_enter(&cpu_lock);
+ mutex_enter(&dtrace_lock);
+
+#ifdef illumos
+ if (state->dts_anon)
#else
- dtrace_state_t *state;
- devfs_get_cdevpriv((void **) &state);
+ if (state != NULL && state->dts_anon)
#endif
+ {
+ /*
+ * There is anonymous state. Destroy that first.
+ */
+ ASSERT(dtrace_anon.dta_state == NULL);
+ dtrace_state_destroy(state->dts_anon);
+ }
-#endif
+ if (dtrace_helptrace_disable) {
+ /*
+ * If we have been told to disable helper tracing, set the
+ * buffer to NULL before calling into dtrace_state_destroy();
+ * we take advantage of its dtrace_sync() to know that no
+ * CPU is in probe context with enabled helper tracing
+ * after it returns.
+ */
+ buf = dtrace_helptrace_buffer;
+ dtrace_helptrace_buffer = NULL;
+ }
- mutex_enter(&cpu_lock);
- mutex_enter(&dtrace_lock);
-
+#ifdef illumos
+ dtrace_state_destroy(state);
+#else
if (state != NULL) {
- if (state->dts_anon) {
- /*
- * There is anonymous state. Destroy that first.
- */
- ASSERT(dtrace_anon.dta_state == NULL);
- dtrace_state_destroy(state->dts_anon);
- }
-
dtrace_state_destroy(state);
-
-#if !defined(sun)
kmem_free(state, 0);
-#if __FreeBSD_version < 800039
- dev->si_drv1 = NULL;
+ }
#endif
-#endif
- }
+ ASSERT(dtrace_opens > 0);
- ASSERT(dtrace_opens > 0);
-#if defined(sun)
- if (--dtrace_opens == 0)
+#ifdef illumos
+ /*
+ * Only relinquish control of the kernel debugger interface when there
+ * are no consumers and no anonymous enablings.
+ */
+ if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL)
(void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
#else
--dtrace_opens;
- /* This code actually belongs in dtrace_detach() */
- if ((dtrace_opens == 0) && (dtrace_taskq != NULL)) {
- taskq_destroy(dtrace_taskq);
- dtrace_taskq = NULL;
- }
#endif
+ if (buf != NULL) {
+ kmem_free(buf, dtrace_helptrace_bufsize);
+ dtrace_helptrace_disable = 0;
+ }
+
mutex_exit(&dtrace_lock);
mutex_exit(&cpu_lock);
-#if __FreeBSD_version < 800039
- /* Schedule this cloned device to be destroyed. */
- destroy_dev_sched(dev);
+#ifdef illumos
+ return (0);
#endif
-
- return (0);
}
-#if defined(sun)
+#ifdef illumos
/*ARGSUSED*/
static int
dtrace_ioctl_helper(int cmd, intptr_t arg, int *rv)
@@ -16385,6 +17597,7 @@
desc.dtbd_drops = buf->dtb_drops;
desc.dtbd_errors = buf->dtb_errors;
desc.dtbd_oldest = buf->dtb_xamot_offset;
+ desc.dtbd_timestamp = dtrace_gethrtime();
mutex_exit(&dtrace_lock);
@@ -16437,6 +17650,7 @@
desc.dtbd_drops = buf->dtb_xamot_drops;
desc.dtbd_errors = buf->dtb_xamot_errors;
desc.dtbd_oldest = 0;
+ desc.dtbd_timestamp = buf->dtb_switched;
mutex_exit(&dtrace_lock);
@@ -16652,13 +17866,11 @@
dtrace_modload = NULL;
dtrace_modunload = NULL;
+ ASSERT(dtrace_getf == 0);
+ ASSERT(dtrace_closef == NULL);
+
mutex_exit(&cpu_lock);
- if (dtrace_helptrace_enabled) {
- kmem_free(dtrace_helptrace_buffer, dtrace_helptrace_bufsize);
- dtrace_helptrace_buffer = NULL;
- }
-
kmem_free(dtrace_probes, dtrace_nprobes * sizeof (dtrace_probe_t *));
dtrace_probes = NULL;
dtrace_nprobes = 0;
@@ -16709,7 +17921,7 @@
}
#endif
-#if defined(sun)
+#ifdef illumos
/*ARGSUSED*/
static int
dtrace_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
@@ -16732,7 +17944,7 @@
}
#endif
-#if defined(sun)
+#ifdef illumos
static struct cb_ops dtrace_cb_ops = {
dtrace_open, /* open */
dtrace_close, /* close */
@@ -16800,14 +18012,8 @@
static d_ioctl_t dtrace_ioctl_helper;
static void dtrace_load(void *);
static int dtrace_unload(void);
-#if __FreeBSD_version < 800039
-static void dtrace_clone(void *, struct ucred *, char *, int , struct cdev **);
-static struct clonedevs *dtrace_clones; /* Ptr to the array of cloned devices. */
-static eventhandler_tag eh_tag; /* Event handler tag. */
-#else
static struct cdev *dtrace_dev;
static struct cdev *helper_dev;
-#endif
void dtrace_invop_init(void);
void dtrace_invop_uninit(void);
@@ -16814,8 +18020,6 @@
static struct cdevsw dtrace_cdevsw = {
.d_version = D_VERSION,
- .d_flags = D_TRACKCLOSE | D_NEEDMINOR,
- .d_close = dtrace_close,
.d_ioctl = dtrace_ioctl,
.d_open = dtrace_open,
.d_name = "dtrace",
@@ -16823,15 +18027,11 @@
static struct cdevsw helper_cdevsw = {
.d_version = D_VERSION,
- .d_flags = D_TRACKCLOSE | D_NEEDMINOR,
.d_ioctl = dtrace_ioctl_helper,
.d_name = "helper",
};
#include <dtrace_anon.c>
-#if __FreeBSD_version < 800039
-#include <dtrace_clone.c>
-#endif
#include <dtrace_ioctl.c>
#include <dtrace_load.c>
#include <dtrace_modevent.c>
@@ -16847,6 +18047,5 @@
DEV_MODULE(dtrace, dtrace_modevent, NULL);
MODULE_VERSION(dtrace, 1);
-MODULE_DEPEND(dtrace, cyclic, 1, 1, 1);
MODULE_DEPEND(dtrace, opensolaris, 1, 1, 1);
#endif
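
The dtrace_close() changes above clear dtrace_helptrace_buffer while the
DTrace locks are held and rely on the dtrace_sync() performed inside
dtrace_state_destroy() before the buffer is freed. As a rough illustration of
that unpublish, synchronize, then free ordering, here is a minimal userspace
sketch; the names (trace_buf, trace_sync, trace_teardown) are hypothetical and
a pthread lock round-trip stands in for the kernel's cross-CPU
synchronization, so this is an analogue of the pattern rather than the driver
code itself.

    #include <stdlib.h>
    #include <pthread.h>

    static pthread_mutex_t trace_lock = PTHREAD_MUTEX_INITIALIZER;
    static char *trace_buf;          /* analogue of dtrace_helptrace_buffer */

    /*
     * Stand-in for dtrace_sync(): consumers only dereference trace_buf while
     * holding trace_lock, so a single lock round-trip guarantees that nobody
     * is still using a pointer loaded before the NULL store below.
     */
    static void
    trace_sync(void)
    {
            pthread_mutex_lock(&trace_lock);
            pthread_mutex_unlock(&trace_lock);
    }

    void
    trace_teardown(void)
    {
            char *buf;

            pthread_mutex_lock(&trace_lock);
            buf = trace_buf;
            trace_buf = NULL;           /* 1. unpublish the buffer */
            pthread_mutex_unlock(&trace_lock);

            trace_sync();               /* 2. wait out existing consumers */
            free(buf);                  /* 3. the free can no longer race */
    }
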
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/dtrace/fasttrap.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/dtrace/fasttrap.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/dtrace/fasttrap.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -20,7 +21,7 @@
*
* Portions Copyright 2010 The FreeBSD Foundation
*
- * $FreeBSD: release/9.2.0/sys/cddl/contrib/opensolaris/uts/common/dtrace/fasttrap.c 253394 2013-07-16 15:51:32Z avg $
+ * $FreeBSD: stable/10/sys/cddl/contrib/opensolaris/uts/common/dtrace/fasttrap.c 299003 2016-05-03 20:08:05Z markj $
*/
/*
@@ -28,9 +29,9 @@
* Use is subject to license terms.
*/
-#if defined(sun)
-#pragma ident "%Z%%M% %I% %E% SMI"
-#endif
+/*
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ */
#include <sys/atomic.h>
#include <sys/errno.h>
@@ -38,13 +39,13 @@
#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/systm.h>
-#if defined(sun)
+#ifdef illumos
#include <sys/ddi.h>
#endif
#include <sys/sunddi.h>
#include <sys/cpuvar.h>
#include <sys/kmem.h>
-#if defined(sun)
+#ifdef illumos
#include <sys/strsubr.h>
#endif
#include <sys/fasttrap.h>
@@ -55,14 +56,24 @@
#include <sys/sysmacros.h>
#include <sys/proc.h>
#include <sys/policy.h>
-#if defined(sun)
+#ifdef illumos
#include <util/qsort.h>
#endif
#include <sys/mutex.h>
#include <sys/kernel.h>
-#if !defined(sun)
+#ifndef illumos
+#include <sys/dtrace_bsd.h>
+#include <sys/eventhandler.h>
+#include <sys/rmlock.h>
+#include <sys/sysctl.h>
+#include <sys/u8_textprep.h>
#include <sys/user.h>
-#include <sys/dtrace_bsd.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_param.h>
+
#include <cddl/dev/dtrace/dtrace_cddl.h>
#endif
@@ -155,9 +166,9 @@
static struct cdev *fasttrap_cdev;
static dtrace_meta_provider_id_t fasttrap_meta_id;
-static struct callout fasttrap_timeout;
+static struct proc *fasttrap_cleanup_proc;
static struct mtx fasttrap_cleanup_mtx;
-static uint_t fasttrap_cleanup_work;
+static uint_t fasttrap_cleanup_work, fasttrap_cleanup_drain, fasttrap_cleanup_cv;
/*
* Generation count on modifications to the global tracepoint lookup table.
@@ -166,13 +177,14 @@
/*
* When the fasttrap provider is loaded, fasttrap_max is set to either
- * FASTTRAP_MAX_DEFAULT or the value for fasttrap-max-probes in the
- * fasttrap.conf file. Each time a probe is created, fasttrap_total is
- * incremented by the number of tracepoints that may be associated with that
- * probe; fasttrap_total is capped at fasttrap_max.
+ * FASTTRAP_MAX_DEFAULT, or the value for fasttrap-max-probes in the
+ * fasttrap.conf file (Illumos), or the value provided in loader.conf (FreeBSD).
+ * Each time a probe is created, fasttrap_total is incremented by the number
+ * of tracepoints that may be associated with that probe; fasttrap_total is capped
+ * at fasttrap_max.
*/
#define FASTTRAP_MAX_DEFAULT 250000
-static uint32_t fasttrap_max;
+static uint32_t fasttrap_max = FASTTRAP_MAX_DEFAULT;
static uint32_t fasttrap_total;
/*
@@ -206,15 +218,31 @@
static fasttrap_proc_t *fasttrap_proc_lookup(pid_t);
static void fasttrap_proc_release(fasttrap_proc_t *);
+#ifndef illumos
+static void fasttrap_thread_dtor(void *, struct thread *);
+#endif
+
#define FASTTRAP_PROVS_INDEX(pid, name) \
((fasttrap_hash_str(name) + (pid)) & fasttrap_provs.fth_mask)
#define FASTTRAP_PROCS_INDEX(pid) ((pid) & fasttrap_procs.fth_mask)
-#if !defined(sun)
-static kmutex_t fasttrap_cpuc_pid_lock[MAXCPU];
+#ifndef illumos
+struct rmlock fasttrap_tp_lock;
+static eventhandler_tag fasttrap_thread_dtor_tag;
#endif
+static unsigned long tpoints_hash_size = FASTTRAP_TPOINTS_DEFAULT_SIZE;
+
+#ifdef __FreeBSD__
+SYSCTL_DECL(_kern_dtrace);
+SYSCTL_NODE(_kern_dtrace, OID_AUTO, fasttrap, CTLFLAG_RD, 0, "DTrace fasttrap parameters");
+SYSCTL_UINT(_kern_dtrace_fasttrap, OID_AUTO, max_probes, CTLFLAG_RWTUN, &fasttrap_max,
+ FASTTRAP_MAX_DEFAULT, "Maximum number of fasttrap probes");
+SYSCTL_ULONG(_kern_dtrace_fasttrap, OID_AUTO, tpoints_hash_size, CTLFLAG_RDTUN, &tpoints_hash_size,
+ FASTTRAP_TPOINTS_DEFAULT_SIZE, "Size of the tracepoint hash table");
+#endif
+
static int
fasttrap_highbit(ulong_t i)
{
@@ -263,7 +291,7 @@
void
fasttrap_sigtrap(proc_t *p, kthread_t *t, uintptr_t pc)
{
-#if defined(sun)
+#ifdef illumos
sigqueue_t *sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP);
sqp->sq_info.si_signo = SIGTRAP;
@@ -284,12 +312,124 @@
ksi->ksi_code = TRAP_DTRACE;
ksi->ksi_addr = (caddr_t)pc;
PROC_LOCK(p);
- (void) tdksignal(t, SIGTRAP, ksi);
+ (void) tdsendsignal(p, t, SIGTRAP, ksi);
PROC_UNLOCK(p);
#endif
}
+#ifndef illumos
/*
+ * Obtain a chunk of scratch space in the address space of the target process.
+ */
+fasttrap_scrspace_t *
+fasttrap_scraddr(struct thread *td, fasttrap_proc_t *fprc)
+{
+ fasttrap_scrblock_t *scrblk;
+ fasttrap_scrspace_t *scrspc;
+ struct proc *p;
+ vm_offset_t addr;
+ int error, i;
+
+ scrspc = NULL;
+ if (td->t_dtrace_sscr != NULL) {
+ /* If the thread already has scratch space, we're done. */
+ scrspc = (fasttrap_scrspace_t *)td->t_dtrace_sscr;
+ return (scrspc);
+ }
+
+ p = td->td_proc;
+
+ mutex_enter(&fprc->ftpc_mtx);
+ if (LIST_EMPTY(&fprc->ftpc_fscr)) {
+ /*
+ * No scratch space is available, so we'll map a new scratch
+ * space block into the traced process' address space.
+ */
+ addr = 0;
+ error = vm_map_find(&p->p_vmspace->vm_map, NULL, 0, &addr,
+ FASTTRAP_SCRBLOCK_SIZE, 0, VMFS_ANY_SPACE, VM_PROT_ALL,
+ VM_PROT_ALL, 0);
+ if (error != KERN_SUCCESS)
+ goto done;
+
+ scrblk = malloc(sizeof(*scrblk), M_SOLARIS, M_WAITOK);
+ scrblk->ftsb_addr = addr;
+ LIST_INSERT_HEAD(&fprc->ftpc_scrblks, scrblk, ftsb_next);
+
+ /*
+ * Carve the block up into chunks and put them on the free list.
+ */
+ for (i = 0;
+ i < FASTTRAP_SCRBLOCK_SIZE / FASTTRAP_SCRSPACE_SIZE; i++) {
+ scrspc = malloc(sizeof(*scrspc), M_SOLARIS, M_WAITOK);
+ scrspc->ftss_addr = addr +
+ i * FASTTRAP_SCRSPACE_SIZE;
+ LIST_INSERT_HEAD(&fprc->ftpc_fscr, scrspc,
+ ftss_next);
+ }
+ }
+
+ /*
+ * Take the first scratch chunk off the free list, put it on the
+ * allocated list, and return its address.
+ */
+ scrspc = LIST_FIRST(&fprc->ftpc_fscr);
+ LIST_REMOVE(scrspc, ftss_next);
+ LIST_INSERT_HEAD(&fprc->ftpc_ascr, scrspc, ftss_next);
+
+ /*
+ * This scratch space is reserved for use by td until the thread exits.
+ */
+ td->t_dtrace_sscr = scrspc;
+
+done:
+ mutex_exit(&fprc->ftpc_mtx);
+
+ return (scrspc);
+}
+
+/*
+ * Return any allocated per-thread scratch space chunks back to the process'
+ * free list.
+ */
+static void
+fasttrap_thread_dtor(void *arg __unused, struct thread *td)
+{
+ fasttrap_bucket_t *bucket;
+ fasttrap_proc_t *fprc;
+ fasttrap_scrspace_t *scrspc;
+ pid_t pid;
+
+ if (td->t_dtrace_sscr == NULL)
+ return;
+
+ pid = td->td_proc->p_pid;
+ bucket = &fasttrap_procs.fth_table[FASTTRAP_PROCS_INDEX(pid)];
+ fprc = NULL;
+
+ /* Look up the fasttrap process handle for this process. */
+ mutex_enter(&bucket->ftb_mtx);
+ for (fprc = bucket->ftb_data; fprc != NULL; fprc = fprc->ftpc_next) {
+ if (fprc->ftpc_pid == pid) {
+ mutex_enter(&fprc->ftpc_mtx);
+ mutex_exit(&bucket->ftb_mtx);
+ break;
+ }
+ }
+ if (fprc == NULL) {
+ mutex_exit(&bucket->ftb_mtx);
+ return;
+ }
+
+ scrspc = (fasttrap_scrspace_t *)td->t_dtrace_sscr;
+ LIST_REMOVE(scrspc, ftss_next);
+ LIST_INSERT_HEAD(&fprc->ftpc_fscr, scrspc, ftss_next);
+
+ mutex_exit(&fprc->ftpc_mtx);
+}
+#endif
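
fasttrap_scraddr() above maps one large scratch block into the traced process
with vm_map_find() and then carves it into fixed-size chunks kept on
per-process free and allocated lists. The sketch below shows the same
carve-and-free-list idea in ordinary userspace C; the sizes and names
(SCRBLOCK_SIZE, struct chunk, carve_block) are illustrative only, and malloc()
stands in for the kernel mapping.

    #include <stdlib.h>
    #include <sys/queue.h>

    #define SCRBLOCK_SIZE   (64 * 1024)     /* illustrative block size */
    #define SCRSPACE_SIZE   64              /* illustrative chunk size */

    struct chunk {
            LIST_ENTRY(chunk) link;
            char *addr;
    };
    static LIST_HEAD(, chunk) freelist = LIST_HEAD_INITIALIZER(freelist);

    /* Carve one big block into fixed-size chunks on the free list. */
    static int
    carve_block(void)
    {
            char *base = malloc(SCRBLOCK_SIZE);     /* vm_map_find() analogue */

            if (base == NULL)
                    return (-1);
            for (size_t i = 0; i < SCRBLOCK_SIZE / SCRSPACE_SIZE; i++) {
                    struct chunk *c = malloc(sizeof(*c));

                    if (c == NULL)
                            return (-1);
                    c->addr = base + i * SCRSPACE_SIZE;
                    LIST_INSERT_HEAD(&freelist, c, link);
            }
            return (0);
    }

    /* Hand out a chunk, refilling when the pool runs dry, as fasttrap_scraddr() does. */
    struct chunk *
    chunk_get(void)
    {
            struct chunk *c;

            if (LIST_EMPTY(&freelist) && carve_block() != 0)
                    return (NULL);
            c = LIST_FIRST(&freelist);
            LIST_REMOVE(c, link);
            return (c);
    }
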
+
+/*
* This function ensures that no threads are actively using the memory
* associated with probes that were formerly live.
*/
@@ -303,15 +443,23 @@
fasttrap_mod_gen++;
+#ifdef illumos
CPU_FOREACH(i) {
mutex_enter(&fasttrap_cpuc_pid_lock[i]);
mutex_exit(&fasttrap_cpuc_pid_lock[i]);
}
+#else
+ rm_wlock(&fasttrap_tp_lock);
+ rm_wunlock(&fasttrap_tp_lock);
+#endif
}
/*
- * This is the timeout's callback for cleaning up the providers and their
- * probes.
+ * This function performs asynchronous cleanup of fasttrap providers. The
+ * Solaris implementation of this mechanism uses a timeout that's activated in
+ * fasttrap_pid_cleanup(), but this doesn't work in FreeBSD: one may sleep while
+ * holding the DTrace mutexes, but it is unsafe to sleep in a callout handler.
+ * Thus we use a dedicated process to perform the cleanup when requested.
*/
/*ARGSUSED*/
static void
@@ -322,11 +470,8 @@
dtrace_provider_id_t provid;
int i, later = 0, rval;
- static volatile int in = 0;
- ASSERT(in == 0);
- in = 1;
-
- while (fasttrap_cleanup_work) {
+ mtx_lock(&fasttrap_cleanup_mtx);
+ while (!fasttrap_cleanup_drain || later > 0) {
fasttrap_cleanup_work = 0;
mtx_unlock(&fasttrap_cleanup_mtx);
@@ -397,39 +542,32 @@
}
mutex_exit(&bucket->ftb_mtx);
}
+ mtx_lock(&fasttrap_cleanup_mtx);
- mtx_lock(&fasttrap_cleanup_mtx);
+ /*
+ * If we were unable to retire a provider, try again after a
+ * second. This situation can occur in certain circumstances
+ * where providers cannot be unregistered even though they have
+ * no probes enabled because of an execution of dtrace -l or
+ * something similar.
+ */
+ if (later > 0 || fasttrap_cleanup_work ||
+ fasttrap_cleanup_drain) {
+ mtx_unlock(&fasttrap_cleanup_mtx);
+ pause("ftclean", hz);
+ mtx_lock(&fasttrap_cleanup_mtx);
+ } else
+ mtx_sleep(&fasttrap_cleanup_cv, &fasttrap_cleanup_mtx,
+ 0, "ftcl", 0);
}
-#if 0
- ASSERT(fasttrap_timeout != 0);
-#endif
-
/*
- * If we were unable to remove a retired provider, try again after
- * a second. This situation can occur in certain circumstances where
- * providers cannot be unregistered even though they have no probes
- * enabled because of an execution of dtrace -l or something similar.
- * If the timeout has been disabled (set to 1 because we're trying
- * to detach), we set fasttrap_cleanup_work to ensure that we'll
- * get a chance to do that work if and when the timeout is reenabled
- * (if detach fails).
+ * Wake up the thread in fasttrap_unload() now that we're done.
*/
- if (later > 0) {
- if (callout_active(&fasttrap_timeout)) {
- callout_reset(&fasttrap_timeout, hz,
- &fasttrap_pid_cleanup_cb, NULL);
- }
-
- else if (later > 0)
- fasttrap_cleanup_work = 1;
- } else {
-#if !defined(sun)
- /* Nothing to be done for FreeBSD */
-#endif
- }
+ wakeup(&fasttrap_cleanup_drain);
+ mtx_unlock(&fasttrap_cleanup_mtx);
- in = 0;
+ kthread_exit();
}
/*
@@ -440,8 +578,10 @@
{
mtx_lock(&fasttrap_cleanup_mtx);
- fasttrap_cleanup_work = 1;
- callout_reset(&fasttrap_timeout, 1, &fasttrap_pid_cleanup_cb, NULL);
+ if (!fasttrap_cleanup_work) {
+ fasttrap_cleanup_work = 1;
+ wakeup(&fasttrap_cleanup_cv);
+ }
mtx_unlock(&fasttrap_cleanup_mtx);
}
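
The hunks above replace the old callout with a dedicated cleanup process:
fasttrap_pid_cleanup() now only sets fasttrap_cleanup_work and wakes the
thread, which loops until fasttrap_cleanup_drain is raised and may sleep
freely while it works. A compact userspace analogue of that work/drain
handshake, with hypothetical names and pthread primitives in place of
mtx_sleep()/wakeup(), might look like this:

    #include <pthread.h>

    static pthread_mutex_t cl_mtx = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t cl_cv = PTHREAD_COND_INITIALIZER;
    static int cl_work, cl_drain;

    static void
    do_cleanup_pass(void)
    {
            /* ... retire idle providers, as fasttrap_pid_cleanup_cb() does ... */
    }

    static void *
    cleanup_thread(void *arg)
    {
            (void)arg;
            pthread_mutex_lock(&cl_mtx);
            for (;;) {
                    if (cl_work) {
                            cl_work = 0;
                            pthread_mutex_unlock(&cl_mtx);
                            do_cleanup_pass();      /* may sleep freely here */
                            pthread_mutex_lock(&cl_mtx);
                            continue;
                    }
                    if (cl_drain)
                            break;
                    pthread_cond_wait(&cl_cv, &cl_mtx);
            }
            pthread_mutex_unlock(&cl_mtx);
            return (NULL);
    }

    /* Analogue of fasttrap_pid_cleanup(): queue work and poke the thread. */
    void
    cleanup_request(void)
    {
            pthread_mutex_lock(&cl_mtx);
            if (!cl_work) {
                    cl_work = 1;
                    pthread_cond_signal(&cl_cv);
            }
            pthread_mutex_unlock(&cl_mtx);
    }

    /* Analogue of the unload path: raise the drain flag and wait for the thread. */
    void
    cleanup_drain(pthread_t tid)
    {
            pthread_mutex_lock(&cl_mtx);
            cl_drain = 1;
            pthread_cond_signal(&cl_cv);
            pthread_mutex_unlock(&cl_mtx);
            pthread_join(tid, NULL);
    }

The kernel version sleeps on &fasttrap_cleanup_drain instead of joining, but
the shape is the same: set the flag, wake the worker, then wait for it to
acknowledge.
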
@@ -454,16 +594,20 @@
static void
fasttrap_fork(proc_t *p, proc_t *cp)
{
+#ifndef illumos
+ fasttrap_scrblock_t *scrblk;
+ fasttrap_proc_t *fprc = NULL;
+#endif
pid_t ppid = p->p_pid;
int i;
-#if defined(sun)
+#ifdef illumos
ASSERT(curproc == p);
ASSERT(p->p_proc_flag & P_PR_LOCK);
#else
PROC_LOCK_ASSERT(p, MA_OWNED);
#endif
-#if defined(sun)
+#ifdef illumos
ASSERT(p->p_dtrace_count > 0);
#else
if (p->p_dtrace_helpers) {
@@ -502,12 +646,18 @@
* We don't have to worry about the child process disappearing
* because we're in fork().
*/
-#if defined(sun)
+#ifdef illumos
mtx_lock_spin(&cp->p_slock);
sprlock_proc(cp);
mtx_unlock_spin(&cp->p_slock);
#else
+ /*
+ * fasttrap_tracepoint_remove() expects the child process to be
+ * unlocked and the VM then expects curproc to be unlocked.
+ */
_PHOLD(cp);
+ PROC_UNLOCK(cp);
+ PROC_UNLOCK(p);
#endif
/*
@@ -533,15 +683,36 @@
* mid-fork.
*/
ASSERT(tp->ftt_proc->ftpc_acount != 0);
+#ifndef illumos
+ fprc = tp->ftt_proc;
+#endif
}
}
mutex_exit(&bucket->ftb_mtx);
+
+#ifndef illumos
+ /*
+ * Unmap any scratch space inherited from the parent's address
+ * space.
+ */
+ if (fprc != NULL) {
+ mutex_enter(&fprc->ftpc_mtx);
+ LIST_FOREACH(scrblk, &fprc->ftpc_scrblks, ftsb_next) {
+ vm_map_remove(&cp->p_vmspace->vm_map,
+ scrblk->ftsb_addr,
+ scrblk->ftsb_addr + FASTTRAP_SCRBLOCK_SIZE);
+ }
+ mutex_exit(&fprc->ftpc_mtx);
+ }
+#endif
}
-#if defined(sun)
+#ifdef illumos
mutex_enter(&cp->p_lock);
sprunlock(cp);
#else
+ PROC_LOCK(p);
+ PROC_LOCK(cp);
_PRELE(cp);
#endif
}
@@ -554,12 +725,24 @@
static void
fasttrap_exec_exit(proc_t *p)
{
-#if defined(sun)
+#ifndef illumos
+ struct thread *td;
+#endif
+
+#ifdef illumos
ASSERT(p == curproc);
-#endif
+#else
PROC_LOCK_ASSERT(p, MA_OWNED);
_PHOLD(p);
+ /*
+ * Since struct threads may be recycled, we cannot rely on the
+ * t_dtrace_sscr field being zeroed by kdtrace_thread_ctor. Thus we must
+ * zero it ourselves when a process exits.
+ */
+ FOREACH_THREAD_IN_PROC(p, td)
+ td->t_dtrace_sscr = NULL;
PROC_UNLOCK(p);
+#endif
/*
* We clean up the pid provider for this process here; user-land
@@ -566,12 +749,12 @@
* static probes are handled by the meta-provider remove entry point.
*/
fasttrap_provider_retire(p->p_pid, FASTTRAP_PID_NAME, 0);
-#if !defined(sun)
+#ifndef illumos
if (p->p_dtrace_helpers)
dtrace_helpers_destroy(p);
-#endif
PROC_LOCK(p);
_PRELE(p);
+#endif
}
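
The FOREACH_THREAD_IN_PROC loop above is needed because struct thread objects
are recycled, so a constructor that runs only when an object is first
allocated cannot be trusted to clear per-use state. The toy cache below
(hypothetical names) shows the same hazard in miniature: recycled objects come
back exactly as they were released, so the release path has to reset the field
itself.

    #include <stdlib.h>

    struct obj {
            struct obj *next;       /* free-list linkage */
            void *scratch;          /* analogue of td->t_dtrace_sscr */
    };
    static struct obj *obj_freelist;

    struct obj *
    obj_get(void)
    {
            struct obj *o = obj_freelist;

            if (o != NULL) {
                    obj_freelist = o->next; /* recycled: no constructor runs */
                    return (o);
            }
            return (calloc(1, sizeof(*o))); /* only a fresh object starts zeroed */
    }

    void
    obj_put(struct obj *o)
    {
            o->scratch = NULL;      /* must be cleared here, like t_dtrace_sscr */
            o->next = obj_freelist;
            obj_freelist = o;
    }
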
@@ -601,7 +784,7 @@
ASSERT(probe->ftp_tps[index].fit_tp->ftt_pid == pid);
-#if defined(sun)
+#ifdef illumos
ASSERT(!(p->p_flag & SVFORK));
#endif
@@ -709,7 +892,7 @@
* Increment the count of the number of tracepoints active in
* the victim process.
*/
-#if defined(sun)
+#ifdef illumos
ASSERT(p->p_proc_flag & P_PR_LOCK);
#endif
p->p_dtrace_count++;
@@ -901,7 +1084,7 @@
* Decrement the count of the number of tracepoints active
* in the victim process.
*/
-#if defined(sun)
+#ifdef illumos
ASSERT(p->p_proc_flag & P_PR_LOCK);
#endif
p->p_dtrace_count--;
@@ -954,7 +1137,7 @@
static void
fasttrap_disable_callbacks(void)
{
-#if defined(sun)
+#ifdef illumos
ASSERT(MUTEX_HELD(&cpu_lock));
#endif
@@ -963,7 +1146,7 @@
ASSERT(fasttrap_pid_count > 0);
fasttrap_pid_count--;
if (fasttrap_pid_count == 0) {
-#if defined(sun)
+#ifdef illumos
cpu_t *cur, *cpu = CPU;
for (cur = cpu->cpu_next_onln; cur != cpu;
@@ -973,7 +1156,7 @@
#endif
dtrace_pid_probe_ptr = NULL;
dtrace_return_probe_ptr = NULL;
-#if defined(sun)
+#ifdef illumos
for (cur = cpu->cpu_next_onln; cur != cpu;
cur = cur->cpu_next_onln) {
rw_exit(&cur->cpu_ft_lock);
@@ -991,11 +1174,10 @@
proc_t *p = NULL;
int i, rc;
-
ASSERT(probe != NULL);
ASSERT(!probe->ftp_enabled);
ASSERT(id == probe->ftp_id);
-#if defined(sun)
+#ifdef illumos
ASSERT(MUTEX_HELD(&cpu_lock));
#endif
@@ -1022,7 +1204,7 @@
* a fork in which the traced process is being born and we're copying
* USDT probes. Otherwise, the process is gone so bail.
*/
-#if defined(sun)
+#ifdef illumos
if ((p = sprlock(probe->ftp_pid)) == NULL) {
if ((curproc->p_flag & SFORKING) == 0)
return;
@@ -1090,7 +1272,7 @@
i--;
}
-#if defined(sun)
+#ifdef illumos
mutex_enter(&p->p_lock);
sprunlock(p);
#else
@@ -1105,7 +1287,7 @@
return;
}
}
-#if defined(sun)
+#ifdef illumos
mutex_enter(&p->p_lock);
sprunlock(p);
#else
@@ -1136,8 +1318,13 @@
*/
if ((p = pfind(probe->ftp_pid)) != NULL) {
#ifdef __FreeBSD__
- _PHOLD(p);
- PROC_UNLOCK(p);
+ if (p->p_flag & P_WEXIT) {
+ PROC_UNLOCK(p);
+ p = NULL;
+ } else {
+ _PHOLD(p);
+ PROC_UNLOCK(p);
+ }
#endif
}
@@ -1184,7 +1371,7 @@
probe->ftp_enabled = 0;
-#if defined(sun)
+#ifdef illumos
ASSERT(MUTEX_HELD(&cpu_lock));
#endif
fasttrap_disable_callbacks();
@@ -1307,7 +1494,7 @@
mutex_enter(&fprc->ftpc_mtx);
mutex_exit(&bucket->ftb_mtx);
fprc->ftpc_rcount++;
- atomic_add_64(&fprc->ftpc_acount, 1);
+ atomic_inc_64(&fprc->ftpc_acount);
ASSERT(fprc->ftpc_acount <= fprc->ftpc_rcount);
mutex_exit(&fprc->ftpc_mtx);
@@ -1325,7 +1512,7 @@
new_fprc->ftpc_pid = pid;
new_fprc->ftpc_rcount = 1;
new_fprc->ftpc_acount = 1;
-#if !defined(sun)
+#ifndef illumos
mutex_init(&new_fprc->ftpc_mtx, "fasttrap proc mtx", MUTEX_DEFAULT,
NULL);
#endif
@@ -1341,7 +1528,7 @@
mutex_enter(&fprc->ftpc_mtx);
mutex_exit(&bucket->ftb_mtx);
fprc->ftpc_rcount++;
- atomic_add_64(&fprc->ftpc_acount, 1);
+ atomic_inc_64(&fprc->ftpc_acount);
ASSERT(fprc->ftpc_acount <= fprc->ftpc_rcount);
mutex_exit(&fprc->ftpc_mtx);
@@ -1365,6 +1552,12 @@
fasttrap_bucket_t *bucket;
fasttrap_proc_t *fprc, **fprcp;
pid_t pid = proc->ftpc_pid;
+#ifndef illumos
+ fasttrap_scrblock_t *scrblk, *scrblktmp;
+ fasttrap_scrspace_t *scrspc, *scrspctmp;
+ struct proc *p;
+ struct thread *td;
+#endif
mutex_enter(&proc->ftpc_mtx);
@@ -1376,6 +1569,31 @@
return;
}
+#ifndef illumos
+ /*
+ * Free all structures used to manage per-thread scratch space.
+ */
+ LIST_FOREACH_SAFE(scrblk, &proc->ftpc_scrblks, ftsb_next,
+ scrblktmp) {
+ LIST_REMOVE(scrblk, ftsb_next);
+ free(scrblk, M_SOLARIS);
+ }
+ LIST_FOREACH_SAFE(scrspc, &proc->ftpc_fscr, ftss_next, scrspctmp) {
+ LIST_REMOVE(scrspc, ftss_next);
+ free(scrspc, M_SOLARIS);
+ }
+ LIST_FOREACH_SAFE(scrspc, &proc->ftpc_ascr, ftss_next, scrspctmp) {
+ LIST_REMOVE(scrspc, ftss_next);
+ free(scrspc, M_SOLARIS);
+ }
+
+ if ((p = pfind(pid)) != NULL) {
+ FOREACH_THREAD_IN_PROC(p, td)
+ td->t_dtrace_sscr = NULL;
+ PROC_UNLOCK(p);
+ }
+#endif
+
mutex_exit(&proc->ftpc_mtx);
/*
@@ -1473,7 +1691,7 @@
new_fp = kmem_zalloc(sizeof (fasttrap_provider_t), KM_SLEEP);
new_fp->ftp_pid = pid;
new_fp->ftp_proc = fasttrap_proc_lookup(pid);
-#if !defined(sun)
+#ifndef illumos
mutex_init(&new_fp->ftp_mtx, "provider mtx", MUTEX_DEFAULT, NULL);
mutex_init(&new_fp->ftp_cmtx, "lock on creating", MUTEX_DEFAULT, NULL);
#endif
@@ -1547,7 +1765,7 @@
* count of active providers on the associated process structure.
*/
if (!provider->ftp_retired) {
- atomic_add_64(&provider->ftp_proc->ftpc_acount, -1);
+ atomic_dec_64(&provider->ftp_proc->ftpc_acount);
ASSERT(provider->ftp_proc->ftpc_acount <
provider->ftp_proc->ftpc_rcount);
}
@@ -1554,7 +1772,7 @@
fasttrap_proc_release(provider->ftp_proc);
-#if !defined(sun)
+#ifndef illumos
mutex_destroy(&provider->ftp_mtx);
mutex_destroy(&provider->ftp_cmtx);
#endif
@@ -1572,7 +1790,7 @@
}
p->p_dtrace_probes--;
-#if !defined(sun)
+#ifndef illumos
PROC_UNLOCK(p);
#endif
}
@@ -1623,7 +1841,7 @@
* bucket lock therefore protects the integrity of the provider hash
* table.
*/
- atomic_add_64(&fp->ftp_proc->ftpc_acount, -1);
+ atomic_dec_64(&fp->ftp_proc->ftpc_acount);
ASSERT(fp->ftp_proc->ftpc_acount < fp->ftp_proc->ftpc_rcount);
fp->ftp_retired = 1;
@@ -1719,10 +1937,10 @@
pdata->ftps_mod, pdata->ftps_func, name_str) != 0)
continue;
- atomic_add_32(&fasttrap_total, 1);
+ atomic_inc_32(&fasttrap_total);
if (fasttrap_total > fasttrap_max) {
- atomic_add_32(&fasttrap_total, -1);
+ atomic_dec_32(&fasttrap_total);
goto no_mem;
}
@@ -2062,20 +2280,15 @@
return (EAGAIN);
if (cmd == FASTTRAPIOC_MAKEPROBE) {
- fasttrap_probe_spec_t *uprobe = (void *)arg;
+ fasttrap_probe_spec_t *uprobe = *(fasttrap_probe_spec_t **)arg;
fasttrap_probe_spec_t *probe;
uint64_t noffs;
size_t size;
- int ret;
- char *c;
+ int ret, err;
-#if defined(sun)
if (copyin(&uprobe->ftps_noffs, &noffs,
sizeof (uprobe->ftps_noffs)))
return (EFAULT);
-#else
- noffs = uprobe->ftps_noffs;
-#endif
/*
* Probes must have at least one tracepoint.
@@ -2091,36 +2304,26 @@
probe = kmem_alloc(size, KM_SLEEP);
-#if defined(sun)
- if (copyin(uprobe, probe, size) != 0) {
+ if (copyin(uprobe, probe, size) != 0 ||
+ probe->ftps_noffs != noffs) {
kmem_free(probe, size);
return (EFAULT);
}
-#else
- memcpy(probe, uprobe, sizeof(*probe));
- if (noffs > 1 && copyin(uprobe + 1, probe + 1, size) != 0) {
- kmem_free(probe, size);
- return (EFAULT);
- }
-#endif
-
/*
* Verify that the function and module strings contain no
* funny characters.
*/
- for (c = &probe->ftps_func[0]; *c != '\0'; c++) {
- if (*c < 0x20 || 0x7f <= *c) {
- ret = EINVAL;
- goto err;
- }
+ if (u8_validate(probe->ftps_func, strlen(probe->ftps_func),
+ NULL, U8_VALIDATE_ENTIRE, &err) < 0) {
+ ret = EINVAL;
+ goto err;
}
- for (c = &probe->ftps_mod[0]; *c != '\0'; c++) {
- if (*c < 0x20 || 0x7f <= *c) {
- ret = EINVAL;
- goto err;
- }
+ if (u8_validate(probe->ftps_mod, strlen(probe->ftps_mod),
+ NULL, U8_VALIDATE_ENTIRE, &err) < 0) {
+ ret = EINVAL;
+ goto err;
}
#ifdef notyet
@@ -2128,7 +2331,7 @@
proc_t *p;
pid_t pid = probe->ftps_pid;
-#if defined(sun)
+#ifdef illumos
mutex_enter(&pidlock);
#endif
/*
@@ -2139,12 +2342,12 @@
if (p)
fill_kinfo_proc(p, &kp);
if (p == NULL || kp.ki_stat == SIDL) {
-#if defined(sun)
+#ifdef illumos
mutex_exit(&pidlock);
#endif
return (ESRCH);
}
-#if defined(sun)
+#ifdef illumos
mutex_enter(&p->p_lock);
mutex_exit(&pidlock);
#else
@@ -2154,7 +2357,7 @@
#ifdef notyet
if ((ret = priv_proc_cred_perm(cr, p, NULL,
VREAD | VWRITE)) != 0) {
-#if defined(sun)
+#ifdef illumos
mutex_exit(&p->p_lock);
#else
PROC_UNLOCK(p);
@@ -2162,7 +2365,7 @@
return (ret);
}
#endif /* notyet */
-#if defined(sun)
+#ifdef illumos
mutex_exit(&p->p_lock);
#else
PROC_UNLOCK(p);
@@ -2180,11 +2383,11 @@
fasttrap_instr_query_t instr;
fasttrap_tracepoint_t *tp;
uint_t index;
-#if defined(sun)
+#ifdef illumos
int ret;
#endif
-#if defined(sun)
+#ifdef illumos
if (copyin((void *)arg, &instr, sizeof (instr)) != 0)
return (EFAULT);
#endif
@@ -2194,7 +2397,7 @@
proc_t *p;
pid_t pid = instr.ftiq_pid;
-#if defined(sun)
+#ifdef illumos
mutex_enter(&pidlock);
#endif
/*
@@ -2205,12 +2408,12 @@
if (p)
fill_kinfo_proc(p, &kp);
if (p == NULL || kp.ki_stat == SIDL) {
-#if defined(sun)
+#ifdef illumos
mutex_exit(&pidlock);
#endif
return (ESRCH);
}
-#if defined(sun)
+#ifdef illumos
mutex_enter(&p->p_lock);
mutex_exit(&pidlock);
#else
@@ -2220,7 +2423,7 @@
#ifdef notyet
if ((ret = priv_proc_cred_perm(cr, p, NULL,
VREAD)) != 0) {
-#if defined(sun)
+#ifdef illumos
mutex_exit(&p->p_lock);
#else
PROC_UNLOCK(p);
@@ -2229,7 +2432,7 @@
}
#endif /* notyet */
-#if defined(sun)
+#ifdef illumos
mutex_exit(&p->p_lock);
#else
PROC_UNLOCK(p);
@@ -2272,7 +2475,7 @@
fasttrap_load(void)
{
ulong_t nent;
- int i;
+ int i, ret;
/* Create the /dev/dtrace/fasttrap entry. */
fasttrap_cdev = make_dev(&fasttrap_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600,
@@ -2279,15 +2482,12 @@
"dtrace/fasttrap");
mtx_init(&fasttrap_cleanup_mtx, "fasttrap clean", "dtrace", MTX_DEF);
- callout_init_mtx(&fasttrap_timeout, &fasttrap_cleanup_mtx, 0);
mutex_init(&fasttrap_count_mtx, "fasttrap count mtx", MUTEX_DEFAULT,
NULL);
-#if defined(sun)
+#ifdef illumos
fasttrap_max = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
"fasttrap-max-probes", FASTTRAP_MAX_DEFAULT);
-#else
- fasttrap_max = FASTTRAP_MAX_DEFAULT;
#endif
fasttrap_total = 0;
@@ -2294,17 +2494,19 @@
/*
* Conjure up the tracepoints hashtable...
*/
-#if defined(sun)
+#ifdef illumos
nent = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
"fasttrap-hash-size", FASTTRAP_TPOINTS_DEFAULT_SIZE);
#else
- nent = FASTTRAP_TPOINTS_DEFAULT_SIZE;
+ nent = tpoints_hash_size;
#endif
if (nent == 0 || nent > 0x1000000)
nent = FASTTRAP_TPOINTS_DEFAULT_SIZE;
- if ((nent & (nent - 1)) == 0)
+ tpoints_hash_size = nent;
+
+ if (ISP2(nent))
fasttrap_tpoints.fth_nent = nent;
else
fasttrap_tpoints.fth_nent = 1 << fasttrap_highbit(nent);
@@ -2312,7 +2514,7 @@
fasttrap_tpoints.fth_mask = fasttrap_tpoints.fth_nent - 1;
fasttrap_tpoints.fth_table = kmem_zalloc(fasttrap_tpoints.fth_nent *
sizeof (fasttrap_bucket_t), KM_SLEEP);
-#if !defined(sun)
+#ifndef illumos
for (i = 0; i < fasttrap_tpoints.fth_nent; i++)
mutex_init(&fasttrap_tpoints.fth_table[i].ftb_mtx,
"tracepoints bucket mtx", MUTEX_DEFAULT, NULL);
@@ -2322,7 +2524,7 @@
* ... and the providers hash table...
*/
nent = FASTTRAP_PROVIDERS_DEFAULT_SIZE;
- if ((nent & (nent - 1)) == 0)
+ if (ISP2(nent))
fasttrap_provs.fth_nent = nent;
else
fasttrap_provs.fth_nent = 1 << fasttrap_highbit(nent);
@@ -2330,17 +2532,35 @@
fasttrap_provs.fth_mask = fasttrap_provs.fth_nent - 1;
fasttrap_provs.fth_table = kmem_zalloc(fasttrap_provs.fth_nent *
sizeof (fasttrap_bucket_t), KM_SLEEP);
-#if !defined(sun)
+#ifndef illumos
for (i = 0; i < fasttrap_provs.fth_nent; i++)
mutex_init(&fasttrap_provs.fth_table[i].ftb_mtx,
"providers bucket mtx", MUTEX_DEFAULT, NULL);
#endif
+ ret = kproc_create(fasttrap_pid_cleanup_cb, NULL,
+ &fasttrap_cleanup_proc, 0, 0, "ftcleanup");
+ if (ret != 0) {
+ destroy_dev(fasttrap_cdev);
+#ifndef illumos
+ for (i = 0; i < fasttrap_provs.fth_nent; i++)
+ mutex_destroy(&fasttrap_provs.fth_table[i].ftb_mtx);
+ for (i = 0; i < fasttrap_tpoints.fth_nent; i++)
+ mutex_destroy(&fasttrap_tpoints.fth_table[i].ftb_mtx);
+#endif
+ kmem_free(fasttrap_provs.fth_table, fasttrap_provs.fth_nent *
+ sizeof (fasttrap_bucket_t));
+ mtx_destroy(&fasttrap_cleanup_mtx);
+ mutex_destroy(&fasttrap_count_mtx);
+ return (ret);
+ }
+
+
/*
* ... and the procs hash table.
*/
nent = FASTTRAP_PROCS_DEFAULT_SIZE;
- if ((nent & (nent - 1)) == 0)
+ if (ISP2(nent))
fasttrap_procs.fth_nent = nent;
else
fasttrap_procs.fth_nent = 1 << fasttrap_highbit(nent);
@@ -2348,15 +2568,19 @@
fasttrap_procs.fth_mask = fasttrap_procs.fth_nent - 1;
fasttrap_procs.fth_table = kmem_zalloc(fasttrap_procs.fth_nent *
sizeof (fasttrap_bucket_t), KM_SLEEP);
-#if !defined(sun)
+#ifndef illumos
for (i = 0; i < fasttrap_procs.fth_nent; i++)
mutex_init(&fasttrap_procs.fth_table[i].ftb_mtx,
"processes bucket mtx", MUTEX_DEFAULT, NULL);
- CPU_FOREACH(i) {
- mutex_init(&fasttrap_cpuc_pid_lock[i], "fasttrap barrier",
- MUTEX_DEFAULT, NULL);
- }
+ rm_init(&fasttrap_tp_lock, "fasttrap tracepoint");
+
+ /*
+ * This event handler must run before kdtrace_thread_dtor() since it
+ * accesses the thread's struct kdtrace_thread.
+ */
+ fasttrap_thread_dtor_tag = EVENTHANDLER_REGISTER(thread_dtor,
+ fasttrap_thread_dtor, NULL, EVENTHANDLER_PRI_FIRST);
#endif
/*
@@ -2389,15 +2613,6 @@
return (-1);
/*
- * Prevent any new timeouts from running by setting fasttrap_timeout
- * to a non-zero value, and wait for the current timeout to complete.
- */
- mtx_lock(&fasttrap_cleanup_mtx);
- fasttrap_cleanup_work = 0;
- callout_drain(&fasttrap_timeout);
- mtx_unlock(&fasttrap_cleanup_mtx);
-
- /*
* Iterate over all of our providers. If there's still a process
* that corresponds to that pid, fail to detach.
*/
@@ -2431,20 +2646,6 @@
}
if (fail) {
- uint_t work;
- /*
- * If we're failing to detach, we need to unblock timeouts
- * and start a new timeout if any work has accumulated while
- * we've been unsuccessfully trying to detach.
- */
- mtx_lock(&fasttrap_cleanup_mtx);
- work = fasttrap_cleanup_work;
- callout_drain(&fasttrap_timeout);
- mtx_unlock(&fasttrap_cleanup_mtx);
-
- if (work)
- fasttrap_pid_cleanup();
-
(void) dtrace_meta_register("fasttrap", &fasttrap_mops, NULL,
&fasttrap_meta_id);
@@ -2451,6 +2652,29 @@
return (-1);
}
+ /*
+ * Stop new processes from entering these hooks now, before the
+ * fasttrap_cleanup thread runs. That way all processes will hopefully
+ * be out of these hooks before we free fasttrap_provs.fth_table
+ */
+ ASSERT(dtrace_fasttrap_fork == &fasttrap_fork);
+ dtrace_fasttrap_fork = NULL;
+
+ ASSERT(dtrace_fasttrap_exec == &fasttrap_exec_exit);
+ dtrace_fasttrap_exec = NULL;
+
+ ASSERT(dtrace_fasttrap_exit == &fasttrap_exec_exit);
+ dtrace_fasttrap_exit = NULL;
+
+ mtx_lock(&fasttrap_cleanup_mtx);
+ fasttrap_cleanup_drain = 1;
+ /* Wait for the cleanup thread to finish up and signal us. */
+ wakeup(&fasttrap_cleanup_cv);
+ mtx_sleep(&fasttrap_cleanup_drain, &fasttrap_cleanup_mtx, 0, "ftcld",
+ 0);
+ fasttrap_cleanup_proc = NULL;
+ mtx_destroy(&fasttrap_cleanup_mtx);
+
#ifdef DEBUG
mutex_enter(&fasttrap_count_mtx);
ASSERT(fasttrap_pid_count == 0);
@@ -2457,6 +2681,16 @@
mutex_exit(&fasttrap_count_mtx);
#endif
+#ifndef illumos
+ EVENTHANDLER_DEREGISTER(thread_dtor, fasttrap_thread_dtor_tag);
+
+ for (i = 0; i < fasttrap_tpoints.fth_nent; i++)
+ mutex_destroy(&fasttrap_tpoints.fth_table[i].ftb_mtx);
+ for (i = 0; i < fasttrap_provs.fth_nent; i++)
+ mutex_destroy(&fasttrap_provs.fth_table[i].ftb_mtx);
+ for (i = 0; i < fasttrap_procs.fth_nent; i++)
+ mutex_destroy(&fasttrap_procs.fth_table[i].ftb_mtx);
+#endif
kmem_free(fasttrap_tpoints.fth_table,
fasttrap_tpoints.fth_nent * sizeof (fasttrap_bucket_t));
fasttrap_tpoints.fth_nent = 0;
@@ -2469,28 +2703,10 @@
fasttrap_procs.fth_nent * sizeof (fasttrap_bucket_t));
fasttrap_procs.fth_nent = 0;
- /*
- * We know there are no tracepoints in any process anywhere in
- * the system so there is no process which has its p_dtrace_count
- * greater than zero, therefore we know that no thread can actively
- * be executing code in fasttrap_fork(). Similarly for p_dtrace_probes
- * and fasttrap_exec() and fasttrap_exit().
- */
- ASSERT(dtrace_fasttrap_fork == &fasttrap_fork);
- dtrace_fasttrap_fork = NULL;
-
- ASSERT(dtrace_fasttrap_exec == &fasttrap_exec_exit);
- dtrace_fasttrap_exec = NULL;
-
- ASSERT(dtrace_fasttrap_exit == &fasttrap_exec_exit);
- dtrace_fasttrap_exit = NULL;
-
-#if !defined(sun)
+#ifndef illumos
destroy_dev(fasttrap_cdev);
mutex_destroy(&fasttrap_count_mtx);
- CPU_FOREACH(i) {
- mutex_destroy(&fasttrap_cpuc_pid_lock[i]);
- }
+ rm_destroy(&fasttrap_tp_lock);
#endif
return (0);
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/dtrace/lockstat.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/dtrace/lockstat.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/dtrace/lockstat.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/dtrace/profile.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/dtrace/profile.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/dtrace/profile.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -169,9 +170,9 @@
if (dtrace_probe_lookup(profile_id, NULL, NULL, name) != 0)
return;
- atomic_add_32(&profile_total, 1);
+ atomic_inc_32(&profile_total);
if (profile_total > profile_max) {
- atomic_add_32(&profile_total, -1);
+ atomic_dec_32(&profile_total);
return;
}
@@ -326,7 +327,7 @@
kmem_free(prof, sizeof (profile_probe_t));
ASSERT(profile_total >= 1);
- atomic_add_32(&profile_total, -1);
+ atomic_dec_32(&profile_total);
}
/*ARGSUSED*/
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/dtrace/sdt_subr.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/dtrace/sdt_subr.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/dtrace/sdt_subr.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -21,6 +22,7 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
@@ -85,11 +87,11 @@
sdt_provider_t sdt_providers[] = {
{ "vtrace", "__vtrace_", &vtrace_attr, 0 },
- { "sysinfo", "__cpu_sysinfo_", &info_attr, 0 },
- { "vminfo", "__cpu_vminfo_", &info_attr, 0 },
+ { "sysinfo", "__cpu_sysinfo_", &info_attr, DTRACE_PRIV_USER },
+ { "vminfo", "__cpu_vminfo_", &info_attr, DTRACE_PRIV_USER },
{ "fpuinfo", "__fpuinfo_", &fpu_attr, 0 },
- { "sched", "__sched_", &stab_attr, 0 },
- { "proc", "__proc_", &stab_attr, 0 },
+ { "sched", "__sched_", &stab_attr, DTRACE_PRIV_USER },
+ { "proc", "__proc_", &stab_attr, DTRACE_PRIV_USER },
{ "io", "__io_", &stab_attr, 0 },
{ "mib", "__mib_", &stab_attr, 0 },
{ "fsinfo", "__fsinfo_", &fsinfo_attr, 0 },
@@ -852,6 +854,20 @@
};
/*ARGSUSED*/
+int
+sdt_mode(void *arg, dtrace_id_t id, void *parg)
+{
+ /*
+ * We tell DTrace that we're in kernel mode, that the firing needs to
+ * be dropped for anything that doesn't have necessary privileges, and
+ * that it needs to be restricted for anything that has restricted
+ * (i.e., not all-zone) privileges.
+ */
+ return (DTRACE_MODE_KERNEL | DTRACE_MODE_NOPRIV_DROP |
+ DTRACE_MODE_LIMITEDPRIV_RESTRICT);
+}
+
+/*ARGSUSED*/
void
sdt_getargdesc(void *arg, dtrace_id_t id, void *parg, dtrace_argdesc_t *desc)
{
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/dtrace/systrace.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/dtrace/systrace.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/dtrace/systrace.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -165,11 +166,11 @@
return;
}
- (void) casptr(&sysent[sysnum].sy_callc,
+ (void) atomic_cas_ptr(&sysent[sysnum].sy_callc,
(void *)systrace_sysent[sysnum].stsy_underlying,
(void *)dtrace_systrace_syscall);
#ifdef _SYSCALL32_IMPL
- (void) casptr(&sysent32[sysnum].sy_callc,
+ (void) atomic_cas_ptr(&sysent32[sysnum].sy_callc,
(void *)systrace_sysent32[sysnum].stsy_underlying,
(void *)dtrace_systrace_syscall32);
#endif
@@ -184,12 +185,12 @@
systrace_sysent[sysnum].stsy_return == DTRACE_IDNONE);
if (disable) {
- (void) casptr(&sysent[sysnum].sy_callc,
+ (void) atomic_cas_ptr(&sysent[sysnum].sy_callc,
(void *)dtrace_systrace_syscall,
(void *)systrace_sysent[sysnum].stsy_underlying);
#ifdef _SYSCALL32_IMPL
- (void) casptr(&sysent32[sysnum].sy_callc,
+ (void) atomic_cas_ptr(&sysent32[sysnum].sy_callc,
(void *)dtrace_systrace_syscall32,
(void *)systrace_sysent32[sysnum].stsy_underlying);
#endif
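
The systrace changes above switch from casptr() to atomic_cas_ptr() when
patching the syscall table: the hook is installed only if the slot still holds
the original handler, and removed only if the slot still holds the hook. The
C11 sketch below shows the same compare-and-swap discipline in userspace; the
handler names and the slot variable are made up for the example.

    #include <stdatomic.h>
    #include <stdio.h>

    typedef int (*handler_t)(int);

    static int real_handler(int x) { return (x); }
    static int traced_handler(int x) { return (real_handler(x)); } /* would also fire a probe */

    /* Table slot analogous to sysent[sysnum].sy_callc. */
    static _Atomic(handler_t) slot;

    int
    main(void)
    {
            handler_t expected;

            atomic_store(&slot, real_handler);      /* the "underlying" handler */

            /* Install: replace the slot only if it still holds the original. */
            expected = real_handler;
            atomic_compare_exchange_strong(&slot, &expected, traced_handler);

            /* Remove: restore only if our hook is still what is installed. */
            expected = traced_handler;
            atomic_compare_exchange_strong(&slot, &expected, real_handler);

            printf("slot restored: %d\n", atomic_load(&slot) == real_handler);
            return (0);
    }
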
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/gfs.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/vnode.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,9 +21,10 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
- * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
+ * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
*/
/*
@@ -82,9 +83,9 @@
* types of locks: 1) the hash table lock array, and 2) the
* arc list locks.
*
- * Buffers do not have their own mutexs, rather they rely on the
- * hash table mutexs for the bulk of their protection (i.e. most
- * fields in the arc_buf_hdr_t are protected by these mutexs).
+ * Buffers do not have their own mutexes, rather they rely on the
+ * hash table mutexes for the bulk of their protection (i.e. most
+ * fields in the arc_buf_hdr_t are protected by these mutexes).
*
* buf_hash_find() returns the appropriate mutex (held) when it
* locates the requested buffer in the hash table. It returns
@@ -105,13 +106,13 @@
* with the buffer may be evicted prior to the callback. The callback
* must be made with *no locks held* (to prevent deadlock). Additionally,
* the users of callbacks must ensure that their private data is
- * protected from simultaneous callbacks from arc_buf_evict()
+ * protected from simultaneous callbacks from arc_clear_callback()
* and arc_do_user_evicts().
*
* Note that the majority of the performance stats are manipulated
* with atomic operations.
*
- * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
+ * The L2ARC uses the l2ad_mtx on each vdev for the following:
*
* - L2ARC buflist creation
* - L2ARC buflist eviction
@@ -120,14 +121,141 @@
* - ARC header release, as it removes from L2ARC buflists
*/
+/*
+ * ARC operation:
+ *
+ * Every block that is in the ARC is tracked by an arc_buf_hdr_t structure.
+ * This structure can point either to a block that is still in the cache or to
+ * one that is only accessible in an L2 ARC device, or it can provide
+ * information about a block that was recently evicted. If a block is
+ * only accessible in the L2ARC, then the arc_buf_hdr_t only has enough
+ * information to retrieve it from the L2ARC device. This information is
+ * stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t. A block
+ * that is in this state cannot access the data directly.
+ *
+ * Blocks that are actively being referenced or have not been evicted
+ * are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within
+ * the arc_buf_hdr_t that will point to the data block in memory. A block can
+ * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC
+ * caches data in two ways -- in a list of arc buffers (arc_buf_t) and
+ * also in the arc_buf_hdr_t's private physical data block pointer (b_pdata).
+ * Each arc buffer (arc_buf_t) is being actively accessed by a specific ARC
+ * consumer, and always contains uncompressed data. The ARC will provide
+ * references to this data and will keep it cached until it is no longer in
+ * use. Typically, the arc will try to cache only the L1ARC's physical data
+ * block and will aggressively evict any arc_buf_t that is no longer referenced.
+ * The amount of memory consumed by the arc_buf_t's can be seen via the
+ * "overhead_size" kstat.
+ *
+ *
+ * arc_buf_hdr_t
+ * +-----------+
+ * | |
+ * | |
+ * | |
+ * +-----------+
+ * l2arc_buf_hdr_t| |
+ * | |
+ * +-----------+
+ * l1arc_buf_hdr_t| |
+ * | | arc_buf_t
+ * | b_buf +------------>+---------+ arc_buf_t
+ * | | |b_next +---->+---------+
+ * | b_pdata +-+ |---------| |b_next +-->NULL
+ * +-----------+ | | | +---------+
+ * | |b_data +-+ | |
+ * | +---------+ | |b_data +-+
+ * +->+------+ | +---------+ |
+ * (potentially) | | | |
+ * compressed | | | |
+ * data +------+ | v
+ * +->+------+ +------+
+ * uncompressed | | | |
+ * data | | | |
+ * +------+ +------+
+ *
+ * The L1ARC's data pointer, however, may or may not be uncompressed. The
+ * ARC has the ability to store the physical data (b_pdata) associated with
+ * the DVA of the arc_buf_hdr_t. Since the b_pdata is a copy of the on-disk
+ * physical block, it will match its on-disk compression characteristics.
+ * If the block on-disk is compressed, then the physical data block
+ * in the cache will also be compressed and vice-versa. This behavior
+ * can be disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the
+ * compressed ARC functionality is disabled, the b_pdata will point to an
+ * uncompressed version of the on-disk data.
+ *
+ * When a consumer reads a block, the ARC must first look to see if the
+ * arc_buf_hdr_t is cached. If the hdr is cached and already has an arc_buf_t,
+ * then an additional arc_buf_t is allocated and the uncompressed data is
+ * bcopied from the existing arc_buf_t. If the hdr is cached but does not
+ * have an arc_buf_t, then the ARC allocates a new arc_buf_t and decompresses
+ * the b_pdata contents into the arc_buf_t's b_data. If the arc_buf_hdr_t's
+ * b_pdata is not compressed, then the block is shared with the newly
+ * allocated arc_buf_t. This block sharing only occurs with one arc_buf_t
+ * in the arc buffer chain. Sharing the block reduces the memory overhead
+ * required when the hdr is caching uncompressed blocks or the compressed
+ * arc functionality has been disabled via 'zfs_compressed_arc_enabled'.
+ *
+ * The diagram below shows an example of an uncompressed ARC hdr that is
+ * sharing its data with an arc_buf_t:
+ *
+ * arc_buf_hdr_t
+ * +-----------+
+ * | |
+ * | |
+ * | |
+ * +-----------+
+ * l2arc_buf_hdr_t| |
+ * | |
+ * +-----------+
+ * l1arc_buf_hdr_t| |
+ * | | arc_buf_t (shared)
+ * | b_buf +------------>+---------+ arc_buf_t
+ * | | |b_next +---->+---------+
+ * | b_pdata +-+ |---------| |b_next +-->NULL
+ * +-----------+ | | | +---------+
+ * | |b_data +-+ | |
+ * | +---------+ | |b_data +-+
+ * +->+------+ | +---------+ |
+ * | | | |
+ * uncompressed | | | |
+ * data +------+ | |
+ * ^ +->+------+ |
+ * | uncompressed | | |
+ * | data | | |
+ * | +------+ |
+ * +---------------------------------+
+ *
+ * Writing to the arc requires that the ARC first discard the b_pdata
+ * since the physical block is about to be rewritten. The new data contents
+ * will be contained in the arc_buf_t (uncompressed). As the I/O pipeline
+ * performs the write, it may compress the data before writing it to disk.
+ * The ARC will be called with the transformed data and will bcopy the
+ * transformed on-disk block into a newly allocated b_pdata.
+ *
+ * When the L2ARC is in use, it will also take advantage of the b_pdata. The
+ * L2ARC will always write the contents of b_pdata to the L2ARC. This means
+ * that when compressed arc is enabled the L2ARC blocks are identical
+ * to the on-disk block in the main data pool. This provides a significant
+ * advantage since the ARC can leverage the bp's checksum when reading from the
+ * L2ARC to determine if the contents are valid. However, if the compressed
+ * arc is disabled, then the L2ARC's block must be transformed to look
+ * like the physical block in the main data pool before comparing the
+ * checksum and determining its validity.
+ */
+
#include <sys/spa.h>
#include <sys/zio.h>
+#include <sys/spa_impl.h>
#include <sys/zio_compress.h>
+#include <sys/zio_checksum.h>
#include <sys/zfs_context.h>
#include <sys/arc.h>
#include <sys/refcount.h>
#include <sys/vdev.h>
#include <sys/vdev_impl.h>
+#include <sys/dsl_pool.h>
+#include <sys/multilist.h>
#ifdef _KERNEL
#include <sys/dnlc.h>
#endif
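
The block comment above describes one arc_buf_hdr_t owning a possibly
compressed copy of the on-disk bytes (b_pdata) while each active consumer gets
its own uncompressed arc_buf_t chained off b_buf. The stripped-down
declarations below restate that ownership relationship as code; the field and
type names are abbreviated from the comment and this is not the real ZFS
layout.

    /* One header per on-disk block; it owns the (possibly compressed) copy. */
    struct buf_hdr {
            void            *b_pdata;       /* on-disk bytes, maybe compressed */
            struct buf      *b_buf;         /* chain of per-consumer buffers */
    };

    /* One per active consumer; always uncompressed, possibly sharing b_pdata. */
    struct buf {
            struct buf_hdr  *b_hdr;         /* back pointer to the owning header */
            struct buf      *b_next;        /* next consumer of the same header */
            void            *b_data;        /* uncompressed bytes for this consumer */
    };
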
@@ -137,7 +265,7 @@
#include <zfs_fletcher.h>
#include <sys/sdt.h>
-#include <vm/vm_pageout.h>
+#include <machine/vmparam.h>
#ifdef illumos
#ifndef _KERNEL
@@ -147,39 +275,66 @@
#endif
#endif /* illumos */
-static kmutex_t arc_reclaim_thr_lock;
-static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */
-static uint8_t arc_thread_exit;
+static kmutex_t arc_reclaim_lock;
+static kcondvar_t arc_reclaim_thread_cv;
+static boolean_t arc_reclaim_thread_exit;
+static kcondvar_t arc_reclaim_waiters_cv;
-extern int zfs_write_limit_shift;
-extern uint64_t zfs_write_limit_max;
-extern kmutex_t zfs_write_limit_lock;
+uint_t arc_reduce_dnlc_percent = 3;
-#define ARC_REDUCE_DNLC_PERCENT 3
-uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
+/*
+ * The number of headers to evict in arc_evict_state_impl() before
+ * dropping the sublist lock and evicting from another sublist. A lower
+ * value means we're more likely to evict the "correct" header (i.e. the
+ * oldest header in the arc state), but comes with higher overhead
+ * (i.e. more invocations of arc_evict_state_impl()).
+ */
+int zfs_arc_evict_batch_limit = 10;
-typedef enum arc_reclaim_strategy {
- ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */
- ARC_RECLAIM_CONS /* Conservative reclaim strategy */
-} arc_reclaim_strategy_t;
+/*
+ * The number of sublists used for each of the arc state lists. If this
+ * is not set to a suitable value by the user, it will be configured to
+ * the number of CPUs on the system in arc_init().
+ */
+int zfs_arc_num_sublists_per_state = 0;
/* number of seconds before growing cache again */
static int arc_grow_retry = 60;
+/* shift of arc_c for calculating overflow limit in arc_get_data_buf */
+int zfs_arc_overflow_shift = 8;
+
/* shift of arc_c for calculating both min and max arc_p */
static int arc_p_min_shift = 4;
/* log2(fraction of arc to reclaim) */
-static int arc_shrink_shift = 5;
+static int arc_shrink_shift = 7;
/*
+ * log2(fraction of ARC which must be free to allow growing).
+ * I.e. If there is less than arc_c >> arc_no_grow_shift free memory,
+ * when reading a new block into the ARC, we will evict an equal-sized block
+ * from the ARC.
+ *
+ * This must be less than arc_shrink_shift, so that when we shrink the ARC,
+ * we will still not allow it to grow.
+ */
+int arc_no_grow_shift = 5;
+
+
+/*
* minimum lifespan of a prefetch block in clock ticks
* (initialized in arc_init())
*/
static int arc_min_prefetch_lifespan;
+/*
+ * If this percent of memory is free, don't throttle.
+ */
+int arc_lotsfree_percent = 10;
+
static int arc_dead;
-extern int zfs_prefetch_disable;
+extern boolean_t zfs_prefetch_disable;
/*
* The arc has filled available memory and has now warmed up.
@@ -192,21 +347,94 @@
uint64_t zfs_arc_max;
uint64_t zfs_arc_min;
uint64_t zfs_arc_meta_limit = 0;
+uint64_t zfs_arc_meta_min = 0;
int zfs_arc_grow_retry = 0;
int zfs_arc_shrink_shift = 0;
int zfs_arc_p_min_shift = 0;
-int zfs_disable_dup_eviction = 0;
+uint64_t zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
+u_int zfs_arc_free_target = 0;
+/* Absolute min for arc min / max is 16MB. */
+static uint64_t arc_abs_min = 16 << 20;
+
+boolean_t zfs_compressed_arc_enabled = B_TRUE;
+
+static int sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS);
+static int sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS);
+static int sysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS);
+static int sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS);
+
+#if defined(__FreeBSD__) && defined(_KERNEL)
+static void
+arc_free_target_init(void *unused __unused)
+{
+
+ zfs_arc_free_target = vm_pageout_wakeup_thresh;
+}
+SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY,
+ arc_free_target_init, NULL);
+
TUNABLE_QUAD("vfs.zfs.arc_max", &zfs_arc_max);
TUNABLE_QUAD("vfs.zfs.arc_min", &zfs_arc_min);
TUNABLE_QUAD("vfs.zfs.arc_meta_limit", &zfs_arc_meta_limit);
+TUNABLE_QUAD("vfs.zfs.arc_meta_min", &zfs_arc_meta_min);
+TUNABLE_QUAD("vfs.zfs.arc_average_blocksize", &zfs_arc_average_blocksize);
+TUNABLE_INT("vfs.zfs.arc_shrink_shift", &zfs_arc_shrink_shift);
SYSCTL_DECL(_vfs_zfs);
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_max, CTLFLAG_RDTUN, &zfs_arc_max, 0,
- "Maximum ARC size");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_min, CTLFLAG_RDTUN, &zfs_arc_min, 0,
- "Minimum ARC size");
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_max, CTLTYPE_U64 | CTLFLAG_RWTUN,
+ 0, sizeof(uint64_t), sysctl_vfs_zfs_arc_max, "QU", "Maximum ARC size");
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_min, CTLTYPE_U64 | CTLFLAG_RWTUN,
+ 0, sizeof(uint64_t), sysctl_vfs_zfs_arc_min, "QU", "Minimum ARC size");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_average_blocksize, CTLFLAG_RDTUN,
+ &zfs_arc_average_blocksize, 0,
+ "ARC average blocksize");
+SYSCTL_INT(_vfs_zfs, OID_AUTO, arc_shrink_shift, CTLFLAG_RW,
+ &arc_shrink_shift, 0,
+ "log2(fraction of arc to reclaim)");
+SYSCTL_INT(_vfs_zfs, OID_AUTO, compressed_arc_enabled, CTLFLAG_RDTUN,
+ &zfs_compressed_arc_enabled, 0, "Enable compressed ARC");
/*
+ * We don't have a tunable for arc_free_target due to the dependency on
+ * pagedaemon initialisation.
+ */
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target,
+ CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(u_int),
+ sysctl_vfs_zfs_arc_free_target, "IU",
+ "Desired number of free pages below which ARC triggers reclaim");
+
+static int
+sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS)
+{
+ u_int val;
+ int err;
+
+ val = zfs_arc_free_target;
+ err = sysctl_handle_int(oidp, &val, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+
+ if (val < minfree)
+ return (EINVAL);
+ if (val > cnt.v_page_count)
+ return (EINVAL);
+
+ zfs_arc_free_target = val;
+
+ return (0);
+}
+
+/*
+ * Must be declared here, before the definition of the corresponding kstat
+ * macro, which uses the same names and would otherwise confuse the compiler.
+ */
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_meta_limit,
+ CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
+ sysctl_vfs_zfs_arc_meta_limit, "QU",
+ "ARC metadata limit");
+#endif
+
+/*
* Note that buffers can be in one of 6 states:
* ARC_anon - anonymous (discussed below)
* ARC_mru - recently used, currently cached
@@ -238,31 +466,22 @@
* second level ARC benefit from these fast lookups.
*/
-#define ARCS_LOCK_PAD CACHE_LINE_SIZE
-struct arcs_lock {
- kmutex_t arcs_lock;
-#ifdef _KERNEL
- unsigned char pad[(ARCS_LOCK_PAD - sizeof (kmutex_t))];
-#endif
-};
-
-/*
- * must be power of two for mask use to work
- *
- */
-#define ARC_BUFC_NUMDATALISTS 16
-#define ARC_BUFC_NUMMETADATALISTS 16
-#define ARC_BUFC_NUMLISTS (ARC_BUFC_NUMMETADATALISTS + ARC_BUFC_NUMDATALISTS)
-
typedef struct arc_state {
- uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */
- uint64_t arcs_size; /* total amount of data in this state */
- list_t arcs_lists[ARC_BUFC_NUMLISTS]; /* list of evictable buffers */
- struct arcs_lock arcs_locks[ARC_BUFC_NUMLISTS] __aligned(CACHE_LINE_SIZE);
+ /*
+ * list of evictable buffers
+ */
+ multilist_t arcs_list[ARC_BUFC_NUMTYPES];
+ /*
+ * total amount of evictable data in this state
+ */
+ refcount_t arcs_esize[ARC_BUFC_NUMTYPES];
+ /*
+ * total amount of data in this state; this includes: evictable,
+ * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA.
+ */
+ refcount_t arcs_size;
} arc_state_t;
-#define ARCS_LOCK(s, i) (&((s)->arcs_locks[(i)].arcs_lock))
-
/* The 6 states: */
static arc_state_t ARC_anon;
static arc_state_t ARC_mru;
@@ -288,8 +507,6 @@
kstat_named_t arcstat_mfu_ghost_hits;
kstat_named_t arcstat_allocated;
kstat_named_t arcstat_deleted;
- kstat_named_t arcstat_stolen;
- kstat_named_t arcstat_recycle_miss;
/*
* Number of buffers that could not be evicted because the hash lock
* was held by another thread. The lock may not necessarily be held
@@ -303,9 +520,15 @@
* not from the spa we're trying to evict from.
*/
kstat_named_t arcstat_evict_skip;
+ /*
+ * Number of times arc_evict_state() was unable to evict enough
+ * buffers to reach its target amount.
+ */
+ kstat_named_t arcstat_evict_not_enough;
kstat_named_t arcstat_evict_l2_cached;
kstat_named_t arcstat_evict_l2_eligible;
kstat_named_t arcstat_evict_l2_ineligible;
+ kstat_named_t arcstat_evict_l2_skip;
kstat_named_t arcstat_hash_elements;
kstat_named_t arcstat_hash_elements_max;
kstat_named_t arcstat_hash_collisions;
@@ -316,9 +539,157 @@
kstat_named_t arcstat_c_min;
kstat_named_t arcstat_c_max;
kstat_named_t arcstat_size;
+ /*
+ * Number of compressed bytes stored in the arc_buf_hdr_t's b_pdata.
+ * Note that the compressed bytes may match the uncompressed bytes
+ * if the block is either not compressed or compressed arc is disabled.
+ */
+ kstat_named_t arcstat_compressed_size;
+ /*
+ * Uncompressed size of the data stored in b_pdata. If compressed
+ * arc is disabled then this value will be identical to the stat
+ * above.
+ */
+ kstat_named_t arcstat_uncompressed_size;
+ /*
+ * Number of bytes stored in all the arc_buf_t's. This is classified
+ * as "overhead" since this data is typically short-lived and will
+ * be evicted from the arc when it becomes unreferenced unless the
+ * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level
+ * values have been set (see comment in dbuf.c for more information).
+ */
+ kstat_named_t arcstat_overhead_size;
+ /*
+ * Number of bytes consumed by internal ARC structures necessary
+ * for tracking purposes; these structures are not actually
+ * backed by ARC buffers. This includes arc_buf_hdr_t structures
+ * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only
+ * caches), and arc_buf_t structures (allocated via arc_buf_t
+ * cache).
+ */
kstat_named_t arcstat_hdr_size;
+ /*
+ * Number of bytes consumed by ARC buffers of type equal to
+ * ARC_BUFC_DATA. This is generally consumed by buffers backing
+ * on disk user data (e.g. plain file contents).
+ */
kstat_named_t arcstat_data_size;
+ /*
+ * Number of bytes consumed by ARC buffers of type equal to
+ * ARC_BUFC_METADATA. This is generally consumed by buffers
+ * backing on disk data that is used for internal ZFS
+ * structures (e.g. ZAP, dnode, indirect blocks, etc).
+ */
+ kstat_named_t arcstat_metadata_size;
+ /*
+ * Number of bytes consumed by various buffers and structures
+ * not actually backed with ARC buffers. This includes bonus
+ * buffers (allocated directly via zio_buf_* functions),
+ * dmu_buf_impl_t structures (allocated via dmu_buf_impl_t
+ * cache), and dnode_t structures (allocated via dnode_t cache).
+ */
kstat_named_t arcstat_other_size;
+ /*
+ * Total number of bytes consumed by ARC buffers residing in the
+ * arc_anon state. This includes *all* buffers in the arc_anon
+ * state; e.g. data, metadata, evictable, and unevictable buffers
+ * are all included in this value.
+ */
+ kstat_named_t arcstat_anon_size;
+ /*
+ * Number of bytes consumed by ARC buffers that meet the
+ * following criteria: backing buffers of type ARC_BUFC_DATA,
+ * residing in the arc_anon state, and are eligible for eviction
+ * (e.g. have no outstanding holds on the buffer).
+ */
+ kstat_named_t arcstat_anon_evictable_data;
+ /*
+ * Number of bytes consumed by ARC buffers that meet the
+ * following criteria: backing buffers of type ARC_BUFC_METADATA,
+ * residing in the arc_anon state, and are eligible for eviction
+ * (e.g. have no outstanding holds on the buffer).
+ */
+ kstat_named_t arcstat_anon_evictable_metadata;
+ /*
+ * Total number of bytes consumed by ARC buffers residing in the
+ * arc_mru state. This includes *all* buffers in the arc_mru
+ * state; e.g. data, metadata, evictable, and unevictable buffers
+ * are all included in this value.
+ */
+ kstat_named_t arcstat_mru_size;
+ /*
+ * Number of bytes consumed by ARC buffers that meet the
+ * following criteria: backing buffers of type ARC_BUFC_DATA,
+ * residing in the arc_mru state, and are eligible for eviction
+ * (e.g. have no outstanding holds on the buffer).
+ */
+ kstat_named_t arcstat_mru_evictable_data;
+ /*
+ * Number of bytes consumed by ARC buffers that meet the
+ * following criteria: backing buffers of type ARC_BUFC_METADATA,
+ * residing in the arc_mru state, and are eligible for eviction
+ * (e.g. have no outstanding holds on the buffer).
+ */
+ kstat_named_t arcstat_mru_evictable_metadata;
+ /*
+ * Total number of bytes that *would have been* consumed by ARC
+ * buffers in the arc_mru_ghost state. The key thing to note
+ * here, is the fact that this size doesn't actually indicate
+ * RAM consumption. The ghost lists only consist of headers and
+ * don't actually have ARC buffers linked off of these headers.
+ * Thus, *if* the headers had associated ARC buffers, these
+ * buffers *would have* consumed this number of bytes.
+ */
+ kstat_named_t arcstat_mru_ghost_size;
+ /*
+ * Number of bytes that *would have been* consumed by ARC
+ * buffers that are eligible for eviction, of type
+ * ARC_BUFC_DATA, and linked off the arc_mru_ghost state.
+ */
+ kstat_named_t arcstat_mru_ghost_evictable_data;
+ /*
+ * Number of bytes that *would have been* consumed by ARC
+ * buffers that are eligible for eviction, of type
+ * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
+ */
+ kstat_named_t arcstat_mru_ghost_evictable_metadata;
+ /*
+ * Total number of bytes consumed by ARC buffers residing in the
+ * arc_mfu state. This includes *all* buffers in the arc_mfu
+ * state; e.g. data, metadata, evictable, and unevictable buffers
+ * are all included in this value.
+ */
+ kstat_named_t arcstat_mfu_size;
+ /*
+ * Number of bytes consumed by ARC buffers that are eligible for
+ * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu
+ * state.
+ */
+ kstat_named_t arcstat_mfu_evictable_data;
+ /*
+ * Number of bytes consumed by ARC buffers that are eligible for
+ * eviction, of type ARC_BUFC_METADATA, and reside in the
+ * arc_mfu state.
+ */
+ kstat_named_t arcstat_mfu_evictable_metadata;
+ /*
+ * Total number of bytes that *would have been* consumed by ARC
+ * buffers in the arc_mfu_ghost state. See the comment above
+ * arcstat_mru_ghost_size for more details.
+ */
+ kstat_named_t arcstat_mfu_ghost_size;
+ /*
+ * Number of bytes that *would have been* consumed by ARC
+ * buffers that are eligible for eviction, of type
+ * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state.
+ */
+ kstat_named_t arcstat_mfu_ghost_evictable_data;
+ /*
+ * Number of bytes that *would have been* consumed by ARC
+ * buffers that are eligible for eviction, of type
+ * ARC_BUFC_METADATA, and linked off the arc_mfu_ghost state.
+ */
+ kstat_named_t arcstat_mfu_ghost_evictable_metadata;
kstat_named_t arcstat_l2_hits;
kstat_named_t arcstat_l2_misses;
kstat_named_t arcstat_l2_feeds;
@@ -328,9 +699,10 @@
kstat_named_t arcstat_l2_writes_sent;
kstat_named_t arcstat_l2_writes_done;
kstat_named_t arcstat_l2_writes_error;
- kstat_named_t arcstat_l2_writes_hdr_miss;
+ kstat_named_t arcstat_l2_writes_lock_retry;
kstat_named_t arcstat_l2_evict_lock_retry;
kstat_named_t arcstat_l2_evict_reading;
+ kstat_named_t arcstat_l2_evict_l1cached;
kstat_named_t arcstat_l2_free_on_write;
kstat_named_t arcstat_l2_abort_lowmem;
kstat_named_t arcstat_l2_cksum_bad;
@@ -338,9 +710,6 @@
kstat_named_t arcstat_l2_size;
kstat_named_t arcstat_l2_asize;
kstat_named_t arcstat_l2_hdr_size;
- kstat_named_t arcstat_l2_compress_successes;
- kstat_named_t arcstat_l2_compress_zeros;
- kstat_named_t arcstat_l2_compress_failures;
kstat_named_t arcstat_l2_write_trylock_fail;
kstat_named_t arcstat_l2_write_passed_headroom;
kstat_named_t arcstat_l2_write_spa_mismatch;
@@ -354,9 +723,12 @@
kstat_named_t arcstat_l2_write_buffer_list_iter;
kstat_named_t arcstat_l2_write_buffer_list_null_iter;
kstat_named_t arcstat_memory_throttle_count;
- kstat_named_t arcstat_duplicate_buffers;
- kstat_named_t arcstat_duplicate_buffers_size;
- kstat_named_t arcstat_duplicate_reads;
+ kstat_named_t arcstat_meta_used;
+ kstat_named_t arcstat_meta_limit;
+ kstat_named_t arcstat_meta_max;
+ kstat_named_t arcstat_meta_min;
+ kstat_named_t arcstat_sync_wait_for_async;
+ kstat_named_t arcstat_demand_hit_predictive_prefetch;
} arc_stats_t;
static arc_stats_t arc_stats = {
@@ -376,13 +748,13 @@
{ "mfu_ghost_hits", KSTAT_DATA_UINT64 },
{ "allocated", KSTAT_DATA_UINT64 },
{ "deleted", KSTAT_DATA_UINT64 },
- { "stolen", KSTAT_DATA_UINT64 },
- { "recycle_miss", KSTAT_DATA_UINT64 },
{ "mutex_miss", KSTAT_DATA_UINT64 },
{ "evict_skip", KSTAT_DATA_UINT64 },
+ { "evict_not_enough", KSTAT_DATA_UINT64 },
{ "evict_l2_cached", KSTAT_DATA_UINT64 },
{ "evict_l2_eligible", KSTAT_DATA_UINT64 },
{ "evict_l2_ineligible", KSTAT_DATA_UINT64 },
+ { "evict_l2_skip", KSTAT_DATA_UINT64 },
{ "hash_elements", KSTAT_DATA_UINT64 },
{ "hash_elements_max", KSTAT_DATA_UINT64 },
{ "hash_collisions", KSTAT_DATA_UINT64 },
@@ -393,9 +765,28 @@
{ "c_min", KSTAT_DATA_UINT64 },
{ "c_max", KSTAT_DATA_UINT64 },
{ "size", KSTAT_DATA_UINT64 },
+ { "compressed_size", KSTAT_DATA_UINT64 },
+ { "uncompressed_size", KSTAT_DATA_UINT64 },
+ { "overhead_size", KSTAT_DATA_UINT64 },
{ "hdr_size", KSTAT_DATA_UINT64 },
{ "data_size", KSTAT_DATA_UINT64 },
+ { "metadata_size", KSTAT_DATA_UINT64 },
{ "other_size", KSTAT_DATA_UINT64 },
+ { "anon_size", KSTAT_DATA_UINT64 },
+ { "anon_evictable_data", KSTAT_DATA_UINT64 },
+ { "anon_evictable_metadata", KSTAT_DATA_UINT64 },
+ { "mru_size", KSTAT_DATA_UINT64 },
+ { "mru_evictable_data", KSTAT_DATA_UINT64 },
+ { "mru_evictable_metadata", KSTAT_DATA_UINT64 },
+ { "mru_ghost_size", KSTAT_DATA_UINT64 },
+ { "mru_ghost_evictable_data", KSTAT_DATA_UINT64 },
+ { "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
+ { "mfu_size", KSTAT_DATA_UINT64 },
+ { "mfu_evictable_data", KSTAT_DATA_UINT64 },
+ { "mfu_evictable_metadata", KSTAT_DATA_UINT64 },
+ { "mfu_ghost_size", KSTAT_DATA_UINT64 },
+ { "mfu_ghost_evictable_data", KSTAT_DATA_UINT64 },
+ { "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
{ "l2_hits", KSTAT_DATA_UINT64 },
{ "l2_misses", KSTAT_DATA_UINT64 },
{ "l2_feeds", KSTAT_DATA_UINT64 },
@@ -405,9 +796,10 @@
{ "l2_writes_sent", KSTAT_DATA_UINT64 },
{ "l2_writes_done", KSTAT_DATA_UINT64 },
{ "l2_writes_error", KSTAT_DATA_UINT64 },
- { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 },
+ { "l2_writes_lock_retry", KSTAT_DATA_UINT64 },
{ "l2_evict_lock_retry", KSTAT_DATA_UINT64 },
{ "l2_evict_reading", KSTAT_DATA_UINT64 },
+ { "l2_evict_l1cached", KSTAT_DATA_UINT64 },
{ "l2_free_on_write", KSTAT_DATA_UINT64 },
{ "l2_abort_lowmem", KSTAT_DATA_UINT64 },
{ "l2_cksum_bad", KSTAT_DATA_UINT64 },
@@ -415,9 +807,6 @@
{ "l2_size", KSTAT_DATA_UINT64 },
{ "l2_asize", KSTAT_DATA_UINT64 },
{ "l2_hdr_size", KSTAT_DATA_UINT64 },
- { "l2_compress_successes", KSTAT_DATA_UINT64 },
- { "l2_compress_zeros", KSTAT_DATA_UINT64 },
- { "l2_compress_failures", KSTAT_DATA_UINT64 },
{ "l2_write_trylock_fail", KSTAT_DATA_UINT64 },
{ "l2_write_passed_headroom", KSTAT_DATA_UINT64 },
{ "l2_write_spa_mismatch", KSTAT_DATA_UINT64 },
@@ -431,9 +820,12 @@
{ "l2_write_buffer_list_iter", KSTAT_DATA_UINT64 },
{ "l2_write_buffer_list_null_iter", KSTAT_DATA_UINT64 },
{ "memory_throttle_count", KSTAT_DATA_UINT64 },
- { "duplicate_buffers", KSTAT_DATA_UINT64 },
- { "duplicate_buffers_size", KSTAT_DATA_UINT64 },
- { "duplicate_reads", KSTAT_DATA_UINT64 }
+ { "arc_meta_used", KSTAT_DATA_UINT64 },
+ { "arc_meta_limit", KSTAT_DATA_UINT64 },
+ { "arc_meta_max", KSTAT_DATA_UINT64 },
+ { "arc_meta_min", KSTAT_DATA_UINT64 },
+ { "sync_wait_for_async", KSTAT_DATA_UINT64 },
+ { "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
};
#define ARCSTAT(stat) (arc_stats.stat.value.ui64)
@@ -495,23 +887,22 @@
#define arc_c ARCSTAT(arcstat_c) /* target size of cache */
#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */
#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */
+#define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */
+#define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */
+#define arc_meta_used ARCSTAT(arcstat_meta_used) /* size of metadata */
+#define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */
-#define L2ARC_IS_VALID_COMPRESS(_c_) \
- ((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)
+/* compressed size of entire arc */
+#define arc_compressed_size ARCSTAT(arcstat_compressed_size)
+/* uncompressed size of entire arc */
+#define arc_uncompressed_size ARCSTAT(arcstat_uncompressed_size)
+/* number of bytes in the arc from arc_buf_t's */
+#define arc_overhead_size ARCSTAT(arcstat_overhead_size)
static int arc_no_grow; /* Don't try to grow cache size */
static uint64_t arc_tempreserve;
static uint64_t arc_loaned_bytes;
-static uint64_t arc_meta_used;
-static uint64_t arc_meta_limit;
-static uint64_t arc_meta_max = 0;
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_meta_used, CTLFLAG_RDTUN,
- &arc_meta_used, 0, "ARC metadata used");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, arc_meta_limit, CTLFLAG_RDTUN,
- &arc_meta_limit, 0, "ARC metadata limit");
-typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
-
typedef struct arc_callback arc_callback_t;
struct arc_callback {
@@ -527,36 +918,64 @@
struct arc_write_callback {
void *awcb_private;
arc_done_func_t *awcb_ready;
+ arc_done_func_t *awcb_children_ready;
+ arc_done_func_t *awcb_physdone;
arc_done_func_t *awcb_done;
arc_buf_t *awcb_buf;
};
-struct arc_buf_hdr {
- /* protected by hash lock */
- dva_t b_dva;
- uint64_t b_birth;
- uint64_t b_cksum0;
-
+/*
+ * ARC buffers are separated into multiple structs as a memory saving measure:
+ * - Common fields struct, always defined, and embedded within it:
+ * - L2-only fields, always allocated but undefined when not in L2ARC
+ * - L1-only fields, only allocated when in L1ARC
+ *
+ * Buffer in L1 Buffer only in L2
+ * +------------------------+ +------------------------+
+ * | arc_buf_hdr_t | | arc_buf_hdr_t |
+ * | | | |
+ * | | | |
+ * | | | |
+ * +------------------------+ +------------------------+
+ * | l2arc_buf_hdr_t | | l2arc_buf_hdr_t |
+ * | (undefined if L1-only) | | |
+ * +------------------------+ +------------------------+
+ * | l1arc_buf_hdr_t |
+ * | |
+ * | |
+ * | |
+ * | |
+ * +------------------------+
+ *
+ * Because it's possible for the L2ARC to become extremely large, we can wind
+ * up eating a lot of memory in L2ARC buffer headers, so the size of a header
+ * is minimized by only allocating the fields necessary for an L1-cached buffer
+ * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and
+ * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple
+ * words in pointers. arc_hdr_realloc() is used to switch a header between
+ * these two allocation states.
+ */
+typedef struct l1arc_buf_hdr {
kmutex_t b_freeze_lock;
zio_cksum_t *b_freeze_cksum;
+#ifdef ZFS_DEBUG
+ /*
+ * used for debugging with kmem_flags - by allocating and freeing
+ * b_thawed when the buffer is thawed, we get a record of the stack
+ * trace that thawed it.
+ */
void *b_thawed;
+#endif
- arc_buf_hdr_t *b_hash_next;
arc_buf_t *b_buf;
- uint32_t b_flags;
- uint32_t b_datacnt;
-
- arc_callback_t *b_acb;
+ uint32_t b_bufcnt;
+ /* for waiting on writes to complete */
kcondvar_t b_cv;
+ uint8_t b_byteswap;
- /* immutable */
- arc_buf_contents_t b_type;
- uint64_t b_size;
- uint64_t b_spa;
-
/* protected by arc state mutex */
arc_state_t *b_state;
- list_node_t b_arc_node;
+ multilist_node_t b_arc_node;
/* updated atomically */
clock_t b_arc_access;
@@ -564,66 +983,201 @@
/* self protecting */
refcount_t b_refcnt;
- l2arc_buf_hdr_t *b_l2hdr;
+ arc_callback_t *b_acb;
+ void *b_pdata;
+} l1arc_buf_hdr_t;
+
+typedef struct l2arc_dev l2arc_dev_t;
+
+typedef struct l2arc_buf_hdr {
+ /* protected by arc_buf_hdr mutex */
+ l2arc_dev_t *b_dev; /* L2ARC device */
+ uint64_t b_daddr; /* disk address, offset byte */
+
list_node_t b_l2node;
+} l2arc_buf_hdr_t;
+
+struct arc_buf_hdr {
+ /* protected by hash lock */
+ dva_t b_dva;
+ uint64_t b_birth;
+
+ arc_buf_contents_t b_type;
+ arc_buf_hdr_t *b_hash_next;
+ arc_flags_t b_flags;
+
+ /*
+ * This field stores the size of the data buffer after
+ * compression, and is set in the arc's zio completion handlers.
+ * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes).
+ *
+ * While the block pointers can store up to 32MB in their psize
+ * field, we can only store up to 32MB minus 512B. This is due
+ * to the bp using a bias of 1, whereas we use a bias of 0 (i.e.
+ * a field of zeros represents 512B in the bp). We can't use a
+ * bias of 1 since we need to reserve a psize of zero, here, to
+ * represent holes and embedded blocks.
+ *
+ * This isn't a problem in practice, since the maximum size of a
+ * buffer is limited to 16MB, so we never need to store 32MB in
+ * this field. Even in the upstream illumos code base, the
+ * maximum size of a buffer is limited to 16MB.
+ */
+ uint16_t b_psize;
+
+ /*
+ * This field stores the size of the data buffer before
+ * compression, and cannot change once set. It is in units
+ * of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes)
+ */
+ uint16_t b_lsize; /* immutable */
+ uint64_t b_spa; /* immutable */
+
+ /* L2ARC fields. Undefined when not in L2ARC. */
+ l2arc_buf_hdr_t b_l2hdr;
+ /* L1ARC fields. Undefined when in l2arc_only state */
+ l1arc_buf_hdr_t b_l1hdr;
};
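For reference, b_psize and b_lsize above are stored in SPA_MINBLOCKSIZE (512-byte) units rather than bytes. A minimal sketch of that conversion, assuming SPA_MINBLOCKSHIFT == 9 (the real code uses the HDR_GET_PSIZE()/HDR_GET_LSIZE() macros, which are not part of this hunk; the helper names below are hypothetical):

    /*
     * Illustrative sketch only: shows the 512-byte unit encoding of
     * b_psize/b_lsize, not the actual accessor macros used by arc.c.
     */
    static inline uint64_t
    example_hdr_units_to_bytes(uint16_t units)      /* hypothetical */
    {
            return ((uint64_t)units << 9);  /* 9 == SPA_MINBLOCKSHIFT */
    }

    static inline uint16_t
    example_bytes_to_hdr_units(uint64_t bytes)      /* hypothetical */
    {
            /* sizes tracked here are always multiples of 512 bytes */
            return ((uint16_t)(bytes >> 9));
    }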
-static arc_buf_t *arc_eviction_list;
-static kmutex_t arc_eviction_mtx;
-static arc_buf_hdr_t arc_eviction_hdr;
-static void arc_get_data_buf(arc_buf_t *buf);
-static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
-static int arc_evict_needed(arc_buf_contents_t type);
-static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes);
-#ifdef illumos
-static void arc_buf_watch(arc_buf_t *buf);
-#endif /* illumos */
+#if defined(__FreeBSD__) && defined(_KERNEL)
+static int
+sysctl_vfs_zfs_arc_meta_limit(SYSCTL_HANDLER_ARGS)
+{
+ uint64_t val;
+ int err;
-static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab);
+ val = arc_meta_limit;
+ err = sysctl_handle_64(oidp, &val, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+ if (val <= 0 || val > arc_c_max)
+ return (EINVAL);
+
+ arc_meta_limit = val;
+ return (0);
+}
+
+static int
+sysctl_vfs_zfs_arc_max(SYSCTL_HANDLER_ARGS)
+{
+ uint64_t val;
+ int err;
+
+ val = zfs_arc_max;
+ err = sysctl_handle_64(oidp, &val, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+
+ if (zfs_arc_max == 0) {
+ /* Loader tunable so blindly set */
+ zfs_arc_max = val;
+ return (0);
+ }
+
+ if (val < arc_abs_min || val > kmem_size())
+ return (EINVAL);
+ if (val < arc_c_min)
+ return (EINVAL);
+ if (zfs_arc_meta_limit > 0 && val < zfs_arc_meta_limit)
+ return (EINVAL);
+
+ arc_c_max = val;
+
+ arc_c = arc_c_max;
+ arc_p = (arc_c >> 1);
+
+ if (zfs_arc_meta_limit == 0) {
+ /* limit meta-data to 1/4 of the arc capacity */
+ arc_meta_limit = arc_c_max / 4;
+ }
+
+ /* if kmem_flags are set, lets try to use less memory */
+ if (kmem_debugging())
+ arc_c = arc_c / 2;
+
+ zfs_arc_max = arc_c;
+
+ return (0);
+}
+
+static int
+sysctl_vfs_zfs_arc_min(SYSCTL_HANDLER_ARGS)
+{
+ uint64_t val;
+ int err;
+
+ val = zfs_arc_min;
+ err = sysctl_handle_64(oidp, &val, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+
+ if (zfs_arc_min == 0) {
+ /* Loader tunable so blindly set */
+ zfs_arc_min = val;
+ return (0);
+ }
+
+ if (val < arc_abs_min || val > arc_c_max)
+ return (EINVAL);
+
+ arc_c_min = val;
+
+ if (zfs_arc_meta_min == 0)
+ arc_meta_min = arc_c_min / 2;
+
+ if (arc_c < arc_c_min)
+ arc_c = arc_c_min;
+
+ zfs_arc_min = arc_c_min;
+
+ return (0);
+}
+#endif
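These handlers only take effect once they are attached to the vfs.zfs sysctl tree. A hedged sketch of the kind of SYSCTL_PROC registration that typically accompanies them (the exact flags and description strings are assumptions and are not taken from this hunk):

    /* Sketch only: approximate registration for the handlers above. */
    SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_max,
        CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RWTUN,
        0, sizeof(uint64_t), sysctl_vfs_zfs_arc_max, "QU",
        "Maximum ARC size");
    SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_min,
        CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RWTUN,
        0, sizeof(uint64_t), sysctl_vfs_zfs_arc_min, "QU",
        "Minimum ARC size");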
+
#define GHOST_STATE(state) \
((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \
(state) == arc_l2c_only)
-/*
- * Private ARC flags. These flags are private ARC only flags that will show up
- * in b_flags in the arc_hdr_buf_t. Some flags are publicly declared, and can
- * be passed in as arc_flags in things like arc_read. However, these flags
- * should never be passed and should only be set by ARC code. When adding new
- * public flags, make sure not to smash the private ones.
- */
+#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE)
+#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
+#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR)
+#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH)
+#define HDR_COMPRESSION_ENABLED(hdr) \
+ ((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC)
-#define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */
-#define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */
-#define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */
-#define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */
-#define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */
-#define ARC_INDIRECT (1 << 14) /* this is an indirect block */
-#define ARC_FREE_IN_PROGRESS (1 << 15) /* hdr about to be freed */
-#define ARC_L2_WRITING (1 << 16) /* L2ARC write in progress */
-#define ARC_L2_EVICTED (1 << 17) /* evicted during I/O */
-#define ARC_L2_WRITE_HEAD (1 << 18) /* head of write list */
+#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE)
+#define HDR_L2_READING(hdr) \
+ (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \
+ ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
+#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING)
+#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED)
+#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD)
+#define HDR_SHARED_DATA(hdr) ((hdr)->b_flags & ARC_FLAG_SHARED_DATA)
-#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE)
-#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
-#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR)
-#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_PREFETCH)
-#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ)
-#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE)
-#define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
-#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_L2CACHE)
-#define HDR_L2_READING(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS && \
- (hdr)->b_l2hdr != NULL)
-#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_L2_WRITING)
-#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_L2_EVICTED)
-#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_L2_WRITE_HEAD)
+#define HDR_ISTYPE_METADATA(hdr) \
+ ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA)
+#define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr))
+#define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR)
+#define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)
+
+/* For storing compression mode in b_flags */
+#define HDR_COMPRESS_OFFSET (highbit64(ARC_FLAG_COMPRESS_0) - 1)
+
+#define HDR_GET_COMPRESS(hdr) ((enum zio_compress)BF32_GET((hdr)->b_flags, \
+ HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS))
+#define HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \
+ HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp));
+
+#define ARC_BUF_LAST(buf) ((buf)->b_next == NULL)
+
/*
* Other sizes
*/
-#define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
-#define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t))
+#define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
+#define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr))
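HDR_FULL_SIZE and HDR_L2ONLY_SIZE back the split-header scheme described above: an L2-only header is allocated only up to (but not including) the b_l1hdr member. A minimal sketch, with hypothetical helper names, of how callers are expected to guard the sub-headers with the HDR_HAS_L1HDR()/HDR_HAS_L2HDR() tests before touching them:

    /* Sketch: b_l1hdr is not even allocated for an L2-only header. */
    static uint64_t
    example_hdr_daddr(arc_buf_hdr_t *hdr)           /* hypothetical */
    {
            ASSERT(HDR_HAS_L2HDR(hdr));
            return (hdr->b_l2hdr.b_daddr);
    }

    static arc_state_t *
    example_hdr_state(arc_buf_hdr_t *hdr)           /* hypothetical */
    {
            ASSERT(HDR_HAS_L1HDR(hdr));
            return (hdr->b_l1hdr.b_state);
    }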
/*
* Hash table routines
@@ -703,68 +1257,74 @@
&l2arc_norw, 0, "no reads during writes");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD,
- &ARC_anon.arcs_size, 0, "size of anonymous state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_lsize, CTLFLAG_RD,
- &ARC_anon.arcs_lsize[ARC_BUFC_METADATA], 0, "size of anonymous state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_lsize, CTLFLAG_RD,
- &ARC_anon.arcs_lsize[ARC_BUFC_DATA], 0, "size of anonymous state");
+ &ARC_anon.arcs_size.rc_count, 0, "size of anonymous state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_esize, CTLFLAG_RD,
+ &ARC_anon.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
+ "size of anonymous state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_esize, CTLFLAG_RD,
+ &ARC_anon.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
+ "size of anonymous state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD,
- &ARC_mru.arcs_size, 0, "size of mru state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_lsize, CTLFLAG_RD,
- &ARC_mru.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mru state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_lsize, CTLFLAG_RD,
- &ARC_mru.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mru state");
+ &ARC_mru.arcs_size.rc_count, 0, "size of mru state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_esize, CTLFLAG_RD,
+ &ARC_mru.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
+ "size of metadata in mru state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_esize, CTLFLAG_RD,
+ &ARC_mru.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
+ "size of data in mru state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD,
- &ARC_mru_ghost.arcs_size, 0, "size of mru ghost state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_lsize, CTLFLAG_RD,
- &ARC_mru_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
+ &ARC_mru_ghost.arcs_size.rc_count, 0, "size of mru ghost state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_esize, CTLFLAG_RD,
+ &ARC_mru_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
"size of metadata in mru ghost state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_lsize, CTLFLAG_RD,
- &ARC_mru_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_esize, CTLFLAG_RD,
+ &ARC_mru_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
"size of data in mru ghost state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD,
- &ARC_mfu.arcs_size, 0, "size of mfu state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_lsize, CTLFLAG_RD,
- &ARC_mfu.arcs_lsize[ARC_BUFC_METADATA], 0, "size of metadata in mfu state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_lsize, CTLFLAG_RD,
- &ARC_mfu.arcs_lsize[ARC_BUFC_DATA], 0, "size of data in mfu state");
+ &ARC_mfu.arcs_size.rc_count, 0, "size of mfu state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_esize, CTLFLAG_RD,
+ &ARC_mfu.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
+ "size of metadata in mfu state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_esize, CTLFLAG_RD,
+ &ARC_mfu.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
+ "size of data in mfu state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_size, CTLFLAG_RD,
- &ARC_mfu_ghost.arcs_size, 0, "size of mfu ghost state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_lsize, CTLFLAG_RD,
- &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_METADATA], 0,
+ &ARC_mfu_ghost.arcs_size.rc_count, 0, "size of mfu ghost state");
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_esize, CTLFLAG_RD,
+ &ARC_mfu_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0,
"size of metadata in mfu ghost state");
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_lsize, CTLFLAG_RD,
- &ARC_mfu_ghost.arcs_lsize[ARC_BUFC_DATA], 0,
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD,
+ &ARC_mfu_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0,
"size of data in mfu ghost state");
SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD,
- &ARC_l2c_only.arcs_size, 0, "size of mru state");
+ &ARC_l2c_only.arcs_size.rc_count, 0, "size of mru state");
/*
* L2ARC Internals
*/
-typedef struct l2arc_dev {
+struct l2arc_dev {
vdev_t *l2ad_vdev; /* vdev */
spa_t *l2ad_spa; /* spa */
uint64_t l2ad_hand; /* next write location */
uint64_t l2ad_start; /* first addr on device */
uint64_t l2ad_end; /* last addr on device */
- uint64_t l2ad_evict; /* last addr eviction reached */
boolean_t l2ad_first; /* first sweep through */
boolean_t l2ad_writing; /* currently writing */
- list_t *l2ad_buflist; /* buffer list */
+ kmutex_t l2ad_mtx; /* lock for buffer list */
+ list_t l2ad_buflist; /* buffer list */
list_node_t l2ad_node; /* device list node */
-} l2arc_dev_t;
+ refcount_t l2ad_alloc; /* allocated bytes */
+};
static list_t L2ARC_dev_list; /* device list */
static list_t *l2arc_dev_list; /* device list pointer */
static kmutex_t l2arc_dev_mtx; /* device list mutex */
static l2arc_dev_t *l2arc_dev_last; /* last device used */
-static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */
static list_t L2ARC_free_on_write; /* free after write buf list */
static list_t *l2arc_free_on_write; /* free after write list ptr */
static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */
@@ -771,12 +1331,11 @@
static uint64_t l2arc_ndev; /* number of devices */
typedef struct l2arc_read_callback {
- arc_buf_t *l2rcb_buf; /* read buffer */
- spa_t *l2rcb_spa; /* spa */
+ arc_buf_hdr_t *l2rcb_hdr; /* read buffer */
blkptr_t l2rcb_bp; /* original blkptr */
- zbookmark_t l2rcb_zb; /* original bookmark */
+ zbookmark_phys_t l2rcb_zb; /* original bookmark */
int l2rcb_flags; /* original flags */
- enum zio_compress l2rcb_compress; /* applied compress */
+ void *l2rcb_data; /* temporary buffer */
} l2arc_read_callback_t;
typedef struct l2arc_write_callback {
@@ -784,23 +1343,11 @@
arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
} l2arc_write_callback_t;
-struct l2arc_buf_hdr {
- /* protected by arc_buf_hdr mutex */
- l2arc_dev_t *b_dev; /* L2ARC device */
- uint64_t b_daddr; /* disk address, offset byte */
- /* compression applied to buffer data */
- enum zio_compress b_compress;
- /* real alloc'd buffer size depending on b_compress applied */
- int b_asize;
- /* temporary buffer holder for in-flight compressed data */
- void *b_tmp_cdata;
-};
-
typedef struct l2arc_data_free {
/* protected by l2arc_free_on_write_mtx */
void *l2df_data;
size_t l2df_size;
- void (*l2df_func)(void *, size_t);
+ arc_buf_contents_t l2df_type;
list_node_t l2df_list_node;
} l2arc_data_free_t;
@@ -808,15 +1355,36 @@
static kcondvar_t l2arc_feed_thr_cv;
static uint8_t l2arc_thread_exit;
-static void l2arc_read_done(zio_t *zio);
-static void l2arc_hdr_stat_add(void);
-static void l2arc_hdr_stat_remove(void);
+static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *);
+static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *);
+static void arc_hdr_free_pdata(arc_buf_hdr_t *hdr);
+static void arc_hdr_alloc_pdata(arc_buf_hdr_t *);
+static void arc_access(arc_buf_hdr_t *, kmutex_t *);
+static boolean_t arc_is_overflowing();
+static void arc_buf_watch(arc_buf_t *);
-static boolean_t l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr);
-static void l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr,
- enum zio_compress c);
-static void l2arc_release_cdata_buf(arc_buf_hdr_t *ab);
+static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
+static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
+static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
+static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
+static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
+static void l2arc_read_done(zio_t *);
+
+static void
+l2arc_trim(const arc_buf_hdr_t *hdr)
+{
+ l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
+
+ ASSERT(HDR_HAS_L2HDR(hdr));
+ ASSERT(MUTEX_HELD(&dev->l2ad_mtx));
+
+ if (HDR_GET_PSIZE(hdr) != 0) {
+ trim_map_free(dev->l2ad_vdev, hdr->b_l2hdr.b_daddr,
+ HDR_GET_PSIZE(hdr), 0);
+ }
+}
+
static uint64_t
buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
{
@@ -834,15 +1402,14 @@
return (crc);
}
-#define BUF_EMPTY(buf) \
- ((buf)->b_dva.dva_word[0] == 0 && \
- (buf)->b_dva.dva_word[1] == 0 && \
- (buf)->b_birth == 0)
+#define HDR_EMPTY(hdr) \
+ ((hdr)->b_dva.dva_word[0] == 0 && \
+ (hdr)->b_dva.dva_word[1] == 0)
-#define BUF_EQUAL(spa, dva, birth, buf) \
- ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \
- ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \
- ((buf)->b_birth == birth) && ((buf)->b_spa == spa)
+#define HDR_EQUAL(spa, dva, birth, hdr) \
+ ((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \
+ ((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \
+ ((hdr)->b_birth == birth) && ((hdr)->b_spa == spa)
static void
buf_discard_identity(arc_buf_hdr_t *hdr)
@@ -850,22 +1417,23 @@
hdr->b_dva.dva_word[0] = 0;
hdr->b_dva.dva_word[1] = 0;
hdr->b_birth = 0;
- hdr->b_cksum0 = 0;
}
static arc_buf_hdr_t *
-buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp)
+buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
{
+ const dva_t *dva = BP_IDENTITY(bp);
+ uint64_t birth = BP_PHYSICAL_BIRTH(bp);
uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
- arc_buf_hdr_t *buf;
+ arc_buf_hdr_t *hdr;
mutex_enter(hash_lock);
- for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
- buf = buf->b_hash_next) {
- if (BUF_EQUAL(spa, dva, birth, buf)) {
+ for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL;
+ hdr = hdr->b_hash_next) {
+ if (HDR_EQUAL(spa, dva, birth, hdr)) {
*lockp = hash_lock;
- return (buf);
+ return (hdr);
}
}
mutex_exit(hash_lock);
@@ -878,28 +1446,37 @@
* equal to elem in the hash table, then the already existing element
* will be returned and the new element will not be inserted.
* Otherwise returns NULL.
+ * If lockp == NULL, the caller is assumed to already hold the hash lock.
*/
static arc_buf_hdr_t *
-buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
+buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp)
{
- uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
+ uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
- arc_buf_hdr_t *fbuf;
+ arc_buf_hdr_t *fhdr;
uint32_t i;
- ASSERT(!HDR_IN_HASH_TABLE(buf));
- *lockp = hash_lock;
- mutex_enter(hash_lock);
- for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
- fbuf = fbuf->b_hash_next, i++) {
- if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
- return (fbuf);
+ ASSERT(!DVA_IS_EMPTY(&hdr->b_dva));
+ ASSERT(hdr->b_birth != 0);
+ ASSERT(!HDR_IN_HASH_TABLE(hdr));
+
+ if (lockp != NULL) {
+ *lockp = hash_lock;
+ mutex_enter(hash_lock);
+ } else {
+ ASSERT(MUTEX_HELD(hash_lock));
}
- buf->b_hash_next = buf_hash_table.ht_table[idx];
- buf_hash_table.ht_table[idx] = buf;
- buf->b_flags |= ARC_IN_HASH_TABLE;
+ for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL;
+ fhdr = fhdr->b_hash_next, i++) {
+ if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr))
+ return (fhdr);
+ }
+ hdr->b_hash_next = buf_hash_table.ht_table[idx];
+ buf_hash_table.ht_table[idx] = hdr;
+ arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
+
/* collect some hash table performance data */
if (i > 0) {
ARCSTAT_BUMP(arcstat_hash_collisions);
@@ -916,22 +1493,22 @@
}
static void
-buf_hash_remove(arc_buf_hdr_t *buf)
+buf_hash_remove(arc_buf_hdr_t *hdr)
{
- arc_buf_hdr_t *fbuf, **bufp;
- uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
+ arc_buf_hdr_t *fhdr, **hdrp;
+ uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
- ASSERT(HDR_IN_HASH_TABLE(buf));
+ ASSERT(HDR_IN_HASH_TABLE(hdr));
- bufp = &buf_hash_table.ht_table[idx];
- while ((fbuf = *bufp) != buf) {
- ASSERT(fbuf != NULL);
- bufp = &fbuf->b_hash_next;
+ hdrp = &buf_hash_table.ht_table[idx];
+ while ((fhdr = *hdrp) != hdr) {
+ ASSERT3P(fhdr, !=, NULL);
+ hdrp = &fhdr->b_hash_next;
}
- *bufp = buf->b_hash_next;
- buf->b_hash_next = NULL;
- buf->b_flags &= ~ARC_IN_HASH_TABLE;
+ *hdrp = hdr->b_hash_next;
+ hdr->b_hash_next = NULL;
+ arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
/* collect some hash table performance data */
ARCSTAT_BUMPDOWN(arcstat_hash_elements);
@@ -944,7 +1521,8 @@
/*
* Global data structures and functions for the buf kmem cache.
*/
-static kmem_cache_t *hdr_cache;
+static kmem_cache_t *hdr_full_cache;
+static kmem_cache_t *hdr_l2only_cache;
static kmem_cache_t *buf_cache;
static void
@@ -956,7 +1534,8 @@
(buf_hash_table.ht_mask + 1) * sizeof (void *));
for (i = 0; i < BUF_LOCKS; i++)
mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
- kmem_cache_destroy(hdr_cache);
+ kmem_cache_destroy(hdr_full_cache);
+ kmem_cache_destroy(hdr_l2only_cache);
kmem_cache_destroy(buf_cache);
}
@@ -966,15 +1545,16 @@
*/
/* ARGSUSED */
static int
-hdr_cons(void *vbuf, void *unused, int kmflag)
+hdr_full_cons(void *vbuf, void *unused, int kmflag)
{
- arc_buf_hdr_t *buf = vbuf;
+ arc_buf_hdr_t *hdr = vbuf;
- bzero(buf, sizeof (arc_buf_hdr_t));
- refcount_create(&buf->b_refcnt);
- cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
- mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
- arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
+ bzero(hdr, HDR_FULL_SIZE);
+ cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL);
+ refcount_create(&hdr->b_l1hdr.b_refcnt);
+ mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
+ multilist_link_init(&hdr->b_l1hdr.b_arc_node);
+ arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS);
return (0);
}
@@ -981,6 +1561,18 @@
/* ARGSUSED */
static int
+hdr_l2only_cons(void *vbuf, void *unused, int kmflag)
+{
+ arc_buf_hdr_t *hdr = vbuf;
+
+ bzero(hdr, HDR_L2ONLY_SIZE);
+ arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
+
+ return (0);
+}
+
+/* ARGSUSED */
+static int
buf_cons(void *vbuf, void *unused, int kmflag)
{
arc_buf_t *buf = vbuf;
@@ -998,19 +1590,30 @@
*/
/* ARGSUSED */
static void
-hdr_dest(void *vbuf, void *unused)
+hdr_full_dest(void *vbuf, void *unused)
{
- arc_buf_hdr_t *buf = vbuf;
+ arc_buf_hdr_t *hdr = vbuf;
- ASSERT(BUF_EMPTY(buf));
- refcount_destroy(&buf->b_refcnt);
- cv_destroy(&buf->b_cv);
- mutex_destroy(&buf->b_freeze_lock);
- arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS);
+ ASSERT(HDR_EMPTY(hdr));
+ cv_destroy(&hdr->b_l1hdr.b_cv);
+ refcount_destroy(&hdr->b_l1hdr.b_refcnt);
+ mutex_destroy(&hdr->b_l1hdr.b_freeze_lock);
+ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
+ arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS);
}
/* ARGSUSED */
static void
+hdr_l2only_dest(void *vbuf, void *unused)
+{
+ arc_buf_hdr_t *hdr = vbuf;
+
+ ASSERT(HDR_EMPTY(hdr));
+ arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
+}
+
+/* ARGSUSED */
+static void
buf_dest(void *vbuf, void *unused)
{
arc_buf_t *buf = vbuf;
@@ -1032,7 +1635,7 @@
* which is after we do arc_fini().
*/
if (!arc_dead)
- cv_signal(&arc_reclaim_thr_cv);
+ cv_signal(&arc_reclaim_thread_cv);
}
static void
@@ -1044,10 +1647,11 @@
/*
* The hash table is big enough to fill all of physical memory
- * with an average 64K block size. The table will take up
- * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers).
+ * with an average block size of zfs_arc_average_blocksize (default 8K).
+ * By default, the table will take up
+ * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
*/
- while (hsize * 65536 < (uint64_t)physmem * PAGESIZE)
+ while (hsize * zfs_arc_average_blocksize < (uint64_t)physmem * PAGESIZE)
hsize <<= 1;
retry:
buf_hash_table.ht_mask = hsize - 1;
@@ -1059,8 +1663,11 @@
goto retry;
}
- hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
- 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
+ hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE,
+ 0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0);
+ hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only",
+ HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl,
+ NULL, NULL, 0);
buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
@@ -1076,58 +1683,139 @@
#define ARC_MINTIME (hz>>4) /* 62 ms */
+static inline boolean_t
+arc_buf_is_shared(arc_buf_t *buf)
+{
+ boolean_t shared = (buf->b_data != NULL &&
+ buf->b_data == buf->b_hdr->b_l1hdr.b_pdata);
+ IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr));
+ return (shared);
+}
+
+static inline void
+arc_cksum_free(arc_buf_hdr_t *hdr)
+{
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
+ if (hdr->b_l1hdr.b_freeze_cksum != NULL) {
+ kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t));
+ hdr->b_l1hdr.b_freeze_cksum = NULL;
+ }
+ mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
+}
+
static void
arc_cksum_verify(arc_buf_t *buf)
{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
zio_cksum_t zc;
if (!(zfs_flags & ZFS_DEBUG_MODIFY))
return;
- mutex_enter(&buf->b_hdr->b_freeze_lock);
- if (buf->b_hdr->b_freeze_cksum == NULL ||
- (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
- mutex_exit(&buf->b_hdr->b_freeze_lock);
+ ASSERT(HDR_HAS_L1HDR(hdr));
+
+ mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
+ if (hdr->b_l1hdr.b_freeze_cksum == NULL || HDR_IO_ERROR(hdr)) {
+ mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
return;
}
- fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
- if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
+ fletcher_2_native(buf->b_data, HDR_GET_LSIZE(hdr), NULL, &zc);
+ if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc))
panic("buffer modified while frozen!");
- mutex_exit(&buf->b_hdr->b_freeze_lock);
+ mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
}
-static int
-arc_cksum_equal(arc_buf_t *buf)
+static boolean_t
+arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio)
{
- zio_cksum_t zc;
- int equal;
+ enum zio_compress compress = BP_GET_COMPRESS(zio->io_bp);
+ boolean_t valid_cksum;
- mutex_enter(&buf->b_hdr->b_freeze_lock);
- fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
- equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
- mutex_exit(&buf->b_hdr->b_freeze_lock);
+ ASSERT(!BP_IS_EMBEDDED(zio->io_bp));
+ VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr));
- return (equal);
+ /*
+ * We rely on the blkptr's checksum to determine if the block
+ * is valid or not. When compressed arc is enabled, the l2arc
+ * writes the block to the l2arc just as it appears in the pool.
+ * This allows us to use the blkptr's checksum to validate the
+ * data that we just read off of the l2arc without having to store
+ * a separate checksum in the arc_buf_hdr_t. However, if compressed
+ * arc is disabled, then the data written to the l2arc is always
+ * uncompressed and won't match the block as it exists in the main
+ * pool. When this is the case, we must first compress it if it is
+ * compressed on the main pool before we can validate the checksum.
+ */
+ if (!HDR_COMPRESSION_ENABLED(hdr) && compress != ZIO_COMPRESS_OFF) {
+ ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
+ uint64_t lsize = HDR_GET_LSIZE(hdr);
+ uint64_t csize;
+
+ void *cbuf = zio_buf_alloc(HDR_GET_PSIZE(hdr));
+ csize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
+ ASSERT3U(csize, <=, HDR_GET_PSIZE(hdr));
+ if (csize < HDR_GET_PSIZE(hdr)) {
+ /*
+ * Compressed blocks are always a multiple of the
+ * smallest ashift in the pool. Ideally, we would
+ * like to round up the csize to the next
+ * spa_min_ashift but that value may have changed
+ * since the block was last written. Instead,
+ * we rely on the fact that the hdr's psize
+ * was set to the psize of the block when it was
+ * last written. We set the csize to that value
+ * and zero out any part that should not contain
+ * data.
+ */
+ bzero((char *)cbuf + csize, HDR_GET_PSIZE(hdr) - csize);
+ csize = HDR_GET_PSIZE(hdr);
+ }
+ zio_push_transform(zio, cbuf, csize, HDR_GET_PSIZE(hdr), NULL);
+ }
+
+ /*
+ * Block pointers always store the checksum for the logical data.
+ * If the block pointer has the gang bit set, then the checksum
+ * it represents is for the reconstituted data and not for an
+ * individual gang member. The zio pipeline, however, must be able to
+ * determine the checksum of each of the gang constituents so it
+ * treats the checksum comparison differently than what we need
+ * for l2arc blocks. This prevents us from using the
+ * zio_checksum_error() interface directly. Instead we must call the
+ * zio_checksum_error_impl() so that we can ensure the checksum is
+ * generated using the correct checksum algorithm and accounts for the
+ * logical I/O size and not just a gang fragment.
+ */
+ valid_cksum = (zio_checksum_error_impl(zio->io_spa, zio->io_bp,
+ BP_GET_CHECKSUM(zio->io_bp), zio->io_data, zio->io_size,
+ zio->io_offset, NULL) == 0);
+ zio_pop_transforms(zio);
+ return (valid_cksum);
}
static void
-arc_cksum_compute(arc_buf_t *buf, boolean_t force)
+arc_cksum_compute(arc_buf_t *buf)
{
- if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+
+ if (!(zfs_flags & ZFS_DEBUG_MODIFY))
return;
- mutex_enter(&buf->b_hdr->b_freeze_lock);
- if (buf->b_hdr->b_freeze_cksum != NULL) {
- mutex_exit(&buf->b_hdr->b_freeze_lock);
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
+ if (hdr->b_l1hdr.b_freeze_cksum != NULL) {
+ mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
return;
}
- buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
- fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
- buf->b_hdr->b_freeze_cksum);
- mutex_exit(&buf->b_hdr->b_freeze_lock);
+ hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
+ KM_SLEEP);
+ fletcher_2_native(buf->b_data, HDR_GET_LSIZE(hdr), NULL,
+ hdr->b_l1hdr.b_freeze_cksum);
+ mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
#ifdef illumos
arc_buf_watch(buf);
-#endif /* illumos */
+#endif
}
#ifdef illumos
@@ -1166,7 +1854,7 @@
procctl_t ctl;
ctl.cmd = PCWATCH;
ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
- ctl.prwatch.pr_size = buf->b_hdr->b_size;
+ ctl.prwatch.pr_size = HDR_GET_LSIZE(buf->b_hdr);
ctl.prwatch.pr_wflags = WA_WRITE;
result = write(arc_procfd, &ctl, sizeof (ctl));
ASSERT3U(result, ==, sizeof (ctl));
@@ -1175,225 +1863,529 @@
}
#endif /* illumos */
+static arc_buf_contents_t
+arc_buf_type(arc_buf_hdr_t *hdr)
+{
+ arc_buf_contents_t type;
+ if (HDR_ISTYPE_METADATA(hdr)) {
+ type = ARC_BUFC_METADATA;
+ } else {
+ type = ARC_BUFC_DATA;
+ }
+ VERIFY3U(hdr->b_type, ==, type);
+ return (type);
+}
+
+static uint32_t
+arc_bufc_to_flags(arc_buf_contents_t type)
+{
+ switch (type) {
+ case ARC_BUFC_DATA:
+ /* metadata field is 0 if buffer contains normal data */
+ return (0);
+ case ARC_BUFC_METADATA:
+ return (ARC_FLAG_BUFC_METADATA);
+ default:
+ break;
+ }
+ panic("undefined ARC buffer type!");
+ return ((uint32_t)-1);
+}
+
void
arc_buf_thaw(arc_buf_t *buf)
{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+
if (zfs_flags & ZFS_DEBUG_MODIFY) {
- if (buf->b_hdr->b_state != arc_anon)
+ if (hdr->b_l1hdr.b_state != arc_anon)
panic("modifying non-anon buffer!");
- if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
+ if (HDR_IO_IN_PROGRESS(hdr))
panic("modifying buffer while i/o in progress!");
arc_cksum_verify(buf);
}
- mutex_enter(&buf->b_hdr->b_freeze_lock);
- if (buf->b_hdr->b_freeze_cksum != NULL) {
- kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
- buf->b_hdr->b_freeze_cksum = NULL;
- }
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ arc_cksum_free(hdr);
+ mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
+#ifdef ZFS_DEBUG
if (zfs_flags & ZFS_DEBUG_MODIFY) {
- if (buf->b_hdr->b_thawed)
- kmem_free(buf->b_hdr->b_thawed, 1);
- buf->b_hdr->b_thawed = kmem_alloc(1, KM_SLEEP);
+ if (hdr->b_l1hdr.b_thawed != NULL)
+ kmem_free(hdr->b_l1hdr.b_thawed, 1);
+ hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP);
}
+#endif
- mutex_exit(&buf->b_hdr->b_freeze_lock);
+ mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
#ifdef illumos
arc_buf_unwatch(buf);
-#endif /* illumos */
+#endif
}
void
arc_buf_freeze(arc_buf_t *buf)
{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
kmutex_t *hash_lock;
if (!(zfs_flags & ZFS_DEBUG_MODIFY))
return;
- hash_lock = HDR_LOCK(buf->b_hdr);
+ hash_lock = HDR_LOCK(hdr);
mutex_enter(hash_lock);
- ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
- buf->b_hdr->b_state == arc_anon);
- arc_cksum_compute(buf, B_FALSE);
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ ASSERT(hdr->b_l1hdr.b_freeze_cksum != NULL ||
+ hdr->b_l1hdr.b_state == arc_anon);
+ arc_cksum_compute(buf);
mutex_exit(hash_lock);
}
+/*
+ * The arc_buf_hdr_t's b_flags should never be modified directly. Instead,
+ * the following functions should be used to ensure that the flags are
+ * updated in a thread-safe way. When manipulating the flags either
+ * the hash_lock must be held or the hdr must be undiscoverable. This
+ * ensures that we're not racing with any other threads when updating
+ * the flags.
+ */
+static inline void
+arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
+{
+ ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+ hdr->b_flags |= flags;
+}
+
+static inline void
+arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
+{
+ ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+ hdr->b_flags &= ~flags;
+}
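A brief sketch of the calling pattern these helpers expect: take the header's hash lock first (unless the header is still undiscoverable), then set or clear flags, so the ASSERT above holds. The wrapper name is hypothetical:

    /* Sketch: flag updates are serialized by the hash lock. */
    static void
    example_mark_prefetch(arc_buf_hdr_t *hdr)       /* hypothetical */
    {
            kmutex_t *hash_lock = HDR_LOCK(hdr);

            mutex_enter(hash_lock);
            arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
            mutex_exit(hash_lock);
    }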
+
+/*
+ * Setting the compression bits in the arc_buf_hdr_t's b_flags is
+ * done in a special way since we have to clear and set bits
+ * at the same time. Consumers that wish to set the compression bits
+ * must use this function to ensure that the flags are updated in
+ * thread-safe manner.
+ */
static void
-get_buf_info(arc_buf_hdr_t *ab, arc_state_t *state, list_t **list, kmutex_t **lock)
+arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp)
{
- uint64_t buf_hashid = buf_hash(ab->b_spa, &ab->b_dva, ab->b_birth);
+ ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
- if (ab->b_type == ARC_BUFC_METADATA)
- buf_hashid &= (ARC_BUFC_NUMMETADATALISTS - 1);
- else {
- buf_hashid &= (ARC_BUFC_NUMDATALISTS - 1);
- buf_hashid += ARC_BUFC_NUMMETADATALISTS;
+ /*
+ * Holes and embedded blocks will always have a psize = 0 so
+ * we ignore the compression of the blkptr and set the
+ * arc_buf_hdr_t's compression to ZIO_COMPRESS_OFF.
+ * Holes and embedded blocks remain anonymous so we don't
+ * want to uncompress them. Mark them as uncompressed.
+ */
+ if (!zfs_compressed_arc_enabled || HDR_GET_PSIZE(hdr) == 0) {
+ arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
+ HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF);
+ ASSERT(!HDR_COMPRESSION_ENABLED(hdr));
+ ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
+ } else {
+ arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
+ HDR_SET_COMPRESS(hdr, cmp);
+ ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp);
+ ASSERT(HDR_COMPRESSION_ENABLED(hdr));
}
+}
- *list = &state->arcs_lists[buf_hashid];
- *lock = ARCS_LOCK(state, buf_hashid);
+static int
+arc_decompress(arc_buf_t *buf)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+ dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap;
+ int error;
+
+ if (arc_buf_is_shared(buf)) {
+ ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
+ } else if (HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) {
+ /*
+ * The arc_buf_hdr_t is either not compressed or is
+ * associated with an embedded block or a hole, in which
+ * case it remains anonymous.
+ */
+ IMPLY(HDR_COMPRESSION_ENABLED(hdr), HDR_GET_PSIZE(hdr) == 0 ||
+ HDR_GET_PSIZE(hdr) == HDR_GET_LSIZE(hdr));
+ ASSERT(!HDR_SHARED_DATA(hdr));
+ bcopy(hdr->b_l1hdr.b_pdata, buf->b_data, HDR_GET_LSIZE(hdr));
+ } else {
+ ASSERT(!HDR_SHARED_DATA(hdr));
+ ASSERT3U(HDR_GET_LSIZE(hdr), !=, HDR_GET_PSIZE(hdr));
+ error = zio_decompress_data(HDR_GET_COMPRESS(hdr),
+ hdr->b_l1hdr.b_pdata, buf->b_data, HDR_GET_PSIZE(hdr),
+ HDR_GET_LSIZE(hdr));
+ if (error != 0) {
+ zfs_dbgmsg("hdr %p, compress %d, psize %d, lsize %d",
+ hdr, HDR_GET_COMPRESS(hdr), HDR_GET_PSIZE(hdr),
+ HDR_GET_LSIZE(hdr));
+ return (SET_ERROR(EIO));
+ }
+ }
+ if (bswap != DMU_BSWAP_NUMFUNCS) {
+ ASSERT(!HDR_SHARED_DATA(hdr));
+ ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS);
+ dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr));
+ }
+ arc_cksum_compute(buf);
+ return (0);
}
+/*
+ * Return the size of the block, b_pdata, that is stored in the arc_buf_hdr_t.
+ */
+static uint64_t
+arc_hdr_size(arc_buf_hdr_t *hdr)
+{
+ uint64_t size;
+ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
+ HDR_GET_PSIZE(hdr) > 0) {
+ size = HDR_GET_PSIZE(hdr);
+ } else {
+ ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0);
+ size = HDR_GET_LSIZE(hdr);
+ }
+ return (size);
+}
+
+/*
+ * Increment the amount of evictable space in the arc_state_t's refcount.
+ * We account for the space used by the hdr and the arc buf individually
+ * so that we can add and remove them from the refcount individually.
+ */
static void
-add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
+arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state)
{
- ASSERT(MUTEX_HELD(hash_lock));
+ arc_buf_contents_t type = arc_buf_type(hdr);
+ uint64_t lsize = HDR_GET_LSIZE(hdr);
- if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
- (ab->b_state != arc_anon)) {
- uint64_t delta = ab->b_size * ab->b_datacnt;
- uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];
- list_t *list;
- kmutex_t *lock;
+ ASSERT(HDR_HAS_L1HDR(hdr));
- get_buf_info(ab, ab->b_state, &list, &lock);
- ASSERT(!MUTEX_HELD(lock));
- mutex_enter(lock);
- ASSERT(list_link_active(&ab->b_arc_node));
- list_remove(list, ab);
- if (GHOST_STATE(ab->b_state)) {
- ASSERT0(ab->b_datacnt);
- ASSERT3P(ab->b_buf, ==, NULL);
- delta = ab->b_size;
+ if (GHOST_STATE(state)) {
+ ASSERT0(hdr->b_l1hdr.b_bufcnt);
+ ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+ ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+ (void) refcount_add_many(&state->arcs_esize[type], lsize, hdr);
+ return;
+ }
+
+ ASSERT(!GHOST_STATE(state));
+ if (hdr->b_l1hdr.b_pdata != NULL) {
+ (void) refcount_add_many(&state->arcs_esize[type],
+ arc_hdr_size(hdr), hdr);
+ }
+ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
+ buf = buf->b_next) {
+ if (arc_buf_is_shared(buf)) {
+ ASSERT(ARC_BUF_LAST(buf));
+ continue;
}
- ASSERT(delta > 0);
- ASSERT3U(*size, >=, delta);
- atomic_add_64(size, -delta);
- mutex_exit(lock);
+ (void) refcount_add_many(&state->arcs_esize[type], lsize, buf);
+ }
+}
+
+/*
+ * Decrement the amount of evictable space in the arc_state_t's refcount.
+ * We account for the space used by the hdr and the arc buf individually
+ * so that we can add and remove them from the refcount individually.
+ */
+static void
+arc_evitable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state)
+{
+ arc_buf_contents_t type = arc_buf_type(hdr);
+ uint64_t lsize = HDR_GET_LSIZE(hdr);
+
+ ASSERT(HDR_HAS_L1HDR(hdr));
+
+ if (GHOST_STATE(state)) {
+ ASSERT0(hdr->b_l1hdr.b_bufcnt);
+ ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+ ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+ (void) refcount_remove_many(&state->arcs_esize[type],
+ lsize, hdr);
+ return;
+ }
+
+ ASSERT(!GHOST_STATE(state));
+ if (hdr->b_l1hdr.b_pdata != NULL) {
+ (void) refcount_remove_many(&state->arcs_esize[type],
+ arc_hdr_size(hdr), hdr);
+ }
+ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
+ buf = buf->b_next) {
+ if (arc_buf_is_shared(buf)) {
+ ASSERT(ARC_BUF_LAST(buf));
+ continue;
+ }
+ (void) refcount_remove_many(&state->arcs_esize[type],
+ lsize, buf);
+ }
+}
+
+/*
+ * Add a reference to this hdr indicating that someone is actively
+ * referencing that memory. When the refcount transitions from 0 to 1,
+ * we remove it from the respective arc_state_t list to indicate that
+ * it is not evictable.
+ */
+static void
+add_reference(arc_buf_hdr_t *hdr, void *tag)
+{
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ if (!MUTEX_HELD(HDR_LOCK(hdr))) {
+ ASSERT(hdr->b_l1hdr.b_state == arc_anon);
+ ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+ ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+ }
+
+ arc_state_t *state = hdr->b_l1hdr.b_state;
+
+ if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) &&
+ (state != arc_anon)) {
+ /* We don't use the L2-only state list. */
+ if (state != arc_l2c_only) {
+ multilist_remove(&state->arcs_list[arc_buf_type(hdr)],
+ hdr);
+ arc_evitable_space_decrement(hdr, state);
+ }
/* remove the prefetch flag if we get a reference */
- if (ab->b_flags & ARC_PREFETCH)
- ab->b_flags &= ~ARC_PREFETCH;
+ arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
}
}
+/*
+ * Remove a reference from this hdr. When the reference transitions from
+ * 1 to 0 and we're not anonymous, then we add this hdr to the arc_state_t's
+ * list making it eligible for eviction.
+ */
static int
-remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
+remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
{
int cnt;
- arc_state_t *state = ab->b_state;
+ arc_state_t *state = hdr->b_l1hdr.b_state;
+ ASSERT(HDR_HAS_L1HDR(hdr));
ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
ASSERT(!GHOST_STATE(state));
- if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
+ /*
+ * arc_l2c_only counts as a ghost state so we don't need to explicitly
+ * check to prevent usage of the arc_l2c_only list.
+ */
+ if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) &&
(state != arc_anon)) {
- uint64_t *size = &state->arcs_lsize[ab->b_type];
- list_t *list;
- kmutex_t *lock;
-
- get_buf_info(ab, state, &list, &lock);
- ASSERT(!MUTEX_HELD(lock));
- mutex_enter(lock);
- ASSERT(!list_link_active(&ab->b_arc_node));
- list_insert_head(list, ab);
- ASSERT(ab->b_datacnt > 0);
- atomic_add_64(size, ab->b_size * ab->b_datacnt);
- mutex_exit(lock);
+ multilist_insert(&state->arcs_list[arc_buf_type(hdr)], hdr);
+ ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0);
+ arc_evictable_space_increment(hdr, state);
}
return (cnt);
}
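Taken together, add_reference() and remove_reference() are what move a header on and off the per-state multilists, and therefore in and out of eviction's reach. A minimal sketch of the bracketing pattern (the wrapper is hypothetical; real consumers hold references through arc_buf_t rather than calling these directly):

    /* Sketch: the header cannot be evicted while the tag's reference
     * is held; dropping the last reference makes it evictable again. */
    static void
    example_pin_and_release(arc_buf_hdr_t *hdr, void *tag) /* hypothetical */
    {
            kmutex_t *hash_lock = HDR_LOCK(hdr);

            mutex_enter(hash_lock);
            add_reference(hdr, tag);
            /* ... inspect hdr->b_l1hdr fields safely here ... */
            (void) remove_reference(hdr, hash_lock, tag);
            mutex_exit(hash_lock);
    }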
/*
- * Move the supplied buffer to the indicated state. The mutex
+ * Move the supplied buffer to the indicated state. The hash lock
* for the buffer must be held by the caller.
*/
static void
-arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
+arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
+ kmutex_t *hash_lock)
{
- arc_state_t *old_state = ab->b_state;
- int64_t refcnt = refcount_count(&ab->b_refcnt);
- uint64_t from_delta, to_delta;
- list_t *list;
- kmutex_t *lock;
+ arc_state_t *old_state;
+ int64_t refcnt;
+ uint32_t bufcnt;
+ boolean_t update_old, update_new;
+ arc_buf_contents_t buftype = arc_buf_type(hdr);
+ /*
+ * We almost always have an L1 hdr here, since we call arc_hdr_realloc()
+ * in arc_read() when bringing a buffer out of the L2ARC. However, the
+ * L1 hdr doesn't always exist when we change state to arc_anon before
+ * destroying a header, in which case reallocating to add the L1 hdr is
+ * pointless.
+ */
+ if (HDR_HAS_L1HDR(hdr)) {
+ old_state = hdr->b_l1hdr.b_state;
+ refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt);
+ bufcnt = hdr->b_l1hdr.b_bufcnt;
+ update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pdata != NULL);
+ } else {
+ old_state = arc_l2c_only;
+ refcnt = 0;
+ bufcnt = 0;
+ update_old = B_FALSE;
+ }
+ update_new = update_old;
+
ASSERT(MUTEX_HELD(hash_lock));
- ASSERT(new_state != old_state);
- ASSERT(refcnt == 0 || ab->b_datacnt > 0);
- ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
- ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon);
+ ASSERT3P(new_state, !=, old_state);
+ ASSERT(!GHOST_STATE(new_state) || bufcnt == 0);
+ ASSERT(old_state != arc_anon || bufcnt <= 1);
- from_delta = to_delta = ab->b_datacnt * ab->b_size;
-
/*
* If this buffer is evictable, transfer it from the
* old state list to the new state list.
*/
if (refcnt == 0) {
- if (old_state != arc_anon) {
- int use_mutex;
- uint64_t *size = &old_state->arcs_lsize[ab->b_type];
+ if (old_state != arc_anon && old_state != arc_l2c_only) {
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ multilist_remove(&old_state->arcs_list[buftype], hdr);
- get_buf_info(ab, old_state, &list, &lock);
- use_mutex = !MUTEX_HELD(lock);
- if (use_mutex)
- mutex_enter(lock);
+ if (GHOST_STATE(old_state)) {
+ ASSERT0(bufcnt);
+ ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+ update_old = B_TRUE;
+ }
+ arc_evitable_space_decrement(hdr, old_state);
+ }
+ if (new_state != arc_anon && new_state != arc_l2c_only) {
- ASSERT(list_link_active(&ab->b_arc_node));
- list_remove(list, ab);
-
/*
- * If prefetching out of the ghost cache,
- * we will have a non-zero datacnt.
+ * An L1 header always exists here, since if we're
+ * moving to some L1-cached state (i.e. not l2c_only or
+ * anonymous), we realloc the header to add an L1hdr
+ * beforehand.
*/
- if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
- /* ghost elements have a ghost size */
- ASSERT(ab->b_buf == NULL);
- from_delta = ab->b_size;
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ multilist_insert(&new_state->arcs_list[buftype], hdr);
+
+ if (GHOST_STATE(new_state)) {
+ ASSERT0(bufcnt);
+ ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+ update_new = B_TRUE;
}
- ASSERT3U(*size, >=, from_delta);
- atomic_add_64(size, -from_delta);
-
- if (use_mutex)
- mutex_exit(lock);
+ arc_evictable_space_increment(hdr, new_state);
}
- if (new_state != arc_anon) {
- int use_mutex;
- uint64_t *size = &new_state->arcs_lsize[ab->b_type];
+ }
- get_buf_info(ab, new_state, &list, &lock);
- use_mutex = !MUTEX_HELD(lock);
- if (use_mutex)
- mutex_enter(lock);
+ ASSERT(!HDR_EMPTY(hdr));
+ if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr))
+ buf_hash_remove(hdr);
- list_insert_head(list, ab);
+ /* adjust state sizes (ignore arc_l2c_only) */
- /* ghost elements have a ghost size */
- if (GHOST_STATE(new_state)) {
- ASSERT(ab->b_datacnt == 0);
- ASSERT(ab->b_buf == NULL);
- to_delta = ab->b_size;
+ if (update_new && new_state != arc_l2c_only) {
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ if (GHOST_STATE(new_state)) {
+ ASSERT0(bufcnt);
+
+ /*
+ * When moving a header to a ghost state, we first
+ * remove all arc buffers. Thus, we'll have a
+ * bufcnt of zero, and no arc buffer to use for
+ * the reference. As a result, we use the arc
+ * header pointer for the reference.
+ */
+ (void) refcount_add_many(&new_state->arcs_size,
+ HDR_GET_LSIZE(hdr), hdr);
+ ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+ } else {
+ uint32_t buffers = 0;
+
+ /*
+ * Each individual buffer holds a unique reference,
+ * thus we must remove each of these references one
+ * at a time.
+ */
+ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
+ buf = buf->b_next) {
+ ASSERT3U(bufcnt, !=, 0);
+ buffers++;
+
+ /*
+ * When the arc_buf_t is sharing the data
+ * block with the hdr, the owner of the
+ * reference belongs to the hdr. Only
+ * add to the refcount if the arc_buf_t is
+ * not shared.
+ */
+ if (arc_buf_is_shared(buf)) {
+ ASSERT(ARC_BUF_LAST(buf));
+ continue;
+ }
+
+ (void) refcount_add_many(&new_state->arcs_size,
+ HDR_GET_LSIZE(hdr), buf);
}
- atomic_add_64(size, to_delta);
+ ASSERT3U(bufcnt, ==, buffers);
- if (use_mutex)
- mutex_exit(lock);
+ if (hdr->b_l1hdr.b_pdata != NULL) {
+ (void) refcount_add_many(&new_state->arcs_size,
+ arc_hdr_size(hdr), hdr);
+ } else {
+ ASSERT(GHOST_STATE(old_state));
+ }
}
}
- ASSERT(!BUF_EMPTY(ab));
- if (new_state == arc_anon && HDR_IN_HASH_TABLE(ab))
- buf_hash_remove(ab);
+ if (update_old && old_state != arc_l2c_only) {
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ if (GHOST_STATE(old_state)) {
+ ASSERT0(bufcnt);
- /* adjust state sizes */
- if (to_delta)
- atomic_add_64(&new_state->arcs_size, to_delta);
- if (from_delta) {
- ASSERT3U(old_state->arcs_size, >=, from_delta);
- atomic_add_64(&old_state->arcs_size, -from_delta);
+ /*
+ * When moving a header off of a ghost state,
+ * the header will not contain any arc buffers.
+ * We use the arc header pointer for the reference
+ * which is exactly what we did when we put the
+ * header on the ghost state.
+ */
+
+ (void) refcount_remove_many(&old_state->arcs_size,
+ HDR_GET_LSIZE(hdr), hdr);
+ ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+ } else {
+ uint32_t buffers = 0;
+
+ /*
+ * Each individual buffer holds a unique reference,
+ * thus we must remove each of these references one
+ * at a time.
+ */
+ for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
+ buf = buf->b_next) {
+ ASSERT3U(bufcnt, !=, 0);
+ buffers++;
+
+ /*
+ * When the arc_buf_t is sharing the data
+ * block with the hdr, the owner of the
+ * reference belongs to the hdr. Only
+ * add to the refcount if the arc_buf_t is
+ * not shared.
+ */
+ if (arc_buf_is_shared(buf)) {
+ ASSERT(ARC_BUF_LAST(buf));
+ continue;
+ }
+
+ (void) refcount_remove_many(
+ &old_state->arcs_size, HDR_GET_LSIZE(hdr),
+ buf);
+ }
+ ASSERT3U(bufcnt, ==, buffers);
+ ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
+ (void) refcount_remove_many(
+ &old_state->arcs_size, arc_hdr_size(hdr), hdr);
+ }
}
- ab->b_state = new_state;
- /* adjust l2arc hdr stats */
- if (new_state == arc_l2c_only)
- l2arc_hdr_stat_add();
- else if (old_state == arc_l2c_only)
- l2arc_hdr_stat_remove();
+ if (HDR_HAS_L1HDR(hdr))
+ hdr->b_l1hdr.b_state = new_state;
+
+ /*
+ * L2 headers should never be on the L2 state list since they don't
+ * have L1 headers allocated.
+ */
+ ASSERT(multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) &&
+ multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]));
}
void
@@ -1405,6 +2397,9 @@
case ARC_SPACE_DATA:
ARCSTAT_INCR(arcstat_data_size, space);
break;
+ case ARC_SPACE_META:
+ ARCSTAT_INCR(arcstat_metadata_size, space);
+ break;
case ARC_SPACE_OTHER:
ARCSTAT_INCR(arcstat_other_size, space);
break;
@@ -1416,7 +2411,9 @@
break;
}
- atomic_add_64(&arc_meta_used, space);
+ if (type != ARC_SPACE_DATA)
+ ARCSTAT_INCR(arcstat_meta_used, space);
+
atomic_add_64(&arc_size, space);
}
@@ -1429,6 +2426,9 @@
case ARC_SPACE_DATA:
ARCSTAT_INCR(arcstat_data_size, -space);
break;
+ case ARC_SPACE_META:
+ ARCSTAT_INCR(arcstat_metadata_size, -space);
+ break;
case ARC_SPACE_OTHER:
ARCSTAT_INCR(arcstat_other_size, -space);
break;
@@ -1440,58 +2440,96 @@
break;
}
- ASSERT(arc_meta_used >= space);
- if (arc_meta_max < arc_meta_used)
- arc_meta_max = arc_meta_used;
- atomic_add_64(&arc_meta_used, -space);
+ if (type != ARC_SPACE_DATA) {
+ ASSERT(arc_meta_used >= space);
+ if (arc_meta_max < arc_meta_used)
+ arc_meta_max = arc_meta_used;
+ ARCSTAT_INCR(arcstat_meta_used, -space);
+ }
+
ASSERT(arc_size >= space);
atomic_add_64(&arc_size, -space);
}
-void *
-arc_data_buf_alloc(uint64_t size)
+/*
+ * Allocate an initial buffer for this hdr; subsequent buffers will
+ * use arc_buf_clone().
+ */
+static arc_buf_t *
+arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag)
{
- if (arc_evict_needed(ARC_BUFC_DATA))
- cv_signal(&arc_reclaim_thr_cv);
- atomic_add_64(&arc_size, size);
- return (zio_data_buf_alloc(size));
-}
+ arc_buf_t *buf;
-void
-arc_data_buf_free(void *buf, uint64_t size)
-{
- zio_data_buf_free(buf, size);
- ASSERT(arc_size >= size);
- atomic_add_64(&arc_size, -size);
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
+ VERIFY(hdr->b_type == ARC_BUFC_DATA ||
+ hdr->b_type == ARC_BUFC_METADATA);
+
+ ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+ ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+ ASSERT0(hdr->b_l1hdr.b_bufcnt);
+
+ buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
+ buf->b_hdr = hdr;
+ buf->b_data = NULL;
+ buf->b_next = NULL;
+
+ add_reference(hdr, tag);
+
+ /*
+ * We're about to change the hdr's b_flags. We must either
+ * hold the hash_lock or be undiscoverable.
+ */
+ ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+
+ /*
+ * If the hdr's data can be shared (no byteswapping, hdr is
+ * uncompressed, hdr's data is not currently being written to the
+ * L2ARC) then we share the data buffer and set the appropriate
+ * bit in the hdr's b_flags to indicate the hdr is sharing its
+ * b_pdata with the arc_buf_t. Otherwise, we allocate a new buffer to
+ * store the buf's data.
+ */
+ if (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS &&
+ HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF && !HDR_L2_WRITING(hdr)) {
+ buf->b_data = hdr->b_l1hdr.b_pdata;
+ arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
+ } else {
+ buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
+ ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr));
+ arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
+ }
+ VERIFY3P(buf->b_data, !=, NULL);
+
+ hdr->b_l1hdr.b_buf = buf;
+ hdr->b_l1hdr.b_bufcnt += 1;
+
+ return (buf);
}
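
The sharing test above can be read in isolation: the hdr's b_pdata may back the arc_buf_t directly only when no byteswap is pending, the hdr is stored uncompressed, and the hdr is not in the middle of an L2ARC write. A minimal userland sketch of that predicate, using hypothetical stand-in fields rather than the kernel structures:

    #include <stdbool.h>

    /* Hypothetical, simplified stand-ins for the fields the predicate reads. */
    struct sketch_hdr {
    	bool needs_byteswap;	/* models b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS */
    	bool compressed;	/* models HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF */
    	bool l2_writing;	/* models HDR_L2_WRITING(hdr) */
    };

    /*
     * True when the buf may simply point at the hdr's data block instead
     * of allocating its own copy via arc_get_data_buf().
     */
    static bool
    sketch_can_share_pdata(const struct sketch_hdr *h)
    {
    	return (!h->needs_byteswap && !h->compressed && !h->l2_writing);
    }
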
-arc_buf_t *
-arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
+/*
+ * Used when allocating additional buffers.
+ */
+static arc_buf_t *
+arc_buf_clone(arc_buf_t *from)
{
- arc_buf_hdr_t *hdr;
arc_buf_t *buf;
+ arc_buf_hdr_t *hdr = from->b_hdr;
+ uint64_t size = HDR_GET_LSIZE(hdr);
- ASSERT3U(size, >, 0);
- hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
- ASSERT(BUF_EMPTY(hdr));
- hdr->b_size = size;
- hdr->b_type = type;
- hdr->b_spa = spa_load_guid(spa);
- hdr->b_state = arc_anon;
- hdr->b_arc_access = 0;
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ ASSERT(hdr->b_l1hdr.b_state != arc_anon);
+
buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
buf->b_hdr = hdr;
buf->b_data = NULL;
- buf->b_efunc = NULL;
- buf->b_private = NULL;
- buf->b_next = NULL;
- hdr->b_buf = buf;
- arc_get_data_buf(buf);
- hdr->b_datacnt = 1;
- hdr->b_flags = 0;
- ASSERT(refcount_is_zero(&hdr->b_refcnt));
- (void) refcount_add(&hdr->b_refcnt, tag);
+ buf->b_next = hdr->b_l1hdr.b_buf;
+ hdr->b_l1hdr.b_buf = buf;
+ buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
+ bcopy(from->b_data, buf->b_data, size);
+ hdr->b_l1hdr.b_bufcnt += 1;
+ ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr));
return (buf);
}
@@ -1508,7 +2546,7 @@
{
arc_buf_t *buf;
- buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
+ buf = arc_alloc_buf(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
atomic_add_64(&arc_loaned_bytes, size);
return (buf);
@@ -1522,11 +2560,12 @@
{
arc_buf_hdr_t *hdr = buf->b_hdr;
- ASSERT(buf->b_data != NULL);
- (void) refcount_add(&hdr->b_refcnt, tag);
- (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag);
+ ASSERT3P(buf->b_data, !=, NULL);
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
+ (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
- atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
+ atomic_add_64(&arc_loaned_bytes, -HDR_GET_LSIZE(hdr));
}
/* Detach an arc_buf from a dbuf (tag) */
@@ -1533,173 +2572,213 @@
void
arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
{
- arc_buf_hdr_t *hdr;
+ arc_buf_hdr_t *hdr = buf->b_hdr;
- ASSERT(buf->b_data != NULL);
- hdr = buf->b_hdr;
- (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag);
- (void) refcount_remove(&hdr->b_refcnt, tag);
- buf->b_efunc = NULL;
- buf->b_private = NULL;
+ ASSERT3P(buf->b_data, !=, NULL);
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ (void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
+ (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag);
- atomic_add_64(&arc_loaned_bytes, hdr->b_size);
+ atomic_add_64(&arc_loaned_bytes, HDR_GET_LSIZE(hdr));
}
-static arc_buf_t *
-arc_buf_clone(arc_buf_t *from)
+static void
+l2arc_free_data_on_write(void *data, size_t size, arc_buf_contents_t type)
{
- arc_buf_t *buf;
- arc_buf_hdr_t *hdr = from->b_hdr;
- uint64_t size = hdr->b_size;
+ l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP);
- ASSERT(hdr->b_state != arc_anon);
+ df->l2df_data = data;
+ df->l2df_size = size;
+ df->l2df_type = type;
+ mutex_enter(&l2arc_free_on_write_mtx);
+ list_insert_head(l2arc_free_on_write, df);
+ mutex_exit(&l2arc_free_on_write_mtx);
+}
- buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
- buf->b_hdr = hdr;
- buf->b_data = NULL;
- buf->b_efunc = NULL;
- buf->b_private = NULL;
- buf->b_next = hdr->b_buf;
- hdr->b_buf = buf;
- arc_get_data_buf(buf);
- bcopy(from->b_data, buf->b_data, size);
+static void
+arc_hdr_free_on_write(arc_buf_hdr_t *hdr)
+{
+ arc_state_t *state = hdr->b_l1hdr.b_state;
+ arc_buf_contents_t type = arc_buf_type(hdr);
+ uint64_t size = arc_hdr_size(hdr);
- /*
- * This buffer already exists in the arc so create a duplicate
- * copy for the caller. If the buffer is associated with user data
- * then track the size and number of duplicates. These stats will be
- * updated as duplicate buffers are created and destroyed.
- */
- if (hdr->b_type == ARC_BUFC_DATA) {
- ARCSTAT_BUMP(arcstat_duplicate_buffers);
- ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
+ /* protected by hash lock, if in the hash table */
+ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
+ ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+ ASSERT(state != arc_anon && state != arc_l2c_only);
+
+ (void) refcount_remove_many(&state->arcs_esize[type],
+ size, hdr);
}
- hdr->b_datacnt += 1;
- return (buf);
+ (void) refcount_remove_many(&state->arcs_size, size, hdr);
+ if (type == ARC_BUFC_METADATA) {
+ arc_space_return(size, ARC_SPACE_META);
+ } else {
+ ASSERT(type == ARC_BUFC_DATA);
+ arc_space_return(size, ARC_SPACE_DATA);
+ }
+
+ l2arc_free_data_on_write(hdr->b_l1hdr.b_pdata, size, type);
}
-void
-arc_buf_add_ref(arc_buf_t *buf, void* tag)
+/*
+ * Share the arc_buf_t's data with the hdr. Whenever we are sharing the
+ * data buffer, we transfer the refcount ownership to the hdr and update
+ * the appropriate kstats.
+ */
+static void
+arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
{
- arc_buf_hdr_t *hdr;
- kmutex_t *hash_lock;
+ arc_state_t *state = hdr->b_l1hdr.b_state;
+ ASSERT(!HDR_SHARED_DATA(hdr));
+ ASSERT(!arc_buf_is_shared(buf));
+ ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+ ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+
/*
- * Check to see if this buffer is evicted. Callers
- * must verify b_data != NULL to know if the add_ref
- * was successful.
+ * Start sharing the data buffer. We transfer the
+ * refcount ownership to the hdr since it always owns
+ * the refcount whenever an arc_buf_t is shared.
*/
- mutex_enter(&buf->b_evict_lock);
- if (buf->b_data == NULL) {
- mutex_exit(&buf->b_evict_lock);
- return;
- }
- hash_lock = HDR_LOCK(buf->b_hdr);
- mutex_enter(hash_lock);
- hdr = buf->b_hdr;
- ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
- mutex_exit(&buf->b_evict_lock);
+ refcount_transfer_ownership(&state->arcs_size, buf, hdr);
+ hdr->b_l1hdr.b_pdata = buf->b_data;
+ arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
- ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
- add_reference(hdr, hash_lock, tag);
- DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
- arc_access(hdr, hash_lock);
- mutex_exit(hash_lock);
- ARCSTAT_BUMP(arcstat_hits);
- ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
- demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
- data, metadata, hits);
+ /*
+ * Since we've transferred ownership to the hdr we need
+ * to increment its compressed and uncompressed kstats and
+ * decrement the overhead size.
+ */
+ ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr));
+ ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
+ ARCSTAT_INCR(arcstat_overhead_size, -HDR_GET_LSIZE(hdr));
}
-/*
- * Free the arc data buffer. If it is an l2arc write in progress,
- * the buffer is placed on l2arc_free_on_write to be freed later.
- */
static void
-arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
+arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
{
- arc_buf_hdr_t *hdr = buf->b_hdr;
+ arc_state_t *state = hdr->b_l1hdr.b_state;
- if (HDR_L2_WRITING(hdr)) {
- l2arc_data_free_t *df;
- df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
- df->l2df_data = buf->b_data;
- df->l2df_size = hdr->b_size;
- df->l2df_func = free_func;
- mutex_enter(&l2arc_free_on_write_mtx);
- list_insert_head(l2arc_free_on_write, df);
- mutex_exit(&l2arc_free_on_write_mtx);
- ARCSTAT_BUMP(arcstat_l2_free_on_write);
- } else {
- free_func(buf->b_data, hdr->b_size);
- }
+ ASSERT(HDR_SHARED_DATA(hdr));
+ ASSERT(arc_buf_is_shared(buf));
+ ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
+ ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
+
+ /*
+ * We are no longer sharing this buffer so we need
+ * to transfer its ownership to the rightful owner.
+ */
+ refcount_transfer_ownership(&state->arcs_size, hdr, buf);
+ arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
+ hdr->b_l1hdr.b_pdata = NULL;
+
+ /*
+ * Since the buffer is no longer shared between
+ * the arc buf and the hdr, count it as overhead.
+ */
+ ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr));
+ ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
+ ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr));
}
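
arc_share_buf() and arc_unshare_buf() are mirror images: the byte count tracked in arcs_size never changes, only the identity of the holder moves between the buf and the hdr. A toy model of that hand-off, assuming a single-holder counter rather than the kernel's refcount_t:

    #include <assert.h>
    #include <stdint.h>

    /* Toy tagged counter with exactly one holder; not the kernel refcount_t. */
    typedef struct {
    	uint64_t tc_bytes;
    	const void *tc_holder;
    } toy_count_t;

    /*
     * Move ownership of the accounted bytes from 'from' to 'to' without
     * changing the count, which is the effect of
     * refcount_transfer_ownership() in the share/unshare paths above.
     */
    static void
    toy_transfer(toy_count_t *tc, const void *from, const void *to)
    {
    	assert(tc->tc_holder == from);
    	tc->tc_holder = to;
    }
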
+/*
+ * Free up buf->b_data and if 'remove' is set, then pull the
+ * arc_buf_t off of the arc_buf_hdr_t's list and free it.
+ */
static void
-arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
+arc_buf_destroy_impl(arc_buf_t *buf, boolean_t remove)
{
arc_buf_t **bufp;
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+ uint64_t size = HDR_GET_LSIZE(hdr);
+ boolean_t destroyed_buf_is_shared = arc_buf_is_shared(buf);
- /* free up data associated with the buf */
- if (buf->b_data) {
- arc_state_t *state = buf->b_hdr->b_state;
- uint64_t size = buf->b_hdr->b_size;
- arc_buf_contents_t type = buf->b_hdr->b_type;
+ /*
+ * Free up the data associated with the buf but only
+ * if we're not sharing this with the hdr. If we are sharing
+ * it with the hdr, then hdr will have performed the allocation
+ * so allow it to do the free.
+ */
+ if (buf->b_data != NULL) {
+ /*
+ * We're about to change the hdr's b_flags. We must either
+ * hold the hash_lock or be undiscoverable.
+ */
+ ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
arc_cksum_verify(buf);
#ifdef illumos
arc_buf_unwatch(buf);
-#endif /* illumos */
+#endif
- if (!recycle) {
- if (type == ARC_BUFC_METADATA) {
- arc_buf_data_free(buf, zio_buf_free);
- arc_space_return(size, ARC_SPACE_DATA);
- } else {
- ASSERT(type == ARC_BUFC_DATA);
- arc_buf_data_free(buf, zio_data_buf_free);
- ARCSTAT_INCR(arcstat_data_size, -size);
- atomic_add_64(&arc_size, -size);
- }
+ if (destroyed_buf_is_shared) {
+ ASSERT(ARC_BUF_LAST(buf));
+ ASSERT(HDR_SHARED_DATA(hdr));
+ arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
+ } else {
+ arc_free_data_buf(hdr, buf->b_data, size, buf);
+ ARCSTAT_INCR(arcstat_overhead_size, -size);
}
- if (list_link_active(&buf->b_hdr->b_arc_node)) {
- uint64_t *cnt = &state->arcs_lsize[type];
-
- ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
- ASSERT(state != arc_anon);
-
- ASSERT3U(*cnt, >=, size);
- atomic_add_64(cnt, -size);
- }
- ASSERT3U(state->arcs_size, >=, size);
- atomic_add_64(&state->arcs_size, -size);
buf->b_data = NULL;
- /*
- * If we're destroying a duplicate buffer make sure
- * that the appropriate statistics are updated.
- */
- if (buf->b_hdr->b_datacnt > 1 &&
- buf->b_hdr->b_type == ARC_BUFC_DATA) {
- ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
- ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
- }
- ASSERT(buf->b_hdr->b_datacnt > 0);
- buf->b_hdr->b_datacnt -= 1;
+ ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
+ hdr->b_l1hdr.b_bufcnt -= 1;
}
/* only remove the buf if requested */
- if (!all)
+ if (!remove)
return;
/* remove the buf from the hdr list */
- for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
- continue;
- *bufp = buf->b_next;
+ arc_buf_t *lastbuf = NULL;
+ bufp = &hdr->b_l1hdr.b_buf;
+ while (*bufp != NULL) {
+ if (*bufp == buf)
+ *bufp = buf->b_next;
+
+ /*
+ * If we've removed a buffer in the middle of
+ * the list then update the lastbuf and update
+ * bufp.
+ */
+ if (*bufp != NULL) {
+ lastbuf = *bufp;
+ bufp = &(*bufp)->b_next;
+ }
+ }
buf->b_next = NULL;
+ ASSERT3P(lastbuf, !=, buf);
- ASSERT(buf->b_efunc == NULL);
+ /*
+ * If the current arc_buf_t is sharing its data
+ * buffer with the hdr, then reassign the hdr's
+ * b_pdata to share it with the new buffer at the end
+ * of the list. The shared buffer is always the last one
+ * on the hdr's buffer list.
+ */
+ if (destroyed_buf_is_shared && lastbuf != NULL) {
+ ASSERT(ARC_BUF_LAST(buf));
+ ASSERT(ARC_BUF_LAST(lastbuf));
+ VERIFY(!arc_buf_is_shared(lastbuf));
+ ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
+ arc_hdr_free_pdata(hdr);
+
+ /*
+ * We must setup a new shared block between the
+ * last buffer and the hdr. The data would have
+ * been allocated by the arc buf so we need to transfer
+ * ownership to the hdr since it's now being shared.
+ */
+ arc_share_buf(hdr, lastbuf);
+ } else if (HDR_SHARED_DATA(hdr)) {
+ ASSERT(arc_buf_is_shared(lastbuf));
+ }
+
+ if (hdr->b_l1hdr.b_bufcnt == 0)
+ arc_cksum_free(hdr);
+
/* clean up the buf */
buf->b_hdr = NULL;
kmem_cache_free(buf_cache, buf);
@@ -1706,668 +2785,1026 @@
}
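
The unlink loop above walks the hdr's singly linked buffer list, splicing out the buffer being destroyed while remembering the last surviving buffer so the shared b_pdata can be handed to it. The same pattern, with generic names, looks like this:

    #include <stddef.h>

    struct sk_node {
    	struct sk_node *next;
    };

    /*
     * Unlink 'victim' from a singly linked list and return the last node
     * that remains (or NULL if the list is now empty); mirrors the
     * bufp/lastbuf walk in arc_buf_destroy_impl().
     */
    static struct sk_node *
    sk_unlink_and_find_last(struct sk_node **headp, struct sk_node *victim)
    {
    	struct sk_node *last = NULL;
    	struct sk_node **np = headp;

    	while (*np != NULL) {
    		if (*np == victim)
    			*np = victim->next;	/* splice the victim out */
    		if (*np != NULL) {		/* advance over survivors */
    			last = *np;
    			np = &(*np)->next;
    		}
    	}
    	victim->next = NULL;
    	return (last);
    }
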
static void
-arc_hdr_destroy(arc_buf_hdr_t *hdr)
+arc_hdr_alloc_pdata(arc_buf_hdr_t *hdr)
{
- ASSERT(refcount_is_zero(&hdr->b_refcnt));
- ASSERT3P(hdr->b_state, ==, arc_anon);
- ASSERT(!HDR_IO_IN_PROGRESS(hdr));
- l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr;
+ ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ ASSERT(!HDR_SHARED_DATA(hdr));
- if (l2hdr != NULL) {
- boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx);
+ ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+ hdr->b_l1hdr.b_pdata = arc_get_data_buf(hdr, arc_hdr_size(hdr), hdr);
+ hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
+ ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
+
+ ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr));
+ ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
+}
+
+static void
+arc_hdr_free_pdata(arc_buf_hdr_t *hdr)
+{
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
+
+ /*
+ * If the hdr is currently being written to the l2arc then
+ * we defer freeing the data by adding it to the l2arc_free_on_write
+ * list. The l2arc will free the data once it's finished
+ * writing it to the l2arc device.
+ */
+ if (HDR_L2_WRITING(hdr)) {
+ arc_hdr_free_on_write(hdr);
+ ARCSTAT_BUMP(arcstat_l2_free_on_write);
+ } else {
+ arc_free_data_buf(hdr, hdr->b_l1hdr.b_pdata,
+ arc_hdr_size(hdr), hdr);
+ }
+ hdr->b_l1hdr.b_pdata = NULL;
+ hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
+
+ ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr));
+ ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
+}
+
+static arc_buf_hdr_t *
+arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize,
+ enum zio_compress compress, arc_buf_contents_t type)
+{
+ arc_buf_hdr_t *hdr;
+
+ ASSERT3U(lsize, >, 0);
+ VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA);
+
+ hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
+ ASSERT(HDR_EMPTY(hdr));
+ ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
+ ASSERT3P(hdr->b_l1hdr.b_thawed, ==, NULL);
+ HDR_SET_PSIZE(hdr, psize);
+ HDR_SET_LSIZE(hdr, lsize);
+ hdr->b_spa = spa;
+ hdr->b_type = type;
+ hdr->b_flags = 0;
+ arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR);
+ arc_hdr_set_compress(hdr, compress);
+
+ hdr->b_l1hdr.b_state = arc_anon;
+ hdr->b_l1hdr.b_arc_access = 0;
+ hdr->b_l1hdr.b_bufcnt = 0;
+ hdr->b_l1hdr.b_buf = NULL;
+
+ /*
+ * Allocate the hdr's buffer. This will contain either
+ * the compressed or uncompressed data depending on the block
+ * it references and compressed arc enablement.
+ */
+ arc_hdr_alloc_pdata(hdr);
+ ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+
+ return (hdr);
+}
+
+/*
+ * Transition between the two allocation states for the arc_buf_hdr struct.
+ * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without
+ * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller
+ * version is used when a cache buffer is only in the L2ARC in order to reduce
+ * memory usage.
+ */
+static arc_buf_hdr_t *
+arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
+{
+ ASSERT(HDR_HAS_L2HDR(hdr));
+
+ arc_buf_hdr_t *nhdr;
+ l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
+
+ ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) ||
+ (old == hdr_l2only_cache && new == hdr_full_cache));
+
+ nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);
+
+ ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
+ buf_hash_remove(hdr);
+
+ bcopy(hdr, nhdr, HDR_L2ONLY_SIZE);
+
+ if (new == hdr_full_cache) {
+ arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR);
/*
- * To prevent arc_free() and l2arc_evict() from
- * attempting to free the same buffer at the same time,
- * a FREE_IN_PROGRESS flag is given to arc_free() to
- * give it priority. l2arc_evict() can't destroy this
- * header while we are waiting on l2arc_buflist_mtx.
- *
- * The hdr may be removed from l2ad_buflist before we
- * grab l2arc_buflist_mtx, so b_l2hdr is rechecked.
+ * arc_access and arc_change_state need to be aware that a
+ * header has just come out of L2ARC, so we set its state to
+ * l2c_only even though it's about to change.
*/
- if (!buflist_held) {
- mutex_enter(&l2arc_buflist_mtx);
- l2hdr = hdr->b_l2hdr;
- }
+ nhdr->b_l1hdr.b_state = arc_l2c_only;
- if (l2hdr != NULL) {
- trim_map_free(l2hdr->b_dev->l2ad_vdev, l2hdr->b_daddr,
- hdr->b_size, 0);
- list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
- ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
- ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
- kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
- if (hdr->b_state == arc_l2c_only)
- l2arc_hdr_stat_remove();
- hdr->b_l2hdr = NULL;
- }
+ /* Verify previous threads set to NULL before freeing */
+ ASSERT3P(nhdr->b_l1hdr.b_pdata, ==, NULL);
+ } else {
+ ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+ ASSERT0(hdr->b_l1hdr.b_bufcnt);
+ ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
- if (!buflist_held)
- mutex_exit(&l2arc_buflist_mtx);
- }
+ /*
+ * If we've reached here, we must have been called from
+ * arc_evict_hdr(), as such we should have already been
+ * removed from any ghost list we were previously on
+ * (which protects us from racing with arc_evict_state),
+ * thus no locking is needed during this check.
+ */
+ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
- if (!BUF_EMPTY(hdr)) {
- ASSERT(!HDR_IN_HASH_TABLE(hdr));
- buf_discard_identity(hdr);
- }
- while (hdr->b_buf) {
- arc_buf_t *buf = hdr->b_buf;
+ /*
+ * A buffer must not be moved into the arc_l2c_only
+ * state if it's not finished being written out to the
+ * l2arc device. Otherwise, the b_l1hdr.b_pdata field
+ * might be accessed even though it has already been removed.
+ */
+ VERIFY(!HDR_L2_WRITING(hdr));
+ VERIFY3P(hdr->b_l1hdr.b_pdata, ==, NULL);
- if (buf->b_efunc) {
- mutex_enter(&arc_eviction_mtx);
- mutex_enter(&buf->b_evict_lock);
- ASSERT(buf->b_hdr != NULL);
- arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
- hdr->b_buf = buf->b_next;
- buf->b_hdr = &arc_eviction_hdr;
- buf->b_next = arc_eviction_list;
- arc_eviction_list = buf;
- mutex_exit(&buf->b_evict_lock);
- mutex_exit(&arc_eviction_mtx);
- } else {
- arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
+#ifdef ZFS_DEBUG
+ if (hdr->b_l1hdr.b_thawed != NULL) {
+ kmem_free(hdr->b_l1hdr.b_thawed, 1);
+ hdr->b_l1hdr.b_thawed = NULL;
}
+#endif
+
+ arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR);
}
- if (hdr->b_freeze_cksum != NULL) {
- kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
- hdr->b_freeze_cksum = NULL;
- }
- if (hdr->b_thawed) {
- kmem_free(hdr->b_thawed, 1);
- hdr->b_thawed = NULL;
- }
+ /*
+ * The header has been reallocated so we need to re-insert it into any
+ * lists it was on.
+ */
+ (void) buf_hash_insert(nhdr, NULL);
- ASSERT(!list_link_active(&hdr->b_arc_node));
- ASSERT3P(hdr->b_hash_next, ==, NULL);
- ASSERT3P(hdr->b_acb, ==, NULL);
- kmem_cache_free(hdr_cache, hdr);
+ ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node));
+
+ mutex_enter(&dev->l2ad_mtx);
+
+ /*
+ * We must place the realloc'ed header back into the list at
+ * the same spot. Otherwise, if it's placed earlier in the list,
+ * l2arc_write_buffers() could find it during the function's
+ * write phase, and try to write it out to the l2arc.
+ */
+ list_insert_after(&dev->l2ad_buflist, hdr, nhdr);
+ list_remove(&dev->l2ad_buflist, hdr);
+
+ mutex_exit(&dev->l2ad_mtx);
+
+ /*
+ * Since we're using the pointer address as the tag when
+ * incrementing and decrementing the l2ad_alloc refcount, we
+ * must remove the old pointer (that we're about to destroy) and
+ * add the new pointer to the refcount. Otherwise we'd remove
+ * the wrong pointer address when calling arc_hdr_destroy() later.
+ */
+
+ (void) refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr);
+ (void) refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(nhdr), nhdr);
+
+ buf_discard_identity(hdr);
+ kmem_cache_free(old, hdr);
+
+ return (nhdr);
}
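
The bcopy() above works because the L2-only header layout is a prefix of the full header, bounded by HDR_L2ONLY_SIZE; converting between the two caches is a copy of that common prefix plus fixing up the buflist position and refcount tags. A hypothetical miniature of that layout (the real code uses one struct type with a conditionally allocated tail, not two types):

    #include <string.h>

    struct mini_l2only_hdr {
    	unsigned long mh_spa;
    	int mh_flags;
    };

    struct mini_full_hdr {
    	struct mini_l2only_hdr mh_common;	/* shared prefix, must be first */
    	void *mh_l1_state;			/* L1-only state, dropped on demotion */
    };

    /* Demote a full header to its L2-only form by copying the shared prefix. */
    static void
    mini_demote_to_l2only(const struct mini_full_hdr *full,
        struct mini_l2only_hdr *l2only)
    {
    	memcpy(l2only, &full->mh_common, sizeof (*l2only));
    }
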
-void
-arc_buf_free(arc_buf_t *buf, void *tag)
+/*
+ * Allocate a new arc_buf_hdr_t and arc_buf_t and return the buf to the caller.
+ * The buf is returned thawed since we expect the consumer to modify it.
+ */
+arc_buf_t *
+arc_alloc_buf(spa_t *spa, int32_t size, void *tag, arc_buf_contents_t type)
{
- arc_buf_hdr_t *hdr = buf->b_hdr;
- int hashed = hdr->b_state != arc_anon;
+ arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size,
+ ZIO_COMPRESS_OFF, type);
+ ASSERT(!MUTEX_HELD(HDR_LOCK(hdr)));
+ arc_buf_t *buf = arc_buf_alloc_impl(hdr, tag);
+ arc_buf_thaw(buf);
+ return (buf);
+}
- ASSERT(buf->b_efunc == NULL);
- ASSERT(buf->b_data != NULL);
+static void
+arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr)
+{
+ l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
+ l2arc_dev_t *dev = l2hdr->b_dev;
+ uint64_t asize = arc_hdr_size(hdr);
- if (hashed) {
- kmutex_t *hash_lock = HDR_LOCK(hdr);
+ ASSERT(MUTEX_HELD(&dev->l2ad_mtx));
+ ASSERT(HDR_HAS_L2HDR(hdr));
- mutex_enter(hash_lock);
- hdr = buf->b_hdr;
- ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
+ list_remove(&dev->l2ad_buflist, hdr);
- (void) remove_reference(hdr, hash_lock, tag);
- if (hdr->b_datacnt > 1) {
- arc_buf_destroy(buf, FALSE, TRUE);
- } else {
- ASSERT(buf == hdr->b_buf);
- ASSERT(buf->b_efunc == NULL);
- hdr->b_flags |= ARC_BUF_AVAILABLE;
- }
- mutex_exit(hash_lock);
- } else if (HDR_IO_IN_PROGRESS(hdr)) {
- int destroy_hdr;
+ ARCSTAT_INCR(arcstat_l2_asize, -asize);
+ ARCSTAT_INCR(arcstat_l2_size, -HDR_GET_LSIZE(hdr));
+
+ vdev_space_update(dev->l2ad_vdev, -asize, 0, 0);
+
+ (void) refcount_remove_many(&dev->l2ad_alloc, asize, hdr);
+ arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR);
+}
+
+static void
+arc_hdr_destroy(arc_buf_hdr_t *hdr)
+{
+ if (HDR_HAS_L1HDR(hdr)) {
+ ASSERT(hdr->b_l1hdr.b_buf == NULL ||
+ hdr->b_l1hdr.b_bufcnt > 0);
+ ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+ ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
+ }
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ ASSERT(!HDR_IN_HASH_TABLE(hdr));
+
+ if (!HDR_EMPTY(hdr))
+ buf_discard_identity(hdr);
+
+ if (HDR_HAS_L2HDR(hdr)) {
+ l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
+ boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx);
+
+ if (!buflist_held)
+ mutex_enter(&dev->l2ad_mtx);
+
/*
- * We are in the middle of an async write. Don't destroy
- * this buffer unless the write completes before we finish
- * decrementing the reference count.
+ * Even though we checked this conditional above, we
+ * need to check this again now that we have the
+ * l2ad_mtx. This is because we could be racing with
+ * another thread calling l2arc_evict() which might have
+ * destroyed this header's L2 portion as we were waiting
+ * to acquire the l2ad_mtx. If that happens, we don't
+ * want to re-destroy the header's L2 portion.
*/
- mutex_enter(&arc_eviction_mtx);
- (void) remove_reference(hdr, NULL, tag);
- ASSERT(refcount_is_zero(&hdr->b_refcnt));
- destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
- mutex_exit(&arc_eviction_mtx);
- if (destroy_hdr)
- arc_hdr_destroy(hdr);
+ if (HDR_HAS_L2HDR(hdr)) {
+ l2arc_trim(hdr);
+ arc_hdr_l2hdr_destroy(hdr);
+ }
+
+ if (!buflist_held)
+ mutex_exit(&dev->l2ad_mtx);
+ }
+
+ if (HDR_HAS_L1HDR(hdr)) {
+ arc_cksum_free(hdr);
+
+ while (hdr->b_l1hdr.b_buf != NULL)
+ arc_buf_destroy_impl(hdr->b_l1hdr.b_buf, B_TRUE);
+
+#ifdef ZFS_DEBUG
+ if (hdr->b_l1hdr.b_thawed != NULL) {
+ kmem_free(hdr->b_l1hdr.b_thawed, 1);
+ hdr->b_l1hdr.b_thawed = NULL;
+ }
+#endif
+
+ if (hdr->b_l1hdr.b_pdata != NULL) {
+ arc_hdr_free_pdata(hdr);
+ }
+ }
+
+ ASSERT3P(hdr->b_hash_next, ==, NULL);
+ if (HDR_HAS_L1HDR(hdr)) {
+ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
+ ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
+ kmem_cache_free(hdr_full_cache, hdr);
} else {
- if (remove_reference(hdr, NULL, tag) > 0)
- arc_buf_destroy(buf, FALSE, TRUE);
- else
- arc_hdr_destroy(hdr);
+ kmem_cache_free(hdr_l2only_cache, hdr);
}
}
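
The L2 teardown above is a check/lock/re-check pattern: HDR_HAS_L2HDR() is tested again once l2ad_mtx is held because l2arc_evict() may have destroyed the L2 portion while this thread was blocked on the mutex. The generic shape of that pattern, with illustrative names:

    #include <pthread.h>
    #include <stdbool.h>

    struct sk_guarded {
    	pthread_mutex_t sg_lock;
    	bool sg_has_resource;
    };

    /*
     * Re-test the condition under the lock before tearing the resource
     * down, since another thread may have done so while we waited.
     */
    static void
    sk_guarded_teardown(struct sk_guarded *g)
    {
    	if (g->sg_has_resource) {
    		pthread_mutex_lock(&g->sg_lock);
    		if (g->sg_has_resource) {
    			/* ... release the resource here ... */
    			g->sg_has_resource = false;
    		}
    		pthread_mutex_unlock(&g->sg_lock);
    	}
    }
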
-boolean_t
-arc_buf_remove_ref(arc_buf_t *buf, void* tag)
+void
+arc_buf_destroy(arc_buf_t *buf, void* tag)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
kmutex_t *hash_lock = HDR_LOCK(hdr);
- boolean_t no_callback = (buf->b_efunc == NULL);
- if (hdr->b_state == arc_anon) {
- ASSERT(hdr->b_datacnt == 1);
- arc_buf_free(buf, tag);
- return (no_callback);
+ if (hdr->b_l1hdr.b_state == arc_anon) {
+ ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ VERIFY0(remove_reference(hdr, NULL, tag));
+ arc_hdr_destroy(hdr);
+ return;
}
mutex_enter(hash_lock);
- hdr = buf->b_hdr;
+ ASSERT3P(hdr, ==, buf->b_hdr);
+ ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
- ASSERT(hdr->b_state != arc_anon);
- ASSERT(buf->b_data != NULL);
+ ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon);
+ ASSERT3P(buf->b_data, !=, NULL);
(void) remove_reference(hdr, hash_lock, tag);
- if (hdr->b_datacnt > 1) {
- if (no_callback)
- arc_buf_destroy(buf, FALSE, TRUE);
- } else if (no_callback) {
- ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
- ASSERT(buf->b_efunc == NULL);
- hdr->b_flags |= ARC_BUF_AVAILABLE;
- }
- ASSERT(no_callback || hdr->b_datacnt > 1 ||
- refcount_is_zero(&hdr->b_refcnt));
+ arc_buf_destroy_impl(buf, B_TRUE);
mutex_exit(hash_lock);
- return (no_callback);
}
-int
+int32_t
arc_buf_size(arc_buf_t *buf)
{
- return (buf->b_hdr->b_size);
+ return (HDR_GET_LSIZE(buf->b_hdr));
}
/*
- * Called from the DMU to determine if the current buffer should be
- * evicted. In order to ensure proper locking, the eviction must be initiated
- * from the DMU. Return true if the buffer is associated with user data and
- * duplicate buffers still exist.
+ * Evict the arc_buf_hdr that is provided as a parameter. The resultant
+ * state of the header is dependent on its state prior to entering this
+ * function. The following transitions are possible:
+ *
+ * - arc_mru -> arc_mru_ghost
+ * - arc_mfu -> arc_mfu_ghost
+ * - arc_mru_ghost -> arc_l2c_only
+ * - arc_mru_ghost -> deleted
+ * - arc_mfu_ghost -> arc_l2c_only
+ * - arc_mfu_ghost -> deleted
*/
-boolean_t
-arc_buf_eviction_needed(arc_buf_t *buf)
+static int64_t
+arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
{
- arc_buf_hdr_t *hdr;
- boolean_t evict_needed = B_FALSE;
+ arc_state_t *evicted_state, *state;
+ int64_t bytes_evicted = 0;
- if (zfs_disable_dup_eviction)
- return (B_FALSE);
+ ASSERT(MUTEX_HELD(hash_lock));
+ ASSERT(HDR_HAS_L1HDR(hdr));
- mutex_enter(&buf->b_evict_lock);
- hdr = buf->b_hdr;
- if (hdr == NULL) {
+ state = hdr->b_l1hdr.b_state;
+ if (GHOST_STATE(state)) {
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+
/*
- * We are in arc_do_user_evicts(); let that function
- * perform the eviction.
+ * l2arc_write_buffers() relies on a header's L1 portion
+ * (i.e. its b_pdata field) during its write phase.
+ * Thus, we cannot push a header onto the arc_l2c_only
+ * state (removing its L1 piece) until the header is
+ * done being written to the l2arc.
*/
- ASSERT(buf->b_data == NULL);
+ if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) {
+ ARCSTAT_BUMP(arcstat_evict_l2_skip);
+ return (bytes_evicted);
+ }
+
+ ARCSTAT_BUMP(arcstat_deleted);
+ bytes_evicted += HDR_GET_LSIZE(hdr);
+
+ DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);
+
+ ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+ if (HDR_HAS_L2HDR(hdr)) {
+ ASSERT(hdr->b_l1hdr.b_pdata == NULL);
+ /*
+ * This buffer is cached on the 2nd Level ARC;
+ * don't destroy the header.
+ */
+ arc_change_state(arc_l2c_only, hdr, hash_lock);
+ /*
+ * dropping from L1+L2 cached to L2-only,
+ * realloc to remove the L1 header.
+ */
+ hdr = arc_hdr_realloc(hdr, hdr_full_cache,
+ hdr_l2only_cache);
+ } else {
+ ASSERT(hdr->b_l1hdr.b_pdata == NULL);
+ arc_change_state(arc_anon, hdr, hash_lock);
+ arc_hdr_destroy(hdr);
+ }
+ return (bytes_evicted);
+ }
+
+ ASSERT(state == arc_mru || state == arc_mfu);
+ evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
+
+ /* prefetch buffers have a minimum lifespan */
+ if (HDR_IO_IN_PROGRESS(hdr) ||
+ ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
+ ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
+ arc_min_prefetch_lifespan)) {
+ ARCSTAT_BUMP(arcstat_evict_skip);
+ return (bytes_evicted);
+ }
+
+ ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
+ while (hdr->b_l1hdr.b_buf) {
+ arc_buf_t *buf = hdr->b_l1hdr.b_buf;
+ if (!mutex_tryenter(&buf->b_evict_lock)) {
+ ARCSTAT_BUMP(arcstat_mutex_miss);
+ break;
+ }
+ if (buf->b_data != NULL)
+ bytes_evicted += HDR_GET_LSIZE(hdr);
mutex_exit(&buf->b_evict_lock);
- return (B_FALSE);
- } else if (buf->b_data == NULL) {
+ arc_buf_destroy_impl(buf, B_TRUE);
+ }
+
+ if (HDR_HAS_L2HDR(hdr)) {
+ ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr));
+ } else {
+ if (l2arc_write_eligible(hdr->b_spa, hdr)) {
+ ARCSTAT_INCR(arcstat_evict_l2_eligible,
+ HDR_GET_LSIZE(hdr));
+ } else {
+ ARCSTAT_INCR(arcstat_evict_l2_ineligible,
+ HDR_GET_LSIZE(hdr));
+ }
+ }
+
+ if (hdr->b_l1hdr.b_bufcnt == 0) {
+ arc_cksum_free(hdr);
+
+ bytes_evicted += arc_hdr_size(hdr);
+
/*
- * We have already been added to the arc eviction list;
- * recommend eviction.
+ * If this hdr is being evicted and has a compressed
+ * buffer then we discard it here before we change states.
+ * This ensures that the accounting is updated correctly
+ * in arc_free_data_buf().
*/
- ASSERT3P(hdr, ==, &arc_eviction_hdr);
- mutex_exit(&buf->b_evict_lock);
- return (B_TRUE);
+ arc_hdr_free_pdata(hdr);
+
+ arc_change_state(evicted_state, hdr, hash_lock);
+ ASSERT(HDR_IN_HASH_TABLE(hdr));
+ arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
+ DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr);
}
- if (hdr->b_datacnt > 1 && hdr->b_type == ARC_BUFC_DATA)
- evict_needed = B_TRUE;
+ return (bytes_evicted);
+}
- mutex_exit(&buf->b_evict_lock);
- return (evict_needed);
+static uint64_t
+arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
+ uint64_t spa, int64_t bytes)
+{
+ multilist_sublist_t *mls;
+ uint64_t bytes_evicted = 0;
+ arc_buf_hdr_t *hdr;
+ kmutex_t *hash_lock;
+ int evict_count = 0;
+
+ ASSERT3P(marker, !=, NULL);
+ IMPLY(bytes < 0, bytes == ARC_EVICT_ALL);
+
+ mls = multilist_sublist_lock(ml, idx);
+
+ for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL;
+ hdr = multilist_sublist_prev(mls, marker)) {
+ if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) ||
+ (evict_count >= zfs_arc_evict_batch_limit))
+ break;
+
+ /*
+ * To keep our iteration location, move the marker
+ * forward. Since we're not holding hdr's hash lock, we
+ * must be very careful and not remove 'hdr' from the
+ * sublist. Otherwise, other consumers might mistake the
+ * 'hdr' as not being on a sublist when they call the
+ * multilist_link_active() function (they all rely on
+ * the hash lock protecting concurrent insertions and
+ * removals). multilist_sublist_move_forward() was
+ * specifically implemented to ensure this is the case
+ * (only 'marker' will be removed and re-inserted).
+ */
+ multilist_sublist_move_forward(mls, marker);
+
+ /*
+ * The only case where the b_spa field should ever be
+ * zero, is the marker headers inserted by
+ * arc_evict_state(). It's possible for multiple threads
+ * to be calling arc_evict_state() concurrently (e.g.
+ * dsl_pool_close() and zio_inject_fault()), so we must
+ * skip any markers we see from these other threads.
+ */
+ if (hdr->b_spa == 0)
+ continue;
+
+ /* we're only interested in evicting buffers of a certain spa */
+ if (spa != 0 && hdr->b_spa != spa) {
+ ARCSTAT_BUMP(arcstat_evict_skip);
+ continue;
+ }
+
+ hash_lock = HDR_LOCK(hdr);
+
+ /*
+ * We aren't calling this function from any code path
+ * that would already be holding a hash lock, so we're
+ * asserting on this assumption to be defensive in case
+ * this ever changes. Without this check, it would be
+ * possible to incorrectly increment arcstat_mutex_miss
+ * below (e.g. if the code changed such that we called
+ * this function with a hash lock held).
+ */
+ ASSERT(!MUTEX_HELD(hash_lock));
+
+ if (mutex_tryenter(hash_lock)) {
+ uint64_t evicted = arc_evict_hdr(hdr, hash_lock);
+ mutex_exit(hash_lock);
+
+ bytes_evicted += evicted;
+
+ /*
+ * If evicted is zero, arc_evict_hdr() must have
+ * decided to skip this header, don't increment
+ * evict_count in this case.
+ */
+ if (evicted != 0)
+ evict_count++;
+
+ /*
+ * If arc_size isn't overflowing, signal any
+ * threads that might happen to be waiting.
+ *
+ * For each header evicted, we wake up a single
+ * thread. If we used cv_broadcast, we could
+ * wake up "too many" threads causing arc_size
+ * to significantly overflow arc_c; since
+ * arc_get_data_buf() doesn't check for overflow
+ * when it's woken up (it doesn't because it's
+ * possible for the ARC to be overflowing while
+ * full of un-evictable buffers, and the
+ * function should proceed in this case).
+ *
+ * If threads are left sleeping, due to not
+ * using cv_broadcast, they will be woken up
+ * just before arc_reclaim_thread() sleeps.
+ */
+ mutex_enter(&arc_reclaim_lock);
+ if (!arc_is_overflowing())
+ cv_signal(&arc_reclaim_waiters_cv);
+ mutex_exit(&arc_reclaim_lock);
+ } else {
+ ARCSTAT_BUMP(arcstat_mutex_miss);
+ }
+ }
+
+ multilist_sublist_unlock(mls);
+
+ return (bytes_evicted);
}
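
The marker technique above lets the evictor drop and retake locks without losing its place: only the marker is ever unlinked and re-linked, never the header being examined. Roughly what multilist_sublist_move_forward() does, sketched on a bare doubly linked list with illustrative names:

    #include <stddef.h>

    struct sk_dnode {
    	struct sk_dnode *prev, *next;
    	int is_marker;		/* models b_spa == 0 for ARC eviction markers */
    };

    /*
     * Hop the marker one element toward the head so that prev(marker)
     * yields the next eviction candidate on the following iteration.
     */
    static void
    sk_marker_move_forward(struct sk_dnode **headp, struct sk_dnode *marker)
    {
    	struct sk_dnode *dest = marker->prev;

    	if (dest == NULL)
    		return;				/* already at the head */

    	/* unlink the marker */
    	if (marker->next != NULL)
    		marker->next->prev = marker->prev;
    	dest->next = marker->next;

    	/* re-link it immediately before 'dest' */
    	marker->prev = dest->prev;
    	marker->next = dest;
    	if (dest->prev != NULL)
    		dest->prev->next = marker;
    	else
    		*headp = marker;	/* marker becomes the new head */
    	dest->prev = marker;
    }
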
/*
- * Evict buffers from list until we've removed the specified number of
- * bytes. Move the removed buffers to the appropriate evict state.
- * If the recycle flag is set, then attempt to "recycle" a buffer:
- * - look for a buffer to evict that is `bytes' long.
- * - return the data block from this buffer rather than freeing it.
- * This flag is used by callers that are trying to make space for a
- * new buffer in a full arc cache.
+ * Evict buffers from the given arc state, until we've removed the
+ * specified number of bytes. Move the removed buffers to the
+ * appropriate evict state.
*
- * This function makes a "best effort". It skips over any buffers
- * it can't get a hash_lock on, and so may not catch all candidates.
+ * This function makes a "best effort". It skips over any buffers
+ * it can't get a hash_lock on, and so may not catch all candidates.
* It may also return without evicting as much space as requested.
+ *
+ * If bytes is specified using the special value ARC_EVICT_ALL, this
+ * will evict all available (i.e. unlocked and evictable) buffers from
+ * the given arc state; which is used by arc_flush().
*/
-static void *
-arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle,
+static uint64_t
+arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes,
arc_buf_contents_t type)
{
- arc_state_t *evicted_state;
- uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
- int64_t bytes_remaining;
- arc_buf_hdr_t *ab, *ab_prev = NULL;
- list_t *evicted_list, *list, *evicted_list_start, *list_start;
- kmutex_t *lock, *evicted_lock;
- kmutex_t *hash_lock;
- boolean_t have_lock;
- void *stolen = NULL;
- static int evict_metadata_offset, evict_data_offset;
- int i, idx, offset, list_count, count;
+ uint64_t total_evicted = 0;
+ multilist_t *ml = &state->arcs_list[type];
+ int num_sublists;
+ arc_buf_hdr_t **markers;
- ASSERT(state == arc_mru || state == arc_mfu);
+ IMPLY(bytes < 0, bytes == ARC_EVICT_ALL);
- evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
+ num_sublists = multilist_get_num_sublists(ml);
- if (type == ARC_BUFC_METADATA) {
- offset = 0;
- list_count = ARC_BUFC_NUMMETADATALISTS;
- list_start = &state->arcs_lists[0];
- evicted_list_start = &evicted_state->arcs_lists[0];
- idx = evict_metadata_offset;
- } else {
- offset = ARC_BUFC_NUMMETADATALISTS;
- list_start = &state->arcs_lists[offset];
- evicted_list_start = &evicted_state->arcs_lists[offset];
- list_count = ARC_BUFC_NUMDATALISTS;
- idx = evict_data_offset;
+ /*
+ * If we've tried to evict from each sublist, made some
+ * progress, but still have not hit the target number of bytes
+ * to evict, we want to keep trying. The markers allow us to
+ * pick up where we left off for each individual sublist, rather
+ * than starting from the tail each time.
+ */
+ markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP);
+ for (int i = 0; i < num_sublists; i++) {
+ markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);
+
+ /*
+ * A b_spa of 0 is used to indicate that this header is
+ * a marker. This fact is used in arc_adjust_type() and
+ * arc_evict_state_impl().
+ */
+ markers[i]->b_spa = 0;
+
+ multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
+ multilist_sublist_insert_tail(mls, markers[i]);
+ multilist_sublist_unlock(mls);
}
- bytes_remaining = evicted_state->arcs_lsize[type];
- count = 0;
-evict_start:
- list = &list_start[idx];
- evicted_list = &evicted_list_start[idx];
- lock = ARCS_LOCK(state, (offset + idx));
- evicted_lock = ARCS_LOCK(evicted_state, (offset + idx));
+ /*
+ * While we haven't hit our target number of bytes to evict, or
+ * we're evicting all available buffers.
+ */
+ while (total_evicted < bytes || bytes == ARC_EVICT_ALL) {
+ /*
+ * Start eviction using a randomly selected sublist,
+ * this is to try and evenly balance eviction across all
+ * sublists. Always starting at the same sublist
+ * (e.g. index 0) would cause evictions to favor certain
+ * sublists over others.
+ */
+ int sublist_idx = multilist_get_random_index(ml);
+ uint64_t scan_evicted = 0;
- mutex_enter(lock);
- mutex_enter(evicted_lock);
+ for (int i = 0; i < num_sublists; i++) {
+ uint64_t bytes_remaining;
+ uint64_t bytes_evicted;
- for (ab = list_tail(list); ab; ab = ab_prev) {
- ab_prev = list_prev(list, ab);
- bytes_remaining -= (ab->b_size * ab->b_datacnt);
- /* prefetch buffers have a minimum lifespan */
- if (HDR_IO_IN_PROGRESS(ab) ||
- (spa && ab->b_spa != spa) ||
- (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
- ddi_get_lbolt() - ab->b_arc_access <
- arc_min_prefetch_lifespan)) {
- skipped++;
- continue;
+ if (bytes == ARC_EVICT_ALL)
+ bytes_remaining = ARC_EVICT_ALL;
+ else if (total_evicted < bytes)
+ bytes_remaining = bytes - total_evicted;
+ else
+ break;
+
+ bytes_evicted = arc_evict_state_impl(ml, sublist_idx,
+ markers[sublist_idx], spa, bytes_remaining);
+
+ scan_evicted += bytes_evicted;
+ total_evicted += bytes_evicted;
+
+ /* we've reached the end, wrap to the beginning */
+ if (++sublist_idx >= num_sublists)
+ sublist_idx = 0;
}
- /* "lookahead" for better eviction candidate */
- if (recycle && ab->b_size != bytes &&
- ab_prev && ab_prev->b_size == bytes)
- continue;
- hash_lock = HDR_LOCK(ab);
- have_lock = MUTEX_HELD(hash_lock);
- if (have_lock || mutex_tryenter(hash_lock)) {
- ASSERT0(refcount_count(&ab->b_refcnt));
- ASSERT(ab->b_datacnt > 0);
- while (ab->b_buf) {
- arc_buf_t *buf = ab->b_buf;
- if (!mutex_tryenter(&buf->b_evict_lock)) {
- missed += 1;
- break;
- }
- if (buf->b_data) {
- bytes_evicted += ab->b_size;
- if (recycle && ab->b_type == type &&
- ab->b_size == bytes &&
- !HDR_L2_WRITING(ab)) {
- stolen = buf->b_data;
- recycle = FALSE;
- }
- }
- if (buf->b_efunc) {
- mutex_enter(&arc_eviction_mtx);
- arc_buf_destroy(buf,
- buf->b_data == stolen, FALSE);
- ab->b_buf = buf->b_next;
- buf->b_hdr = &arc_eviction_hdr;
- buf->b_next = arc_eviction_list;
- arc_eviction_list = buf;
- mutex_exit(&arc_eviction_mtx);
- mutex_exit(&buf->b_evict_lock);
- } else {
- mutex_exit(&buf->b_evict_lock);
- arc_buf_destroy(buf,
- buf->b_data == stolen, TRUE);
- }
- }
- if (ab->b_l2hdr) {
- ARCSTAT_INCR(arcstat_evict_l2_cached,
- ab->b_size);
- } else {
- if (l2arc_write_eligible(ab->b_spa, ab)) {
- ARCSTAT_INCR(arcstat_evict_l2_eligible,
- ab->b_size);
- } else {
- ARCSTAT_INCR(
- arcstat_evict_l2_ineligible,
- ab->b_size);
- }
+ /*
+ * If we didn't evict anything during this scan, we have
+ * no reason to believe we'll evict more during another
+ * scan, so break the loop.
+ */
+ if (scan_evicted == 0) {
+ /* This isn't possible, let's make that obvious */
+ ASSERT3S(bytes, !=, 0);
+
+ /*
+ * When bytes is ARC_EVICT_ALL, the only way to
+ * break the loop is when scan_evicted is zero.
+ * In that case, we actually have evicted enough,
+ * so we don't want to increment the kstat.
+ */
+ if (bytes != ARC_EVICT_ALL) {
+ ASSERT3S(total_evicted, <, bytes);
+ ARCSTAT_BUMP(arcstat_evict_not_enough);
}
- if (ab->b_datacnt == 0) {
- arc_change_state(evicted_state, ab, hash_lock);
- ASSERT(HDR_IN_HASH_TABLE(ab));
- ab->b_flags |= ARC_IN_HASH_TABLE;
- ab->b_flags &= ~ARC_BUF_AVAILABLE;
- DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
- }
- if (!have_lock)
- mutex_exit(hash_lock);
- if (bytes >= 0 && bytes_evicted >= bytes)
- break;
- if (bytes_remaining > 0) {
- mutex_exit(evicted_lock);
- mutex_exit(lock);
- idx = ((idx + 1) & (list_count - 1));
- count++;
- goto evict_start;
- }
- } else {
- missed += 1;
+ break;
}
}
- mutex_exit(evicted_lock);
- mutex_exit(lock);
+ for (int i = 0; i < num_sublists; i++) {
+ multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
+ multilist_sublist_remove(mls, markers[i]);
+ multilist_sublist_unlock(mls);
- idx = ((idx + 1) & (list_count - 1));
- count++;
-
- if (bytes_evicted < bytes) {
- if (count < list_count)
- goto evict_start;
- else
- dprintf("only evicted %lld bytes from %x",
- (longlong_t)bytes_evicted, state);
+ kmem_cache_free(hdr_full_cache, markers[i]);
}
- if (type == ARC_BUFC_METADATA)
- evict_metadata_offset = idx;
- else
- evict_data_offset = idx;
+ kmem_free(markers, sizeof (*markers) * num_sublists);
- if (skipped)
- ARCSTAT_INCR(arcstat_evict_skip, skipped);
+ return (total_evicted);
+}
- if (missed)
- ARCSTAT_INCR(arcstat_mutex_miss, missed);
+/*
+ * Flush all "evictable" data of the given type from the arc state
+ * specified. This will not evict any "active" buffers (i.e. referenced).
+ *
+ * When 'retry' is set to B_FALSE, the function will make a single pass
+ * over the state and evict any buffers that it can. Since it doesn't
+ * continually retry the eviction, it might end up leaving some buffers
+ * in the ARC due to lock misses.
+ *
+ * When 'retry' is set to B_TRUE, the function will continually retry the
+ * eviction until *all* evictable buffers have been removed from the
+ * state. As a result, if concurrent insertions into the state are
+ * allowed (e.g. if the ARC isn't shutting down), this function might
+ * wind up in an infinite loop, continually trying to evict buffers.
+ */
+static uint64_t
+arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type,
+ boolean_t retry)
+{
+ uint64_t evicted = 0;
- /*
- * We have just evicted some data into the ghost state, make
- * sure we also adjust the ghost state size if necessary.
- */
- if (arc_no_grow &&
- arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) {
- int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size +
- arc_mru_ghost->arcs_size - arc_c;
+ while (refcount_count(&state->arcs_esize[type]) != 0) {
+ evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type);
- if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) {
- int64_t todelete =
- MIN(arc_mru_ghost->arcs_lsize[type], mru_over);
- arc_evict_ghost(arc_mru_ghost, 0, todelete);
- } else if (arc_mfu_ghost->arcs_lsize[type] > 0) {
- int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type],
- arc_mru_ghost->arcs_size +
- arc_mfu_ghost->arcs_size - arc_c);
- arc_evict_ghost(arc_mfu_ghost, 0, todelete);
- }
+ if (!retry)
+ break;
}
- if (stolen)
- ARCSTAT_BUMP(arcstat_stolen);
- return (stolen);
+ return (evicted);
}
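
The retry flag above only controls whether the loop re-reads arcs_esize after each pass; with retry set, eviction continues until nothing evictable remains. The loop shape, abstracted behind two hypothetical callbacks standing in for refcount_count(&state->arcs_esize[type]) and arc_evict_state():

    #include <stdbool.h>
    #include <stdint.h>

    static uint64_t
    sk_flush_until_empty(uint64_t (*evictable_bytes)(void),
        uint64_t (*evict_pass)(void), bool retry)
    {
    	uint64_t evicted = 0;

    	while (evictable_bytes() != 0) {
    		evicted += evict_pass();	/* one full ARC_EVICT_ALL pass */
    		if (!retry)
    			break;			/* single best-effort pass */
    	}
    	return (evicted);
    }
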
/*
- * Remove buffers from list until we've removed the specified number of
- * bytes. Destroy the buffers that are removed.
+ * Evict the specified number of bytes from the state specified,
+ * restricting eviction to the spa and type given. This function
+ * prevents us from trying to evict more from a state's list than
+ * is "evictable", and to skip evicting altogether when passed a
+ * negative value for "bytes". In contrast, arc_evict_state() will
+ * evict everything it can, when passed a negative value for "bytes".
*/
-static void
-arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes)
+static uint64_t
+arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes,
+ arc_buf_contents_t type)
{
- arc_buf_hdr_t *ab, *ab_prev;
- arc_buf_hdr_t marker = { 0 };
- list_t *list, *list_start;
- kmutex_t *hash_lock, *lock;
- uint64_t bytes_deleted = 0;
- uint64_t bufs_skipped = 0;
- static int evict_offset;
- int list_count, idx = evict_offset;
- int offset, count = 0;
+ int64_t delta;
- ASSERT(GHOST_STATE(state));
+ if (bytes > 0 && refcount_count(&state->arcs_esize[type]) > 0) {
+ delta = MIN(refcount_count(&state->arcs_esize[type]), bytes);
+ return (arc_evict_state(state, spa, delta, type));
+ }
+ return (0);
+}
+
+/*
+ * Evict metadata buffers from the cache, such that arc_meta_used is
+ * capped by the arc_meta_limit tunable.
+ */
+static uint64_t
+arc_adjust_meta(void)
+{
+ uint64_t total_evicted = 0;
+ int64_t target;
+
/*
- * data lists come after metadata lists
+ * If we're over the meta limit, we want to evict enough
+ * metadata to get back under the meta limit. We don't want to
+ * evict so much that we drop the MRU below arc_p, though. If
+ * we're over the meta limit more than we're over arc_p, we
+ * evict some from the MRU here, and some from the MFU below.
*/
- list_start = &state->arcs_lists[ARC_BUFC_NUMMETADATALISTS];
- list_count = ARC_BUFC_NUMDATALISTS;
- offset = ARC_BUFC_NUMMETADATALISTS;
+ target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
+ (int64_t)(refcount_count(&arc_anon->arcs_size) +
+ refcount_count(&arc_mru->arcs_size) - arc_p));
-evict_start:
- list = &list_start[idx];
- lock = ARCS_LOCK(state, idx + offset);
+ total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
- mutex_enter(lock);
- for (ab = list_tail(list); ab; ab = ab_prev) {
- ab_prev = list_prev(list, ab);
- if (spa && ab->b_spa != spa)
- continue;
+ /*
+ * Similar to the above, we want to evict enough bytes to get us
+ * below the meta limit, but not so much as to drop us below the
+ * space allotted to the MFU (which is defined as arc_c - arc_p).
+ */
+ target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
+ (int64_t)(refcount_count(&arc_mfu->arcs_size) - (arc_c - arc_p)));
- /* ignore markers */
- if (ab->b_spa == 0)
- continue;
+ total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
- hash_lock = HDR_LOCK(ab);
- /* caller may be trying to modify this buffer, skip it */
- if (MUTEX_HELD(hash_lock))
- continue;
- if (mutex_tryenter(hash_lock)) {
- ASSERT(!HDR_IO_IN_PROGRESS(ab));
- ASSERT(ab->b_buf == NULL);
- ARCSTAT_BUMP(arcstat_deleted);
- bytes_deleted += ab->b_size;
+ return (total_evicted);
+}
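
Both targets above are a MIN() of "how far we are over the meta limit" and "how far the relevant list is over its share of the cache", so the tighter bound always wins. A small standalone example with made-up sizes (all values hypothetical, in bytes):

    #include <stdint.h>
    #include <stdio.h>

    static int64_t
    sk_min64(int64_t a, int64_t b)
    {
    	return (a < b ? a : b);
    }

    int
    main(void)
    {
    	/* 100 bytes over the meta limit, but anon+MRU only 30 over arc_p. */
    	int64_t meta_used = 600, meta_limit = 500;
    	int64_t anon = 50, mru = 300, arc_p = 320;

    	int64_t target = sk_min64(meta_used - meta_limit,
    	    anon + mru - arc_p);

    	/* Prints 30: the arc_p bound is the tighter of the two. */
    	printf("evict %lld bytes of MRU metadata\n", (long long)target);
    	return (0);
    }
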
- if (ab->b_l2hdr != NULL) {
- /*
- * This buffer is cached on the 2nd Level ARC;
- * don't destroy the header.
- */
- arc_change_state(arc_l2c_only, ab, hash_lock);
- mutex_exit(hash_lock);
- } else {
- arc_change_state(arc_anon, ab, hash_lock);
- mutex_exit(hash_lock);
- arc_hdr_destroy(ab);
- }
+/*
+ * Return the type of the oldest buffer in the given arc state
+ *
+ * This function will select a random sublist of type ARC_BUFC_DATA and
+ * a random sublist of type ARC_BUFC_METADATA. The tail of each sublist
+ * is compared, and the type which contains the "older" buffer will be
+ * returned.
+ */
+static arc_buf_contents_t
+arc_adjust_type(arc_state_t *state)
+{
+ multilist_t *data_ml = &state->arcs_list[ARC_BUFC_DATA];
+ multilist_t *meta_ml = &state->arcs_list[ARC_BUFC_METADATA];
+ int data_idx = multilist_get_random_index(data_ml);
+ int meta_idx = multilist_get_random_index(meta_ml);
+ multilist_sublist_t *data_mls;
+ multilist_sublist_t *meta_mls;
+ arc_buf_contents_t type;
+ arc_buf_hdr_t *data_hdr;
+ arc_buf_hdr_t *meta_hdr;
- DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
- if (bytes >= 0 && bytes_deleted >= bytes)
- break;
- } else if (bytes < 0) {
- /*
- * Insert a list marker and then wait for the
- * hash lock to become available. Once its
- * available, restart from where we left off.
- */
- list_insert_after(list, ab, &marker);
- mutex_exit(lock);
- mutex_enter(hash_lock);
- mutex_exit(hash_lock);
- mutex_enter(lock);
- ab_prev = list_prev(list, &marker);
- list_remove(list, &marker);
- } else
- bufs_skipped += 1;
- }
- mutex_exit(lock);
- idx = ((idx + 1) & (ARC_BUFC_NUMDATALISTS - 1));
- count++;
+ /*
+ * We keep the sublist lock until we're finished, to prevent
+ * the headers from being destroyed via arc_evict_state().
+ */
+ data_mls = multilist_sublist_lock(data_ml, data_idx);
+ meta_mls = multilist_sublist_lock(meta_ml, meta_idx);
- if (count < list_count)
- goto evict_start;
+ /*
+ * These two loops are to ensure we skip any markers that
+ * might be at the tail of the lists due to arc_evict_state().
+ */
- evict_offset = idx;
- if ((uintptr_t)list > (uintptr_t)&state->arcs_lists[ARC_BUFC_NUMMETADATALISTS] &&
- (bytes < 0 || bytes_deleted < bytes)) {
- list_start = &state->arcs_lists[0];
- list_count = ARC_BUFC_NUMMETADATALISTS;
- offset = count = 0;
- goto evict_start;
+ for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL;
+ data_hdr = multilist_sublist_prev(data_mls, data_hdr)) {
+ if (data_hdr->b_spa != 0)
+ break;
}
- if (bufs_skipped) {
- ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
- ASSERT(bytes >= 0);
+ for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL;
+ meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) {
+ if (meta_hdr->b_spa != 0)
+ break;
}
- if (bytes_deleted < bytes)
- dprintf("only deleted %lld bytes from %p",
- (longlong_t)bytes_deleted, state);
+ if (data_hdr == NULL && meta_hdr == NULL) {
+ type = ARC_BUFC_DATA;
+ } else if (data_hdr == NULL) {
+ ASSERT3P(meta_hdr, !=, NULL);
+ type = ARC_BUFC_METADATA;
+ } else if (meta_hdr == NULL) {
+ ASSERT3P(data_hdr, !=, NULL);
+ type = ARC_BUFC_DATA;
+ } else {
+ ASSERT3P(data_hdr, !=, NULL);
+ ASSERT3P(meta_hdr, !=, NULL);
+
+ /* The headers can't be on the sublist without an L1 header */
+ ASSERT(HDR_HAS_L1HDR(data_hdr));
+ ASSERT(HDR_HAS_L1HDR(meta_hdr));
+
+ if (data_hdr->b_l1hdr.b_arc_access <
+ meta_hdr->b_l1hdr.b_arc_access) {
+ type = ARC_BUFC_DATA;
+ } else {
+ type = ARC_BUFC_METADATA;
+ }
+ }
+
+ multilist_sublist_unlock(meta_mls);
+ multilist_sublist_unlock(data_mls);
+
+ return (type);
}
-static void
+/*
+ * Evict buffers from the cache, such that arc_size is capped by arc_c.
+ */
+static uint64_t
arc_adjust(void)
{
- int64_t adjustment, delta;
+ uint64_t total_evicted = 0;
+ uint64_t bytes;
+ int64_t target;
/*
+ * If we're over arc_meta_limit, we want to correct that before
+ * potentially evicting data buffers below.
+ */
+ total_evicted += arc_adjust_meta();
+
+ /*
* Adjust MRU size
+ *
+ * If we're over the target cache size, we want to evict enough
+ * from the list to get back to our target size. We don't want
+ * to evict too much from the MRU, such that it drops below
+ * arc_p. So, if we're over our target cache size more than
+ * the MRU is over arc_p, we'll evict enough to get back to
+ * arc_p here, and then evict more from the MFU below.
*/
+ target = MIN((int64_t)(arc_size - arc_c),
+ (int64_t)(refcount_count(&arc_anon->arcs_size) +
+ refcount_count(&arc_mru->arcs_size) + arc_meta_used - arc_p));
- adjustment = MIN((int64_t)(arc_size - arc_c),
- (int64_t)(arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used -
- arc_p));
+ /*
+ * If we're below arc_meta_min, always prefer to evict data.
+ * Otherwise, try to satisfy the requested number of bytes to
+ * evict from the type which contains older buffers; in an
+ * effort to keep newer buffers in the cache regardless of their
+ * type. If we cannot satisfy the number of bytes from this
+ * type, spill over into the next type.
+ */
+ if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA &&
+ arc_meta_used > arc_meta_min) {
+ bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
+ total_evicted += bytes;
- if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
- delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment);
- (void) arc_evict(arc_mru, 0, delta, FALSE, ARC_BUFC_DATA);
- adjustment -= delta;
- }
+ /*
+ * If we couldn't evict our target number of bytes from
+ * metadata, we try to get the rest from data.
+ */
+ target -= bytes;
- if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
- delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment);
- (void) arc_evict(arc_mru, 0, delta, FALSE,
- ARC_BUFC_METADATA);
+ total_evicted +=
+ arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
+ } else {
+ bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
+ total_evicted += bytes;
+
+ /*
+ * If we couldn't evict our target number of bytes from
+ * data, we try to get the rest from metadata.
+ */
+ target -= bytes;
+
+ total_evicted +=
+ arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
}
/*
* Adjust MFU size
+ *
+ * Now that we've tried to evict enough from the MRU to get its
+ * size back to arc_p, if we're still above the target cache
+ * size, we evict the rest from the MFU.
*/
+ target = arc_size - arc_c;
- adjustment = arc_size - arc_c;
+ if (arc_adjust_type(arc_mfu) == ARC_BUFC_METADATA &&
+ arc_meta_used > arc_meta_min) {
+ bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
+ total_evicted += bytes;
- if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
- delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]);
- (void) arc_evict(arc_mfu, 0, delta, FALSE, ARC_BUFC_DATA);
- adjustment -= delta;
- }
+ /*
+ * If we couldn't evict our target number of bytes from
+ * metadata, we try to get the rest from data.
+ */
+ target -= bytes;
- if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
- int64_t delta = MIN(adjustment,
- arc_mfu->arcs_lsize[ARC_BUFC_METADATA]);
- (void) arc_evict(arc_mfu, 0, delta, FALSE,
- ARC_BUFC_METADATA);
+ total_evicted +=
+ arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
+ } else {
+ bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
+ total_evicted += bytes;
+
+ /*
+ * If we couldn't evict our target number of bytes from
+ * data, we try to get the rest from metadata.
+ */
+ target -= bytes;
+
+ total_evicted +=
+ arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
}
/*
* Adjust ghost lists
+ *
+ * In addition to the above, the ARC also defines target values
+ * for the ghost lists. The sum of the mru list and mru ghost
+ * list should never exceed the target size of the cache, and
+ * the sum of the mru list, mfu list, mru ghost list, and mfu
+ * ghost list should never exceed twice the target size of the
+ * cache. The following logic enforces these limits on the ghost
+ * caches, and evicts from them as needed.
*/
+ target = refcount_count(&arc_mru->arcs_size) +
+ refcount_count(&arc_mru_ghost->arcs_size) - arc_c;
- adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c;
+ bytes = arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA);
+ total_evicted += bytes;
- if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) {
- delta = MIN(arc_mru_ghost->arcs_size, adjustment);
- arc_evict_ghost(arc_mru_ghost, 0, delta);
- }
+ target -= bytes;
- adjustment =
- arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c;
+ total_evicted +=
+ arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA);
- if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) {
- delta = MIN(arc_mfu_ghost->arcs_size, adjustment);
- arc_evict_ghost(arc_mfu_ghost, 0, delta);
- }
-}
-
-static void
-arc_do_user_evicts(void)
-{
- static arc_buf_t *tmp_arc_eviction_list;
-
/*
- * Move list over to avoid LOR
+ * We assume the sum of the mru list and mfu list is less than
+ * or equal to arc_c (we enforced this above), which means we
+ * can use the simpler of the two equations below:
+ *
+ * mru + mfu + mru ghost + mfu ghost <= 2 * arc_c
+ * mru ghost + mfu ghost <= arc_c
*/
-restart:
- mutex_enter(&arc_eviction_mtx);
- tmp_arc_eviction_list = arc_eviction_list;
- arc_eviction_list = NULL;
- mutex_exit(&arc_eviction_mtx);
+ target = refcount_count(&arc_mru_ghost->arcs_size) +
+ refcount_count(&arc_mfu_ghost->arcs_size) - arc_c;
- while (tmp_arc_eviction_list != NULL) {
- arc_buf_t *buf = tmp_arc_eviction_list;
- tmp_arc_eviction_list = buf->b_next;
- mutex_enter(&buf->b_evict_lock);
- buf->b_hdr = NULL;
- mutex_exit(&buf->b_evict_lock);
+ bytes = arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA);
+ total_evicted += bytes;
- if (buf->b_efunc != NULL)
- VERIFY(buf->b_efunc(buf) == 0);
+ target -= bytes;
- buf->b_efunc = NULL;
- buf->b_private = NULL;
- kmem_cache_free(buf_cache, buf);
- }
+ total_evicted +=
+ arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA);
- if (arc_eviction_list != NULL)
- goto restart;
+ return (total_evicted);
}
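The target arithmetic in arc_adjust() above can be illustrated with a small stand-alone sketch. The sizes are hypothetical and evict_impl() is a simplified stand-in for arc_adjust_impl(); the point is the two-stage pattern of evicting from the preferred type first and spilling the remainder into the other type.

#include <stdint.h>
#include <stdio.h>

/*
 * Hypothetical eviction primitive: evict up to 'target' bytes, bounded
 * by what the list actually holds; returns the number of bytes evicted.
 */
static int64_t
evict_impl(int64_t avail, int64_t target)
{
    if (target <= 0)
        return (0);
    return (target < avail ? target : avail);
}

int
main(void)
{
    /* Hypothetical sizes, in bytes. */
    int64_t arc_c = 1000, arc_p = 400, arc_size = 1200;
    int64_t anon = 50, mru = 500, meta_used = 100;

    /* MRU target: the same MIN() of the two overages used above. */
    int64_t over_c = arc_size - arc_c;                  /* 200 */
    int64_t over_p = anon + mru + meta_used - arc_p;    /* 250 */
    int64_t target = over_c < over_p ? over_c : over_p; /* 200 */

    /* Evict from the preferred type first, spill the rest over. */
    int64_t evicted = evict_impl(120, target);          /* 120 from data */
    target -= evicted;
    evicted += evict_impl(300, target);                 /* 80 from metadata */

    printf("evicted %lld bytes of the MRU overage\n", (long long)evicted);
    return (0);
}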
-/*
- * Flush all *evictable* data from the cache for the given spa.
- * NOTE: this will not touch "active" (i.e. referenced) data.
- */
void
-arc_flush(spa_t *spa)
+arc_flush(spa_t *spa, boolean_t retry)
{
uint64_t guid = 0;
- if (spa)
+ /*
+ * If retry is B_TRUE, a spa must not be specified since we have
+ * no good way to determine if all of a spa's buffers have been
+ * evicted from an arc state.
+ */
+ ASSERT(!retry || spa == 0);
+
+ if (spa != NULL)
guid = spa_load_guid(spa);
- while (arc_mru->arcs_lsize[ARC_BUFC_DATA]) {
- (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA);
- if (spa)
- break;
- }
- while (arc_mru->arcs_lsize[ARC_BUFC_METADATA]) {
- (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA);
- if (spa)
- break;
- }
- while (arc_mfu->arcs_lsize[ARC_BUFC_DATA]) {
- (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA);
- if (spa)
- break;
- }
- while (arc_mfu->arcs_lsize[ARC_BUFC_METADATA]) {
- (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA);
- if (spa)
- break;
- }
+ (void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry);
+ (void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry);
- arc_evict_ghost(arc_mru_ghost, guid, -1);
- arc_evict_ghost(arc_mfu_ghost, guid, -1);
+ (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry);
+ (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry);
- mutex_enter(&arc_reclaim_thr_lock);
- arc_do_user_evicts();
- mutex_exit(&arc_reclaim_thr_lock);
- ASSERT(spa || arc_eviction_list == NULL);
+ (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry);
+ (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry);
+
+ (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry);
+ (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry);
}
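As a hedged usage sketch only (the call sites are not part of this hunk): given the new signature, a pool-export path might flush a single pool's evictable buffers without retrying, while a full ARC teardown would flush everything and retry; the spa pointer and surrounding caller context are assumed.

    /* Flush one pool's evictable buffers; no retry allowed with a spa. */
    arc_flush(spa, B_FALSE);

    /*
     * Flush everything and retry until the states are empty, e.g. during
     * final teardown; a spa must not be given here, per the
     * ASSERT(!retry || spa == 0) above.
     */
    arc_flush(NULL, B_TRUE);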
void
-arc_shrink(void)
+arc_shrink(int64_t to_free)
{
if (arc_c > arc_c_min) {
- uint64_t to_free;
-
-#ifdef _KERNEL
- to_free = arc_c >> arc_shrink_shift;
-#else
- to_free = arc_c >> arc_shrink_shift;
-#endif
+ DTRACE_PROBE4(arc__shrink, uint64_t, arc_c, uint64_t,
+ arc_c_min, uint64_t, arc_p, uint64_t, to_free);
if (arc_c > arc_c_min + to_free)
atomic_add_64(&arc_c, -to_free);
else
@@ -2378,39 +3815,80 @@
arc_c = MAX(arc_size, arc_c_min);
if (arc_p > arc_c)
arc_p = (arc_c >> 1);
+
+ DTRACE_PROBE2(arc__shrunk, uint64_t, arc_c, uint64_t,
+ arc_p);
+
ASSERT(arc_c >= arc_c_min);
ASSERT((int64_t)arc_p >= 0);
}
- if (arc_size > arc_c)
- arc_adjust();
+ if (arc_size > arc_c) {
+ DTRACE_PROBE2(arc__shrink_adjust, uint64_t, arc_size,
+ uint64_t, arc_c);
+ (void) arc_adjust();
+ }
}
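A minimal numeric sketch of the clamping performed by arc_shrink() above; the sizes and the requested to_free value are hypothetical, and the stand-alone code simply mirrors the two bounds (never drop arc_c below arc_c_min, and never leave arc_c above what is actually resident).

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    uint64_t arc_c = 8ULL << 30;        /* 8 GiB target, hypothetical */
    uint64_t arc_c_min = 1ULL << 30;    /* 1 GiB floor, hypothetical */
    uint64_t arc_size = 7ULL << 30;     /* currently resident */
    uint64_t to_free = 512ULL << 20;    /* caller wants 512 MiB back */

    /* Same clamp as arc_shrink(): never drop arc_c below arc_c_min. */
    if (arc_c > arc_c_min + to_free)
        arc_c -= to_free;
    else
        arc_c = arc_c_min;

    /* Don't leave arc_c above what is resident (down to arc_c_min). */
    if (arc_size != 0 && arc_size < arc_c)
        arc_c = arc_size > arc_c_min ? arc_size : arc_c_min;

    printf("new arc_c = %llu MiB\n", (unsigned long long)(arc_c >> 20));
    return (0);
}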
-static int needfree = 0;
+static long needfree = 0;
-static int
-arc_reclaim_needed(void)
+typedef enum free_memory_reason_t {
+ FMR_UNKNOWN,
+ FMR_NEEDFREE,
+ FMR_LOTSFREE,
+ FMR_SWAPFS_MINFREE,
+ FMR_PAGES_PP_MAXIMUM,
+ FMR_HEAP_ARENA,
+ FMR_ZIO_ARENA,
+ FMR_ZIO_FRAG,
+} free_memory_reason_t;
+
+int64_t last_free_memory;
+free_memory_reason_t last_free_reason;
+
+/*
+ * Additional reserve of pages for pp_reserve.
+ */
+int64_t arc_pages_pp_reserve = 64;
+
+/*
+ * Additional reserve of pages for swapfs.
+ */
+int64_t arc_swapfs_reserve = 64;
+
+/*
+ * Return the amount of memory that can be consumed before reclaim will be
+ * needed. Positive if there is sufficient free memory, negative indicates
+ * the amount of memory that needs to be freed up.
+ */
+static int64_t
+arc_available_memory(void)
{
+ int64_t lowest = INT64_MAX;
+ int64_t n;
+ free_memory_reason_t r = FMR_UNKNOWN;
#ifdef _KERNEL
+ if (needfree > 0) {
+ n = PAGESIZE * (-needfree);
+ if (n < lowest) {
+ lowest = n;
+ r = FMR_NEEDFREE;
+ }
+ }
- if (needfree)
- return (1);
-
/*
* Cooperate with pagedaemon when it's time for it to scan
* and reclaim some pages.
*/
- if (vm_paging_needed())
- return (1);
+ n = PAGESIZE * ((int64_t)freemem - zfs_arc_free_target);
+ if (n < lowest) {
+ lowest = n;
+ r = FMR_LOTSFREE;
+ }
-#ifdef sun
+#ifdef illumos
/*
- * take 'desfree' extra pages, so we reclaim sooner, rather than later
- */
- extra = desfree;
-
- /*
* check that we're out of range of the pageout scanner. It starts to
* schedule paging if freemem is less than lotsfree and needfree.
* lotsfree is the high-water mark for pageout, and needfree is the
@@ -2417,8 +3895,11 @@
* number of needed free pages. We add extra pages here to make sure
* the scanner doesn't start up while we're freeing memory.
*/
- if (freemem < lotsfree + needfree + extra)
- return (1);
+ n = PAGESIZE * (freemem - lotsfree - needfree - desfree);
+ if (n < lowest) {
+ lowest = n;
+ r = FMR_LOTSFREE;
+ }
/*
* check to make sure that swapfs has enough space so that anon
@@ -2427,11 +3908,31 @@
* swap pages. We also add a bit of extra here just to prevent
* circumstances from getting really dire.
*/
- if (availrmem < swapfs_minfree + swapfs_reserve + extra)
- return (1);
+ n = PAGESIZE * (availrmem - swapfs_minfree - swapfs_reserve -
+ desfree - arc_swapfs_reserve);
+ if (n < lowest) {
+ lowest = n;
+ r = FMR_SWAPFS_MINFREE;
+ }
-#if defined(__i386)
+
/*
+ * Check that we have enough availrmem that memory locking (e.g., via
+ * mlock(3C) or memcntl(2)) can still succeed. (pages_pp_maximum
+ * stores the number of pages that cannot be locked; when availrmem
+ * drops below pages_pp_maximum, page locking mechanisms such as
+ * page_pp_lock() will fail.)
+ */
+ n = PAGESIZE * (availrmem - pages_pp_maximum -
+ arc_pages_pp_reserve);
+ if (n < lowest) {
+ lowest = n;
+ r = FMR_PAGES_PP_MAXIMUM;
+ }
+
+#endif /* illumos */
+#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
+ /*
* If we're on an i386 platform, it's possible that we'll exhaust the
* kernel heap space before we ever run out of available physical
* memory. Most checks of the size of the heap_area compare against
@@ -2442,32 +3943,85 @@
* heap is allocated. (Or, in the calculation, if less than 1/4th is
* free)
*/
- if (btop(vmem_size(heap_arena, VMEM_FREE)) <
- (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
- return (1);
+ n = (int64_t)vmem_size(heap_arena, VMEM_FREE) -
+ (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2);
+ if (n < lowest) {
+ lowest = n;
+ r = FMR_HEAP_ARENA;
+ }
+#define zio_arena NULL
+#else
+#define zio_arena heap_arena
#endif
-#else /* !sun */
- if (kmem_used() > (kmem_size() * 3) / 4)
- return (1);
-#endif /* sun */
-#else
+ /*
+ * If zio data pages are being allocated out of a separate heap segment,
+ * then enforce that the size of available vmem for this arena remains
+ * above about 1/16th free.
+ *
+ * Note: The 1/16th arena free requirement was put in place
+ * to aggressively evict memory from the arc in order to avoid
+ * memory fragmentation issues.
+ */
+ if (zio_arena != NULL) {
+ n = (int64_t)vmem_size(zio_arena, VMEM_FREE) -
+ (vmem_size(zio_arena, VMEM_ALLOC) >> 4);
+ if (n < lowest) {
+ lowest = n;
+ r = FMR_ZIO_ARENA;
+ }
+ }
+
+ /*
+ * Above limits know nothing about real level of KVA fragmentation.
+ * Start aggressive reclamation if too little sequential KVA left.
+ */
+ if (lowest > 0) {
+ n = (vmem_size(heap_arena, VMEM_MAXFREE) < zfs_max_recordsize) ?
+ -((int64_t)vmem_size(heap_arena, VMEM_ALLOC) >> 4) :
+ INT64_MAX;
+ if (n < lowest) {
+ lowest = n;
+ r = FMR_ZIO_FRAG;
+ }
+ }
+
+#else /* _KERNEL */
+ /* Every 100 calls, free a small amount */
if (spa_get_random(100) == 0)
- return (1);
-#endif
- return (0);
+ lowest = -1024;
+#endif /* _KERNEL */
+
+ last_free_memory = lowest;
+ last_free_reason = r;
+ DTRACE_PROBE2(arc__available_memory, int64_t, lowest, int, r);
+ return (lowest);
}
+
+/*
+ * Determine if the system is under memory pressure and is asking
+ * to reclaim memory. A return value of B_TRUE indicates that the system
+ * is under memory pressure and that the arc should adjust accordingly.
+ */
+static boolean_t
+arc_reclaim_needed(void)
+{
+ return (arc_available_memory() < 0);
+}
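The shape of arc_available_memory() -- take the smallest headroom across several independent checks, remember which check produced it, and treat a negative result as "reclaim needed" -- can be sketched in isolation. The values and reason names below are hypothetical stand-ins, not the kernel globals used above.

#include <stdint.h>
#include <stdio.h>

typedef enum { R_UNKNOWN, R_FREEMEM, R_HEAP, R_SWAP } reason_t;

typedef struct {
    int64_t headroom;   /* bytes; negative means memory must be freed */
    reason_t reason;
} check_t;

int
main(void)
{
    check_t checks[] = {
        {   64 << 20,  R_FREEMEM },
        { -(16 << 20), R_HEAP },
        {  128 << 20,  R_SWAP },
    };
    int64_t lowest = INT64_MAX;
    reason_t r = R_UNKNOWN;

    for (int i = 0; i < 3; i++) {
        if (checks[i].headroom < lowest) {
            lowest = checks[i].headroom;
            r = checks[i].reason;
        }
    }
    /* Negative lowest => arc_reclaim_needed() would return B_TRUE. */
    printf("lowest headroom %lld MiB (reason %d), reclaim %s\n",
        (long long)(lowest / (1 << 20)), (int)r,
        lowest < 0 ? "needed" : "not needed");
    return (0);
}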
+
extern kmem_cache_t *zio_buf_cache[];
extern kmem_cache_t *zio_data_buf_cache[];
+extern kmem_cache_t *range_seg_cache;
-static void
-arc_kmem_reap_now(arc_reclaim_strategy_t strat)
+static __noinline void
+arc_kmem_reap_now(void)
{
size_t i;
kmem_cache_t *prev_cache = NULL;
kmem_cache_t *prev_data_cache = NULL;
+ DTRACE_PROBE(arc__kmem_reap_start);
#ifdef _KERNEL
if (arc_meta_used >= arc_meta_limit) {
/*
@@ -2484,13 +4038,6 @@
#endif
#endif
- /*
- * An aggressive reclamation will shrink the cache size as well as
- * reap free buffers from the arc kmem caches.
- */
- if (strat == ARC_RECLAIM_AGGR)
- arc_shrink();
-
for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
if (zio_buf_cache[i] != prev_cache) {
prev_cache = zio_buf_cache[i];
@@ -2502,75 +4049,144 @@
}
}
kmem_cache_reap_now(buf_cache);
- kmem_cache_reap_now(hdr_cache);
+ kmem_cache_reap_now(hdr_full_cache);
+ kmem_cache_reap_now(hdr_l2only_cache);
+ kmem_cache_reap_now(range_seg_cache);
+
+#ifdef illumos
+ if (zio_arena != NULL) {
+ /*
+ * Ask the vmem arena to reclaim unused memory from its
+ * quantum caches.
+ */
+ vmem_qcache_reap(zio_arena);
+ }
+#endif
+ DTRACE_PROBE(arc__kmem_reap_end);
}
+/*
+ * Threads can block in arc_get_data_buf() waiting for this thread to evict
+ * enough data and signal them to proceed. When this happens, the threads in
+ * arc_get_data_buf() are sleeping while holding the hash lock for their
+ * particular arc header. Thus, we must be careful to never sleep on a
+ * hash lock in this thread. This is to prevent the following deadlock:
+ *
+ * - Thread A sleeps on CV in arc_get_data_buf() holding hash lock "L",
+ * waiting for the reclaim thread to signal it.
+ *
+ * - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter,
+ * fails, and goes to sleep forever.
+ *
+ * This possible deadlock is avoided by always acquiring a hash lock
+ * using mutex_tryenter() from arc_reclaim_thread().
+ */
static void
arc_reclaim_thread(void *dummy __unused)
{
- clock_t growtime = 0;
- arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS;
+ hrtime_t growtime = 0;
callb_cpr_t cpr;
- CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
+ CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG);
- mutex_enter(&arc_reclaim_thr_lock);
- while (arc_thread_exit == 0) {
- if (arc_reclaim_needed()) {
+ mutex_enter(&arc_reclaim_lock);
+ while (!arc_reclaim_thread_exit) {
+ uint64_t evicted = 0;
- if (arc_no_grow) {
- if (last_reclaim == ARC_RECLAIM_CONS) {
- last_reclaim = ARC_RECLAIM_AGGR;
- } else {
- last_reclaim = ARC_RECLAIM_CONS;
- }
- } else {
- arc_no_grow = TRUE;
- last_reclaim = ARC_RECLAIM_AGGR;
- membar_producer();
- }
+ /*
+ * This is necessary in order for the mdb ::arc dcmd to
+ * show up to date information. Since the ::arc command
+ * does not call the kstat's update function, without
+ * this call, the command may show stale stats for the
+ * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even
+ * with this change, the data might be up to 1 second
+ * out of date; but that should suffice. The arc_state_t
+ * structures can be queried directly if more accurate
+ * information is needed.
+ */
+ if (arc_ksp != NULL)
+ arc_ksp->ks_update(arc_ksp, KSTAT_READ);
- /* reset the growth delay for every reclaim */
- growtime = ddi_get_lbolt() + (arc_grow_retry * hz);
+ mutex_exit(&arc_reclaim_lock);
- if (needfree && last_reclaim == ARC_RECLAIM_CONS) {
- /*
- * If needfree is TRUE our vm_lowmem hook
- * was called and in that case we must free some
- * memory, so switch to aggressive mode.
- */
- arc_no_grow = TRUE;
- last_reclaim = ARC_RECLAIM_AGGR;
- }
- arc_kmem_reap_now(last_reclaim);
+ /*
+ * We call arc_adjust() before (possibly) calling
+ * arc_kmem_reap_now(), so that we can wake up
+ * arc_get_data_buf() sooner.
+ */
+ evicted = arc_adjust();
+
+ int64_t free_memory = arc_available_memory();
+ if (free_memory < 0) {
+
+ arc_no_grow = B_TRUE;
arc_warm = B_TRUE;
- } else if (arc_no_grow && ddi_get_lbolt() >= growtime) {
- arc_no_grow = FALSE;
- }
+ /*
+ * Wait at least zfs_grow_retry (default 60) seconds
+ * before considering growing.
+ */
+ growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
- arc_adjust();
+ arc_kmem_reap_now();
- if (arc_eviction_list != NULL)
- arc_do_user_evicts();
+ /*
+ * If we are still low on memory, shrink the ARC
+ * so that we have arc_shrink_min free space.
+ */
+ free_memory = arc_available_memory();
+ int64_t to_free =
+ (arc_c >> arc_shrink_shift) - free_memory;
+ if (to_free > 0) {
#ifdef _KERNEL
- if (needfree) {
+ to_free = MAX(to_free, ptob(needfree));
+#endif
+ arc_shrink(to_free);
+ }
+ } else if (free_memory < arc_c >> arc_no_grow_shift) {
+ arc_no_grow = B_TRUE;
+ } else if (gethrtime() >= growtime) {
+ arc_no_grow = B_FALSE;
+ }
+
+ mutex_enter(&arc_reclaim_lock);
+
+ /*
+ * If evicted is zero, we couldn't evict anything via
+ * arc_adjust(). This could be due to hash lock
+ * collisions, but more likely due to the majority of
+ * arc buffers being unevictable. Therefore, even if
+ * arc_size is above arc_c, another pass is unlikely to
+ * be helpful and could potentially cause us to enter an
+ * infinite loop.
+ */
+ if (arc_size <= arc_c || evicted == 0) {
+#ifdef _KERNEL
needfree = 0;
- wakeup(&needfree);
- }
#endif
+ /*
+ * We're either no longer overflowing, or we
+ * can't evict anything more, so we should wake
+ * up any threads before we go to sleep.
+ */
+ cv_broadcast(&arc_reclaim_waiters_cv);
- /* block until needed, or one second, whichever is shorter */
- CALLB_CPR_SAFE_BEGIN(&cpr);
- (void) cv_timedwait(&arc_reclaim_thr_cv,
- &arc_reclaim_thr_lock, hz);
- CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
+ /*
+ * Block until signaled, or after one second (we
+ * might need to perform arc_kmem_reap_now()
+ * even if we aren't being signalled)
+ */
+ CALLB_CPR_SAFE_BEGIN(&cpr);
+ (void) cv_timedwait_hires(&arc_reclaim_thread_cv,
+ &arc_reclaim_lock, SEC2NSEC(1), MSEC2NSEC(1), 0);
+ CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_lock);
+ }
}
- arc_thread_exit = 0;
- cv_broadcast(&arc_reclaim_thr_cv);
- CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */
+ arc_reclaim_thread_exit = B_FALSE;
+ cv_broadcast(&arc_reclaim_thread_cv);
+ CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_lock */
thread_exit();
}
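The loop above boils down to a small decision: keep evicting while the ARC is over target and progress is being made; otherwise wake any waiters and go back to sleep. A single-threaded sketch of that decision follows; the sizes and the fake eviction step are hypothetical, and the real loop uses cv_broadcast()/cv_timedwait_hires() as shown above rather than a print statement.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical eviction step: at most 100 units can be evicted per pass. */
static uint64_t
fake_adjust(uint64_t *arc_size, uint64_t arc_c)
{
    uint64_t over = (*arc_size > arc_c) ? *arc_size - arc_c : 0;
    uint64_t evicted = over > 100 ? 100 : over;
    *arc_size -= evicted;
    return (evicted);
}

int
main(void)
{
    uint64_t arc_c = 1000, arc_size = 1350;

    for (;;) {
        uint64_t evicted = fake_adjust(&arc_size, arc_c);
        printf("pass: evicted %llu, arc_size now %llu\n",
            (unsigned long long)evicted, (unsigned long long)arc_size);
        /*
         * Same exit test as the reclaim thread: either we are back under
         * target, or we made no progress (evicted == 0) and another pass
         * would likely just spin.
         */
        if (arc_size <= arc_c || evicted == 0) {
            printf("would cv_broadcast() the waiters and sleep\n");
            break;
        }
    }
    return (0);
}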
@@ -2584,6 +4200,8 @@
{
int mult;
uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
+ int64_t mrug_size = refcount_count(&arc_mru_ghost->arcs_size);
+ int64_t mfug_size = refcount_count(&arc_mfu_ghost->arcs_size);
if (state == arc_l2c_only)
return;
@@ -2598,8 +4216,7 @@
* target size of the MRU list.
*/
if (state == arc_mru_ghost) {
- mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
- 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));
+ mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size);
mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
@@ -2606,8 +4223,7 @@
} else if (state == arc_mfu_ghost) {
uint64_t delta;
- mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
- 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));
+ mult = (mfug_size >= mrug_size) ? 1 : (mrug_size / mfug_size);
mult = MIN(mult, 10);
delta = MIN(bytes * mult, arc_p);
@@ -2616,7 +4232,7 @@
ASSERT((int64_t)arc_p >= 0);
if (arc_reclaim_needed()) {
- cv_signal(&arc_reclaim_thr_cv);
+ cv_signal(&arc_reclaim_thread_cv);
return;
}
@@ -2631,6 +4247,7 @@
* cache size, increment the target cache size
*/
if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
+ DTRACE_PROBE1(arc__inc_adapt, int, bytes);
atomic_add_64(&arc_c, (int64_t)bytes);
if (arc_c > arc_c_max)
arc_c = arc_c_max;
@@ -2643,152 +4260,159 @@
}
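A brief worked example of the arc_p adjustment in the arc_adapt() hunks above, using hypothetical sizes: a hit in the MRU ghost list when the MFU ghost list is larger grows arc_p by a multiple of the buffer size, capped at 10x and bounded by arc_c - arc_p_min.

#include <stdint.h>
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

int
main(void)
{
    /* Hypothetical sizes (bytes): a hit in the MRU ghost list. */
    int64_t mrug_size = 100, mfug_size = 450;
    uint64_t arc_c = 1000, arc_p = 300, arc_p_min = 62, bytes = 8;

    /* MFU ghost is 4.5x larger, so favor the MRU side more strongly. */
    int mult = (mrug_size >= mfug_size) ? 1 : (int)(mfug_size / mrug_size);
    mult = MIN(mult, 10);                       /* mult == 4 */

    arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
    printf("arc_p grows to %llu\n", (unsigned long long)arc_p); /* 332 */
    return (0);
}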
/*
- * Check if the cache has reached its limits and eviction is required
- * prior to insert.
+ * Check if arc_size has grown past our upper threshold, determined by
+ * zfs_arc_overflow_shift.
*/
-static int
-arc_evict_needed(arc_buf_contents_t type)
+static boolean_t
+arc_is_overflowing(void)
{
- if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
- return (1);
+ /* Always allow at least one block of overflow */
+ uint64_t overflow = MAX(SPA_MAXBLOCKSIZE,
+ arc_c >> zfs_arc_overflow_shift);
-#ifdef sun
-#ifdef _KERNEL
- /*
- * If zio data pages are being allocated out of a separate heap segment,
- * then enforce that the size of available vmem for this area remains
- * above about 1/32nd free.
- */
- if (type == ARC_BUFC_DATA && zio_arena != NULL &&
- vmem_size(zio_arena, VMEM_FREE) <
- (vmem_size(zio_arena, VMEM_ALLOC) >> 5))
- return (1);
-#endif
-#endif /* sun */
-
- if (arc_reclaim_needed())
- return (1);
-
- return (arc_size > arc_c);
+ return (arc_size >= arc_c + overflow);
}
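A quick numeric illustration of the overflow threshold above, with hypothetical values: zfs_arc_overflow_shift is a tunable whose default is not shown in this hunk (8 is assumed here), and SPA_MAXBLOCKSIZE is taken as 16 MiB for the example.

#include <stdint.h>
#include <stdio.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))

int
main(void)
{
    uint64_t arc_c = 4ULL << 30;                /* 4 GiB target */
    uint64_t spa_maxblocksize = 16ULL << 20;    /* assumed 16 MiB */
    int zfs_arc_overflow_shift = 8;             /* assumed default */

    /* Always allow at least one max-sized block of overflow. */
    uint64_t overflow = MAX(spa_maxblocksize,
        arc_c >> zfs_arc_overflow_shift);       /* 16 MiB here */

    uint64_t arc_size = (4ULL << 30) + (20ULL << 20);   /* 20 MiB over */
    printf("overflowing: %s\n",
        arc_size >= arc_c + overflow ? "yes" : "no");   /* yes */
    return (0);
}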
/*
- * The buffer, supplied as the first argument, needs a data block.
- * So, if we are at cache max, determine which cache should be victimized.
- * We have the following cases:
- *
- * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
- * In this situation if we're out of space, but the resident size of the MFU is
- * under the limit, victimize the MFU cache to satisfy this insertion request.
- *
- * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
- * Here, we've used up all of the available space for the MRU, so we need to
- * evict from our own cache instead. Evict from the set of resident MRU
- * entries.
- *
- * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
- * c minus p represents the MFU space in the cache, since p is the size of the
- * cache that is dedicated to the MRU. In this situation there's still space on
- * the MFU side, so the MRU side needs to be victimized.
- *
- * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
- * MFU's resident set is consuming more space than it has been allotted. In
- * this situation, we must victimize our own cache, the MFU, for this insertion.
+ * Allocate a block and return it to the caller. If we are hitting the
+ * hard limit for the cache size, we must sleep, waiting for the eviction
+ * thread to catch up. If we're past the target size but below the hard
+ * limit, we'll only signal the reclaim thread and continue on.
*/
-static void
-arc_get_data_buf(arc_buf_t *buf)
+static void *
+arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
{
- arc_state_t *state = buf->b_hdr->b_state;
- uint64_t size = buf->b_hdr->b_size;
- arc_buf_contents_t type = buf->b_hdr->b_type;
+ void *datap = NULL;
+ arc_state_t *state = hdr->b_l1hdr.b_state;
+ arc_buf_contents_t type = arc_buf_type(hdr);
arc_adapt(size, state);
/*
- * We have not yet reached cache maximum size,
- * just allocate a new buffer.
+ * If arc_size is currently overflowing, and has grown past our
+ * upper limit, we must be adding data faster than the evict
+ * thread can evict. Thus, to ensure we don't compound the
+ * problem by adding more data and forcing arc_size to grow even
+ * further past its target size, we halt and wait for the
+ * eviction thread to catch up.
+ *
+ * It's also possible that the reclaim thread is unable to evict
+ * enough buffers to get arc_size below the overflow limit (e.g.
+ * due to buffers being un-evictable, or hash lock collisions).
+ * In this case, we want to proceed regardless if we're
+ * overflowing; thus we don't use a while loop here.
*/
- if (!arc_evict_needed(type)) {
- if (type == ARC_BUFC_METADATA) {
- buf->b_data = zio_buf_alloc(size);
- arc_space_consume(size, ARC_SPACE_DATA);
- } else {
- ASSERT(type == ARC_BUFC_DATA);
- buf->b_data = zio_data_buf_alloc(size);
- ARCSTAT_INCR(arcstat_data_size, size);
- atomic_add_64(&arc_size, size);
+ if (arc_is_overflowing()) {
+ mutex_enter(&arc_reclaim_lock);
+
+ /*
+ * Now that we've acquired the lock, we may no longer be
+ * over the overflow limit, so let's check.
+ *
+ * We're ignoring the case of spurious wake ups. If that
+ * were to happen, it'd let this thread consume an ARC
+ * buffer before it should have (i.e. before we're under
+ * the overflow limit and were signalled by the reclaim
+ * thread). As long as that is a rare occurrence, it
+ * shouldn't cause any harm.
+ */
+ if (arc_is_overflowing()) {
+ cv_signal(&arc_reclaim_thread_cv);
+ cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock);
}
- goto out;
+
+ mutex_exit(&arc_reclaim_lock);
}
- /*
- * If we are prefetching from the mfu ghost list, this buffer
- * will end up on the mru list; so steal space from there.
- */
- if (state == arc_mfu_ghost)
- state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
- else if (state == arc_mru_ghost)
- state = arc_mru;
-
- if (state == arc_mru || state == arc_anon) {
- uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
- state = (arc_mfu->arcs_lsize[type] >= size &&
- arc_p > mru_used) ? arc_mfu : arc_mru;
+ VERIFY3U(hdr->b_type, ==, type);
+ if (type == ARC_BUFC_METADATA) {
+ datap = zio_buf_alloc(size);
+ arc_space_consume(size, ARC_SPACE_META);
} else {
- /* MFU cases */
- uint64_t mfu_space = arc_c - arc_p;
- state = (arc_mru->arcs_lsize[type] >= size &&
- mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
+ ASSERT(type == ARC_BUFC_DATA);
+ datap = zio_data_buf_alloc(size);
+ arc_space_consume(size, ARC_SPACE_DATA);
}
- if ((buf->b_data = arc_evict(state, 0, size, TRUE, type)) == NULL) {
- if (type == ARC_BUFC_METADATA) {
- buf->b_data = zio_buf_alloc(size);
- arc_space_consume(size, ARC_SPACE_DATA);
- } else {
- ASSERT(type == ARC_BUFC_DATA);
- buf->b_data = zio_data_buf_alloc(size);
- ARCSTAT_INCR(arcstat_data_size, size);
- atomic_add_64(&arc_size, size);
- }
- ARCSTAT_BUMP(arcstat_recycle_miss);
- }
- ASSERT(buf->b_data != NULL);
-out:
+
/*
* Update the state size. Note that ghost states have a
* "ghost size" and so don't need to be updated.
*/
- if (!GHOST_STATE(buf->b_hdr->b_state)) {
- arc_buf_hdr_t *hdr = buf->b_hdr;
+ if (!GHOST_STATE(state)) {
- atomic_add_64(&hdr->b_state->arcs_size, size);
- if (list_link_active(&hdr->b_arc_node)) {
- ASSERT(refcount_is_zero(&hdr->b_refcnt));
- atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
+ (void) refcount_add_many(&state->arcs_size, size, tag);
+
+ /*
+ * If this is reached via arc_read, the link is
+ * protected by the hash lock. If reached via
+ * arc_buf_alloc, the header should not be accessed by
+ * any other thread. And, if reached via arc_read_done,
+ * the hash lock will protect it if it's found in the
+ * hash table; otherwise no other thread should be
+ * trying to [add|remove]_reference it.
+ */
+ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
+ ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+ (void) refcount_add_many(&state->arcs_esize[type],
+ size, tag);
}
+
/*
* If we are growing the cache, and we are adding anonymous
* data, and we have outgrown arc_p, update arc_p
*/
- if (arc_size < arc_c && hdr->b_state == arc_anon &&
- arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
+ if (arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon &&
+ (refcount_count(&arc_anon->arcs_size) +
+ refcount_count(&arc_mru->arcs_size) > arc_p))
arc_p = MIN(arc_c, arc_p + size);
}
ARCSTAT_BUMP(arcstat_allocated);
+ return (datap);
}
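The overflow wait at the top of arc_get_data_buf() follows a "check, lock, re-check, then wait exactly once" shape. Below is a minimal user-space sketch of that shape, using pthreads as a stand-in for the kernel mutex/condvar primitives; the flag and variable names are hypothetical, and nothing in this toy ever sets the flag or wakes the waiters.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t reclaim_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t reclaim_thread_cv = PTHREAD_COND_INITIALIZER;
static pthread_cond_t reclaim_waiters_cv = PTHREAD_COND_INITIALIZER;
static bool overflowing = false;    /* flipped by an evictor in a real system */

/*
 * Check without the lock, re-check under the lock, then wait once (no
 * while loop), since the evictor may be unable to make progress and the
 * allocation must eventually proceed anyway.
 */
static void
wait_for_eviction_if_overflowing(void)
{
    if (!overflowing)
        return;
    pthread_mutex_lock(&reclaim_lock);
    if (overflowing) {
        pthread_cond_signal(&reclaim_thread_cv);    /* kick the evictor */
        pthread_cond_wait(&reclaim_waiters_cv, &reclaim_lock);
    }
    pthread_mutex_unlock(&reclaim_lock);
}

int
main(void)
{
    wait_for_eviction_if_overflowing();     /* returns immediately here */
    printf("allocation may proceed\n");
    return (0);
}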
/*
+ * Free the arc data buffer.
+ */
+static void
+arc_free_data_buf(arc_buf_hdr_t *hdr, void *data, uint64_t size, void *tag)
+{
+ arc_state_t *state = hdr->b_l1hdr.b_state;
+ arc_buf_contents_t type = arc_buf_type(hdr);
+
+ /* protected by hash lock, if in the hash table */
+ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
+ ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+ ASSERT(state != arc_anon && state != arc_l2c_only);
+
+ (void) refcount_remove_many(&state->arcs_esize[type],
+ size, tag);
+ }
+ (void) refcount_remove_many(&state->arcs_size, size, tag);
+
+ VERIFY3U(hdr->b_type, ==, type);
+ if (type == ARC_BUFC_METADATA) {
+ zio_buf_free(data, size);
+ arc_space_return(size, ARC_SPACE_META);
+ } else {
+ ASSERT(type == ARC_BUFC_DATA);
+ zio_data_buf_free(data, size);
+ arc_space_return(size, ARC_SPACE_DATA);
+ }
+}
+
+/*
* This routine is called whenever a buffer is accessed.
* NOTE: the hash lock is dropped in this function.
*/
static void
-arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
+arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
{
clock_t now;
ASSERT(MUTEX_HELD(hash_lock));
+ ASSERT(HDR_HAS_L1HDR(hdr));
- if (buf->b_state == arc_anon) {
+ if (hdr->b_l1hdr.b_state == arc_anon) {
/*
* This buffer is not in the cache, and does not
* appear in our "ghost" list. Add the new buffer
@@ -2795,12 +4419,12 @@
* to the MRU state.
*/
- ASSERT(buf->b_arc_access == 0);
- buf->b_arc_access = ddi_get_lbolt();
- DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
- arc_change_state(arc_mru, buf, hash_lock);
+ ASSERT0(hdr->b_l1hdr.b_arc_access);
+ hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
+ DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
+ arc_change_state(arc_mru, hdr, hash_lock);
- } else if (buf->b_state == arc_mru) {
+ } else if (hdr->b_l1hdr.b_state == arc_mru) {
now = ddi_get_lbolt();
/*
@@ -2811,14 +4435,16 @@
* - move the buffer to the head of the list if this is
* another prefetch (to make it less likely to be evicted).
*/
- if ((buf->b_flags & ARC_PREFETCH) != 0) {
- if (refcount_count(&buf->b_refcnt) == 0) {
- ASSERT(list_link_active(&buf->b_arc_node));
+ if (HDR_PREFETCH(hdr)) {
+ if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
+ /* link protected by hash lock */
+ ASSERT(multilist_link_active(
+ &hdr->b_l1hdr.b_arc_node));
} else {
- buf->b_flags &= ~ARC_PREFETCH;
+ arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
ARCSTAT_BUMP(arcstat_mru_hits);
}
- buf->b_arc_access = now;
+ hdr->b_l1hdr.b_arc_access = now;
return;
}
@@ -2827,18 +4453,18 @@
* but it is still in the cache. Move it to the MFU
* state.
*/
- if (now > buf->b_arc_access + ARC_MINTIME) {
+ if (now > hdr->b_l1hdr.b_arc_access + ARC_MINTIME) {
/*
* More than 125ms have passed since we
* instantiated this buffer. Move it to the
* most frequently used state.
*/
- buf->b_arc_access = now;
- DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
- arc_change_state(arc_mfu, buf, hash_lock);
+ hdr->b_l1hdr.b_arc_access = now;
+ DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
+ arc_change_state(arc_mfu, hdr, hash_lock);
}
ARCSTAT_BUMP(arcstat_mru_hits);
- } else if (buf->b_state == arc_mru_ghost) {
+ } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) {
arc_state_t *new_state;
/*
* This buffer has been "accessed" recently, but
@@ -2846,21 +4472,21 @@
* MFU state.
*/
- if (buf->b_flags & ARC_PREFETCH) {
+ if (HDR_PREFETCH(hdr)) {
new_state = arc_mru;
- if (refcount_count(&buf->b_refcnt) > 0)
- buf->b_flags &= ~ARC_PREFETCH;
- DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
+ if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0)
+ arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
+ DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
} else {
new_state = arc_mfu;
- DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
+ DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
}
- buf->b_arc_access = ddi_get_lbolt();
- arc_change_state(new_state, buf, hash_lock);
+ hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
+ arc_change_state(new_state, hdr, hash_lock);
ARCSTAT_BUMP(arcstat_mru_ghost_hits);
- } else if (buf->b_state == arc_mfu) {
+ } else if (hdr->b_l1hdr.b_state == arc_mfu) {
/*
* This buffer has been accessed more than once and is
* still in the cache. Keep it in the MFU state.
@@ -2870,13 +4496,14 @@
* If it was a prefetch, we will explicitly move it to
* the head of the list now.
*/
- if ((buf->b_flags & ARC_PREFETCH) != 0) {
- ASSERT(refcount_count(&buf->b_refcnt) == 0);
- ASSERT(list_link_active(&buf->b_arc_node));
+ if ((HDR_PREFETCH(hdr)) != 0) {
+ ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+ /* link protected by hash_lock */
+ ASSERT(multilist_link_active(&hdr->b_l1hdr.b_arc_node));
}
ARCSTAT_BUMP(arcstat_mfu_hits);
- buf->b_arc_access = ddi_get_lbolt();
- } else if (buf->b_state == arc_mfu_ghost) {
+ hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
+ } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) {
arc_state_t *new_state = arc_mfu;
/*
* This buffer has been accessed more than once but has
@@ -2884,28 +4511,28 @@
* MFU state.
*/
- if (buf->b_flags & ARC_PREFETCH) {
+ if (HDR_PREFETCH(hdr)) {
/*
* This is a prefetch access...
* move this block back to the MRU state.
*/
- ASSERT0(refcount_count(&buf->b_refcnt));
+ ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
new_state = arc_mru;
}
- buf->b_arc_access = ddi_get_lbolt();
- DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
- arc_change_state(new_state, buf, hash_lock);
+ hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
+ DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
+ arc_change_state(new_state, hdr, hash_lock);
ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
- } else if (buf->b_state == arc_l2c_only) {
+ } else if (hdr->b_l1hdr.b_state == arc_l2c_only) {
/*
* This buffer is on the 2nd Level ARC.
*/
- buf->b_arc_access = ddi_get_lbolt();
- DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
- arc_change_state(arc_mfu, buf, hash_lock);
+ hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
+ DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
+ arc_change_state(arc_mfu, hdr, hash_lock);
} else {
ASSERT(!"invalid arc state");
}
@@ -2917,8 +4544,8 @@
arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
{
if (zio == NULL || zio->io_error == 0)
- bcopy(buf->b_data, arg, buf->b_hdr->b_size);
- VERIFY(arc_buf_remove_ref(buf, arg));
+ bcopy(buf->b_data, arg, HDR_GET_LSIZE(buf->b_hdr));
+ arc_buf_destroy(buf, arg);
}
/* a generic arc_done_func_t */
@@ -2927,7 +4554,7 @@
{
arc_buf_t **bufp = arg;
if (zio && zio->io_error) {
- VERIFY(arc_buf_remove_ref(buf, arg));
+ arc_buf_destroy(buf, arg);
*bufp = NULL;
} else {
*bufp = buf;
@@ -2936,18 +4563,30 @@
}
static void
+arc_hdr_verify(arc_buf_hdr_t *hdr, blkptr_t *bp)
+{
+ if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) {
+ ASSERT3U(HDR_GET_PSIZE(hdr), ==, 0);
+ ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
+ } else {
+ if (HDR_COMPRESSION_ENABLED(hdr)) {
+ ASSERT3U(HDR_GET_COMPRESS(hdr), ==,
+ BP_GET_COMPRESS(bp));
+ }
+ ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp));
+ ASSERT3U(HDR_GET_PSIZE(hdr), ==, BP_GET_PSIZE(bp));
+ }
+}
+
+static void
arc_read_done(zio_t *zio)
{
- arc_buf_hdr_t *hdr, *found;
- arc_buf_t *buf;
- arc_buf_t *abuf; /* buffer we're assigning to callback */
- kmutex_t *hash_lock;
+ arc_buf_hdr_t *hdr = zio->io_private;
+ arc_buf_t *abuf = NULL; /* buffer we're assigning to callback */
+ kmutex_t *hash_lock = NULL;
arc_callback_t *callback_list, *acb;
- int freeable = FALSE;
+ int freeable = B_FALSE;
- buf = zio->io_private;
- hdr = buf->b_hdr;
-
/*
* The hdr was inserted into hash-table and removed from lists
* prior to starting I/O. We should find this header, since
@@ -2956,35 +4595,45 @@
* reason for it not to be found is if we were freed during the
* read.
*/
- found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth,
- &hash_lock);
+ if (HDR_IN_HASH_TABLE(hdr)) {
+ ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp));
+ ASSERT3U(hdr->b_dva.dva_word[0], ==,
+ BP_IDENTITY(zio->io_bp)->dva_word[0]);
+ ASSERT3U(hdr->b_dva.dva_word[1], ==,
+ BP_IDENTITY(zio->io_bp)->dva_word[1]);
- ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
- (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
- (found == hdr && HDR_L2_READING(hdr)));
+ arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp,
+ &hash_lock);
- hdr->b_flags &= ~ARC_L2_EVICTED;
- if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
- hdr->b_flags &= ~ARC_L2CACHE;
+ ASSERT((found == hdr &&
+ DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
+ (found == hdr && HDR_L2_READING(hdr)));
+ ASSERT3P(hash_lock, !=, NULL);
+ }
- /* byteswap if necessary */
- callback_list = hdr->b_acb;
- ASSERT(callback_list != NULL);
- if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
- dmu_object_byteswap_t bswap =
- DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
- arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
- byteswap_uint64_array :
- dmu_ot_byteswap[bswap].ob_func;
- func(buf->b_data, hdr->b_size);
+ if (zio->io_error == 0) {
+ /* byteswap if necessary */
+ if (BP_SHOULD_BYTESWAP(zio->io_bp)) {
+ if (BP_GET_LEVEL(zio->io_bp) > 0) {
+ hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64;
+ } else {
+ hdr->b_l1hdr.b_byteswap =
+ DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
+ }
+ } else {
+ hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
+ }
}
- arc_cksum_compute(buf, B_FALSE);
-#ifdef illumos
- arc_buf_watch(buf);
-#endif /* illumos */
+ arc_hdr_clear_flags(hdr, ARC_FLAG_L2_EVICTED);
+ if (l2arc_noprefetch && HDR_PREFETCH(hdr))
+ arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE);
- if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) {
+ callback_list = hdr->b_l1hdr.b_acb;
+ ASSERT3P(callback_list, !=, NULL);
+
+ if (hash_lock && zio->io_error == 0 &&
+ hdr->b_l1hdr.b_state == arc_anon) {
/*
* Only call arc_access on anonymous buffers. This is because
* if we've issued an I/O for an evicted buffer, we've already
@@ -2995,35 +4644,55 @@
}
/* create copies of the data buffer for the callers */
- abuf = buf;
for (acb = callback_list; acb; acb = acb->acb_next) {
- if (acb->acb_done) {
+ if (acb->acb_done != NULL) {
+ /*
+ * If we're here, then this must be a demand read
+ * since prefetch requests don't have callbacks.
+ * If a read request has a callback (i.e. acb_done is
+ * not NULL), then we decompress the data for the
+ * first request and clone the rest. This avoids
+ * having to waste cpu resources decompressing data
+ * that nobody is explicitly waiting to read.
+ */
if (abuf == NULL) {
- ARCSTAT_BUMP(arcstat_duplicate_reads);
- abuf = arc_buf_clone(buf);
+ acb->acb_buf = arc_buf_alloc_impl(hdr,
+ acb->acb_private);
+ if (zio->io_error == 0) {
+ zio->io_error =
+ arc_decompress(acb->acb_buf);
+ }
+ abuf = acb->acb_buf;
+ } else {
+ add_reference(hdr, acb->acb_private);
+ acb->acb_buf = arc_buf_clone(abuf);
}
- acb->acb_buf = abuf;
- abuf = NULL;
}
}
- hdr->b_acb = NULL;
- hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
- ASSERT(!HDR_BUF_AVAILABLE(hdr));
- if (abuf == buf) {
- ASSERT(buf->b_efunc == NULL);
- ASSERT(hdr->b_datacnt == 1);
- hdr->b_flags |= ARC_BUF_AVAILABLE;
+ hdr->b_l1hdr.b_acb = NULL;
+ arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
+ if (abuf == NULL) {
+ /*
+ * This buffer didn't have a callback so it must
+ * be a prefetch.
+ */
+ ASSERT(HDR_PREFETCH(hdr));
+ ASSERT0(hdr->b_l1hdr.b_bufcnt);
+ ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
}
- ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
+ ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) ||
+ callback_list != NULL);
- if (zio->io_error != 0) {
- hdr->b_flags |= ARC_IO_ERROR;
- if (hdr->b_state != arc_anon)
+ if (zio->io_error == 0) {
+ arc_hdr_verify(hdr, zio->io_bp);
+ } else {
+ arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
+ if (hdr->b_l1hdr.b_state != arc_anon)
arc_change_state(arc_anon, hdr, hash_lock);
if (HDR_IN_HASH_TABLE(hdr))
buf_hash_remove(hdr);
- freeable = refcount_is_zero(&hdr->b_refcnt);
+ freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
}
/*
@@ -3031,9 +4700,9 @@
* that the hdr (and hence the cv) might be freed before we get to
* the cv_broadcast().
*/
- cv_broadcast(&hdr->b_cv);
+ cv_broadcast(&hdr->b_l1hdr.b_cv);
- if (hash_lock) {
+ if (hash_lock != NULL) {
mutex_exit(hash_lock);
} else {
/*
@@ -3042,8 +4711,8 @@
* moved to the anonymous state (so that it won't show up
* in the cache).
*/
- ASSERT3P(hdr->b_state, ==, arc_anon);
- freeable = refcount_is_zero(&hdr->b_refcnt);
+ ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
+ freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
}
/* execute each callback and free its structure */
@@ -3065,7 +4734,7 @@
}
/*
- * "Read" the block block at the specified DVA (in bp) via the
+ * "Read" the block at the specified DVA (in bp) via the
* cache. If the block is found in the cache, invoke the provided
* callback immediately and return. Note that the `zio' parameter
* in the callback will be NULL in this case, since no IO was
@@ -3084,33 +4753,72 @@
*/
int
arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
- void *private, int priority, int zio_flags, uint32_t *arc_flags,
- const zbookmark_t *zb)
+ void *private, zio_priority_t priority, int zio_flags,
+ arc_flags_t *arc_flags, const zbookmark_phys_t *zb)
{
- arc_buf_hdr_t *hdr;
- arc_buf_t *buf = NULL;
- kmutex_t *hash_lock;
+ arc_buf_hdr_t *hdr = NULL;
+ kmutex_t *hash_lock = NULL;
zio_t *rzio;
uint64_t guid = spa_load_guid(spa);
+ ASSERT(!BP_IS_EMBEDDED(bp) ||
+ BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
+
top:
- hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
- &hash_lock);
- if (hdr && hdr->b_datacnt > 0) {
+ if (!BP_IS_EMBEDDED(bp)) {
+ /*
+ * Embedded BP's have no DVA and require no I/O to "read".
+ * Create an anonymous arc buf to back it.
+ */
+ hdr = buf_hash_find(guid, bp, &hash_lock);
+ }
- *arc_flags |= ARC_CACHED;
+ if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_pdata != NULL) {
+ arc_buf_t *buf = NULL;
+ *arc_flags |= ARC_FLAG_CACHED;
if (HDR_IO_IN_PROGRESS(hdr)) {
- if (*arc_flags & ARC_WAIT) {
- cv_wait(&hdr->b_cv, hash_lock);
+ if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) &&
+ priority == ZIO_PRIORITY_SYNC_READ) {
+ /*
+ * This sync read must wait for an
+ * in-progress async read (e.g. a predictive
+ * prefetch). Async reads are queued
+ * separately at the vdev_queue layer, so
+ * this is a form of priority inversion.
+ * Ideally, we would "inherit" the demand
+ * i/o's priority by moving the i/o from
+ * the async queue to the synchronous queue,
+ * but there is currently no mechanism to do
+ * so. Track this so that we can evaluate
+ * the magnitude of this potential performance
+ * problem.
+ *
+ * Note that if the prefetch i/o is already
+ * active (has been issued to the device),
+ * the prefetch improved performance, because
+ * we issued it sooner than we would have
+ * without the prefetch.
+ */
+ DTRACE_PROBE1(arc__sync__wait__for__async,
+ arc_buf_hdr_t *, hdr);
+ ARCSTAT_BUMP(arcstat_sync_wait_for_async);
+ }
+ if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
+ arc_hdr_clear_flags(hdr,
+ ARC_FLAG_PREDICTIVE_PREFETCH);
+ }
+
+ if (*arc_flags & ARC_FLAG_WAIT) {
+ cv_wait(&hdr->b_l1hdr.b_cv, hash_lock);
mutex_exit(hash_lock);
goto top;
}
- ASSERT(*arc_flags & ARC_NOWAIT);
+ ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
if (done) {
- arc_callback_t *acb = NULL;
+ arc_callback_t *acb = NULL;
acb = kmem_zalloc(sizeof (arc_callback_t),
KM_SLEEP);
@@ -3120,10 +4828,9 @@
acb->acb_zio_dummy = zio_null(pio,
spa, NULL, NULL, NULL, zio_flags);
- ASSERT(acb->acb_done != NULL);
- acb->acb_next = hdr->b_acb;
- hdr->b_acb = acb;
- add_reference(hdr, hash_lock, private);
+ ASSERT3P(acb->acb_done, !=, NULL);
+ acb->acb_next = hdr->b_l1hdr.b_acb;
+ hdr->b_l1hdr.b_acb = acb;
mutex_exit(hash_lock);
return (0);
}
@@ -3131,122 +4838,151 @@
return (0);
}
- ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
+ ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
+ hdr->b_l1hdr.b_state == arc_mfu);
if (done) {
- add_reference(hdr, hash_lock, private);
+ if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
+ /*
+ * This is a demand read which does not have to
+ * wait for i/o because we did a predictive
+ * prefetch i/o for it, which has completed.
+ */
+ DTRACE_PROBE1(
+ arc__demand__hit__predictive__prefetch,
+ arc_buf_hdr_t *, hdr);
+ ARCSTAT_BUMP(
+ arcstat_demand_hit_predictive_prefetch);
+ arc_hdr_clear_flags(hdr,
+ ARC_FLAG_PREDICTIVE_PREFETCH);
+ }
+ ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp));
+
/*
* If this block is already in use, create a new
* copy of the data so that we will be guaranteed
* that arc_release() will always succeed.
*/
- buf = hdr->b_buf;
- ASSERT(buf);
- ASSERT(buf->b_data);
- if (HDR_BUF_AVAILABLE(hdr)) {
- ASSERT(buf->b_efunc == NULL);
- hdr->b_flags &= ~ARC_BUF_AVAILABLE;
+ buf = hdr->b_l1hdr.b_buf;
+ if (buf == NULL) {
+ ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
+ ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
+ buf = arc_buf_alloc_impl(hdr, private);
+ VERIFY0(arc_decompress(buf));
} else {
+ add_reference(hdr, private);
buf = arc_buf_clone(buf);
}
+ ASSERT3P(buf->b_data, !=, NULL);
- } else if (*arc_flags & ARC_PREFETCH &&
- refcount_count(&hdr->b_refcnt) == 0) {
- hdr->b_flags |= ARC_PREFETCH;
+ } else if (*arc_flags & ARC_FLAG_PREFETCH &&
+ refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
+ arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
}
DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
arc_access(hdr, hash_lock);
- if (*arc_flags & ARC_L2CACHE)
- hdr->b_flags |= ARC_L2CACHE;
- if (*arc_flags & ARC_L2COMPRESS)
- hdr->b_flags |= ARC_L2COMPRESS;
+ if (*arc_flags & ARC_FLAG_L2CACHE)
+ arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
mutex_exit(hash_lock);
ARCSTAT_BUMP(arcstat_hits);
- ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
- demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
+ ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
+ demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
data, metadata, hits);
if (done)
done(NULL, buf, private);
} else {
- uint64_t size = BP_GET_LSIZE(bp);
- arc_callback_t *acb;
+ uint64_t lsize = BP_GET_LSIZE(bp);
+ uint64_t psize = BP_GET_PSIZE(bp);
+ arc_callback_t *acb;
vdev_t *vd = NULL;
uint64_t addr = 0;
boolean_t devw = B_FALSE;
+ uint64_t size;
if (hdr == NULL) {
/* this block is not in the cache */
- arc_buf_hdr_t *exists;
+ arc_buf_hdr_t *exists = NULL;
arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
- buf = arc_buf_alloc(spa, size, private, type);
- hdr = buf->b_hdr;
- hdr->b_dva = *BP_IDENTITY(bp);
- hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
- hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
- exists = buf_hash_insert(hdr, &hash_lock);
- if (exists) {
+ hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize,
+ BP_GET_COMPRESS(bp), type);
+
+ if (!BP_IS_EMBEDDED(bp)) {
+ hdr->b_dva = *BP_IDENTITY(bp);
+ hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
+ exists = buf_hash_insert(hdr, &hash_lock);
+ }
+ if (exists != NULL) {
/* somebody beat us to the hash insert */
mutex_exit(hash_lock);
buf_discard_identity(hdr);
- (void) arc_buf_remove_ref(buf, private);
+ arc_hdr_destroy(hdr);
goto top; /* restart the IO request */
}
- /* if this is a prefetch, we don't have a reference */
- if (*arc_flags & ARC_PREFETCH) {
- (void) remove_reference(hdr, hash_lock,
- private);
- hdr->b_flags |= ARC_PREFETCH;
+ } else {
+ /*
+ * This block is in the ghost cache. If it was L2-only
+ * (and thus didn't have an L1 hdr), we realloc the
+ * header to add an L1 hdr.
+ */
+ if (!HDR_HAS_L1HDR(hdr)) {
+ hdr = arc_hdr_realloc(hdr, hdr_l2only_cache,
+ hdr_full_cache);
}
- if (*arc_flags & ARC_L2CACHE)
- hdr->b_flags |= ARC_L2CACHE;
- if (*arc_flags & ARC_L2COMPRESS)
- hdr->b_flags |= ARC_L2COMPRESS;
- if (BP_GET_LEVEL(bp) > 0)
- hdr->b_flags |= ARC_INDIRECT;
- } else {
- /* this block is in the ghost cache */
- ASSERT(GHOST_STATE(hdr->b_state));
+ ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+ ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state));
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
- ASSERT0(refcount_count(&hdr->b_refcnt));
- ASSERT(hdr->b_buf == NULL);
+ ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+ ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
- /* if this is a prefetch, we don't have a reference */
- if (*arc_flags & ARC_PREFETCH)
- hdr->b_flags |= ARC_PREFETCH;
- else
- add_reference(hdr, hash_lock, private);
- if (*arc_flags & ARC_L2CACHE)
- hdr->b_flags |= ARC_L2CACHE;
- if (*arc_flags & ARC_L2COMPRESS)
- hdr->b_flags |= ARC_L2COMPRESS;
- buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
- buf->b_hdr = hdr;
- buf->b_data = NULL;
- buf->b_efunc = NULL;
- buf->b_private = NULL;
- buf->b_next = NULL;
- hdr->b_buf = buf;
- ASSERT(hdr->b_datacnt == 0);
- hdr->b_datacnt = 1;
- arc_get_data_buf(buf);
+ /*
+ * This is a delicate dance that we play here.
+ * This hdr is in the ghost list so we access it
+ * to move it out of the ghost list before we
+ * initiate the read. If it's a prefetch then
+ * it won't have a callback so we'll remove the
+ * reference that arc_buf_alloc_impl() created. We
+ * do this after we've called arc_access() to
+ * avoid hitting an assert in remove_reference().
+ */
arc_access(hdr, hash_lock);
+ arc_hdr_alloc_pdata(hdr);
}
+ ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
+ size = arc_hdr_size(hdr);
- ASSERT(!GHOST_STATE(hdr->b_state));
+ /*
+ * If compression is enabled on the hdr, then we will do
+ * RAW I/O and will store the compressed data in the hdr's
+ * data block. Otherwise, the hdr's data block will contain
+ * the uncompressed data.
+ */
+ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) {
+ zio_flags |= ZIO_FLAG_RAW;
+ }
+ if (*arc_flags & ARC_FLAG_PREFETCH)
+ arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
+ if (*arc_flags & ARC_FLAG_L2CACHE)
+ arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
+ if (BP_GET_LEVEL(bp) > 0)
+ arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT);
+ if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH)
+ arc_hdr_set_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH);
+ ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state));
+
acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
acb->acb_done = done;
acb->acb_private = private;
- ASSERT(hdr->b_acb == NULL);
- hdr->b_acb = acb;
- hdr->b_flags |= ARC_IO_IN_PROGRESS;
+ ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
+ hdr->b_l1hdr.b_acb = acb;
+ arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
- if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL &&
- (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) {
- devw = hdr->b_l2hdr->b_dev->l2ad_writing;
- addr = hdr->b_l2hdr->b_daddr;
+ if (HDR_HAS_L2HDR(hdr) &&
+ (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) {
+ devw = hdr->b_l2hdr.b_dev->l2ad_writing;
+ addr = hdr->b_l2hdr.b_daddr;
/*
* Lock out device removal.
*/
@@ -3255,18 +4991,25 @@
vd = NULL;
}
- mutex_exit(hash_lock);
+ if (priority == ZIO_PRIORITY_ASYNC_READ)
+ arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
+ else
+ arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
+ if (hash_lock != NULL)
+ mutex_exit(hash_lock);
+
/*
* At this point, we have a level 1 cache miss. Try again in
* L2ARC if possible.
*/
- ASSERT3U(hdr->b_size, ==, size);
+ ASSERT3U(HDR_GET_LSIZE(hdr), ==, lsize);
+
DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
- uint64_t, size, zbookmark_t *, zb);
+ uint64_t, lsize, zbookmark_phys_t *, zb);
ARCSTAT_BUMP(arcstat_misses);
- ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
- demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
+ ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
+ demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
data, metadata, misses);
#ifdef _KERNEL
curthread->td_ru.ru_inblock++;
@@ -3282,10 +5025,11 @@
* also have invalidated the vdev.
* 5. This isn't prefetch and l2arc_noprefetch is set.
*/
- if (hdr->b_l2hdr != NULL &&
+ if (HDR_HAS_L2HDR(hdr) &&
!HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
!(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
l2arc_read_callback_t *cb;
+ void* b_data;
DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
ARCSTAT_BUMP(arcstat_l2_hits);
@@ -3292,15 +5036,20 @@
cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
KM_SLEEP);
- cb->l2rcb_buf = buf;
- cb->l2rcb_spa = spa;
+ cb->l2rcb_hdr = hdr;
cb->l2rcb_bp = *bp;
cb->l2rcb_zb = *zb;
cb->l2rcb_flags = zio_flags;
- cb->l2rcb_compress = hdr->b_l2hdr->b_compress;
+ uint64_t asize = vdev_psize_to_asize(vd, size);
+ if (asize != size) {
+ b_data = zio_data_buf_alloc(asize);
+ cb->l2rcb_data = b_data;
+ } else {
+ b_data = hdr->b_l1hdr.b_pdata;
+ }
ASSERT(addr >= VDEV_LABEL_START_SIZE &&
- addr + size < vd->vdev_psize -
+ addr + asize < vd->vdev_psize -
VDEV_LABEL_END_SIZE);
/*
@@ -3309,35 +5058,26 @@
* Issue a null zio if the underlying buffer
* was squashed to zero size by compression.
*/
- if (hdr->b_l2hdr->b_compress ==
- ZIO_COMPRESS_EMPTY) {
- rzio = zio_null(pio, spa, vd,
- l2arc_read_done, cb,
- zio_flags | ZIO_FLAG_DONT_CACHE |
- ZIO_FLAG_CANFAIL |
- ZIO_FLAG_DONT_PROPAGATE |
- ZIO_FLAG_DONT_RETRY);
- } else {
- rzio = zio_read_phys(pio, vd, addr,
- hdr->b_l2hdr->b_asize,
- buf->b_data, ZIO_CHECKSUM_OFF,
- l2arc_read_done, cb, priority,
- zio_flags | ZIO_FLAG_DONT_CACHE |
- ZIO_FLAG_CANFAIL |
- ZIO_FLAG_DONT_PROPAGATE |
- ZIO_FLAG_DONT_RETRY, B_FALSE);
- }
+ ASSERT3U(HDR_GET_COMPRESS(hdr), !=,
+ ZIO_COMPRESS_EMPTY);
+ rzio = zio_read_phys(pio, vd, addr,
+ asize, b_data,
+ ZIO_CHECKSUM_OFF,
+ l2arc_read_done, cb, priority,
+ zio_flags | ZIO_FLAG_DONT_CACHE |
+ ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_DONT_PROPAGATE |
+ ZIO_FLAG_DONT_RETRY, B_FALSE);
DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
zio_t *, rzio);
- ARCSTAT_INCR(arcstat_l2_read_bytes,
- hdr->b_l2hdr->b_asize);
+ ARCSTAT_INCR(arcstat_l2_read_bytes, size);
- if (*arc_flags & ARC_NOWAIT) {
+ if (*arc_flags & ARC_FLAG_NOWAIT) {
zio_nowait(rzio);
return (0);
}
- ASSERT(*arc_flags & ARC_WAIT);
+ ASSERT(*arc_flags & ARC_FLAG_WAIT);
if (zio_wait(rzio) == 0)
return (0);
@@ -3360,31 +5100,18 @@
}
}
- rzio = zio_read(pio, spa, bp, buf->b_data, size,
- arc_read_done, buf, priority, zio_flags, zb);
+ rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pdata, size,
+ arc_read_done, hdr, priority, zio_flags, zb);
- if (*arc_flags & ARC_WAIT)
+ if (*arc_flags & ARC_FLAG_WAIT)
return (zio_wait(rzio));
- ASSERT(*arc_flags & ARC_NOWAIT);
+ ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
zio_nowait(rzio);
}
return (0);
}
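Based only on the signature shown at the top of this hunk, a hedged sketch of the two common call shapes; the callback name, and the spa, bp, buf and zb variables, are assumptions about the caller and are not part of this patch.

    /* Blocking demand read: wait for the data before returning. */
    arc_flags_t aflags = ARC_FLAG_WAIT;
    int err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf,
        ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);

    /* Prefetch-style read: no callback, don't wait, just warm the cache. */
    arc_flags_t pflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
    (void) arc_read(NULL, spa, bp, NULL, NULL,
        ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &pflags, &zb);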
-void
-arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
-{
- ASSERT(buf->b_hdr != NULL);
- ASSERT(buf->b_hdr->b_state != arc_anon);
- ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
- ASSERT(buf->b_efunc == NULL);
- ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
-
- buf->b_efunc = func;
- buf->b_private = private;
-}
-
/*
* Notify the arc that a block was freed, and thus will never be used again.
*/
@@ -3395,18 +5122,39 @@
kmutex_t *hash_lock;
uint64_t guid = spa_load_guid(spa);
- hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
- &hash_lock);
+ ASSERT(!BP_IS_EMBEDDED(bp));
+
+ hdr = buf_hash_find(guid, bp, &hash_lock);
if (hdr == NULL)
return;
- if (HDR_BUF_AVAILABLE(hdr)) {
- arc_buf_t *buf = hdr->b_buf;
- add_reference(hdr, hash_lock, FTAG);
- hdr->b_flags &= ~ARC_BUF_AVAILABLE;
+
+ /*
+ * We might be trying to free a block that is still doing I/O
+ * (i.e. prefetch) or has a reference (i.e. a dedup-ed,
+ * dmu_sync-ed block). If this block is being prefetched, then it
+ * would still have the ARC_FLAG_IO_IN_PROGRESS flag set on the hdr
+ * until the I/O completes. A block may also have a reference if it is
+ * part of a dedup-ed, dmu_synced write. The dmu_sync() function would
+ * have written the new block to its final resting place on disk but
+ * without the dedup flag set. This would have left the hdr in the MRU
+ * state and discoverable. When the txg finally syncs it detects that
+ * the block was overridden in open context and issues an override I/O.
+ * Since this is a dedup block, the override I/O will determine if the
+ * block is already in the DDT. If so, then it will replace the io_bp
+ * with the bp from the DDT and allow the I/O to finish. When the I/O
+ * reaches the done callback, dbuf_write_override_done, it will
+ * check to see if the io_bp and io_bp_override are identical.
+ * If they are not, then it indicates that the bp was replaced with
+ * the bp in the DDT and the override bp is freed. This allows
+ * us to arrive here with a reference on a block that is being
+ * freed. So if we have an I/O in progress, or a reference to
+ * this hdr, then we don't destroy the hdr.
+ */
+ if (!HDR_HAS_L1HDR(hdr) || (!HDR_IO_IN_PROGRESS(hdr) &&
+ refcount_is_zero(&hdr->b_l1hdr.b_refcnt))) {
+ arc_change_state(arc_anon, hdr, hash_lock);
+ arc_hdr_destroy(hdr);
mutex_exit(hash_lock);
-
- arc_release(buf, FTAG);
- (void) arc_buf_remove_ref(buf, FTAG);
} else {
mutex_exit(hash_lock);
}
@@ -3414,94 +5162,6 @@
}
/*
- * This is used by the DMU to let the ARC know that a buffer is
- * being evicted, so the ARC should clean up. If this arc buf
- * is not yet in the evicted state, it will be put there.
- */
-int
-arc_buf_evict(arc_buf_t *buf)
-{
- arc_buf_hdr_t *hdr;
- kmutex_t *hash_lock;
- arc_buf_t **bufp;
- list_t *list, *evicted_list;
- kmutex_t *lock, *evicted_lock;
-
- mutex_enter(&buf->b_evict_lock);
- hdr = buf->b_hdr;
- if (hdr == NULL) {
- /*
- * We are in arc_do_user_evicts().
- */
- ASSERT(buf->b_data == NULL);
- mutex_exit(&buf->b_evict_lock);
- return (0);
- } else if (buf->b_data == NULL) {
- arc_buf_t copy = *buf; /* structure assignment */
- /*
- * We are on the eviction list; process this buffer now
- * but let arc_do_user_evicts() do the reaping.
- */
- buf->b_efunc = NULL;
- mutex_exit(&buf->b_evict_lock);
- VERIFY(copy.b_efunc(&copy) == 0);
- return (1);
- }
- hash_lock = HDR_LOCK(hdr);
- mutex_enter(hash_lock);
- hdr = buf->b_hdr;
- ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
-
- ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
- ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
-
- /*
- * Pull this buffer off of the hdr
- */
- bufp = &hdr->b_buf;
- while (*bufp != buf)
- bufp = &(*bufp)->b_next;
- *bufp = buf->b_next;
-
- ASSERT(buf->b_data != NULL);
- arc_buf_destroy(buf, FALSE, FALSE);
-
- if (hdr->b_datacnt == 0) {
- arc_state_t *old_state = hdr->b_state;
- arc_state_t *evicted_state;
-
- ASSERT(hdr->b_buf == NULL);
- ASSERT(refcount_is_zero(&hdr->b_refcnt));
-
- evicted_state =
- (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
-
- get_buf_info(hdr, old_state, &list, &lock);
- get_buf_info(hdr, evicted_state, &evicted_list, &evicted_lock);
- mutex_enter(lock);
- mutex_enter(evicted_lock);
-
- arc_change_state(evicted_state, hdr, hash_lock);
- ASSERT(HDR_IN_HASH_TABLE(hdr));
- hdr->b_flags |= ARC_IN_HASH_TABLE;
- hdr->b_flags &= ~ARC_BUF_AVAILABLE;
-
- mutex_exit(evicted_lock);
- mutex_exit(lock);
- }
- mutex_exit(hash_lock);
- mutex_exit(&buf->b_evict_lock);
-
- VERIFY(buf->b_efunc(buf) == 0);
- buf->b_efunc = NULL;
- buf->b_private = NULL;
- buf->b_hdr = NULL;
- buf->b_next = NULL;
- kmem_cache_free(buf_cache, buf);
- return (1);
-}
-
-/*
* Release this buffer from the cache, making it an anonymous buffer. This
* must be done after a read and prior to modifying the buffer contents.
* If the buffer has more than one reference, we must make
@@ -3510,10 +5170,7 @@
void
arc_release(arc_buf_t *buf, void *tag)
{
- arc_buf_hdr_t *hdr;
- kmutex_t *hash_lock = NULL;
- l2arc_buf_hdr_t *l2hdr;
- uint64_t buf_size;
+ arc_buf_hdr_t *hdr = buf->b_hdr;
/*
* It would be nice to assert that if it's DMU metadata (level >
@@ -3522,117 +5179,193 @@
*/
mutex_enter(&buf->b_evict_lock);
- hdr = buf->b_hdr;
- /* this buffer is not on any list */
- ASSERT(refcount_count(&hdr->b_refcnt) > 0);
+ ASSERT(HDR_HAS_L1HDR(hdr));
- if (hdr->b_state == arc_anon) {
- /* this buffer is already released */
- ASSERT(buf->b_efunc == NULL);
- } else {
- hash_lock = HDR_LOCK(hdr);
- mutex_enter(hash_lock);
- hdr = buf->b_hdr;
- ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
+ /*
+ * We don't grab the hash lock prior to this check, because if
+ * the buffer's header is in the arc_anon state, it won't be
+ * linked into the hash table.
+ */
+ if (hdr->b_l1hdr.b_state == arc_anon) {
+ mutex_exit(&buf->b_evict_lock);
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ ASSERT(!HDR_IN_HASH_TABLE(hdr));
+ ASSERT(!HDR_HAS_L2HDR(hdr));
+ ASSERT(HDR_EMPTY(hdr));
+ ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
+ ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1);
+ ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
+
+ hdr->b_l1hdr.b_arc_access = 0;
+
+ /*
+ * If the buf is being overridden then it may already
+ * have a hdr that is not empty.
+ */
+ buf_discard_identity(hdr);
+ arc_buf_thaw(buf);
+
+ return;
}
- l2hdr = hdr->b_l2hdr;
- if (l2hdr) {
- mutex_enter(&l2arc_buflist_mtx);
- hdr->b_l2hdr = NULL;
+ kmutex_t *hash_lock = HDR_LOCK(hdr);
+ mutex_enter(hash_lock);
+
+ /*
+ * This assignment is only valid as long as the hash_lock is
+ * held; we must be careful not to reference state or the
+ * b_state field after dropping the lock.
+ */
+ arc_state_t *state = hdr->b_l1hdr.b_state;
+ ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
+ ASSERT3P(state, !=, arc_anon);
+
+ /* this buffer is not on any list */
+ ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) > 0);
+
+ if (HDR_HAS_L2HDR(hdr)) {
+ mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx);
+
+ /*
+ * We have to recheck this conditional again now that
+ * we're holding the l2ad_mtx to prevent a race with
+ * another thread which might be concurrently calling
+ * l2arc_evict(). In that case, l2arc_evict() might have
+ * destroyed the header's L2 portion as we were waiting
+ * to acquire the l2ad_mtx.
+ */
+ if (HDR_HAS_L2HDR(hdr)) {
+ l2arc_trim(hdr);
+ arc_hdr_l2hdr_destroy(hdr);
+ }
+
+ mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx);
}
- buf_size = hdr->b_size;
/*
* Do we have more than one buf?
*/
- if (hdr->b_datacnt > 1) {
+ if (hdr->b_l1hdr.b_bufcnt > 1) {
arc_buf_hdr_t *nhdr;
arc_buf_t **bufp;
- uint64_t blksz = hdr->b_size;
uint64_t spa = hdr->b_spa;
- arc_buf_contents_t type = hdr->b_type;
- uint32_t flags = hdr->b_flags;
+ uint64_t psize = HDR_GET_PSIZE(hdr);
+ uint64_t lsize = HDR_GET_LSIZE(hdr);
+ enum zio_compress compress = HDR_GET_COMPRESS(hdr);
+ arc_buf_contents_t type = arc_buf_type(hdr);
+ VERIFY3U(hdr->b_type, ==, type);
- ASSERT(hdr->b_buf != buf || buf->b_next != NULL);
+ ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL);
+ (void) remove_reference(hdr, hash_lock, tag);
+
+ if (arc_buf_is_shared(buf)) {
+ ASSERT(HDR_SHARED_DATA(hdr));
+ ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf);
+ ASSERT(ARC_BUF_LAST(buf));
+ }
+
/*
* Pull the data off of this hdr and attach it to
- * a new anonymous hdr.
+ * a new anonymous hdr. Also find the last buffer
+ * in the hdr's buffer list.
*/
- (void) remove_reference(hdr, hash_lock, tag);
- bufp = &hdr->b_buf;
- while (*bufp != buf)
- bufp = &(*bufp)->b_next;
- *bufp = buf->b_next;
- buf->b_next = NULL;
+ arc_buf_t *lastbuf = NULL;
+ bufp = &hdr->b_l1hdr.b_buf;
+ while (*bufp != NULL) {
+ if (*bufp == buf) {
+ *bufp = buf->b_next;
+ }
- ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
- atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
- if (refcount_is_zero(&hdr->b_refcnt)) {
- uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
- ASSERT3U(*size, >=, hdr->b_size);
- atomic_add_64(size, -hdr->b_size);
+ /*
+ * If we've removed a buffer in the middle of
+ * the list, then update lastbuf and advance
+ * bufp.
+ */
+ if (*bufp != NULL) {
+ lastbuf = *bufp;
+ bufp = &(*bufp)->b_next;
+ }
}
+ buf->b_next = NULL;
+ ASSERT3P(lastbuf, !=, buf);
+ ASSERT3P(lastbuf, !=, NULL);
/*
- * We're releasing a duplicate user data buffer, update
- * our statistics accordingly.
+ * If the current arc_buf_t and the hdr are sharing their data
+ * buffer, then we must stop sharing that block, transfer
+ * ownership and setup sharing with a new arc_buf_t at the end
+ * of the hdr's b_buf list.
*/
- if (hdr->b_type == ARC_BUFC_DATA) {
- ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
- ARCSTAT_INCR(arcstat_duplicate_buffers_size,
- -hdr->b_size);
+ if (arc_buf_is_shared(buf)) {
+ ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf);
+ ASSERT(ARC_BUF_LAST(lastbuf));
+ VERIFY(!arc_buf_is_shared(lastbuf));
+
+ /*
+ * First, sever the block sharing relationship between
+ * buf and the arc_buf_hdr_t. Then, setup a new
+ * block sharing relationship with the last buffer
+ * on the arc_buf_t list.
+ */
+ arc_unshare_buf(hdr, buf);
+ arc_share_buf(hdr, lastbuf);
+ VERIFY3P(lastbuf->b_data, !=, NULL);
+ } else if (HDR_SHARED_DATA(hdr)) {
+ ASSERT(arc_buf_is_shared(lastbuf));
}
- hdr->b_datacnt -= 1;
+ ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
+ ASSERT3P(state, !=, arc_l2c_only);
+
+ (void) refcount_remove_many(&state->arcs_size,
+ HDR_GET_LSIZE(hdr), buf);
+
+ if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
+ ASSERT3P(state, !=, arc_l2c_only);
+ (void) refcount_remove_many(&state->arcs_esize[type],
+ HDR_GET_LSIZE(hdr), buf);
+ }
+
+ hdr->b_l1hdr.b_bufcnt -= 1;
arc_cksum_verify(buf);
#ifdef illumos
arc_buf_unwatch(buf);
-#endif /* illumos */
+#endif
mutex_exit(hash_lock);
- nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
- nhdr->b_size = blksz;
- nhdr->b_spa = spa;
- nhdr->b_type = type;
- nhdr->b_buf = buf;
- nhdr->b_state = arc_anon;
- nhdr->b_arc_access = 0;
- nhdr->b_flags = flags & ARC_L2_WRITING;
- nhdr->b_l2hdr = NULL;
- nhdr->b_datacnt = 1;
- nhdr->b_freeze_cksum = NULL;
- (void) refcount_add(&nhdr->b_refcnt, tag);
+ /*
+ * Allocate a new hdr. The new hdr will contain a b_pdata
+ * buffer which will be freed in arc_write().
+ */
+ nhdr = arc_hdr_alloc(spa, psize, lsize, compress, type);
+ ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL);
+ ASSERT0(nhdr->b_l1hdr.b_bufcnt);
+ ASSERT0(refcount_count(&nhdr->b_l1hdr.b_refcnt));
+ VERIFY3U(nhdr->b_type, ==, type);
+ ASSERT(!HDR_SHARED_DATA(nhdr));
+
+ nhdr->b_l1hdr.b_buf = buf;
+ nhdr->b_l1hdr.b_bufcnt = 1;
+ (void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag);
buf->b_hdr = nhdr;
+
mutex_exit(&buf->b_evict_lock);
- atomic_add_64(&arc_anon->arcs_size, blksz);
+ (void) refcount_add_many(&arc_anon->arcs_size,
+ HDR_GET_LSIZE(nhdr), buf);
} else {
mutex_exit(&buf->b_evict_lock);
- ASSERT(refcount_count(&hdr->b_refcnt) == 1);
- ASSERT(!list_link_active(&hdr->b_arc_node));
+ ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1);
+ /* protected by hash lock, or hdr is on arc_anon */
+ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
- if (hdr->b_state != arc_anon)
- arc_change_state(arc_anon, hdr, hash_lock);
- hdr->b_arc_access = 0;
- if (hash_lock)
- mutex_exit(hash_lock);
+ arc_change_state(arc_anon, hdr, hash_lock);
+ hdr->b_l1hdr.b_arc_access = 0;
+ mutex_exit(hash_lock);
buf_discard_identity(hdr);
arc_buf_thaw(buf);
}
- buf->b_efunc = NULL;
- buf->b_private = NULL;
-
- if (l2hdr) {
- ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
- trim_map_free(l2hdr->b_dev->l2ad_vdev, l2hdr->b_daddr,
- hdr->b_size, 0);
- list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
- kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
- ARCSTAT_INCR(arcstat_l2_size, -buf_size);
- mutex_exit(&l2arc_buflist_mtx);
- }
}
int
@@ -3641,22 +5374,12 @@
int released;
mutex_enter(&buf->b_evict_lock);
- released = (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
+ released = (buf->b_data != NULL &&
+ buf->b_hdr->b_l1hdr.b_state == arc_anon);
mutex_exit(&buf->b_evict_lock);
return (released);
}
-int
-arc_has_callback(arc_buf_t *buf)
-{
- int callback;
-
- mutex_enter(&buf->b_evict_lock);
- callback = (buf->b_efunc != NULL);
- mutex_exit(&buf->b_evict_lock);
- return (callback);
-}
-
#ifdef ZFS_DEBUG
int
arc_referenced(arc_buf_t *buf)
@@ -3664,7 +5387,7 @@
int referenced;
mutex_enter(&buf->b_evict_lock);
- referenced = (refcount_count(&buf->b_hdr->b_refcnt));
+ referenced = (refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt));
mutex_exit(&buf->b_evict_lock);
return (referenced);
}
@@ -3676,29 +5399,109 @@
arc_write_callback_t *callback = zio->io_private;
arc_buf_t *buf = callback->awcb_buf;
arc_buf_hdr_t *hdr = buf->b_hdr;
+ uint64_t psize = BP_IS_HOLE(zio->io_bp) ? 0 : BP_GET_PSIZE(zio->io_bp);
- ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
- callback->awcb_ready(zio, buf, callback->awcb_private);
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt));
+ ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
/*
- * If the IO is already in progress, then this is a re-write
- * attempt, so we need to thaw and re-compute the cksum.
- * It is the responsibility of the callback to handle the
- * accounting for any re-write attempt.
+ * If we're reexecuting this zio because the pool suspended, then
+ * clean up any state that was previously set the first time the
+ * callback was invoked.
*/
- if (HDR_IO_IN_PROGRESS(hdr)) {
- mutex_enter(&hdr->b_freeze_lock);
- if (hdr->b_freeze_cksum != NULL) {
- kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
- hdr->b_freeze_cksum = NULL;
+ if (zio->io_flags & ZIO_FLAG_REEXECUTED) {
+ arc_cksum_free(hdr);
+#ifdef illumos
+ arc_buf_unwatch(buf);
+#endif
+ if (hdr->b_l1hdr.b_pdata != NULL) {
+ if (arc_buf_is_shared(buf)) {
+ ASSERT(HDR_SHARED_DATA(hdr));
+
+ arc_unshare_buf(hdr, buf);
+ } else {
+ arc_hdr_free_pdata(hdr);
+ }
}
- mutex_exit(&hdr->b_freeze_lock);
}
- arc_cksum_compute(buf, B_FALSE);
- hdr->b_flags |= ARC_IO_IN_PROGRESS;
+ ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+ ASSERT(!HDR_SHARED_DATA(hdr));
+ ASSERT(!arc_buf_is_shared(buf));
+
+ callback->awcb_ready(zio, buf, callback->awcb_private);
+
+ if (HDR_IO_IN_PROGRESS(hdr))
+ ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED);
+
+ arc_cksum_compute(buf);
+ arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
+
+ enum zio_compress compress;
+ if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
+ compress = ZIO_COMPRESS_OFF;
+ } else {
+ ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(zio->io_bp));
+ compress = BP_GET_COMPRESS(zio->io_bp);
+ }
+ HDR_SET_PSIZE(hdr, psize);
+ arc_hdr_set_compress(hdr, compress);
+
+ /*
+ * If the hdr is compressed, then copy the compressed
+ * zio contents into arc_buf_hdr_t. Otherwise, copy the original
+ * data buf into the hdr. Ideally, we would like to always copy the
+ * io_data into b_pdata, but the user may have disabled compressed
+ * arc, and thus the on-disk block may or may not match what we maintain
+ * in the hdr's b_pdata field.
+ */
+ if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) {
+ ASSERT(BP_GET_COMPRESS(zio->io_bp) != ZIO_COMPRESS_OFF);
+ ASSERT3U(psize, >, 0);
+ arc_hdr_alloc_pdata(hdr);
+ bcopy(zio->io_data, hdr->b_l1hdr.b_pdata, psize);
+ } else {
+ ASSERT3P(buf->b_data, ==, zio->io_orig_data);
+ ASSERT3U(zio->io_orig_size, ==, HDR_GET_LSIZE(hdr));
+ ASSERT3U(hdr->b_l1hdr.b_byteswap, ==, DMU_BSWAP_NUMFUNCS);
+ ASSERT(!HDR_SHARED_DATA(hdr));
+ ASSERT(!arc_buf_is_shared(buf));
+ ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
+ ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+
+ /*
+ * This hdr is not compressed so we're able to share
+ * the arc_buf_t data buffer with the hdr.
+ */
+ arc_share_buf(hdr, buf);
+ VERIFY0(bcmp(zio->io_orig_data, hdr->b_l1hdr.b_pdata,
+ HDR_GET_LSIZE(hdr)));
+ }
+ arc_hdr_verify(hdr, zio->io_bp);
}
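
A small userland sketch (not part of this commit) of the choice arc_write_ready() makes above, with simplified stand-ins for the header type: a compressed header takes a private copy of the physical (on-disk) bytes, while an uncompressed header simply shares the caller's buffer.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct toy_hdr {
	void	*pdata;		/* stands in for b_l1hdr.b_pdata */
	int	shared;		/* stands in for HDR_SHARED_DATA */
};

static void
toy_write_ready(struct toy_hdr *hdr, const void *io_data, uint64_t psize,
    void *buf_data, int compressed)
{
	if (compressed) {
		/* Copy the compressed zio contents into the hdr. */
		hdr->pdata = malloc(psize);
		memcpy(hdr->pdata, io_data, psize);
		hdr->shared = 0;
	} else {
		/* Share the (uncompressed) arc_buf_t data with the hdr. */
		hdr->pdata = buf_data;
		hdr->shared = 1;
	}
}

int
main(void)
{
	char lbuf[4096] = "logical data";
	char cbuf[512] = "compressed bytes";
	struct toy_hdr h1 = { 0 }, h2 = { 0 };

	toy_write_ready(&h1, cbuf, sizeof (cbuf), lbuf, 1);	/* private copy */
	toy_write_ready(&h2, NULL, 0, lbuf, 0);			/* shared buffer */
	printf("h1 shared=%d  h2 shared=%d\n", h1.shared, h2.shared);
	free(h1.pdata);
	return (0);
}
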
static void
+arc_write_children_ready(zio_t *zio)
+{
+ arc_write_callback_t *callback = zio->io_private;
+ arc_buf_t *buf = callback->awcb_buf;
+
+ callback->awcb_children_ready(zio, buf, callback->awcb_private);
+}
+
+/*
+ * The SPA calls this callback for each physical write that happens on behalf
+ * of a logical write. See the comment in dbuf_write_physdone() for details.
+ */
+static void
+arc_write_physdone(zio_t *zio)
+{
+ arc_write_callback_t *cb = zio->io_private;
+ if (cb->awcb_physdone != NULL)
+ cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
+}
+
+static void
arc_write_done(zio_t *zio)
{
arc_write_callback_t *callback = zio->io_private;
@@ -3705,23 +5508,28 @@
arc_buf_t *buf = callback->awcb_buf;
arc_buf_hdr_t *hdr = buf->b_hdr;
- ASSERT(hdr->b_acb == NULL);
+ ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
if (zio->io_error == 0) {
- hdr->b_dva = *BP_IDENTITY(zio->io_bp);
- hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
- hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
+ arc_hdr_verify(hdr, zio->io_bp);
+
+ if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
+ buf_discard_identity(hdr);
+ } else {
+ hdr->b_dva = *BP_IDENTITY(zio->io_bp);
+ hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
+ }
} else {
- ASSERT(BUF_EMPTY(hdr));
+ ASSERT(HDR_EMPTY(hdr));
}
/*
- * If the block to be written was all-zero, we may have
- * compressed it away. In this case no write was performed
- * so there will be no dva/birth/checksum. The buffer must
- * therefore remain anonymous (and uncached).
+ * If the block to be written was all-zero or compressed enough to be
+ * embedded in the BP, no write was performed so there will be no
+ * dva/birth/checksum. The buffer must therefore remain anonymous
+ * (and uncached).
*/
- if (!BUF_EMPTY(hdr)) {
+ if (!HDR_EMPTY(hdr)) {
arc_buf_hdr_t *exists;
kmutex_t *hash_lock;
@@ -3730,7 +5538,7 @@
arc_cksum_verify(buf);
exists = buf_hash_insert(hdr, &hash_lock);
- if (exists) {
+ if (exists != NULL) {
/*
* This can only happen if we overwrite for
* sync-to-convergence, because we remove
@@ -3740,7 +5548,8 @@
if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
panic("bad overwrite, hdr=%p exists=%p",
(void *)hdr, (void *)exists);
- ASSERT(refcount_is_zero(&exists->b_refcnt));
+ ASSERT(refcount_is_zero(
+ &exists->b_l1hdr.b_refcnt));
arc_change_state(arc_anon, exists, hash_lock);
mutex_exit(hash_lock);
arc_hdr_destroy(exists);
@@ -3754,22 +5563,22 @@
(void *)hdr, (void *)exists);
} else {
/* Dedup */
- ASSERT(hdr->b_datacnt == 1);
- ASSERT(hdr->b_state == arc_anon);
+ ASSERT(hdr->b_l1hdr.b_bufcnt == 1);
+ ASSERT(hdr->b_l1hdr.b_state == arc_anon);
ASSERT(BP_GET_DEDUP(zio->io_bp));
ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
}
}
- hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
+ arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
/* if it's not anon, we are doing a scrub */
- if (!exists && hdr->b_state == arc_anon)
+ if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon)
arc_access(hdr, hash_lock);
mutex_exit(hash_lock);
} else {
- hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
+ arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
}
- ASSERT(!refcount_is_zero(&hdr->b_refcnt));
+ ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
callback->awcb_done(zio, buf, callback->awcb_private);
kmem_free(callback, sizeof (arc_write_callback_t));
@@ -3776,52 +5585,78 @@
}
zio_t *
-arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
- blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
- const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *done,
- void *private, int priority, int zio_flags, const zbookmark_t *zb)
+arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
+ boolean_t l2arc, const zio_prop_t *zp, arc_done_func_t *ready,
+ arc_done_func_t *children_ready, arc_done_func_t *physdone,
+ arc_done_func_t *done, void *private, zio_priority_t priority,
+ int zio_flags, const zbookmark_phys_t *zb)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
arc_write_callback_t *callback;
zio_t *zio;
- ASSERT(ready != NULL);
- ASSERT(done != NULL);
+ ASSERT3P(ready, !=, NULL);
+ ASSERT3P(done, !=, NULL);
ASSERT(!HDR_IO_ERROR(hdr));
- ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
- ASSERT(hdr->b_acb == NULL);
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
+ ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0);
if (l2arc)
- hdr->b_flags |= ARC_L2CACHE;
- if (l2arc_compress)
- hdr->b_flags |= ARC_L2COMPRESS;
+ arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
callback->awcb_ready = ready;
+ callback->awcb_children_ready = children_ready;
+ callback->awcb_physdone = physdone;
callback->awcb_done = done;
callback->awcb_private = private;
callback->awcb_buf = buf;
- zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
- arc_write_ready, arc_write_done, callback, priority, zio_flags, zb);
+ /*
+ * The hdr's b_pdata is now stale, free it now. A new data block
+ * will be allocated when the zio pipeline calls arc_write_ready().
+ */
+ if (hdr->b_l1hdr.b_pdata != NULL) {
+ /*
+ * If the buf is currently sharing the data block with
+ * the hdr then we need to break that relationship here.
+ * The hdr will remain with a NULL data pointer and the
+ * buf will take sole ownership of the block.
+ */
+ if (arc_buf_is_shared(buf)) {
+ ASSERT(ARC_BUF_LAST(buf));
+ arc_unshare_buf(hdr, buf);
+ } else {
+ arc_hdr_free_pdata(hdr);
+ }
+ VERIFY3P(buf->b_data, !=, NULL);
+ arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF);
+ }
+ ASSERT(!arc_buf_is_shared(buf));
+ ASSERT3P(hdr->b_l1hdr.b_pdata, ==, NULL);
+ zio = zio_write(pio, spa, txg, bp, buf->b_data, HDR_GET_LSIZE(hdr), zp,
+ arc_write_ready,
+ (children_ready != NULL) ? arc_write_children_ready : NULL,
+ arc_write_physdone, arc_write_done, callback,
+ priority, zio_flags, zb);
+
return (zio);
}
static int
-arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg)
+arc_memory_throttle(uint64_t reserve, uint64_t txg)
{
#ifdef _KERNEL
- uint64_t available_memory =
- ptoa((uintmax_t)cnt.v_free_count + cnt.v_cache_count);
+ uint64_t available_memory = ptob(freemem);
static uint64_t page_load = 0;
static uint64_t last_txg = 0;
-#ifdef sun
-#if defined(__i386)
+#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC)
available_memory =
- MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
+ MIN(available_memory, ptob(vmem_size(heap_arena, VMEM_FREE)));
#endif
-#endif /* sun */
- if (available_memory >= zfs_write_limit_max)
+
+ if (freemem > (uint64_t)physmem * arc_lotsfree_percent / 100)
return (0);
if (txg > last_txg) {
@@ -3834,7 +5669,7 @@
* continue to let page writes occur as quickly as possible.
*/
if (curproc == pageproc) {
- if (page_load > available_memory / 4)
+ if (page_load > MAX(ptob(minfree), available_memory) / 4)
return (SET_ERROR(ERESTART));
/* Note: reserve is inflated, so we deflate */
page_load += reserve / 8;
@@ -3845,20 +5680,6 @@
return (SET_ERROR(EAGAIN));
}
page_load = 0;
-
- if (arc_size > arc_c_min) {
- uint64_t evictable_memory =
- arc_mru->arcs_lsize[ARC_BUFC_DATA] +
- arc_mru->arcs_lsize[ARC_BUFC_METADATA] +
- arc_mfu->arcs_lsize[ARC_BUFC_DATA] +
- arc_mfu->arcs_lsize[ARC_BUFC_METADATA];
- available_memory += MIN(evictable_memory, arc_size - arc_c_min);
- }
-
- if (inflight_data > available_memory / 4) {
- ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
- return (SET_ERROR(ERESTART));
- }
#endif
return (0);
}
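
For illustration only (not part of this commit), the top-level throttle test above reduces to the comparison below; the page counts and the 10 percent threshold are made-up example values standing in for freemem, physmem and arc_lotsfree_percent.

#include <stdint.h>
#include <stdio.h>

static int
toy_memory_throttle(uint64_t freemem_pages, uint64_t physmem_pages,
    uint64_t lotsfree_percent)
{
	if (freemem_pages > physmem_pages * lotsfree_percent / 100)
		return (0);	/* plenty of free memory, no throttle */
	return (1);		/* nonzero: back off (EAGAIN/ERESTART in the kernel) */
}

int
main(void)
{
	/* 4M pages of RAM, 10% threshold => throttle below 400K free pages */
	printf("%d\n", toy_memory_throttle(500000, 4000000, 10));	/* 0 */
	printf("%d\n", toy_memory_throttle(100000, 4000000, 10));	/* 1 */
	return (0);
}
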
@@ -3876,17 +5697,10 @@
int error;
uint64_t anon_size;
-#ifdef ZFS_DEBUG
- /*
- * Once in a while, fail for no reason. Everything should cope.
- */
- if (spa_get_random(10000) == 0) {
- dprintf("forcing random failure\n");
- return (SET_ERROR(ERESTART));
+ if (reserve > arc_c/4 && !arc_no_grow) {
+ arc_c = MIN(arc_c_max, reserve * 4);
+ DTRACE_PROBE1(arc__set_reserve, uint64_t, arc_c);
}
-#endif
- if (reserve > arc_c/4 && !arc_no_grow)
- arc_c = MIN(arc_c_max, reserve * 4);
if (reserve > arc_c)
return (SET_ERROR(ENOMEM));
@@ -3895,7 +5709,8 @@
* network delays from blocking transactions that are ready to be
* assigned to a txg.
*/
- anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0);
+ anon_size = MAX((int64_t)(refcount_count(&arc_anon->arcs_size) -
+ arc_loaned_bytes), 0);
/*
* Writes will, almost always, require additional memory allocations
@@ -3902,7 +5717,8 @@
* in order to compress/encrypt/etc the data. We therefore need to
* make sure that there is sufficient available memory for this.
*/
- if (error = arc_memory_throttle(reserve, anon_size, txg))
+ error = arc_memory_throttle(reserve, txg);
+ if (error != 0)
return (error);
/*
@@ -3915,12 +5731,14 @@
if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
anon_size > arc_c / 4) {
+ uint64_t meta_esize =
+ refcount_count(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
+ uint64_t data_esize =
+ refcount_count(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
"anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
- arc_tempreserve>>10,
- arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
- arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
- reserve>>10, arc_c>>10);
+ arc_tempreserve >> 10, meta_esize >> 10,
+ data_esize >> 10, reserve >> 10, arc_c >> 10);
return (SET_ERROR(ERESTART));
}
atomic_add_64(&arc_tempreserve, reserve);
@@ -3927,7 +5745,85 @@
return (0);
}
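
A worked example (not part of this commit) of the reservation check in the hunk above, using made-up sizes: with arc_c at 1 GiB, a transaction is bounced once anonymous data exceeds a quarter of the cache and the combined reservation would exceed half of it.

#include <stdint.h>
#include <stdio.h>

static int
toy_tempreserve_ok(uint64_t reserve, uint64_t tempreserve,
    uint64_t anon_size, uint64_t arc_c)
{
	if (reserve + tempreserve + anon_size > arc_c / 2 &&
	    anon_size > arc_c / 4)
		return (0);	/* would be ERESTART in the kernel */
	return (1);
}

int
main(void)
{
	uint64_t arc_c = 1024ULL << 20;		/* 1 GiB cache target */

	/* 300 MiB anon + 300 MiB reserve > 512 MiB, and anon > 256 MiB: throttled */
	printf("%d\n", toy_tempreserve_ok(300 << 20, 0, 300 << 20, arc_c));
	/* 100 MiB anon leaves plenty of headroom: accepted */
	printf("%d\n", toy_tempreserve_ok(100 << 20, 0, 100 << 20, arc_c));
	return (0);
}
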
-static kmutex_t arc_lowmem_lock;
+static void
+arc_kstat_update_state(arc_state_t *state, kstat_named_t *size,
+ kstat_named_t *evict_data, kstat_named_t *evict_metadata)
+{
+ size->value.ui64 = refcount_count(&state->arcs_size);
+ evict_data->value.ui64 =
+ refcount_count(&state->arcs_esize[ARC_BUFC_DATA]);
+ evict_metadata->value.ui64 =
+ refcount_count(&state->arcs_esize[ARC_BUFC_METADATA]);
+}
+
+static int
+arc_kstat_update(kstat_t *ksp, int rw)
+{
+ arc_stats_t *as = ksp->ks_data;
+
+ if (rw == KSTAT_WRITE) {
+ return (EACCES);
+ } else {
+ arc_kstat_update_state(arc_anon,
+ &as->arcstat_anon_size,
+ &as->arcstat_anon_evictable_data,
+ &as->arcstat_anon_evictable_metadata);
+ arc_kstat_update_state(arc_mru,
+ &as->arcstat_mru_size,
+ &as->arcstat_mru_evictable_data,
+ &as->arcstat_mru_evictable_metadata);
+ arc_kstat_update_state(arc_mru_ghost,
+ &as->arcstat_mru_ghost_size,
+ &as->arcstat_mru_ghost_evictable_data,
+ &as->arcstat_mru_ghost_evictable_metadata);
+ arc_kstat_update_state(arc_mfu,
+ &as->arcstat_mfu_size,
+ &as->arcstat_mfu_evictable_data,
+ &as->arcstat_mfu_evictable_metadata);
+ arc_kstat_update_state(arc_mfu_ghost,
+ &as->arcstat_mfu_ghost_size,
+ &as->arcstat_mfu_ghost_evictable_data,
+ &as->arcstat_mfu_ghost_evictable_metadata);
+ }
+
+ return (0);
+}
+
+/*
+ * This function *must* return indices evenly distributed between all
+ * sublists of the multilist. This is needed due to how the ARC eviction
+ * code is laid out; arc_evict_state() assumes ARC buffers are evenly
+ * distributed between all sublists and uses this assumption when
+ * deciding which sublist to evict from and how much to evict from it.
+ */
+unsigned int
+arc_state_multilist_index_func(multilist_t *ml, void *obj)
+{
+ arc_buf_hdr_t *hdr = obj;
+
+ /*
+ * We rely on b_dva to generate evenly distributed index
+ * numbers using buf_hash below. So, as an added precaution,
+ * let's make sure we never add empty buffers to the arc lists.
+ */
+ ASSERT(!HDR_EMPTY(hdr));
+
+ /*
+ * The assumption here is that the hash value for a given
+ * arc_buf_hdr_t will remain constant throughout its lifetime
+ * (i.e. its b_spa, b_dva, and b_birth fields don't change).
+ * Thus, we don't need to store the header's sublist index
+ * on insertion, as this index can be recalculated on removal.
+ *
+ * Also, the low order bits of the hash value are thought to be
+ * distributed evenly. Otherwise, in the case that the multilist
+ * has a power of two number of sublists, each sublists' usage
+ * has a power-of-two number of sublists, each sublist's usage
+ */
+ return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) %
+ multilist_get_num_sublists(ml));
+}
+
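
An illustrative userland sketch (not part of this commit) of the indexing scheme above: a stable hash of the header identity, taken modulo the number of sublists, spreads headers evenly. toy_hash() is only a stand-in for buf_hash(), and 8 sublists is an arbitrary choice.

#include <stdint.h>
#include <stdio.h>

#define	TOY_NUM_SUBLISTS	8

static uint64_t
toy_hash(uint64_t spa, uint64_t dva, uint64_t birth)
{
	/* A simple stand-in mixer; the kernel uses buf_hash() instead. */
	uint64_t h = spa ^ dva ^ birth;

	h ^= h >> 33;
	h *= 0xff51afd7ed558ccdULL;
	h ^= h >> 33;
	return (h);
}

static unsigned int
toy_multilist_index(uint64_t spa, uint64_t dva, uint64_t birth)
{
	return (toy_hash(spa, dva, birth) % TOY_NUM_SUBLISTS);
}

int
main(void)
{
	unsigned int counts[TOY_NUM_SUBLISTS] = { 0 };

	/* Insert a range of fake header identities and show the spread. */
	for (uint64_t i = 0; i < 80000; i++)
		counts[toy_multilist_index(1, i * 512, i)]++;
	for (int i = 0; i < TOY_NUM_SUBLISTS; i++)
		printf("sublist %d: %u\n", i, counts[i]);
	return (0);
}
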
#ifdef _KERNEL
static eventhandler_tag arc_event_lowmem = NULL;
@@ -3935,11 +5831,11 @@
arc_lowmem(void *arg __unused, int howto __unused)
{
- /* Serialize access via arc_lowmem_lock. */
- mutex_enter(&arc_lowmem_lock);
- mutex_enter(&arc_reclaim_thr_lock);
- needfree = 1;
- cv_signal(&arc_reclaim_thr_cv);
+ mutex_enter(&arc_reclaim_lock);
+ /* XXX: Memory deficit should be passed as argument. */
+ needfree = btoc(arc_c >> arc_shrink_shift);
+ DTRACE_PROBE(arc__needfree);
+ cv_signal(&arc_reclaim_thread_cv);
/*
* It is unsafe to block here in arbitrary threads, because we can come
@@ -3946,23 +5842,131 @@
* here from ARC itself and may hold ARC locks and thus risk a deadlock
* with ARC reclaim thread.
*/
- if (curproc == pageproc) {
- while (needfree)
- msleep(&needfree, &arc_reclaim_thr_lock, 0, "zfs:lowmem", 0);
- }
- mutex_exit(&arc_reclaim_thr_lock);
- mutex_exit(&arc_lowmem_lock);
+ if (curproc == pageproc)
+ (void) cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock);
+ mutex_exit(&arc_reclaim_lock);
}
#endif
+static void
+arc_state_init(void)
+{
+ arc_anon = &ARC_anon;
+ arc_mru = &ARC_mru;
+ arc_mru_ghost = &ARC_mru_ghost;
+ arc_mfu = &ARC_mfu;
+ arc_mfu_ghost = &ARC_mfu_ghost;
+ arc_l2c_only = &ARC_l2c_only;
+
+ multilist_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
+ sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+ multilist_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
+ sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+ multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
+ sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+ multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
+ sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+ multilist_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
+ sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+ multilist_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
+ sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+ multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
+ sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+ multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
+ sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+ multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
+ sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+ multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
+ sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+ zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
+
+ refcount_create(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
+ refcount_create(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
+ refcount_create(&arc_mru->arcs_esize[ARC_BUFC_METADATA]);
+ refcount_create(&arc_mru->arcs_esize[ARC_BUFC_DATA]);
+ refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]);
+ refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]);
+ refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]);
+ refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_DATA]);
+ refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]);
+ refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]);
+ refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]);
+ refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]);
+
+ refcount_create(&arc_anon->arcs_size);
+ refcount_create(&arc_mru->arcs_size);
+ refcount_create(&arc_mru_ghost->arcs_size);
+ refcount_create(&arc_mfu->arcs_size);
+ refcount_create(&arc_mfu_ghost->arcs_size);
+ refcount_create(&arc_l2c_only->arcs_size);
+}
+
+static void
+arc_state_fini(void)
+{
+ refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
+ refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
+ refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_METADATA]);
+ refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_DATA]);
+ refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]);
+ refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]);
+ refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]);
+ refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_DATA]);
+ refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]);
+ refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]);
+ refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]);
+ refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]);
+
+ refcount_destroy(&arc_anon->arcs_size);
+ refcount_destroy(&arc_mru->arcs_size);
+ refcount_destroy(&arc_mru_ghost->arcs_size);
+ refcount_destroy(&arc_mfu->arcs_size);
+ refcount_destroy(&arc_mfu_ghost->arcs_size);
+ refcount_destroy(&arc_l2c_only->arcs_size);
+
+ multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
+ multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
+ multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
+ multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
+ multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
+ multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
+ multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
+ multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
+}
+
+uint64_t
+arc_max_bytes(void)
+{
+ return (arc_c_max);
+}
+
void
arc_init(void)
{
int i, prefetch_tunable_set = 0;
- mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
- mutex_init(&arc_lowmem_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL);
/* Convert seconds to clock ticks */
arc_min_prefetch_lifespan = 1 * hz;
@@ -3970,7 +5974,7 @@
/* Start out with 1/8 of all memory */
arc_c = kmem_size() / 8;
-#ifdef sun
+#ifdef illumos
#ifdef _KERNEL
/*
* On architectures where the physical memory can be larger
@@ -3979,29 +5983,42 @@
*/
arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
#endif
-#endif /* sun */
- /* set min cache to 1/32 of all memory, or 16MB, whichever is more */
- arc_c_min = MAX(arc_c / 4, 64<<18);
+#endif /* illumos */
+ /* set min cache to 1/32 of all memory, or arc_abs_min, whichever is more */
+ arc_c_min = MAX(arc_c / 4, arc_abs_min);
/* set max to 1/2 of all memory, or all but 1GB, whichever is more */
- if (arc_c * 8 >= 1<<30)
- arc_c_max = (arc_c * 8) - (1<<30);
+ if (arc_c * 8 >= 1 << 30)
+ arc_c_max = (arc_c * 8) - (1 << 30);
else
arc_c_max = arc_c_min;
arc_c_max = MAX(arc_c * 5, arc_c_max);
+ /*
+ * In userland, there's only the memory pressure that we artificially
+ * create (see arc_available_memory()). Don't let arc_c get too
+ * small, because it can cause transactions to be larger than
+ * arc_c, causing arc_tempreserve_space() to fail.
+ */
+#ifndef _KERNEL
+ arc_c_min = arc_c_max / 2;
+#endif
+
#ifdef _KERNEL
/*
* Allow the tunables to override our calculations if they are
- * reasonable (ie. over 16MB)
+ * reasonable.
*/
- if (zfs_arc_max > 64<<18 && zfs_arc_max < kmem_size())
+ if (zfs_arc_max > arc_abs_min && zfs_arc_max < kmem_size()) {
arc_c_max = zfs_arc_max;
- if (zfs_arc_min > 64<<18 && zfs_arc_min <= arc_c_max)
+ arc_c_min = MIN(arc_c_min, arc_c_max);
+ }
+ if (zfs_arc_min > arc_abs_min && zfs_arc_min <= arc_c_max)
arc_c_min = zfs_arc_min;
#endif
arc_c = arc_c_max;
arc_p = (arc_c >> 1);
+ arc_size = 0;
/* limit meta-data to 1/4 of the arc capacity */
arc_meta_limit = arc_c_max / 4;
@@ -4013,6 +6030,12 @@
if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
arc_c_min = arc_meta_limit / 2;
+ if (zfs_arc_meta_min > 0) {
+ arc_meta_min = zfs_arc_meta_min;
+ } else {
+ arc_meta_min = arc_c_min / 2;
+ }
+
if (zfs_arc_grow_retry > 0)
arc_grow_retry = zfs_arc_grow_retry;
@@ -4019,9 +6042,18 @@
if (zfs_arc_shrink_shift > 0)
arc_shrink_shift = zfs_arc_shrink_shift;
+ /*
+ * Ensure that arc_no_grow_shift is less than arc_shrink_shift.
+ */
+ if (arc_no_grow_shift >= arc_shrink_shift)
+ arc_no_grow_shift = arc_shrink_shift - 1;
+
if (zfs_arc_p_min_shift > 0)
arc_p_min_shift = zfs_arc_p_min_shift;
+ if (zfs_arc_num_sublists_per_state < 1)
+ zfs_arc_num_sublists_per_state = MAX(max_ncpus, 1);
+
/* if kmem_flags are set, lets try to use less memory */
if (kmem_debugging())
arc_c = arc_c / 2;
@@ -4031,48 +6063,10 @@
zfs_arc_min = arc_c_min;
zfs_arc_max = arc_c_max;
- arc_anon = &ARC_anon;
- arc_mru = &ARC_mru;
- arc_mru_ghost = &ARC_mru_ghost;
- arc_mfu = &ARC_mfu;
- arc_mfu_ghost = &ARC_mfu_ghost;
- arc_l2c_only = &ARC_l2c_only;
- arc_size = 0;
-
- for (i = 0; i < ARC_BUFC_NUMLISTS; i++) {
- mutex_init(&arc_anon->arcs_locks[i].arcs_lock,
- NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&arc_mru->arcs_locks[i].arcs_lock,
- NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&arc_mru_ghost->arcs_locks[i].arcs_lock,
- NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&arc_mfu->arcs_locks[i].arcs_lock,
- NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&arc_mfu_ghost->arcs_locks[i].arcs_lock,
- NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&arc_l2c_only->arcs_locks[i].arcs_lock,
- NULL, MUTEX_DEFAULT, NULL);
-
- list_create(&arc_mru->arcs_lists[i],
- sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
- list_create(&arc_mru_ghost->arcs_lists[i],
- sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
- list_create(&arc_mfu->arcs_lists[i],
- sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
- list_create(&arc_mfu_ghost->arcs_lists[i],
- sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
- list_create(&arc_mfu_ghost->arcs_lists[i],
- sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
- list_create(&arc_l2c_only->arcs_lists[i],
- sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
- }
-
+ arc_state_init();
buf_init();
- arc_thread_exit = 0;
- arc_eviction_list = NULL;
- mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
- bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
+ arc_reclaim_thread_exit = B_FALSE;
arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
@@ -4079,6 +6073,7 @@
if (arc_ksp != NULL) {
arc_ksp->ks_data = &arc_stats;
+ arc_ksp->ks_update = arc_kstat_update;
kstat_install(arc_ksp);
}
@@ -4090,14 +6085,23 @@
EVENTHANDLER_PRI_FIRST);
#endif
- arc_dead = FALSE;
+ arc_dead = B_FALSE;
arc_warm = B_FALSE;
- if (zfs_write_limit_max == 0)
- zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
- else
- zfs_write_limit_shift = 0;
- mutex_init(&zfs_write_limit_lock, NULL, MUTEX_DEFAULT, NULL);
+ /*
+ * Calculate maximum amount of dirty data per pool.
+ *
+ * If it has been set by /etc/system, take that.
+ * Otherwise, use a percentage of physical memory defined by
+ * zfs_dirty_data_max_percent (default 10%) with a cap at
+ * zfs_dirty_data_max_max (default 4GB).
+ */
+ if (zfs_dirty_data_max == 0) {
+ zfs_dirty_data_max = ptob(physmem) *
+ zfs_dirty_data_max_percent / 100;
+ zfs_dirty_data_max = MIN(zfs_dirty_data_max,
+ zfs_dirty_data_max_max);
+ }
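
As a worked example (not part of this commit) of the calculation above, assuming 16 GiB of physical memory and the default tunables: 10% of 16 GiB is about 1.6 GiB, which is below the 4 GiB cap, so that value becomes zfs_dirty_data_max.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t physmem_bytes = 16ULL << 30;	/* assumed example: 16 GiB */
	uint64_t dirty_max_percent = 10;	/* zfs_dirty_data_max_percent */
	uint64_t dirty_max_max = 4ULL << 30;	/* zfs_dirty_data_max_max */

	uint64_t dirty_max = physmem_bytes * dirty_max_percent / 100;
	if (dirty_max > dirty_max_max)
		dirty_max = dirty_max_max;

	/* 16 GiB * 10% = ~1.6 GiB, under the 4 GiB cap. */
	printf("zfs_dirty_data_max = %" PRIu64 " bytes\n", dirty_max);
	return (0);
}
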
#ifdef _KERNEL
if (TUNABLE_INT_FETCH("vfs.zfs.prefetch_disable", &zfs_prefetch_disable))
@@ -4139,50 +6143,37 @@
void
arc_fini(void)
{
- int i;
+ mutex_enter(&arc_reclaim_lock);
+ arc_reclaim_thread_exit = B_TRUE;
+ /*
+ * The reclaim thread will set arc_reclaim_thread_exit back to
+ * B_FALSE when it is finished exiting; we're waiting for that.
+ */
+ while (arc_reclaim_thread_exit) {
+ cv_signal(&arc_reclaim_thread_cv);
+ cv_wait(&arc_reclaim_thread_cv, &arc_reclaim_lock);
+ }
+ mutex_exit(&arc_reclaim_lock);
- mutex_enter(&arc_reclaim_thr_lock);
- arc_thread_exit = 1;
- cv_signal(&arc_reclaim_thr_cv);
- while (arc_thread_exit != 0)
- cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
- mutex_exit(&arc_reclaim_thr_lock);
+ /* Use B_TRUE to ensure *all* buffers are evicted */
+ arc_flush(NULL, B_TRUE);
- arc_flush(NULL);
+ arc_dead = B_TRUE;
- arc_dead = TRUE;
-
if (arc_ksp != NULL) {
kstat_delete(arc_ksp);
arc_ksp = NULL;
}
- mutex_destroy(&arc_eviction_mtx);
- mutex_destroy(&arc_reclaim_thr_lock);
- cv_destroy(&arc_reclaim_thr_cv);
+ mutex_destroy(&arc_reclaim_lock);
+ cv_destroy(&arc_reclaim_thread_cv);
+ cv_destroy(&arc_reclaim_waiters_cv);
- for (i = 0; i < ARC_BUFC_NUMLISTS; i++) {
- list_destroy(&arc_mru->arcs_lists[i]);
- list_destroy(&arc_mru_ghost->arcs_lists[i]);
- list_destroy(&arc_mfu->arcs_lists[i]);
- list_destroy(&arc_mfu_ghost->arcs_lists[i]);
- list_destroy(&arc_l2c_only->arcs_lists[i]);
-
- mutex_destroy(&arc_anon->arcs_locks[i].arcs_lock);
- mutex_destroy(&arc_mru->arcs_locks[i].arcs_lock);
- mutex_destroy(&arc_mru_ghost->arcs_locks[i].arcs_lock);
- mutex_destroy(&arc_mfu->arcs_locks[i].arcs_lock);
- mutex_destroy(&arc_mfu_ghost->arcs_locks[i].arcs_lock);
- mutex_destroy(&arc_l2c_only->arcs_locks[i].arcs_lock);
- }
-
- mutex_destroy(&zfs_write_limit_lock);
-
+ arc_state_fini();
buf_fini();
- ASSERT(arc_loaned_bytes == 0);
+ ASSERT0(arc_loaned_bytes);
- mutex_destroy(&arc_lowmem_lock);
#ifdef _KERNEL
if (arc_event_lowmem != NULL)
EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
@@ -4335,7 +6326,7 @@
*/
static boolean_t
-l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab)
+l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr)
{
/*
* A buffer is *not* eligible for the L2ARC if it:
@@ -4344,19 +6335,19 @@
* 3. has an I/O in progress (it may be an incomplete read).
* 4. is flagged not eligible (zfs property).
*/
- if (ab->b_spa != spa_guid) {
+ if (hdr->b_spa != spa_guid) {
ARCSTAT_BUMP(arcstat_l2_write_spa_mismatch);
return (B_FALSE);
}
- if (ab->b_l2hdr != NULL) {
+ if (HDR_HAS_L2HDR(hdr)) {
ARCSTAT_BUMP(arcstat_l2_write_in_l2);
return (B_FALSE);
}
- if (HDR_IO_IN_PROGRESS(ab)) {
+ if (HDR_IO_IN_PROGRESS(hdr)) {
ARCSTAT_BUMP(arcstat_l2_write_hdr_io_in_progress);
return (B_FALSE);
}
- if (!HDR_L2CACHE(ab)) {
+ if (!HDR_L2CACHE(hdr)) {
ARCSTAT_BUMP(arcstat_l2_write_not_cacheable);
return (B_FALSE);
}
@@ -4410,20 +6401,6 @@
return (next);
}
-static void
-l2arc_hdr_stat_add(void)
-{
- ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE);
- ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE);
-}
-
-static void
-l2arc_hdr_stat_remove(void)
-{
- ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE));
- ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE);
-}
-
/*
* Cycle through L2ARC devices. This is how L2ARC load balances.
* If a device is returned, this also returns holding the spa config lock.
@@ -4499,9 +6476,13 @@
for (df = list_tail(buflist); df; df = df_prev) {
df_prev = list_prev(buflist, df);
- ASSERT(df->l2df_data != NULL);
- ASSERT(df->l2df_func != NULL);
- df->l2df_func(df->l2df_data, df->l2df_size);
+ ASSERT3P(df->l2df_data, !=, NULL);
+ if (df->l2df_type == ARC_BUFC_METADATA) {
+ zio_buf_free(df->l2df_data, df->l2df_size);
+ } else {
+ ASSERT(df->l2df_type == ARC_BUFC_DATA);
+ zio_data_buf_free(df->l2df_data, df->l2df_size);
+ }
list_remove(buflist, df);
kmem_free(df, sizeof (l2arc_data_free_t));
}
@@ -4519,18 +6500,18 @@
l2arc_write_callback_t *cb;
l2arc_dev_t *dev;
list_t *buflist;
- arc_buf_hdr_t *head, *ab, *ab_prev;
- l2arc_buf_hdr_t *abl2;
+ arc_buf_hdr_t *head, *hdr, *hdr_prev;
kmutex_t *hash_lock;
+ int64_t bytes_dropped = 0;
cb = zio->io_private;
- ASSERT(cb != NULL);
+ ASSERT3P(cb, !=, NULL);
dev = cb->l2wcb_dev;
- ASSERT(dev != NULL);
+ ASSERT3P(dev, !=, NULL);
head = cb->l2wcb_head;
- ASSERT(head != NULL);
- buflist = dev->l2ad_buflist;
- ASSERT(buflist != NULL);
+ ASSERT3P(head, !=, NULL);
+ buflist = &dev->l2ad_buflist;
+ ASSERT3P(buflist, !=, NULL);
DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
l2arc_write_callback_t *, cb);
@@ -4537,50 +6518,78 @@
if (zio->io_error != 0)
ARCSTAT_BUMP(arcstat_l2_writes_error);
- mutex_enter(&l2arc_buflist_mtx);
-
/*
* All writes completed, or an error was hit.
*/
- for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
- ab_prev = list_prev(buflist, ab);
+top:
+ mutex_enter(&dev->l2ad_mtx);
+ for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) {
+ hdr_prev = list_prev(buflist, hdr);
- hash_lock = HDR_LOCK(ab);
+ hash_lock = HDR_LOCK(hdr);
+
+ /*
+ * We cannot use mutex_enter or else we can deadlock
+ * with l2arc_write_buffers (due to swapping the order
+ * the hash lock and l2ad_mtx are taken).
+ */
if (!mutex_tryenter(hash_lock)) {
/*
- * This buffer misses out. It may be in a stage
- * of eviction. Its ARC_L2_WRITING flag will be
- * left set, denying reads to this buffer.
+ * Missed the hash lock. We must retry so we
+ * don't leave the ARC_FLAG_L2_WRITING bit set.
*/
- ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
- continue;
+ ARCSTAT_BUMP(arcstat_l2_writes_lock_retry);
+
+ /*
+ * We don't want to rescan the headers we've
+ * already marked as having been written out, so
+ * we reinsert the head node so we can pick up
+ * where we left off.
+ */
+ list_remove(buflist, head);
+ list_insert_after(buflist, hdr, head);
+
+ mutex_exit(&dev->l2ad_mtx);
+
+ /*
+ * We wait for the hash lock to become available
+ * to try and prevent busy waiting, and increase
+ * the chance we'll be able to acquire the lock
+ * the next time around.
+ */
+ mutex_enter(hash_lock);
+ mutex_exit(hash_lock);
+ goto top;
}
- abl2 = ab->b_l2hdr;
-
/*
- * Release the temporary compressed buffer as soon as possible.
+ * We could not have been moved into the arc_l2c_only
+ * state while in-flight due to our ARC_FLAG_L2_WRITING
+ * bit being set. Let's just ensure that's being enforced.
*/
- if (abl2->b_compress != ZIO_COMPRESS_OFF)
- l2arc_release_cdata_buf(ab);
+ ASSERT(HDR_HAS_L1HDR(hdr));
if (zio->io_error != 0) {
/*
* Error - drop L2ARC entry.
*/
- list_remove(buflist, ab);
- ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
- ab->b_l2hdr = NULL;
- trim_map_free(abl2->b_dev->l2ad_vdev, abl2->b_daddr,
- ab->b_size, 0);
- kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
- ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
+ list_remove(buflist, hdr);
+ l2arc_trim(hdr);
+ arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR);
+
+ ARCSTAT_INCR(arcstat_l2_asize, -arc_hdr_size(hdr));
+ ARCSTAT_INCR(arcstat_l2_size, -HDR_GET_LSIZE(hdr));
+
+ bytes_dropped += arc_hdr_size(hdr);
+ (void) refcount_remove_many(&dev->l2ad_alloc,
+ arc_hdr_size(hdr), hdr);
}
/*
- * Allow ARC to begin reads to this L2ARC entry.
+ * Allow ARC to begin reads and ghost list evictions to
+ * this L2ARC entry.
*/
- ab->b_flags &= ~ARC_L2_WRITING;
+ arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING);
mutex_exit(hash_lock);
}
@@ -4587,9 +6596,12 @@
atomic_inc_64(&l2arc_writes_done);
list_remove(buflist, head);
- kmem_cache_free(hdr_cache, head);
- mutex_exit(&l2arc_buflist_mtx);
+ ASSERT(!HDR_HAS_L1HDR(head));
+ kmem_cache_free(hdr_l2only_cache, head);
+ mutex_exit(&dev->l2ad_mtx);
+ vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0);
+
l2arc_do_free_on_write();
kmem_free(cb, sizeof (l2arc_write_callback_t));
@@ -4604,41 +6616,63 @@
{
l2arc_read_callback_t *cb;
arc_buf_hdr_t *hdr;
- arc_buf_t *buf;
kmutex_t *hash_lock;
- int equal;
+ boolean_t valid_cksum;
- ASSERT(zio->io_vd != NULL);
+ ASSERT3P(zio->io_vd, !=, NULL);
ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
cb = zio->io_private;
- ASSERT(cb != NULL);
- buf = cb->l2rcb_buf;
- ASSERT(buf != NULL);
+ ASSERT3P(cb, !=, NULL);
+ hdr = cb->l2rcb_hdr;
+ ASSERT3P(hdr, !=, NULL);
- hash_lock = HDR_LOCK(buf->b_hdr);
+ hash_lock = HDR_LOCK(hdr);
mutex_enter(hash_lock);
- hdr = buf->b_hdr;
ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
/*
- * If the buffer was compressed, decompress it first.
+ * If the data was read into a temporary buffer,
+ * move it and free the buffer.
*/
- if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
- l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress);
- ASSERT(zio->io_data != NULL);
+ if (cb->l2rcb_data != NULL) {
+ ASSERT3U(arc_hdr_size(hdr), <, zio->io_size);
+ if (zio->io_error == 0) {
+ bcopy(cb->l2rcb_data, hdr->b_l1hdr.b_pdata,
+ arc_hdr_size(hdr));
+ }
+ /*
+ * The following must be done regardless of whether
+ * there was an error:
+ * - free the temporary buffer
+ * - point zio to the real ARC buffer
+ * - set zio size accordingly
+ * These are required because the zio is either re-used to
+ * re-issue the I/O for the block in the error case, or it
+ * is passed to arc_read_done(), which needs the real data.
+ */
+ zio_data_buf_free(cb->l2rcb_data, zio->io_size);
+ zio->io_size = zio->io_orig_size = arc_hdr_size(hdr);
+ zio->io_data = zio->io_orig_data = hdr->b_l1hdr.b_pdata;
+ }
+
+ ASSERT3P(zio->io_data, !=, NULL);
+
/*
* Check this survived the L2ARC journey.
*/
- equal = arc_cksum_equal(buf);
- if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
+ ASSERT3P(zio->io_data, ==, hdr->b_l1hdr.b_pdata);
+ zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */
+ zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */
+
+ valid_cksum = arc_cksum_is_equal(hdr, zio);
+ if (valid_cksum && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
mutex_exit(hash_lock);
- zio->io_private = buf;
- zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */
- zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */
+ zio->io_private = hdr;
arc_read_done(zio);
} else {
mutex_exit(hash_lock);
@@ -4651,7 +6685,7 @@
} else {
zio->io_error = SET_ERROR(EIO);
}
- if (!equal)
+ if (!valid_cksum)
ARCSTAT_BUMP(arcstat_l2_cksum_bad);
/*
@@ -4664,9 +6698,10 @@
ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
- zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
- buf->b_data, zio->io_size, arc_read_done, buf,
- zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
+ zio_nowait(zio_read(pio, zio->io_spa, zio->io_bp,
+ hdr->b_l1hdr.b_pdata, zio->io_size, arc_read_done,
+ hdr, zio->io_priority, cb->l2rcb_flags,
+ &cb->l2rcb_zb));
}
}
@@ -4683,36 +6718,37 @@
* the data lists. This function returns a locked list, and also returns
* the lock pointer.
*/
-static list_t *
-l2arc_list_locked(int list_num, kmutex_t **lock)
+static multilist_sublist_t *
+l2arc_sublist_lock(int list_num)
{
- list_t *list = NULL;
- int idx;
+ multilist_t *ml = NULL;
+ unsigned int idx;
- ASSERT(list_num >= 0 && list_num < 2 * ARC_BUFC_NUMLISTS);
+ ASSERT(list_num >= 0 && list_num <= 3);
- if (list_num < ARC_BUFC_NUMMETADATALISTS) {
- idx = list_num;
- list = &arc_mfu->arcs_lists[idx];
- *lock = ARCS_LOCK(arc_mfu, idx);
- } else if (list_num < ARC_BUFC_NUMMETADATALISTS * 2) {
- idx = list_num - ARC_BUFC_NUMMETADATALISTS;
- list = &arc_mru->arcs_lists[idx];
- *lock = ARCS_LOCK(arc_mru, idx);
- } else if (list_num < (ARC_BUFC_NUMMETADATALISTS * 2 +
- ARC_BUFC_NUMDATALISTS)) {
- idx = list_num - ARC_BUFC_NUMMETADATALISTS;
- list = &arc_mfu->arcs_lists[idx];
- *lock = ARCS_LOCK(arc_mfu, idx);
- } else {
- idx = list_num - ARC_BUFC_NUMLISTS;
- list = &arc_mru->arcs_lists[idx];
- *lock = ARCS_LOCK(arc_mru, idx);
+ switch (list_num) {
+ case 0:
+ ml = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
+ break;
+ case 1:
+ ml = &arc_mru->arcs_list[ARC_BUFC_METADATA];
+ break;
+ case 2:
+ ml = &arc_mfu->arcs_list[ARC_BUFC_DATA];
+ break;
+ case 3:
+ ml = &arc_mru->arcs_list[ARC_BUFC_DATA];
+ break;
}
- ASSERT(!(MUTEX_HELD(*lock)));
- mutex_enter(*lock);
- return (list);
+ /*
+ * Return a randomly-selected sublist. This is acceptable
+ * because the caller feeds only a little bit of data for each
+ * call (8MB). Subsequent calls will result in different
+ * sublists being selected.
+ */
+ idx = multilist_get_random_index(ml);
+ return (multilist_sublist_lock(ml, idx));
}
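
A toy sketch (not part of this commit) of the selection policy described above, with rand() standing in for multilist_get_random_index() and an arbitrary sublist count: each feed cycle locks one randomly chosen sublist, so successive cycles cover different sublists.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define	TOY_NUM_SUBLISTS	8

static unsigned int
toy_get_random_index(void)
{
	return ((unsigned int)rand() % TOY_NUM_SUBLISTS);
}

int
main(void)
{
	srand((unsigned int)time(NULL));

	/* Simulate a handful of feed cycles; each picks its own sublist. */
	for (int cycle = 0; cycle < 5; cycle++)
		printf("feed cycle %d scans sublist %u\n", cycle,
		    toy_get_random_index());
	return (0);
}
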
/*
@@ -4725,16 +6761,12 @@
l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
{
list_t *buflist;
- l2arc_buf_hdr_t *abl2;
- arc_buf_hdr_t *ab, *ab_prev;
+ arc_buf_hdr_t *hdr, *hdr_prev;
kmutex_t *hash_lock;
uint64_t taddr;
- buflist = dev->l2ad_buflist;
+ buflist = &dev->l2ad_buflist;
- if (buflist == NULL)
- return;
-
if (!all && dev->l2ad_first) {
/*
* This is the first sweep through the device. There is
@@ -4756,35 +6788,41 @@
uint64_t, taddr, boolean_t, all);
top:
- mutex_enter(&l2arc_buflist_mtx);
- for (ab = list_tail(buflist); ab; ab = ab_prev) {
- ab_prev = list_prev(buflist, ab);
+ mutex_enter(&dev->l2ad_mtx);
+ for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) {
+ hdr_prev = list_prev(buflist, hdr);
- hash_lock = HDR_LOCK(ab);
+ hash_lock = HDR_LOCK(hdr);
+
+ /*
+ * We cannot use mutex_enter or else we can deadlock
+ * with l2arc_write_buffers (due to swapping the order
+ * the hash lock and l2ad_mtx are taken).
+ */
if (!mutex_tryenter(hash_lock)) {
/*
* Missed the hash lock. Retry.
*/
ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
- mutex_exit(&l2arc_buflist_mtx);
+ mutex_exit(&dev->l2ad_mtx);
mutex_enter(hash_lock);
mutex_exit(hash_lock);
goto top;
}
- if (HDR_L2_WRITE_HEAD(ab)) {
+ if (HDR_L2_WRITE_HEAD(hdr)) {
/*
* We hit a write head node. Leave it for
* l2arc_write_done().
*/
- list_remove(buflist, ab);
+ list_remove(buflist, hdr);
mutex_exit(hash_lock);
continue;
}
- if (!all && ab->b_l2hdr != NULL &&
- (ab->b_l2hdr->b_daddr > taddr ||
- ab->b_l2hdr->b_daddr < dev->l2ad_hand)) {
+ if (!all && HDR_HAS_L2HDR(hdr) &&
+ (hdr->b_l2hdr.b_daddr >= taddr ||
+ hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) {
/*
* We've evicted to the target address,
* or the end of the device.
@@ -4793,64 +6831,43 @@
break;
}
- if (HDR_FREE_IN_PROGRESS(ab)) {
+ ASSERT(HDR_HAS_L2HDR(hdr));
+ if (!HDR_HAS_L1HDR(hdr)) {
+ ASSERT(!HDR_L2_READING(hdr));
/*
- * Already on the path to destruction.
- */
- mutex_exit(hash_lock);
- continue;
- }
-
- if (ab->b_state == arc_l2c_only) {
- ASSERT(!HDR_L2_READING(ab));
- /*
* This doesn't exist in the ARC. Destroy.
* arc_hdr_destroy() will call list_remove()
* and decrement arcstat_l2_size.
*/
- arc_change_state(arc_anon, ab, hash_lock);
- arc_hdr_destroy(ab);
+ arc_change_state(arc_anon, hdr, hash_lock);
+ arc_hdr_destroy(hdr);
} else {
+ ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only);
+ ARCSTAT_BUMP(arcstat_l2_evict_l1cached);
/*
* Invalidate issued or about to be issued
* reads, since we may be about to write
* over this location.
*/
- if (HDR_L2_READING(ab)) {
+ if (HDR_L2_READING(hdr)) {
ARCSTAT_BUMP(arcstat_l2_evict_reading);
- ab->b_flags |= ARC_L2_EVICTED;
+ arc_hdr_set_flags(hdr, ARC_FLAG_L2_EVICTED);
}
- /*
- * Tell ARC this no longer exists in L2ARC.
- */
- if (ab->b_l2hdr != NULL) {
- abl2 = ab->b_l2hdr;
- ARCSTAT_INCR(arcstat_l2_asize, -abl2->b_asize);
- ab->b_l2hdr = NULL;
- kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
- ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
- }
- list_remove(buflist, ab);
+ /* Ensure this header has finished being written */
+ ASSERT(!HDR_L2_WRITING(hdr));
- /*
- * This may have been leftover after a
- * failed write.
- */
- ab->b_flags &= ~ARC_L2_WRITING;
+ arc_hdr_l2hdr_destroy(hdr);
}
mutex_exit(hash_lock);
}
- mutex_exit(&l2arc_buflist_mtx);
-
- vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0);
- dev->l2ad_evict = taddr;
+ mutex_exit(&dev->l2ad_mtx);
}
/*
* Find and write ARC buffers to the L2ARC device.
*
- * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
+ * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid
* for reading until they have completed writing.
* The headroom_boost is an in-out parameter used to maintain headroom boost
* state between calls to this function.
@@ -4859,48 +6876,32 @@
* the delta by which the device hand has changed due to alignment).
*/
static uint64_t
-l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
- boolean_t *headroom_boost)
+l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
{
- arc_buf_hdr_t *ab, *ab_prev, *head;
- list_t *list;
- uint64_t write_asize, write_psize, write_sz, headroom,
- buf_compress_minsz;
- void *buf_data;
- kmutex_t *list_lock;
+ arc_buf_hdr_t *hdr, *hdr_prev, *head;
+ uint64_t write_asize, write_psize, write_sz, headroom;
boolean_t full;
l2arc_write_callback_t *cb;
zio_t *pio, *wzio;
uint64_t guid = spa_load_guid(spa);
- const boolean_t do_headroom_boost = *headroom_boost;
int try;
- ASSERT(dev->l2ad_vdev != NULL);
+ ASSERT3P(dev->l2ad_vdev, !=, NULL);
- /* Lower the flag now, we might want to raise it again later. */
- *headroom_boost = B_FALSE;
-
pio = NULL;
write_sz = write_asize = write_psize = 0;
full = B_FALSE;
- head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE);
- head->b_flags |= ARC_L2_WRITE_HEAD;
+ head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
+ arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR);
ARCSTAT_BUMP(arcstat_l2_write_buffer_iter);
/*
- * We will want to try to compress buffers that are at least 2x the
- * device sector size.
- */
- buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
-
- /*
* Copy buffers for L2ARC writing.
*/
- mutex_enter(&l2arc_buflist_mtx);
- for (try = 0; try < 2 * ARC_BUFC_NUMLISTS; try++) {
+ for (try = 0; try <= 3; try++) {
+ multilist_sublist_t *mls = l2arc_sublist_lock(try);
uint64_t passed_sz = 0;
- list = l2arc_list_locked(try, &list_lock);
ARCSTAT_BUMP(arcstat_l2_write_buffer_list_iter);
/*
@@ -4910,28 +6911,27 @@
* head of the ARC lists rather than the tail.
*/
if (arc_warm == B_FALSE)
- ab = list_head(list);
+ hdr = multilist_sublist_head(mls);
else
- ab = list_tail(list);
- if (ab == NULL)
+ hdr = multilist_sublist_tail(mls);
+ if (hdr == NULL)
ARCSTAT_BUMP(arcstat_l2_write_buffer_list_null_iter);
headroom = target_sz * l2arc_headroom;
- if (do_headroom_boost)
+ if (zfs_compressed_arc_enabled)
headroom = (headroom * l2arc_headroom_boost) / 100;
- for (; ab; ab = ab_prev) {
- l2arc_buf_hdr_t *l2hdr;
+ for (; hdr; hdr = hdr_prev) {
kmutex_t *hash_lock;
- uint64_t buf_sz;
if (arc_warm == B_FALSE)
- ab_prev = list_next(list, ab);
+ hdr_prev = multilist_sublist_next(mls, hdr);
else
- ab_prev = list_prev(list, ab);
- ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned, ab->b_size);
+ hdr_prev = multilist_sublist_prev(mls, hdr);
+ ARCSTAT_INCR(arcstat_l2_write_buffer_bytes_scanned,
+ HDR_GET_LSIZE(hdr));
- hash_lock = HDR_LOCK(ab);
+ hash_lock = HDR_LOCK(hdr);
if (!mutex_tryenter(hash_lock)) {
ARCSTAT_BUMP(arcstat_l2_write_trylock_fail);
/*
@@ -4940,7 +6940,7 @@
continue;
}
- passed_sz += ab->b_size;
+ passed_sz += HDR_GET_LSIZE(hdr);
if (passed_sz > headroom) {
/*
* Searched too far.
@@ -4950,12 +6950,27 @@
break;
}
- if (!l2arc_write_eligible(guid, ab)) {
+ if (!l2arc_write_eligible(guid, hdr)) {
mutex_exit(hash_lock);
continue;
}
- if ((write_sz + ab->b_size) > target_sz) {
+ /*
+ * We rely on the L1 portion of the header below, so
+ * it's invalid for this header to have been evicted out
+ * of the ghost cache, prior to being written out. The
+ * ARC_FLAG_L2_WRITING bit ensures this won't happen.
+ */
+ ASSERT(HDR_HAS_L1HDR(hdr));
+
+ ASSERT3U(HDR_GET_PSIZE(hdr), >, 0);
+ ASSERT3P(hdr->b_l1hdr.b_pdata, !=, NULL);
+ ASSERT3U(arc_hdr_size(hdr), >, 0);
+ uint64_t size = arc_hdr_size(hdr);
+ uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev,
+ size);
+
+ if ((write_psize + asize) > target_sz) {
full = B_TRUE;
mutex_exit(hash_lock);
ARCSTAT_BUMP(arcstat_l2_write_full);
@@ -4968,7 +6983,9 @@
* l2arc_write_done() can find where the
* write buffers begin without searching.
*/
- list_insert_head(dev->l2ad_buflist, head);
+ mutex_enter(&dev->l2ad_mtx);
+ list_insert_head(&dev->l2ad_buflist, head);
+ mutex_exit(&dev->l2ad_mtx);
cb = kmem_alloc(
sizeof (l2arc_write_callback_t), KM_SLEEP);
@@ -4979,43 +6996,63 @@
ARCSTAT_BUMP(arcstat_l2_write_pios);
}
- /*
- * Create and add a new L2ARC header.
- */
- l2hdr = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
- l2hdr->b_dev = dev;
- ab->b_flags |= ARC_L2_WRITING;
+ hdr->b_l2hdr.b_dev = dev;
+ hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
+ arc_hdr_set_flags(hdr,
+ ARC_FLAG_L2_WRITING | ARC_FLAG_HAS_L2HDR);
+ mutex_enter(&dev->l2ad_mtx);
+ list_insert_head(&dev->l2ad_buflist, hdr);
+ mutex_exit(&dev->l2ad_mtx);
+
+ (void) refcount_add_many(&dev->l2ad_alloc, size, hdr);
+
/*
- * Temporarily stash the data buffer in b_tmp_cdata.
- * The subsequent write step will pick it up from
- * there. This is because can't access ab->b_buf
- * without holding the hash_lock, which we in turn
- * can't access without holding the ARC list locks
- * (which we want to avoid during compression/writing).
+ * Normally the L2ARC can use the hdr's data, but if
+ * we're sharing data between the hdr and one of its
+ * bufs, L2ARC needs its own copy of the data so that
+ * the ZIO below can't race with the buf consumer. To
+ * ensure that this copy will be available for the
+ * lifetime of the ZIO and be cleaned up afterwards, we
+ * add it to the l2arc_free_on_write queue.
*/
- l2hdr->b_compress = ZIO_COMPRESS_OFF;
- l2hdr->b_asize = ab->b_size;
- l2hdr->b_tmp_cdata = ab->b_buf->b_data;
+ void *to_write;
+ if (!HDR_SHARED_DATA(hdr) && size == asize) {
+ to_write = hdr->b_l1hdr.b_pdata;
+ } else {
+ arc_buf_contents_t type = arc_buf_type(hdr);
+ if (type == ARC_BUFC_METADATA) {
+ to_write = zio_buf_alloc(asize);
+ } else {
+ ASSERT3U(type, ==, ARC_BUFC_DATA);
+ to_write = zio_data_buf_alloc(asize);
+ }
- buf_sz = ab->b_size;
- ab->b_l2hdr = l2hdr;
+ bcopy(hdr->b_l1hdr.b_pdata, to_write, size);
+ if (asize != size)
+ bzero(to_write + size, asize - size);
+ l2arc_free_data_on_write(to_write, asize, type);
+ }
+ wzio = zio_write_phys(pio, dev->l2ad_vdev,
+ hdr->b_l2hdr.b_daddr, asize, to_write,
+ ZIO_CHECKSUM_OFF, NULL, hdr,
+ ZIO_PRIORITY_ASYNC_WRITE,
+ ZIO_FLAG_CANFAIL, B_FALSE);
- list_insert_head(dev->l2ad_buflist, ab);
+ write_sz += HDR_GET_LSIZE(hdr);
+ DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
+ zio_t *, wzio);
- /*
- * Compute and store the buffer cksum before
- * writing. On debug the cksum is verified first.
- */
- arc_cksum_verify(ab->b_buf);
- arc_cksum_compute(ab->b_buf, B_TRUE);
+ write_asize += size;
+ write_psize += asize;
+ dev->l2ad_hand += asize;
mutex_exit(hash_lock);
- write_sz += buf_sz;
+ (void) zio_nowait(wzio);
}
- mutex_exit(list_lock);
+ multilist_sublist_unlock(mls);
if (full == B_TRUE)
break;
@@ -5024,79 +7061,17 @@
/* No buffers selected for writing? */
if (pio == NULL) {
ASSERT0(write_sz);
- mutex_exit(&l2arc_buflist_mtx);
- kmem_cache_free(hdr_cache, head);
+ ASSERT(!HDR_HAS_L1HDR(head));
+ kmem_cache_free(hdr_l2only_cache, head);
return (0);
}
- /*
- * Now start writing the buffers. We're starting at the write head
- * and work backwards, retracing the course of the buffer selector
- * loop above.
- */
- for (ab = list_prev(dev->l2ad_buflist, head); ab;
- ab = list_prev(dev->l2ad_buflist, ab)) {
- l2arc_buf_hdr_t *l2hdr;
- uint64_t buf_sz;
-
- /*
- * We shouldn't need to lock the buffer here, since we flagged
- * it as ARC_L2_WRITING in the previous step, but we must take
- * care to only access its L2 cache parameters. In particular,
- * ab->b_buf may be invalid by now due to ARC eviction.
- */
- l2hdr = ab->b_l2hdr;
- l2hdr->b_daddr = dev->l2ad_hand;
-
- if ((ab->b_flags & ARC_L2COMPRESS) &&
- l2hdr->b_asize >= buf_compress_minsz) {
- if (l2arc_compress_buf(l2hdr)) {
- /*
- * If compression succeeded, enable headroom
- * boost on the next scan cycle.
- */
- *headroom_boost = B_TRUE;
- }
- }
-
- /*
- * Pick up the buffer data we had previously stashed away
- * (and now potentially also compressed).
- */
- buf_data = l2hdr->b_tmp_cdata;
- buf_sz = l2hdr->b_asize;
-
- /* Compression may have squashed the buffer to zero length. */
- if (buf_sz != 0) {
- uint64_t buf_p_sz;
-
- wzio = zio_write_phys(pio, dev->l2ad_vdev,
- dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
- NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
- ZIO_FLAG_CANFAIL, B_FALSE);
-
- DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
- zio_t *, wzio);
- (void) zio_nowait(wzio);
-
- write_asize += buf_sz;
- /*
- * Keep the clock hand suitably device-aligned.
- */
- buf_p_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
- write_psize += buf_p_sz;
- dev->l2ad_hand += buf_p_sz;
- }
- }
-
- mutex_exit(&l2arc_buflist_mtx);
-
- ASSERT3U(write_asize, <=, target_sz);
+ ASSERT3U(write_psize, <=, target_sz);
ARCSTAT_BUMP(arcstat_l2_writes_sent);
ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize);
ARCSTAT_INCR(arcstat_l2_size, write_sz);
ARCSTAT_INCR(arcstat_l2_asize, write_asize);
- vdev_space_update(dev->l2ad_vdev, write_psize, 0, 0);
+ vdev_space_update(dev->l2ad_vdev, write_asize, 0, 0);
/*
* Bump device hand to the device start if it is approaching the end.
@@ -5103,10 +7078,7 @@
* l2arc_evict() will already have evicted ahead for this case.
*/
if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
- vdev_space_update(dev->l2ad_vdev,
- dev->l2ad_end - dev->l2ad_hand, 0, 0);
dev->l2ad_hand = dev->l2ad_start;
- dev->l2ad_evict = dev->l2ad_start;
dev->l2ad_first = B_FALSE;
}
@@ -5118,152 +7090,6 @@
}
/*
- * Compresses an L2ARC buffer.
- * The data to be compressed must be prefilled in l2hdr->b_tmp_cdata and its
- * size in l2hdr->b_asize. This routine tries to compress the data and
- * depending on the compression result there are three possible outcomes:
- * *) The buffer was incompressible. The original l2hdr contents were left
- * untouched and are ready for writing to an L2 device.
- * *) The buffer was all-zeros, so there is no need to write it to an L2
- * device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is
- * set to zero and b_compress is set to ZIO_COMPRESS_EMPTY.
- * *) Compression succeeded and b_tmp_cdata was replaced with a temporary
- * data buffer which holds the compressed data to be written, and b_asize
- * tells us how much data there is. b_compress is set to the appropriate
- * compression algorithm. Once writing is done, invoke
- * l2arc_release_cdata_buf on this l2hdr to free this temporary buffer.
- *
- * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
- * buffer was incompressible).
- */
-static boolean_t
-l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr)
-{
- void *cdata;
- size_t csize, len;
-
- ASSERT(l2hdr->b_compress == ZIO_COMPRESS_OFF);
- ASSERT(l2hdr->b_tmp_cdata != NULL);
-
- len = l2hdr->b_asize;
- cdata = zio_data_buf_alloc(len);
- csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata,
- cdata, l2hdr->b_asize);
-
- if (csize == 0) {
- /* zero block, indicate that there's nothing to write */
- zio_data_buf_free(cdata, len);
- l2hdr->b_compress = ZIO_COMPRESS_EMPTY;
- l2hdr->b_asize = 0;
- l2hdr->b_tmp_cdata = NULL;
- ARCSTAT_BUMP(arcstat_l2_compress_zeros);
- return (B_TRUE);
- } else if (csize > 0 && csize < len) {
- /*
- * Compression succeeded, we'll keep the cdata around for
- * writing and release it afterwards.
- */
- l2hdr->b_compress = ZIO_COMPRESS_LZ4;
- l2hdr->b_asize = csize;
- l2hdr->b_tmp_cdata = cdata;
- ARCSTAT_BUMP(arcstat_l2_compress_successes);
- return (B_TRUE);
- } else {
- /*
- * Compression failed, release the compressed buffer.
- * l2hdr will be left unmodified.
- */
- zio_data_buf_free(cdata, len);
- ARCSTAT_BUMP(arcstat_l2_compress_failures);
- return (B_FALSE);
- }
-}
-
-/*
- * Decompresses a zio read back from an l2arc device. On success, the
- * underlying zio's io_data buffer is overwritten by the uncompressed
- * version. On decompression error (corrupt compressed stream), the
- * zio->io_error value is set to signal an I/O error.
- *
- * Please note that the compressed data stream is not checksummed, so
- * if the underlying device is experiencing data corruption, we may feed
- * corrupt data to the decompressor, so the decompressor needs to be
- * able to handle this situation (LZ4 does).
- */
-static void
-l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
-{
- ASSERT(L2ARC_IS_VALID_COMPRESS(c));
-
- if (zio->io_error != 0) {
- /*
- * An io error has occured, just restore the original io
- * size in preparation for a main pool read.
- */
- zio->io_orig_size = zio->io_size = hdr->b_size;
- return;
- }
-
- if (c == ZIO_COMPRESS_EMPTY) {
- /*
- * An empty buffer results in a null zio, which means we
- * need to fill its io_data after we're done restoring the
- * buffer's contents.
- */
- ASSERT(hdr->b_buf != NULL);
- bzero(hdr->b_buf->b_data, hdr->b_size);
- zio->io_data = zio->io_orig_data = hdr->b_buf->b_data;
- } else {
- ASSERT(zio->io_data != NULL);
- /*
- * We copy the compressed data from the start of the arc buffer
- * (the zio_read will have pulled in only what we need, the
- * rest is garbage which we will overwrite at decompression)
- * and then decompress back to the ARC data buffer. This way we
- * can minimize copying by simply decompressing back over the
- * original compressed data (rather than decompressing to an
- * aux buffer and then copying back the uncompressed buffer,
- * which is likely to be much larger).
- */
- uint64_t csize;
- void *cdata;
-
- csize = zio->io_size;
- cdata = zio_data_buf_alloc(csize);
- bcopy(zio->io_data, cdata, csize);
- if (zio_decompress_data(c, cdata, zio->io_data, csize,
- hdr->b_size) != 0)
- zio->io_error = EIO;
- zio_data_buf_free(cdata, csize);
- }
-
- /* Restore the expected uncompressed IO size. */
- zio->io_orig_size = zio->io_size = hdr->b_size;
-}
-
-/*
- * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
- * This buffer serves as a temporary holder of compressed data while
- * the buffer entry is being written to an l2arc device. Once that is
- * done, we can dispose of it.
- */
-static void
-l2arc_release_cdata_buf(arc_buf_hdr_t *ab)
-{
- l2arc_buf_hdr_t *l2hdr = ab->b_l2hdr;
-
- if (l2hdr->b_compress == ZIO_COMPRESS_LZ4) {
- /*
- * If the data was compressed, then we've allocated a
- * temporary buffer for it, so now we need to release it.
- */
- ASSERT(l2hdr->b_tmp_cdata != NULL);
- zio_data_buf_free(l2hdr->b_tmp_cdata, ab->b_size);
- }
- l2hdr->b_tmp_cdata = NULL;
-}
-
-/*
* This thread feeds the L2ARC at regular intervals. This is the beating
* heart of the L2ARC.
*/
@@ -5275,7 +7101,6 @@
spa_t *spa;
uint64_t size, wrote;
clock_t begin, next = ddi_get_lbolt();
- boolean_t headroom_boost = B_FALSE;
CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
@@ -5313,7 +7138,7 @@
continue;
spa = dev->l2ad_spa;
- ASSERT(spa != NULL);
+ ASSERT3P(spa, !=, NULL);
/*
* If the pool is read-only then force the feed thread to
@@ -5346,7 +7171,7 @@
/*
* Write ARC buffers.
*/
- wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
+ wrote = l2arc_write_buffers(spa, dev, size);
/*
* Calculate interval between writes.
@@ -5388,6 +7213,8 @@
ASSERT(!l2arc_vdev_present(vd));
+ vdev_ashift_optimize(vd);
+
/*
* Create a new l2arc device entry.
*/
@@ -5397,19 +7224,19 @@
adddev->l2ad_start = VDEV_LABEL_START_SIZE;
adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
adddev->l2ad_hand = adddev->l2ad_start;
- adddev->l2ad_evict = adddev->l2ad_start;
adddev->l2ad_first = B_TRUE;
adddev->l2ad_writing = B_FALSE;
+ mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL);
/*
* This is a list of all ARC buffers that are still valid on the
* device.
*/
- adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
- list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
- offsetof(arc_buf_hdr_t, b_l2node));
+ list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node));
vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
+ refcount_create(&adddev->l2ad_alloc);
/*
* Add device to global list
@@ -5439,7 +7266,7 @@
break;
}
}
- ASSERT(remdev != NULL);
+ ASSERT3P(remdev, !=, NULL);
/*
* Remove device from global list
@@ -5453,8 +7280,9 @@
* Clear all buflists and ARC references. L2ARC device flush.
*/
l2arc_evict(remdev, 0, B_TRUE);
- list_destroy(remdev->l2ad_buflist);
- kmem_free(remdev->l2ad_buflist, sizeof (list_t));
+ list_destroy(&remdev->l2ad_buflist);
+ mutex_destroy(&remdev->l2ad_mtx);
+ refcount_destroy(&remdev->l2ad_alloc);
kmem_free(remdev, sizeof (l2arc_dev_t));
}
@@ -5469,7 +7297,6 @@
mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
- mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
l2arc_dev_list = &L2ARC_dev_list;
@@ -5494,7 +7321,6 @@
mutex_destroy(&l2arc_feed_thr_lock);
cv_destroy(&l2arc_feed_thr_cv);
mutex_destroy(&l2arc_dev_mtx);
- mutex_destroy(&l2arc_buflist_mtx);
mutex_destroy(&l2arc_free_on_write_mtx);
list_destroy(l2arc_dev_list);
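
The accounting introduced above distinguishes three sizes per header: its logical size (write_sz, from HDR_GET_LSIZE()), its in-memory size (write_asize, from arc_hdr_size()) and the space actually allocated on the cache device (write_psize, from vdev_psize_to_asize()), with the device hand advancing by the allocated size. A minimal sketch of the rounding involved, assuming a leaf vdev whose smallest allocation is 1 << ashift; the helper name here is illustrative, not the in-tree implementation:

#include <stdint.h>

/* hypothetical stand-in for vdev_psize_to_asize() on a leaf vdev */
static uint64_t
roundup_to_ashift(uint64_t psize, unsigned int ashift)
{
	uint64_t align = 1ULL << ashift;

	/* round the physical size up to the vdev's allocation granularity */
	return ((psize + align - 1) & ~(align - 1));
}

/*
 * roundup_to_ashift(5632, 12) == 8192: a 5.5 KB compressed header payload
 * is charged as a full 8 KB allocation on a 4 KB-sector cache device,
 * which is what write_psize and l2ad_hand track in the loop above.
 */
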
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bpobj.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,7 +21,8 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
*/
#include <sys/bpobj.h>
@@ -37,22 +38,20 @@
uint64_t
bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx)
{
- zfeature_info_t *empty_bpobj_feat =
- &spa_feature_table[SPA_FEATURE_EMPTY_BPOBJ];
spa_t *spa = dmu_objset_spa(os);
dsl_pool_t *dp = dmu_objset_pool(os);
- if (spa_feature_is_enabled(spa, empty_bpobj_feat)) {
- if (!spa_feature_is_active(spa, empty_bpobj_feat)) {
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
+ if (!spa_feature_is_active(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
ASSERT0(dp->dp_empty_bpobj);
dp->dp_empty_bpobj =
- bpobj_alloc(os, SPA_MAXBLOCKSIZE, tx);
+ bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx);
VERIFY(zap_add(os,
DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
&dp->dp_empty_bpobj, tx) == 0);
}
- spa_feature_incr(spa, empty_bpobj_feat, tx);
+ spa_feature_incr(spa, SPA_FEATURE_EMPTY_BPOBJ, tx);
ASSERT(dp->dp_empty_bpobj != 0);
return (dp->dp_empty_bpobj);
} else {
@@ -63,12 +62,11 @@
void
bpobj_decr_empty(objset_t *os, dmu_tx_t *tx)
{
- zfeature_info_t *empty_bpobj_feat =
- &spa_feature_table[SPA_FEATURE_EMPTY_BPOBJ];
dsl_pool_t *dp = dmu_objset_pool(os);
- spa_feature_decr(dmu_objset_spa(os), empty_bpobj_feat, tx);
- if (!spa_feature_is_active(dmu_objset_spa(os), empty_bpobj_feat)) {
+ spa_feature_decr(dmu_objset_spa(os), SPA_FEATURE_EMPTY_BPOBJ, tx);
+ if (!spa_feature_is_active(dmu_objset_spa(os),
+ SPA_FEATURE_EMPTY_BPOBJ)) {
VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset,
DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_EMPTY_BPOBJ, tx));
@@ -196,6 +194,13 @@
mutex_destroy(&bpo->bpo_lock);
}
+static boolean_t
+bpobj_hasentries(bpobj_t *bpo)
+{
+ return (bpo->bpo_phys->bpo_num_blkptrs != 0 ||
+ (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_num_subobjs != 0));
+}
+
static int
bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx,
boolean_t free)
@@ -253,9 +258,8 @@
dbuf = NULL;
}
if (free) {
- i++;
VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, bpo->bpo_object,
- i * sizeof (blkptr_t), -1ULL, tx));
+ (i + 1) * sizeof (blkptr_t), -1ULL, tx));
}
if (err || !bpo->bpo_havesubobj || bpo->bpo_phys->bpo_subobjs == 0)
goto out;
@@ -266,6 +270,7 @@
mutex_exit(&bpo->bpo_lock);
return (err);
}
+ ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ);
epb = doi.doi_data_block_size / sizeof (uint64_t);
for (i = bpo->bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
@@ -297,8 +302,10 @@
if (free) {
err = bpobj_space(&sublist,
&used_before, &comp_before, &uncomp_before);
- if (err)
+ if (err != 0) {
+ bpobj_close(&sublist);
break;
+ }
}
err = bpobj_iterate_impl(&sublist, func, arg, tx, free);
if (free) {
@@ -335,9 +342,11 @@
out:
/* If there are no entries, there should be no bytes. */
- ASSERT(bpo->bpo_phys->bpo_num_blkptrs > 0 ||
- (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_num_subobjs > 0) ||
- bpo->bpo_phys->bpo_bytes == 0);
+ if (!bpobj_hasentries(bpo)) {
+ ASSERT0(bpo->bpo_phys->bpo_bytes);
+ ASSERT0(bpo->bpo_phys->bpo_comp);
+ ASSERT0(bpo->bpo_phys->bpo_uncomp);
+ }
mutex_exit(&bpo->bpo_lock);
return (err);
@@ -380,7 +389,7 @@
VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));
- if (used == 0) {
+ if (!bpobj_hasentries(&subbpo)) {
/* No point in having an empty subobj. */
bpobj_close(&subbpo);
bpobj_free(bpo->bpo_os, subobj, tx);
@@ -390,7 +399,8 @@
dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
if (bpo->bpo_phys->bpo_subobjs == 0) {
bpo->bpo_phys->bpo_subobjs = dmu_object_alloc(bpo->bpo_os,
- DMU_OT_BPOBJ_SUBOBJ, SPA_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx);
+ DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
+ DMU_OT_NONE, 0, tx);
}
dmu_object_info_t doi;
@@ -456,13 +466,29 @@
ASSERT(!BP_IS_HOLE(bp));
ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);
+ if (BP_IS_EMBEDDED(bp)) {
+ /*
+ * The bpobj will compress better without the payload.
+ *
+ * Note that we store EMBEDDED bp's because they have an
+ * uncompressed size, which must be accounted for. An
+ * alternative would be to add their size to bpo_uncomp
+ * without storing the bp, but that would create additional
+ * complications: bpo_uncomp would be inconsistent with the
+ * set of BP's stored, and bpobj_iterate() wouldn't visit
+ * all the space accounted for in the bpobj.
+ */
+ bzero(&stored_bp, sizeof (stored_bp));
+ stored_bp.blk_prop = bp->blk_prop;
+ stored_bp.blk_birth = bp->blk_birth;
+ } else if (!BP_GET_DEDUP(bp)) {
+ /* The bpobj will compress better without the checksum */
+ bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum));
+ }
+
/* We never need the fill count. */
stored_bp.blk_fill = 0;
- /* The bpobj will compress better if we can leave off the checksum */
- if (!BP_GET_DEDUP(bp))
- bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum));
-
mutex_enter(&bpo->bpo_lock);
offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp);
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/bptree.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,7 +21,8 @@
*/
/*
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
*/
#include <sys/arc.h>
@@ -66,7 +67,7 @@
bptree_phys_t *bt;
obj = dmu_object_alloc(os, DMU_OTN_UINT64_METADATA,
- SPA_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA,
+ SPA_OLD_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA,
sizeof (bptree_phys_t), tx);
/*
@@ -103,6 +104,20 @@
return (dmu_object_free(os, obj, tx));
}
+boolean_t
+bptree_is_empty(objset_t *os, uint64_t obj)
+{
+ dmu_buf_t *db;
+ bptree_phys_t *bt;
+ boolean_t rv;
+
+ VERIFY0(dmu_bonus_hold(os, obj, FTAG, &db));
+ bt = db->db_data;
+ rv = (bt->bt_begin == bt->bt_end);
+ dmu_buf_rele(db, FTAG);
+ return (rv);
+}
+
void
bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx)
@@ -109,7 +124,7 @@
{
dmu_buf_t *db;
bptree_phys_t *bt;
- bptree_entry_phys_t bte;
+ bptree_entry_phys_t bte = { 0 };
/*
* bptree objects are in the pool mos, therefore they can only be
@@ -123,7 +138,6 @@
bte.be_birth_txg = birth_txg;
bte.be_bp = *bp;
- bzero(&bte.be_zb, sizeof (bte.be_zb));
dmu_write(os, obj, bt->bt_end * sizeof (bte), sizeof (bte), &bte, tx);
dmu_buf_will_dirty(db, tx);
@@ -137,12 +151,12 @@
/* ARGSUSED */
static int
bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
- const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
+ const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
int err;
struct bptree_args *ba = arg;
- if (bp == NULL)
+ if (bp == NULL || BP_IS_HOLE(bp))
return (0);
err = ba->ba_func(ba->ba_arg, bp, ba->ba_tx);
@@ -154,10 +168,27 @@
return (err);
}
+/*
+ * If "free" is set:
+ * - It is assumed that "func" will be freeing the block pointers.
+ * - If "func" returns nonzero, the bookmark will be remembered and
+ * iteration will be restarted from this point on next invocation.
+ * - If an i/o error is encountered (e.g. "func" returns EIO or ECKSUM),
+ * bptree_iterate will remember the bookmark, continue traversing
+ * any additional entries, and return 0.
+ *
+ * If "free" is not set, traversal will stop and return an error if
+ * an i/o error is encountered.
+ *
+ * In either case, if zfs_free_leak_on_eio is set, i/o errors will be
+ * ignored and traversal will continue (i.e. TRAVERSE_HARD will be passed to
+ * traverse_dataset_destroyed()).
+ */
int
bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func,
void *arg, dmu_tx_t *tx)
{
+ boolean_t ioerr = B_FALSE;
int err;
uint64_t i;
dmu_buf_t *db;
@@ -181,20 +212,33 @@
err = 0;
for (i = ba.ba_phys->bt_begin; i < ba.ba_phys->bt_end; i++) {
bptree_entry_phys_t bte;
+ int flags = TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST;
- ASSERT(!free || i == ba.ba_phys->bt_begin);
-
err = dmu_read(os, obj, i * sizeof (bte), sizeof (bte),
&bte, DMU_READ_NO_PREFETCH);
if (err != 0)
break;
+ if (zfs_free_leak_on_eio)
+ flags |= TRAVERSE_HARD;
+ zfs_dbgmsg("bptree index %lld: traversing from min_txg=%lld "
+ "bookmark %lld/%lld/%lld/%lld",
+ (longlong_t)i,
+ (longlong_t)bte.be_birth_txg,
+ (longlong_t)bte.be_zb.zb_objset,
+ (longlong_t)bte.be_zb.zb_object,
+ (longlong_t)bte.be_zb.zb_level,
+ (longlong_t)bte.be_zb.zb_blkid);
err = traverse_dataset_destroyed(os->os_spa, &bte.be_bp,
- bte.be_birth_txg, &bte.be_zb,
- TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST,
+ bte.be_birth_txg, &bte.be_zb, flags,
bptree_visit_cb, &ba);
if (free) {
- ASSERT(err == 0 || err == ERESTART);
+ /*
+ * The callback has freed the visited block pointers.
+ * Record our traversal progress on disk, either by
+ * updating this record's bookmark, or by logically
+ * removing this record by advancing bt_begin.
+ */
if (err != 0) {
/* save bookmark for future resume */
ASSERT3U(bte.be_zb.zb_objset, ==,
@@ -202,19 +246,51 @@
ASSERT0(bte.be_zb.zb_level);
dmu_write(os, obj, i * sizeof (bte),
sizeof (bte), &bte, tx);
- break;
- } else {
+ if (err == EIO || err == ECKSUM ||
+ err == ENXIO) {
+ /*
+ * Skip the rest of this tree and
+ * continue on to the next entry.
+ */
+ err = 0;
+ ioerr = B_TRUE;
+ } else {
+ break;
+ }
+ } else if (ioerr) {
+ /*
+ * This entry is finished, but there were
+ * i/o errors on previous entries, so we
+ * can't adjust bt_begin. Set this entry's
+ * be_birth_txg such that it will be
+ * treated as a no-op in future traversals.
+ */
+ bte.be_birth_txg = UINT64_MAX;
+ dmu_write(os, obj, i * sizeof (bte),
+ sizeof (bte), &bte, tx);
+ }
+
+ if (!ioerr) {
ba.ba_phys->bt_begin++;
(void) dmu_free_range(os, obj,
i * sizeof (bte), sizeof (bte), tx);
}
+ } else if (err != 0) {
+ break;
}
}
- ASSERT(!free || err != 0 || ba.ba_phys->bt_begin == ba.ba_phys->bt_end);
+ ASSERT(!free || err != 0 || ioerr ||
+ ba.ba_phys->bt_begin == ba.ba_phys->bt_end);
/* if all blocks are free there should be no used space */
if (ba.ba_phys->bt_begin == ba.ba_phys->bt_end) {
+ if (zfs_free_leak_on_eio) {
+ ba.ba_phys->bt_bytes = 0;
+ ba.ba_phys->bt_comp = 0;
+ ba.ba_phys->bt_uncomp = 0;
+ }
+
ASSERT0(ba.ba_phys->bt_bytes);
ASSERT0(ba.ba_phys->bt_comp);
ASSERT0(ba.ba_phys->bt_uncomp);
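
To make the contract documented above bptree_iterate() concrete, here is a hedged sketch of a caller and its bptree_itor_t callback. The callback signature follows the ba_func invocation in this file; the state structure, its field, and the ERESTART use are invented for illustration:

struct example_state {
	boolean_t es_out_of_time;	/* hypothetical time-slice flag */
};

/* frees (or accounts for) one block pointer per call */
static int
example_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
	struct example_state *es = arg;

	if (es->es_out_of_time) {
		/* nonzero return: the bookmark is saved for a later resume */
		return (SET_ERROR(ERESTART));
	}
	/* ... free or account for *bp under tx ... */
	return (0);
}

	/* in sync context, with free == B_TRUE: */
	struct example_state es = { .es_out_of_time = B_FALSE };
	int err = bptree_iterate(os, obj, B_TRUE, example_free_cb, &es, tx);
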
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -22,12 +22,16 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
*/
#include <sys/zfs_context.h>
#include <sys/dmu.h>
+#include <sys/dmu_send.h>
#include <sys/dmu_impl.h>
#include <sys/dbuf.h>
#include <sys/dmu_objset.h>
@@ -39,8 +43,19 @@
#include <sys/dmu_zfetch.h>
#include <sys/sa.h>
#include <sys/sa_impl.h>
+#include <sys/zfeature.h>
+#include <sys/blkptr.h>
+#include <sys/range_tree.h>
+#include <sys/callb.h>
-static void dbuf_destroy(dmu_buf_impl_t *db);
+uint_t zfs_dbuf_evict_key;
+
+/*
+ * Number of times that zfs_free_range() took the slow path while doing
+ * a zfs receive. A nonzero value indicates a potential performance problem.
+ */
+uint64_t zfs_free_range_recv_miss;
+
static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
@@ -47,8 +62,76 @@
/*
* Global data structures and functions for the dbuf cache.
*/
-static kmem_cache_t *dbuf_cache;
+static kmem_cache_t *dbuf_kmem_cache;
+static taskq_t *dbu_evict_taskq;
+static kthread_t *dbuf_cache_evict_thread;
+static kmutex_t dbuf_evict_lock;
+static kcondvar_t dbuf_evict_cv;
+static boolean_t dbuf_evict_thread_exit;
+
+/*
+ * LRU cache of dbufs. The dbuf cache maintains a list of dbufs that
+ * are not currently held but have been recently released. These dbufs
+ * are not eligible for arc eviction until they are aged out of the cache.
+ * Dbufs are added to the dbuf cache once the last hold is released. If a
+ * dbuf is later accessed and still exists in the dbuf cache, then it will
+ * be removed from the cache and later re-added to the head of the cache.
+ * Dbufs that are aged out of the cache will be immediately destroyed and
+ * become eligible for arc eviction.
+ */
+static multilist_t dbuf_cache;
+static refcount_t dbuf_cache_size;
+uint64_t dbuf_cache_max_bytes = 100 * 1024 * 1024;
+
+/* Cap the size of the dbuf cache to log2 fraction of arc size. */
+int dbuf_cache_max_shift = 5;
+
+/*
+ * The dbuf cache uses a three-stage eviction policy:
+ * - A low water marker designates when the dbuf eviction thread
+ * should stop evicting from the dbuf cache.
+ * - When we reach the maximum size (aka mid water mark), we
+ * signal the eviction thread to run.
+ * - The high water mark indicates when the eviction thread
+ * is unable to keep up with the incoming load and eviction must
+ * happen in the context of the calling thread.
+ *
+ * The dbuf cache:
+ * (max size)
+ * low water mid water hi water
+ * +----------------------------------------+----------+----------+
+ * | | | |
+ * | | | |
+ * | | | |
+ * | | | |
+ * +----------------------------------------+----------+----------+
+ * stop signal evict
+ * evicting eviction directly
+ * thread
+ *
+ * The high and low water marks indicate the operating range for the eviction
+ * thread. The low water mark is, by default, 90% of the total size of the
+ * cache and the high water mark is at 110% (both of these percentages can be
+ * changed by setting dbuf_cache_lowater_pct and dbuf_cache_hiwater_pct,
+ * respectively). The eviction thread will try to ensure that the cache remains
+ * within this range by waking up every second and checking if the cache is
+ * above the low water mark. The thread can also be woken up by callers adding
+ * elements into the cache if the cache is larger than the mid water (i.e. max
+ * cache size). Once the eviction thread is woken up and eviction is required,
+ * it will continue evicting buffers until it's able to reduce the cache size
+ * to the low water mark. If the cache size continues to grow and hits the high
+ * water mark, then callers adding elements to the cache will begin to evict
+ * directly from the cache until the cache is no longer above the high water
+ * mark.
+ */
+
+/*
+ * The percentage above and below the maximum cache size.
+ */
+uint_t dbuf_cache_hiwater_pct = 10;
+uint_t dbuf_cache_lowater_pct = 10;
+
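
A quick numeric illustration of how these two tunables combine with dbuf_cache_max_bytes; the helpers that actually apply them (dbuf_cache_above_hiwater() and dbuf_cache_above_lowater()) are added a little further down in this hunk, and the figures below simply assume the defaults shown above:

	/*
	 * dbuf_cache_max_bytes = 100 MB, both percentages = 10:
	 *   high water = 100 MB + 10 MB = 110 MB  (callers evict directly)
	 *   low water  = 100 MB - 10 MB =  90 MB  (the evict thread stops here)
	 */
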
/* ARGSUSED */
static int
dbuf_cons(void *vdb, void *unused, int kmflag)
@@ -58,7 +141,9 @@
mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
+ multilist_link_init(&db->db_cache_link);
refcount_create(&db->db_holds);
+
return (0);
}
@@ -69,6 +154,7 @@
dmu_buf_impl_t *db = vdb;
mutex_destroy(&db->db_mtx);
cv_destroy(&db->db_changed);
+ ASSERT(!multilist_link_active(&db->db_cache_link));
refcount_destroy(&db->db_holds);
}
@@ -98,8 +184,6 @@
return (crc);
}
-#define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid);
-
#define DBUF_EQUAL(dbuf, os, obj, level, blkid) \
((dbuf)->db.db_object == (obj) && \
(dbuf)->db_objset == (os) && \
@@ -107,12 +191,10 @@
(dbuf)->db_blkid == (blkid))
dmu_buf_impl_t *
-dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
+dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid)
{
dbuf_hash_table_t *h = &dbuf_hash_table;
- objset_t *os = dn->dn_objset;
- uint64_t obj = dn->dn_object;
- uint64_t hv = DBUF_HASH(os, obj, level, blkid);
+ uint64_t hv = dbuf_hash(os, obj, level, blkid);
uint64_t idx = hv & h->hash_table_mask;
dmu_buf_impl_t *db;
@@ -131,6 +213,24 @@
return (NULL);
}
+static dmu_buf_impl_t *
+dbuf_find_bonus(objset_t *os, uint64_t object)
+{
+ dnode_t *dn;
+ dmu_buf_impl_t *db = NULL;
+
+ if (dnode_hold(os, object, FTAG, &dn) == 0) {
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ if (dn->dn_bonus != NULL) {
+ db = dn->dn_bonus;
+ mutex_enter(&db->db_mtx);
+ }
+ rw_exit(&dn->dn_struct_rwlock);
+ dnode_rele(dn, FTAG);
+ }
+ return (db);
+}
+
/*
* Insert an entry into the hash table. If there is already an element
* equal to elem in the hash table, then the already existing element
@@ -145,7 +245,7 @@
uint64_t obj = db->db.db_object;
int level = db->db_level;
uint64_t blkid = db->db_blkid;
- uint64_t hv = DBUF_HASH(os, obj, level, blkid);
+ uint64_t hv = dbuf_hash(os, obj, level, blkid);
uint64_t idx = hv & h->hash_table_mask;
dmu_buf_impl_t *dbf;
@@ -165,26 +265,25 @@
db->db_hash_next = h->hash_table[idx];
h->hash_table[idx] = db;
mutex_exit(DBUF_HASH_MUTEX(h, idx));
- atomic_add_64(&dbuf_hash_count, 1);
+ atomic_inc_64(&dbuf_hash_count);
return (NULL);
}
/*
- * Remove an entry from the hash table. This operation will
- * fail if there are any existing holds on the db.
+ * Remove an entry from the hash table. It must be in the EVICTING state.
*/
static void
dbuf_hash_remove(dmu_buf_impl_t *db)
{
dbuf_hash_table_t *h = &dbuf_hash_table;
- uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
+ uint64_t hv = dbuf_hash(db->db_objset, db->db.db_object,
db->db_level, db->db_blkid);
uint64_t idx = hv & h->hash_table_mask;
dmu_buf_impl_t *dbf, **dbp;
/*
- * We musn't hold db_mtx to maintin lock ordering:
+ * We mustn't hold db_mtx to maintain lock ordering:
* DBUF_HASH_MUTEX > db_mtx.
*/
ASSERT(refcount_is_zero(&db->db_holds));
@@ -200,25 +299,75 @@
*dbp = db->db_hash_next;
db->db_hash_next = NULL;
mutex_exit(DBUF_HASH_MUTEX(h, idx));
- atomic_add_64(&dbuf_hash_count, -1);
+ atomic_dec_64(&dbuf_hash_count);
}
-static arc_evict_func_t dbuf_do_evict;
+typedef enum {
+ DBVU_EVICTING,
+ DBVU_NOT_EVICTING
+} dbvu_verify_type_t;
static void
+dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type)
+{
+#ifdef ZFS_DEBUG
+ int64_t holds;
+
+ if (db->db_user == NULL)
+ return;
+
+ /* Only data blocks support the attachment of user data. */
+ ASSERT(db->db_level == 0);
+
+ /* Clients must resolve a dbuf before attaching user data. */
+ ASSERT(db->db.db_data != NULL);
+ ASSERT3U(db->db_state, ==, DB_CACHED);
+
+ holds = refcount_count(&db->db_holds);
+ if (verify_type == DBVU_EVICTING) {
+ /*
+ * Immediate eviction occurs when holds == dirtycnt.
+ * For normal eviction buffers, holds is zero on
+ * eviction, except when dbuf_fix_old_data() calls
+ * dbuf_clear_data(). However, the hold count can grow
+ * during eviction even though db_mtx is held (see
+ * dmu_bonus_hold() for an example), so we can only
+ * test the generic invariant that holds >= dirtycnt.
+ */
+ ASSERT3U(holds, >=, db->db_dirtycnt);
+ } else {
+ if (db->db_user_immediate_evict == TRUE)
+ ASSERT3U(holds, >=, db->db_dirtycnt);
+ else
+ ASSERT3U(holds, >, 0);
+ }
+#endif
+}
+
+static void
dbuf_evict_user(dmu_buf_impl_t *db)
{
+ dmu_buf_user_t *dbu = db->db_user;
+
ASSERT(MUTEX_HELD(&db->db_mtx));
- if (db->db_level != 0 || db->db_evict_func == NULL)
+ if (dbu == NULL)
return;
- if (db->db_user_data_ptr_ptr)
- *db->db_user_data_ptr_ptr = db->db.db_data;
- db->db_evict_func(&db->db, db->db_user_ptr);
- db->db_user_ptr = NULL;
- db->db_user_data_ptr_ptr = NULL;
- db->db_evict_func = NULL;
+ dbuf_verify_user(db, DBVU_EVICTING);
+ db->db_user = NULL;
+
+#ifdef ZFS_DEBUG
+ if (dbu->dbu_clear_on_evict_dbufp != NULL)
+ *dbu->dbu_clear_on_evict_dbufp = NULL;
+#endif
+
+ /*
+ * Invoke the callback from a taskq to avoid lock order reversals
+ * and limit stack depth.
+ */
+ taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func, dbu, 0,
+ &dbu->dbu_tqent);
}
boolean_t
@@ -237,17 +386,183 @@
}
}
-void
-dbuf_evict(dmu_buf_impl_t *db)
+/*
+ * This function *must* return indices evenly distributed between all
+ * sublists of the multilist. This is needed due to how the dbuf eviction
+ * code is laid out; dbuf_evict_thread() assumes dbufs are evenly
+ * distributed between all sublists and uses this assumption when
+ * deciding which sublist to evict from and how much to evict from it.
+ */
+unsigned int
+dbuf_cache_multilist_index_func(multilist_t *ml, void *obj)
{
- ASSERT(MUTEX_HELD(&db->db_mtx));
- ASSERT(db->db_buf == NULL);
- ASSERT(db->db_data_pending == NULL);
+ dmu_buf_impl_t *db = obj;
- dbuf_clear(db);
- dbuf_destroy(db);
+ /*
+ * The assumption here is that the hash value for a given
+ * dmu_buf_impl_t will remain constant throughout its lifetime
+ * (i.e. its objset, object, level and blkid fields don't change).
+ * Thus, we don't need to store the dbuf's sublist index
+ * on insertion, as this index can be recalculated on removal.
+ *
+ * Also, the low order bits of the hash value are thought to be
+ * distributed evenly. Otherwise, in the case that the multilist
+ * has a power of two number of sublists, each sublists' usage
+ * would not be evenly distributed.
+ */
+ return (dbuf_hash(db->db_objset, db->db.db_object,
+ db->db_level, db->db_blkid) %
+ multilist_get_num_sublists(ml));
}
+static inline boolean_t
+dbuf_cache_above_hiwater(void)
+{
+ uint64_t dbuf_cache_hiwater_bytes =
+ (dbuf_cache_max_bytes * dbuf_cache_hiwater_pct) / 100;
+
+ return (refcount_count(&dbuf_cache_size) >
+ dbuf_cache_max_bytes + dbuf_cache_hiwater_bytes);
+}
+
+static inline boolean_t
+dbuf_cache_above_lowater(void)
+{
+ uint64_t dbuf_cache_lowater_bytes =
+ (dbuf_cache_max_bytes * dbuf_cache_lowater_pct) / 100;
+
+ return (refcount_count(&dbuf_cache_size) >
+ dbuf_cache_max_bytes - dbuf_cache_lowater_bytes);
+}
+
+/*
+ * Evict the oldest eligible dbuf from the dbuf cache.
+ */
+static void
+dbuf_evict_one(void)
+{
+ int idx = multilist_get_random_index(&dbuf_cache);
+ multilist_sublist_t *mls = multilist_sublist_lock(&dbuf_cache, idx);
+
+ ASSERT(!MUTEX_HELD(&dbuf_evict_lock));
+
+ /*
+ * Set the thread's tsd to indicate that it's processing evictions.
+ * Once a thread stops evicting from the dbuf cache it will
+ * reset its tsd to NULL.
+ */
+ ASSERT3P(tsd_get(zfs_dbuf_evict_key), ==, NULL);
+ (void) tsd_set(zfs_dbuf_evict_key, (void *)B_TRUE);
+
+ dmu_buf_impl_t *db = multilist_sublist_tail(mls);
+ while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) {
+ db = multilist_sublist_prev(mls, db);
+ }
+
+ DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db,
+ multilist_sublist_t *, mls);
+
+ if (db != NULL) {
+ multilist_sublist_remove(mls, db);
+ multilist_sublist_unlock(mls);
+ (void) refcount_remove_many(&dbuf_cache_size,
+ db->db.db_size, db);
+ dbuf_destroy(db);
+ } else {
+ multilist_sublist_unlock(mls);
+ }
+ (void) tsd_set(zfs_dbuf_evict_key, NULL);
+}
+
+/*
+ * The dbuf evict thread is responsible for aging out dbufs from the
+ * cache. Once the cache has reached its maximum size, dbufs are removed
+ * and destroyed. The eviction thread will continue running until the size
+ * of the dbuf cache is at or below the maximum size. Once the dbuf is aged
+ * out of the cache it is destroyed and becomes eligible for arc eviction.
+ */
+static void
+dbuf_evict_thread(void *dummy __unused)
+{
+ callb_cpr_t cpr;
+
+ CALLB_CPR_INIT(&cpr, &dbuf_evict_lock, callb_generic_cpr, FTAG);
+
+ mutex_enter(&dbuf_evict_lock);
+ while (!dbuf_evict_thread_exit) {
+ while (!dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
+ CALLB_CPR_SAFE_BEGIN(&cpr);
+ (void) cv_timedwait_hires(&dbuf_evict_cv,
+ &dbuf_evict_lock, SEC2NSEC(1), MSEC2NSEC(1), 0);
+ CALLB_CPR_SAFE_END(&cpr, &dbuf_evict_lock);
+ }
+ mutex_exit(&dbuf_evict_lock);
+
+ /*
+ * Keep evicting as long as we're above the low water mark
+ * for the cache. We do this without holding the locks to
+ * minimize lock contention.
+ */
+ while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
+ dbuf_evict_one();
+ }
+
+ mutex_enter(&dbuf_evict_lock);
+ }
+
+ dbuf_evict_thread_exit = B_FALSE;
+ cv_broadcast(&dbuf_evict_cv);
+ CALLB_CPR_EXIT(&cpr); /* drops dbuf_evict_lock */
+ thread_exit();
+}
+
+/*
+ * Wake up the dbuf eviction thread if the dbuf cache is at its max size.
+ * If the dbuf cache is at its high water mark, then evict a dbuf from the
+ * dbuf cache using the callers context.
+ */
+static void
+dbuf_evict_notify(void)
+{
+
+ /*
+ * We use thread specific data to track when a thread has
+ * started processing evictions. This allows us to avoid deeply
+ * nested stacks that would have a call flow similar to this:
+ *
+ * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify()
+ * ^ |
+ * | |
+ * +-----dbuf_destroy()<--dbuf_evict_one()<--------+
+ *
+ * The dbuf_eviction_thread will always have its tsd set until
+ * that thread exits. All other threads will only set their tsd
+ * if they are participating in the eviction process. This only
+ * happens if the eviction thread is unable to process evictions
+ * fast enough. To keep the dbuf cache size in check, other threads
+ * can evict from the dbuf cache directly. Those threads will set
+ * their tsd values so that we ensure that they only evict one dbuf
+ * from the dbuf cache.
+ */
+ if (tsd_get(zfs_dbuf_evict_key) != NULL)
+ return;
+
+ if (refcount_count(&dbuf_cache_size) > dbuf_cache_max_bytes) {
+ boolean_t evict_now = B_FALSE;
+
+ mutex_enter(&dbuf_evict_lock);
+ if (refcount_count(&dbuf_cache_size) > dbuf_cache_max_bytes) {
+ evict_now = dbuf_cache_above_hiwater();
+ cv_signal(&dbuf_evict_cv);
+ }
+ mutex_exit(&dbuf_evict_lock);
+
+ if (evict_now) {
+ dbuf_evict_one();
+ }
+ }
+}
+
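
dbuf_evict_notify() is only the wake-up half of the mechanism; the insertion side lives in the dbuf release path, which this excerpt does not include. A minimal sketch of that producer side, built only from primitives visible elsewhere in this patch (the surrounding release-path bookkeeping is elided and assumed):

	/* on last release of a dbuf that is cached rather than destroyed */
	if (refcount_is_zero(&db->db_holds) &&
	    !multilist_link_active(&db->db_cache_link)) {
		multilist_insert(&dbuf_cache, db);
		(void) refcount_add_many(&dbuf_cache_size,
		    db->db.db_size, db);
		dbuf_evict_notify();
	}
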
void
dbuf_init(void)
{
@@ -273,12 +588,38 @@
goto retry;
}
- dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
+ dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t",
sizeof (dmu_buf_impl_t),
0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
for (i = 0; i < DBUF_MUTEXES; i++)
mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
+
+ /*
+ * Setup the parameters for the dbuf cache. We cap the size of the
+ * dbuf cache to 1/32nd (default) of the size of the ARC.
+ */
+ dbuf_cache_max_bytes = MIN(dbuf_cache_max_bytes,
+ arc_max_bytes() >> dbuf_cache_max_shift);
+
+ /*
+ * All entries are queued via taskq_dispatch_ent(), so min/maxalloc
+ * configuration is not required.
+ */
+ dbu_evict_taskq = taskq_create("dbu_evict", 1, minclsyspri, 0, 0, 0);
+
+ multilist_create(&dbuf_cache, sizeof (dmu_buf_impl_t),
+ offsetof(dmu_buf_impl_t, db_cache_link),
+ zfs_arc_num_sublists_per_state,
+ dbuf_cache_multilist_index_func);
+ refcount_create(&dbuf_cache_size);
+
+ tsd_create(&zfs_dbuf_evict_key, NULL);
+ dbuf_evict_thread_exit = B_FALSE;
+ mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL);
+ dbuf_cache_evict_thread = thread_create(NULL, 0, dbuf_evict_thread,
+ NULL, 0, &p0, TS_RUN, minclsyspri);
}
void
@@ -290,7 +631,23 @@
for (i = 0; i < DBUF_MUTEXES; i++)
mutex_destroy(&h->hash_mutexes[i]);
kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
- kmem_cache_destroy(dbuf_cache);
+ kmem_cache_destroy(dbuf_kmem_cache);
+ taskq_destroy(dbu_evict_taskq);
+
+ mutex_enter(&dbuf_evict_lock);
+ dbuf_evict_thread_exit = B_TRUE;
+ while (dbuf_evict_thread_exit) {
+ cv_signal(&dbuf_evict_cv);
+ cv_wait(&dbuf_evict_cv, &dbuf_evict_lock);
+ }
+ mutex_exit(&dbuf_evict_lock);
+ tsd_destroy(&zfs_dbuf_evict_key);
+
+ mutex_destroy(&dbuf_evict_lock);
+ cv_destroy(&dbuf_evict_cv);
+
+ refcount_destroy(&dbuf_cache_size);
+ multilist_destroy(&dbuf_cache);
}
/*
@@ -321,7 +678,7 @@
ASSERT3U(db->db_level, <, dn->dn_nlevels);
ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
db->db_blkid == DMU_SPILL_BLKID ||
- !list_is_empty(&dn->dn_dbufs));
+ !avl_is_empty(&dn->dn_dbufs));
}
if (db->db_blkid == DMU_BONUS_BLKID) {
ASSERT(dn != NULL);
@@ -393,13 +750,49 @@
* If the blkptr isn't set but they have nonzero data,
* it had better be dirty, otherwise we'll lose that
* data when we evict this buffer.
+ *
+ * There is an exception to this rule for indirect blocks; in
+ * this case, if the indirect block is a hole, we fill in a few
+ * fields on each of the child blocks (importantly, birth time)
+ * to prevent hole birth times from being lost when you
+ * partially fill in a hole.
*/
if (db->db_dirtycnt == 0) {
- uint64_t *buf = db->db.db_data;
- int i;
+ if (db->db_level == 0) {
+ uint64_t *buf = db->db.db_data;
+ int i;
- for (i = 0; i < db->db.db_size >> 3; i++) {
- ASSERT(buf[i] == 0);
+ for (i = 0; i < db->db.db_size >> 3; i++) {
+ ASSERT(buf[i] == 0);
+ }
+ } else {
+ blkptr_t *bps = db->db.db_data;
+ ASSERT3U(1 << DB_DNODE(db)->dn_indblkshift, ==,
+ db->db.db_size);
+ /*
+ * We want to verify that all the blkptrs in the
+ * indirect block are holes, but we may have
+ * automatically set up a few fields for them.
+ * We iterate through each blkptr and verify
+ * they only have those fields set.
+ */
+ for (int i = 0;
+ i < db->db.db_size / sizeof (blkptr_t);
+ i++) {
+ blkptr_t *bp = &bps[i];
+ ASSERT(ZIO_CHECKSUM_IS_ZERO(
+ &bp->blk_cksum));
+ ASSERT(
+ DVA_IS_EMPTY(&bp->blk_dva[0]) &&
+ DVA_IS_EMPTY(&bp->blk_dva[1]) &&
+ DVA_IS_EMPTY(&bp->blk_dva[2]));
+ ASSERT0(bp->blk_fill);
+ ASSERT0(bp->blk_pad[0]);
+ ASSERT0(bp->blk_pad[1]);
+ ASSERT(!BP_IS_EMBEDDED(bp));
+ ASSERT(BP_IS_HOLE(bp));
+ ASSERT0(bp->blk_phys_birth);
+ }
}
}
}
@@ -408,13 +801,14 @@
#endif
static void
-dbuf_update_data(dmu_buf_impl_t *db)
+dbuf_clear_data(dmu_buf_impl_t *db)
{
ASSERT(MUTEX_HELD(&db->db_mtx));
- if (db->db_level == 0 && db->db_user_data_ptr_ptr) {
- ASSERT(!refcount_is_zero(&db->db_holds));
- *db->db_user_data_ptr_ptr = db->db.db_data;
- }
+ dbuf_evict_user(db);
+ ASSERT3P(db->db_buf, ==, NULL);
+ db->db.db_data = NULL;
+ if (db->db_state != DB_NOFILL)
+ db->db_state = DB_UNCACHED;
}
static void
@@ -421,20 +815,11 @@
dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
{
ASSERT(MUTEX_HELD(&db->db_mtx));
- ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf));
+ ASSERT(buf != NULL);
+
db->db_buf = buf;
- if (buf != NULL) {
- ASSERT(buf->b_data != NULL);
- db->db.db_data = buf->b_data;
- if (!arc_released(buf))
- arc_set_callback(buf, dbuf_do_evict, db);
- dbuf_update_data(db);
- } else {
- dbuf_evict_user(db);
- db->db.db_data = NULL;
- if (db->db_state != DB_NOFILL)
- db->db_state = DB_UNCACHED;
- }
+ ASSERT(buf->b_data != NULL);
+ db->db.db_data = buf->b_data;
}
/*
@@ -445,29 +830,54 @@
{
arc_buf_t *abuf;
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
mutex_enter(&db->db_mtx);
if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
int blksz = db->db.db_size;
- spa_t *spa;
+ spa_t *spa = db->db_objset->os_spa;
mutex_exit(&db->db_mtx);
- DB_GET_SPA(&spa, db);
abuf = arc_loan_buf(spa, blksz);
bcopy(db->db.db_data, abuf->b_data, blksz);
} else {
abuf = db->db_buf;
arc_loan_inuse_buf(abuf, db);
- dbuf_set_data(db, NULL);
+ db->db_buf = NULL;
+ dbuf_clear_data(db);
mutex_exit(&db->db_mtx);
}
return (abuf);
}
+/*
+ * Calculate which level n block references the data at the level 0 offset
+ * provided.
+ */
uint64_t
-dbuf_whichblock(dnode_t *dn, uint64_t offset)
+dbuf_whichblock(dnode_t *dn, int64_t level, uint64_t offset)
{
- if (dn->dn_datablkshift) {
- return (offset >> dn->dn_datablkshift);
+ if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) {
+ /*
+ * The level n blkid is equal to the level 0 blkid divided by
+ * the number of level 0s in a level n block.
+ *
+ * The level 0 blkid is offset >> datablkshift =
+ * offset / 2^datablkshift.
+ *
+ * The number of level 0s in a level n is the number of block
+ * pointers in an indirect block, raised to the power of level.
+ * This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level =
+ * 2^(level*(indblkshift - SPA_BLKPTRSHIFT)).
+ *
+ * Thus, the level n blkid is: offset /
+ * ((2^datablkshift)*(2^(level*(indblkshift - SPA_BLKPTRSHIFT)))
+ * = offset / 2^(datablkshift + level *
+ * (indblkshift - SPA_BLKPTRSHIFT))
+ * = offset >> (datablkshift + level *
+ * (indblkshift - SPA_BLKPTRSHIFT))
+ */
+ return (offset >> (dn->dn_datablkshift + level *
+ (dn->dn_indblkshift - SPA_BLKPTRSHIFT)));
} else {
ASSERT3U(offset, <, dn->dn_datablksz);
return (0);
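
A worked example may help anchor the shift arithmetic derived in the comment above (the numbers are illustrative, not taken from the patch):

	/*
	 * 128 KB data blocks    -> datablkshift = 17
	 * 16 KB indirect blocks -> indblkshift = 14
	 *                          (128 blkptrs each, SPA_BLKPTRSHIFT = 7)
	 *
	 * For offset = 40 MB:
	 *   level-0 blkid = 40 MB >> 17                  = 320
	 *   level-1 blkid = 40 MB >> (17 + 1 * (14 - 7)) = 2
	 *
	 * i.e. each level-1 block spans 2^24 bytes = 16 MB of file offset,
	 * and the 320th level-0 block sits under the level-1 block at index 2.
	 */
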
@@ -501,7 +911,7 @@
} else {
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
ASSERT3P(db->db_buf, ==, NULL);
- VERIFY(arc_buf_remove_ref(buf, db));
+ arc_buf_destroy(buf, db);
db->db_state = DB_UNCACHED;
}
cv_broadcast(&db->db_changed);
@@ -509,12 +919,11 @@
}
static void
-dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
+dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
{
dnode_t *dn;
- spa_t *spa;
- zbookmark_t zb;
- uint32_t aflags = ARC_NOWAIT;
+ zbookmark_phys_t zb;
+ arc_flags_t aflags = ARC_FLAG_NOWAIT;
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
@@ -536,7 +945,6 @@
if (bonuslen)
bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
DB_DNODE_EXIT(db);
- dbuf_update_data(db);
db->db_state = DB_CACHED;
mutex_exit(&db->db_mtx);
return;
@@ -552,17 +960,36 @@
BP_IS_HOLE(db->db_blkptr)))) {
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
- dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa,
+ dbuf_set_data(db, arc_alloc_buf(db->db_objset->os_spa,
db->db.db_size, db, type));
+ bzero(db->db.db_data, db->db.db_size);
+
+ if (db->db_blkptr != NULL && db->db_level > 0 &&
+ BP_IS_HOLE(db->db_blkptr) &&
+ db->db_blkptr->blk_birth != 0) {
+ blkptr_t *bps = db->db.db_data;
+ for (int i = 0; i < ((1 <<
+ DB_DNODE(db)->dn_indblkshift) / sizeof (blkptr_t));
+ i++) {
+ blkptr_t *bp = &bps[i];
+ ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
+ 1 << dn->dn_indblkshift);
+ BP_SET_LSIZE(bp,
+ BP_GET_LEVEL(db->db_blkptr) == 1 ?
+ dn->dn_datablksz :
+ BP_GET_LSIZE(db->db_blkptr));
+ BP_SET_TYPE(bp, BP_GET_TYPE(db->db_blkptr));
+ BP_SET_LEVEL(bp,
+ BP_GET_LEVEL(db->db_blkptr) - 1);
+ BP_SET_BIRTH(bp, db->db_blkptr->blk_birth, 0);
+ }
+ }
DB_DNODE_EXIT(db);
- bzero(db->db.db_data, db->db.db_size);
db->db_state = DB_CACHED;
- *flags |= DB_RF_CACHED;
mutex_exit(&db->db_mtx);
return;
}
- spa = dn->dn_objset->os_spa;
DB_DNODE_EXIT(db);
db->db_state = DB_READ;
@@ -569,9 +996,7 @@
mutex_exit(&db->db_mtx);
if (DBUF_IS_L2CACHEABLE(db))
- aflags |= ARC_L2CACHE;
- if (DBUF_IS_L2COMPRESSIBLE(db))
- aflags |= ARC_L2COMPRESS;
+ aflags |= ARC_FLAG_L2CACHE;
SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
@@ -579,12 +1004,10 @@
dbuf_add_ref(db, NULL);
- (void) arc_read(zio, spa, db->db_blkptr,
+ (void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr,
dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
- (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
+ (flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
&aflags, &zb);
- if (aflags & ARC_CACHED)
- *flags |= DB_RF_CACHED;
}
int
@@ -591,8 +1014,8 @@
dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
{
int err = 0;
- int havepzio = (zio != NULL);
- int prefetch;
+ boolean_t havepzio = (zio != NULL);
+ boolean_t prefetch;
dnode_t *dn;
/*
@@ -617,8 +1040,7 @@
if (db->db_state == DB_CACHED) {
mutex_exit(&db->db_mtx);
if (prefetch)
- dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
- db->db.db_size, TRUE);
+ dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
if ((flags & DB_RF_HAVESTRUCT) == 0)
rw_exit(&dn->dn_struct_rwlock);
DB_DNODE_EXIT(db);
@@ -627,13 +1049,12 @@
if (zio == NULL)
zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
- dbuf_read_impl(db, zio, &flags);
+ dbuf_read_impl(db, zio, flags);
/* dbuf_read_impl has dropped db_mtx for us */
if (prefetch)
- dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
- db->db.db_size, flags & DB_RF_CACHED);
+ dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
if ((flags & DB_RF_HAVESTRUCT) == 0)
rw_exit(&dn->dn_struct_rwlock);
@@ -652,8 +1073,7 @@
*/
mutex_exit(&db->db_mtx);
if (prefetch)
- dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
- db->db.db_size, TRUE);
+ dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
if ((flags & DB_RF_HAVESTRUCT) == 0)
rw_exit(&dn->dn_struct_rwlock);
DB_DNODE_EXIT(db);
@@ -665,6 +1085,8 @@
db->db_state == DB_FILL) {
ASSERT(db->db_state == DB_READ ||
(flags & DB_RF_HAVESTRUCT) == 0);
+ DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *,
+ db, zio_t *, zio);
cv_wait(&db->db_changed, &db->db_mtx);
}
if (db->db_state == DB_UNCACHED)
@@ -687,15 +1109,14 @@
cv_wait(&db->db_changed, &db->db_mtx);
if (db->db_state == DB_UNCACHED) {
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
- spa_t *spa;
+ spa_t *spa = db->db_objset->os_spa;
ASSERT(db->db_buf == NULL);
ASSERT(db->db.db_data == NULL);
- DB_GET_SPA(&spa, db);
- dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
+ dbuf_set_data(db, arc_alloc_buf(spa, db->db.db_size, db, type));
db->db_state = DB_FILL;
} else if (db->db_state == DB_NOFILL) {
- dbuf_set_data(db, NULL);
+ dbuf_clear_data(db);
} else {
ASSERT3U(db->db_state, ==, DB_CACHED);
}
@@ -746,13 +1167,13 @@
} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
int size = db->db.db_size;
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
- spa_t *spa;
+ spa_t *spa = db->db_objset->os_spa;
- DB_GET_SPA(&spa, db);
- dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
+ dr->dt.dl.dr_data = arc_alloc_buf(spa, size, db, type);
bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
} else {
- dbuf_set_data(db, NULL);
+ db->db_buf = NULL;
+ dbuf_clear_data(db);
}
}
@@ -774,12 +1195,9 @@
ASSERT(db->db_data_pending != dr);
/* free this block */
- if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) {
- spa_t *spa;
+ if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
+ zio_free(db->db_objset->os_spa, txg, bp);
- DB_GET_SPA(&spa, db);
- zio_free(spa, txg, bp);
- }
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
dr->dt.dl.dr_nopwrite = B_FALSE;
@@ -797,49 +1215,63 @@
/*
* Evict (if its unreferenced) or clear (if its referenced) any level-0
* data blocks in the free range, so that any future readers will find
- * empty blocks. Also, if we happen accross any level-1 dbufs in the
- * range that have not already been marked dirty, mark them dirty so
- * they stay in memory.
+ * empty blocks.
+ *
+ * This is a no-op if the dataset is in the middle of an incremental
+ * receive; see comment below for details.
*/
void
-dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
+dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
+ dmu_tx_t *tx)
{
+ dmu_buf_impl_t db_search;
dmu_buf_impl_t *db, *db_next;
uint64_t txg = tx->tx_txg;
- int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
- uint64_t first_l1 = start >> epbs;
- uint64_t last_l1 = end >> epbs;
+ avl_index_t where;
- if (end > dn->dn_maxblkid && (end != DMU_SPILL_BLKID)) {
- end = dn->dn_maxblkid;
- last_l1 = end >> epbs;
+ if (end_blkid > dn->dn_maxblkid && (end_blkid != DMU_SPILL_BLKID))
+ end_blkid = dn->dn_maxblkid;
+ dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid);
+
+ db_search.db_level = 0;
+ db_search.db_blkid = start_blkid;
+ db_search.db_state = DB_SEARCH;
+
+ mutex_enter(&dn->dn_dbufs_mtx);
+ if (start_blkid >= dn->dn_unlisted_l0_blkid) {
+ /* There can't be any dbufs in this range; no need to search. */
+#ifdef DEBUG
+ db = avl_find(&dn->dn_dbufs, &db_search, &where);
+ ASSERT3P(db, ==, NULL);
+ db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
+ ASSERT(db == NULL || db->db_level > 0);
+#endif
+ mutex_exit(&dn->dn_dbufs_mtx);
+ return;
+ } else if (dmu_objset_is_receiving(dn->dn_objset)) {
+ /*
+ * If we are receiving, we expect there to be no dbufs in
+ * the range to be freed, because receive modifies each
+ * block at most once, and in offset order. If this is
+ * not the case, it can lead to performance problems,
+ * so note that we unexpectedly took the slow path.
+ */
+ atomic_inc_64(&zfs_free_range_recv_miss);
}
- dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
- mutex_enter(&dn->dn_dbufs_mtx);
- for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
- db_next = list_next(&dn->dn_dbufs, db);
+
+ db = avl_find(&dn->dn_dbufs, &db_search, &where);
+ ASSERT3P(db, ==, NULL);
+ db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
+
+ for (; db != NULL; db = db_next) {
+ db_next = AVL_NEXT(&dn->dn_dbufs, db);
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
- if (db->db_level == 1 &&
- db->db_blkid >= first_l1 && db->db_blkid <= last_l1) {
- mutex_enter(&db->db_mtx);
- if (db->db_last_dirty &&
- db->db_last_dirty->dr_txg < txg) {
- dbuf_add_ref(db, FTAG);
- mutex_exit(&db->db_mtx);
- dbuf_will_dirty(db, tx);
- dbuf_rele(db, FTAG);
- } else {
- mutex_exit(&db->db_mtx);
- }
+ if (db->db_level != 0 || db->db_blkid > end_blkid) {
+ break;
}
+ ASSERT3U(db->db_blkid, >=, start_blkid);
- if (db->db_level != 0)
- continue;
- dprintf_dbuf(db, "found buf %s\n", "");
- if (db->db_blkid < start || db->db_blkid > end)
- continue;
-
/* found a level 0 buffer in the range */
mutex_enter(&db->db_mtx);
if (dbuf_undirty(db, tx)) {
@@ -862,7 +1294,7 @@
}
if (refcount_count(&db->db_holds) == 0) {
ASSERT(db->db_buf);
- dbuf_clear(db);
+ dbuf_destroy(db);
continue;
}
/* The dbuf is referenced */
@@ -913,24 +1345,29 @@
* We don't need any locking to protect db_blkptr:
* If it's syncing, then db_last_dirty will be set
* so we'll ignore db_blkptr.
+ *
+ * This logic ensures that only block births for
+ * filled blocks are considered.
*/
ASSERT(MUTEX_HELD(&db->db_mtx));
- if (db->db_last_dirty)
+ if (db->db_last_dirty && (db->db_blkptr == NULL ||
+ !BP_IS_HOLE(db->db_blkptr))) {
birth_txg = db->db_last_dirty->dr_txg;
- else if (db->db_blkptr)
+ } else if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
birth_txg = db->db_blkptr->blk_birth;
+ }
/*
- * If we don't exist or are in a snapshot, we can't be freed.
+ * If this block doesn't exist or is in a snapshot, it can't be freed.
* Don't pass the bp to dsl_dataset_block_freeable() since we
* are holding the db_mtx lock and might deadlock if we are
* prefetching a dedup-ed block.
*/
- if (birth_txg)
+ if (birth_txg != 0)
return (ds == NULL ||
dsl_dataset_block_freeable(ds, NULL, birth_txg));
else
- return (FALSE);
+ return (B_FALSE);
}
void
@@ -950,7 +1387,7 @@
ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
/*
- * This call to dbuf_will_dirty() with the dn_struct_rwlock held
+ * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held
* is OK, because there can be no other references to the db
* when we are changing its size, so no concurrent DB_FILL can
* be happening.
@@ -959,10 +1396,10 @@
* XXX we should be doing a dbuf_read, checking the return
* value and returning that up to our callers
*/
- dbuf_will_dirty(db, tx);
+ dmu_buf_will_dirty(&db->db, tx);
/* create the data buffer for the new block */
- buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type);
+ buf = arc_alloc_buf(dn->dn_objset->os_spa, size, db, type);
/* copy old block data to the new block */
obuf = db->db_buf;
@@ -973,7 +1410,7 @@
mutex_enter(&db->db_mtx);
dbuf_set_data(db, buf);
- VERIFY(arc_buf_remove_ref(obuf, db));
+ arc_buf_destroy(obuf, db);
db->db.db_size = size;
if (db->db_level == 0) {
@@ -989,9 +1426,8 @@
void
dbuf_release_bp(dmu_buf_impl_t *db)
{
- objset_t *os;
+ objset_t *os = db->db_objset;
- DB_GET_OBJSET(&os, db);
ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
ASSERT(arc_released(os->os_phys_buf) ||
list_link_active(&os->os_dsl_dataset->ds_synced_link));
@@ -1000,6 +1436,32 @@
(void) arc_release(db->db_buf, db);
}
+/*
+ * We already have a dirty record for this TXG, and we are being
+ * dirtied again.
+ */
+static void
+dbuf_redirty(dbuf_dirty_record_t *dr)
+{
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+
+ if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
+ /*
+ * If this buffer has already been written out,
+ * we now need to reset its state.
+ */
+ dbuf_unoverride(dr);
+ if (db->db.db_object != DMU_META_DNODE_OBJECT &&
+ db->db_state != DB_NOFILL) {
+ /* Already released on initial dirty, so just thaw. */
+ ASSERT(arc_released(db->db_buf));
+ arc_buf_thaw(db->db_buf);
+ }
+ }
+}
+
dbuf_dirty_record_t *
dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
@@ -1021,10 +1483,18 @@
* objects may be dirtied in syncing context, but only if they
* were already pre-dirtied in open context.
*/
+#ifdef DEBUG
+ if (dn->dn_objset->os_dsl_dataset != NULL) {
+ rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock,
+ RW_READER, FTAG);
+ }
ASSERT(!dmu_tx_is_syncing(tx) ||
BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
dn->dn_objset->os_dsl_dataset == NULL);
+ if (dn->dn_objset->os_dsl_dataset != NULL)
+ rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, FTAG);
+#endif
/*
* We make this assert for private objects as well, but after we
* check if we're already dirty. They are allowed to re-dirty
@@ -1049,12 +1519,21 @@
* Don't set dirtyctx to SYNC if we're just modifying this as we
* initialize the objset.
*/
- if (dn->dn_dirtyctx == DN_UNDIRTIED &&
- !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
- dn->dn_dirtyctx =
- (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
- ASSERT(dn->dn_dirtyctx_firstset == NULL);
- dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
+ if (dn->dn_dirtyctx == DN_UNDIRTIED) {
+ if (dn->dn_objset->os_dsl_dataset != NULL) {
+ rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock,
+ RW_READER, FTAG);
+ }
+ if (!BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
+ dn->dn_dirtyctx = (dmu_tx_is_syncing(tx) ?
+ DN_DIRTY_SYNC : DN_DIRTY_OPEN);
+ ASSERT(dn->dn_dirtyctx_firstset == NULL);
+ dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
+ }
+ if (dn->dn_objset->os_dsl_dataset != NULL) {
+ rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock,
+ FTAG);
+ }
}
mutex_exit(&dn->dn_mtx);
@@ -1072,16 +1551,7 @@
if (dr && dr->dr_txg == tx->tx_txg) {
DB_DNODE_EXIT(db);
- if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
- /*
- * If this buffer has already been written out,
- * we now need to reset its state.
- */
- dbuf_unoverride(dr);
- if (db->db.db_object != DMU_META_DNODE_OBJECT &&
- db->db_state != DB_NOFILL)
- arc_buf_thaw(db->db_buf);
- }
+ dbuf_redirty(dr);
mutex_exit(&db->db_mtx);
return (dr);
}
@@ -1108,8 +1578,14 @@
* this assertion only if we're not already dirty.
*/
os = dn->dn_objset;
+#ifdef DEBUG
+ if (dn->dn_objset->os_dsl_dataset != NULL)
+ rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_READER, FTAG);
ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
+ if (dn->dn_objset->os_dsl_dataset != NULL)
+ rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG);
+#endif
ASSERT(db->db.db_size != 0);
dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
@@ -1162,6 +1638,8 @@
sizeof (dbuf_dirty_record_t),
offsetof(dbuf_dirty_record_t, dr_dirty_node));
}
+ if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL)
+ dr->dr_accounted = db->db.db_size;
dr->dr_dbuf = db;
dr->dr_txg = tx->tx_txg;
dr->dr_next = *drp;
@@ -1175,7 +1653,10 @@
if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
db->db_blkid != DMU_SPILL_BLKID) {
mutex_enter(&dn->dn_mtx);
- dnode_clear_range(dn, db->db_blkid, 1, tx);
+ if (dn->dn_free_ranges[txgoff] != NULL) {
+ range_tree_clear(dn->dn_free_ranges[txgoff],
+ db->db_blkid, 1);
+ }
mutex_exit(&dn->dn_mtx);
db->db_freed_in_flight = FALSE;
}
@@ -1198,7 +1679,20 @@
dnode_setdirty(dn, tx);
DB_DNODE_EXIT(db);
return (dr);
- } else if (do_free_accounting) {
+ }
+
+ /*
+ * The dn_struct_rwlock prevents db_blkptr from changing
+ * due to a write from syncing context completing
+ * while we are running, so we want to acquire it before
+ * looking at db_blkptr.
+ */
+ if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ drop_struct_lock = TRUE;
+ }
+
+ if (do_free_accounting) {
blkptr_t *bp = db->db_blkptr;
int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
bp_get_dsize(os->os_spa, bp) : db->db.db_size;
@@ -1214,11 +1708,6 @@
dnode_willuse_space(dn, -willfree, tx);
}
- if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- drop_struct_lock = TRUE;
- }
-
if (db->db_level == 0) {
dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
ASSERT(dn->dn_maxblkid >= db->db_blkid);
@@ -1245,7 +1734,10 @@
dbuf_rele(parent, FTAG);
mutex_enter(&db->db_mtx);
- /* possible race with dbuf_undirty() */
+ /*
+ * Since we've dropped the mutex, it's possible that
+ * dbuf_undirty() might have changed this out from under us.
+ */
if (db->db_last_dirty == dr ||
dn->dn_object == DMU_META_DNODE_OBJECT) {
mutex_enter(&di->dt.di.dr_mtx);
@@ -1285,6 +1777,16 @@
dbuf_dirty_record_t *dr, **drp;
ASSERT(txg != 0);
+
+ /*
+ * Due to our use of dn_nlevels below, this can only be called
+ * in open context, unless we are operating on the MOS.
+ * From syncing context, dn_nlevels may be different from the
+ * dn_nlevels used when dbuf was dirtied.
+ */
+ ASSERT(db->db_objset ==
+ dmu_objset_pool(db->db_objset)->dp_meta_objset ||
+ txg != spa_syncing_txg(dmu_objset_spa(db->db_objset)));
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
ASSERT0(db->db_level);
ASSERT(MUTEX_HELD(&db->db_mtx));
@@ -1303,19 +1805,12 @@
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
- /*
- * Note: This code will probably work even if there are concurrent
- * holders, but it is untested in that scenerio, as the ZPL and
- * ztest have additional locking (the range locks) that prevents
- * that type of concurrent access.
- */
- ASSERT3U(refcount_count(&db->db_holds), ==, db->db_dirtycnt);
-
dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
ASSERT(db->db.db_size != 0);
- /* XXX would be nice to fix up dn_towrite_space[] */
+ dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset),
+ dr->dr_accounted, txg);
*drp = dr->dr_next;
@@ -1330,7 +1825,7 @@
list_remove(&dr->dr_parent->dt.di.dr_children, dr);
mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
} else if (db->db_blkid == DMU_SPILL_BLKID ||
- db->db_level+1 == dn->dn_nlevels) {
+ db->db_level + 1 == dn->dn_nlevels) {
ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
mutex_enter(&dn->dn_mtx);
list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
@@ -1344,8 +1839,9 @@
ASSERT(db->db_buf != NULL);
ASSERT(dr->dt.dl.dr_data != NULL);
if (dr->dt.dl.dr_data != db->db_buf)
- VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db));
+ arc_buf_destroy(dr->dt.dl.dr_data, db);
}
+
kmem_free(dr, sizeof (dbuf_dirty_record_t));
ASSERT(db->db_dirtycnt > 0);
@@ -1352,12 +1848,8 @@
db->db_dirtycnt -= 1;
if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
- arc_buf_t *buf = db->db_buf;
-
- ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
- dbuf_set_data(db, NULL);
- VERIFY(arc_buf_remove_ref(buf, db));
- dbuf_evict(db);
+ ASSERT(db->db_state == DB_NOFILL || arc_released(db->db_buf));
+ dbuf_destroy(db);
return (B_TRUE);
}
@@ -1364,15 +1856,39 @@
return (B_FALSE);
}
-#pragma weak dmu_buf_will_dirty = dbuf_will_dirty
void
-dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
+dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
ASSERT(tx->tx_txg != 0);
ASSERT(!refcount_is_zero(&db->db_holds));
+ /*
+ * Quick check for dirtiness. For already dirty blocks, this
+ * reduces runtime of this function by >90%, and overall performance
+ * by 50% for some workloads (e.g. file deletion with indirect blocks
+ * cached).
+ */
+ mutex_enter(&db->db_mtx);
+ dbuf_dirty_record_t *dr;
+ for (dr = db->db_last_dirty;
+ dr != NULL && dr->dr_txg >= tx->tx_txg; dr = dr->dr_next) {
+ /*
+ * It's possible that it is already dirty but not cached,
+ * because there are some calls to dbuf_dirty() that don't
+ * go through dmu_buf_will_dirty().
+ */
+ if (dr->dr_txg == tx->tx_txg && db->db_state == DB_CACHED) {
+ /* This dbuf is already dirty and cached. */
+ dbuf_redirty(dr);
+ mutex_exit(&db->db_mtx);
+ return;
+ }
+ }
+ mutex_exit(&db->db_mtx);
+
DB_DNODE_ENTER(db);
if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
rf |= DB_RF_HAVESTRUCT;
@@ -1430,6 +1946,43 @@
mutex_exit(&db->db_mtx);
}
+void
+dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
+ bp_embedded_type_t etype, enum zio_compress comp,
+ int uncompressed_size, int compressed_size, int byteorder,
+ dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
+ struct dirty_leaf *dl;
+ dmu_object_type_t type;
+
+ if (etype == BP_EMBEDDED_TYPE_DATA) {
+ ASSERT(spa_feature_is_active(dmu_objset_spa(db->db_objset),
+ SPA_FEATURE_EMBEDDED_DATA));
+ }
+
+ DB_DNODE_ENTER(db);
+ type = DB_DNODE(db)->dn_type;
+ DB_DNODE_EXIT(db);
+
+ ASSERT0(db->db_level);
+ ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+
+ dmu_buf_will_not_fill(dbuf, tx);
+
+ ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
+ dl = &db->db_last_dirty->dt.dl;
+ encode_embedded_bp_compressed(&dl->dr_overridden_by,
+ data, comp, uncompressed_size, compressed_size);
+ BPE_SET_ETYPE(&dl->dr_overridden_by, etype);
+ BP_SET_TYPE(&dl->dr_overridden_by, type);
+ BP_SET_LEVEL(&dl->dr_overridden_by, 0);
+ BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder);
+
+ dl->dr_override_state = DR_OVERRIDDEN;
+ dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg;
+}
+
/*
* Directly assign a provided arc buf to a given dbuf if it's not referenced
* by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
@@ -1460,7 +2013,7 @@
mutex_exit(&db->db_mtx);
(void) dbuf_dirty(db, tx);
bcopy(buf->b_data, db->db.db_data, db->db.db_size);
- VERIFY(arc_buf_remove_ref(buf, db));
+ arc_buf_destroy(buf, db);
xuio_stat_wbuf_copied();
return;
}
@@ -1478,10 +2031,10 @@
arc_release(db->db_buf, db);
}
dr->dt.dl.dr_data = buf;
- VERIFY(arc_buf_remove_ref(db->db_buf, db));
+ arc_buf_destroy(db->db_buf, db);
} else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
arc_release(db->db_buf, db);
- VERIFY(arc_buf_remove_ref(db->db_buf, db));
+ arc_buf_destroy(db->db_buf, db);
}
db->db_buf = NULL;
}
@@ -1490,45 +2043,39 @@
db->db_state = DB_FILL;
mutex_exit(&db->db_mtx);
(void) dbuf_dirty(db, tx);
- dbuf_fill_done(db, tx);
+ dmu_buf_fill_done(&db->db, tx);
}
-/*
- * "Clear" the contents of this dbuf. This will mark the dbuf
- * EVICTING and clear *most* of its references. Unfortunetely,
- * when we are not holding the dn_dbufs_mtx, we can't clear the
- * entry in the dn_dbufs list. We have to wait until dbuf_destroy()
- * in this case. For callers from the DMU we will usually see:
- * dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
- * For the arc callback, we will usually see:
- * dbuf_do_evict()->dbuf_clear();dbuf_destroy()
- * Sometimes, though, we will get a mix of these two:
- * DMU: dbuf_clear()->arc_buf_evict()
- * ARC: dbuf_do_evict()->dbuf_destroy()
- */
void
-dbuf_clear(dmu_buf_impl_t *db)
+dbuf_destroy(dmu_buf_impl_t *db)
{
dnode_t *dn;
dmu_buf_impl_t *parent = db->db_parent;
dmu_buf_impl_t *dndb;
- int dbuf_gone = FALSE;
ASSERT(MUTEX_HELD(&db->db_mtx));
ASSERT(refcount_is_zero(&db->db_holds));
- dbuf_evict_user(db);
+ if (db->db_buf != NULL) {
+ arc_buf_destroy(db->db_buf, db);
+ db->db_buf = NULL;
+ }
- if (db->db_state == DB_CACHED) {
+ if (db->db_blkid == DMU_BONUS_BLKID) {
ASSERT(db->db.db_data != NULL);
- if (db->db_blkid == DMU_BONUS_BLKID) {
- zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
- arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
- }
- db->db.db_data = NULL;
+ zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
+ arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
db->db_state = DB_UNCACHED;
}
+ dbuf_clear_data(db);
+
+ if (multilist_link_active(&db->db_cache_link)) {
+ multilist_remove(&dbuf_cache, db);
+ (void) refcount_remove_many(&dbuf_cache_size,
+ db->db.db_size, db);
+ }
+
ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
ASSERT(db->db_data_pending == NULL);
@@ -1535,14 +2082,26 @@
db->db_state = DB_EVICTING;
db->db_blkptr = NULL;
+ /*
+ * Now that db_state is DB_EVICTING, nobody else can find this via
+ * the hash table. We can now drop db_mtx, which allows us to
+ * acquire the dn_dbufs_mtx.
+ */
+ mutex_exit(&db->db_mtx);
+
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
dndb = dn->dn_dbuf;
- if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
- list_remove(&dn->dn_dbufs, db);
- (void) atomic_dec_32_nv(&dn->dn_dbufs_count);
+ if (db->db_blkid != DMU_BONUS_BLKID) {
+ boolean_t needlock = !MUTEX_HELD(&dn->dn_dbufs_mtx);
+ if (needlock)
+ mutex_enter(&dn->dn_dbufs_mtx);
+ avl_remove(&dn->dn_dbufs, db);
+ atomic_dec_32(&dn->dn_dbufs_count);
membar_producer();
DB_DNODE_EXIT(db);
+ if (needlock)
+ mutex_exit(&dn->dn_dbufs_mtx);
/*
* Decrementing the dbuf count means that the hold corresponding
* to the removed dbuf is no longer discounted in dnode_move(),
@@ -1553,16 +2112,26 @@
*/
dnode_rele(dn, db);
db->db_dnode_handle = NULL;
+
+ dbuf_hash_remove(db);
} else {
DB_DNODE_EXIT(db);
}
- if (db->db_buf)
- dbuf_gone = arc_buf_evict(db->db_buf);
+ ASSERT(refcount_is_zero(&db->db_holds));
- if (!dbuf_gone)
- mutex_exit(&db->db_mtx);
+ db->db_parent = NULL;
+ ASSERT(db->db_buf == NULL);
+ ASSERT(db->db.db_data == NULL);
+ ASSERT(db->db_hash_next == NULL);
+ ASSERT(db->db_blkptr == NULL);
+ ASSERT(db->db_data_pending == NULL);
+ ASSERT(!multilist_link_active(&db->db_cache_link));
+
+ kmem_cache_free(dbuf_kmem_cache, db);
+ arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
+
/*
* If this dbuf is referenced from an indirect dbuf,
* decrement the ref count on the indirect dbuf.
@@ -1571,6 +2140,12 @@
dbuf_rele(parent, db);
}
+/*
+ * Note: While bpp will always be updated if the function returns success,
+ * parentp will not be updated if the dnode does not have dn_dbuf filled in;
+ * this happens when the dnode is the meta-dnode, or a userused or groupused
+ * object.
+ */
static int
dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
dmu_buf_impl_t **parentp, blkptr_t **bpp)
@@ -1611,7 +2186,7 @@
} else if (level < nlevels-1) {
/* this block is referenced from an indirect block */
int err = dbuf_hold_impl(dn, level+1,
- blkid >> epbs, fail_sparse, NULL, parentp);
+ blkid >> epbs, fail_sparse, FALSE, NULL, parentp);
if (err)
return (err);
err = dbuf_read(*parentp, NULL,
@@ -1648,7 +2223,7 @@
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
ASSERT(dn->dn_type != DMU_OT_NONE);
- db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);
+ db = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP);
db->db_objset = os;
db->db.db_object = dn->dn_object;
@@ -1660,11 +2235,10 @@
db->db_parent = parent;
db->db_blkptr = blkptr;
- db->db_user_ptr = NULL;
- db->db_user_data_ptr_ptr = NULL;
- db->db_evict_func = NULL;
- db->db_immediate_evict = 0;
- db->db_freed_in_flight = 0;
+ db->db_user = NULL;
+ db->db_user_immediate_evict = FALSE;
+ db->db_freed_in_flight = FALSE;
+ db->db_pending_evict = FALSE;
if (blkid == DMU_BONUS_BLKID) {
ASSERT3P(parent, ==, dn->dn_dbuf);
@@ -1682,7 +2256,7 @@
db->db.db_offset = 0;
} else {
int blocksize =
- db->db_level ? 1<<dn->dn_indblkshift : dn->dn_datablksz;
+ db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
db->db.db_size = blocksize;
db->db.db_offset = db->db_blkid * blocksize;
}
@@ -1698,11 +2272,14 @@
db->db_state = DB_EVICTING;
if ((odb = dbuf_hash_insert(db)) != NULL) {
/* someone else inserted it first */
- kmem_cache_free(dbuf_cache, db);
+ kmem_cache_free(dbuf_kmem_cache, db);
mutex_exit(&dn->dn_dbufs_mtx);
return (odb);
}
- list_insert_head(&dn->dn_dbufs, db);
+ avl_add(&dn->dn_dbufs, db);
+ if (db->db_level == 0 && db->db_blkid >=
+ dn->dn_unlisted_l0_blkid)
+ dn->dn_unlisted_l0_blkid = db->db_blkid + 1;
db->db_state = DB_UNCACHED;
mutex_exit(&dn->dn_dbufs_mtx);
arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
@@ -1713,7 +2290,7 @@
ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
refcount_count(&dn->dn_holds) > 0);
(void) refcount_add(&dn->dn_holds, db);
- (void) atomic_inc_32_nv(&dn->dn_dbufs_count);
+ atomic_inc_32(&dn->dn_dbufs_count);
dprintf_dbuf(db, "db=%p\n", db);
@@ -1720,115 +2297,246 @@
return (db);
}
-static int
-dbuf_do_evict(void *private)
+typedef struct dbuf_prefetch_arg {
+ spa_t *dpa_spa; /* The spa to issue the prefetch in. */
+ zbookmark_phys_t dpa_zb; /* The target block to prefetch. */
+ int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */
+ int dpa_curlevel; /* The current level that we're reading */
+ dnode_t *dpa_dnode; /* The dnode associated with the prefetch */
+ zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */
+ zio_t *dpa_zio; /* The parent zio_t for all prefetches. */
+ arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */
+} dbuf_prefetch_arg_t;
+
+/*
+ * Actually issue the prefetch read for the block given.
+ */
+static void
+dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
{
- arc_buf_t *buf = private;
- dmu_buf_impl_t *db = buf->b_private;
+ if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
+ return;
- if (!MUTEX_HELD(&db->db_mtx))
- mutex_enter(&db->db_mtx);
+ arc_flags_t aflags =
+ dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
- ASSERT(refcount_is_zero(&db->db_holds));
-
- if (db->db_state != DB_EVICTING) {
- ASSERT(db->db_state == DB_CACHED);
- DBUF_VERIFY(db);
- db->db_buf = NULL;
- dbuf_evict(db);
- } else {
- mutex_exit(&db->db_mtx);
- dbuf_destroy(db);
- }
- return (0);
+ ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
+ ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level);
+ ASSERT(dpa->dpa_zio != NULL);
+ (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL,
+ dpa->dpa_prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
+ &aflags, &dpa->dpa_zb);
}
+/*
+ * Called when an indirect block above our prefetch target is read in. This
+ * will either read in the next indirect block down the tree or issue the actual
+ * prefetch if the next block down is our target.
+ */
static void
-dbuf_destroy(dmu_buf_impl_t *db)
+dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private)
{
- ASSERT(refcount_is_zero(&db->db_holds));
+ dbuf_prefetch_arg_t *dpa = private;
- if (db->db_blkid != DMU_BONUS_BLKID) {
- /*
- * If this dbuf is still on the dn_dbufs list,
- * remove it from that list.
- */
- if (db->db_dnode_handle != NULL) {
- dnode_t *dn;
+ ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel);
+ ASSERT3S(dpa->dpa_curlevel, >, 0);
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
- mutex_enter(&dn->dn_dbufs_mtx);
- list_remove(&dn->dn_dbufs, db);
- (void) atomic_dec_32_nv(&dn->dn_dbufs_count);
- mutex_exit(&dn->dn_dbufs_mtx);
- DB_DNODE_EXIT(db);
- /*
- * Decrementing the dbuf count means that the hold
- * corresponding to the removed dbuf is no longer
- * discounted in dnode_move(), so the dnode cannot be
- * moved until after we release the hold.
- */
- dnode_rele(dn, db);
- db->db_dnode_handle = NULL;
+ /*
+ * The dpa_dnode is only valid if we are called with a NULL
+ * zio. This indicates that the arc_read() returned without
+ * first calling zio_read() to issue a physical read. Once
+ * a physical read is made the dpa_dnode must be invalidated
+ * as the locks guarding it may have been dropped. If the
+ * dpa_dnode is still valid, then we want to add it to the dbuf
+ * cache. To do so, we must hold the dbuf associated with the block
+ * we just prefetched, read its contents so that we associate it
+ * with an arc_buf_t, and then release it.
+ */
+ if (zio != NULL) {
+ ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel);
+ if (zio->io_flags & ZIO_FLAG_RAW) {
+ ASSERT3U(BP_GET_PSIZE(zio->io_bp), ==, zio->io_size);
+ } else {
+ ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size);
}
- dbuf_hash_remove(db);
+ ASSERT3P(zio->io_spa, ==, dpa->dpa_spa);
+
+ dpa->dpa_dnode = NULL;
+ } else if (dpa->dpa_dnode != NULL) {
+ uint64_t curblkid = dpa->dpa_zb.zb_blkid >>
+ (dpa->dpa_epbs * (dpa->dpa_curlevel -
+ dpa->dpa_zb.zb_level));
+ dmu_buf_impl_t *db = dbuf_hold_level(dpa->dpa_dnode,
+ dpa->dpa_curlevel, curblkid, FTAG);
+ (void) dbuf_read(db, NULL,
+ DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT);
+ dbuf_rele(db, FTAG);
}
- db->db_parent = NULL;
- db->db_buf = NULL;
- ASSERT(!list_link_active(&db->db_link));
- ASSERT(db->db.db_data == NULL);
- ASSERT(db->db_hash_next == NULL);
- ASSERT(db->db_blkptr == NULL);
- ASSERT(db->db_data_pending == NULL);
+ dpa->dpa_curlevel--;
- kmem_cache_free(dbuf_cache, db);
- arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
+ uint64_t nextblkid = dpa->dpa_zb.zb_blkid >>
+ (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level));
+ blkptr_t *bp = ((blkptr_t *)abuf->b_data) +
+ P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);
+ if (BP_IS_HOLE(bp) || (zio != NULL && zio->io_error != 0)) {
+ kmem_free(dpa, sizeof (*dpa));
+ } else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
+ ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);
+ dbuf_issue_final_prefetch(dpa, bp);
+ kmem_free(dpa, sizeof (*dpa));
+ } else {
+ arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
+ zbookmark_phys_t zb;
+
+ /* flag if L2ARC eligible, l2arc_noprefetch then decides */
+ if (dpa->dpa_aflags & ARC_FLAG_L2CACHE)
+ iter_aflags |= ARC_FLAG_L2CACHE;
+
+ ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
+
+ SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset,
+ dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid);
+
+ (void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
+ bp, dbuf_prefetch_indirect_done, dpa, dpa->dpa_prio,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
+ &iter_aflags, &zb);
+ }
+
+ arc_buf_destroy(abuf, private);
}
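A worked example of the level/blkid arithmetic above, with illustrative
numbers only (128K indirect blocks and 128-byte block pointers, so
epbs == 17 - SPA_BLKPTRSHIFT == 10):

	/*
	 * Target: level 0, blkid 5000.  When the level-2 indirect is read,
	 * dpa_curlevel drops to 1 and
	 *	nextblkid = 5000 >> (10 * (1 - 0)) = 4,
	 * so entry P2PHASE(4, 1 << 10) == 4 of that level-2 block is the
	 * bp of the level-1 indirect to read next.  Once dpa_curlevel
	 * reaches zb_level (0), nextblkid == 5000 >> 0 == 5000, and the bp
	 * at entry P2PHASE(5000, 1024) == 904 of the level-1 block is
	 * handed to dbuf_issue_final_prefetch().
	 */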
+/*
+ * Issue prefetch reads for the given block on the given level. If the indirect
+ * blocks above that block are not in memory, we will read them in
+ * asynchronously. As a result, this call never blocks waiting for a read to
+ * complete.
+ */
void
-dbuf_prefetch(dnode_t *dn, uint64_t blkid)
+dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
+ arc_flags_t aflags)
{
- dmu_buf_impl_t *db = NULL;
- blkptr_t *bp = NULL;
+ blkptr_t bp;
+ int epbs, nlevels, curlevel;
+ uint64_t curblkid;
ASSERT(blkid != DMU_BONUS_BLKID);
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
+ if (blkid > dn->dn_maxblkid)
+ return;
+
if (dnode_block_freed(dn, blkid))
return;
- /* dbuf_find() returns with db_mtx held */
- if (db = dbuf_find(dn, 0, blkid)) {
+ /*
+ * This dnode hasn't been written to disk yet, so there's nothing to
+ * prefetch.
+ */
+ nlevels = dn->dn_phys->dn_nlevels;
+ if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0)
+ return;
+
+ epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+ if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level))
+ return;
+
+ dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object,
+ level, blkid);
+ if (db != NULL) {
+ mutex_exit(&db->db_mtx);
/*
- * This dbuf is already in the cache. We assume that
- * it is already CACHED, or else about to be either
- * read or filled.
+ * This dbuf already exists. It is either CACHED, or
+ * (we assume) about to be read or filled.
*/
- mutex_exit(&db->db_mtx);
return;
}
- if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
- if (bp && !BP_IS_HOLE(bp)) {
- int priority = dn->dn_type == DMU_OT_DDT_ZAP ?
- ZIO_PRIORITY_DDT_PREFETCH : ZIO_PRIORITY_ASYNC_READ;
- dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
- uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
- zbookmark_t zb;
+ /*
+ * Find the closest ancestor (indirect block) of the target block
+ * that is present in the cache. In this indirect block, we will
+ * find the bp that is at curlevel, curblkid.
+ */
+ curlevel = level;
+ curblkid = blkid;
+ while (curlevel < nlevels - 1) {
+ int parent_level = curlevel + 1;
+ uint64_t parent_blkid = curblkid >> epbs;
+ dmu_buf_impl_t *db;
- SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
- dn->dn_object, 0, blkid);
+ if (dbuf_hold_impl(dn, parent_level, parent_blkid,
+ FALSE, TRUE, FTAG, &db) == 0) {
+ blkptr_t *bpp = db->db_buf->b_data;
+ bp = bpp[P2PHASE(curblkid, 1 << epbs)];
+ dbuf_rele(db, FTAG);
+ break;
+ }
- (void) arc_read(NULL, dn->dn_objset->os_spa,
- bp, NULL, NULL, priority,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
- &aflags, &zb);
- }
- if (db)
- dbuf_rele(db, NULL);
+ curlevel = parent_level;
+ curblkid = parent_blkid;
}
+
+ if (curlevel == nlevels - 1) {
+ /* No cached indirect blocks found. */
+ ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr);
+ bp = dn->dn_phys->dn_blkptr[curblkid];
+ }
+ if (BP_IS_HOLE(&bp))
+ return;
+
+ ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp));
+
+ zio_t *pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL,
+ ZIO_FLAG_CANFAIL);
+
+ dbuf_prefetch_arg_t *dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP);
+ dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
+ SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
+ dn->dn_object, level, blkid);
+ dpa->dpa_curlevel = curlevel;
+ dpa->dpa_prio = prio;
+ dpa->dpa_aflags = aflags;
+ dpa->dpa_spa = dn->dn_objset->os_spa;
+ dpa->dpa_dnode = dn;
+ dpa->dpa_epbs = epbs;
+ dpa->dpa_zio = pio;
+
+ /* flag if L2ARC eligible, l2arc_noprefetch then decides */
+ if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level))
+ dpa->dpa_aflags |= ARC_FLAG_L2CACHE;
+
+ /*
+ * If we have the indirect just above us, no need to do the asynchronous
+ * prefetch chain; we'll just run the last step ourselves. If we're at
+ * a higher level, though, we want to issue the prefetches for all the
+ * indirect blocks asynchronously, so we can go on with whatever we were
+ * doing.
+ */
+ if (curlevel == level) {
+ ASSERT3U(curblkid, ==, blkid);
+ dbuf_issue_final_prefetch(dpa, &bp);
+ kmem_free(dpa, sizeof (*dpa));
+ } else {
+ arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
+ zbookmark_phys_t zb;
+
+ /* flag if L2ARC eligible, l2arc_noprefetch then decides */
+ if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level))
+ iter_aflags |= ARC_FLAG_L2CACHE;
+
+ SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
+ dn->dn_object, curlevel, curblkid);
+ (void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
+ &bp, dbuf_prefetch_indirect_done, dpa, prio,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
+ &iter_aflags, &zb);
+ }
+ /*
+ * We use pio here instead of dpa_zio since it's possible that
+ * dpa may have already been freed.
+ */
+ zio_nowait(pio);
}
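A minimal caller-side sketch (this mirrors how dmu_prefetch(), further down
in this change, drives the function; it assumes the caller already holds the
dnode and wants a level-0 prefetch):

	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	uint64_t blkid = dbuf_whichblock(dn, 0, offset);
	dbuf_prefetch(dn, 0, blkid, ZIO_PRIORITY_ASYNC_READ, 0);
	rw_exit(&dn->dn_struct_rwlock);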
/*
@@ -1836,7 +2544,8 @@
* Note: dn_struct_rwlock must be held.
*/
int
-dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
+dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
+ boolean_t fail_sparse, boolean_t fail_uncached,
void *tag, dmu_buf_impl_t **dbp)
{
dmu_buf_impl_t *db, *parent = NULL;
@@ -1848,12 +2557,15 @@
*dbp = NULL;
top:
/* dbuf_find() returns with db_mtx held */
- db = dbuf_find(dn, level, blkid);
+ db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid);
if (db == NULL) {
blkptr_t *bp = NULL;
int err;
+ if (fail_uncached)
+ return (SET_ERROR(ENOENT));
+
ASSERT3P(parent, ==, NULL);
err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
if (fail_sparse) {
@@ -1870,19 +2582,14 @@
db = dbuf_create(dn, level, blkid, parent, bp);
}
- if (db->db_buf && refcount_is_zero(&db->db_holds)) {
- arc_buf_add_ref(db->db_buf, db);
- if (db->db_buf->b_data == NULL) {
- dbuf_clear(db);
- if (parent) {
- dbuf_rele(parent, NULL);
- parent = NULL;
- }
- goto top;
- }
- ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
+ if (fail_uncached && db->db_state != DB_CACHED) {
+ mutex_exit(&db->db_mtx);
+ return (SET_ERROR(ENOENT));
}
+ if (db->db_buf != NULL)
+ ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
+
ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
/*
@@ -1899,7 +2606,7 @@
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
dbuf_set_data(db,
- arc_buf_alloc(dn->dn_objset->os_spa,
+ arc_alloc_buf(dn->dn_objset->os_spa,
db->db.db_size, db, type));
bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
db->db.db_size);
@@ -1906,8 +2613,13 @@
}
}
+ if (multilist_link_active(&db->db_cache_link)) {
+ ASSERT(refcount_is_zero(&db->db_holds));
+ multilist_remove(&dbuf_cache, db);
+ (void) refcount_remove_many(&dbuf_cache_size,
+ db->db.db_size, db);
+ }
(void) refcount_add(&db->db_holds, tag);
- dbuf_update_data(db);
DBUF_VERIFY(db);
mutex_exit(&db->db_mtx);
@@ -1926,9 +2638,7 @@
dmu_buf_impl_t *
dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
{
- dmu_buf_impl_t *db;
- int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
- return (err ? NULL : db);
+ return (dbuf_hold_level(dn, 0, blkid, tag));
}
dmu_buf_impl_t *
@@ -1935,7 +2645,7 @@
dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
{
dmu_buf_impl_t *db;
- int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
+ int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db);
return (err ? NULL : db);
}
@@ -1958,10 +2668,8 @@
return (SET_ERROR(ENOTSUP));
if (blksz == 0)
blksz = SPA_MINBLOCKSIZE;
- if (blksz > SPA_MAXBLOCKSIZE)
- blksz = SPA_MAXBLOCKSIZE;
- else
- blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
+ ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset)));
+ blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
@@ -1984,9 +2692,33 @@
dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
{
int64_t holds = refcount_add(&db->db_holds, tag);
- ASSERT(holds > 1);
+ ASSERT3S(holds, >, 1);
}
+#pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref
+boolean_t
+dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid,
+ void *tag)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dmu_buf_impl_t *found_db;
+ boolean_t result = B_FALSE;
+
+ if (db->db_blkid == DMU_BONUS_BLKID)
+ found_db = dbuf_find_bonus(os, obj);
+ else
+ found_db = dbuf_find(os, obj, 0, blkid);
+
+ if (found_db != NULL) {
+ if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) {
+ (void) refcount_add(&db->db_holds, tag);
+ result = B_TRUE;
+ }
+ mutex_exit(&db->db_mtx);
+ }
+ return (result);
+}
+
/*
* If you call dbuf_rele() you had better not be referencing the dnode handle
* unless you have some other direct or indirect hold on the dnode. (An indirect
@@ -1994,7 +2726,6 @@
* Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
* dnode's parent dbuf evicting its dnode handles.
*/
-#pragma weak dmu_buf_rele = dbuf_rele
void
dbuf_rele(dmu_buf_impl_t *db, void *tag)
{
@@ -2002,6 +2733,12 @@
dbuf_rele_and_unlock(db, tag);
}
+void
+dmu_buf_rele(dmu_buf_t *db, void *tag)
+{
+ dbuf_rele((dmu_buf_impl_t *)db, tag);
+}
+
/*
* dbuf_rele() for an already-locked dbuf. This is necessary to allow
* db_dirtycnt and db_holds to be updated atomically.
@@ -2026,30 +2763,47 @@
* We can't freeze indirects if there is a possibility that they
* may be modified in the current syncing context.
*/
- if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
+ if (db->db_buf != NULL &&
+ holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) {
arc_buf_freeze(db->db_buf);
+ }
if (holds == db->db_dirtycnt &&
- db->db_level == 0 && db->db_immediate_evict)
+ db->db_level == 0 && db->db_user_immediate_evict)
dbuf_evict_user(db);
if (holds == 0) {
if (db->db_blkid == DMU_BONUS_BLKID) {
- mutex_exit(&db->db_mtx);
+ dnode_t *dn;
+ boolean_t evict_dbuf = db->db_pending_evict;
/*
- * If the dnode moves here, we cannot cross this barrier
- * until the move completes.
+ * If the dnode moves here, we cannot cross this
+ * barrier until the move completes.
*/
DB_DNODE_ENTER(db);
- (void) atomic_dec_32_nv(&DB_DNODE(db)->dn_dbufs_count);
+
+ dn = DB_DNODE(db);
+ atomic_dec_32(&dn->dn_dbufs_count);
+
+ /*
+ * Decrementing the dbuf count means that the bonus
+ * buffer's dnode hold is no longer discounted in
+ * dnode_move(). The dnode cannot move until after
+ * the dnode_rele() below.
+ */
DB_DNODE_EXIT(db);
+
/*
- * The bonus buffer's dnode hold is no longer discounted
- * in dnode_move(). The dnode cannot move until after
- * the dnode_rele().
+ * Do not reference db after its lock is dropped.
+ * Another thread may evict it.
*/
- dnode_rele(DB_DNODE(db), db);
+ mutex_exit(&db->db_mtx);
+
+ if (evict_dbuf)
+ dnode_evict_bonus(dn);
+
+ dnode_rele(dn, db);
} else if (db->db_buf == NULL) {
/*
* This is a special case: we never associated this
@@ -2057,42 +2811,44 @@
*/
ASSERT(db->db_state == DB_UNCACHED ||
db->db_state == DB_NOFILL);
- dbuf_evict(db);
+ dbuf_destroy(db);
} else if (arc_released(db->db_buf)) {
- arc_buf_t *buf = db->db_buf;
/*
* This dbuf has anonymous data associated with it.
*/
- dbuf_set_data(db, NULL);
- VERIFY(arc_buf_remove_ref(buf, db));
- dbuf_evict(db);
+ dbuf_destroy(db);
} else {
- VERIFY(!arc_buf_remove_ref(db->db_buf, db));
+ boolean_t do_arc_evict = B_FALSE;
+ blkptr_t bp;
+ spa_t *spa = dmu_objset_spa(db->db_objset);
- /*
- * A dbuf will be eligible for eviction if either the
- * 'primarycache' property is set or a duplicate
- * copy of this buffer is already cached in the arc.
- *
- * In the case of the 'primarycache' a buffer
- * is considered for eviction if it matches the
- * criteria set in the property.
- *
- * To decide if our buffer is considered a
- * duplicate, we must call into the arc to determine
- * if multiple buffers are referencing the same
- * block on-disk. If so, then we simply evict
- * ourselves.
- */
+ if (!DBUF_IS_CACHEABLE(db) &&
+ db->db_blkptr != NULL &&
+ !BP_IS_HOLE(db->db_blkptr) &&
+ !BP_IS_EMBEDDED(db->db_blkptr)) {
+ do_arc_evict = B_TRUE;
+ bp = *db->db_blkptr;
+ }
+
if (!DBUF_IS_CACHEABLE(db) ||
- arc_buf_eviction_needed(db->db_buf))
- dbuf_clear(db);
- else
+ db->db_pending_evict) {
+ dbuf_destroy(db);
+ } else if (!multilist_link_active(&db->db_cache_link)) {
+ multilist_insert(&dbuf_cache, db);
+ (void) refcount_add_many(&dbuf_cache_size,
+ db->db.db_size, db);
mutex_exit(&db->db_mtx);
+
+ dbuf_evict_notify();
+ }
+
+ if (do_arc_evict)
+ arc_freed(spa, &bp);
}
} else {
mutex_exit(&db->db_mtx);
}
+
}
#pragma weak dmu_buf_refcount = dbuf_refcount
@@ -2103,47 +2859,42 @@
}
void *
-dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
- dmu_buf_evict_func_t *evict_func)
+dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user,
+ dmu_buf_user_t *new_user)
{
- return (dmu_buf_update_user(db_fake, NULL, user_ptr,
- user_data_ptr_ptr, evict_func));
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+ mutex_enter(&db->db_mtx);
+ dbuf_verify_user(db, DBVU_NOT_EVICTING);
+ if (db->db_user == old_user)
+ db->db_user = new_user;
+ else
+ old_user = db->db_user;
+ dbuf_verify_user(db, DBVU_NOT_EVICTING);
+ mutex_exit(&db->db_mtx);
+
+ return (old_user);
}
void *
-dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
- dmu_buf_evict_func_t *evict_func)
+dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
{
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
-
- db->db_immediate_evict = TRUE;
- return (dmu_buf_update_user(db_fake, NULL, user_ptr,
- user_data_ptr_ptr, evict_func));
+ return (dmu_buf_replace_user(db_fake, NULL, user));
}
void *
-dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr,
- void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func)
+dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
- ASSERT(db->db_level == 0);
- ASSERT((user_ptr == NULL) == (evict_func == NULL));
+ db->db_user_immediate_evict = TRUE;
+ return (dmu_buf_set_user(db_fake, user));
+}
- mutex_enter(&db->db_mtx);
-
- if (db->db_user_ptr == old_user_ptr) {
- db->db_user_ptr = user_ptr;
- db->db_user_data_ptr_ptr = user_data_ptr_ptr;
- db->db_evict_func = evict_func;
-
- dbuf_update_data(db);
- } else {
- old_user_ptr = db->db_user_ptr;
- }
-
- mutex_exit(&db->db_mtx);
- return (old_user_ptr);
+void *
+dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
+{
+ return (dmu_buf_replace_user(db_fake, user, NULL));
}
void *
@@ -2150,11 +2901,17 @@
dmu_buf_get_user(dmu_buf_t *db_fake)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
- ASSERT(!refcount_is_zero(&db->db_holds));
- return (db->db_user_ptr);
+ dbuf_verify_user(db, DBVU_NOT_EVICTING);
+ return (db->db_user);
}
+void
+dmu_buf_user_evict_wait()
+{
+ taskq_wait(dbu_evict_taskq);
+}
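A rough sketch of how a consumer might attach state through the new
dmu_buf_user_t interface.  The struct layout, the evict callback name and the
dmu_buf_init_user() initializer are assumptions for illustration only; the
set/replace semantics are the ones implemented above:

	typedef struct my_user {
		dmu_buf_user_t	mu_dbu;	/* assumed to be set up with
					   dmu_buf_init_user() so that
					   my_user_evict() runs from
					   dbu_evict_taskq on eviction */
		dmu_buf_t	*mu_db;
	} my_user_t;

	my_user_t *mu = kmem_zalloc(sizeof (*mu), KM_SLEEP);
	mu->mu_db = db;
	dmu_buf_init_user(&mu->mu_dbu, my_user_evict, &mu->mu_db);
	if (dmu_buf_set_user(db, &mu->mu_dbu) != NULL) {
		/* someone else attached a user first; back out */
		kmem_free(mu, sizeof (*mu));
	}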
+
boolean_t
dmu_buf_freeable(dmu_buf_t *dbuf)
{
@@ -2175,6 +2932,28 @@
return (dbi->db_blkptr);
}
+objset_t *
+dmu_buf_get_objset(dmu_buf_t *db)
+{
+ dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
+ return (dbi->db_objset);
+}
+
+dnode_t *
+dmu_buf_dnode_enter(dmu_buf_t *db)
+{
+ dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
+ DB_DNODE_ENTER(dbi);
+ return (DB_DNODE(dbi));
+}
+
+void
+dmu_buf_dnode_exit(dmu_buf_t *db)
+{
+ dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
+ DB_DNODE_EXIT(dbi);
+}
+
static void
dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
{
@@ -2208,8 +2987,8 @@
if (parent == NULL) {
mutex_exit(&db->db_mtx);
rw_enter(&dn->dn_struct_rwlock, RW_READER);
- (void) dbuf_hold_impl(dn, db->db_level+1,
- db->db_blkid >> epbs, FALSE, db, &parent);
+ parent = dbuf_hold_level(dn, db->db_level + 1,
+ db->db_blkid >> epbs, db);
rw_exit(&dn->dn_struct_rwlock);
mutex_enter(&db->db_mtx);
db->db_parent = parent;
@@ -2260,7 +3039,7 @@
zio = dr->dr_zio;
mutex_enter(&dr->dt.di.dr_mtx);
- dbuf_sync_list(&dr->dt.di.dr_children, tx);
+ dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx);
ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
mutex_exit(&dr->dt.di.dr_mtx);
zio_nowait(zio);
@@ -2379,7 +3158,7 @@
*/
int blksz = arc_buf_size(*datap);
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
- *datap = arc_buf_alloc(os->os_spa, blksz, db, type);
+ *datap = arc_alloc_buf(os->os_spa, blksz, db, type);
bcopy(db->db.db_data, (*datap)->b_data, blksz);
}
db->db_data_pending = dr;
@@ -2406,7 +3185,7 @@
}
void
-dbuf_sync_list(list_t *list, dmu_tx_t *tx)
+dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx)
{
dbuf_dirty_record_t *dr;
@@ -2423,6 +3202,10 @@
DMU_META_DNODE_OBJECT);
break;
}
+ if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
+ dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
+ VERIFY3U(dr->dr_dbuf->db_level, ==, level);
+ }
list_remove(list, dr);
if (dr->dr_dbuf->db_level > 0)
dbuf_sync_indirect(dr, tx);
@@ -2444,7 +3227,8 @@
uint64_t fill = 0;
int i;
- ASSERT(db->db_blkptr == bp);
+ ASSERT3P(db->db_blkptr, !=, NULL);
+ ASSERT3P(&db->db_data_pending->dr_bp_copy, ==, bp);
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
@@ -2452,24 +3236,21 @@
dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
zio->io_prev_space_delta = delta;
- if (BP_IS_HOLE(bp)) {
- ASSERT(bp->blk_fill == 0);
- DB_DNODE_EXIT(db);
- return;
+ if (bp->blk_birth != 0) {
+ ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
+ BP_GET_TYPE(bp) == dn->dn_type) ||
+ (db->db_blkid == DMU_SPILL_BLKID &&
+ BP_GET_TYPE(bp) == dn->dn_bonustype) ||
+ BP_IS_EMBEDDED(bp));
+ ASSERT(BP_GET_LEVEL(bp) == db->db_level);
}
- ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
- BP_GET_TYPE(bp) == dn->dn_type) ||
- (db->db_blkid == DMU_SPILL_BLKID &&
- BP_GET_TYPE(bp) == dn->dn_bonustype));
- ASSERT(BP_GET_LEVEL(bp) == db->db_level);
-
mutex_enter(&db->db_mtx);
#ifdef ZFS_DEBUG
if (db->db_blkid == DMU_SPILL_BLKID) {
ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
- ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
+ ASSERT(!(BP_IS_HOLE(bp)) &&
db->db_blkptr == &dn->dn_phys->dn_spill);
}
#endif
@@ -2489,7 +3270,11 @@
fill++;
}
} else {
- fill = 1;
+ if (BP_IS_HOLE(bp)) {
+ fill = 0;
+ } else {
+ fill = 1;
+ }
}
} else {
blkptr_t *ibp = db->db.db_data;
@@ -2497,24 +3282,101 @@
for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
if (BP_IS_HOLE(ibp))
continue;
- fill += ibp->blk_fill;
+ fill += BP_GET_FILL(ibp);
}
}
DB_DNODE_EXIT(db);
- bp->blk_fill = fill;
+ if (!BP_IS_EMBEDDED(bp))
+ bp->blk_fill = fill;
mutex_exit(&db->db_mtx);
+
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ *db->db_blkptr = *bp;
+ rw_exit(&dn->dn_struct_rwlock);
}
/* ARGSUSED */
+/*
+ * This function gets called just prior to running through the compression
+ * stage of the zio pipeline. If we're an indirect block comprised of only
+ * holes, then we want this indirect to be compressed away to a hole. In
+ * order to do that we must zero out any information about the holes that
+ * this indirect points to before we try to compress it.
+ */
static void
+dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
+{
+ dmu_buf_impl_t *db = vdb;
+ dnode_t *dn;
+ blkptr_t *bp;
+ uint64_t i;
+ int epbs;
+
+ ASSERT3U(db->db_level, >, 0);
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+
+ /* Determine if all our children are holes */
+ for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) {
+ if (!BP_IS_HOLE(bp))
+ break;
+ }
+
+ /*
+ * If all the children are holes, then zero them all out so that
+ * we may get compressed away.
+ */
+ if (i == 1 << epbs) {
+ /* didn't find any non-holes */
+ bzero(db->db.db_data, db->db.db_size);
+ }
+ DB_DNODE_EXIT(db);
+}
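Illustrative numbers for the hole scan above (assuming the common 128K
indirect block size; not something this change dictates):

	epbs = dn_indblkshift - SPA_BLKPTRSHIFT = 17 - 7 = 10
	1 << epbs = 1024 block pointers examined per indirect

Only if all 1024 are holes is the buffer zeroed, letting the compression
stage collapse the whole indirect into a hole.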
+
+/*
+ * The SPA will call this callback several times for each zio - once
+ * for every physical child i/o (zio->io_phys_children times). This
+ * allows the DMU to monitor the progress of each logical i/o. For example,
+ * there may be 2 copies of an indirect block, or many fragments of a RAID-Z
+ * block. There may be a long delay before all copies/fragments are completed,
+ * so this callback allows us to retire dirty space gradually, as the physical
+ * i/os complete.
+ */
+/* ARGSUSED */
+static void
+dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
+{
+ dmu_buf_impl_t *db = arg;
+ objset_t *os = db->db_objset;
+ dsl_pool_t *dp = dmu_objset_pool(os);
+ dbuf_dirty_record_t *dr;
+ int delta = 0;
+
+ dr = db->db_data_pending;
+ ASSERT3U(dr->dr_txg, ==, zio->io_txg);
+
+ /*
+ * The callback will be called io_phys_children times. Retire one
+ * portion of our dirty space each time we are called. Any rounding
+ * error will be cleaned up by dsl_pool_sync()'s call to
+ * dsl_pool_undirty_space().
+ */
+ delta = dr->dr_accounted / zio->io_phys_children;
+ dsl_pool_undirty_space(dp, delta, zio->io_txg);
+}
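A worked example of the gradual undirty above (illustrative values): with
dr_accounted == 131072 bytes and io_phys_children == 3 (say, three ditto
copies of the block), each callback retires 131072 / 3 == 43690 bytes; the
2-byte remainder from the integer division is the rounding error that
dsl_pool_sync()'s final dsl_pool_undirty_space() call cleans up.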
+
+/* ARGSUSED */
+static void
dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
dmu_buf_impl_t *db = vdb;
- blkptr_t *bp = zio->io_bp;
blkptr_t *bp_orig = &zio->io_bp_orig;
- uint64_t txg = zio->io_txg;
+ blkptr_t *bp = db->db_blkptr;
+ objset_t *os = db->db_objset;
+ dmu_tx_t *tx = os->os_synctx;
dbuf_dirty_record_t **drp, *dr;
ASSERT0(zio->io_error);
@@ -2527,14 +3389,7 @@
if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
ASSERT(BP_EQUAL(bp, bp_orig));
} else {
- objset_t *os;
- dsl_dataset_t *ds;
- dmu_tx_t *tx;
-
- DB_GET_OBJSET(&os, db);
- ds = os->os_dsl_dataset;
- tx = os->os_synctx;
-
+ dsl_dataset_t *ds = os->os_dsl_dataset;
(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
dsl_dataset_block_born(ds, bp, tx);
}
@@ -2547,7 +3402,6 @@
while ((dr = *drp) != db->db_data_pending)
drp = &dr->dr_next;
ASSERT(!list_link_active(&dr->dr_dirty_node));
- ASSERT(dr->dr_txg == txg);
ASSERT(dr->dr_dbuf == db);
ASSERT(dr->dr_next == NULL);
*drp = dr->dr_next;
@@ -2570,10 +3424,7 @@
ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
if (db->db_state != DB_NOFILL) {
if (dr->dt.dl.dr_data != db->db_buf)
- VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
- db));
- else if (!arc_released(db->db_buf))
- arc_set_callback(db->db_buf, dbuf_do_evict, db);
+ arc_buf_destroy(dr->dt.dl.dr_data, db);
}
} else {
dnode_t *dn;
@@ -2581,15 +3432,14 @@
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
- ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
+ ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
if (!BP_IS_HOLE(db->db_blkptr)) {
int epbs =
dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+ ASSERT3U(db->db_blkid, <=,
+ dn->dn_phys->dn_maxblkid >> (db->db_level * epbs));
ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
db->db.db_size);
- ASSERT3U(dn->dn_phys->dn_maxblkid
- >> (db->db_level * epbs), >=, db->db_blkid);
- arc_set_callback(db->db_buf, dbuf_do_evict, db);
}
DB_DNODE_EXIT(db);
mutex_destroy(&dr->dt.di.dr_mtx);
@@ -2601,7 +3451,7 @@
ASSERT(db->db_dirtycnt > 0);
db->db_dirtycnt -= 1;
db->db_data_pending = NULL;
- dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
+ dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg);
}
static void
@@ -2652,11 +3502,13 @@
objset_t *os;
dmu_buf_impl_t *parent = db->db_parent;
uint64_t txg = tx->tx_txg;
- zbookmark_t zb;
+ zbookmark_phys_t zb;
zio_prop_t zp;
zio_t *zio;
int wp_flag = 0;
+ ASSERT(dmu_tx_is_syncing(tx));
+
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
os = dn->dn_objset;
@@ -2715,12 +3567,27 @@
dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
DB_DNODE_EXIT(db);
- if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
- ASSERT(db->db_state != DB_NOFILL);
+ /*
+ * We copy the blkptr now (rather than when we instantiate the dirty
+ * record), because its value can change between open context and
+ * syncing context. We do not need to hold dn_struct_rwlock to read
+ * db_blkptr because we are in syncing context.
+ */
+ dr->dr_bp_copy = *db->db_blkptr;
+
+ if (db->db_level == 0 &&
+ dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
+ /*
+ * The BP for this block has been provided by open context
+ * (by dmu_sync() or dmu_buf_write_embedded()).
+ */
+ void *contents = (data != NULL) ? data->b_data : NULL;
+
dr->dr_zio = zio_write(zio, os->os_spa, txg,
- db->db_blkptr, data->b_data, arc_buf_size(data), &zp,
- dbuf_write_override_ready, dbuf_write_override_done, dr,
- ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
+ &dr->dr_bp_copy, contents, db->db.db_size, &zp,
+ dbuf_write_override_ready, NULL, NULL,
+ dbuf_write_override_done,
+ dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
mutex_enter(&db->db_mtx);
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
@@ -2727,18 +3594,30 @@
dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
mutex_exit(&db->db_mtx);
} else if (db->db_state == DB_NOFILL) {
- ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF);
+ ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
+ zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
dr->dr_zio = zio_write(zio, os->os_spa, txg,
- db->db_blkptr, NULL, db->db.db_size, &zp,
- dbuf_write_nofill_ready, dbuf_write_nofill_done, db,
+ &dr->dr_bp_copy, NULL, db->db.db_size, &zp,
+ dbuf_write_nofill_ready, NULL, NULL,
+ dbuf_write_nofill_done, db,
ZIO_PRIORITY_ASYNC_WRITE,
ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
} else {
ASSERT(arc_released(data));
+
+ /*
+ * For indirect blocks, we want to setup the children
+ * ready callback so that we can properly handle an indirect
+ * block that only contains holes.
+ */
+ arc_done_func_t *children_ready_cb = NULL;
+ if (db->db_level != 0)
+ children_ready_cb = dbuf_write_children_ready;
+
dr->dr_zio = arc_write(zio, os->os_spa, txg,
- db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db),
- DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready,
- dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE,
- ZIO_FLAG_MUSTSUCCEED, &zb);
+ &dr->dr_bp_copy, data, DBUF_IS_L2CACHEABLE(db),
+ &zp, dbuf_write_ready, children_ready_cb,
+ dbuf_write_physdone, dbuf_write_done, db,
+ ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
}
}
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/ddt.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -22,7 +22,7 @@
/*
* Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -66,7 +66,8 @@
spa_t *spa = ddt->ddt_spa;
objset_t *os = ddt->ddt_os;
uint64_t *objectp = &ddt->ddt_object[type][class];
- boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_dedup;
+ boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_flags &
+ ZCHECKSUM_FLAG_DEDUP;
char name[DDT_NAMELEN];
ddt_object_name(ddt, type, class, name);
@@ -120,12 +121,12 @@
error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name,
sizeof (uint64_t), 1, &ddt->ddt_object[type][class]);
- if (error)
+ if (error != 0)
return (error);
- error = zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
+ VERIFY0(zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
- &ddt->ddt_histogram[type][class]);
+ &ddt->ddt_histogram[type][class]));
/*
* Seed the cached statistics.
@@ -140,8 +141,7 @@
ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9;
ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size;
- ASSERT(error == 0);
- return (error);
+ return (0);
}
static void
@@ -418,7 +418,7 @@
ddt_stat_generate(ddt, dde, &dds);
- bucket = highbit(dds.dds_ref_blocks) - 1;
+ bucket = highbit64(dds.dds_ref_blocks) - 1;
ASSERT(bucket >= 0);
ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class];
@@ -596,7 +596,10 @@
bcopy(src, dst, s_len);
}
- *version = (ZFS_HOST_BYTEORDER & DDT_COMPRESS_BYTEORDER_MASK) | cpfunc;
+ *version = cpfunc;
+ /* CONSTCOND */
+ if (ZFS_HOST_BYTEORDER)
+ *version |= DDT_COMPRESS_BYTEORDER_MASK;
return (c_len + 1);
}
@@ -613,7 +616,8 @@
else
bcopy(src, dst, d_len);
- if ((version ^ ZFS_HOST_BYTEORDER) & DDT_COMPRESS_BYTEORDER_MASK)
+ if (((version & DDT_COMPRESS_BYTEORDER_MASK) != 0) !=
+ (ZFS_HOST_BYTEORDER != 0))
byteswap_uint64_array(dst, d_len);
}
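The rewritten test above reduces to a simple rule (behavioral sketch only):
the compressing host records its ZFS_HOST_BYTEORDER truth value in
DDT_COMPRESS_BYTEORDER_MASK, and the decompressing host calls
byteswap_uint64_array() only when its own value differs from the recorded
one; when they match, the data is used as-is.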
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,10 +21,11 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
*/
-
/* Copyright (c) 2013 by Saso Kiselkov. All rights reserved. */
+/* Copyright (c) 2013, Joyent, Inc. All rights reserved. */
+/* Copyright (c) 2014, Nexenta Systems, Inc. All rights reserved. */
#include <sys/dmu.h>
#include <sys/dmu_impl.h>
@@ -45,7 +46,9 @@
#include <sys/zio_checksum.h>
#include <sys/zio_compress.h>
#include <sys/sa.h>
+#include <sys/zfeature.h>
#ifdef _KERNEL
+#include <sys/vm.h>
#include <sys/zfs_znode.h>
#endif
@@ -129,41 +132,99 @@
};
int
-dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
- void *tag, dmu_buf_t **dbp, int flags)
+dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset,
+ void *tag, dmu_buf_t **dbp)
{
+ uint64_t blkid;
+ dmu_buf_impl_t *db;
+
+ blkid = dbuf_whichblock(dn, 0, offset);
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ db = dbuf_hold(dn, blkid, tag);
+ rw_exit(&dn->dn_struct_rwlock);
+
+ if (db == NULL) {
+ *dbp = NULL;
+ return (SET_ERROR(EIO));
+ }
+
+ *dbp = &db->db;
+ return (0);
+}
+int
+dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
+ void *tag, dmu_buf_t **dbp)
+{
dnode_t *dn;
uint64_t blkid;
dmu_buf_impl_t *db;
int err;
- int db_flags = DB_RF_CANFAIL;
- if (flags & DMU_READ_NO_PREFETCH)
- db_flags |= DB_RF_NOPREFETCH;
-
err = dnode_hold(os, object, FTAG, &dn);
if (err)
return (err);
- blkid = dbuf_whichblock(dn, offset);
+ blkid = dbuf_whichblock(dn, 0, offset);
rw_enter(&dn->dn_struct_rwlock, RW_READER);
db = dbuf_hold(dn, blkid, tag);
rw_exit(&dn->dn_struct_rwlock);
+ dnode_rele(dn, FTAG);
+
if (db == NULL) {
- err = SET_ERROR(EIO);
- } else {
+ *dbp = NULL;
+ return (SET_ERROR(EIO));
+ }
+
+ *dbp = &db->db;
+ return (err);
+}
+
+int
+dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
+ void *tag, dmu_buf_t **dbp, int flags)
+{
+ int err;
+ int db_flags = DB_RF_CANFAIL;
+
+ if (flags & DMU_READ_NO_PREFETCH)
+ db_flags |= DB_RF_NOPREFETCH;
+
+ err = dmu_buf_hold_noread_by_dnode(dn, offset, tag, dbp);
+ if (err == 0) {
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
err = dbuf_read(db, NULL, db_flags);
- if (err) {
+ if (err != 0) {
dbuf_rele(db, tag);
- db = NULL;
+ *dbp = NULL;
}
}
- dnode_rele(dn, FTAG);
- *dbp = &db->db; /* NULL db plus first field offset is NULL */
return (err);
}
int
+dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
+ void *tag, dmu_buf_t **dbp, int flags)
+{
+ int err;
+ int db_flags = DB_RF_CANFAIL;
+
+ if (flags & DMU_READ_NO_PREFETCH)
+ db_flags |= DB_RF_NOPREFETCH;
+
+ err = dmu_buf_hold_noread(os, object, offset, tag, dbp);
+ if (err == 0) {
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
+ err = dbuf_read(db, NULL, db_flags);
+ if (err != 0) {
+ dbuf_rele(db, tag);
+ *dbp = NULL;
+ }
+ }
+
+ return (err);
+}
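A minimal caller-side sketch of the hold/read/release pattern these wrappers
implement (FTAG and the flag are the usual conventions, nothing introduced
by this change):

	dmu_buf_t *db;
	int err = dmu_buf_hold(os, object, offset, FTAG, &db,
	    DMU_READ_PREFETCH);
	if (err == 0) {
		/* db->db_data and db->db_size are valid until the release */
		dmu_buf_rele(db, FTAG);
	}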
+
+int
dmu_bonus_max(void)
{
return (DN_MAX_BONUSLEN);
@@ -271,7 +332,7 @@
/* as long as the bonus buf is held, the dnode will be held */
if (refcount_add(&db->db_holds, tag) == 1) {
VERIFY(dnode_add_ref(dn, db));
- (void) atomic_inc_32_nv(&dn->dn_dbufs_count);
+ atomic_inc_32(&dn->dn_dbufs_count);
}
/*
@@ -373,27 +434,29 @@
*/
static int
dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
- int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
+ boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
{
- dsl_pool_t *dp = NULL;
dmu_buf_t **dbp;
uint64_t blkid, nblks, i;
uint32_t dbuf_flags;
int err;
zio_t *zio;
- hrtime_t start;
ASSERT(length <= DMU_MAX_ACCESS);
- dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT;
- if (flags & DMU_READ_NO_PREFETCH || length > zfetch_array_rd_sz)
- dbuf_flags |= DB_RF_NOPREFETCH;
+ /*
+ * Note: We directly notify the prefetch code of this read, so that
+ * we can tell it about the multi-block read. dbuf_read() only knows
+ * about the one block it is accessing.
+ */
+ dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT |
+ DB_RF_NOPREFETCH;
rw_enter(&dn->dn_struct_rwlock, RW_READER);
if (dn->dn_datablkshift) {
int blkshift = dn->dn_datablkshift;
- nblks = (P2ROUNDUP(offset+length, 1ULL<<blkshift) -
- P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift;
+ nblks = (P2ROUNDUP(offset + length, 1ULL << blkshift) -
+ P2ALIGN(offset, 1ULL << blkshift)) >> blkshift;
} else {
if (offset + length > dn->dn_datablksz) {
zfs_panic_recover("zfs: accessing past end of object "
@@ -409,13 +472,10 @@
}
dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
- if (dn->dn_objset->os_dsl_dataset)
- dp = dn->dn_objset->os_dsl_dataset->ds_dir->dd_pool;
- start = gethrtime();
zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
- blkid = dbuf_whichblock(dn, offset);
+ blkid = dbuf_whichblock(dn, 0, offset);
for (i = 0; i < nblks; i++) {
- dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag);
+ dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag);
if (db == NULL) {
rw_exit(&dn->dn_struct_rwlock);
dmu_buf_rele_array(dbp, nblks, tag);
@@ -422,6 +482,7 @@
zio_nowait(zio);
return (SET_ERROR(EIO));
}
+
/* initiate async i/o */
if (read)
(void) dbuf_read(db, zio, dbuf_flags);
@@ -431,13 +492,16 @@
#endif
dbp[i] = &db->db;
}
+
+ if ((flags & DMU_READ_NO_PREFETCH) == 0 &&
+ DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) {
+ dmu_zfetch(&dn->dn_zfetch, blkid, nblks,
+ read && DNODE_IS_CACHEABLE(dn));
+ }
rw_exit(&dn->dn_struct_rwlock);
/* wait for async i/o */
err = zio_wait(zio);
- /* track read overhead when we are in sync context */
- if (dp && dsl_pool_sync_context(dp))
- dp->dp_read_overhead += gethrtime() - start;
if (err) {
dmu_buf_rele_array(dbp, nblks, tag);
return (err);
@@ -487,7 +551,8 @@
int
dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset,
- uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
+ uint64_t length, boolean_t read, void *tag, int *numbufsp,
+ dmu_buf_t ***dbpp)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
dnode_t *dn;
@@ -519,16 +584,22 @@
kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs);
}
+/*
+ * Issue prefetch i/os for the given blocks. If level is greater than 0, the
+ * indirect blocks prefetched will be those that point to the blocks containing
+ * the data starting at offset, and continuing to offset + len.
+ *
+ * Note that if the indirect blocks above the blocks being prefetched are not in
+ * cache, they will be asynchronously read in.
+ */
void
-dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
+dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
+ uint64_t len, zio_priority_t pri)
{
dnode_t *dn;
uint64_t blkid;
- int nblks, i, err;
+ int nblks, err;
- if (zfs_prefetch_disable)
- return;
-
if (len == 0) { /* they're interested in the bonus buffer */
dn = DMU_META_DNODE(os);
@@ -536,8 +607,9 @@
return;
rw_enter(&dn->dn_struct_rwlock, RW_READER);
- blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t));
- dbuf_prefetch(dn, blkid);
+ blkid = dbuf_whichblock(dn, level,
+ object * sizeof (dnode_phys_t));
+ dbuf_prefetch(dn, level, blkid, pri, 0);
rw_exit(&dn->dn_struct_rwlock);
return;
}
@@ -552,18 +624,24 @@
return;
rw_enter(&dn->dn_struct_rwlock, RW_READER);
- if (dn->dn_datablkshift) {
- int blkshift = dn->dn_datablkshift;
- nblks = (P2ROUNDUP(offset+len, 1<<blkshift) -
- P2ALIGN(offset, 1<<blkshift)) >> blkshift;
+ /*
+ * offset + len - 1 is the last byte we want to prefetch for, and offset
+ * is the first. Then dbuf_whichblock(dn, level, off + len - 1) is the
+ * last block we want to prefetch, and dbuf_whichblock(dn, level,
+ * offset) is the first. Then the number we need to prefetch is the
+ * last - first + 1.
+ */
+ if (level > 0 || dn->dn_datablkshift != 0) {
+ nblks = dbuf_whichblock(dn, level, offset + len - 1) -
+ dbuf_whichblock(dn, level, offset) + 1;
} else {
nblks = (offset < dn->dn_datablksz);
}
if (nblks != 0) {
- blkid = dbuf_whichblock(dn, offset);
- for (i = 0; i < nblks; i++)
- dbuf_prefetch(dn, blkid+i);
+ blkid = dbuf_whichblock(dn, level, offset);
+ for (int i = 0; i < nblks; i++)
+ dbuf_prefetch(dn, level, blkid + i, pri, 0);
}
rw_exit(&dn->dn_struct_rwlock);
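
As a rough illustration (not part of the patch), a caller of the new dmu_prefetch() signature can warm both the data blocks and the level-1 indirects covering a range; the object, range, and priority below are chosen only for the example.

	/* warm 1MB of level-0 data blocks starting at offset 0 */
	dmu_prefetch(os, object, 0, 0, 1ULL << 20, ZIO_PRIORITY_SYNC_READ);
	/* additionally warm the level-1 indirect blocks covering that range */
	dmu_prefetch(os, object, 1, 0, 1ULL << 20, ZIO_PRIORITY_SYNC_READ);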
@@ -576,98 +654,99 @@
* the end so that the file gets shorter over time (if we crash in the
* middle, this will leave us in a better state). We find allocated file
* data by simply searching the allocated level 1 indirects.
+ *
+ * On input, *start should be the first offset that does not need to be
+ * freed (e.g. "offset + length"). On return, *start will be the first
+ * offset that should be freed.
*/
static int
-get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t limit)
+get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum)
{
- uint64_t len = *start - limit;
- uint64_t blkcnt = 0;
- uint64_t maxblks = DMU_MAX_ACCESS / (1ULL << (dn->dn_indblkshift + 1));
+ uint64_t maxblks = DMU_MAX_ACCESS >> (dn->dn_indblkshift + 1);
+ /* bytes of data covered by a level-1 indirect block */
uint64_t iblkrange =
dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT);
- ASSERT(limit <= *start);
+ ASSERT3U(minimum, <=, *start);
- if (len <= iblkrange * maxblks) {
- *start = limit;
+ if (*start - minimum <= iblkrange * maxblks) {
+ *start = minimum;
return (0);
}
ASSERT(ISP2(iblkrange));
- while (*start > limit && blkcnt < maxblks) {
+ for (uint64_t blks = 0; *start > minimum && blks < maxblks; blks++) {
int err;
- /* find next allocated L1 indirect */
+ /*
+ * dnode_next_offset(BACKWARDS) will find an allocated L1
+ * indirect block at or before the input offset. We must
+ * decrement *start so that it is at the end of the region
+ * to search.
+ */
+ (*start)--;
err = dnode_next_offset(dn,
DNODE_FIND_BACKWARDS, start, 2, 1, 0);
- /* if there are no more, then we are done */
+ /* if there are no indirect blocks before start, we are done */
if (err == ESRCH) {
- *start = limit;
- return (0);
- } else if (err) {
+ *start = minimum;
+ break;
+ } else if (err != 0) {
return (err);
}
- blkcnt += 1;
- /* reset offset to end of "next" block back */
+ /* set start to the beginning of this L1 indirect */
*start = P2ALIGN(*start, iblkrange);
- if (*start <= limit)
- *start = limit;
- else
- *start -= 1;
}
+ if (*start < minimum)
+ *start = minimum;
return (0);
}
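
A worked example of the sizing above, assuming the default 128K indirect block size and 128K data blocks (the numbers are illustrative only):

	/*
	 * EPB(17, SPA_BLKPTRSHIFT) = 1 << (17 - 7) = 1024 block pointers per
	 * L1 indirect, so iblkrange = 128K * 1024 = 128M of file data per L1.
	 * Each call to get_next_chunk() therefore steps back over at most
	 * maxblks L1 ranges, i.e. on the order of maxblks * 128M bytes, which
	 * bounds how much dmu_free_long_range_impl() frees per transaction.
	 */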
static int
dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
- uint64_t length, boolean_t free_dnode)
+ uint64_t length)
{
- dmu_tx_t *tx;
- uint64_t object_size, start, end, len;
- boolean_t trunc = (length == DMU_OBJECT_END);
- int align, err;
+ uint64_t object_size = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
+ int err;
- align = 1 << dn->dn_datablkshift;
- ASSERT(align > 0);
- object_size = align == 1 ? dn->dn_datablksz :
- (dn->dn_maxblkid + 1) << dn->dn_datablkshift;
-
- end = offset + length;
- if (trunc || end > object_size)
- end = object_size;
- if (end <= offset)
+ if (offset >= object_size)
return (0);
- length = end - offset;
- while (length) {
- start = end;
- /* assert(offset <= start) */
- err = get_next_chunk(dn, &start, offset);
+ if (length == DMU_OBJECT_END || offset + length > object_size)
+ length = object_size - offset;
+
+ while (length != 0) {
+ uint64_t chunk_end, chunk_begin;
+
+ chunk_end = chunk_begin = offset + length;
+
+ /* move chunk_begin backwards to the beginning of this chunk */
+ err = get_next_chunk(dn, &chunk_begin, offset);
if (err)
return (err);
- len = trunc ? DMU_OBJECT_END : end - start;
+ ASSERT3U(chunk_begin, >=, offset);
+ ASSERT3U(chunk_begin, <=, chunk_end);
- tx = dmu_tx_create(os);
- dmu_tx_hold_free(tx, dn->dn_object, start, len);
+ dmu_tx_t *tx = dmu_tx_create(os);
+ dmu_tx_hold_free(tx, dn->dn_object,
+ chunk_begin, chunk_end - chunk_begin);
+
+ /*
+ * Mark this transaction as typically resulting in a net
+ * reduction in space used.
+ */
+ dmu_tx_mark_netfree(tx);
err = dmu_tx_assign(tx, TXG_WAIT);
if (err) {
dmu_tx_abort(tx);
return (err);
}
+ dnode_free_range(dn, chunk_begin, chunk_end - chunk_begin, tx);
+ dmu_tx_commit(tx);
- dnode_free_range(dn, start, trunc ? -1 : len, tx);
-
- if (start == 0 && free_dnode) {
- ASSERT(trunc);
- dnode_free(dn, tx);
- }
-
- length -= end - start;
-
- dmu_tx_commit(tx);
- end = start;
+ length -= chunk_end - chunk_begin;
}
return (0);
}
@@ -682,38 +761,43 @@
err = dnode_hold(os, object, FTAG, &dn);
if (err != 0)
return (err);
- err = dmu_free_long_range_impl(os, dn, offset, length, FALSE);
+ err = dmu_free_long_range_impl(os, dn, offset, length);
+
+ /*
+ * It is important to zero out the maxblkid when freeing the entire
+ * file, so that (a) subsequent calls to dmu_free_long_range_impl()
+ * will take the fast path, and (b) dnode_reallocate() can verify
+ * that the entire file has been freed.
+ */
+ if (err == 0 && offset == 0 && length == DMU_OBJECT_END)
+ dn->dn_maxblkid = 0;
+
dnode_rele(dn, FTAG);
return (err);
}
int
-dmu_free_object(objset_t *os, uint64_t object)
+dmu_free_long_object(objset_t *os, uint64_t object)
{
- dnode_t *dn;
dmu_tx_t *tx;
int err;
- err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED,
- FTAG, &dn);
+ err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END);
if (err != 0)
return (err);
- if (dn->dn_nlevels == 1) {
- tx = dmu_tx_create(os);
- dmu_tx_hold_bonus(tx, object);
- dmu_tx_hold_free(tx, dn->dn_object, 0, DMU_OBJECT_END);
- err = dmu_tx_assign(tx, TXG_WAIT);
- if (err == 0) {
- dnode_free_range(dn, 0, DMU_OBJECT_END, tx);
- dnode_free(dn, tx);
- dmu_tx_commit(tx);
- } else {
- dmu_tx_abort(tx);
- }
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_bonus(tx, object);
+ dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
+ dmu_tx_mark_netfree(tx);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err == 0) {
+ err = dmu_object_free(os, object, tx);
+ dmu_tx_commit(tx);
} else {
- err = dmu_free_long_range_impl(os, dn, 0, DMU_OBJECT_END, TRUE);
+ dmu_tx_abort(tx);
}
- dnode_rele(dn, FTAG);
+
return (err);
}
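
A hedged usage sketch (the caller is hypothetical): the renamed dmu_free_long_object() assigns its own transactions, so no tx may be held across the call.

	/* frees every data block of the object, then the dnode itself */
	error = dmu_free_long_object(os, object);
	if (error != 0)
		return (error);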
@@ -854,6 +938,25 @@
dmu_buf_rele_array(dbp, numbufs, FTAG);
}
+void
+dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
+ void *data, uint8_t etype, uint8_t comp, int uncompressed_size,
+ int compressed_size, int byteorder, dmu_tx_t *tx)
+{
+ dmu_buf_t *db;
+
+ ASSERT3U(etype, <, NUM_BP_EMBEDDED_TYPES);
+ ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS);
+ VERIFY0(dmu_buf_hold_noread(os, object, offset,
+ FTAG, &db));
+
+ dmu_buf_write_embedded(db,
+ data, (bp_embedded_type_t)etype, (enum zio_compress)comp,
+ uncompressed_size, compressed_size, byteorder, tx);
+
+ dmu_buf_rele(db, FTAG);
+}
+
/*
* DMU support for xuio
*/
@@ -978,8 +1081,8 @@
}
#ifdef _KERNEL
-int
-dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
+static int
+dmu_read_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size)
{
dmu_buf_t **dbp;
int numbufs, i, err;
@@ -989,8 +1092,8 @@
* NB: we could do this block-at-a-time, but it's nice
* to be reading in parallel.
*/
- err = dmu_buf_hold_array(os, object, uio->uio_loffset, size, TRUE, FTAG,
- &numbufs, &dbp);
+ err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size,
+ TRUE, FTAG, &numbufs, &dbp, 0);
if (err)
return (err);
@@ -1024,8 +1127,13 @@
else
XUIOSTAT_BUMP(xuiostat_rbuf_copied);
} else {
+#ifdef illumos
err = uiomove((char *)db->db_data + bufoff, tocpy,
UIO_READ, uio);
+#else
+ err = vn_io_fault_uiomove((char *)db->db_data + bufoff,
+ tocpy, uio);
+#endif
}
if (err)
break;
@@ -1037,6 +1145,58 @@
return (err);
}
+/*
+ * Read 'size' bytes into the uio buffer.
+ * From object zdb->db_object.
+ * Starting at offset uio->uio_loffset.
+ *
+ * If the caller already has a dbuf in the target object
+ * (e.g. its bonus buffer), this routine is faster than dmu_read_uio(),
+ * because we don't have to find the dnode_t for the object.
+ */
+int
+dmu_read_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
+ dnode_t *dn;
+ int err;
+
+ if (size == 0)
+ return (0);
+
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ err = dmu_read_uio_dnode(dn, uio, size);
+ DB_DNODE_EXIT(db);
+
+ return (err);
+}
+
+/*
+ * Read 'size' bytes into the uio buffer.
+ * From the specified object
+ * Starting at offset uio->uio_loffset.
+ */
+int
+dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
+{
+ dnode_t *dn;
+ int err;
+
+ if (size == 0)
+ return (0);
+
+ err = dnode_hold(os, object, FTAG, &dn);
+ if (err)
+ return (err);
+
+ err = dmu_read_uio_dnode(dn, uio, size);
+
+ dnode_rele(dn, FTAG);
+
+ return (err);
+}
+
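
Sketch of the intended fast path; the znode/SA names below come from the ZPL and are assumptions for this example, not part of the patch.

	/*
	 * The znode's SA/bonus dbuf already pins the dnode, so this skips
	 * the dnode_hold() that dmu_read_uio() would otherwise perform.
	 */
	error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, nbytes);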
static int
dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx)
{
@@ -1067,6 +1227,7 @@
else
dmu_buf_will_dirty(db, tx);
+#ifdef illumos
/*
* XXX uiomove could block forever (eg. nfs-backed
* pages). There needs to be a uiolockdown() function
@@ -1075,6 +1236,10 @@
*/
err = uiomove((char *)db->db_data + bufoff, tocpy,
UIO_WRITE, uio);
+#else
+ err = vn_io_fault_uiomove((char *)db->db_data + bufoff, tocpy,
+ uio);
+#endif
if (tocpy == db->db_size)
dmu_buf_fill_done(db, tx);
@@ -1089,6 +1254,15 @@
return (err);
}
+/*
+ * Write 'size' bytes from the uio buffer.
+ * To object zdb->db_object.
+ * Starting at offset uio->uio_loffset.
+ *
+ * If the caller already has a dbuf in the target object
+ * (e.g. its bonus buffer), this routine is faster than dmu_write_uio(),
+ * because we don't have to find the dnode_t for the object.
+ */
int
dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size,
dmu_tx_t *tx)
@@ -1108,6 +1282,11 @@
return (err);
}
+/*
+ * Write 'size' bytes from the uio buffer.
+ * To the specified object.
+ * Starting at offset uio->uio_loffset.
+ */
int
dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size,
dmu_tx_t *tx)
@@ -1129,7 +1308,7 @@
return (err);
}
-#ifdef sun
+#ifdef illumos
int
dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
page_t *pp, dmu_tx_t *tx)
@@ -1184,9 +1363,67 @@
dmu_buf_rele_array(dbp, numbufs, FTAG);
return (err);
}
-#endif /* sun */
-#endif
+#else /* !illumos */
+
+int
+dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+ vm_page_t *ma, dmu_tx_t *tx)
+{
+ dmu_buf_t **dbp;
+ struct sf_buf *sf;
+ int numbufs, i;
+ int err;
+
+ if (size == 0)
+ return (0);
+
+ err = dmu_buf_hold_array(os, object, offset, size,
+ FALSE, FTAG, &numbufs, &dbp);
+ if (err)
+ return (err);
+
+ for (i = 0; i < numbufs; i++) {
+ int tocpy, copied, thiscpy;
+ int bufoff;
+ dmu_buf_t *db = dbp[i];
+ caddr_t va;
+
+ ASSERT(size > 0);
+ ASSERT3U(db->db_size, >=, PAGESIZE);
+
+ bufoff = offset - db->db_offset;
+ tocpy = (int)MIN(db->db_size - bufoff, size);
+
+ ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
+
+ if (tocpy == db->db_size)
+ dmu_buf_will_fill(db, tx);
+ else
+ dmu_buf_will_dirty(db, tx);
+
+ for (copied = 0; copied < tocpy; copied += PAGESIZE) {
+ ASSERT3U(ptoa((*ma)->pindex), ==, db->db_offset + bufoff);
+ thiscpy = MIN(PAGESIZE, tocpy - copied);
+ va = zfs_map_page(*ma, &sf);
+ bcopy(va, (char *)db->db_data + bufoff, thiscpy);
+ zfs_unmap_page(sf);
+ ma += 1;
+ bufoff += PAGESIZE;
+ }
+
+ if (tocpy == db->db_size)
+ dmu_buf_fill_done(db, tx);
+
+ offset += tocpy;
+ size -= tocpy;
+ }
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
+ return (err);
+}
+#endif /* illumos */
+#endif /* _KERNEL */
+
/*
* Allocate a loaned anonymous arc buffer.
*/
@@ -1194,10 +1431,8 @@
dmu_request_arcbuf(dmu_buf_t *handle, int size)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle;
- spa_t *spa;
- DB_GET_SPA(&spa, db);
- return (arc_loan_buf(spa, size));
+ return (arc_loan_buf(db->db_objset->os_spa, size));
}
/*
@@ -1207,7 +1442,7 @@
dmu_return_arcbuf(arc_buf_t *buf)
{
arc_return_buf(buf, FTAG);
- VERIFY(arc_buf_remove_ref(buf, FTAG));
+ arc_buf_destroy(buf, FTAG);
}
/*
@@ -1228,12 +1463,22 @@
DB_DNODE_ENTER(dbuf);
dn = DB_DNODE(dbuf);
rw_enter(&dn->dn_struct_rwlock, RW_READER);
- blkid = dbuf_whichblock(dn, offset);
+ blkid = dbuf_whichblock(dn, 0, offset);
VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL);
rw_exit(&dn->dn_struct_rwlock);
DB_DNODE_EXIT(dbuf);
- if (offset == db->db.db_offset && blksz == db->db.db_size) {
+ /*
+ * We can only assign if the offset is aligned, the arc buf is the
+ * same size as the dbuf, and the dbuf is not metadata. It
+ * can't be metadata because the loaned arc buf comes from the
+ * user-data kmem arena.
+ */
+ if (offset == db->db.db_offset && blksz == db->db.db_size &&
+ DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA) {
+#ifdef _KERNEL
+ curthread->td_ru.ru_oublock++;
+#endif
dbuf_assign_arcbuf(db, buf, tx);
dbuf_rele(db, FTAG);
} else {
@@ -1275,7 +1520,7 @@
* block size still needs to be known for replay.
*/
BP_SET_LSIZE(bp, db->db_size);
- } else {
+ } else if (!BP_IS_EMBEDDED(bp)) {
ASSERT(BP_GET_LEVEL(bp) == 0);
bp->blk_fill = 1;
}
@@ -1307,12 +1552,25 @@
ASSERT(BP_EQUAL(bp, bp_orig));
ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF);
- ASSERT(zio_checksum_table[chksum].ci_dedup);
+ ASSERT(zio_checksum_table[chksum].ci_flags &
+ ZCHECKSUM_FLAG_NOPWRITE);
}
dr->dt.dl.dr_overridden_by = *zio->io_bp;
dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
dr->dt.dl.dr_copies = zio->io_prop.zp_copies;
- if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by))
+
+ /*
+ * Old style holes are filled with all zeros, whereas
+ * new-style holes maintain their lsize, type, level,
+ * and birth time (see zio_write_compress). While we
+ * need to reset the BP_SET_LSIZE() call that happened
+ * in dmu_sync_ready for old style holes, we do *not*
+ * want to wipe out the information contained in new
+ * style holes. Thus, only zero out the block pointer if
+ * it's an old style hole.
+ */
+ if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by) &&
+ dr->dt.dl.dr_overridden_by.blk_birth == 0)
BP_ZERO(&dr->dt.dl.dr_overridden_by);
} else {
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
@@ -1357,7 +1615,7 @@
static int
dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
- zio_prop_t *zp, zbookmark_t *zb)
+ zio_prop_t *zp, zbookmark_phys_t *zb)
{
dmu_sync_arg_t *dsa;
dmu_tx_t *tx;
@@ -1376,10 +1634,11 @@
dsa->dsa_zgd = zgd;
dsa->dsa_tx = tx;
- zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
- zgd->zgd_db->db_data, zgd->zgd_db->db_size, zp,
- dmu_sync_late_arrival_ready, dmu_sync_late_arrival_done, dsa,
- ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));
+ zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx),
+ zgd->zgd_bp, zgd->zgd_db->db_data, zgd->zgd_db->db_size,
+ zp, dmu_sync_late_arrival_ready, NULL,
+ NULL, dmu_sync_late_arrival_done, dsa, ZIO_PRIORITY_SYNC_WRITE,
+ ZIO_FLAG_CANFAIL, zb));
return (0);
}
@@ -1418,7 +1677,7 @@
dsl_dataset_t *ds = os->os_dsl_dataset;
dbuf_dirty_record_t *dr;
dmu_sync_arg_t *dsa;
- zbookmark_t zb;
+ zbookmark_phys_t zb;
zio_prop_t zp;
dnode_t *dn;
@@ -1480,19 +1739,32 @@
ASSERT(dr->dr_next == NULL || dr->dr_next->dr_txg < txg);
/*
- * Assume the on-disk data is X, the current syncing data is Y,
- * and the current in-memory data is Z (currently in dmu_sync).
- * X and Z are identical but Y is has been modified. Normally,
- * when X and Z are the same we will perform a nopwrite but if Y
- * is different we must disable nopwrite since the resulting write
- * of Y to disk can free the block containing X. If we allowed a
- * nopwrite to occur the block pointing to Z would reference a freed
- * block. Since this is a rare case we simplify this by disabling
- * nopwrite if the current dmu_sync-ing dbuf has been modified in
- * a previous transaction.
+ * Assume the on-disk data is X, the current syncing data (in
+ * txg - 1) is Y, and the current in-memory data is Z (currently
+ * in dmu_sync).
+ *
+ * We usually want to perform a nopwrite if X and Z are the
+ * same. However, if Y is different (i.e. the BP is going to
+ * change before this write takes effect), then a nopwrite will
+ * be incorrect - we would override with X, which could have
+ * been freed when Y was written.
+ *
+ * (Note that this is not a concern when we are nop-writing from
+ * syncing context, because X and Y must be identical, because
+ * all previous txgs have been synced.)
+ *
+ * Therefore, we disable nopwrite if the current BP could change
+ * before this TXG. There are two ways it could change: by
+ * being dirty (dr_next is non-NULL), or by being freed
+ * (dnode_block_freed()). This behavior is verified by
+ * zio_done(), which VERIFYs that the override BP is identical
+ * to the on-disk BP.
*/
- if (dr->dr_next)
+ DB_DNODE_ENTER(db);
+ dn = DB_DNODE(db);
+ if (dr->dr_next != NULL || dnode_block_freed(dn, db->db_blkid))
zp.zp_nopwrite = B_FALSE;
+ DB_DNODE_EXIT(db);
ASSERT(dr->dr_txg == txg);
if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC ||
@@ -1518,8 +1790,8 @@
zio_nowait(arc_write(pio, os->os_spa, txg,
bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db),
- DBUF_IS_L2COMPRESSIBLE(db), &zp, dmu_sync_ready, dmu_sync_done,
- dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
+ &zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa,
+ ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
return (0);
}
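
An illustrative timeline for the nopwrite comment above; the txg numbers are hypothetical.

	/*
	 * txg 10 (on disk):  block B0 holds data X
	 * txg 11 (syncing):  the dbuf was rewritten with Y, so B0 will be
	 *                    freed once txg 11 syncs
	 * txg 12 (open):     dmu_sync() sees in-memory data Z, with Z == X
	 * A nopwrite here would leave the log record pointing at B0, which
	 * txg 11 is about to free; hence zp_nopwrite is cleared when
	 * dr_next != NULL or dnode_block_freed() reports the block as freed.
	 */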
@@ -1526,7 +1798,7 @@
int
dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
- dmu_tx_t *tx)
+ dmu_tx_t *tx)
{
dnode_t *dn;
int err;
@@ -1541,13 +1813,19 @@
void
dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
- dmu_tx_t *tx)
+ dmu_tx_t *tx)
{
dnode_t *dn;
- /* XXX assumes dnode_hold will not get an i/o error */
- (void) dnode_hold(os, object, FTAG, &dn);
- ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS);
+ /*
+ * Send streams include each object's checksum function. This
+ * check ensures that the receiving system can understand the
+ * checksum function transmitted.
+ */
+ ASSERT3U(checksum, <, ZIO_CHECKSUM_LEGACY_FUNCTIONS);
+
+ VERIFY0(dnode_hold(os, object, FTAG, &dn));
+ ASSERT3U(checksum, <, ZIO_CHECKSUM_FUNCTIONS);
dn->dn_checksum = checksum;
dnode_setdirty(dn, tx);
dnode_rele(dn, FTAG);
@@ -1555,13 +1833,18 @@
void
dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
- dmu_tx_t *tx)
+ dmu_tx_t *tx)
{
dnode_t *dn;
- /* XXX assumes dnode_hold will not get an i/o error */
- (void) dnode_hold(os, object, FTAG, &dn);
- ASSERT(compress < ZIO_COMPRESS_FUNCTIONS);
+ /*
+ * Send streams include each object's compression function. This
+ * check ensures that the receiving system can understand the
+ * compression function transmitted.
+ */
+ ASSERT3U(compress, <, ZIO_COMPRESS_LEGACY_FUNCTIONS);
+
+ VERIFY0(dnode_hold(os, object, FTAG, &dn));
dn->dn_compress = compress;
dnode_setdirty(dn, tx);
dnode_rele(dn, FTAG);
@@ -1572,6 +1855,12 @@
SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RW,
&zfs_mdcomp_disable, 0, "Disable metadata compression");
+/*
+ * When the "redundant_metadata" property is set to "most", only indirect
+ * blocks of this level and higher will have an additional ditto block.
+ */
+int zfs_redundant_metadata_most_ditto_level = 2;
+
void
dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
{
@@ -1594,12 +1883,16 @@
* 3. all other level 0 blocks
*/
if (ismd) {
- /*
- * XXX -- we should design a compression algorithm
- * that specializes in arrays of bps.
- */
- compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY :
- ZIO_COMPRESS_LZJB;
+ if (zfs_mdcomp_disable) {
+ compress = ZIO_COMPRESS_EMPTY;
+ } else {
+ /*
+ * XXX -- we should design a compression algorithm
+ * that specializes in arrays of bps.
+ */
+ compress = zio_compress_select(os->os_spa,
+ ZIO_COMPRESS_ON, ZIO_COMPRESS_ON);
+ }
/*
* Metadata always gets checksummed. If the data
@@ -1608,9 +1901,18 @@
* as well. Otherwise, the metadata checksum defaults
* to fletcher4.
*/
- if (zio_checksum_table[checksum].ci_correctable < 1 ||
- zio_checksum_table[checksum].ci_eck)
+ if (!(zio_checksum_table[checksum].ci_flags &
+ ZCHECKSUM_FLAG_METADATA) ||
+ (zio_checksum_table[checksum].ci_flags &
+ ZCHECKSUM_FLAG_EMBEDDED))
checksum = ZIO_CHECKSUM_FLETCHER_4;
+
+ if (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_ALL ||
+ (os->os_redundant_metadata ==
+ ZFS_REDUNDANT_METADATA_MOST &&
+ (level >= zfs_redundant_metadata_most_ditto_level ||
+ DMU_OT_IS_METADATA(type) || (wp & WP_SPILL))))
+ copies++;
} else if (wp & WP_NOFILL) {
ASSERT(level == 0);
@@ -1622,9 +1924,10 @@
* pipeline.
*/
compress = ZIO_COMPRESS_OFF;
- checksum = ZIO_CHECKSUM_OFF;
+ checksum = ZIO_CHECKSUM_NOPARITY;
} else {
- compress = zio_compress_select(dn->dn_compress, compress);
+ compress = zio_compress_select(os->os_spa, dn->dn_compress,
+ compress);
checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ?
zio_checksum_select(dn->dn_checksum, checksum) :
@@ -1640,17 +1943,20 @@
*/
if (dedup_checksum != ZIO_CHECKSUM_OFF) {
dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE;
- if (!zio_checksum_table[checksum].ci_dedup)
+ if (!(zio_checksum_table[checksum].ci_flags &
+ ZCHECKSUM_FLAG_DEDUP))
dedup_verify = B_TRUE;
}
/*
- * Enable nopwrite if we have a cryptographically secure
- * checksum that has no known collisions (i.e. SHA-256)
- * and compression is enabled. We don't enable nopwrite if
- * dedup is enabled as the two features are mutually exclusive.
+ * Enable nopwrite if we have a secure enough checksum
+ * algorithm (see comment in zio_nop_write) and
+ * compression is enabled. We don't enable nopwrite if
+ * dedup is enabled as the two features are mutually
+ * exclusive.
*/
- nopwrite = (!dedup && zio_checksum_table[checksum].ci_dedup &&
+ nopwrite = (!dedup && (zio_checksum_table[checksum].ci_flags &
+ ZCHECKSUM_FLAG_NOPWRITE) &&
compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled);
}
@@ -1658,7 +1964,7 @@
zp->zp_compress = compress;
zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type;
zp->zp_level = level;
- zp->zp_copies = MIN(copies + ismd, spa_max_replication(os->os_spa));
+ zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa));
zp->zp_dedup = dedup;
zp->zp_dedup_verify = dedup && dedup_verify;
zp->zp_nopwrite = nopwrite;
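
The new ditto-copy rule, restated as a stand-alone predicate for illustration; this is an editor's paraphrase of the condition added above, not code from the patch.

static boolean_t
example_gets_extra_ditto(uint64_t redundant_metadata, int level,
    dmu_object_type_t type, boolean_t spill)
{
	/* mirrors the copies++ condition in dmu_write_policy() */
	return (redundant_metadata == ZFS_REDUNDANT_METADATA_ALL ||
	    (redundant_metadata == ZFS_REDUNDANT_METADATA_MOST &&
	    (level >= zfs_redundant_metadata_most_ditto_level ||
	    DMU_OT_IS_METADATA(type) || spill)));
}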
@@ -1668,25 +1974,20 @@
dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
{
dnode_t *dn;
- int i, err;
+ int err;
- err = dnode_hold(os, object, FTAG, &dn);
- if (err)
- return (err);
/*
* Sync any current changes before
* we go trundling through the block pointers.
*/
- for (i = 0; i < TXG_SIZE; i++) {
- if (list_link_active(&dn->dn_dirty_link[i]))
- break;
+ err = dmu_object_wait_synced(os, object);
+ if (err) {
+ return (err);
}
- if (i != TXG_SIZE) {
- dnode_rele(dn, FTAG);
- txg_wait_synced(dmu_objset_pool(os), 0);
- err = dnode_hold(os, object, FTAG, &dn);
- if (err)
- return (err);
+
+ err = dnode_hold(os, object, FTAG, &dn);
+ if (err) {
+ return (err);
}
err = dnode_next_offset(dn, (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0);
@@ -1695,6 +1996,37 @@
return (err);
}
+/*
+ * Given the ZFS object, if it contains any dirty nodes
+ * this function flushes all dirty blocks to disk. This
+ * ensures the DMU object info is updated. A more efficient
+ * future version might just find the TXG with the maximum
+ * ID and wait for that to be synced.
+ */
+int
+dmu_object_wait_synced(objset_t *os, uint64_t object)
+{
+ dnode_t *dn;
+ int error, i;
+
+ error = dnode_hold(os, object, FTAG, &dn);
+ if (error) {
+ return (error);
+ }
+
+ for (i = 0; i < TXG_SIZE; i++) {
+ if (list_link_active(&dn->dn_dirty_link[i])) {
+ break;
+ }
+ }
+ dnode_rele(dn, FTAG);
+ if (i != TXG_SIZE) {
+ txg_wait_synced(dmu_objset_pool(os), 0);
+ }
+
+ return (0);
+}
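
A minimal caller sketch, assuming a context that wants up-to-date on-disk object info; dmu_object_info() is an existing DMU entry point, and the surrounding variables are hypothetical.

	dmu_object_info_t doi;

	/* flush any dirty state for the object before reading its info */
	error = dmu_object_wait_synced(os, object);
	if (error == 0)
		error = dmu_object_info(os, object, &doi);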
+
void
dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
{
@@ -1714,11 +2046,12 @@
doi->doi_indirection = dn->dn_nlevels;
doi->doi_checksum = dn->dn_checksum;
doi->doi_compress = dn->dn_compress;
+ doi->doi_nblkptr = dn->dn_nblkptr;
doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9;
doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
doi->doi_fill_count = 0;
for (int i = 0; i < dnp->dn_nblkptr; i++)
- doi->doi_fill_count += dnp->dn_blkptr[i].blk_fill;
+ doi->doi_fill_count += BP_GET_FILL(&dnp->dn_blkptr[i]);
mutex_exit(&dn->dn_mtx);
rw_exit(&dn->dn_struct_rwlock);
@@ -1831,10 +2164,11 @@
xuio_stat_init();
dmu_objset_init();
dnode_init();
- dbuf_init();
zfetch_init();
+ zio_compress_init();
l2arc_init();
arc_init();
+ dbuf_init();
}
void
@@ -1843,6 +2177,7 @@
arc_fini(); /* arc depends on l2arc, so arc must go first */
l2arc_fini();
zfetch_fini();
+ zio_compress_fini();
dbuf_fini();
dnode_fini();
dmu_objset_fini();
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_diff.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,7 +21,7 @@
*/
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
*/
#include <sys/dmu.h>
@@ -131,7 +131,7 @@
/* ARGSUSED */
static int
diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
- const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
+ const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
struct diffarg *da = arg;
int err = 0;
@@ -139,10 +139,10 @@
if (issig(JUSTLOOKING) && issig(FORREAL))
return (SET_ERROR(EINTR));
- if (zb->zb_object != DMU_META_DNODE_OBJECT)
+ if (bp == NULL || zb->zb_object != DMU_META_DNODE_OBJECT)
return (0);
- if (bp == NULL) {
+ if (BP_IS_HOLE(bp)) {
uint64_t span = DBP_SPAN(dnp, zb->zb_level);
uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
@@ -153,7 +153,7 @@
} else if (zb->zb_level == 0) {
dnode_phys_t *blk;
arc_buf_t *abuf;
- uint32_t aflags = ARC_WAIT;
+ arc_flags_t aflags = ARC_FLAG_WAIT;
int blksz = BP_GET_LSIZE(bp);
int i;
@@ -170,7 +170,7 @@
if (err)
break;
}
- (void) arc_buf_remove_ref(abuf, &abuf);
+ arc_buf_destroy(abuf, &abuf);
if (err)
return (err);
/* Don't care about the data blocks */
@@ -215,7 +215,7 @@
return (error);
}
- if (!dsl_dataset_is_before(tosnap, fromsnap)) {
+ if (!dsl_dataset_is_before(tosnap, fromsnap, 0)) {
dsl_dataset_rele(fromsnap, FTAG);
dsl_dataset_rele(tosnap, FTAG);
dsl_pool_rele(dp, FTAG);
@@ -222,7 +222,7 @@
return (SET_ERROR(EXDEV));
}
- fromtxg = fromsnap->ds_phys->ds_creation_txg;
+ fromtxg = dsl_dataset_phys(fromsnap)->ds_creation_txg;
dsl_dataset_rele(fromsnap, FTAG);
dsl_dataset_long_hold(tosnap, FTAG);
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,7 +21,8 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
+ * Copyright 2014 HybridCluster. All rights reserved.
*/
#include <sys/dmu.h>
@@ -28,6 +29,8 @@
#include <sys/dmu_objset.h>
#include <sys/dmu_tx.h>
#include <sys/dnode.h>
+#include <sys/zap.h>
+#include <sys/zfeature.h>
uint64_t
dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
@@ -48,6 +51,12 @@
* reasonably sparse (at most 1/4 full). Look from the
* beginning once, but after that keep looking from here.
* If we can't find one, just keep going from here.
+ *
+ * Note that dmu_traverse depends on the behavior that we use
+ * multiple blocks of the dnode object before going back to
+ * reuse objects. Any change to this algorithm should preserve
+ * that property or find another solution to the issues
+ * described in traverse_visitbp.
*/
if (P2PHASE(object, L2_dnode_count) == 0) {
uint64_t offset = restarted ? object << DNODE_SHIFT : 0;
@@ -106,11 +115,9 @@
int
dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
- int blocksize, dmu_object_type_t bonustype, int bonuslen)
+ int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
dnode_t *dn;
- dmu_tx_t *tx;
- int nblkptr;
int err;
if (object == DMU_META_DNODE_OBJECT)
@@ -121,44 +128,9 @@
if (err)
return (err);
- if (dn->dn_type == ot && dn->dn_datablksz == blocksize &&
- dn->dn_bonustype == bonustype && dn->dn_bonuslen == bonuslen) {
- /* nothing is changing, this is a noop */
- dnode_rele(dn, FTAG);
- return (0);
- }
-
- if (bonustype == DMU_OT_SA) {
- nblkptr = 1;
- } else {
- nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
- }
-
- /*
- * If we are losing blkptrs or changing the block size this must
- * be a new file instance. We must clear out the previous file
- * contents before we can change this type of metadata in the dnode.
- */
- if (dn->dn_nblkptr > nblkptr || dn->dn_datablksz != blocksize) {
- err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END);
- if (err)
- goto out;
- }
-
- tx = dmu_tx_create(os);
- dmu_tx_hold_bonus(tx, object);
- err = dmu_tx_assign(tx, TXG_WAIT);
- if (err) {
- dmu_tx_abort(tx);
- goto out;
- }
-
dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, tx);
- dmu_tx_commit(tx);
-out:
dnode_rele(dn, FTAG);
-
return (err);
}
@@ -183,6 +155,11 @@
return (0);
}
+/*
+ * Return (in *objectp) the next object which is allocated (or a hole)
+ * after *object, taking into account only objects that may have been modified
+ * after the specified txg.
+ */
int
dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
{
@@ -196,3 +173,54 @@
return (error);
}
+
+/*
+ * Turn this object from old_type into DMU_OTN_ZAP_METADATA, and bump the
+ * refcount on SPA_FEATURE_EXTENSIBLE_DATASET.
+ *
+ * Only for use from syncing context, on MOS objects.
+ */
+void
+dmu_object_zapify(objset_t *mos, uint64_t object, dmu_object_type_t old_type,
+ dmu_tx_t *tx)
+{
+ dnode_t *dn;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ VERIFY0(dnode_hold(mos, object, FTAG, &dn));
+ if (dn->dn_type == DMU_OTN_ZAP_METADATA) {
+ dnode_rele(dn, FTAG);
+ return;
+ }
+ ASSERT3U(dn->dn_type, ==, old_type);
+ ASSERT0(dn->dn_maxblkid);
+ dn->dn_next_type[tx->tx_txg & TXG_MASK] = dn->dn_type =
+ DMU_OTN_ZAP_METADATA;
+ dnode_setdirty(dn, tx);
+ dnode_rele(dn, FTAG);
+
+ mzap_create_impl(mos, object, 0, 0, tx);
+
+ spa_feature_incr(dmu_objset_spa(mos),
+ SPA_FEATURE_EXTENSIBLE_DATASET, tx);
+}
+
+void
+dmu_object_free_zapified(objset_t *mos, uint64_t object, dmu_tx_t *tx)
+{
+ dnode_t *dn;
+ dmu_object_type_t t;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ VERIFY0(dnode_hold(mos, object, FTAG, &dn));
+ t = dn->dn_type;
+ dnode_rele(dn, FTAG);
+
+ if (t == DMU_OTN_ZAP_METADATA) {
+ spa_feature_decr(dmu_objset_spa(mos),
+ SPA_FEATURE_EXTENSIBLE_DATASET, tx);
+ }
+ VERIFY0(dmu_object_free(mos, object, tx));
+}
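
A hedged sketch of how the zapify helpers might be used from syncing context; the old object type and attribute name are placeholders, not values from the patch.

	uint64_t value = 1;

	/* syncing context only: turn a plain MOS object into a ZAP ... */
	dmu_object_zapify(mos, object, DMU_OTN_UINT64_METADATA, tx);
	/* ... so that named attributes can be stored in it */
	VERIFY0(zap_add(mos, object, "example:attr", sizeof (uint64_t), 1,
	    &value, tx));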
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,8 +21,13 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2015, STRATO AG, Inc. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
*/
/* Portions Copyright 2010 Robert Milkowski */
@@ -47,6 +52,7 @@
#include <sys/sa.h>
#include <sys/zfs_onexit.h>
#include <sys/dsl_destroy.h>
+#include <sys/vdev.h>
/*
* Needed to close a window in dnode_move() that allows the objset to be freed
@@ -54,6 +60,16 @@
*/
krwlock_t os_lock;
+/*
+ * Tunable to override the maximum number of threads for the parallelization
+ * of dmu_objset_find_dp, needed to speed up the import of pools with many
+ * datasets.
+ * Default is 4 times the number of leaf vdevs.
+ */
+int dmu_find_threads = 0;
+
+static void dmu_objset_find_dp_cb(void *arg);
+
void
dmu_objset_init(void)
{
@@ -115,13 +131,13 @@
return (ds ? ds->ds_object : 0);
}
-uint64_t
+zfs_sync_type_t
dmu_objset_syncprop(objset_t *os)
{
return (os->os_sync);
}
-uint64_t
+zfs_logbias_op_t
dmu_objset_logbias(objset_t *os)
{
return (os->os_logbias);
@@ -150,7 +166,8 @@
*/
ASSERT(newval != ZIO_COMPRESS_INHERIT);
- os->os_compress = zio_compress_select(newval, ZIO_COMPRESS_ON_VALUE);
+ os->os_compress = zio_compress_select(os->os_spa, newval,
+ ZIO_COMPRESS_ON);
}
static void
@@ -230,6 +247,20 @@
}
static void
+redundant_metadata_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+
+ /*
+ * Inheritance and range checking should have been done by now.
+ */
+ ASSERT(newval == ZFS_REDUNDANT_METADATA_ALL ||
+ newval == ZFS_REDUNDANT_METADATA_MOST);
+
+ os->os_redundant_metadata = newval;
+}
+
+static void
logbias_changed_cb(void *arg, uint64_t newval)
{
objset_t *os = arg;
@@ -241,6 +272,14 @@
zil_set_logbias(os->os_zil, newval);
}
+static void
+recordsize_changed_cb(void *arg, uint64_t newval)
+{
+ objset_t *os = arg;
+
+ os->os_recordsize = newval;
+}
+
void
dmu_objset_byteswap(void *buf, size_t size)
{
@@ -271,15 +310,13 @@
os->os_spa = spa;
os->os_rootbp = bp;
if (!BP_IS_HOLE(os->os_rootbp)) {
- uint32_t aflags = ARC_WAIT;
- zbookmark_t zb;
+ arc_flags_t aflags = ARC_FLAG_WAIT;
+ zbookmark_phys_t zb;
SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
if (DMU_OS_IS_L2CACHEABLE(os))
- aflags |= ARC_L2CACHE;
- if (DMU_OS_IS_L2COMPRESSIBLE(os))
- aflags |= ARC_L2COMPRESS;
+ aflags |= ARC_FLAG_L2CACHE;
dprintf_bp(os->os_rootbp, "reading %s", "");
err = arc_read(NULL, spa, os->os_rootbp,
@@ -296,14 +333,13 @@
/* Increase the blocksize if we are permitted. */
if (spa_version(spa) >= SPA_VERSION_USERSPACE &&
arc_buf_size(os->os_phys_buf) < sizeof (objset_phys_t)) {
- arc_buf_t *buf = arc_buf_alloc(spa,
+ arc_buf_t *buf = arc_alloc_buf(spa,
sizeof (objset_phys_t), &os->os_phys_buf,
ARC_BUFC_METADATA);
bzero(buf->b_data, sizeof (objset_phys_t));
bcopy(os->os_phys_buf->b_data, buf->b_data,
arc_buf_size(os->os_phys_buf));
- (void) arc_buf_remove_ref(os->os_phys_buf,
- &os->os_phys_buf);
+ arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
os->os_phys_buf = buf;
}
@@ -312,7 +348,7 @@
} else {
int size = spa_version(spa) >= SPA_VERSION_USERSPACE ?
sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE;
- os->os_phys_buf = arc_buf_alloc(spa, size,
+ os->os_phys_buf = arc_alloc_buf(spa, size,
&os->os_phys_buf, ARC_BUFC_METADATA);
os->os_phys = os->os_phys_buf->b_data;
bzero(os->os_phys, size);
@@ -324,7 +360,18 @@
* default (fletcher2/off). Snapshots don't need to know about
* checksum/compression/copies.
*/
- if (ds) {
+ if (ds != NULL) {
+ boolean_t needlock = B_FALSE;
+
+ /*
+ * Note: it's valid to open the objset if the dataset is
+ * long-held, in which case the pool_config lock will not
+ * be held.
+ */
+ if (!dsl_pool_config_held(dmu_objset_pool(os))) {
+ needlock = B_TRUE;
+ dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
+ }
err = dsl_prop_register(ds,
zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
primary_cache_changed_cb, os);
@@ -333,7 +380,7 @@
zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE),
secondary_cache_changed_cb, os);
}
- if (!dsl_dataset_is_snapshot(ds)) {
+ if (!ds->ds_is_snapshot) {
if (err == 0) {
err = dsl_prop_register(ds,
zfs_prop_to_name(ZFS_PROP_CHECKSUM),
@@ -364,27 +411,39 @@
zfs_prop_to_name(ZFS_PROP_SYNC),
sync_changed_cb, os);
}
+ if (err == 0) {
+ err = dsl_prop_register(ds,
+ zfs_prop_to_name(
+ ZFS_PROP_REDUNDANT_METADATA),
+ redundant_metadata_changed_cb, os);
+ }
+ if (err == 0) {
+ err = dsl_prop_register(ds,
+ zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
+ recordsize_changed_cb, os);
+ }
}
+ if (needlock)
+ dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
if (err != 0) {
- VERIFY(arc_buf_remove_ref(os->os_phys_buf,
- &os->os_phys_buf));
+ arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
kmem_free(os, sizeof (objset_t));
return (err);
}
- } else if (ds == NULL) {
+ } else {
/* It's the meta-objset. */
os->os_checksum = ZIO_CHECKSUM_FLETCHER_4;
- os->os_compress = ZIO_COMPRESS_LZJB;
+ os->os_compress = ZIO_COMPRESS_ON;
os->os_copies = spa_max_replication(spa);
os->os_dedup_checksum = ZIO_CHECKSUM_OFF;
- os->os_dedup_verify = 0;
- os->os_logbias = 0;
- os->os_sync = 0;
+ os->os_dedup_verify = B_FALSE;
+ os->os_logbias = ZFS_LOGBIAS_LATENCY;
+ os->os_sync = ZFS_SYNC_STANDARD;
os->os_primary_cache = ZFS_CACHE_ALL;
os->os_secondary_cache = ZFS_CACHE_ALL;
}
- if (ds == NULL || !dsl_dataset_is_snapshot(ds))
+ if (ds == NULL || !ds->ds_is_snapshot)
os->os_zil_header = os->os_phys->os_zil_header;
os->os_zil = zil_alloc(os, &os->os_zil_header);
@@ -403,29 +462,15 @@
mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL);
- DMU_META_DNODE(os) = dnode_special_open(os,
- &os->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT,
- &os->os_meta_dnode);
+ dnode_special_open(os, &os->os_phys->os_meta_dnode,
+ DMU_META_DNODE_OBJECT, &os->os_meta_dnode);
if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) {
- DMU_USERUSED_DNODE(os) = dnode_special_open(os,
- &os->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT,
- &os->os_userused_dnode);
- DMU_GROUPUSED_DNODE(os) = dnode_special_open(os,
- &os->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT,
- &os->os_groupused_dnode);
+ dnode_special_open(os, &os->os_phys->os_userused_dnode,
+ DMU_USERUSED_OBJECT, &os->os_userused_dnode);
+ dnode_special_open(os, &os->os_phys->os_groupused_dnode,
+ DMU_GROUPUSED_OBJECT, &os->os_groupused_dnode);
}
- /*
- * We should be the only thread trying to do this because we
- * have ds_opening_lock
- */
- if (ds) {
- mutex_enter(&ds->ds_lock);
- ASSERT(ds->ds_objset == NULL);
- ds->ds_objset = os;
- mutex_exit(&ds->ds_lock);
- }
-
*osp = os;
return (0);
}
@@ -435,12 +480,29 @@
{
int err = 0;
+ /*
+ * We shouldn't be doing anything with dsl_dataset_t's unless the
+ * pool_config lock is held, or the dataset is long-held.
+ */
+ ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool) ||
+ dsl_dataset_long_held(ds));
+
mutex_enter(&ds->ds_opening_lock);
- *osp = ds->ds_objset;
- if (*osp == NULL) {
+ if (ds->ds_objset == NULL) {
+ objset_t *os;
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
err = dmu_objset_open_impl(dsl_dataset_get_spa(ds),
- ds, dsl_dataset_get_blkptr(ds), osp);
+ ds, dsl_dataset_get_blkptr(ds), &os);
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
+
+ if (err == 0) {
+ mutex_enter(&ds->ds_lock);
+ ASSERT(ds->ds_objset == NULL);
+ ds->ds_objset = os;
+ mutex_exit(&ds->ds_lock);
+ }
}
+ *osp = ds->ds_objset;
mutex_exit(&ds->ds_opening_lock);
return (err);
}
@@ -474,6 +536,25 @@
return (err);
}
+static int
+dmu_objset_own_impl(dsl_dataset_t *ds, dmu_objset_type_t type,
+ boolean_t readonly, void *tag, objset_t **osp)
+{
+ int err;
+
+ err = dmu_objset_from_ds(ds, osp);
+ if (err != 0) {
+ dsl_dataset_disown(ds, tag);
+ } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) {
+ dsl_dataset_disown(ds, tag);
+ return (SET_ERROR(EINVAL));
+ } else if (!readonly && dsl_dataset_is_snapshot(ds)) {
+ dsl_dataset_disown(ds, tag);
+ return (SET_ERROR(EROFS));
+ }
+ return (err);
+}
+
/*
* dsl_pool must not be held when this is called.
* Upon successful return, there will be a longhold on the dataset,
@@ -495,21 +576,26 @@
dsl_pool_rele(dp, FTAG);
return (err);
}
+ err = dmu_objset_own_impl(ds, type, readonly, tag, osp);
+ dsl_pool_rele(dp, FTAG);
- err = dmu_objset_from_ds(ds, osp);
- dsl_pool_rele(dp, FTAG);
- if (err != 0) {
- dsl_dataset_disown(ds, tag);
- } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) {
- dsl_dataset_disown(ds, tag);
- return (SET_ERROR(EINVAL));
- } else if (!readonly && dsl_dataset_is_snapshot(ds)) {
- dsl_dataset_disown(ds, tag);
- return (SET_ERROR(EROFS));
- }
return (err);
}
+int
+dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type,
+ boolean_t readonly, void *tag, objset_t **osp)
+{
+ dsl_dataset_t *ds;
+ int err;
+
+ err = dsl_dataset_own_obj(dp, obj, tag, &ds);
+ if (err != 0)
+ return (err);
+
+ return (dmu_objset_own_impl(ds, type, readonly, tag, osp));
+}
+
void
dmu_objset_rele(objset_t *os, void *tag)
{
@@ -518,7 +604,37 @@
dsl_pool_rele(dp, tag);
}
+/*
+ * When we are called, os MUST refer to an objset associated with a dataset
+ * that is owned by 'tag'; that is, is held and long held by 'tag' and ds_owner
+ * == tag. We will then release and reacquire ownership of the dataset while
+ * holding the pool config_rwlock, so that no intervening namespace or
+ * ownership changes can occur.
+ *
+ * This exists solely to accommodate zfs_ioc_userspace_upgrade()'s desire to
+ * release the hold on its dataset and acquire a new one on the dataset of the
+ * same name so that it can be partially torn down and reconstructed.
+ */
void
+dmu_objset_refresh_ownership(dsl_dataset_t *ds, dsl_dataset_t **newds,
+ void *tag)
+{
+ dsl_pool_t *dp;
+ char name[ZFS_MAX_DATASET_NAME_LEN];
+
+ VERIFY3P(ds, !=, NULL);
+ VERIFY3P(ds->ds_owner, ==, tag);
+ VERIFY(dsl_dataset_long_held(ds));
+
+ dsl_dataset_name(ds, name);
+ dp = ds->ds_dir->dd_pool;
+ dsl_pool_config_enter(dp, FTAG);
+ dsl_dataset_disown(ds, tag);
+ VERIFY0(dsl_dataset_own(dp, name, tag, newds));
+ dsl_pool_config_exit(dp, FTAG);
+}
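
A usage sketch for the refresh helper; the caller shown is hypothetical but mirrors the zfs_ioc_userspace_upgrade() pattern described above.

	dsl_dataset_t *newds;

	dmu_objset_refresh_ownership(os->os_dsl_dataset, &newds, tag);
	/* the old objset may have been evicted; re-derive it from newds */
	VERIFY0(dmu_objset_from_ds(newds, &os));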
+
+void
dmu_objset_disown(objset_t *os, void *tag)
{
dsl_dataset_disown(os->os_dsl_dataset, tag);
@@ -527,41 +643,53 @@
void
dmu_objset_evict_dbufs(objset_t *os)
{
+ dnode_t dn_marker;
dnode_t *dn;
mutex_enter(&os->os_lock);
+ dn = list_head(&os->os_dnodes);
+ while (dn != NULL) {
+ /*
+ * Skip dnodes without holds. We have to do this dance
+ * because dnode_add_ref() only works if there is already a
+ * hold. If the dnode has no holds, then it has no dbufs.
+ */
+ if (dnode_add_ref(dn, FTAG)) {
+ list_insert_after(&os->os_dnodes, dn, &dn_marker);
+ mutex_exit(&os->os_lock);
- /* process the mdn last, since the other dnodes have holds on it */
- list_remove(&os->os_dnodes, DMU_META_DNODE(os));
- list_insert_tail(&os->os_dnodes, DMU_META_DNODE(os));
+ dnode_evict_dbufs(dn);
+ dnode_rele(dn, FTAG);
- /*
- * Find the first dnode with holds. We have to do this dance
- * because dnode_add_ref() only works if you already have a
- * hold. If there are no holds then it has no dbufs so OK to
- * skip.
- */
- for (dn = list_head(&os->os_dnodes);
- dn && !dnode_add_ref(dn, FTAG);
- dn = list_next(&os->os_dnodes, dn))
- continue;
+ mutex_enter(&os->os_lock);
+ dn = list_next(&os->os_dnodes, &dn_marker);
+ list_remove(&os->os_dnodes, &dn_marker);
+ } else {
+ dn = list_next(&os->os_dnodes, dn);
+ }
+ }
+ mutex_exit(&os->os_lock);
- while (dn) {
- dnode_t *next_dn = dn;
-
- do {
- next_dn = list_next(&os->os_dnodes, next_dn);
- } while (next_dn && !dnode_add_ref(next_dn, FTAG));
-
- mutex_exit(&os->os_lock);
- dnode_evict_dbufs(dn);
- dnode_rele(dn, FTAG);
- mutex_enter(&os->os_lock);
- dn = next_dn;
+ if (DMU_USERUSED_DNODE(os) != NULL) {
+ dnode_evict_dbufs(DMU_GROUPUSED_DNODE(os));
+ dnode_evict_dbufs(DMU_USERUSED_DNODE(os));
}
- mutex_exit(&os->os_lock);
+ dnode_evict_dbufs(DMU_META_DNODE(os));
}
+/*
+ * Objset eviction processing is split into two pieces.
+ * The first marks the objset as evicting, evicts any dbufs that
+ * have a refcount of zero, and then queues up the objset for the
+ * second phase of eviction. Once os->os_dnodes has been cleared by
+ * dnode_buf_pageout()->dnode_destroy(), the second phase is executed.
+ * The second phase closes the special dnodes, dequeues the objset from
+ * the list of those undergoing eviction, and finally frees the objset.
+ *
+ * NOTE: Due to asynchronous eviction processing (invocation of
+ * dnode_buf_pageout()), it is possible for the meta dnode for the
+ * objset to have no holds even though os->os_dnodes is not empty.
+ */
void
dmu_objset_evict(objset_t *os)
{
@@ -570,34 +698,8 @@
for (int t = 0; t < TXG_SIZE; t++)
ASSERT(!dmu_objset_is_dirty(os, t));
- if (ds) {
- if (!dsl_dataset_is_snapshot(ds)) {
- VERIFY0(dsl_prop_unregister(ds,
- zfs_prop_to_name(ZFS_PROP_CHECKSUM),
- checksum_changed_cb, os));
- VERIFY0(dsl_prop_unregister(ds,
- zfs_prop_to_name(ZFS_PROP_COMPRESSION),
- compression_changed_cb, os));
- VERIFY0(dsl_prop_unregister(ds,
- zfs_prop_to_name(ZFS_PROP_COPIES),
- copies_changed_cb, os));
- VERIFY0(dsl_prop_unregister(ds,
- zfs_prop_to_name(ZFS_PROP_DEDUP),
- dedup_changed_cb, os));
- VERIFY0(dsl_prop_unregister(ds,
- zfs_prop_to_name(ZFS_PROP_LOGBIAS),
- logbias_changed_cb, os));
- VERIFY0(dsl_prop_unregister(ds,
- zfs_prop_to_name(ZFS_PROP_SYNC),
- sync_changed_cb, os));
- }
- VERIFY0(dsl_prop_unregister(ds,
- zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
- primary_cache_changed_cb, os));
- VERIFY0(dsl_prop_unregister(ds,
- zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE),
- secondary_cache_changed_cb, os));
- }
+ if (ds)
+ dsl_prop_unregister_all(ds, os);
if (os->os_sa)
sa_tear_down(os);
@@ -604,6 +706,21 @@
dmu_objset_evict_dbufs(os);
+ mutex_enter(&os->os_lock);
+ spa_evicting_os_register(os->os_spa, os);
+ if (list_is_empty(&os->os_dnodes)) {
+ mutex_exit(&os->os_lock);
+ dmu_objset_evict_done(os);
+ } else {
+ mutex_exit(&os->os_lock);
+ }
+}
+
+void
+dmu_objset_evict_done(objset_t *os)
+{
+ ASSERT3P(list_head(&os->os_dnodes), ==, NULL);
+
dnode_special_close(&os->os_meta_dnode);
if (DMU_USERUSED_DNODE(os)) {
dnode_special_close(&os->os_userused_dnode);
@@ -611,10 +728,8 @@
}
zil_free(os->os_zil);
- ASSERT3P(list_head(&os->os_dnodes), ==, NULL);
+ arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
- VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf));
-
/*
* This is a barrier to prevent the objset from going away in
* dnode_move() until we can safely ensure that the objset is still in
@@ -627,6 +742,7 @@
mutex_destroy(&os->os_lock);
mutex_destroy(&os->os_obj_lock);
mutex_destroy(&os->os_user_ptr_lock);
+ spa_evicting_os_deregister(os->os_spa, os);
kmem_free(os, sizeof (objset_t));
}
@@ -672,11 +788,17 @@
/*
* Determine the number of levels necessary for the meta-dnode
- * to contain DN_MAX_OBJECT dnodes.
+ * to contain DN_MAX_OBJECT dnodes. Note that in order to
+ * ensure that we do not overflow 64 bits, there has to be
+ * a nlevels that gives us a number of blocks > DN_MAX_OBJECT
+ * but < 2^64. Therefore,
+ * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT) (10) must be
+ * less than (64 - log2(DN_MAX_OBJECT)) (16).
*/
- while ((uint64_t)mdn->dn_nblkptr << (mdn->dn_datablkshift +
+ while ((uint64_t)mdn->dn_nblkptr <<
+ (mdn->dn_datablkshift - DNODE_SHIFT +
(levels - 1) * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) <
- DN_MAX_OBJECT * sizeof (dnode_phys_t))
+ DN_MAX_OBJECT)
levels++;
mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] =
@@ -720,6 +842,9 @@
if (strchr(doca->doca_name, '@') != NULL)
return (SET_ERROR(EINVAL));
+ if (strlen(doca->doca_name) >= ZFS_MAX_DATASET_NAME_LEN)
+ return (SET_ERROR(ENAMETOOLONG));
+
error = dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail);
if (error != 0)
return (error);
@@ -727,9 +852,11 @@
dsl_dir_rele(pdd, FTAG);
return (SET_ERROR(EEXIST));
}
+ error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL,
+ doca->doca_cred);
dsl_dir_rele(pdd, FTAG);
- return (0);
+ return (error);
}
static void
@@ -750,9 +877,11 @@
doca->doca_cred, tx);
VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds));
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
bp = dsl_dataset_get_blkptr(ds);
os = dmu_objset_create_impl(pdd->dd_pool->dp_spa,
ds, bp, doca->doca_type, tx);
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
if (doca->doca_userfunc != NULL) {
doca->doca_userfunc(os, doca->doca_userarg,
@@ -778,7 +907,8 @@
doca.doca_type = type;
return (dsl_sync_task(name,
- dmu_objset_create_check, dmu_objset_create_sync, &doca, 5));
+ dmu_objset_create_check, dmu_objset_create_sync, &doca,
+ 5, ZFS_SPACE_CHECK_NORMAL));
}
typedef struct dmu_objset_clone_arg {
@@ -801,6 +931,9 @@
if (strchr(doca->doca_clone, '@') != NULL)
return (SET_ERROR(EINVAL));
+ if (strlen(doca->doca_clone) >= ZFS_MAX_DATASET_NAME_LEN)
+ return (SET_ERROR(ENAMETOOLONG));
+
error = dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail);
if (error != 0)
return (error);
@@ -808,10 +941,12 @@
dsl_dir_rele(pdd, FTAG);
return (SET_ERROR(EEXIST));
}
- /* You can't clone across pools. */
- if (pdd->dd_pool != dp) {
+
+ error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL,
+ doca->doca_cred);
+ if (error != 0) {
dsl_dir_rele(pdd, FTAG);
- return (SET_ERROR(EXDEV));
+ return (SET_ERROR(EDQUOT));
}
dsl_dir_rele(pdd, FTAG);
@@ -819,14 +954,8 @@
if (error != 0)
return (error);
- /* You can't clone across pools. */
- if (origin->ds_dir->dd_pool != dp) {
- dsl_dataset_rele(origin, FTAG);
- return (SET_ERROR(EXDEV));
- }
-
/* You can only clone snapshots, not the head datasets. */
- if (!dsl_dataset_is_snapshot(origin)) {
+ if (!origin->ds_is_snapshot) {
dsl_dataset_rele(origin, FTAG);
return (SET_ERROR(EINVAL));
}
@@ -844,7 +973,7 @@
const char *tail;
dsl_dataset_t *origin, *ds;
uint64_t obj;
- char namebuf[MAXNAMELEN];
+ char namebuf[ZFS_MAX_DATASET_NAME_LEN];
VERIFY0(dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail));
VERIFY0(dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin));
@@ -871,7 +1000,8 @@
doca.doca_cred = CRED();
return (dsl_sync_task(clone,
- dmu_objset_clone_check, dmu_objset_clone_sync, &doca, 5));
+ dmu_objset_clone_check, dmu_objset_clone_sync, &doca,
+ 5, ZFS_SPACE_CHECK_NORMAL));
}
int
@@ -923,7 +1053,7 @@
objset_t *os = arg;
dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;
- ASSERT3P(bp, ==, os->os_rootbp);
+ ASSERT(!BP_IS_EMBEDDED(bp));
ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET);
ASSERT0(BP_GET_LEVEL(bp));
@@ -935,7 +1065,12 @@
*/
bp->blk_fill = 0;
for (int i = 0; i < dnp->dn_nblkptr; i++)
- bp->blk_fill += dnp->dn_blkptr[i].blk_fill;
+ bp->blk_fill += BP_GET_FILL(&dnp->dn_blkptr[i]);
+ if (os->os_dsl_dataset != NULL)
+ rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_WRITER, FTAG);
+ *os->os_rootbp = *bp;
+ if (os->os_dsl_dataset != NULL)
+ rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG);
}
/* ARGSUSED */
@@ -955,6 +1090,7 @@
(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
dsl_dataset_block_born(ds, bp, tx);
}
+ kmem_free(bp, sizeof (*bp));
}
/* called from dsl */
@@ -962,12 +1098,14 @@
dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
{
int txgoff;
- zbookmark_t zb;
+ zbookmark_phys_t zb;
zio_prop_t zp;
zio_t *zio;
list_t *list;
list_t *newlist = NULL;
dbuf_dirty_record_t *dr;
+ blkptr_t *blkptr_copy = kmem_alloc(sizeof (*os->os_rootbp), KM_SLEEP);
+ *blkptr_copy = *os->os_rootbp;
dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg);
@@ -995,10 +1133,9 @@
dmu_write_policy(os, NULL, 0, 0, &zp);
zio = arc_write(pio, os->os_spa, tx->tx_txg,
- os->os_rootbp, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os),
- DMU_OS_IS_L2COMPRESSIBLE(os), &zp, dmu_objset_write_ready,
- dmu_objset_write_done, os, ZIO_PRIORITY_ASYNC_WRITE,
- ZIO_FLAG_MUSTSUCCEED, &zb);
+ blkptr_copy, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os),
+ &zp, dmu_objset_write_ready, NULL, NULL, dmu_objset_write_done,
+ os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
/*
* Sync special dnodes - the parent IO for the sync is the root block
@@ -1069,18 +1206,83 @@
DMU_USERUSED_DNODE(os) != NULL);
}
+typedef struct userquota_node {
+ uint64_t uqn_id;
+ int64_t uqn_delta;
+ avl_node_t uqn_node;
+} userquota_node_t;
+
+typedef struct userquota_cache {
+ avl_tree_t uqc_user_deltas;
+ avl_tree_t uqc_group_deltas;
+} userquota_cache_t;
+
+static int
+userquota_compare(const void *l, const void *r)
+{
+ const userquota_node_t *luqn = l;
+ const userquota_node_t *ruqn = r;
+
+ if (luqn->uqn_id < ruqn->uqn_id)
+ return (-1);
+ if (luqn->uqn_id > ruqn->uqn_id)
+ return (1);
+ return (0);
+}
+
static void
-do_userquota_update(objset_t *os, uint64_t used, uint64_t flags,
- uint64_t user, uint64_t group, boolean_t subtract, dmu_tx_t *tx)
+do_userquota_cacheflush(objset_t *os, userquota_cache_t *cache, dmu_tx_t *tx)
{
+ void *cookie;
+ userquota_node_t *uqn;
+
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ cookie = NULL;
+ while ((uqn = avl_destroy_nodes(&cache->uqc_user_deltas,
+ &cookie)) != NULL) {
+ VERIFY0(zap_increment_int(os, DMU_USERUSED_OBJECT,
+ uqn->uqn_id, uqn->uqn_delta, tx));
+ kmem_free(uqn, sizeof (*uqn));
+ }
+ avl_destroy(&cache->uqc_user_deltas);
+
+ cookie = NULL;
+ while ((uqn = avl_destroy_nodes(&cache->uqc_group_deltas,
+ &cookie)) != NULL) {
+ VERIFY0(zap_increment_int(os, DMU_GROUPUSED_OBJECT,
+ uqn->uqn_id, uqn->uqn_delta, tx));
+ kmem_free(uqn, sizeof (*uqn));
+ }
+ avl_destroy(&cache->uqc_group_deltas);
+}
+
+static void
+userquota_update_cache(avl_tree_t *avl, uint64_t id, int64_t delta)
+{
+ userquota_node_t search = { .uqn_id = id };
+ avl_index_t idx;
+
+ userquota_node_t *uqn = avl_find(avl, &search, &idx);
+ if (uqn == NULL) {
+ uqn = kmem_zalloc(sizeof (*uqn), KM_SLEEP);
+ uqn->uqn_id = id;
+ avl_insert(avl, uqn, idx);
+ }
+ uqn->uqn_delta += delta;
+}
+
+static void
+do_userquota_update(userquota_cache_t *cache, uint64_t used, uint64_t flags,
+ uint64_t user, uint64_t group, boolean_t subtract)
+{
if ((flags & DNODE_FLAG_USERUSED_ACCOUNTED)) {
int64_t delta = DNODE_SIZE + used;
if (subtract)
delta = -delta;
- VERIFY3U(0, ==, zap_increment_int(os, DMU_USERUSED_OBJECT,
- user, delta, tx));
- VERIFY3U(0, ==, zap_increment_int(os, DMU_GROUPUSED_OBJECT,
- group, delta, tx));
+
+ userquota_update_cache(&cache->uqc_user_deltas, user, delta);
+ userquota_update_cache(&cache->uqc_group_deltas, group, delta);
}
}
@@ -1089,9 +1291,15 @@
{
dnode_t *dn;
list_t *list = &os->os_synced_dnodes;
+ userquota_cache_t cache = { 0 };
ASSERT(list_head(list) == NULL || dmu_objset_userused_enabled(os));
+ avl_create(&cache.uqc_user_deltas, userquota_compare,
+ sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node));
+ avl_create(&cache.uqc_group_deltas, userquota_compare,
+ sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node));
+
while (dn = list_head(list)) {
int flags;
ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object));
@@ -1101,32 +1309,26 @@
/* Allocate the user/groupused objects if necessary. */
if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) {
- VERIFY(0 == zap_create_claim(os,
+ VERIFY0(zap_create_claim(os,
DMU_USERUSED_OBJECT,
DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
- VERIFY(0 == zap_create_claim(os,
+ VERIFY0(zap_create_claim(os,
DMU_GROUPUSED_OBJECT,
DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
}
- /*
- * We intentionally modify the zap object even if the
- * net delta is zero. Otherwise
- * the block of the zap obj could be shared between
- * datasets but need to be different between them after
- * a bprewrite.
- */
-
flags = dn->dn_id_flags;
ASSERT(flags);
if (flags & DN_ID_OLD_EXIST) {
- do_userquota_update(os, dn->dn_oldused, dn->dn_oldflags,
- dn->dn_olduid, dn->dn_oldgid, B_TRUE, tx);
+ do_userquota_update(&cache,
+ dn->dn_oldused, dn->dn_oldflags,
+ dn->dn_olduid, dn->dn_oldgid, B_TRUE);
}
if (flags & DN_ID_NEW_EXIST) {
- do_userquota_update(os, DN_USED_BYTES(dn->dn_phys),
+ do_userquota_update(&cache,
+ DN_USED_BYTES(dn->dn_phys),
dn->dn_phys->dn_flags, dn->dn_newuid,
- dn->dn_newgid, B_FALSE, tx);
+ dn->dn_newgid, B_FALSE);
}
mutex_enter(&dn->dn_mtx);
@@ -1147,6 +1349,7 @@
list_remove(list, dn);
dnode_rele(dn, list);
}
+ do_userquota_cacheflush(os, &cache, tx);
}
/*
@@ -1388,7 +1591,7 @@
dmu_objset_is_snapshot(objset_t *os)
{
if (os->os_dsl_dataset != NULL)
- return (dsl_dataset_is_snapshot(os->os_dsl_dataset));
+ return (os->os_dsl_dataset->ds_is_snapshot);
else
return (B_FALSE);
}
@@ -1400,12 +1603,12 @@
dsl_dataset_t *ds = os->os_dsl_dataset;
uint64_t ignored;
- if (ds->ds_phys->ds_snapnames_zapobj == 0)
+ if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0)
return (SET_ERROR(ENOENT));
return (zap_lookup_norm(ds->ds_dir->dd_pool->dp_meta_objset,
- ds->ds_phys->ds_snapnames_zapobj, name, 8, 1, &ignored, MT_FIRST,
- real, maxlen, conflict));
+ dsl_dataset_phys(ds)->ds_snapnames_zapobj, name, 8, 1, &ignored,
+ MT_FIRST, real, maxlen, conflict));
}
int
@@ -1418,12 +1621,12 @@
ASSERT(dsl_pool_config_held(dmu_objset_pool(os)));
- if (ds->ds_phys->ds_snapnames_zapobj == 0)
+ if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0)
return (SET_ERROR(ENOENT));
zap_cursor_init_serialized(&cursor,
ds->ds_dir->dd_pool->dp_meta_objset,
- ds->ds_phys->ds_snapnames_zapobj, *offp);
+ dsl_dataset_phys(ds)->ds_snapnames_zapobj, *offp);
if (zap_cursor_retrieve(&cursor, &attr) != 0) {
zap_cursor_fini(&cursor);
@@ -1457,12 +1660,12 @@
/* there is no next dir on a snapshot! */
if (os->os_dsl_dataset->ds_object !=
- dd->dd_phys->dd_head_dataset_obj)
+ dsl_dir_phys(dd)->dd_head_dataset_obj)
return (SET_ERROR(ENOENT));
zap_cursor_init_serialized(&cursor,
dd->dd_pool->dp_meta_objset,
- dd->dd_phys->dd_child_dir_zapobj, *offp);
+ dsl_dir_phys(dd)->dd_child_dir_zapobj, *offp);
if (zap_cursor_retrieve(&cursor, &attr) != 0) {
zap_cursor_fini(&cursor);
@@ -1484,41 +1687,52 @@
return (0);
}
-/*
- * Find objsets under and including ddobj, call func(ds) on each.
- */
-int
-dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
- int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags)
+typedef struct dmu_objset_find_ctx {
+ taskq_t *dc_tq;
+ dsl_pool_t *dc_dp;
+ uint64_t dc_ddobj;
+ int (*dc_func)(dsl_pool_t *, dsl_dataset_t *, void *);
+ void *dc_arg;
+ int dc_flags;
+ kmutex_t *dc_error_lock;
+ int *dc_error;
+} dmu_objset_find_ctx_t;
+
+static void
+dmu_objset_find_dp_impl(dmu_objset_find_ctx_t *dcp)
{
+ dsl_pool_t *dp = dcp->dc_dp;
+ dmu_objset_find_ctx_t *child_dcp;
dsl_dir_t *dd;
dsl_dataset_t *ds;
zap_cursor_t zc;
zap_attribute_t *attr;
uint64_t thisobj;
- int err;
+ int err = 0;
- ASSERT(dsl_pool_config_held(dp));
+ /* don't process if there already was an error */
+ if (*dcp->dc_error != 0)
+ goto out;
- err = dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd);
+ err = dsl_dir_hold_obj(dp, dcp->dc_ddobj, NULL, FTAG, &dd);
if (err != 0)
- return (err);
+ goto out;
/* Don't visit hidden ($MOS & $ORIGIN) objsets. */
if (dd->dd_myname[0] == '$') {
dsl_dir_rele(dd, FTAG);
- return (0);
+ goto out;
}
- thisobj = dd->dd_phys->dd_head_dataset_obj;
+ thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj;
attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
/*
* Iterate over all children.
*/
- if (flags & DS_FIND_CHILDREN) {
+ if (dcp->dc_flags & DS_FIND_CHILDREN) {
for (zap_cursor_init(&zc, dp->dp_meta_objset,
- dd->dd_phys->dd_child_dir_zapobj);
+ dsl_dir_phys(dd)->dd_child_dir_zapobj);
zap_cursor_retrieve(&zc, attr) == 0;
(void) zap_cursor_advance(&zc)) {
ASSERT3U(attr->za_integer_length, ==,
@@ -1525,29 +1739,29 @@
sizeof (uint64_t));
ASSERT3U(attr->za_num_integers, ==, 1);
- err = dmu_objset_find_dp(dp, attr->za_first_integer,
- func, arg, flags);
- if (err != 0)
- break;
+ child_dcp = kmem_alloc(sizeof (*child_dcp), KM_SLEEP);
+ *child_dcp = *dcp;
+ child_dcp->dc_ddobj = attr->za_first_integer;
+ if (dcp->dc_tq != NULL)
+ (void) taskq_dispatch(dcp->dc_tq,
+ dmu_objset_find_dp_cb, child_dcp, TQ_SLEEP);
+ else
+ dmu_objset_find_dp_impl(child_dcp);
}
zap_cursor_fini(&zc);
-
- if (err != 0) {
- dsl_dir_rele(dd, FTAG);
- kmem_free(attr, sizeof (zap_attribute_t));
- return (err);
- }
}
/*
* Iterate over all snapshots.
*/
- if (flags & DS_FIND_SNAPSHOTS) {
+ if (dcp->dc_flags & DS_FIND_SNAPSHOTS) {
dsl_dataset_t *ds;
err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
if (err == 0) {
- uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
+ uint64_t snapobj;
+
+ snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
dsl_dataset_rele(ds, FTAG);
for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
@@ -1561,7 +1775,7 @@
attr->za_first_integer, FTAG, &ds);
if (err != 0)
break;
- err = func(dp, ds, arg);
+ err = dcp->dc_func(dp, ds, dcp->dc_arg);
dsl_dataset_rele(ds, FTAG);
if (err != 0)
break;
@@ -1574,7 +1788,7 @@
kmem_free(attr, sizeof (zap_attribute_t));
if (err != 0)
- return (err);
+ goto out;
/*
* Apply to self.
@@ -1581,13 +1795,122 @@
*/
err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
if (err != 0)
- return (err);
- err = func(dp, ds, arg);
+ goto out;
+ err = dcp->dc_func(dp, ds, dcp->dc_arg);
dsl_dataset_rele(ds, FTAG);
- return (err);
+
+out:
+ if (err != 0) {
+ mutex_enter(dcp->dc_error_lock);
+ /* only keep first error */
+ if (*dcp->dc_error == 0)
+ *dcp->dc_error = err;
+ mutex_exit(dcp->dc_error_lock);
+ }
+
+ kmem_free(dcp, sizeof (*dcp));
}
+static void
+dmu_objset_find_dp_cb(void *arg)
+{
+ dmu_objset_find_ctx_t *dcp = arg;
+ dsl_pool_t *dp = dcp->dc_dp;
+
+ /*
+ * We need to get a pool_config_lock here, as there are several
+ * asssert(pool_config_held) down the stack. Getting a lock via
+ * dsl_pool_config_enter is risky, as it might be stalled by a
+ * pending writer. This would deadlock, as the write lock can
+ * only be granted when our parent thread gives up the lock.
+ * The _prio interface gives us priority over a pending writer.
+ */
+ dsl_pool_config_enter_prio(dp, FTAG);
+
+ dmu_objset_find_dp_impl(dcp);
+
+ dsl_pool_config_exit(dp, FTAG);
+}
+
/*
+ * Find objsets under and including ddobj, call func(ds) on each.
+ * The order for the enumeration is completely undefined.
+ * func is called with dsl_pool_config held.
+ */
+int
+dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
+ int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags)
+{
+ int error = 0;
+ taskq_t *tq = NULL;
+ int ntasks;
+ dmu_objset_find_ctx_t *dcp;
+ kmutex_t err_lock;
+
+ mutex_init(&err_lock, NULL, MUTEX_DEFAULT, NULL);
+ dcp = kmem_alloc(sizeof (*dcp), KM_SLEEP);
+ dcp->dc_tq = NULL;
+ dcp->dc_dp = dp;
+ dcp->dc_ddobj = ddobj;
+ dcp->dc_func = func;
+ dcp->dc_arg = arg;
+ dcp->dc_flags = flags;
+ dcp->dc_error_lock = &err_lock;
+ dcp->dc_error = &error;
+
+ if ((flags & DS_FIND_SERIALIZE) || dsl_pool_config_held_writer(dp)) {
+ /*
+ * In case a write lock is held we can't make use of
+ * parallelism, as down the stack of the worker threads
+ * the lock is asserted via dsl_pool_config_held.
+ * In case of a read lock this is solved by getting a read
+ * lock in each worker thread, which isn't possible in case
+ * of a writer lock. So we fall back to the synchronous path
+ * here.
+ * In the future it might be possible to get some magic into
+ * dsl_pool_config_held in a way that it returns true for
+ * the worker threads so that a single lock held from this
+ * thread suffices. For now, stay single threaded.
+ */
+ dmu_objset_find_dp_impl(dcp);
+ mutex_destroy(&err_lock);
+
+ return (error);
+ }
+
+ ntasks = dmu_find_threads;
+ if (ntasks == 0)
+ ntasks = vdev_count_leaves(dp->dp_spa) * 4;
+ tq = taskq_create("dmu_objset_find", ntasks, minclsyspri, ntasks,
+ INT_MAX, 0);
+ if (tq == NULL) {
+ kmem_free(dcp, sizeof (*dcp));
+ mutex_destroy(&err_lock);
+
+ return (SET_ERROR(ENOMEM));
+ }
+ dcp->dc_tq = tq;
+
+ /* dcp will be freed by task */
+ (void) taskq_dispatch(tq, dmu_objset_find_dp_cb, dcp, TQ_SLEEP);
+
+ /*
+ * PORTING: this code relies on the property of taskq_wait to wait
+ * until no more tasks are queued and no more tasks are active. As
+ * we always queue new tasks from within other tasks, taskq_wait
+ * reliably waits for the full recursion to finish, even though we
+ * enqueue new tasks after taskq_wait has been called.
+ * On platforms other than illumos, taskq_wait may not have this
+ * property.
+ */
+ taskq_wait(tq);
+ taskq_destroy(tq);
+ mutex_destroy(&err_lock);
+
+ return (error);
+}
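
dmu_objset_find_dp() now fans the traversal out over a taskq, so every worker reports its result through dc_error_lock and only the first failure is preserved. A small pthreads sketch of that first-error-wins pattern (userland stand-ins, not the kernel taskq code):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t err_lock = PTHREAD_MUTEX_INITIALIZER;
static int first_error;

/* Keep only the first non-zero error, as in dmu_objset_find_dp_impl(). */
static void
record_error(int err)
{
	if (err == 0)
		return;
	pthread_mutex_lock(&err_lock);
	if (first_error == 0)
		first_error = err;
	pthread_mutex_unlock(&err_lock);
}

static void *
worker(void *arg)
{
	record_error((int)(long)arg);
	return (NULL);
}

int
main(void)
{
	pthread_t t[3];
	long errs[3] = { 0, 5, 2 };	/* e.g. EIO, ENOENT */

	for (int i = 0; i < 3; i++)
		pthread_create(&t[i], NULL, worker, (void *)errs[i]);
	for (int i = 0; i < 3; i++)
		pthread_join(t[i], NULL);
	printf("first error seen: %d\n", first_error);
	return (0);
}
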
+
+/*
* Find all objsets under name, and for each, call 'func(child_name, arg)'.
* The dp_config_rwlock must not be held when this is called, and it
* will not be held when the callback is called.
@@ -1622,7 +1945,7 @@
return (0);
}
- thisobj = dd->dd_phys->dd_head_dataset_obj;
+ thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj;
attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
/*
@@ -1630,7 +1953,7 @@
*/
if (flags & DS_FIND_CHILDREN) {
for (zap_cursor_init(&zc, dp->dp_meta_objset,
- dd->dd_phys->dd_child_dir_zapobj);
+ dsl_dir_phys(dd)->dd_child_dir_zapobj);
zap_cursor_retrieve(&zc, attr) == 0;
(void) zap_cursor_advance(&zc)) {
ASSERT3U(attr->za_integer_length, ==,
@@ -1663,7 +1986,9 @@
err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
if (err == 0) {
- uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
+ uint64_t snapobj;
+
+ snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
dsl_dataset_rele(ds, FTAG);
for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
@@ -1731,7 +2056,7 @@
/*
* Determine name of filesystem, given name of snapshot.
- * buf must be at least MAXNAMELEN bytes
+ * buf must be at least ZFS_MAX_DATASET_NAME_LEN bytes
*/
int
dmu_fsname(const char *snapname, char *buf)
@@ -1739,7 +2064,7 @@
char *atp = strchr(snapname, '@');
if (atp == NULL)
return (SET_ERROR(EINVAL));
- if (atp - snapname >= MAXNAMELEN)
+ if (atp - snapname >= ZFS_MAX_DATASET_NAME_LEN)
return (SET_ERROR(ENAMETOOLONG));
(void) strlcpy(buf, snapname, atp - snapname + 1);
return (0);
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -22,9 +22,12 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
- * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
* Copyright (c) 2012, Martin Matuska <mm at FreeBSD.org>. All rights reserved.
+ * Copyright 2014 HybridCluster. All rights reserved.
+ * Copyright 2016 RackTop Systems.
+ * Copyright (c) 2014 Integros [integros.com]
*/
#include <sys/dmu.h>
@@ -50,22 +53,76 @@
#include <sys/zfs_onexit.h>
#include <sys/dmu_send.h>
#include <sys/dsl_destroy.h>
+#include <sys/blkptr.h>
+#include <sys/dsl_bookmark.h>
+#include <sys/zfeature.h>
+#include <sys/bqueue.h>
+#ifdef __FreeBSD__
+#undef dump_write
+#define dump_write dmu_dump_write
+#endif
+
/* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */
int zfs_send_corrupt_data = B_FALSE;
+int zfs_send_queue_length = 16 * 1024 * 1024;
+int zfs_recv_queue_length = 16 * 1024 * 1024;
+/* Set this tunable to FALSE to disable setting of DRR_FLAG_FREERECORDS */
+int zfs_send_set_freerecords_bit = B_TRUE;
+#ifdef _KERNEL
+TUNABLE_INT("vfs.zfs.send_set_freerecords_bit", &zfs_send_set_freerecords_bit);
+#endif
+
static char *dmu_recv_tag = "dmu_recv_tag";
-static const char *recv_clone_name = "%recv";
+const char *recv_clone_name = "%recv";
+#define BP_SPAN(datablkszsec, indblkshift, level) \
+ (((uint64_t)datablkszsec) << (SPA_MINBLOCKSHIFT + \
+ (level) * (indblkshift - SPA_BLKPTRSHIFT)))
+
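
BP_SPAN() gives the number of bytes of file data covered by one block pointer at a given indirection level. A worked example, assuming 128 KiB data blocks (datablkszsec = 256) and 128 KiB indirect blocks (indblkshift = 17); SPA_MINBLOCKSHIFT is 9 and SPA_BLKPTRSHIFT is 7, so each indirect block holds 1024 block pointers:

#include <stdio.h>
#include <stdint.h>

#define SPA_MINBLOCKSHIFT	9
#define SPA_BLKPTRSHIFT		7

static uint64_t
bp_span(uint64_t datablkszsec, int indblkshift, int level)
{
	return (datablkszsec << (SPA_MINBLOCKSHIFT +
	    level * (indblkshift - SPA_BLKPTRSHIFT)));
}

int
main(void)
{
	/* Prints 131072 (128 KiB), 134217728 (128 MiB), 137438953472 (128 GiB). */
	for (int level = 0; level <= 2; level++)
		printf("level %d: %llu bytes\n", level,
		    (unsigned long long)bp_span(256, 17, level));
	return (0);
}
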
+static void byteswap_record(dmu_replay_record_t *drr);
+
+struct send_thread_arg {
+ bqueue_t q;
+ dsl_dataset_t *ds; /* Dataset to traverse */
+ uint64_t fromtxg; /* Traverse from this txg */
+ int flags; /* flags to pass to traverse_dataset */
+ int error_code;
+ boolean_t cancel;
+ zbookmark_phys_t resume;
+};
+
+struct send_block_record {
+ boolean_t eos_marker; /* Marks the end of the stream */
+ blkptr_t bp;
+ zbookmark_phys_t zb;
+ uint8_t indblkshift;
+ uint16_t datablkszsec;
+ bqueue_node_t ln;
+};
+
static int
dump_bytes(dmu_sendarg_t *dsp, void *buf, int len)
{
- dsl_dataset_t *ds = dsp->dsa_os->os_dsl_dataset;
+ dsl_dataset_t *ds = dmu_objset_ds(dsp->dsa_os);
struct uio auio;
struct iovec aiov;
+
+ /*
+ * The code does not rely on this (len being a multiple of 8). We keep
+ * this assertion because of the corresponding assertion in
+ * receive_read(). Keeping this assertion ensures that we do not
+ * inadvertently break backwards compatibility (causing the assertion
+ * in receive_read() to trigger on old software).
+ *
+ * Removing the assertions could be rolled into a new feature that uses
+ * data that isn't 8-byte aligned; if the assertions were removed, a
+ * feature flag would have to be added.
+ */
+
ASSERT0(len % 8);
- fletcher_4_incremental_native(buf, len, &dsp->dsa_zc);
aiov.iov_base = buf;
aiov.iov_len = len;
auio.uio_iov = &aiov;
@@ -91,12 +148,74 @@
return (dsp->dsa_err);
}
+/*
+ * For all record types except BEGIN, fill in the checksum (overlaid in
+ * drr_u.drr_checksum.drr_checksum). The checksum verifies everything
+ * up to the start of the checksum itself.
+ */
static int
+dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len)
+{
+ ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
+ ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
+ fletcher_4_incremental_native(dsp->dsa_drr,
+ offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
+ &dsp->dsa_zc);
+ if (dsp->dsa_drr->drr_type == DRR_BEGIN) {
+ dsp->dsa_sent_begin = B_TRUE;
+ } else {
+ ASSERT(ZIO_CHECKSUM_IS_ZERO(&dsp->dsa_drr->drr_u.
+ drr_checksum.drr_checksum));
+ dsp->dsa_drr->drr_u.drr_checksum.drr_checksum = dsp->dsa_zc;
+ }
+ if (dsp->dsa_drr->drr_type == DRR_END) {
+ dsp->dsa_sent_end = B_TRUE;
+ }
+ fletcher_4_incremental_native(&dsp->dsa_drr->
+ drr_u.drr_checksum.drr_checksum,
+ sizeof (zio_cksum_t), &dsp->dsa_zc);
+ if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
+ return (SET_ERROR(EINTR));
+ if (payload_len != 0) {
+ fletcher_4_incremental_native(payload, payload_len,
+ &dsp->dsa_zc);
+ if (dump_bytes(dsp, payload, payload_len) != 0)
+ return (SET_ERROR(EINTR));
+ }
+ return (0);
+}
+
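
dump_record() folds each record header (and any payload) into the per-stream checksum as the record is written. That only works because Fletcher-4 accumulation is incremental: feeding the data in pieces yields the same running sums as feeding it whole. A userland model of that property, not the kernel's fletcher_4_incremental_native() (and ignoring its byte-order variants):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

struct cksum { uint64_t a, b, c, d; };

/* Accumulate 32-bit words into the four running sums. */
static void
fletcher4_incremental(const uint32_t *ip, size_t size, struct cksum *zc)
{
	const uint32_t *ipend = ip + (size / sizeof (uint32_t));

	for (; ip < ipend; ip++) {
		zc->a += *ip;
		zc->b += zc->a;
		zc->c += zc->b;
		zc->d += zc->c;
	}
}

int
main(void)
{
	uint32_t rec[16];
	struct cksum whole = { 0 }, split = { 0 };

	for (int i = 0; i < 16; i++)
		rec[i] = (uint32_t)i;

	fletcher4_incremental(rec, sizeof (rec), &whole);
	fletcher4_incremental(rec, 6 * sizeof (uint32_t), &split);      /* header */
	fletcher4_incremental(rec + 6, 10 * sizeof (uint32_t), &split); /* payload */

	printf("split == whole: %d\n",
	    memcmp(&whole, &split, sizeof (whole)) == 0);
	return (0);
}
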
+/*
+ * Fill in the drr_free struct, or perform aggregation if the previous record is
+ * also a free record, and the two are adjacent.
+ *
+ * Note that we send free records even for a full send, because we want to be
+ * able to receive a full send as a clone, which requires a list of all the free
+ * and freeobject records that were generated on the source.
+ */
+static int
dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
uint64_t length)
{
struct drr_free *drrf = &(dsp->dsa_drr->drr_u.drr_free);
+ /*
+ * When we receive a free record, dbuf_free_range() assumes
+ * that the receiving system doesn't have any dbufs in the range
+ * being freed. This is always true because there is a one-record
+ * constraint: we only send one WRITE record for any given
+ * object,offset. We know that the one-record constraint is
+ * true because we always send data in increasing order by
+ * object,offset.
+ *
+ * If the increasing-order constraint ever changes, we should find
+ * another way to assert that the one-record constraint is still
+ * satisfied.
+ */
+ ASSERT(object > dsp->dsa_last_data_object ||
+ (object == dsp->dsa_last_data_object &&
+ offset > dsp->dsa_last_data_offset));
+
if (length != -1ULL && offset + length < offset)
length = -1ULL;
@@ -109,8 +228,7 @@
*/
if (dsp->dsa_pending_op != PENDING_NONE &&
dsp->dsa_pending_op != PENDING_FREE) {
- if (dump_bytes(dsp, dsp->dsa_drr,
- sizeof (dmu_replay_record_t)) != 0)
+ if (dump_record(dsp, NULL, 0) != 0)
return (SET_ERROR(EINTR));
dsp->dsa_pending_op = PENDING_NONE;
}
@@ -133,8 +251,7 @@
return (0);
} else {
/* not a continuation. Push out pending record */
- if (dump_bytes(dsp, dsp->dsa_drr,
- sizeof (dmu_replay_record_t)) != 0)
+ if (dump_record(dsp, NULL, 0) != 0)
return (SET_ERROR(EINTR));
dsp->dsa_pending_op = PENDING_NONE;
}
@@ -147,8 +264,7 @@
drrf->drr_length = length;
drrf->drr_toguid = dsp->dsa_toguid;
if (length == -1ULL) {
- if (dump_bytes(dsp, dsp->dsa_drr,
- sizeof (dmu_replay_record_t)) != 0)
+ if (dump_record(dsp, NULL, 0) != 0)
return (SET_ERROR(EINTR));
} else {
dsp->dsa_pending_op = PENDING_FREE;
@@ -158,11 +274,20 @@
}
static int
-dump_data(dmu_sendarg_t *dsp, dmu_object_type_t type,
+dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type,
uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data)
{
struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write);
+ /*
+ * We send data in increasing object, offset order.
+ * See comment in dump_free() for details.
+ */
+ ASSERT(object > dsp->dsa_last_data_object ||
+ (object == dsp->dsa_last_data_object &&
+ offset > dsp->dsa_last_data_offset));
+ dsp->dsa_last_data_object = object;
+ dsp->dsa_last_data_offset = offset + blksz - 1;
/*
* If there is any kind of pending aggregation (currently either
@@ -171,12 +296,11 @@
* of different types.
*/
if (dsp->dsa_pending_op != PENDING_NONE) {
- if (dump_bytes(dsp, dsp->dsa_drr,
- sizeof (dmu_replay_record_t)) != 0)
+ if (dump_record(dsp, NULL, 0) != 0)
return (SET_ERROR(EINTR));
dsp->dsa_pending_op = PENDING_NONE;
}
- /* write a DATA record */
+ /* write a WRITE record */
bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
dsp->dsa_drr->drr_type = DRR_WRITE;
drrw->drr_object = object;
@@ -184,29 +308,71 @@
drrw->drr_offset = offset;
drrw->drr_length = blksz;
drrw->drr_toguid = dsp->dsa_toguid;
- drrw->drr_checksumtype = BP_GET_CHECKSUM(bp);
- if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup)
- drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP;
- DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp));
- DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp));
- DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp));
- drrw->drr_key.ddk_cksum = bp->blk_cksum;
+ if (bp == NULL || BP_IS_EMBEDDED(bp)) {
+ /*
+ * There's no pre-computed checksum for partial-block
+ * writes or embedded BP's, so (like
+ * fletcher4-checksummed blocks) userland will have to
+ * compute a dedup-capable checksum itself.
+ */
+ drrw->drr_checksumtype = ZIO_CHECKSUM_OFF;
+ } else {
+ drrw->drr_checksumtype = BP_GET_CHECKSUM(bp);
+ if (zio_checksum_table[drrw->drr_checksumtype].ci_flags &
+ ZCHECKSUM_FLAG_DEDUP)
+ drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP;
+ DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp));
+ DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp));
+ DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp));
+ drrw->drr_key.ddk_cksum = bp->blk_cksum;
+ }
- if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
+ if (dump_record(dsp, data, blksz) != 0)
return (SET_ERROR(EINTR));
- if (dump_bytes(dsp, data, blksz) != 0)
- return (SET_ERROR(EINTR));
return (0);
}
static int
+dump_write_embedded(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
+ int blksz, const blkptr_t *bp)
+{
+ char buf[BPE_PAYLOAD_SIZE];
+ struct drr_write_embedded *drrw =
+ &(dsp->dsa_drr->drr_u.drr_write_embedded);
+
+ if (dsp->dsa_pending_op != PENDING_NONE) {
+ if (dump_record(dsp, NULL, 0) != 0)
+ return (EINTR);
+ dsp->dsa_pending_op = PENDING_NONE;
+ }
+
+ ASSERT(BP_IS_EMBEDDED(bp));
+
+ bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
+ dsp->dsa_drr->drr_type = DRR_WRITE_EMBEDDED;
+ drrw->drr_object = object;
+ drrw->drr_offset = offset;
+ drrw->drr_length = blksz;
+ drrw->drr_toguid = dsp->dsa_toguid;
+ drrw->drr_compression = BP_GET_COMPRESS(bp);
+ drrw->drr_etype = BPE_GET_ETYPE(bp);
+ drrw->drr_lsize = BPE_GET_LSIZE(bp);
+ drrw->drr_psize = BPE_GET_PSIZE(bp);
+
+ decode_embedded_bp_compressed(bp, buf);
+
+ if (dump_record(dsp, buf, P2ROUNDUP(drrw->drr_psize, 8)) != 0)
+ return (EINTR);
+ return (0);
+}
+
+static int
dump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data)
{
struct drr_spill *drrs = &(dsp->dsa_drr->drr_u.drr_spill);
if (dsp->dsa_pending_op != PENDING_NONE) {
- if (dump_bytes(dsp, dsp->dsa_drr,
- sizeof (dmu_replay_record_t)) != 0)
+ if (dump_record(dsp, NULL, 0) != 0)
return (SET_ERROR(EINTR));
dsp->dsa_pending_op = PENDING_NONE;
}
@@ -218,10 +384,8 @@
drrs->drr_length = blksz;
drrs->drr_toguid = dsp->dsa_toguid;
- if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)))
+ if (dump_record(dsp, data, blksz) != 0)
return (SET_ERROR(EINTR));
- if (dump_bytes(dsp, data, blksz))
- return (SET_ERROR(EINTR));
return (0);
}
@@ -239,8 +403,7 @@
*/
if (dsp->dsa_pending_op != PENDING_NONE &&
dsp->dsa_pending_op != PENDING_FREEOBJECTS) {
- if (dump_bytes(dsp, dsp->dsa_drr,
- sizeof (dmu_replay_record_t)) != 0)
+ if (dump_record(dsp, NULL, 0) != 0)
return (SET_ERROR(EINTR));
dsp->dsa_pending_op = PENDING_NONE;
}
@@ -254,8 +417,7 @@
return (0);
} else {
/* can't be aggregated. Push out pending record */
- if (dump_bytes(dsp, dsp->dsa_drr,
- sizeof (dmu_replay_record_t)) != 0)
+ if (dump_record(dsp, NULL, 0) != 0)
return (SET_ERROR(EINTR));
dsp->dsa_pending_op = PENDING_NONE;
}
@@ -278,12 +440,24 @@
{
struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object);
+ if (object < dsp->dsa_resume_object) {
+ /*
+ * Note: when resuming, we will visit all the dnodes in
+ * the block of dnodes that we are resuming from. In
+ * this case it's unnecessary to send the dnodes prior to
+ * the one we are resuming from. We should be at most one
+ * block's worth of dnodes behind the resume point.
+ */
+ ASSERT3U(dsp->dsa_resume_object - object, <,
+ 1 << (DNODE_BLOCK_SHIFT - DNODE_SHIFT));
+ return (0);
+ }
+
if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
return (dump_freeobjects(dsp, object, 1));
if (dsp->dsa_pending_op != PENDING_NONE) {
- if (dump_bytes(dsp, dsp->dsa_drr,
- sizeof (dmu_replay_record_t)) != 0)
+ if (dump_record(dsp, NULL, 0) != 0)
return (SET_ERROR(EINTR));
dsp->dsa_pending_op = PENDING_NONE;
}
@@ -300,15 +474,18 @@
drro->drr_compress = dnp->dn_compress;
drro->drr_toguid = dsp->dsa_toguid;
- if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
- return (SET_ERROR(EINTR));
+ if (!(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+ drro->drr_blksz > SPA_OLD_MAXBLOCKSIZE)
+ drro->drr_blksz = SPA_OLD_MAXBLOCKSIZE;
- if (dump_bytes(dsp, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0)
+ if (dump_record(dsp, DN_BONUS(dnp),
+ P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0) {
return (SET_ERROR(EINTR));
+ }
- /* free anything past the end of the file */
+ /* Free anything past the end of the file. */
if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) *
- (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL))
+ (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL) != 0)
return (SET_ERROR(EINTR));
if (dsp->dsa_err != 0)
return (SET_ERROR(EINTR));
@@ -315,57 +492,157 @@
return (0);
}
-#define BP_SPAN(dnp, level) \
- (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
- (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)))
+static boolean_t
+backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp)
+{
+ if (!BP_IS_EMBEDDED(bp))
+ return (B_FALSE);
-/* ARGSUSED */
+ /*
+ * Compression function must be legacy, or explicitly enabled.
+ */
+ if ((BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_LEGACY_FUNCTIONS &&
+ !(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4)))
+ return (B_FALSE);
+
+ /*
+ * Embed type must be explicitly enabled.
+ */
+ switch (BPE_GET_ETYPE(bp)) {
+ case BP_EMBEDDED_TYPE_DATA:
+ if (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)
+ return (B_TRUE);
+ break;
+ default:
+ return (B_FALSE);
+ }
+ return (B_FALSE);
+}
+
+/*
+ * This is the callback function to traverse_dataset that acts as the worker
+ * thread for dmu_send_impl.
+ */
+/*ARGSUSED*/
static int
-backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
- const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
+send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+ const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg)
{
- dmu_sendarg_t *dsp = arg;
- dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
+ struct send_thread_arg *sta = arg;
+ struct send_block_record *record;
+ uint64_t record_size;
int err = 0;
- if (issig(JUSTLOOKING) && issig(FORREAL))
+ ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT ||
+ zb->zb_object >= sta->resume.zb_object);
+
+ if (sta->cancel)
return (SET_ERROR(EINTR));
+ if (bp == NULL) {
+ ASSERT3U(zb->zb_level, ==, ZB_DNODE_LEVEL);
+ return (0);
+ } else if (zb->zb_level < 0) {
+ return (0);
+ }
+
+ record = kmem_zalloc(sizeof (struct send_block_record), KM_SLEEP);
+ record->eos_marker = B_FALSE;
+ record->bp = *bp;
+ record->zb = *zb;
+ record->indblkshift = dnp->dn_indblkshift;
+ record->datablkszsec = dnp->dn_datablkszsec;
+ record_size = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
+ bqueue_enqueue(&sta->q, record, record_size);
+
+ return (err);
+}
+
+/*
+ * This function kicks off the traverse_dataset. It also handles setting the
+ * error code of the thread in case something goes wrong, and pushes the End of
+ * Stream record when the traverse_dataset call has finished. If there is no
+ * dataset to traverse, the thread immediately pushes End of Stream marker.
+ */
+static void
+send_traverse_thread(void *arg)
+{
+ struct send_thread_arg *st_arg = arg;
+ int err;
+ struct send_block_record *data;
+
+ if (st_arg->ds != NULL) {
+ err = traverse_dataset_resume(st_arg->ds,
+ st_arg->fromtxg, &st_arg->resume,
+ st_arg->flags, send_cb, st_arg);
+
+ if (err != EINTR)
+ st_arg->error_code = err;
+ }
+ data = kmem_zalloc(sizeof (*data), KM_SLEEP);
+ data->eos_marker = B_TRUE;
+ bqueue_enqueue(&st_arg->q, data, 1);
+ thread_exit();
+}
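
send_cb() and send_traverse_thread() form the producer side of the new send pipeline: a separate thread walks the dataset, enqueues send_block_records, and terminates the queue with an EOS marker, while the main thread drains the queue and emits records. A compressed userland sketch of that handoff, using a plain mutex/condvar list in place of bqueue_t (the real queue also enforces the zfs_send_queue_length byte limit):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct record {
	int eos_marker;			/* marks end of stream */
	int blkid;
	struct record *next;
};

static struct record *head, *tail;
static pthread_mutex_t qlock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t qcv = PTHREAD_COND_INITIALIZER;

static void
enqueue(struct record *r)
{
	pthread_mutex_lock(&qlock);
	if (tail == NULL)
		head = r;
	else
		tail->next = r;
	tail = r;
	pthread_cond_signal(&qcv);
	pthread_mutex_unlock(&qlock);
}

static struct record *
dequeue(void)
{
	pthread_mutex_lock(&qlock);
	while (head == NULL)
		pthread_cond_wait(&qcv, &qlock);
	struct record *r = head;
	head = r->next;
	if (head == NULL)
		tail = NULL;
	pthread_mutex_unlock(&qlock);
	return (r);
}

/* Producer: stands in for send_traverse_thread(). */
static void *
traverse_thread(void *arg)
{
	for (int i = 0; i < 3; i++) {
		struct record *r = calloc(1, sizeof (*r));
		r->blkid = i;
		enqueue(r);
	}
	struct record *eos = calloc(1, sizeof (*eos));
	eos->eos_marker = 1;
	enqueue(eos);
	return (NULL);
}

int
main(void)
{
	pthread_t t;
	pthread_create(&t, NULL, traverse_thread, NULL);

	/* Consumer: mirrors the do_dump()/get_next_record() loop. */
	struct record *r = dequeue();
	while (!r->eos_marker) {
		printf("dump blkid %d\n", r->blkid);
		free(r);
		r = dequeue();
	}
	free(r);
	pthread_join(t, NULL);
	return (0);
}
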
+
+/*
+ * This function actually handles figuring out what kind of record needs to be
+ * dumped, reading the data (which has hopefully been prefetched), and calling
+ * the appropriate helper function.
+ */
+static int
+do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
+{
+ dsl_dataset_t *ds = dmu_objset_ds(dsa->dsa_os);
+ const blkptr_t *bp = &data->bp;
+ const zbookmark_phys_t *zb = &data->zb;
+ uint8_t indblkshift = data->indblkshift;
+ uint16_t dblkszsec = data->datablkszsec;
+ spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
+ dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
+ int err = 0;
+
+ ASSERT3U(zb->zb_level, >=, 0);
+
+ ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT ||
+ zb->zb_object >= dsa->dsa_resume_object);
+
if (zb->zb_object != DMU_META_DNODE_OBJECT &&
DMU_OBJECT_IS_SPECIAL(zb->zb_object)) {
return (0);
- } else if (bp == NULL && zb->zb_object == DMU_META_DNODE_OBJECT) {
- uint64_t span = BP_SPAN(dnp, zb->zb_level);
+ } else if (BP_IS_HOLE(bp) &&
+ zb->zb_object == DMU_META_DNODE_OBJECT) {
+ uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level);
uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
- err = dump_freeobjects(dsp, dnobj, span >> DNODE_SHIFT);
- } else if (bp == NULL) {
- uint64_t span = BP_SPAN(dnp, zb->zb_level);
- err = dump_free(dsp, zb->zb_object, zb->zb_blkid * span, span);
+ err = dump_freeobjects(dsa, dnobj, span >> DNODE_SHIFT);
+ } else if (BP_IS_HOLE(bp)) {
+ uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level);
+ uint64_t offset = zb->zb_blkid * span;
+ err = dump_free(dsa, zb->zb_object, offset, span);
} else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) {
return (0);
} else if (type == DMU_OT_DNODE) {
- dnode_phys_t *blk;
- int i;
int blksz = BP_GET_LSIZE(bp);
- uint32_t aflags = ARC_WAIT;
+ arc_flags_t aflags = ARC_FLAG_WAIT;
arc_buf_t *abuf;
+ ASSERT0(zb->zb_level);
+
if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
&aflags, zb) != 0)
return (SET_ERROR(EIO));
- blk = abuf->b_data;
- for (i = 0; i < blksz >> DNODE_SHIFT; i++) {
- uint64_t dnobj = (zb->zb_blkid <<
- (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i;
- err = dump_dnode(dsp, dnobj, blk+i);
+ dnode_phys_t *blk = abuf->b_data;
+ uint64_t dnobj = zb->zb_blkid * (blksz >> DNODE_SHIFT);
+ for (int i = 0; i < blksz >> DNODE_SHIFT; i++) {
+ err = dump_dnode(dsa, dnobj + i, blk + i);
if (err != 0)
break;
}
- (void) arc_buf_remove_ref(abuf, &abuf);
+ arc_buf_destroy(abuf, &abuf);
} else if (type == DMU_OT_SA) {
- uint32_t aflags = ARC_WAIT;
+ arc_flags_t aflags = ARC_FLAG_WAIT;
arc_buf_t *abuf;
int blksz = BP_GET_LSIZE(bp);
@@ -374,33 +651,62 @@
&aflags, zb) != 0)
return (SET_ERROR(EIO));
- err = dump_spill(dsp, zb->zb_object, blksz, abuf->b_data);
- (void) arc_buf_remove_ref(abuf, &abuf);
- } else { /* it's a level-0 block of a regular object */
- uint32_t aflags = ARC_WAIT;
+ err = dump_spill(dsa, zb->zb_object, blksz, abuf->b_data);
+ arc_buf_destroy(abuf, &abuf);
+ } else if (backup_do_embed(dsa, bp)) {
+ /* it's an embedded level-0 block of a regular object */
+ int blksz = dblkszsec << SPA_MINBLOCKSHIFT;
+ ASSERT0(zb->zb_level);
+ err = dump_write_embedded(dsa, zb->zb_object,
+ zb->zb_blkid * blksz, blksz, bp);
+ } else {
+ /* it's a level-0 block of a regular object */
+ arc_flags_t aflags = ARC_FLAG_WAIT;
arc_buf_t *abuf;
- int blksz = BP_GET_LSIZE(bp);
+ int blksz = dblkszsec << SPA_MINBLOCKSHIFT;
+ uint64_t offset;
+ ASSERT0(zb->zb_level);
+ ASSERT(zb->zb_object > dsa->dsa_resume_object ||
+ (zb->zb_object == dsa->dsa_resume_object &&
+ zb->zb_blkid * blksz >= dsa->dsa_resume_offset));
+
if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
&aflags, zb) != 0) {
if (zfs_send_corrupt_data) {
/* Send a block filled with 0x"zfs badd bloc" */
- abuf = arc_buf_alloc(spa, blksz, &abuf,
+ abuf = arc_alloc_buf(spa, blksz, &abuf,
ARC_BUFC_DATA);
uint64_t *ptr;
for (ptr = abuf->b_data;
(char *)ptr < (char *)abuf->b_data + blksz;
ptr++)
- *ptr = 0x2f5baddb10c;
+ *ptr = 0x2f5baddb10cULL;
} else {
return (SET_ERROR(EIO));
}
}
- err = dump_data(dsp, type, zb->zb_object, zb->zb_blkid * blksz,
- blksz, bp, abuf->b_data);
- (void) arc_buf_remove_ref(abuf, &abuf);
+ offset = zb->zb_blkid * blksz;
+
+ if (!(dsa->dsa_featureflags &
+ DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+ blksz > SPA_OLD_MAXBLOCKSIZE) {
+ char *buf = abuf->b_data;
+ while (blksz > 0 && err == 0) {
+ int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE);
+ err = dump_write(dsa, type, zb->zb_object,
+ offset, n, NULL, buf);
+ offset += n;
+ buf += n;
+ blksz -= n;
+ }
+ } else {
+ err = dump_write(dsa, type, zb->zb_object,
+ offset, blksz, bp, abuf->b_data);
+ }
+ arc_buf_destroy(abuf, &abuf);
}
ASSERT(err == 0 || err == EINTR);
@@ -408,14 +714,30 @@
}
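
When the stream was not built with DMU_BACKUP_FEATURE_LARGE_BLOCKS, the chunking loop above makes do_dump() emit any block larger than SPA_OLD_MAXBLOCKSIZE as several consecutive WRITE records. A standalone sketch of that split (emit_write() is a stand-in for dump_write()):

#include <stdio.h>
#include <stdint.h>

#define	SPA_OLD_MAXBLOCKSIZE	(128 * 1024)
#define	MIN(a, b)		((a) < (b) ? (a) : (b))

static void
emit_write(uint64_t object, uint64_t offset, int len)
{
	printf("WRITE obj %llu off %llu len %d\n",
	    (unsigned long long)object, (unsigned long long)offset, len);
}

int
main(void)
{
	uint64_t object = 7, offset = 0;
	int blksz = 1024 * 1024;	/* one 1 MiB large block */

	/* Emits eight 128 KiB WRITE records covering the same range. */
	while (blksz > 0) {
		int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE);

		emit_write(object, offset, n);
		offset += n;
		blksz -= n;
	}
	return (0);
}
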
/*
- * Releases dp, ds, and fromds, using the specified tag.
+ * Pop the new data off the queue, and free the old data.
*/
+static struct send_block_record *
+get_next_record(bqueue_t *bq, struct send_block_record *data)
+{
+ struct send_block_record *tmp = bqueue_dequeue(bq);
+ kmem_free(data, sizeof (*data));
+ return (tmp);
+}
+
+/*
+ * Actually do the bulk of the work in a zfs send.
+ *
+ * Note: Releases dp using the specified tag.
+ */
static int
-dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
+dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
+ zfs_bookmark_phys_t *ancestor_zb,
+ boolean_t is_clone, boolean_t embedok, boolean_t large_block_ok, int outfd,
+ uint64_t resumeobj, uint64_t resumeoff,
#ifdef illumos
- dsl_dataset_t *fromds, int outfd, vnode_t *vp, offset_t *off)
+ vnode_t *vp, offset_t *off)
#else
- dsl_dataset_t *fromds, int outfd, struct file *fp, offset_t *off)
+ struct file *fp, offset_t *off)
#endif
{
objset_t *os;
@@ -423,19 +745,11 @@
dmu_sendarg_t *dsp;
int err;
uint64_t fromtxg = 0;
+ uint64_t featureflags = 0;
+ struct send_thread_arg to_arg = { 0 };
- if (fromds != NULL && !dsl_dataset_is_before(ds, fromds)) {
- dsl_dataset_rele(fromds, tag);
- dsl_dataset_rele(ds, tag);
- dsl_pool_rele(dp, tag);
- return (SET_ERROR(EXDEV));
- }
-
- err = dmu_objset_from_ds(ds, &os);
+ err = dmu_objset_from_ds(to_ds, &os);
if (err != 0) {
- if (fromds != NULL)
- dsl_dataset_rele(fromds, tag);
- dsl_dataset_rele(ds, tag);
dsl_pool_rele(dp, tag);
return (err);
}
@@ -451,38 +765,52 @@
uint64_t version;
if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0) {
kmem_free(drr, sizeof (dmu_replay_record_t));
- if (fromds != NULL)
- dsl_dataset_rele(fromds, tag);
- dsl_dataset_rele(ds, tag);
dsl_pool_rele(dp, tag);
return (SET_ERROR(EINVAL));
}
if (version >= ZPL_VERSION_SA) {
- DMU_SET_FEATUREFLAGS(
- drr->drr_u.drr_begin.drr_versioninfo,
- DMU_BACKUP_FEATURE_SA_SPILL);
+ featureflags |= DMU_BACKUP_FEATURE_SA_SPILL;
}
}
#endif
+ if (large_block_ok && to_ds->ds_feature_inuse[SPA_FEATURE_LARGE_BLOCKS])
+ featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS;
+ if (embedok &&
+ spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) {
+ featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA;
+ if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
+ featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA_LZ4;
+ }
+
+ if (resumeobj != 0 || resumeoff != 0) {
+ featureflags |= DMU_BACKUP_FEATURE_RESUMING;
+ }
+
+ DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo,
+ featureflags);
+
drr->drr_u.drr_begin.drr_creation_time =
- ds->ds_phys->ds_creation_time;
+ dsl_dataset_phys(to_ds)->ds_creation_time;
drr->drr_u.drr_begin.drr_type = dmu_objset_type(os);
- if (fromds != NULL && ds->ds_dir != fromds->ds_dir)
+ if (is_clone)
drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE;
- drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid;
- if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
+ drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(to_ds)->ds_guid;
+ if (dsl_dataset_phys(to_ds)->ds_flags & DS_FLAG_CI_DATASET)
drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA;
+ if (zfs_send_set_freerecords_bit)
+ drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_FREERECORDS;
- if (fromds != NULL)
- drr->drr_u.drr_begin.drr_fromguid = fromds->ds_phys->ds_guid;
- dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname);
-
- if (fromds != NULL) {
- fromtxg = fromds->ds_phys->ds_creation_txg;
- dsl_dataset_rele(fromds, tag);
- fromds = NULL;
+ if (ancestor_zb != NULL) {
+ drr->drr_u.drr_begin.drr_fromguid =
+ ancestor_zb->zbm_guid;
+ fromtxg = ancestor_zb->zbm_creation_txg;
}
+ dsl_dataset_name(to_ds, drr->drr_u.drr_begin.drr_toname);
+ if (!to_ds->ds_is_snapshot) {
+ (void) strlcat(drr->drr_u.drr_begin.drr_toname, "@--head--",
+ sizeof (drr->drr_u.drr_begin.drr_toname));
+ }
dsp = kmem_zalloc(sizeof (dmu_sendarg_t), KM_SLEEP);
@@ -493,27 +821,82 @@
dsp->dsa_fp = fp;
dsp->dsa_os = os;
dsp->dsa_off = off;
- dsp->dsa_toguid = ds->ds_phys->ds_guid;
- ZIO_SET_CHECKSUM(&dsp->dsa_zc, 0, 0, 0, 0);
+ dsp->dsa_toguid = dsl_dataset_phys(to_ds)->ds_guid;
dsp->dsa_pending_op = PENDING_NONE;
+ dsp->dsa_featureflags = featureflags;
+ dsp->dsa_resume_object = resumeobj;
+ dsp->dsa_resume_offset = resumeoff;
- mutex_enter(&ds->ds_sendstream_lock);
- list_insert_head(&ds->ds_sendstreams, dsp);
- mutex_exit(&ds->ds_sendstream_lock);
+ mutex_enter(&to_ds->ds_sendstream_lock);
+ list_insert_head(&to_ds->ds_sendstreams, dsp);
+ mutex_exit(&to_ds->ds_sendstream_lock);
- dsl_dataset_long_hold(ds, FTAG);
+ dsl_dataset_long_hold(to_ds, FTAG);
dsl_pool_rele(dp, tag);
- if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) {
+ void *payload = NULL;
+ size_t payload_len = 0;
+ if (resumeobj != 0 || resumeoff != 0) {
+ dmu_object_info_t to_doi;
+ err = dmu_object_info(os, resumeobj, &to_doi);
+ if (err != 0)
+ goto out;
+ SET_BOOKMARK(&to_arg.resume, to_ds->ds_object, resumeobj, 0,
+ resumeoff / to_doi.doi_data_block_size);
+
+ nvlist_t *nvl = fnvlist_alloc();
+ fnvlist_add_uint64(nvl, "resume_object", resumeobj);
+ fnvlist_add_uint64(nvl, "resume_offset", resumeoff);
+ payload = fnvlist_pack(nvl, &payload_len);
+ drr->drr_payloadlen = payload_len;
+ fnvlist_free(nvl);
+ }
+
+ err = dump_record(dsp, payload, payload_len);
+ fnvlist_pack_free(payload, payload_len);
+ if (err != 0) {
err = dsp->dsa_err;
goto out;
}
- err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH,
- backup_cb, dsp);
+ err = bqueue_init(&to_arg.q, zfs_send_queue_length,
+ offsetof(struct send_block_record, ln));
+ to_arg.error_code = 0;
+ to_arg.cancel = B_FALSE;
+ to_arg.ds = to_ds;
+ to_arg.fromtxg = fromtxg;
+ to_arg.flags = TRAVERSE_PRE | TRAVERSE_PREFETCH;
+ (void) thread_create(NULL, 0, send_traverse_thread, &to_arg, 0, &p0,
+ TS_RUN, minclsyspri);
+ struct send_block_record *to_data;
+ to_data = bqueue_dequeue(&to_arg.q);
+
+ while (!to_data->eos_marker && err == 0) {
+ err = do_dump(dsp, to_data);
+ to_data = get_next_record(&to_arg.q, to_data);
+ if (issig(JUSTLOOKING) && issig(FORREAL))
+ err = EINTR;
+ }
+
+ if (err != 0) {
+ to_arg.cancel = B_TRUE;
+ while (!to_data->eos_marker) {
+ to_data = get_next_record(&to_arg.q, to_data);
+ }
+ }
+ kmem_free(to_data, sizeof (*to_data));
+
+ bqueue_destroy(&to_arg.q);
+
+ if (err == 0 && to_arg.error_code != 0)
+ err = to_arg.error_code;
+
+ if (err != 0)
+ goto out;
+
if (dsp->dsa_pending_op != PENDING_NONE)
- if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0)
+ if (dump_record(dsp, NULL, 0) != 0)
err = SET_ERROR(EINTR);
if (err != 0) {
@@ -527,21 +910,20 @@
drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc;
drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid;
- if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) {
+ if (dump_record(dsp, NULL, 0) != 0)
err = dsp->dsa_err;
- goto out;
- }
out:
- mutex_enter(&ds->ds_sendstream_lock);
- list_remove(&ds->ds_sendstreams, dsp);
- mutex_exit(&ds->ds_sendstream_lock);
+ mutex_enter(&to_ds->ds_sendstream_lock);
+ list_remove(&to_ds->ds_sendstreams, dsp);
+ mutex_exit(&to_ds->ds_sendstream_lock);
+ VERIFY(err != 0 || (dsp->dsa_sent_begin && dsp->dsa_sent_end));
+
kmem_free(drr, sizeof (dmu_replay_record_t));
kmem_free(dsp, sizeof (dmu_sendarg_t));
- dsl_dataset_long_rele(ds, FTAG);
- dsl_dataset_rele(ds, tag);
+ dsl_dataset_long_rele(to_ds, FTAG);
return (err);
}
@@ -548,6 +930,7 @@
int
dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
+ boolean_t embedok, boolean_t large_block_ok,
#ifdef illumos
int outfd, vnode_t *vp, offset_t *off)
#else
@@ -570,6 +953,9 @@
}
if (fromsnap != 0) {
+ zfs_bookmark_phys_t zb;
+ boolean_t is_clone;
+
err = dsl_dataset_hold_obj(dp, fromsnap, FTAG, &fromds);
if (err != 0) {
dsl_dataset_rele(ds, FTAG);
@@ -576,34 +962,55 @@
dsl_pool_rele(dp, FTAG);
return (err);
}
+ if (!dsl_dataset_is_before(ds, fromds, 0))
+ err = SET_ERROR(EXDEV);
+ zb.zbm_creation_time =
+ dsl_dataset_phys(fromds)->ds_creation_time;
+ zb.zbm_creation_txg = dsl_dataset_phys(fromds)->ds_creation_txg;
+ zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid;
+ is_clone = (fromds->ds_dir != ds->ds_dir);
+ dsl_dataset_rele(fromds, FTAG);
+ err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
+ embedok, large_block_ok, outfd, 0, 0, fp, off);
+ } else {
+ err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
+ embedok, large_block_ok, outfd, 0, 0, fp, off);
}
-
- return (dmu_send_impl(FTAG, dp, ds, fromds, outfd, fp, off));
+ dsl_dataset_rele(ds, FTAG);
+ return (err);
}
int
-dmu_send(const char *tosnap, const char *fromsnap,
+dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
+ boolean_t large_block_ok, int outfd, uint64_t resumeobj, uint64_t resumeoff,
#ifdef illumos
- int outfd, vnode_t *vp, offset_t *off)
+ vnode_t *vp, offset_t *off)
#else
- int outfd, struct file *fp, offset_t *off)
+ struct file *fp, offset_t *off)
#endif
{
dsl_pool_t *dp;
dsl_dataset_t *ds;
- dsl_dataset_t *fromds = NULL;
int err;
+ boolean_t owned = B_FALSE;
- if (strchr(tosnap, '@') == NULL)
+ if (fromsnap != NULL && strpbrk(fromsnap, "@#") == NULL)
return (SET_ERROR(EINVAL));
- if (fromsnap != NULL && strchr(fromsnap, '@') == NULL)
- return (SET_ERROR(EINVAL));
err = dsl_pool_hold(tosnap, FTAG, &dp);
if (err != 0)
return (err);
- err = dsl_dataset_hold(dp, tosnap, FTAG, &ds);
+ if (strchr(tosnap, '@') == NULL && spa_writeable(dp->dp_spa)) {
+ /*
+ * We are sending a filesystem or volume. Ensure
+ * that it doesn't change by owning the dataset.
+ */
+ err = dsl_dataset_own(dp, tosnap, FTAG, &ds);
+ owned = B_TRUE;
+ } else {
+ err = dsl_dataset_hold(dp, tosnap, FTAG, &ds);
+ }
if (err != 0) {
dsl_pool_rele(dp, FTAG);
return (err);
@@ -610,16 +1017,91 @@
}
if (fromsnap != NULL) {
- err = dsl_dataset_hold(dp, fromsnap, FTAG, &fromds);
+ zfs_bookmark_phys_t zb;
+ boolean_t is_clone = B_FALSE;
+ int fsnamelen = strchr(tosnap, '@') - tosnap;
+
+ /*
+ * If the fromsnap is in a different filesystem, then
+ * mark the send stream as a clone.
+ */
+ if (strncmp(tosnap, fromsnap, fsnamelen) != 0 ||
+ (fromsnap[fsnamelen] != '@' &&
+ fromsnap[fsnamelen] != '#')) {
+ is_clone = B_TRUE;
+ }
+
+ if (strchr(fromsnap, '@')) {
+ dsl_dataset_t *fromds;
+ err = dsl_dataset_hold(dp, fromsnap, FTAG, &fromds);
+ if (err == 0) {
+ if (!dsl_dataset_is_before(ds, fromds, 0))
+ err = SET_ERROR(EXDEV);
+ zb.zbm_creation_time =
+ dsl_dataset_phys(fromds)->ds_creation_time;
+ zb.zbm_creation_txg =
+ dsl_dataset_phys(fromds)->ds_creation_txg;
+ zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid;
+ is_clone = (ds->ds_dir != fromds->ds_dir);
+ dsl_dataset_rele(fromds, FTAG);
+ }
+ } else {
+ err = dsl_bookmark_lookup(dp, fromsnap, ds, &zb);
+ }
if (err != 0) {
dsl_dataset_rele(ds, FTAG);
dsl_pool_rele(dp, FTAG);
return (err);
}
+ err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
+ embedok, large_block_ok,
+ outfd, resumeobj, resumeoff, fp, off);
+ } else {
+ err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
+ embedok, large_block_ok,
+ outfd, resumeobj, resumeoff, fp, off);
}
- return (dmu_send_impl(FTAG, dp, ds, fromds, outfd, fp, off));
+ if (owned)
+ dsl_dataset_disown(ds, FTAG);
+ else
+ dsl_dataset_rele(ds, FTAG);
+ return (err);
}
+static int
+dmu_adjust_send_estimate_for_indirects(dsl_dataset_t *ds, uint64_t size,
+ uint64_t *sizep)
+{
+ int err;
+ /*
+ * Assume that space (both on-disk and in-stream) is dominated by
+ * data. We will adjust for indirect blocks and the copies property,
+ * but ignore per-object space used (eg, dnodes and DRR_OBJECT records).
+ */
+
+ /*
+ * Subtract out approximate space used by indirect blocks.
+ * Assume most space is used by data blocks (non-indirect, non-dnode).
+ * Assume all blocks are recordsize. Assume ditto blocks and
+ * internal fragmentation counter out compression.
+ *
+ * Therefore, space used by indirect blocks is sizeof(blkptr_t) per
+ * block, which we observe in practice.
+ */
+ uint64_t recordsize;
+ err = dsl_prop_get_int_ds(ds, "recordsize", &recordsize);
+ if (err != 0)
+ return (err);
+ size -= size / recordsize * sizeof (blkptr_t);
+
+ /* Add in the space for the record associated with each block. */
+ size += size / recordsize * sizeof (dmu_replay_record_t);
+
+ *sizep = size;
+
+ return (0);
+}
+
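
dmu_adjust_send_estimate_for_indirects() turns an on-disk size into a stream-size estimate by trading indirect-block overhead for per-record header overhead. A worked example for 1 GiB of data at a 128 KiB recordsize, assuming a 128-byte blkptr_t and an illustrative 312-byte dmu_replay_record_t (the real sizes come from the headers):

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t size = 1ULL << 30;		/* 1 GiB of data */
	uint64_t recordsize = 128 * 1024;	/* 8192 data blocks */
	uint64_t blkptr_size = 128;		/* sizeof (blkptr_t) */
	uint64_t drr_size = 312;		/* assumed record header size */

	/* Subtract ~1 MiB of indirect blocks, add per-record headers. */
	size -= size / recordsize * blkptr_size;
	size += size / recordsize * drr_size;

	printf("estimated stream size: %llu bytes\n",
	    (unsigned long long)size);
	return (0);
}
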
int
dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds, uint64_t *sizep)
{
@@ -630,19 +1112,23 @@
ASSERT(dsl_pool_config_held(dp));
/* tosnap must be a snapshot */
- if (!dsl_dataset_is_snapshot(ds))
+ if (!ds->ds_is_snapshot)
return (SET_ERROR(EINVAL));
+ /* fromsnap, if provided, must be a snapshot */
+ if (fromds != NULL && !fromds->ds_is_snapshot)
+ return (SET_ERROR(EINVAL));
+
/*
* fromsnap must be an earlier snapshot from the same fs as tosnap,
* or the origin's fs.
*/
- if (fromds != NULL && !dsl_dataset_is_before(ds, fromds))
+ if (fromds != NULL && !dsl_dataset_is_before(ds, fromds, 0))
return (SET_ERROR(EXDEV));
/* Get uncompressed size estimate of changed data. */
if (fromds == NULL) {
- size = ds->ds_phys->ds_uncompressed_bytes;
+ size = dsl_dataset_phys(ds)->ds_uncompressed_bytes;
} else {
uint64_t used, comp;
err = dsl_dataset_space_written(fromds, ds,
@@ -651,33 +1137,61 @@
return (err);
}
- /*
- * Assume that space (both on-disk and in-stream) is dominated by
- * data. We will adjust for indirect blocks and the copies property,
- * but ignore per-object space used (eg, dnodes and DRR_OBJECT records).
- */
+ err = dmu_adjust_send_estimate_for_indirects(ds, size, sizep);
+ return (err);
+}
+/*
+ * Simple callback used to traverse the blocks of a snapshot and sum their
+ * uncompressed size
+ */
+/* ARGSUSED */
+static int
+dmu_calculate_send_traversal(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+ const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
+{
+ uint64_t *spaceptr = arg;
+ if (bp != NULL && !BP_IS_HOLE(bp)) {
+ *spaceptr += BP_GET_UCSIZE(bp);
+ }
+ return (0);
+}
+
+/*
+ * Given a destination snapshot and a TXG, calculate the approximate size of a
+ * send stream sent from that TXG. from_txg may be zero, indicating that the
+ * whole snapshot will be sent.
+ */
+int
+dmu_send_estimate_from_txg(dsl_dataset_t *ds, uint64_t from_txg,
+ uint64_t *sizep)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ int err;
+ uint64_t size = 0;
+
+ ASSERT(dsl_pool_config_held(dp));
+
+ /* tosnap must be a snapshot */
+ if (!dsl_dataset_is_snapshot(ds))
+ return (SET_ERROR(EINVAL));
+
+ /* verify that from_txg is before the provided snapshot was taken */
+ if (from_txg >= dsl_dataset_phys(ds)->ds_creation_txg) {
+ return (SET_ERROR(EXDEV));
+ }
+
/*
- * Subtract out approximate space used by indirect blocks.
- * Assume most space is used by data blocks (non-indirect, non-dnode).
- * Assume all blocks are recordsize. Assume ditto blocks and
- * internal fragmentation counter out compression.
- *
- * Therefore, space used by indirect blocks is sizeof(blkptr_t) per
- * block, which we observe in practice.
+ * traverse the blocks of the snapshot with birth times after
+ * from_txg, summing their uncompressed size
*/
- uint64_t recordsize;
- err = dsl_prop_get_int_ds(ds, "recordsize", &recordsize);
- if (err != 0)
+ err = traverse_dataset(ds, from_txg, TRAVERSE_POST,
+ dmu_calculate_send_traversal, &size);
+ if (err)
return (err);
- size -= size / recordsize * sizeof (blkptr_t);
- /* Add in the space for the record associated with each block. */
- size += size / recordsize * sizeof (dmu_replay_record_t);
-
- *sizep = size;
-
- return (0);
+ err = dmu_adjust_send_estimate_for_indirects(ds, size, sizep);
+ return (err);
}
typedef struct dmu_recv_begin_arg {
@@ -684,6 +1198,7 @@
const char *drba_origin;
dmu_recv_cookie_t *drba_cookie;
cred_t *drba_cred;
+ uint64_t drba_snapobj;
} dmu_recv_begin_arg_t;
static int
@@ -694,14 +1209,9 @@
int error;
dsl_pool_t *dp = ds->ds_dir->dd_pool;
- /* must not have any changes since most recent snapshot */
- if (!drba->drba_cookie->drc_force &&
- dsl_dataset_modified_since_lastsnap(ds))
- return (SET_ERROR(ETXTBSY));
-
/* temporary clone name must not exist */
error = zap_lookup(dp->dp_meta_objset,
- ds->ds_dir->dd_phys->dd_child_dir_zapobj, recv_clone_name,
+ dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, recv_clone_name,
8, 1, &val);
if (error != ENOENT)
return (error == 0 ? EBUSY : error);
@@ -708,47 +1218,69 @@
/* new snapshot name must not exist */
error = zap_lookup(dp->dp_meta_objset,
- ds->ds_phys->ds_snapnames_zapobj, drba->drba_cookie->drc_tosnap,
- 8, 1, &val);
+ dsl_dataset_phys(ds)->ds_snapnames_zapobj,
+ drba->drba_cookie->drc_tosnap, 8, 1, &val);
if (error != ENOENT)
return (error == 0 ? EEXIST : error);
+ /*
+ * Check snapshot limit before receiving. We'll recheck again at the
+ * end, but might as well abort before receiving if we're already over
+ * the limit.
+ *
+ * Note that we do not check the file system limit with
+ * dsl_dir_fscount_check because the temporary %clones don't count
+ * against that limit.
+ */
+ error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_SNAPSHOT_LIMIT,
+ NULL, drba->drba_cred);
+ if (error != 0)
+ return (error);
+
if (fromguid != 0) {
- /* if incremental, most recent snapshot must match fromguid */
- if (ds->ds_prev == NULL)
+ dsl_dataset_t *snap;
+ uint64_t obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
+
+ /* Find snapshot in this dir that matches fromguid. */
+ while (obj != 0) {
+ error = dsl_dataset_hold_obj(dp, obj, FTAG,
+ &snap);
+ if (error != 0)
+ return (SET_ERROR(ENODEV));
+ if (snap->ds_dir != ds->ds_dir) {
+ dsl_dataset_rele(snap, FTAG);
+ return (SET_ERROR(ENODEV));
+ }
+ if (dsl_dataset_phys(snap)->ds_guid == fromguid)
+ break;
+ obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
+ dsl_dataset_rele(snap, FTAG);
+ }
+ if (obj == 0)
return (SET_ERROR(ENODEV));
- /*
- * most recent snapshot must match fromguid, or there are no
- * changes since the fromguid one
- */
- if (ds->ds_prev->ds_phys->ds_guid != fromguid) {
- uint64_t birth = ds->ds_prev->ds_phys->ds_bp.blk_birth;
- uint64_t obj = ds->ds_prev->ds_phys->ds_prev_snap_obj;
- while (obj != 0) {
- dsl_dataset_t *snap;
- error = dsl_dataset_hold_obj(dp, obj, FTAG,
- &snap);
- if (error != 0)
- return (SET_ERROR(ENODEV));
- if (snap->ds_phys->ds_creation_txg < birth) {
- dsl_dataset_rele(snap, FTAG);
- return (SET_ERROR(ENODEV));
- }
- if (snap->ds_phys->ds_guid == fromguid) {
- dsl_dataset_rele(snap, FTAG);
- break; /* it's ok */
- }
- obj = snap->ds_phys->ds_prev_snap_obj;
+ if (drba->drba_cookie->drc_force) {
+ drba->drba_snapobj = obj;
+ } else {
+ /*
+ * If we are not forcing, there must be no
+ * changes since fromsnap.
+ */
+ if (dsl_dataset_modified_since_snap(ds, snap)) {
dsl_dataset_rele(snap, FTAG);
+ return (SET_ERROR(ETXTBSY));
}
- if (obj == 0)
- return (SET_ERROR(ENODEV));
+ drba->drba_snapobj = ds->ds_prev->ds_object;
}
+
+ dsl_dataset_rele(snap, FTAG);
} else {
- /* if full, most recent snapshot must be $ORIGIN */
- if (ds->ds_phys->ds_prev_snap_txg >= TXG_INITIAL)
- return (SET_ERROR(ENODEV));
+ /* if full, then must be forced */
+ if (!drba->drba_cookie->drc_force)
+ return (SET_ERROR(EEXIST));
+ /* start from $ORIGIN@$ORIGIN, if supported */
+ drba->drba_snapobj = dp->dp_origin_snap != NULL ?
+ dp->dp_origin_snap->ds_object : 0;
}
return (0);
@@ -764,11 +1296,13 @@
uint64_t fromguid = drrb->drr_fromguid;
int flags = drrb->drr_flags;
int error;
+ uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
dsl_dataset_t *ds;
const char *tofs = drba->drba_cookie->drc_tofs;
/* already checked */
ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);
+ ASSERT(!(featureflags & DMU_BACKUP_FEATURE_RESUMING));
if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
DMU_COMPOUNDSTREAM ||
@@ -777,18 +1311,42 @@
return (SET_ERROR(EINVAL));
/* Verify pool version supports SA if SA_SPILL feature set */
- if ((DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) &
- DMU_BACKUP_FEATURE_SA_SPILL) &&
- spa_version(dp->dp_spa) < SPA_VERSION_SA) {
+ if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) &&
+ spa_version(dp->dp_spa) < SPA_VERSION_SA)
return (SET_ERROR(ENOTSUP));
- }
+ if (drba->drba_cookie->drc_resumable &&
+ !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EXTENSIBLE_DATASET))
+ return (SET_ERROR(ENOTSUP));
+
+ /*
+ * The receiving code doesn't know how to translate a WRITE_EMBEDDED
+ * record to a plan WRITE record, so the pool must have the
+ * EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED
+ * records. Same with WRITE_EMBEDDED records that use LZ4 compression.
+ */
+ if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) &&
+ !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA))
+ return (SET_ERROR(ENOTSUP));
+ if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4) &&
+ !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
+ return (SET_ERROR(ENOTSUP));
+
+ /*
+ * The receiving code doesn't know how to translate large blocks
+ * to smaller ones, so the pool must have the LARGE_BLOCKS
+ * feature enabled if the stream has LARGE_BLOCKS.
+ */
+ if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
+ !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS))
+ return (SET_ERROR(ENOTSUP));
+
error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
if (error == 0) {
/* target fs already exists; recv into temp clone */
/* Can't recv a clone into an existing fs */
- if (flags & DRR_FLAG_CLONE) {
+ if (flags & DRR_FLAG_CLONE || drba->drba_origin) {
dsl_dataset_rele(ds, FTAG);
return (SET_ERROR(EINVAL));
}
@@ -797,22 +1355,51 @@
dsl_dataset_rele(ds, FTAG);
} else if (error == ENOENT) {
/* target fs does not exist; must be a full backup or clone */
- char buf[MAXNAMELEN];
+ char buf[ZFS_MAX_DATASET_NAME_LEN];
/*
* If it's a non-clone incremental, we are missing the
* target fs, so fail the recv.
*/
- if (fromguid != 0 && !(flags & DRR_FLAG_CLONE))
+ if (fromguid != 0 && !(flags & DRR_FLAG_CLONE ||
+ drba->drba_origin))
return (SET_ERROR(ENOENT));
+ /*
+ * If we're receiving a full send as a clone, and it doesn't
+ * contain all the necessary free records and freeobject
+ * records, reject it.
+ */
+ if (fromguid == 0 && drba->drba_origin &&
+ !(flags & DRR_FLAG_FREERECORDS))
+ return (SET_ERROR(EINVAL));
+
/* Open the parent of tofs */
- ASSERT3U(strlen(tofs), <, MAXNAMELEN);
+ ASSERT3U(strlen(tofs), <, sizeof (buf));
(void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1);
error = dsl_dataset_hold(dp, buf, FTAG, &ds);
if (error != 0)
return (error);
+ /*
+ * Check filesystem and snapshot limits before receiving. We'll
+ * recheck snapshot limits again at the end (we create the
+ * filesystems and increment those counts during begin_sync).
+ */
+ error = dsl_fs_ss_limit_check(ds->ds_dir, 1,
+ ZFS_PROP_FILESYSTEM_LIMIT, NULL, drba->drba_cred);
+ if (error != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+ }
+
+ error = dsl_fs_ss_limit_check(ds->ds_dir, 1,
+ ZFS_PROP_SNAPSHOT_LIMIT, NULL, drba->drba_cred);
+ if (error != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+ }
+
if (drba->drba_origin != NULL) {
dsl_dataset_t *origin;
error = dsl_dataset_hold(dp, drba->drba_origin,
@@ -821,12 +1408,13 @@
dsl_dataset_rele(ds, FTAG);
return (error);
}
- if (!dsl_dataset_is_snapshot(origin)) {
+ if (!origin->ds_is_snapshot) {
dsl_dataset_rele(origin, FTAG);
dsl_dataset_rele(ds, FTAG);
return (SET_ERROR(EINVAL));
}
- if (origin->ds_phys->ds_guid != fromguid) {
+ if (dsl_dataset_phys(origin)->ds_guid != fromguid &&
+ fromguid != 0) {
dsl_dataset_rele(origin, FTAG);
dsl_dataset_rele(ds, FTAG);
return (SET_ERROR(ENODEV));
@@ -844,21 +1432,29 @@
{
dmu_recv_begin_arg_t *drba = arg;
dsl_pool_t *dp = dmu_tx_pool(tx);
+ objset_t *mos = dp->dp_meta_objset;
struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
const char *tofs = drba->drba_cookie->drc_tofs;
dsl_dataset_t *ds, *newds;
uint64_t dsobj;
int error;
- uint64_t crflags;
+ uint64_t crflags = 0;
- crflags = (drrb->drr_flags & DRR_FLAG_CI_DATA) ?
- DS_FLAG_CI_DATASET : 0;
+ if (drrb->drr_flags & DRR_FLAG_CI_DATA)
+ crflags |= DS_FLAG_CI_DATASET;
error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
if (error == 0) {
/* create temporary clone */
+ dsl_dataset_t *snap = NULL;
+ if (drba->drba_snapobj != 0) {
+ VERIFY0(dsl_dataset_hold_obj(dp,
+ drba->drba_snapobj, FTAG, &snap));
+ }
dsobj = dsl_dataset_create_sync(ds->ds_dir, recv_clone_name,
- ds->ds_prev, crflags, drba->drba_cred, tx);
+ snap, crflags, drba->drba_cred, tx);
+ if (drba->drba_snapobj != 0)
+ dsl_dataset_rele(snap, FTAG);
dsl_dataset_rele(ds, FTAG);
} else {
dsl_dir_t *dd;
@@ -883,17 +1479,44 @@
}
VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds));
+ if (drba->drba_cookie->drc_resumable) {
+ dsl_dataset_zapify(newds, tx);
+ if (drrb->drr_fromguid != 0) {
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_FROMGUID,
+ 8, 1, &drrb->drr_fromguid, tx));
+ }
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TOGUID,
+ 8, 1, &drrb->drr_toguid, tx));
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TONAME,
+ 1, strlen(drrb->drr_toname) + 1, drrb->drr_toname, tx));
+ uint64_t one = 1;
+ uint64_t zero = 0;
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OBJECT,
+ 8, 1, &one, tx));
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OFFSET,
+ 8, 1, &zero, tx));
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_BYTES,
+ 8, 1, &zero, tx));
+ if (DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) &
+ DMU_BACKUP_FEATURE_EMBED_DATA) {
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_EMBEDOK,
+ 8, 1, &one, tx));
+ }
+ }
+
dmu_buf_will_dirty(newds->ds_dbuf, tx);
- newds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
+ dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT;
/*
* If we actually created a non-clone, we need to create the
* objset in our new dataset.
*/
+ rrw_enter(&newds->ds_bp_rwlock, RW_READER, FTAG);
if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds))) {
(void) dmu_objset_create_impl(dp->dp_spa,
newds, dsl_dataset_get_blkptr(newds), drrb->drr_type, tx);
}
+ rrw_exit(&newds->ds_bp_rwlock, FTAG);
drba->drba_cookie->drc_ds = newds;
@@ -900,67 +1523,266 @@
spa_history_log_internal_ds(newds, "receive", tx, "");
}
+static int
+dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx)
+{
+ dmu_recv_begin_arg_t *drba = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
+ int error;
+ uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
+ dsl_dataset_t *ds;
+ const char *tofs = drba->drba_cookie->drc_tofs;
+
+ /* already checked */
+ ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);
+ ASSERT(featureflags & DMU_BACKUP_FEATURE_RESUMING);
+
+ if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
+ DMU_COMPOUNDSTREAM ||
+ drrb->drr_type >= DMU_OST_NUMTYPES)
+ return (SET_ERROR(EINVAL));
+
+ /* Verify pool version supports SA if SA_SPILL feature set */
+ if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) &&
+ spa_version(dp->dp_spa) < SPA_VERSION_SA)
+ return (SET_ERROR(ENOTSUP));
+
+ /*
+ * The receiving code doesn't know how to translate a WRITE_EMBEDDED
+ * record to a plain WRITE record, so the pool must have the
+ * EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED
+ * records. Same with WRITE_EMBEDDED records that use LZ4 compression.
+ */
+ if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) &&
+ !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA))
+ return (SET_ERROR(ENOTSUP));
+ if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4) &&
+ !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
+ return (SET_ERROR(ENOTSUP));
+
+ /* 6 extra bytes for /%recv */
+ char recvname[ZFS_MAX_DATASET_NAME_LEN + 6];
+
+ (void) snprintf(recvname, sizeof (recvname), "%s/%s",
+ tofs, recv_clone_name);
+
+ if (dsl_dataset_hold(dp, recvname, FTAG, &ds) != 0) {
+ /* %recv does not exist; continue in tofs */
+ error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
+ if (error != 0)
+ return (error);
+ }
+
+ /* check that ds is marked inconsistent */
+ if (!DS_IS_INCONSISTENT(ds)) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /* check that there is resuming data, and that the toguid matches */
+ if (!dsl_dataset_is_zapified(ds)) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+ uint64_t val;
+ error = zap_lookup(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val);
+ if (error != 0 || drrb->drr_toguid != val) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * Check if the receive is still running. If so, it will be owned.
+ * Note that nothing else can own the dataset (e.g. after the receive
+ * fails) because it will be marked inconsistent.
+ */
+ if (dsl_dataset_has_owner(ds)) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(EBUSY));
+ }
+
+ /* There should not be any snapshots of this fs yet. */
+ if (ds->ds_prev != NULL && ds->ds_prev->ds_dir == ds->ds_dir) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * Note: resume point will be checked when we process the first WRITE
+ * record.
+ */
+
+ /* check that the origin matches */
+ val = 0;
+ (void) zap_lookup(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val);
+ if (drrb->drr_fromguid != val) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+}
+
+static void
+dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx)
+{
+ dmu_recv_begin_arg_t *drba = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ const char *tofs = drba->drba_cookie->drc_tofs;
+ dsl_dataset_t *ds;
+ uint64_t dsobj;
+ /* 6 extra bytes for /%recv */
+ char recvname[ZFS_MAX_DATASET_NAME_LEN + 6];
+
+ (void) snprintf(recvname, sizeof (recvname), "%s/%s",
+ tofs, recv_clone_name);
+
+ if (dsl_dataset_hold(dp, recvname, FTAG, &ds) != 0) {
+ /* %recv does not exist; continue in tofs */
+ VERIFY0(dsl_dataset_hold(dp, tofs, FTAG, &ds));
+ drba->drba_cookie->drc_newfs = B_TRUE;
+ }
+
+ /* clear the inconsistent flag so that we can own it */
+ ASSERT(DS_IS_INCONSISTENT(ds));
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT;
+ dsobj = ds->ds_object;
+ dsl_dataset_rele(ds, FTAG);
+
+ VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &ds));
+
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_INCONSISTENT;
+
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+ ASSERT(!BP_IS_HOLE(dsl_dataset_get_blkptr(ds)));
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
+
+ drba->drba_cookie->drc_ds = ds;
+
+ spa_history_log_internal_ds(ds, "resume receive", tx, "");
+}
+
/*
* NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin()
* succeeds; otherwise we will leak the holds on the datasets.
*/
int
-dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb,
- boolean_t force, char *origin, dmu_recv_cookie_t *drc)
+dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin,
+ boolean_t force, boolean_t resumable, char *origin, dmu_recv_cookie_t *drc)
{
dmu_recv_begin_arg_t drba = { 0 };
- dmu_replay_record_t *drr;
bzero(drc, sizeof (dmu_recv_cookie_t));
- drc->drc_drrb = drrb;
+ drc->drc_drr_begin = drr_begin;
+ drc->drc_drrb = &drr_begin->drr_u.drr_begin;
drc->drc_tosnap = tosnap;
drc->drc_tofs = tofs;
drc->drc_force = force;
+ drc->drc_resumable = resumable;
+ drc->drc_cred = CRED();
- if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
+ if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) {
drc->drc_byteswap = B_TRUE;
- else if (drrb->drr_magic != DMU_BACKUP_MAGIC)
- return (SET_ERROR(EINVAL));
-
- drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
- drr->drr_type = DRR_BEGIN;
- drr->drr_u.drr_begin = *drc->drc_drrb;
- if (drc->drc_byteswap) {
- fletcher_4_incremental_byteswap(drr,
+ fletcher_4_incremental_byteswap(drr_begin,
sizeof (dmu_replay_record_t), &drc->drc_cksum);
+ byteswap_record(drr_begin);
+ } else if (drc->drc_drrb->drr_magic == DMU_BACKUP_MAGIC) {
+ fletcher_4_incremental_native(drr_begin,
+ sizeof (dmu_replay_record_t), &drc->drc_cksum);
} else {
- fletcher_4_incremental_native(drr,
- sizeof (dmu_replay_record_t), &drc->drc_cksum);
+ return (SET_ERROR(EINVAL));
}
- kmem_free(drr, sizeof (dmu_replay_record_t));
- if (drc->drc_byteswap) {
- drrb->drr_magic = BSWAP_64(drrb->drr_magic);
- drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo);
- drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time);
- drrb->drr_type = BSWAP_32(drrb->drr_type);
- drrb->drr_toguid = BSWAP_64(drrb->drr_toguid);
- drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid);
- }
-
drba.drba_origin = origin;
drba.drba_cookie = drc;
drba.drba_cred = CRED();
- return (dsl_sync_task(tofs, dmu_recv_begin_check, dmu_recv_begin_sync,
- &drba, 5));
+ if (DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo) &
+ DMU_BACKUP_FEATURE_RESUMING) {
+ return (dsl_sync_task(tofs,
+ dmu_recv_resume_begin_check, dmu_recv_resume_begin_sync,
+ &drba, 5, ZFS_SPACE_CHECK_NORMAL));
+ } else {
+ return (dsl_sync_task(tofs,
+ dmu_recv_begin_check, dmu_recv_begin_sync,
+ &drba, 5, ZFS_SPACE_CHECK_NORMAL));
+ }
}
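
dmu_recv_begin() above decides byte order purely from the begin record's magic: native if it matches DMU_BACKUP_MAGIC, swapped if it matches the 64-bit byte-swapped value, invalid otherwise; the matching fletcher-4 variant is then fed before the record is swapped in place. A small sketch of just the detection step; it uses the GCC/Clang __builtin_bswap64, and the magic value is the send-stream constant from the ZFS headers (treat it as illustrative if your tree differs).

#include <stdint.h>
#include <stdio.h>

#define DMU_BACKUP_MAGIC 0x2F5bacbacULL     /* send-stream format magic */

/* Returns 1 for a byte-swapped stream, 0 for native, -1 for not a stream. */
static int
detect_byteswap(uint64_t magic)
{
    if (magic == DMU_BACKUP_MAGIC)
        return (0);
    if (magic == __builtin_bswap64(DMU_BACKUP_MAGIC))
        return (1);
    return (-1);                /* the kernel returns SET_ERROR(EINVAL) */
}

int
main(void)
{
    printf("%d %d %d\n",
        detect_byteswap(DMU_BACKUP_MAGIC),
        detect_byteswap(__builtin_bswap64(DMU_BACKUP_MAGIC)),
        detect_byteswap(0));
    return (0);
}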
-struct restorearg {
+struct receive_record_arg {
+ dmu_replay_record_t header;
+ void *payload; /* Pointer to a buffer containing the payload */
+ /*
+ * If the record is a write, pointer to the arc_buf_t containing the
+ * payload.
+ */
+ arc_buf_t *write_buf;
+ int payload_size;
+ uint64_t bytes_read; /* bytes read from stream when record created */
+ boolean_t eos_marker; /* Marks the end of the stream */
+ bqueue_node_t node;
+};
+
+struct receive_writer_arg {
+ objset_t *os;
+ boolean_t byteswap;
+ bqueue_t q;
+
+ /*
+ * These three args are used to signal to the main thread that we're
+ * done.
+ */
+ kmutex_t mutex;
+ kcondvar_t cv;
+ boolean_t done;
+
int err;
- boolean_t byteswap;
+ /* A map from guid to dataset to help handle dedup'd streams. */
+ avl_tree_t *guid_to_ds_map;
+ boolean_t resumable;
+ uint64_t last_object, last_offset;
+ uint64_t bytes_read; /* bytes read when current record created */
+};
+
+struct objlist {
+ list_t list; /* List of struct receive_objnode. */
+ /*
+ * Last object looked up. Used to assert that objects are being looked
+ * up in ascending order.
+ */
+ uint64_t last_lookup;
+};
+
+struct receive_objnode {
+ list_node_t node;
+ uint64_t object;
+};
+
+struct receive_arg {
+ objset_t *os;
kthread_t *td;
struct file *fp;
- char *buf;
- uint64_t voff;
- int bufsize; /* amount of memory allocated for buf */
+ uint64_t voff; /* The current offset in the stream */
+ uint64_t bytes_read;
+ /*
+ * A record that has had its payload read in, but hasn't yet been handed
+ * off to the worker thread.
+ */
+ struct receive_record_arg *rrd;
+ /* A record that has had its header read in, but not its payload. */
+ struct receive_record_arg *next_rrd;
zio_cksum_t cksum;
- avl_tree_t *guid_to_ds_map;
+ zio_cksum_t prev_cksum;
+ int err;
+ boolean_t byteswap;
+ /* Sorted list of objects not to issue prefetches for. */
+ struct objlist ignore_objlist;
};
typedef struct guid_map_entry {
@@ -999,7 +1821,7 @@
}
static int
-restore_bytes(struct restorearg *ra, void *buf, int len, off_t off, ssize_t *resid)
+restore_bytes(struct receive_arg *ra, void *buf, int len, off_t off, ssize_t *resid)
{
struct uio auio;
struct iovec aiov;
@@ -1024,45 +1846,50 @@
return (error);
}
-static void *
-restore_read(struct restorearg *ra, int len)
+static int
+receive_read(struct receive_arg *ra, int len, void *buf)
{
- void *rv;
int done = 0;
- /* some things will require 8-byte alignment, so everything must */
+ /*
+ * The code doesn't rely on this (lengths being multiples of 8). See
+ * comment in dump_bytes.
+ */
ASSERT0(len % 8);
while (done < len) {
ssize_t resid;
- ra->err = restore_bytes(ra, (caddr_t)ra->buf + done,
+ ra->err = restore_bytes(ra, buf + done,
len - done, ra->voff, &resid);
- if (resid == len - done)
- ra->err = SET_ERROR(EINVAL);
+ if (resid == len - done) {
+ /*
+ * Note: ECKSUM indicates that the receive
+ * was interrupted and can potentially be resumed.
+ */
+ ra->err = SET_ERROR(ECKSUM);
+ }
ra->voff += len - done - resid;
done = len - resid;
if (ra->err != 0)
- return (NULL);
+ return (ra->err);
}
+ ra->bytes_read += len;
+
ASSERT3U(done, ==, len);
- rv = ra->buf;
- if (ra->byteswap)
- fletcher_4_incremental_byteswap(rv, len, &ra->cksum);
- else
- fletcher_4_incremental_native(rv, len, &ra->cksum);
- return (rv);
+ return (0);
}
static void
-backup_byteswap(dmu_replay_record_t *drr)
+byteswap_record(dmu_replay_record_t *drr)
{
#define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X))
#define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X))
drr->drr_type = BSWAP_32(drr->drr_type);
drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen);
+
switch (drr->drr_type) {
case DRR_BEGIN:
DO64(drr_begin.drr_magic);
@@ -1075,7 +1902,6 @@
break;
case DRR_OBJECT:
DO64(drr_object.drr_object);
- /* DO64(drr_object.drr_allocation_txg); */
DO32(drr_object.drr_type);
DO32(drr_object.drr_bonustype);
DO32(drr_object.drr_blksz);
@@ -1093,10 +1919,7 @@
DO64(drr_write.drr_offset);
DO64(drr_write.drr_length);
DO64(drr_write.drr_toguid);
- DO64(drr_write.drr_key.ddk_cksum.zc_word[0]);
- DO64(drr_write.drr_key.ddk_cksum.zc_word[1]);
- DO64(drr_write.drr_key.ddk_cksum.zc_word[2]);
- DO64(drr_write.drr_key.ddk_cksum.zc_word[3]);
+ ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write.drr_key.ddk_cksum);
DO64(drr_write.drr_key.ddk_prop);
break;
case DRR_WRITE_BYREF:
@@ -1107,12 +1930,18 @@
DO64(drr_write_byref.drr_refguid);
DO64(drr_write_byref.drr_refobject);
DO64(drr_write_byref.drr_refoffset);
- DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[0]);
- DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[1]);
- DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[2]);
- DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[3]);
+ ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write_byref.
+ drr_key.ddk_cksum);
DO64(drr_write_byref.drr_key.ddk_prop);
break;
+ case DRR_WRITE_EMBEDDED:
+ DO64(drr_write_embedded.drr_object);
+ DO64(drr_write_embedded.drr_offset);
+ DO64(drr_write_embedded.drr_length);
+ DO64(drr_write_embedded.drr_toguid);
+ DO32(drr_write_embedded.drr_lsize);
+ DO32(drr_write_embedded.drr_psize);
+ break;
case DRR_FREE:
DO64(drr_free.drr_object);
DO64(drr_free.drr_offset);
@@ -1125,23 +1954,75 @@
DO64(drr_spill.drr_toguid);
break;
case DRR_END:
- DO64(drr_end.drr_checksum.zc_word[0]);
- DO64(drr_end.drr_checksum.zc_word[1]);
- DO64(drr_end.drr_checksum.zc_word[2]);
- DO64(drr_end.drr_checksum.zc_word[3]);
DO64(drr_end.drr_toguid);
+ ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_end.drr_checksum);
break;
}
+
+ if (drr->drr_type != DRR_BEGIN) {
+ ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_checksum.drr_checksum);
+ }
+
#undef DO64
#undef DO32
}
+static inline uint8_t
+deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size)
+{
+ if (bonus_type == DMU_OT_SA) {
+ return (1);
+ } else {
+ return (1 +
+ ((DN_MAX_BONUSLEN - bonus_size) >> SPA_BLKPTRSHIFT));
+ }
+}
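
deduce_nblkptr() above encodes the rule that a DMU_OT_SA bonus leaves exactly one block pointer, while otherwise the dnode keeps one block pointer plus one more for each full 128-byte block pointer not displaced by bonus space. A quick worked check of the arithmetic, assuming the historical constants DN_MAX_BONUSLEN == 320 and SPA_BLKPTRSHIFT == 7 (128-byte block pointers); adjust if your tree differs.

#include <stdint.h>
#include <stdio.h>

#define DN_MAX_BONUSLEN 320
#define SPA_BLKPTRSHIFT 7

static uint8_t
nblkptr_for(uint64_t bonus_size)
{
    return (1 + ((DN_MAX_BONUSLEN - bonus_size) >> SPA_BLKPTRSHIFT));
}

int
main(void)
{
    /* bonus   0 -> 1 + (320 >> 7) = 3 blkptrs (the dnode maximum) */
    /* bonus 192 -> 1 + (128 >> 7) = 2 blkptrs */
    /* bonus 320 -> 1 + (  0 >> 7) = 1 blkptr  */
    for (uint64_t b = 0; b <= DN_MAX_BONUSLEN; b += 64)
        printf("bonus %3llu -> %u blkptrs\n",
            (unsigned long long)b, (unsigned)nblkptr_for(b));
    return (0);
}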
+
+static void
+save_resume_state(struct receive_writer_arg *rwa,
+ uint64_t object, uint64_t offset, dmu_tx_t *tx)
+{
+ int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
+
+ if (!rwa->resumable)
+ return;
+
+ /*
+ * We use ds_resume_bytes[] != 0 to indicate that we need to
+ * update this on disk, so it must not be 0.
+ */
+ ASSERT(rwa->bytes_read != 0);
+
+ /*
+ * We only resume from write records, which have a valid
+ * (non-meta-dnode) object number.
+ */
+ ASSERT(object != 0);
+
+ /*
+ * For resuming to work correctly, we must receive records in order,
+ * sorted by object,offset. This is checked by the callers, but
+ * assert it here for good measure.
+ */
+ ASSERT3U(object, >=, rwa->os->os_dsl_dataset->ds_resume_object[txgoff]);
+ ASSERT(object != rwa->os->os_dsl_dataset->ds_resume_object[txgoff] ||
+ offset >= rwa->os->os_dsl_dataset->ds_resume_offset[txgoff]);
+ ASSERT3U(rwa->bytes_read, >=,
+ rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff]);
+
+ rwa->os->os_dsl_dataset->ds_resume_object[txgoff] = object;
+ rwa->os->os_dsl_dataset->ds_resume_offset[txgoff] = offset;
+ rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff] = rwa->bytes_read;
+}
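
save_resume_state() stores its progress in per-txg slots indexed by txg & TXG_MASK, i.e. a small ring with one slot per open transaction group, which is why ds_resume_object/offset/bytes are arrays. A toy illustration of that indexing, assuming TXG_SIZE of 4 as in this tree; names are illustrative.

#include <stdint.h>
#include <stdio.h>

#define TXG_SIZE 4
#define TXG_MASK (TXG_SIZE - 1)

int
main(void)
{
    uint64_t resume_offset[TXG_SIZE] = { 0 };

    for (uint64_t txg = 100; txg < 108; txg++) {
        int txgoff = txg & TXG_MASK;        /* 0..3, wraps every 4 txgs */
        resume_offset[txgoff] = txg * 1000; /* pretend per-txg progress */
        printf("txg %llu -> slot %d\n", (unsigned long long)txg, txgoff);
    }
    return (0);
}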
+
static int
-restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
+receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
+ void *data)
{
+ dmu_object_info_t doi;
+ dmu_tx_t *tx;
+ uint64_t object;
int err;
- dmu_tx_t *tx;
- void *data = NULL;
if (drro->drr_type == DMU_OT_NONE ||
!DMU_OT_IS_VALID(drro->drr_type) ||
@@ -1150,66 +2031,78 @@
drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
drro->drr_blksz < SPA_MINBLOCKSIZE ||
- drro->drr_blksz > SPA_MAXBLOCKSIZE ||
+ drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(rwa->os)) ||
drro->drr_bonuslen > DN_MAX_BONUSLEN) {
return (SET_ERROR(EINVAL));
}
- err = dmu_object_info(os, drro->drr_object, NULL);
+ err = dmu_object_info(rwa->os, drro->drr_object, &doi);
if (err != 0 && err != ENOENT)
return (SET_ERROR(EINVAL));
+ object = err == 0 ? drro->drr_object : DMU_NEW_OBJECT;
- if (drro->drr_bonuslen) {
- data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8));
- if (ra->err != 0)
- return (ra->err);
+ /*
+ * If we are losing blkptrs or changing the block size this must
+ * be a new file instance. We must clear out the previous file
+ * contents before we can change this type of metadata in the dnode.
+ */
+ if (err == 0) {
+ int nblkptr;
+
+ nblkptr = deduce_nblkptr(drro->drr_bonustype,
+ drro->drr_bonuslen);
+
+ if (drro->drr_blksz != doi.doi_data_block_size ||
+ nblkptr < doi.doi_nblkptr) {
+ err = dmu_free_long_range(rwa->os, drro->drr_object,
+ 0, DMU_OBJECT_END);
+ if (err != 0)
+ return (SET_ERROR(EINVAL));
+ }
}
- if (err == ENOENT) {
+ tx = dmu_tx_create(rwa->os);
+ dmu_tx_hold_bonus(tx, object);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err != 0) {
+ dmu_tx_abort(tx);
+ return (err);
+ }
+
+ if (object == DMU_NEW_OBJECT) {
/* currently free, want to be allocated */
- tx = dmu_tx_create(os);
- dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
- err = dmu_tx_assign(tx, TXG_WAIT);
- if (err != 0) {
- dmu_tx_abort(tx);
- return (err);
- }
- err = dmu_object_claim(os, drro->drr_object,
+ err = dmu_object_claim(rwa->os, drro->drr_object,
drro->drr_type, drro->drr_blksz,
drro->drr_bonustype, drro->drr_bonuslen, tx);
- dmu_tx_commit(tx);
- } else {
- /* currently allocated, want to be allocated */
- err = dmu_object_reclaim(os, drro->drr_object,
+ } else if (drro->drr_type != doi.doi_type ||
+ drro->drr_blksz != doi.doi_data_block_size ||
+ drro->drr_bonustype != doi.doi_bonus_type ||
+ drro->drr_bonuslen != doi.doi_bonus_size) {
+ /* currently allocated, but with different properties */
+ err = dmu_object_reclaim(rwa->os, drro->drr_object,
drro->drr_type, drro->drr_blksz,
- drro->drr_bonustype, drro->drr_bonuslen);
+ drro->drr_bonustype, drro->drr_bonuslen, tx);
}
if (err != 0) {
+ dmu_tx_commit(tx);
return (SET_ERROR(EINVAL));
}
- tx = dmu_tx_create(os);
- dmu_tx_hold_bonus(tx, drro->drr_object);
- err = dmu_tx_assign(tx, TXG_WAIT);
- if (err != 0) {
- dmu_tx_abort(tx);
- return (err);
- }
+ dmu_object_set_checksum(rwa->os, drro->drr_object,
+ drro->drr_checksumtype, tx);
+ dmu_object_set_compress(rwa->os, drro->drr_object,
+ drro->drr_compress, tx);
- dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksumtype,
- tx);
- dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx);
-
if (data != NULL) {
dmu_buf_t *db;
- VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db));
+ VERIFY0(dmu_bonus_hold(rwa->os, drro->drr_object, FTAG, &db));
dmu_buf_will_dirty(db, tx);
ASSERT3U(db->db_size, >=, drro->drr_bonuslen);
bcopy(data, db->db_data, drro->drr_bonuslen);
- if (ra->byteswap) {
+ if (rwa->byteswap) {
dmu_object_byteswap_t byteswap =
DMU_OT_BYTESWAP(drro->drr_bonustype);
dmu_ot_byteswap[byteswap].ob_func(db->db_data,
@@ -1218,40 +2111,43 @@
dmu_buf_rele(db, FTAG);
}
dmu_tx_commit(tx);
+
return (0);
}
/* ARGSUSED */
static int
-restore_freeobjects(struct restorearg *ra, objset_t *os,
+receive_freeobjects(struct receive_writer_arg *rwa,
struct drr_freeobjects *drrfo)
{
uint64_t obj;
+ int next_err = 0;
if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
return (SET_ERROR(EINVAL));
for (obj = drrfo->drr_firstobj;
- obj < drrfo->drr_firstobj + drrfo->drr_numobjs;
- (void) dmu_object_next(os, &obj, FALSE, 0)) {
+ obj < drrfo->drr_firstobj + drrfo->drr_numobjs && next_err == 0;
+ next_err = dmu_object_next(rwa->os, &obj, FALSE, 0)) {
int err;
- if (dmu_object_info(os, obj, NULL) != 0)
+ if (dmu_object_info(rwa->os, obj, NULL) != 0)
continue;
- err = dmu_free_object(os, obj);
+ err = dmu_free_long_object(rwa->os, obj);
if (err != 0)
return (err);
}
+ if (next_err != ESRCH)
+ return (next_err);
return (0);
}
static int
-restore_write(struct restorearg *ra, objset_t *os,
- struct drr_write *drrw)
+receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw,
+ arc_buf_t *abuf)
{
dmu_tx_t *tx;
- void *data;
int err;
if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset ||
@@ -1258,14 +2154,22 @@
!DMU_OT_IS_VALID(drrw->drr_type))
return (SET_ERROR(EINVAL));
- data = restore_read(ra, drrw->drr_length);
- if (data == NULL)
- return (ra->err);
+ /*
+ * For resuming to work, records must be in increasing order
+ * by (object, offset).
+ */
+ if (drrw->drr_object < rwa->last_object ||
+ (drrw->drr_object == rwa->last_object &&
+ drrw->drr_offset < rwa->last_offset)) {
+ return (SET_ERROR(EINVAL));
+ }
+ rwa->last_object = drrw->drr_object;
+ rwa->last_offset = drrw->drr_offset;
- if (dmu_object_info(os, drrw->drr_object, NULL) != 0)
+ if (dmu_object_info(rwa->os, drrw->drr_object, NULL) != 0)
return (SET_ERROR(EINVAL));
- tx = dmu_tx_create(os);
+ tx = dmu_tx_create(rwa->os);
dmu_tx_hold_write(tx, drrw->drr_object,
drrw->drr_offset, drrw->drr_length);
@@ -1274,14 +2178,28 @@
dmu_tx_abort(tx);
return (err);
}
- if (ra->byteswap) {
+ if (rwa->byteswap) {
dmu_object_byteswap_t byteswap =
DMU_OT_BYTESWAP(drrw->drr_type);
- dmu_ot_byteswap[byteswap].ob_func(data, drrw->drr_length);
+ dmu_ot_byteswap[byteswap].ob_func(abuf->b_data,
+ drrw->drr_length);
}
- dmu_write(os, drrw->drr_object,
- drrw->drr_offset, drrw->drr_length, data, tx);
+
+ dmu_buf_t *bonus;
+ if (dmu_bonus_hold(rwa->os, drrw->drr_object, FTAG, &bonus) != 0)
+ return (SET_ERROR(EINVAL));
+ dmu_assign_arcbuf(bonus, drrw->drr_offset, abuf, tx);
+
+ /*
+ * Note: If the receive fails, we want the resume stream to start
+ * with the same record that we last successfully received (as opposed
+ * to the next record), so that we can verify that we are
+ * resuming from the correct location.
+ */
+ save_resume_state(rwa, drrw->drr_object, drrw->drr_offset, tx);
dmu_tx_commit(tx);
+ dmu_buf_rele(bonus, FTAG);
+
return (0);
}
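
The ordering check at the top of receive_write() is a plain lexicographic comparison on (object, offset); that invariant is what makes the saved resume point meaningful. The same predicate in isolation, as a tiny sketch with illustrative names.

#include <stdint.h>
#include <stdio.h>

/* Nonzero if (obj, off) is in order at or after (last_obj, last_off). */
static int
record_in_order(uint64_t last_obj, uint64_t last_off,
    uint64_t obj, uint64_t off)
{
    if (obj != last_obj)
        return (obj > last_obj);
    return (off >= last_off);
}

int
main(void)
{
    printf("%d %d %d\n",
        record_in_order(5, 4096, 5, 8192),  /* 1: same object, later offset */
        record_in_order(5, 4096, 6, 0),     /* 1: later object */
        record_in_order(5, 4096, 5, 0));    /* 0: would rewind the stream */
    return (0);
}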
@@ -1293,7 +2211,7 @@
* data from the stream to fulfill this write.
*/
static int
-restore_write_byref(struct restorearg *ra, objset_t *os,
+receive_write_byref(struct receive_writer_arg *rwa,
struct drr_write_byref *drrwbr)
{
dmu_tx_t *tx;
@@ -1300,7 +2218,7 @@
int err;
guid_map_entry_t gmesrch;
guid_map_entry_t *gmep;
- avl_index_t where;
+ avl_index_t where;
objset_t *ref_os = NULL;
dmu_buf_t *dbp;
@@ -1313,7 +2231,7 @@
*/
if (drrwbr->drr_toguid != drrwbr->drr_refguid) {
gmesrch.guid = drrwbr->drr_refguid;
- if ((gmep = avl_find(ra->guid_to_ds_map, &gmesrch,
+ if ((gmep = avl_find(rwa->guid_to_ds_map, &gmesrch,
&where)) == NULL) {
return (SET_ERROR(EINVAL));
}
@@ -1320,14 +2238,15 @@
if (dmu_objset_from_ds(gmep->gme_ds, &ref_os))
return (SET_ERROR(EINVAL));
} else {
- ref_os = os;
+ ref_os = rwa->os;
}
- if (err = dmu_buf_hold(ref_os, drrwbr->drr_refobject,
- drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH))
+ err = dmu_buf_hold(ref_os, drrwbr->drr_refobject,
+ drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH);
+ if (err != 0)
return (err);
- tx = dmu_tx_create(os);
+ tx = dmu_tx_create(rwa->os);
dmu_tx_hold_write(tx, drrwbr->drr_object,
drrwbr->drr_offset, drrwbr->drr_length);
@@ -1336,39 +2255,77 @@
dmu_tx_abort(tx);
return (err);
}
- dmu_write(os, drrwbr->drr_object,
+ dmu_write(rwa->os, drrwbr->drr_object,
drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx);
dmu_buf_rele(dbp, FTAG);
+
+ /* See comment in restore_write. */
Correction intended here and in the identical comment below: the referenced function is receive_write() after the rename in this change (the old name was restore_write).
+ save_resume_state(rwa, drrwbr->drr_object, drrwbr->drr_offset, tx);
dmu_tx_commit(tx);
return (0);
}
static int
-restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs)
+receive_write_embedded(struct receive_writer_arg *rwa,
+ struct drr_write_embedded *drrwe, void *data)
{
dmu_tx_t *tx;
- void *data;
+ int err;
+
+ if (drrwe->drr_offset + drrwe->drr_length < drrwe->drr_offset)
+ return (EINVAL);
+
+ if (drrwe->drr_psize > BPE_PAYLOAD_SIZE)
+ return (EINVAL);
+
+ if (drrwe->drr_etype >= NUM_BP_EMBEDDED_TYPES)
+ return (EINVAL);
+ if (drrwe->drr_compression >= ZIO_COMPRESS_FUNCTIONS)
+ return (EINVAL);
+
+ tx = dmu_tx_create(rwa->os);
+
+ dmu_tx_hold_write(tx, drrwe->drr_object,
+ drrwe->drr_offset, drrwe->drr_length);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err != 0) {
+ dmu_tx_abort(tx);
+ return (err);
+ }
+
+ dmu_write_embedded(rwa->os, drrwe->drr_object,
+ drrwe->drr_offset, data, drrwe->drr_etype,
+ drrwe->drr_compression, drrwe->drr_lsize, drrwe->drr_psize,
+ rwa->byteswap ^ ZFS_HOST_BYTEORDER, tx);
+
+ /* See comment in restore_write. */
(As above, the comment refers to the function now named receive_write.)
+ save_resume_state(rwa, drrwe->drr_object, drrwe->drr_offset, tx);
+ dmu_tx_commit(tx);
+ return (0);
+}
+
+static int
+receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs,
+ void *data)
+{
+ dmu_tx_t *tx;
dmu_buf_t *db, *db_spill;
int err;
if (drrs->drr_length < SPA_MINBLOCKSIZE ||
- drrs->drr_length > SPA_MAXBLOCKSIZE)
+ drrs->drr_length > spa_maxblocksize(dmu_objset_spa(rwa->os)))
return (SET_ERROR(EINVAL));
- data = restore_read(ra, drrs->drr_length);
- if (data == NULL)
- return (ra->err);
-
- if (dmu_object_info(os, drrs->drr_object, NULL) != 0)
+ if (dmu_object_info(rwa->os, drrs->drr_object, NULL) != 0)
return (SET_ERROR(EINVAL));
- VERIFY(0 == dmu_bonus_hold(os, drrs->drr_object, FTAG, &db));
+ VERIFY0(dmu_bonus_hold(rwa->os, drrs->drr_object, FTAG, &db));
if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) {
dmu_buf_rele(db, FTAG);
return (err);
}
- tx = dmu_tx_create(os);
+ tx = dmu_tx_create(rwa->os);
dmu_tx_hold_spill(tx, db->db_object);
@@ -1395,8 +2352,7 @@
/* ARGSUSED */
static int
-restore_free(struct restorearg *ra, objset_t *os,
- struct drr_free *drrf)
+receive_free(struct receive_writer_arg *rwa, struct drr_free *drrf)
{
int err;
@@ -1404,11 +2360,12 @@
drrf->drr_offset + drrf->drr_length < drrf->drr_offset)
return (SET_ERROR(EINVAL));
- if (dmu_object_info(os, drrf->drr_object, NULL) != 0)
+ if (dmu_object_info(rwa->os, drrf->drr_object, NULL) != 0)
return (SET_ERROR(EINVAL));
- err = dmu_free_long_range(os, drrf->drr_object,
+ err = dmu_free_long_range(rwa->os, drrf->drr_object,
drrf->drr_offset, drrf->drr_length);
+
return (err);
}
@@ -1416,13 +2373,445 @@
static void
dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc)
{
- char name[MAXNAMELEN];
- dsl_dataset_name(drc->drc_ds, name);
- dsl_dataset_disown(drc->drc_ds, dmu_recv_tag);
- (void) dsl_destroy_head(name);
+ if (drc->drc_resumable) {
+ /* wait for our resume state to be written to disk */
+ txg_wait_synced(drc->drc_ds->ds_dir->dd_pool, 0);
+ dsl_dataset_disown(drc->drc_ds, dmu_recv_tag);
+ } else {
+ char name[ZFS_MAX_DATASET_NAME_LEN];
+ dsl_dataset_name(drc->drc_ds, name);
+ dsl_dataset_disown(drc->drc_ds, dmu_recv_tag);
+ (void) dsl_destroy_head(name);
+ }
}
+static void
+receive_cksum(struct receive_arg *ra, int len, void *buf)
+{
+ if (ra->byteswap) {
+ fletcher_4_incremental_byteswap(buf, len, &ra->cksum);
+ } else {
+ fletcher_4_incremental_native(buf, len, &ra->cksum);
+ }
+}
+
/*
+ * Read the payload into a buffer of size len, and update the current record's
+ * payload field.
+ * Allocate ra->next_rrd and read the next record's header into
+ * ra->next_rrd->header.
+ * Verify checksum of payload and next record.
+ */
+static int
+receive_read_payload_and_next_header(struct receive_arg *ra, int len, void *buf)
+{
+ int err;
+
+ if (len != 0) {
+ ASSERT3U(len, <=, SPA_MAXBLOCKSIZE);
+ err = receive_read(ra, len, buf);
+ if (err != 0)
+ return (err);
+ receive_cksum(ra, len, buf);
+
+ /* note: rrd is NULL when reading the begin record's payload */
+ if (ra->rrd != NULL) {
+ ra->rrd->payload = buf;
+ ra->rrd->payload_size = len;
+ ra->rrd->bytes_read = ra->bytes_read;
+ }
+ }
+
+ ra->prev_cksum = ra->cksum;
+
+ ra->next_rrd = kmem_zalloc(sizeof (*ra->next_rrd), KM_SLEEP);
+ err = receive_read(ra, sizeof (ra->next_rrd->header),
+ &ra->next_rrd->header);
+ ra->next_rrd->bytes_read = ra->bytes_read;
+ if (err != 0) {
+ kmem_free(ra->next_rrd, sizeof (*ra->next_rrd));
+ ra->next_rrd = NULL;
+ return (err);
+ }
+ if (ra->next_rrd->header.drr_type == DRR_BEGIN) {
+ kmem_free(ra->next_rrd, sizeof (*ra->next_rrd));
+ ra->next_rrd = NULL;
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * Note: checksum is of everything up to but not including the
+ * checksum itself.
+ */
+ ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
+ ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
+ receive_cksum(ra,
+ offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
+ &ra->next_rrd->header);
+
+ zio_cksum_t cksum_orig =
+ ra->next_rrd->header.drr_u.drr_checksum.drr_checksum;
+ zio_cksum_t *cksump =
+ &ra->next_rrd->header.drr_u.drr_checksum.drr_checksum;
+
+ if (ra->byteswap)
+ byteswap_record(&ra->next_rrd->header);
+
+ if ((!ZIO_CHECKSUM_IS_ZERO(cksump)) &&
+ !ZIO_CHECKSUM_EQUAL(ra->cksum, *cksump)) {
+ kmem_free(ra->next_rrd, sizeof (*ra->next_rrd));
+ ra->next_rrd = NULL;
+ return (SET_ERROR(ECKSUM));
+ }
+
+ receive_cksum(ra, sizeof (cksum_orig), &cksum_orig);
+
+ return (0);
+}
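
The framing used above is worth spelling out: each record header embeds a fletcher-4 checksum of the whole stream up to, but not including, that checksum field, so the receiver checksums the header prefix, compares, and then folds the stored checksum back into the running state. A minimal userland sketch of the same idea, assuming a simplified record layout (the real dmu_replay_record_t differs); the fletcher-4 core follows the usual a/b/c/d recurrence over 32-bit words.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef struct cksum { uint64_t w[4]; } cksum_t;

static void
fletcher4_incremental(const void *buf, size_t size, cksum_t *zcp)
{
    const uint32_t *ip = buf;
    const uint32_t *end = ip + size / sizeof (uint32_t);
    uint64_t a = zcp->w[0], b = zcp->w[1], c = zcp->w[2], d = zcp->w[3];

    for (; ip < end; ip++) {
        a += *ip;
        b += a;
        c += b;
        d += c;
    }
    zcp->w[0] = a; zcp->w[1] = b; zcp->w[2] = c; zcp->w[3] = d;
}

struct record {                 /* toy record: payload fields, then checksum */
    uint64_t type;
    uint64_t length;
    cksum_t  checksum;          /* covers everything before this field */
};

int
main(void)
{
    struct record r = { .type = 1, .length = 4096 };
    cksum_t running = { { 0 } };

    /* Sender side: checksum the header prefix and store the result. */
    fletcher4_incremental(&r, offsetof(struct record, checksum), &running);
    r.checksum = running;

    /* Receiver side: recompute the same prefix and compare. */
    cksum_t verify = { { 0 } };
    fletcher4_incremental(&r, offsetof(struct record, checksum), &verify);
    printf("checksum %s\n",
        memcmp(&verify, &r.checksum, sizeof (verify)) == 0 ? "ok" : "mismatch");
    return (0);
}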
+
+static void
+objlist_create(struct objlist *list)
+{
+ list_create(&list->list, sizeof (struct receive_objnode),
+ offsetof(struct receive_objnode, node));
+ list->last_lookup = 0;
+}
+
+static void
+objlist_destroy(struct objlist *list)
+{
+ for (struct receive_objnode *n = list_remove_head(&list->list);
+ n != NULL; n = list_remove_head(&list->list)) {
+ kmem_free(n, sizeof (*n));
+ }
+ list_destroy(&list->list);
+}
+
+/*
+ * This function looks through the objlist to see if the specified object number
+ * is contained in the objlist. In the process, it will remove all object
+ * numbers in the list that are smaller than the specified object number. Thus,
+ * any lookup of an object number smaller than a previously looked up object
+ * number will always return false; therefore, all lookups should be done in
+ * ascending order.
+ */
+static boolean_t
+objlist_exists(struct objlist *list, uint64_t object)
+{
+ struct receive_objnode *node = list_head(&list->list);
+ ASSERT3U(object, >=, list->last_lookup);
+ list->last_lookup = object;
+ while (node != NULL && node->object < object) {
+ VERIFY3P(node, ==, list_remove_head(&list->list));
+ kmem_free(node, sizeof (*node));
+ node = list_head(&list->list);
+ }
+ return (node != NULL && node->object == object);
+}
+
+/*
+ * The objlist is a list of object numbers stored in ascending order. However,
+ * the insertion of new object numbers does not seek out the correct location to
+ * store a new object number; instead, it appends it to the list for simplicity.
+ * Thus, any users must take care to only insert new object numbers in ascending
+ * order.
+ */
+static void
+objlist_insert(struct objlist *list, uint64_t object)
+{
+ struct receive_objnode *node = kmem_zalloc(sizeof (*node), KM_SLEEP);
+ node->object = object;
+#ifdef ZFS_DEBUG
+ struct receive_objnode *last_object = list_tail(&list->list);
+ uint64_t last_objnum = (last_object != NULL ? last_object->object : 0);
+ ASSERT3U(node->object, >, last_objnum);
+#endif
+ list_insert_tail(&list->list, node);
+}
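
The objlist above relies on the two invariants spelled out in its comments: inserts arrive in ascending object order, and every lookup trims entries below the object being queried, so the list stays short while the stream is processed front to back. A standalone userland sketch of the same structure, using malloc/free in place of the kernel list(9) and kmem interfaces; all names are illustrative.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct objnode {
    struct objnode *next;
    uint64_t object;
};

struct objlist {
    struct objnode *head, *tail;
};

static void
objlist_insert(struct objlist *l, uint64_t object)
{
    struct objnode *n = calloc(1, sizeof (*n));

    n->object = object;
    if (l->tail != NULL)
        l->tail->next = n;
    else
        l->head = n;
    l->tail = n;
}

static int
objlist_exists(struct objlist *l, uint64_t object)
{
    /* Trim everything below 'object'; lookups must be ascending. */
    while (l->head != NULL && l->head->object < object) {
        struct objnode *n = l->head;
        l->head = n->next;
        if (l->head == NULL)
            l->tail = NULL;
        free(n);
    }
    return (l->head != NULL && l->head->object == object);
}

int
main(void)
{
    struct objlist l = { NULL, NULL };

    objlist_insert(&l, 3);
    objlist_insert(&l, 7);
    printf("%d %d %d\n",
        objlist_exists(&l, 2),      /* 0: not present, nothing trimmed */
        objlist_exists(&l, 3),      /* 1: head matches */
        objlist_exists(&l, 7));     /* 1: node 3 is trimmed on the way */
    return (0);
}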
+
+/*
+ * Issue the prefetch reads for any necessary indirect blocks.
+ *
+ * We use the object ignore list to tell us whether or not to issue prefetches
+ * for a given object. We do this for both correctness (in case the blocksize
+ * of an object has changed) and performance (if the object doesn't exist, don't
+ * needlessly try to issue prefetches). We also trim the list as we go through
+ * the stream to prevent it from growing to an unbounded size.
+ *
+ * The object numbers within will always be in sorted order, and any write
+ * records we see will also be in sorted order, but they're not sorted with
+ * respect to each other (i.e. we can get several object records before
+ * receiving each object's write records). As a result, once we've reached a
+ * given object number, we can safely remove any reference to lower object
+ * numbers in the ignore list. In practice, we receive up to 32 object records
+ * before receiving write records, so the list can have up to 32 nodes in it.
+ */
+/* ARGSUSED */
+static void
+receive_read_prefetch(struct receive_arg *ra,
+ uint64_t object, uint64_t offset, uint64_t length)
+{
+ if (!objlist_exists(&ra->ignore_objlist, object)) {
+ dmu_prefetch(ra->os, object, 1, offset, length,
+ ZIO_PRIORITY_SYNC_READ);
+ }
+}
+
+/*
+ * Read records off the stream, issuing any necessary prefetches.
+ */
+static int
+receive_read_record(struct receive_arg *ra)
+{
+ int err;
+
+ switch (ra->rrd->header.drr_type) {
+ case DRR_OBJECT:
+ {
+ struct drr_object *drro = &ra->rrd->header.drr_u.drr_object;
+ uint32_t size = P2ROUNDUP(drro->drr_bonuslen, 8);
+ void *buf = kmem_zalloc(size, KM_SLEEP);
+ dmu_object_info_t doi;
+ err = receive_read_payload_and_next_header(ra, size, buf);
+ if (err != 0) {
+ kmem_free(buf, size);
+ return (err);
+ }
+ err = dmu_object_info(ra->os, drro->drr_object, &doi);
+ /*
+ * See receive_read_prefetch for an explanation why we're
+ * storing this object in the ignore_obj_list.
+ */
+ if (err == ENOENT ||
+ (err == 0 && doi.doi_data_block_size != drro->drr_blksz)) {
+ objlist_insert(&ra->ignore_objlist, drro->drr_object);
+ err = 0;
+ }
+ return (err);
+ }
+ case DRR_FREEOBJECTS:
+ {
+ err = receive_read_payload_and_next_header(ra, 0, NULL);
+ return (err);
+ }
+ case DRR_WRITE:
+ {
+ struct drr_write *drrw = &ra->rrd->header.drr_u.drr_write;
+ arc_buf_t *abuf = arc_loan_buf(dmu_objset_spa(ra->os),
+ drrw->drr_length);
+
+ err = receive_read_payload_and_next_header(ra,
+ drrw->drr_length, abuf->b_data);
+ if (err != 0) {
+ dmu_return_arcbuf(abuf);
+ return (err);
+ }
+ ra->rrd->write_buf = abuf;
+ receive_read_prefetch(ra, drrw->drr_object, drrw->drr_offset,
+ drrw->drr_length);
+ return (err);
+ }
+ case DRR_WRITE_BYREF:
+ {
+ struct drr_write_byref *drrwb =
+ &ra->rrd->header.drr_u.drr_write_byref;
+ err = receive_read_payload_and_next_header(ra, 0, NULL);
+ receive_read_prefetch(ra, drrwb->drr_object, drrwb->drr_offset,
+ drrwb->drr_length);
+ return (err);
+ }
+ case DRR_WRITE_EMBEDDED:
+ {
+ struct drr_write_embedded *drrwe =
+ &ra->rrd->header.drr_u.drr_write_embedded;
+ uint32_t size = P2ROUNDUP(drrwe->drr_psize, 8);
+ void *buf = kmem_zalloc(size, KM_SLEEP);
+
+ err = receive_read_payload_and_next_header(ra, size, buf);
+ if (err != 0) {
+ kmem_free(buf, size);
+ return (err);
+ }
+
+ receive_read_prefetch(ra, drrwe->drr_object, drrwe->drr_offset,
+ drrwe->drr_length);
+ return (err);
+ }
+ case DRR_FREE:
+ {
+ /*
+ * It might be beneficial to prefetch indirect blocks here, but
+ * we don't really have the data to decide for sure.
+ */
+ err = receive_read_payload_and_next_header(ra, 0, NULL);
+ return (err);
+ }
+ case DRR_END:
+ {
+ struct drr_end *drre = &ra->rrd->header.drr_u.drr_end;
+ if (!ZIO_CHECKSUM_EQUAL(ra->prev_cksum, drre->drr_checksum))
+ return (SET_ERROR(ECKSUM));
+ return (0);
+ }
+ case DRR_SPILL:
+ {
+ struct drr_spill *drrs = &ra->rrd->header.drr_u.drr_spill;
+ void *buf = kmem_zalloc(drrs->drr_length, KM_SLEEP);
+ err = receive_read_payload_and_next_header(ra, drrs->drr_length,
+ buf);
+ if (err != 0)
+ kmem_free(buf, drrs->drr_length);
+ return (err);
+ }
+ default:
+ return (SET_ERROR(EINVAL));
+ }
+}
+
+/*
+ * Commit the records to the pool.
+ */
+static int
+receive_process_record(struct receive_writer_arg *rwa,
+ struct receive_record_arg *rrd)
+{
+ int err;
+
+ /* Processing in order, therefore bytes_read should be increasing. */
+ ASSERT3U(rrd->bytes_read, >=, rwa->bytes_read);
+ rwa->bytes_read = rrd->bytes_read;
+
+ switch (rrd->header.drr_type) {
+ case DRR_OBJECT:
+ {
+ struct drr_object *drro = &rrd->header.drr_u.drr_object;
+ err = receive_object(rwa, drro, rrd->payload);
+ kmem_free(rrd->payload, rrd->payload_size);
+ rrd->payload = NULL;
+ return (err);
+ }
+ case DRR_FREEOBJECTS:
+ {
+ struct drr_freeobjects *drrfo =
+ &rrd->header.drr_u.drr_freeobjects;
+ return (receive_freeobjects(rwa, drrfo));
+ }
+ case DRR_WRITE:
+ {
+ struct drr_write *drrw = &rrd->header.drr_u.drr_write;
+ err = receive_write(rwa, drrw, rrd->write_buf);
+ /* if receive_write() is successful, it consumes the arc_buf */
+ if (err != 0)
+ dmu_return_arcbuf(rrd->write_buf);
+ rrd->write_buf = NULL;
+ rrd->payload = NULL;
+ return (err);
+ }
+ case DRR_WRITE_BYREF:
+ {
+ struct drr_write_byref *drrwbr =
+ &rrd->header.drr_u.drr_write_byref;
+ return (receive_write_byref(rwa, drrwbr));
+ }
+ case DRR_WRITE_EMBEDDED:
+ {
+ struct drr_write_embedded *drrwe =
+ &rrd->header.drr_u.drr_write_embedded;
+ err = receive_write_embedded(rwa, drrwe, rrd->payload);
+ kmem_free(rrd->payload, rrd->payload_size);
+ rrd->payload = NULL;
+ return (err);
+ }
+ case DRR_FREE:
+ {
+ struct drr_free *drrf = &rrd->header.drr_u.drr_free;
+ return (receive_free(rwa, drrf));
+ }
+ case DRR_SPILL:
+ {
+ struct drr_spill *drrs = &rrd->header.drr_u.drr_spill;
+ err = receive_spill(rwa, drrs, rrd->payload);
+ kmem_free(rrd->payload, rrd->payload_size);
+ rrd->payload = NULL;
+ return (err);
+ }
+ default:
+ return (SET_ERROR(EINVAL));
+ }
+}
+
+/*
+ * dmu_recv_stream's worker thread; pull records off the queue, and then call
+ * receive_process_record. When we're done, signal the main thread and exit.
+ */
+static void
+receive_writer_thread(void *arg)
+{
+ struct receive_writer_arg *rwa = arg;
+ struct receive_record_arg *rrd;
+ for (rrd = bqueue_dequeue(&rwa->q); !rrd->eos_marker;
+ rrd = bqueue_dequeue(&rwa->q)) {
+ /*
+ * If there's an error, the main thread will stop putting things
+ * on the queue, but we need to clear everything in it before we
+ * can exit.
+ */
+ if (rwa->err == 0) {
+ rwa->err = receive_process_record(rwa, rrd);
+ } else if (rrd->write_buf != NULL) {
+ dmu_return_arcbuf(rrd->write_buf);
+ rrd->write_buf = NULL;
+ rrd->payload = NULL;
+ } else if (rrd->payload != NULL) {
+ kmem_free(rrd->payload, rrd->payload_size);
+ rrd->payload = NULL;
+ }
+ kmem_free(rrd, sizeof (*rrd));
+ }
+ kmem_free(rrd, sizeof (*rrd));
+ mutex_enter(&rwa->mutex);
+ rwa->done = B_TRUE;
+ cv_signal(&rwa->cv);
+ mutex_exit(&rwa->mutex);
+ thread_exit();
+}
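
receive_writer_thread() follows a common producer/consumer shutdown pattern: keep draining the queue until the end-of-stream marker even after an error, so nothing is leaked, then set 'done' under the mutex and signal the waiter. A pthreads sketch of that pattern, with a trivial unbounded queue standing in for the kernel's bounded bqueue; compile with -lpthread, and treat every name as illustrative.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct record {
    struct record *next;
    int eos_marker;             /* nonzero on the final record */
    int payload;
};

struct queue {
    pthread_mutex_t mtx;
    pthread_cond_t cv;
    struct record *head, *tail;
    int done;
    int err;
};

static void
enqueue(struct queue *q, struct record *r)
{
    pthread_mutex_lock(&q->mtx);
    if (q->tail != NULL)
        q->tail->next = r;
    else
        q->head = r;
    q->tail = r;
    pthread_cond_signal(&q->cv);
    pthread_mutex_unlock(&q->mtx);
}

static struct record *
dequeue(struct queue *q)
{
    pthread_mutex_lock(&q->mtx);
    while (q->head == NULL)
        pthread_cond_wait(&q->cv, &q->mtx);
    struct record *r = q->head;
    q->head = r->next;
    if (q->head == NULL)
        q->tail = NULL;
    pthread_mutex_unlock(&q->mtx);
    return (r);
}

static void *
writer_thread(void *arg)
{
    struct queue *q = arg;
    struct record *r;

    for (r = dequeue(q); !r->eos_marker; r = dequeue(q)) {
        if (q->err == 0 && r->payload < 0)
            q->err = -1;        /* first failure stops processing */
        else if (q->err == 0)
            printf("processed %d\n", r->payload);
        free(r);                /* always release the record */
    }
    free(r);                    /* the eos marker itself */

    pthread_mutex_lock(&q->mtx);
    q->done = 1;
    pthread_cond_signal(&q->cv);
    pthread_mutex_unlock(&q->mtx);
    return (NULL);
}

int
main(void)
{
    struct queue q = { PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER,
        NULL, NULL, 0, 0 };
    pthread_t tid;

    pthread_create(&tid, NULL, writer_thread, &q);
    for (int i = 1; i <= 3; i++) {
        struct record *r = calloc(1, sizeof (*r));
        r->payload = i;
        enqueue(&q, r);
    }
    struct record *eos = calloc(1, sizeof (*eos));
    eos->eos_marker = 1;
    enqueue(&q, eos);

    pthread_mutex_lock(&q.mtx);
    while (!q.done)
        pthread_cond_wait(&q.cv, &q.mtx);
    pthread_mutex_unlock(&q.mtx);
    pthread_join(tid, NULL);
    return (0);
}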
+
+static int
+resume_check(struct receive_arg *ra, nvlist_t *begin_nvl)
+{
+ uint64_t val;
+ objset_t *mos = dmu_objset_pool(ra->os)->dp_meta_objset;
+ uint64_t dsobj = dmu_objset_id(ra->os);
+ uint64_t resume_obj, resume_off;
+
+ if (nvlist_lookup_uint64(begin_nvl,
+ "resume_object", &resume_obj) != 0 ||
+ nvlist_lookup_uint64(begin_nvl,
+ "resume_offset", &resume_off) != 0) {
+ return (SET_ERROR(EINVAL));
+ }
+ VERIFY0(zap_lookup(mos, dsobj,
+ DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val));
+ if (resume_obj != val)
+ return (SET_ERROR(EINVAL));
+ VERIFY0(zap_lookup(mos, dsobj,
+ DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val));
+ if (resume_off != val)
+ return (SET_ERROR(EINVAL));
+
+ return (0);
+}
+
+/*
+ * Read in the stream's records, one by one, and apply them to the pool. There
+ * are two threads involved; the thread that calls this function will spin up a
+ * worker thread, read the records off the stream one by one, and issue
+ * prefetches for any necessary indirect blocks. It will then push the records
+ * onto an internal blocking queue. The worker thread will pull the records off
+ * the queue, and actually write the data into the DMU. This way, the worker
+ * thread doesn't have to wait for reads to complete, since everything it needs
+ * (the indirect blocks) will be prefetched.
+ *
* NB: callers *must* call dmu_recv_end() if this succeeds.
*/
int
@@ -1429,11 +2818,11 @@
dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp,
int cleanup_fd, uint64_t *action_handlep)
{
- struct restorearg ra = { 0 };
- dmu_replay_record_t *drr;
- objset_t *os;
- zio_cksum_t pcksum;
+ int err = 0;
+ struct receive_arg ra = { 0 };
+ struct receive_writer_arg rwa = { 0 };
int featureflags;
+ nvlist_t *begin_nvl = NULL;
ra.byteswap = drc->drc_byteswap;
ra.cksum = drc->drc_cksum;
@@ -1440,9 +2829,15 @@
ra.td = curthread;
ra.fp = fp;
ra.voff = *voffp;
- ra.bufsize = 1<<20;
- ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP);
+ if (dsl_dataset_is_zapified(drc->drc_ds)) {
+ (void) zap_lookup(drc->drc_ds->ds_dir->dd_pool->dp_meta_objset,
+ drc->drc_ds->ds_object, DS_FIELD_RESUME_BYTES,
+ sizeof (ra.bytes_read), 1, &ra.bytes_read);
+ }
+
+ objlist_create(&ra.ignore_objlist);
+
/* these were verified in dmu_recv_begin */
ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==,
DMU_SUBSTREAM);
@@ -1451,9 +2846,9 @@
/*
* Open the objset we are modifying.
*/
- VERIFY0(dmu_objset_from_ds(drc->drc_ds, &os));
+ VERIFY0(dmu_objset_from_ds(drc->drc_ds, &ra.os));
- ASSERT(drc->drc_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT);
+ ASSERT(dsl_dataset_phys(drc->drc_ds)->ds_flags & DS_FLAG_INCONSISTENT);
featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo);
@@ -1472,119 +2867,131 @@
}
if (*action_handlep == 0) {
- ra.guid_to_ds_map =
+ rwa.guid_to_ds_map =
kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
- avl_create(ra.guid_to_ds_map, guid_compare,
+ avl_create(rwa.guid_to_ds_map, guid_compare,
sizeof (guid_map_entry_t),
offsetof(guid_map_entry_t, avlnode));
- ra.err = zfs_onexit_add_cb(minor,
- free_guid_map_onexit, ra.guid_to_ds_map,
+ err = zfs_onexit_add_cb(minor,
+ free_guid_map_onexit, rwa.guid_to_ds_map,
action_handlep);
if (ra.err != 0)
goto out;
} else {
- ra.err = zfs_onexit_cb_data(minor, *action_handlep,
- (void **)&ra.guid_to_ds_map);
+ err = zfs_onexit_cb_data(minor, *action_handlep,
+ (void **)&rwa.guid_to_ds_map);
if (ra.err != 0)
goto out;
}
- drc->drc_guid_to_ds_map = ra.guid_to_ds_map;
+ drc->drc_guid_to_ds_map = rwa.guid_to_ds_map;
}
+ uint32_t payloadlen = drc->drc_drr_begin->drr_payloadlen;
+ void *payload = NULL;
+ if (payloadlen != 0)
+ payload = kmem_alloc(payloadlen, KM_SLEEP);
+
+ err = receive_read_payload_and_next_header(&ra, payloadlen, payload);
+ if (err != 0) {
+ if (payloadlen != 0)
+ kmem_free(payload, payloadlen);
+ goto out;
+ }
+ if (payloadlen != 0) {
+ err = nvlist_unpack(payload, payloadlen, &begin_nvl, KM_SLEEP);
+ kmem_free(payload, payloadlen);
+ if (err != 0)
+ goto out;
+ }
+
+ if (featureflags & DMU_BACKUP_FEATURE_RESUMING) {
+ err = resume_check(&ra, begin_nvl);
+ if (err != 0)
+ goto out;
+ }
+
+ (void) bqueue_init(&rwa.q, zfs_recv_queue_length,
+ offsetof(struct receive_record_arg, node));
+ cv_init(&rwa.cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&rwa.mutex, NULL, MUTEX_DEFAULT, NULL);
+ rwa.os = ra.os;
+ rwa.byteswap = drc->drc_byteswap;
+ rwa.resumable = drc->drc_resumable;
+
+ (void) thread_create(NULL, 0, receive_writer_thread, &rwa, 0, &p0,
+ TS_RUN, minclsyspri);
/*
- * Read records and process them.
+ * We're reading rwa.err without locks, which is safe since we are the
+ * only reader, and the worker thread is the only writer. It's ok if we
+ * miss a write for an iteration or two of the loop, since the writer
+ * thread will keep freeing records we send it until we send it an eos
+ * marker.
+ *
+ * We can leave this loop in 3 ways: First, if rwa.err is
+ * non-zero. In that case, the writer thread will free the rrd we just
+ * pushed. Second, if we're interrupted; in that case, either it's the
+ * first loop and ra.rrd was never allocated, or it's later, and ra.rrd
+ * has been handed off to the writer thread who will free it. Finally,
+ * if receive_read_record fails or we're at the end of the stream, then
+ * we free ra.rrd and exit.
*/
- pcksum = ra.cksum;
- while (ra.err == 0 &&
- NULL != (drr = restore_read(&ra, sizeof (*drr)))) {
+ while (rwa.err == 0) {
if (issig(JUSTLOOKING) && issig(FORREAL)) {
- ra.err = SET_ERROR(EINTR);
- goto out;
+ err = SET_ERROR(EINTR);
+ break;
}
- if (ra.byteswap)
- backup_byteswap(drr);
+ ASSERT3P(ra.rrd, ==, NULL);
+ ra.rrd = ra.next_rrd;
+ ra.next_rrd = NULL;
+ /* Allocates and loads header into ra.next_rrd */
+ err = receive_read_record(&ra);
- switch (drr->drr_type) {
- case DRR_OBJECT:
- {
- /*
- * We need to make a copy of the record header,
- * because restore_{object,write} may need to
- * restore_read(), which will invalidate drr.
- */
- struct drr_object drro = drr->drr_u.drr_object;
- ra.err = restore_object(&ra, os, &drro);
+ if (ra.rrd->header.drr_type == DRR_END || err != 0) {
+ kmem_free(ra.rrd, sizeof (*ra.rrd));
+ ra.rrd = NULL;
break;
}
- case DRR_FREEOBJECTS:
- {
- struct drr_freeobjects drrfo =
- drr->drr_u.drr_freeobjects;
- ra.err = restore_freeobjects(&ra, os, &drrfo);
- break;
- }
- case DRR_WRITE:
- {
- struct drr_write drrw = drr->drr_u.drr_write;
- ra.err = restore_write(&ra, os, &drrw);
- break;
- }
- case DRR_WRITE_BYREF:
- {
- struct drr_write_byref drrwbr =
- drr->drr_u.drr_write_byref;
- ra.err = restore_write_byref(&ra, os, &drrwbr);
- break;
- }
- case DRR_FREE:
- {
- struct drr_free drrf = drr->drr_u.drr_free;
- ra.err = restore_free(&ra, os, &drrf);
- break;
- }
- case DRR_END:
- {
- struct drr_end drre = drr->drr_u.drr_end;
- /*
- * We compare against the *previous* checksum
- * value, because the stored checksum is of
- * everything before the DRR_END record.
- */
- if (!ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pcksum))
- ra.err = SET_ERROR(ECKSUM);
- goto out;
- }
- case DRR_SPILL:
- {
- struct drr_spill drrs = drr->drr_u.drr_spill;
- ra.err = restore_spill(&ra, os, &drrs);
- break;
- }
- default:
- ra.err = SET_ERROR(EINVAL);
- goto out;
- }
- pcksum = ra.cksum;
+
+ bqueue_enqueue(&rwa.q, ra.rrd,
+ sizeof (struct receive_record_arg) + ra.rrd->payload_size);
+ ra.rrd = NULL;
}
- ASSERT(ra.err != 0);
+ if (ra.next_rrd == NULL)
+ ra.next_rrd = kmem_zalloc(sizeof (*ra.next_rrd), KM_SLEEP);
+ ra.next_rrd->eos_marker = B_TRUE;
+ bqueue_enqueue(&rwa.q, ra.next_rrd, 1);
+ mutex_enter(&rwa.mutex);
+ while (!rwa.done) {
+ cv_wait(&rwa.cv, &rwa.mutex);
+ }
+ mutex_exit(&rwa.mutex);
+
+ cv_destroy(&rwa.cv);
+ mutex_destroy(&rwa.mutex);
+ bqueue_destroy(&rwa.q);
+ if (err == 0)
+ err = rwa.err;
+
out:
+ nvlist_free(begin_nvl);
if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1))
zfs_onexit_fd_rele(cleanup_fd);
- if (ra.err != 0) {
+ if (err != 0) {
/*
- * destroy what we created, so we don't leave it in the
- * inconsistent restoring state.
+ * Clean up references. If receive is not resumable,
+ * destroy what we created, so we don't leave it in
+ * the inconsistent state.
*/
dmu_recv_cleanup_ds(drc);
}
- kmem_free(ra.buf, ra.bufsize);
*voffp = ra.voff;
- return (ra.err);
+ objlist_destroy(&ra.ignore_objlist);
+ return (err);
}
static int
@@ -1602,14 +3009,47 @@
error = dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head);
if (error != 0)
return (error);
+ if (drc->drc_force) {
+ /*
+ * We will destroy any snapshots in tofs (i.e. before
+ * origin_head) that are after the origin (which is
+ * the snap before drc_ds, because drc_ds can not
+ * have any snaps of its own).
+ */
+ uint64_t obj;
+
+ obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
+ while (obj !=
+ dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) {
+ dsl_dataset_t *snap;
+ error = dsl_dataset_hold_obj(dp, obj, FTAG,
+ &snap);
+ if (error != 0)
+ break;
+ if (snap->ds_dir != origin_head->ds_dir)
+ error = SET_ERROR(EINVAL);
+ if (error == 0) {
+ error = dsl_destroy_snapshot_check_impl(
+ snap, B_FALSE);
+ }
+ obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
+ dsl_dataset_rele(snap, FTAG);
+ if (error != 0)
+ break;
+ }
+ if (error != 0) {
+ dsl_dataset_rele(origin_head, FTAG);
+ return (error);
+ }
+ }
error = dsl_dataset_clone_swap_check_impl(drc->drc_ds,
- origin_head, drc->drc_force);
+ origin_head, drc->drc_force, drc->drc_owner, tx);
if (error != 0) {
dsl_dataset_rele(origin_head, FTAG);
return (error);
}
error = dsl_dataset_snapshot_check_impl(origin_head,
- drc->drc_tosnap, tx);
+ drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred);
dsl_dataset_rele(origin_head, FTAG);
if (error != 0)
return (error);
@@ -1617,7 +3057,7 @@
error = dsl_destroy_head_check_impl(drc->drc_ds, 1);
} else {
error = dsl_dataset_snapshot_check_impl(drc->drc_ds,
- drc->drc_tosnap, tx);
+ drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred);
}
return (error);
}
@@ -1636,6 +3076,30 @@
VERIFY0(dsl_dataset_hold(dp, drc->drc_tofs, FTAG,
&origin_head));
+
+ if (drc->drc_force) {
+ /*
+ * Destroy any snapshots of drc_tofs (origin_head)
+ * after the origin (the snap before drc_ds).
+ */
+ uint64_t obj;
+
+ obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
+ while (obj !=
+ dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) {
+ dsl_dataset_t *snap;
+ VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG,
+ &snap));
+ ASSERT3P(snap->ds_dir, ==, origin_head->ds_dir);
+ obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
+ dsl_destroy_snapshot_sync_impl(snap,
+ B_FALSE, tx);
+ dsl_dataset_rele(snap, FTAG);
+ }
+ }
+ VERIFY3P(drc->drc_ds->ds_prev, ==,
+ origin_head->ds_prev);
+
dsl_dataset_clone_swap_sync_impl(drc->drc_ds,
origin_head, tx);
dsl_dataset_snapshot_sync_impl(origin_head,
@@ -1643,18 +3107,25 @@
/* set snapshot's creation time and guid */
dmu_buf_will_dirty(origin_head->ds_prev->ds_dbuf, tx);
- origin_head->ds_prev->ds_phys->ds_creation_time =
+ dsl_dataset_phys(origin_head->ds_prev)->ds_creation_time =
drc->drc_drrb->drr_creation_time;
- origin_head->ds_prev->ds_phys->ds_guid =
+ dsl_dataset_phys(origin_head->ds_prev)->ds_guid =
drc->drc_drrb->drr_toguid;
- origin_head->ds_prev->ds_phys->ds_flags &=
+ dsl_dataset_phys(origin_head->ds_prev)->ds_flags &=
~DS_FLAG_INCONSISTENT;
dmu_buf_will_dirty(origin_head->ds_dbuf, tx);
- origin_head->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
+ dsl_dataset_phys(origin_head)->ds_flags &=
+ ~DS_FLAG_INCONSISTENT;
+ drc->drc_newsnapobj =
+ dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
+
dsl_dataset_rele(origin_head, FTAG);
dsl_destroy_head_sync_impl(drc->drc_ds, tx);
+
+ if (drc->drc_owner != NULL)
+ VERIFY3P(origin_head->ds_owner, ==, drc->drc_owner);
} else {
dsl_dataset_t *ds = drc->drc_ds;
@@ -1662,15 +3133,32 @@
/* set snapshot's creation time and guid */
dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
- ds->ds_prev->ds_phys->ds_creation_time =
+ dsl_dataset_phys(ds->ds_prev)->ds_creation_time =
drc->drc_drrb->drr_creation_time;
- ds->ds_prev->ds_phys->ds_guid = drc->drc_drrb->drr_toguid;
- ds->ds_prev->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
+ dsl_dataset_phys(ds->ds_prev)->ds_guid =
+ drc->drc_drrb->drr_toguid;
+ dsl_dataset_phys(ds->ds_prev)->ds_flags &=
+ ~DS_FLAG_INCONSISTENT;
dmu_buf_will_dirty(ds->ds_dbuf, tx);
- ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
+ dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT;
+ if (dsl_dataset_has_resume_receive_state(ds)) {
+ (void) zap_remove(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_FROMGUID, tx);
+ (void) zap_remove(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_OBJECT, tx);
+ (void) zap_remove(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_OFFSET, tx);
+ (void) zap_remove(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_BYTES, tx);
+ (void) zap_remove(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_TOGUID, tx);
+ (void) zap_remove(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_TONAME, tx);
+ }
+ drc->drc_newsnapobj =
+ dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj;
}
- drc->drc_newsnapobj = drc->drc_ds->ds_phys->ds_prev_snap_obj;
/*
* Release the hold from dmu_recv_begin. This must be done before
* we return to open context, so that when we free the dataset's dnode,
@@ -1696,7 +3184,7 @@
gmep = kmem_alloc(sizeof (*gmep), KM_SLEEP);
err = dsl_dataset_hold_obj(dp, snapobj, gmep, &snapds);
if (err == 0) {
- gmep->guid = snapds->ds_phys->ds_guid;
+ gmep->guid = dsl_dataset_phys(snapds)->ds_guid;
gmep->gme_ds = snapds;
avl_add(guid_map, gmep);
dsl_dataset_long_hold(snapds, gmep);
@@ -1712,36 +3200,41 @@
static int
dmu_recv_existing_end(dmu_recv_cookie_t *drc)
{
- int error;
- char name[MAXNAMELEN];
-
#ifdef _KERNEL
/*
* We will be destroying the ds; make sure its origin is unmounted if
* necessary.
*/
+ char name[ZFS_MAX_DATASET_NAME_LEN];
dsl_dataset_name(drc->drc_ds, name);
zfs_destroy_unmount_origin(name);
#endif
- error = dsl_sync_task(drc->drc_tofs,
+ return (dsl_sync_task(drc->drc_tofs,
dmu_recv_end_check, dmu_recv_end_sync, drc,
- dmu_recv_end_modified_blocks);
-
- if (error != 0)
- dmu_recv_cleanup_ds(drc);
- return (error);
+ dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL));
}
static int
dmu_recv_new_end(dmu_recv_cookie_t *drc)
{
+ return (dsl_sync_task(drc->drc_tofs,
+ dmu_recv_end_check, dmu_recv_end_sync, drc,
+ dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL));
+}
+
+int
+dmu_recv_end(dmu_recv_cookie_t *drc, void *owner)
+{
int error;
- error = dsl_sync_task(drc->drc_tofs,
- dmu_recv_end_check, dmu_recv_end_sync, drc,
- dmu_recv_end_modified_blocks);
+ drc->drc_owner = owner;
+ if (drc->drc_newfs)
+ error = dmu_recv_new_end(drc);
+ else
+ error = dmu_recv_existing_end(drc);
+
if (error != 0) {
dmu_recv_cleanup_ds(drc);
} else if (drc->drc_guid_to_ds_map != NULL) {
@@ -1752,11 +3245,12 @@
return (error);
}
-int
-dmu_recv_end(dmu_recv_cookie_t *drc)
+/*
+ * Return TRUE if this objset is currently being received into.
+ */
+boolean_t
+dmu_objset_is_receiving(objset_t *os)
{
- if (drc->drc_newfs)
- return (dmu_recv_new_end(drc));
- else
- return (dmu_recv_existing_end(drc));
+ return (os->os_dsl_dataset != NULL &&
+ os->os_dsl_dataset->ds_owner == dmu_recv_tag);
}
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,7 +21,8 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2015 Chunwei Chen. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -37,17 +38,25 @@
#include <sys/sa.h>
#include <sys/sa_impl.h>
#include <sys/callb.h>
+#include <sys/zfeature.h>
-int zfs_pd_blks_max = 100;
+int32_t zfs_pd_bytes_max = 50 * 1024 * 1024; /* 50MB */
+boolean_t send_holes_without_birth_time = B_TRUE;
+#ifdef _KERNEL
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_UINT(_vfs_zfs, OID_AUTO, send_holes_without_birth_time, CTLFLAG_RWTUN,
+ &send_holes_without_birth_time, 0, "Send holes without birth time");
+#endif
+
typedef struct prefetch_data {
kmutex_t pd_mtx;
kcondvar_t pd_cv;
- int pd_blks_max;
- int pd_blks_fetched;
+ int32_t pd_bytes_fetched;
int pd_flags;
boolean_t pd_cancel;
boolean_t pd_exited;
+ zbookmark_phys_t pd_resume;
} prefetch_data_t;
typedef struct traverse_data {
@@ -55,11 +64,14 @@
uint64_t td_objset;
blkptr_t *td_rootbp;
uint64_t td_min_txg;
- zbookmark_t *td_resume;
+ zbookmark_phys_t *td_resume;
int td_flags;
prefetch_data_t *td_pfd;
+ boolean_t td_paused;
+ uint64_t td_hole_birth_enabled_txg;
blkptr_cb_t *td_func;
void *td_arg;
+ boolean_t td_realloc_possible;
} traverse_data_t;
static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
@@ -71,9 +83,9 @@
traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
{
traverse_data_t *td = arg;
- zbookmark_t zb;
+ zbookmark_phys_t zb;
- if (bp->blk_birth == 0)
+ if (BP_IS_HOLE(bp))
return (0);
if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(td->td_spa))
@@ -95,9 +107,9 @@
if (lrc->lrc_txtype == TX_WRITE) {
lr_write_t *lr = (lr_write_t *)lrc;
blkptr_t *bp = &lr->lr_blkptr;
- zbookmark_t zb;
+ zbookmark_phys_t zb;
- if (bp->blk_birth == 0)
+ if (BP_IS_HOLE(bp))
return (0);
if (claim_txg == 0 || bp->blk_birth < claim_txg)
@@ -149,7 +161,7 @@
*/
static resume_skip_t
resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp,
- const zbookmark_t *zb)
+ const zbookmark_phys_t *zb)
{
if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume)) {
/*
@@ -156,7 +168,7 @@
* If we already visited this bp & everything below,
* don't bother doing it again.
*/
- if (zbookmark_is_before(dnp, zb, td->td_resume))
+ if (zbookmark_subtree_completed(dnp, zb, td->td_resume))
return (RESUME_SKIP_ALL);
/*
@@ -163,7 +175,6 @@
* If we found the block we're trying to resume from, zero
* the bookmark out to indicate that we have resumed.
*/
- ASSERT3U(zb->zb_object, <=, td->td_resume->zb_object);
if (bcmp(zb, td->td_resume, sizeof (*zb)) == 0) {
bzero(td->td_resume, sizeof (*zb));
if (td->td_flags & TRAVERSE_POST)
@@ -174,18 +185,10 @@
}
static void
-traverse_pause(traverse_data_t *td, const zbookmark_t *zb)
-{
- ASSERT(td->td_resume != NULL);
- ASSERT0(zb->zb_level);
- bcopy(zb, td->td_resume, sizeof (*td->td_resume));
-}
-
-static void
traverse_prefetch_metadata(traverse_data_t *td,
- const blkptr_t *bp, const zbookmark_t *zb)
+ const blkptr_t *bp, const zbookmark_phys_t *zb)
{
- uint32_t flags = ARC_NOWAIT | ARC_PREFETCH;
+ arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
if (!(td->td_flags & TRAVERSE_PREFETCH_METADATA))
return;
@@ -205,16 +208,25 @@
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
}
+static boolean_t
+prefetch_needed(prefetch_data_t *pfd, const blkptr_t *bp)
+{
+ ASSERT(pfd->pd_flags & TRAVERSE_PREFETCH_DATA);
+ if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) ||
+ BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG)
+ return (B_FALSE);
+ return (B_TRUE);
+}
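
The prefetch throttle in this file moves from a block count (zfs_pd_blks_max) to a byte budget (zfs_pd_bytes_max, 50MB here): the traversal waits until the prefetcher has covered at least one block's worth of bytes and then consumes that much credit, while the prefetcher side, not shown in this hunk, stops issuing once it is the full budget ahead. A toy single-threaded model of the accounting only; the real code pairs it with the cv_wait loops visible below, and the names and block size are illustrative.

#include <stdint.h>
#include <stdio.h>

#define PD_BYTES_MAX (50 * 1024 * 1024)     /* stands in for zfs_pd_bytes_max */

static int64_t pd_bytes_fetched;            /* bytes prefetched but not yet visited */

/* Prefetcher side: may only issue another read while under the byte budget. */
static int
prefetch_can_issue(uint64_t size)
{
    return (pd_bytes_fetched + (int64_t)size <= PD_BYTES_MAX);
}

/* Traversal side: consume credit for a block once it is actually visited. */
static void
traverse_consume(uint64_t size)
{
    pd_bytes_fetched -= (int64_t)size;
}

int
main(void)
{
    uint64_t blksz = 128 * 1024;            /* pretend every block is 128K */
    int issued = 0;

    while (prefetch_can_issue(blksz)) {     /* fill the read-ahead window */
        pd_bytes_fetched += blksz;
        issued++;
    }
    traverse_consume(blksz);                /* visiting one block frees room */
    printf("window: %d blocks, room for another prefetch: %d\n",
        issued, prefetch_can_issue(blksz));
    return (0);
}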
+
static int
traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
- const blkptr_t *bp, const zbookmark_t *zb)
+ const blkptr_t *bp, const zbookmark_phys_t *zb)
{
- zbookmark_t czb;
- int err = 0, lasterr = 0;
+ zbookmark_phys_t czb;
+ int err = 0;
arc_buf_t *buf = NULL;
prefetch_data_t *pd = td->td_pfd;
boolean_t hard = td->td_flags & TRAVERSE_HARD;
- boolean_t pause = B_FALSE;
switch (resume_skip_check(td, dnp, zb)) {
case RESUME_SKIP_ALL:
@@ -227,39 +239,67 @@
ASSERT(0);
}
- if (BP_IS_HOLE(bp)) {
- err = td->td_func(td->td_spa, NULL, NULL, zb, dnp, td->td_arg);
- return (err);
+ if (bp->blk_birth == 0) {
+ /*
+ * Since this block has a birth time of 0 it must be one of
+ * two things: a hole created before the
+ * SPA_FEATURE_HOLE_BIRTH feature was enabled, or a hole
+ * which has always been a hole in an object.
+ *
+ * If a file is written sparsely, then the unwritten parts of
+ * the file were "always holes" -- that is, they have been
+ * holes since this object was allocated. However, we (and
+ * our callers) can not necessarily tell when an object was
+ * allocated. Therefore, if it's possible that this object
+ * was freed and then its object number reused, we need to
+ * visit all the holes with birth==0.
+ *
+ * If it isn't possible that the object number was reused,
+ * then if SPA_FEATURE_HOLE_BIRTH was enabled before we wrote
+ * all the blocks we will visit as part of this traversal,
+ * then this hole must have always existed, so we can skip
+ * it. We visit blocks born after (exclusive) td_min_txg.
+ *
+ * Note that the meta-dnode cannot be reallocated.
+ */
+ if (!send_holes_without_birth_time &&
+ (!td->td_realloc_possible ||
+ zb->zb_object == DMU_META_DNODE_OBJECT) &&
+ td->td_hole_birth_enabled_txg <= td->td_min_txg)
+ return (0);
+ } else if (bp->blk_birth <= td->td_min_txg) {
+ return (0);
}
- if (bp->blk_birth <= td->td_min_txg)
- return (0);
-
- if (pd && !pd->pd_exited &&
- ((pd->pd_flags & TRAVERSE_PREFETCH_DATA) ||
- BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0)) {
+ if (pd != NULL && !pd->pd_exited && prefetch_needed(pd, bp)) {
+ uint64_t size = BP_GET_LSIZE(bp);
mutex_enter(&pd->pd_mtx);
- ASSERT(pd->pd_blks_fetched >= 0);
- while (pd->pd_blks_fetched == 0 && !pd->pd_exited)
+ ASSERT(pd->pd_bytes_fetched >= 0);
+ while (pd->pd_bytes_fetched < size && !pd->pd_exited)
cv_wait(&pd->pd_cv, &pd->pd_mtx);
- pd->pd_blks_fetched--;
+ pd->pd_bytes_fetched -= size;
cv_broadcast(&pd->pd_cv);
mutex_exit(&pd->pd_mtx);
}
+ if (BP_IS_HOLE(bp)) {
+ err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
+ if (err != 0)
+ goto post;
+ return (0);
+ }
+
if (td->td_flags & TRAVERSE_PRE) {
err = td->td_func(td->td_spa, NULL, bp, zb, dnp,
td->td_arg);
if (err == TRAVERSE_VISIT_NO_CHILDREN)
return (0);
- if (err == ERESTART)
- pause = B_TRUE; /* handle pausing at a common point */
if (err != 0)
goto post;
}
if (BP_GET_LEVEL(bp) > 0) {
- uint32_t flags = ARC_WAIT;
+ arc_flags_t flags = ARC_FLAG_WAIT;
int i;
blkptr_t *cbp;
int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
@@ -267,7 +307,7 @@
err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
if (err != 0)
- return (err);
+ goto post;
cbp = buf->b_data;
for (i = 0; i < epb; i++) {
@@ -283,14 +323,11 @@
zb->zb_level - 1,
zb->zb_blkid * epb + i);
err = traverse_visitbp(td, dnp, &cbp[i], &czb);
- if (err != 0) {
- if (!hard)
- break;
- lasterr = err;
- }
+ if (err != 0)
+ break;
}
} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
- uint32_t flags = ARC_WAIT;
+ arc_flags_t flags = ARC_FLAG_WAIT;
int i;
int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
@@ -297,84 +334,101 @@
err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
if (err != 0)
- return (err);
- dnp = buf->b_data;
+ goto post;
+ dnode_phys_t *child_dnp = buf->b_data;
for (i = 0; i < epb; i++) {
- prefetch_dnode_metadata(td, &dnp[i], zb->zb_objset,
- zb->zb_blkid * epb + i);
+ prefetch_dnode_metadata(td, &child_dnp[i],
+ zb->zb_objset, zb->zb_blkid * epb + i);
}
/* recursively visitbp() blocks below this */
for (i = 0; i < epb; i++) {
- err = traverse_dnode(td, &dnp[i], zb->zb_objset,
- zb->zb_blkid * epb + i);
- if (err != 0) {
- if (!hard)
- break;
- lasterr = err;
- }
+ err = traverse_dnode(td, &child_dnp[i],
+ zb->zb_objset, zb->zb_blkid * epb + i);
+ if (err != 0)
+ break;
}
} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
- uint32_t flags = ARC_WAIT;
- objset_phys_t *osp;
- dnode_phys_t *dnp;
+ arc_flags_t flags = ARC_FLAG_WAIT;
err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
if (err != 0)
- return (err);
+ goto post;
- osp = buf->b_data;
- dnp = &osp->os_meta_dnode;
- prefetch_dnode_metadata(td, dnp, zb->zb_objset,
+ objset_phys_t *osp = buf->b_data;
+ prefetch_dnode_metadata(td, &osp->os_meta_dnode, zb->zb_objset,
DMU_META_DNODE_OBJECT);
+ /*
+ * See the block comment above for the goal of this variable.
+ * If the maxblkid of the meta-dnode is 0, then we know that
+ * we've never had more than DNODES_PER_BLOCK objects in the
+ * dataset, which means we can't have reused any object ids.
+ */
+ if (osp->os_meta_dnode.dn_maxblkid == 0)
+ td->td_realloc_possible = B_FALSE;
+
if (arc_buf_size(buf) >= sizeof (objset_phys_t)) {
+ prefetch_dnode_metadata(td, &osp->os_groupused_dnode,
+ zb->zb_objset, DMU_GROUPUSED_OBJECT);
prefetch_dnode_metadata(td, &osp->os_userused_dnode,
zb->zb_objset, DMU_USERUSED_OBJECT);
- prefetch_dnode_metadata(td, &osp->os_groupused_dnode,
- zb->zb_objset, DMU_USERUSED_OBJECT);
}
- err = traverse_dnode(td, dnp, zb->zb_objset,
+ err = traverse_dnode(td, &osp->os_meta_dnode, zb->zb_objset,
DMU_META_DNODE_OBJECT);
- if (err && hard) {
- lasterr = err;
- err = 0;
- }
if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
- dnp = &osp->os_userused_dnode;
- err = traverse_dnode(td, dnp, zb->zb_objset,
- DMU_USERUSED_OBJECT);
+ err = traverse_dnode(td, &osp->os_groupused_dnode,
+ zb->zb_objset, DMU_GROUPUSED_OBJECT);
}
- if (err && hard) {
- lasterr = err;
- err = 0;
- }
if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
- dnp = &osp->os_groupused_dnode;
- err = traverse_dnode(td, dnp, zb->zb_objset,
- DMU_GROUPUSED_OBJECT);
+ err = traverse_dnode(td, &osp->os_userused_dnode,
+ zb->zb_objset, DMU_USERUSED_OBJECT);
}
}
if (buf)
- (void) arc_buf_remove_ref(buf, &buf);
+ arc_buf_destroy(buf, &buf);
post:
- if (err == 0 && lasterr == 0 && (td->td_flags & TRAVERSE_POST)) {
+ if (err == 0 && (td->td_flags & TRAVERSE_POST))
err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg);
- if (err == ERESTART)
- pause = B_TRUE;
+
+ if (hard && (err == EIO || err == ECKSUM)) {
+ /*
+ * Ignore this disk error as requested by the HARD flag,
+ * and continue traversal.
+ */
+ err = 0;
}
- if (pause && td->td_resume != NULL) {
- ASSERT3U(err, ==, ERESTART);
- ASSERT(!hard);
- traverse_pause(td, zb);
+ /*
+ * If we are stopping here, set td_resume.
+ */
+ if (td->td_resume != NULL && err != 0 && !td->td_paused) {
+ td->td_resume->zb_objset = zb->zb_objset;
+ td->td_resume->zb_object = zb->zb_object;
+ td->td_resume->zb_level = 0;
+ /*
+ * If we have stopped on an indirect block (e.g. due to
+ * i/o error), we have not visited anything below it.
+ * Set the bookmark to the first level-0 block that we need
+ * to visit. This way, the resuming code does not need to
+ * deal with resuming from indirect blocks.
+ *
+ * Note, if zb_level <= 0, dnp may be NULL, so we don't want
+ * to dereference it.
+ */
+ td->td_resume->zb_blkid = zb->zb_blkid;
+ if (zb->zb_level > 0) {
+ td->td_resume->zb_blkid <<= zb->zb_level *
+ (dnp->dn_indblkshift - SPA_BLKPTRSHIFT);
+ }
+ td->td_paused = B_TRUE;
}
- return (err != 0 ? err : lasterr);
+ return (err);
}
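The td_resume computation above collapses an indirect-block bookmark down to the first level-0 blkid it covers. A stand-alone sketch of that shift follows; the indirect block shift of 17 (128K indirects) and the 128-byte block pointer size are assumed values, so the numbers are illustrative only.

/*
 * Illustrative only: derive the first level-0 blkid covered by the
 * indirect block we stopped on, mirroring the td_resume math above.
 * Shift values are examples, not read from a real pool.
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	const int indblkshift = 17;	/* 128K indirect blocks (assumed) */
	const int blkptrshift = 7;	/* 128-byte block pointers */
	const int level = 2;		/* stopped on a level-2 indirect */
	uint64_t blkid = 3;		/* its blkid at that level */

	/* each level fans out by 2^(indblkshift - blkptrshift) pointers */
	blkid <<= level * (indblkshift - blkptrshift);

	printf("resume from level-0 blkid %llu\n",
	    (unsigned long long)blkid);
	return (0);
}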
static void
@@ -382,7 +436,7 @@
uint64_t objset, uint64_t object)
{
int j;
- zbookmark_t czb;
+ zbookmark_phys_t czb;
for (j = 0; j < dnp->dn_nblkptr; j++) {
SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
@@ -399,53 +453,70 @@
traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
uint64_t objset, uint64_t object)
{
- int j, err = 0, lasterr = 0;
- zbookmark_t czb;
- boolean_t hard = (td->td_flags & TRAVERSE_HARD);
+ int j, err = 0;
+ zbookmark_phys_t czb;
+ if (object != DMU_META_DNODE_OBJECT && td->td_resume != NULL &&
+ object < td->td_resume->zb_object)
+ return (0);
+
+ if (td->td_flags & TRAVERSE_PRE) {
+ SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL,
+ ZB_DNODE_BLKID);
+ err = td->td_func(td->td_spa, NULL, NULL, &czb, dnp,
+ td->td_arg);
+ if (err == TRAVERSE_VISIT_NO_CHILDREN)
+ return (0);
+ if (err != 0)
+ return (err);
+ }
+
for (j = 0; j < dnp->dn_nblkptr; j++) {
SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb);
- if (err != 0) {
- if (!hard)
- break;
- lasterr = err;
- }
+ if (err != 0)
+ break;
}
- if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+ if (err == 0 && (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID);
err = traverse_visitbp(td, dnp, &dnp->dn_spill, &czb);
- if (err != 0) {
- if (!hard)
- return (err);
- lasterr = err;
- }
}
- return (err != 0 ? err : lasterr);
+
+ if (err == 0 && (td->td_flags & TRAVERSE_POST)) {
+ SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL,
+ ZB_DNODE_BLKID);
+ err = td->td_func(td->td_spa, NULL, NULL, &czb, dnp,
+ td->td_arg);
+ if (err == TRAVERSE_VISIT_NO_CHILDREN)
+ return (0);
+ if (err != 0)
+ return (err);
+ }
+ return (err);
}
/* ARGSUSED */
static int
traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
- const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
+ const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
prefetch_data_t *pfd = arg;
- uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
+ arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
- ASSERT(pfd->pd_blks_fetched >= 0);
+ ASSERT(pfd->pd_bytes_fetched >= 0);
+ if (bp == NULL)
+ return (0);
if (pfd->pd_cancel)
return (SET_ERROR(EINTR));
- if (bp == NULL || !((pfd->pd_flags & TRAVERSE_PREFETCH_DATA) ||
- BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0) ||
- BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG)
+ if (!prefetch_needed(pfd, bp))
return (0);
mutex_enter(&pfd->pd_mtx);
- while (!pfd->pd_cancel && pfd->pd_blks_fetched >= pfd->pd_blks_max)
+ while (!pfd->pd_cancel && pfd->pd_bytes_fetched >= zfs_pd_bytes_max)
cv_wait(&pfd->pd_cv, &pfd->pd_mtx);
- pfd->pd_blks_fetched++;
+ pfd->pd_bytes_fetched += BP_GET_LSIZE(bp);
cv_broadcast(&pfd->pd_cv);
mutex_exit(&pfd->pd_mtx);
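The pd_bytes_fetched handshake above is a plain byte-budget producer/consumer: the prefetcher adds the logical size of each block it issues, while the traversal thread waits for and then subtracts the size of the block it is about to visit. A stand-alone sketch of the same pattern using POSIX threads, with made-up block and budget sizes standing in for BP_GET_LSIZE() and zfs_pd_bytes_max:

/* Stand-alone sketch of the pd_bytes_fetched handshake (POSIX threads). */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
static uint64_t bytes_fetched;			/* like pd_bytes_fetched */
static const uint64_t bytes_max = 1 << 20;	/* like zfs_pd_bytes_max */

static void *
prefetcher(void *arg)
{
	(void) arg;
	for (int i = 0; i < 64; i++) {
		uint64_t lsize = 128 * 1024;	/* pretend BP_GET_LSIZE(bp) */
		pthread_mutex_lock(&mtx);
		/* block while we are too far ahead of the consumer */
		while (bytes_fetched >= bytes_max)
			pthread_cond_wait(&cv, &mtx);
		bytes_fetched += lsize;
		pthread_cond_broadcast(&cv);
		pthread_mutex_unlock(&mtx);
	}
	return (NULL);
}

int
main(void)
{
	pthread_t tid;

	pthread_create(&tid, NULL, prefetcher, NULL);
	for (int i = 0; i < 64; i++) {
		uint64_t size = 128 * 1024;	/* block being visited */
		pthread_mutex_lock(&mtx);
		/* wait until the prefetcher has issued at least this block */
		while (bytes_fetched < size)
			pthread_cond_wait(&cv, &mtx);
		bytes_fetched -= size;
		pthread_cond_broadcast(&cv);
		pthread_mutex_unlock(&mtx);
	}
	pthread_join(tid, NULL);
	printf("done, budget drained to %llu\n",
	    (unsigned long long)bytes_fetched);
	return (0);
}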
@@ -460,11 +531,12 @@
{
traverse_data_t *td_main = arg;
traverse_data_t td = *td_main;
- zbookmark_t czb;
+ zbookmark_phys_t czb;
td.td_func = traverse_prefetcher;
td.td_arg = td_main->td_pfd;
td.td_pfd = NULL;
+ td.td_resume = &td_main->td_pfd->pd_resume;
SET_BOOKMARK(&czb, td.td_objset,
ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
@@ -482,23 +554,17 @@
*/
static int
traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
- uint64_t txg_start, zbookmark_t *resume, int flags,
+ uint64_t txg_start, zbookmark_phys_t *resume, int flags,
blkptr_cb_t func, void *arg)
{
traverse_data_t td;
prefetch_data_t pd = { 0 };
- zbookmark_t czb;
+ zbookmark_phys_t czb;
int err;
ASSERT(ds == NULL || objset == ds->ds_object);
ASSERT(!(flags & TRAVERSE_PRE) || !(flags & TRAVERSE_POST));
- /*
- * The data prefetching mechanism (the prefetch thread) is incompatible
- * with resuming from a bookmark.
- */
- ASSERT(resume == NULL || !(flags & TRAVERSE_PREFETCH_DATA));
-
td.td_spa = spa;
td.td_objset = objset;
td.td_rootbp = rootbp;
@@ -508,15 +574,25 @@
td.td_arg = arg;
td.td_pfd = &pd;
td.td_flags = flags;
+ td.td_paused = B_FALSE;
+ td.td_realloc_possible = (txg_start == 0 ? B_FALSE : B_TRUE);
- pd.pd_blks_max = zfs_pd_blks_max;
+ if (spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
+ VERIFY(spa_feature_enabled_txg(spa,
+ SPA_FEATURE_HOLE_BIRTH, &td.td_hole_birth_enabled_txg));
+ } else {
+ td.td_hole_birth_enabled_txg = UINT64_MAX;
+ }
+
pd.pd_flags = flags;
+ if (resume != NULL)
+ pd.pd_resume = *resume;
mutex_init(&pd.pd_mtx, NULL, MUTEX_DEFAULT, NULL);
cv_init(&pd.pd_cv, NULL, CV_DEFAULT, NULL);
/* See comment on ZIL traversal in dsl_scan_visitds. */
- if (ds != NULL && !dsl_dataset_is_snapshot(ds) && !BP_IS_HOLE(rootbp)) {
- uint32_t flags = ARC_WAIT;
+ if (ds != NULL && !ds->ds_is_snapshot && !BP_IS_HOLE(rootbp)) {
+ arc_flags_t flags = ARC_FLAG_WAIT;
objset_phys_t *osp;
arc_buf_t *buf;
@@ -528,7 +604,7 @@
osp = buf->b_data;
traverse_zil(&td, &osp->os_zil_header);
- (void) arc_buf_remove_ref(buf, &buf);
+ arc_buf_destroy(buf, &buf);
}
if (!(flags & TRAVERSE_PREFETCH_DATA) ||
@@ -558,16 +634,24 @@
* in syncing context).
*/
int
-traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags,
- blkptr_cb_t func, void *arg)
+traverse_dataset_resume(dsl_dataset_t *ds, uint64_t txg_start,
+ zbookmark_phys_t *resume,
+ int flags, blkptr_cb_t func, void *arg)
{
return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds, ds->ds_object,
- &ds->ds_phys->ds_bp, txg_start, NULL, flags, func, arg));
+ &dsl_dataset_phys(ds)->ds_bp, txg_start, resume, flags, func, arg));
}
int
+traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start,
+ int flags, blkptr_cb_t func, void *arg)
+{
+ return (traverse_dataset_resume(ds, txg_start, NULL, flags, func, arg));
+}
+
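As a usage sketch of the callback interface above (kernel-only types and the traverse_dataset() prototype shown in this diff, so it is illustrative rather than buildable on its own), a blkptr_cb_t can simply count level-0 blocks. Note that the dnode-level visits introduced in this change arrive with bp == NULL.

/* ARGSUSED */
static int
count_l0_blocks_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
	uint64_t *countp = arg;

	/* the dnode pre/post visits added in this change pass bp == NULL */
	if (bp == NULL || BP_IS_HOLE(bp))
		return (0);
	if (BP_GET_LEVEL(bp) == 0)
		(*countp)++;
	return (0);		/* any nonzero value stops the traversal */
}

static int
count_dataset_blocks(dsl_dataset_t *ds, uint64_t *countp)
{
	*countp = 0;
	return (traverse_dataset(ds, 0,
	    TRAVERSE_PRE | TRAVERSE_PREFETCH_DATA,
	    count_l0_blocks_cb, countp));
}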
+int
traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr,
- uint64_t txg_start, zbookmark_t *resume, int flags,
+ uint64_t txg_start, zbookmark_phys_t *resume, int flags,
blkptr_cb_t func, void *arg)
{
return (traverse_impl(spa, NULL, ZB_DESTROYED_OBJSET,
@@ -581,8 +665,7 @@
traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
blkptr_cb_t func, void *arg)
{
- int err, lasterr = 0;
- uint64_t obj;
+ int err;
dsl_pool_t *dp = spa_get_dsl(spa);
objset_t *mos = dp->dp_meta_objset;
boolean_t hard = (flags & TRAVERSE_HARD);
@@ -594,19 +677,18 @@
return (err);
/* visit each dataset */
- for (obj = 1; err == 0 || (err != ESRCH && hard);
- err = dmu_object_next(mos, &obj, FALSE, txg_start)) {
+ for (uint64_t obj = 1; err == 0;
+ err = dmu_object_next(mos, &obj, B_FALSE, txg_start)) {
dmu_object_info_t doi;
err = dmu_object_info(mos, obj, &doi);
if (err != 0) {
- if (!hard)
- return (err);
- lasterr = err;
- continue;
+ if (hard)
+ continue;
+ break;
}
- if (doi.doi_type == DMU_OT_DSL_DATASET) {
+ if (doi.doi_bonus_type == DMU_OT_DSL_DATASET) {
dsl_dataset_t *ds;
uint64_t txg = txg_start;
@@ -614,23 +696,19 @@
err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
dsl_pool_config_exit(dp, FTAG);
if (err != 0) {
- if (!hard)
- return (err);
- lasterr = err;
- continue;
+ if (hard)
+ continue;
+ break;
}
- if (ds->ds_phys->ds_prev_snap_txg > txg)
- txg = ds->ds_phys->ds_prev_snap_txg;
+ if (dsl_dataset_phys(ds)->ds_prev_snap_txg > txg)
+ txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
err = traverse_dataset(ds, txg, flags, func, arg);
dsl_dataset_rele(ds, FTAG);
- if (err != 0) {
- if (!hard)
- return (err);
- lasterr = err;
- }
+ if (err != 0)
+ break;
}
}
if (err == ESRCH)
err = 0;
- return (err != 0 ? err : lasterr);
+ return (err);
}
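To summarize the birth-time filter that traverse_visitbp() applies above, here is a stand-alone restatement of the decision; all inputs are invented, and in the kernel they come from the traverse_data_t fields and the send_holes_without_birth_time tunable.

/*
 * Stand-alone restatement of the blk_birth filter in traverse_visitbp().
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool
must_visit(uint64_t blk_birth, uint64_t min_txg, uint64_t hole_birth_txg,
    bool realloc_possible, bool is_meta_dnode, bool send_holes_anyway)
{
	if (blk_birth == 0) {
		/* birth==0 holes may be skipped only when provably old */
		if (!send_holes_anyway &&
		    (!realloc_possible || is_meta_dnode) &&
		    hole_birth_txg <= min_txg)
			return (false);
		return (true);
	}
	/* ordinary blocks: skip anything born at or before min_txg */
	return (blk_birth > min_txg);
}

int
main(void)
{
	/* hole_birth enabled after the incremental's start txg: must visit */
	printf("%d\n", must_visit(0, 50, 100, false, false, false));  /* 1 */
	/* hole_birth predates everything being sent: safe to skip */
	printf("%d\n", must_visit(0, 200, 100, false, false, false)); /* 0 */
	return (0);
}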
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -22,7 +22,8 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
*/
#include <sys/dmu.h>
@@ -55,6 +56,7 @@
offsetof(dmu_tx_hold_t, txh_node));
list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t),
offsetof(dmu_tx_callback_t, dcb_node));
+ tx->tx_start = gethrtime();
#ifdef ZFS_DEBUG
refcount_create(&tx->tx_space_written);
refcount_create(&tx->tx_space_freed);
@@ -128,6 +130,12 @@
txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP);
txh->txh_tx = tx;
txh->txh_dnode = dn;
+ refcount_create(&txh->txh_space_towrite);
+ refcount_create(&txh->txh_space_tofree);
+ refcount_create(&txh->txh_space_tooverwrite);
+ refcount_create(&txh->txh_space_tounref);
+ refcount_create(&txh->txh_memory_tohold);
+ refcount_create(&txh->txh_fudge);
#ifdef ZFS_DEBUG
txh->txh_type = type;
txh->txh_arg1 = arg1;
@@ -200,13 +208,19 @@
freeable = (bp && (freeable ||
dsl_dataset_block_freeable(ds, bp, bp->blk_birth)));
- if (freeable)
- txh->txh_space_tooverwrite += space;
- else
- txh->txh_space_towrite += space;
- if (bp)
- txh->txh_space_tounref += bp_get_dsize(os->os_spa, bp);
+ if (freeable) {
+ (void) refcount_add_many(&txh->txh_space_tooverwrite,
+ space, FTAG);
+ } else {
+ (void) refcount_add_many(&txh->txh_space_towrite,
+ space, FTAG);
+ }
+ if (bp) {
+ (void) refcount_add_many(&txh->txh_space_tounref,
+ bp_get_dsize(os->os_spa, bp), FTAG);
+ }
+
dmu_tx_count_twig(txh, dn, parent, level + 1,
blkid >> epbs, freeable, history);
}
@@ -224,7 +238,7 @@
return;
min_bs = SPA_MINBLOCKSHIFT;
- max_bs = SPA_MAXBLOCKSHIFT;
+ max_bs = highbit64(txh->txh_tx->tx_objset->os_recordsize) - 1;
min_ibs = DN_MIN_INDBLKSHIFT;
max_ibs = DN_MAX_INDBLKSHIFT;
@@ -293,6 +307,14 @@
*/
ASSERT(dn->dn_datablkshift != 0);
min_bs = max_bs = dn->dn_datablkshift;
+ } else {
+ /*
+ * The blocksize can increase up to the recordsize,
+ * or if it is already more than the recordsize,
+ * up to the next power of 2.
+ */
+ min_bs = highbit64(dn->dn_datablksz - 1);
+ max_bs = MAX(max_bs, highbit64(dn->dn_datablksz - 1));
}
/*
@@ -307,7 +329,8 @@
dmu_buf_impl_t *db;
rw_enter(&dn->dn_struct_rwlock, RW_READER);
- err = dbuf_hold_impl(dn, 0, start, FALSE, FTAG, &db);
+ err = dbuf_hold_impl(dn, 0, start,
+ FALSE, FALSE, FTAG, &db);
rw_exit(&dn->dn_struct_rwlock);
if (err) {
@@ -326,8 +349,11 @@
bits = 64 - min_bs;
epbs = min_ibs - SPA_BLKPTRSHIFT;
for (bits -= epbs * (nlvls - 1);
- bits >= 0; bits -= epbs)
- txh->txh_fudge += 1ULL << max_ibs;
+ bits >= 0; bits -= epbs) {
+ (void) refcount_add_many(
+ &txh->txh_fudge,
+ 1ULL << max_ibs, FTAG);
+ }
goto out;
}
off += delta;
@@ -343,7 +369,8 @@
*/
start = P2ALIGN(off, 1ULL << max_bs);
end = P2ROUNDUP(off + len, 1ULL << max_bs) - 1;
- txh->txh_space_towrite += end - start + 1;
+ (void) refcount_add_many(&txh->txh_space_towrite,
+ end - start + 1, FTAG);
start >>= min_bs;
end >>= min_bs;
@@ -358,18 +385,21 @@
start >>= epbs;
end >>= epbs;
ASSERT3U(end, >=, start);
- txh->txh_space_towrite += (end - start + 1) << max_ibs;
+ (void) refcount_add_many(&txh->txh_space_towrite,
+ (end - start + 1) << max_ibs, FTAG);
if (start != 0) {
/*
* We also need a new blkid=0 indirect block
* to reference any existing file data.
*/
- txh->txh_space_towrite += 1ULL << max_ibs;
+ (void) refcount_add_many(&txh->txh_space_towrite,
+ 1ULL << max_ibs, FTAG);
}
}
out:
- if (txh->txh_space_towrite + txh->txh_space_tooverwrite >
+ if (refcount_count(&txh->txh_space_towrite) +
+ refcount_count(&txh->txh_space_tooverwrite) >
2 * DMU_MAX_ACCESS)
err = SET_ERROR(EFBIG);
@@ -388,12 +418,15 @@
if (dn && dn->dn_dbuf->db_blkptr &&
dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
dn->dn_dbuf->db_blkptr, dn->dn_dbuf->db_blkptr->blk_birth)) {
- txh->txh_space_tooverwrite += space;
- txh->txh_space_tounref += space;
+ (void) refcount_add_many(&txh->txh_space_tooverwrite,
+ space, FTAG);
+ (void) refcount_add_many(&txh->txh_space_tounref, space, FTAG);
} else {
- txh->txh_space_towrite += space;
- if (dn && dn->dn_dbuf->db_blkptr)
- txh->txh_space_tounref += space;
+ (void) refcount_add_many(&txh->txh_space_towrite, space, FTAG);
+ if (dn && dn->dn_dbuf->db_blkptr) {
+ (void) refcount_add_many(&txh->txh_space_tounref,
+ space, FTAG);
+ }
}
}
@@ -449,12 +482,12 @@
blkid = off >> dn->dn_datablkshift;
nblks = (len + dn->dn_datablksz - 1) >> dn->dn_datablkshift;
- if (blkid >= dn->dn_maxblkid) {
+ if (blkid > dn->dn_maxblkid) {
rw_exit(&dn->dn_struct_rwlock);
return;
}
if (blkid + nblks > dn->dn_maxblkid)
- nblks = dn->dn_maxblkid - blkid;
+ nblks = dn->dn_maxblkid - blkid + 1;
}
l0span = nblks; /* save for later use to calc level > 1 overhead */
@@ -508,13 +541,15 @@
blkoff = P2PHASE(blkid, epb);
tochk = MIN(epb - blkoff, nblks);
- err = dbuf_hold_impl(dn, 1, blkid >> epbs, FALSE, FTAG, &dbuf);
+ err = dbuf_hold_impl(dn, 1, blkid >> epbs,
+ FALSE, FALSE, FTAG, &dbuf);
if (err) {
txh->txh_tx->tx_err = err;
break;
}
- txh->txh_memory_tohold += dbuf->db.db_size;
+ (void) refcount_add_many(&txh->txh_memory_tohold,
+ dbuf->db.db_size, FTAG);
/*
* We don't check memory_tohold against DMU_MAX_ACCESS because
@@ -567,8 +602,9 @@
(dn->dn_indblkshift - SPA_BLKPTRSHIFT);
while (level++ < maxlevel) {
- txh->txh_memory_tohold += MAX(MIN(blkcnt, nl1blks), 1)
- << dn->dn_indblkshift;
+ (void) refcount_add_many(&txh->txh_memory_tohold,
+ MAX(MIN(blkcnt, nl1blks), 1) << dn->dn_indblkshift,
+ FTAG);
blkcnt = 1 + (blkcnt >> epbs);
}
}
@@ -575,21 +611,51 @@
/* account for new level 1 indirect blocks that might show up */
if (skipped > 0) {
- txh->txh_fudge += skipped << dn->dn_indblkshift;
+ (void) refcount_add_many(&txh->txh_fudge,
+ skipped << dn->dn_indblkshift, FTAG);
skipped = MIN(skipped, DMU_MAX_DELETEBLKCNT >> epbs);
- txh->txh_memory_tohold += skipped << dn->dn_indblkshift;
+ (void) refcount_add_many(&txh->txh_memory_tohold,
+ skipped << dn->dn_indblkshift, FTAG);
}
- txh->txh_space_tofree += space;
- txh->txh_space_tounref += unref;
+ (void) refcount_add_many(&txh->txh_space_tofree, space, FTAG);
+ (void) refcount_add_many(&txh->txh_space_tounref, unref, FTAG);
}
+/*
+ * This function marks the transaction as being a "net free". The end
+ * result is that refquotas will be disabled for this transaction, and
+ * this transaction will be able to use half of the pool space overhead
+ * (see dsl_pool_adjustedsize()). Therefore this function should only
+ * be called for transactions that we expect will not cause a net increase
+ * in the amount of space used (but it's OK if that is occasionally not true).
+ */
void
+dmu_tx_mark_netfree(dmu_tx_t *tx)
+{
+ dmu_tx_hold_t *txh;
+
+ txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
+ DMU_NEW_OBJECT, THT_FREE, 0, 0);
+
+ /*
+ * Pretend that this operation will free 1GB of space. This
+ * should be large enough to cancel out the largest write.
+ * We don't want to use something like UINT64_MAX, because that would
+ * cause overflows when doing math with these values (e.g. in
+ * dmu_tx_try_assign()).
+ */
+ (void) refcount_add_many(&txh->txh_space_tofree,
+ 1024 * 1024 * 1024, FTAG);
+ (void) refcount_add_many(&txh->txh_space_tounref,
+ 1024 * 1024 * 1024, FTAG);
+}
+
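A hedged usage sketch for the new helper (kernel context assumed; the objset handle and object number are placeholders): a path that deletes an entire object can mark its transaction as a net free before assigning it, so refquota enforcement does not block the deletion.

/*
 * Sketch of how a free path might use dmu_tx_mark_netfree().
 */
static int
free_whole_object(objset_t *os, uint64_t object)
{
	dmu_tx_t *tx = dmu_tx_create(os);
	int err;

	dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
	dmu_tx_mark_netfree(tx);	/* net space change is negative */

	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_tx_abort(tx);
		return (err);
	}
	/* ... the actual range/object free would go here ... */
	dmu_tx_commit(tx);
	return (0);
}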
+void
dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
{
dmu_tx_hold_t *txh;
dnode_t *dn;
- uint64_t start, end, i;
- int err, shift;
+ int err;
zio_t *zio;
ASSERT(tx->tx_txg == 0);
@@ -599,14 +665,6 @@
if (txh == NULL)
return;
dn = txh->txh_dnode;
-
- /* first block */
- if (off != 0)
- dmu_tx_count_write(txh, off, 1);
- /* last block */
- if (len != DMU_OBJECT_END)
- dmu_tx_count_write(txh, off+len, 1);
-
dmu_tx_count_dnode(txh);
if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz)
@@ -614,24 +672,54 @@
if (len == DMU_OBJECT_END)
len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off;
+
/*
- * For i/o error checking, read the first and last level-0
- * blocks, and all the level-1 blocks. The above count_write's
- * have already taken care of the level-0 blocks.
+ * For i/o error checking, we read the first and last level-0
+ * blocks if they are not aligned, and all the level-1 blocks.
+ *
+ * Note: dbuf_free_range() assumes that we have not instantiated
+ * any level-0 dbufs that will be completely freed. Therefore we must
+ * exercise care to not read or count the first and last blocks
+ * if they are blocksize-aligned.
*/
+ if (dn->dn_datablkshift == 0) {
+ if (off != 0 || len < dn->dn_datablksz)
+ dmu_tx_count_write(txh, 0, dn->dn_datablksz);
+ } else {
+ /* first block will be modified if it is not aligned */
+ if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift))
+ dmu_tx_count_write(txh, off, 1);
+ /* last block will be modified if it is not aligned */
+ if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift))
+ dmu_tx_count_write(txh, off+len, 1);
+ }
+
+ /*
+ * Check level-1 blocks.
+ */
if (dn->dn_nlevels > 1) {
- shift = dn->dn_datablkshift + dn->dn_indblkshift -
+ int shift = dn->dn_datablkshift + dn->dn_indblkshift -
SPA_BLKPTRSHIFT;
- start = off >> shift;
- end = dn->dn_datablkshift ? ((off+len) >> shift) : 0;
+ uint64_t start = off >> shift;
+ uint64_t end = (off + len) >> shift;
+ ASSERT(dn->dn_indblkshift != 0);
+
+ /*
+ * dnode_reallocate() can result in an object with indirect
+ * blocks having an odd data block size. In this case,
+ * just check the single block.
+ */
+ if (dn->dn_datablkshift == 0)
+ start = end = 0;
+
zio = zio_root(tx->tx_pool->dp_spa,
NULL, NULL, ZIO_FLAG_CANFAIL);
- for (i = start; i <= end; i++) {
+ for (uint64_t i = start; i <= end; i++) {
uint64_t ibyte = i << shift;
err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0);
i = ibyte >> shift;
- if (err == ESRCH)
+ if (err == ESRCH || i > end)
break;
if (err) {
tx->tx_err = err;
@@ -659,8 +747,7 @@
{
dmu_tx_hold_t *txh;
dnode_t *dn;
- uint64_t nblocks;
- int epbs, err;
+ int err;
ASSERT(tx->tx_txg == 0);
@@ -703,12 +790,17 @@
*/
bp = &dn->dn_phys->dn_blkptr[0];
if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
- bp, bp->blk_birth))
- txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
- else
- txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
- if (!BP_IS_HOLE(bp))
- txh->txh_space_tounref += SPA_MAXBLOCKSIZE;
+ bp, bp->blk_birth)) {
+ (void) refcount_add_many(&txh->txh_space_tooverwrite,
+ MZAP_MAX_BLKSZ, FTAG);
+ } else {
+ (void) refcount_add_many(&txh->txh_space_towrite,
+ MZAP_MAX_BLKSZ, FTAG);
+ }
+ if (!BP_IS_HOLE(bp)) {
+ (void) refcount_add_many(&txh->txh_space_tounref,
+ MZAP_MAX_BLKSZ, FTAG);
+ }
return;
}
@@ -717,8 +809,7 @@
* access the name in this fat-zap so that we'll check
* for i/o errors to the leaf blocks, etc.
*/
- err = zap_lookup(dn->dn_objset, dn->dn_object, name,
- 8, 0, NULL);
+ err = zap_lookup_by_dnode(dn, name, 8, 0, NULL);
if (err == EIO) {
tx->tx_err = err;
return;
@@ -725,19 +816,34 @@
}
}
- err = zap_count_write(dn->dn_objset, dn->dn_object, name, add,
+ err = zap_count_write_by_dnode(dn, name, add,
&txh->txh_space_towrite, &txh->txh_space_tooverwrite);
/*
* If the modified blocks are scattered to the four winds,
- * we'll have to modify an indirect twig for each.
+ * we'll have to modify an indirect twig for each. We can make
+ * modifications at up to 3 locations:
+ * - header block at the beginning of the object
+ * - target leaf block
+ * - end of the object, where we might need to write:
+ * - a new leaf block if the target block needs to be split
+ * - the new pointer table, if it is growing
+ * - the new cookie table, if it is growing
*/
- epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
- for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs)
- if (dn->dn_objset->os_dsl_dataset->ds_phys->ds_prev_snap_obj)
- txh->txh_space_towrite += 3 << dn->dn_indblkshift;
- else
- txh->txh_space_tooverwrite += 3 << dn->dn_indblkshift;
+ int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ dsl_dataset_phys_t *ds_phys =
+ dsl_dataset_phys(dn->dn_objset->os_dsl_dataset);
+ for (int lvl = 1; lvl < dn->dn_nlevels; lvl++) {
+ uint64_t num_indirects = 1 + (dn->dn_maxblkid >> (epbs * lvl));
+ uint64_t spc = MIN(3, num_indirects) << dn->dn_indblkshift;
+ if (ds_phys->ds_prev_snap_obj != 0) {
+ (void) refcount_add_many(&txh->txh_space_towrite,
+ spc, FTAG);
+ } else {
+ (void) refcount_add_many(&txh->txh_space_tooverwrite,
+ spc, FTAG);
+ }
+ }
}
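To make the per-level charge above concrete, this stand-alone calculation replays the loop for a hypothetical fat zap; the maxblkid, indirect block shift, and level count are invented.

/*
 * Stand-alone restatement of the fat-zap indirect accounting loop above.
 */
#include <stdint.h>
#include <stdio.h>

#define	SPA_BLKPTRSHIFT	7	/* 128-byte block pointers */
#define	MIN(a, b)	((a) < (b) ? (a) : (b))

int
main(void)
{
	const int indblkshift = 14;	/* 16K indirect blocks (example) */
	const int nlevels = 3;
	const uint64_t maxblkid = 5000;	/* example zap with 5001 leaf blocks */
	const int epbs = indblkshift - SPA_BLKPTRSHIFT;
	uint64_t total = 0;

	for (int lvl = 1; lvl < nlevels; lvl++) {
		uint64_t num_indirects = 1 + (maxblkid >> (epbs * lvl));
		uint64_t spc = MIN(3, num_indirects) << indblkshift;
		total += spc;
		printf("level %d: charge %llu bytes\n", lvl,
		    (unsigned long long)spc);
	}
	printf("total indirect charge: %llu bytes\n",
	    (unsigned long long)total);
	return (0);
}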
void
@@ -762,7 +868,7 @@
txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
DMU_NEW_OBJECT, THT_SPACE, space, 0);
- txh->txh_space_towrite += space;
+ (void) refcount_add_many(&txh->txh_space_towrite, space, FTAG);
}
int
@@ -898,8 +1004,163 @@
}
#endif
+/*
+ * If we can't do 10 iops, something is wrong. Let us go ahead
+ * and hit zfs_dirty_data_max.
+ */
+hrtime_t zfs_delay_max_ns = MSEC2NSEC(100);
+int zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */
+
+/*
+ * We delay transactions when we've determined that the backend storage
+ * isn't able to accommodate the rate of incoming writes.
+ *
+ * If there is already a transaction waiting, we delay relative to when
+ * that transaction finishes waiting. This way the calculated min_time
+ * is independent of the number of threads concurrently executing
+ * transactions.
+ *
+ * If we are the only waiter, wait relative to when the transaction
+ * started, rather than the current time. This credits the transaction for
+ * "time already served", e.g. reading indirect blocks.
+ *
+ * The minimum time for a transaction to take is calculated as:
+ * min_time = scale * (dirty - min) / (max - dirty)
+ * min_time is then capped at zfs_delay_max_ns.
+ *
+ * The delay has two degrees of freedom that can be adjusted via tunables.
+ * The percentage of dirty data at which we start to delay is defined by
+ * zfs_delay_min_dirty_percent. This should typically be at or above
+ * zfs_vdev_async_write_active_max_dirty_percent so that we only start to
+ * delay after writing at full speed has failed to keep up with the incoming
+ * write rate. The scale of the curve is defined by zfs_delay_scale. Roughly
+ * speaking, this variable determines the amount of delay at the midpoint of
+ * the curve.
+ *
+ * delay
+ * 10ms +-------------------------------------------------------------*+
+ * | *|
+ * 9ms + *+
+ * | *|
+ * 8ms + *+
+ * | * |
+ * 7ms + * +
+ * | * |
+ * 6ms + * +
+ * | * |
+ * 5ms + * +
+ * | * |
+ * 4ms + * +
+ * | * |
+ * 3ms + * +
+ * | * |
+ * 2ms + (midpoint) * +
+ * | | ** |
+ * 1ms + v *** +
+ * | zfs_delay_scale ----------> ******** |
+ * 0 +-------------------------------------*********----------------+
+ * 0% <- zfs_dirty_data_max -> 100%
+ *
+ * Note that since the delay is added to the outstanding time remaining on the
+ * most recent transaction, the delay is effectively the inverse of IOPS.
+ * Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve
+ * was chosen such that small changes in the amount of accumulated dirty data
+ * in the first 3/4 of the curve yield relatively small differences in the
+ * amount of delay.
+ *
+ * The effects can be easier to understand when the amount of delay is
+ * represented on a log scale:
+ *
+ * delay
+ * 100ms +-------------------------------------------------------------++
+ * + +
+ * | |
+ * + *+
+ * 10ms + *+
+ * + ** +
+ * | (midpoint) ** |
+ * + | ** +
+ * 1ms + v **** +
+ * + zfs_delay_scale ----------> ***** +
+ * | **** |
+ * + **** +
+ * 100us + ** +
+ * + * +
+ * | * |
+ * + * +
+ * 10us + * +
+ * + +
+ * | |
+ * + +
+ * +--------------------------------------------------------------+
+ * 0% <- zfs_dirty_data_max -> 100%
+ *
+ * Note here that only as the amount of dirty data approaches its limit does
+ * the delay start to increase rapidly. The goal of a properly tuned system
+ * should be to keep the amount of dirty data out of that range by first
+ * ensuring that the appropriate limits are set for the I/O scheduler to reach
+ * optimal throughput on the backend storage, and then by changing the value
+ * of zfs_delay_scale to increase the steepness of the curve.
+ */
+static void
+dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
+{
+ dsl_pool_t *dp = tx->tx_pool;
+ uint64_t delay_min_bytes =
+ zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
+ hrtime_t wakeup, min_tx_time, now;
+
+ if (dirty <= delay_min_bytes)
+ return;
+
+ /*
+ * The caller has already waited until we are under the max.
+ * We make them pass us the amount of dirty data so we don't
+ * have to handle the case of it being >= the max, which could
+ * cause a divide-by-zero if it's == the max.
+ */
+ ASSERT3U(dirty, <, zfs_dirty_data_max);
+
+ now = gethrtime();
+ min_tx_time = zfs_delay_scale *
+ (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
+ if (now > tx->tx_start + min_tx_time)
+ return;
+
+ min_tx_time = MIN(min_tx_time, zfs_delay_max_ns);
+
+ DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty,
+ uint64_t, min_tx_time);
+
+ mutex_enter(&dp->dp_lock);
+ wakeup = MAX(tx->tx_start + min_tx_time,
+ dp->dp_last_wakeup + min_tx_time);
+ dp->dp_last_wakeup = wakeup;
+ mutex_exit(&dp->dp_lock);
+
+#ifdef _KERNEL
+#ifdef illumos
+ mutex_enter(&curthread->t_delay_lock);
+ while (cv_timedwait_hires(&curthread->t_delay_cv,
+ &curthread->t_delay_lock, wakeup, zfs_delay_resolution_ns,
+ CALLOUT_FLAG_ABSOLUTE | CALLOUT_FLAG_ROUNDUP) > 0)
+ continue;
+ mutex_exit(&curthread->t_delay_lock);
+#else
+ pause_sbt("dmu_tx_delay", wakeup * SBT_1NS,
+ zfs_delay_resolution_ns * SBT_1NS, C_ABSOLUTE);
+#endif
+#else
+ hrtime_t delta = wakeup - gethrtime();
+ struct timespec ts;
+ ts.tv_sec = delta / NANOSEC;
+ ts.tv_nsec = delta % NANOSEC;
+ (void) nanosleep(&ts, NULL);
+#endif
+}
+
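The curve above boils down to min_time = scale * (dirty - min) / (max - dirty), capped at zfs_delay_max_ns. The stand-alone program below evaluates it with assumed tunable values (4GB zfs_dirty_data_max, 60% zfs_delay_min_dirty_percent, 500us zfs_delay_scale; check your tree for the real defaults) and reproduces the 500us midpoint mentioned in the comment.

/*
 * Stand-alone evaluation of the write-throttle delay formula.  The
 * tunable values below are assumptions, not read from a live system.
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	const uint64_t dirty_max = 4ULL << 30;		/* zfs_dirty_data_max */
	const uint64_t min_percent = 60;	/* zfs_delay_min_dirty_percent */
	const uint64_t scale = 500 * 1000;	/* zfs_delay_scale, in ns */
	const uint64_t delay_max_ns = 100 * 1000 * 1000; /* zfs_delay_max_ns */
	const uint64_t delay_min_bytes = dirty_max * min_percent / 100;

	for (int pct = 60; pct < 100; pct += 10) {
		uint64_t dirty = dirty_max * pct / 100;

		if (dirty <= delay_min_bytes) {
			printf("%3d%% dirty: no delay\n", pct);
			continue;
		}
		uint64_t ns = scale * (dirty - delay_min_bytes) /
		    (dirty_max - dirty);
		if (ns > delay_max_ns)
			ns = delay_max_ns;
		/* at 80% dirty this prints 500 us, the documented midpoint */
		printf("%3d%% dirty: %llu us per tx\n", pct,
		    (unsigned long long)(ns / 1000));
	}
	return (0);
}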
static int
-dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how)
+dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
{
dmu_tx_hold_t *txh;
spa_t *spa = tx->tx_pool->dp_spa;
@@ -922,12 +1183,18 @@
* of the failuremode setting.
*/
if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
- txg_how != TXG_WAIT)
+ !(txg_how & TXG_WAIT))
return (SET_ERROR(EIO));
return (SET_ERROR(ERESTART));
}
+ if (!tx->tx_dirty_delayed &&
+ dsl_pool_need_dirty_delay(tx->tx_pool)) {
+ tx->tx_wait_dirty = B_TRUE;
+ return (SET_ERROR(ERESTART));
+ }
+
tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
tx->tx_needassign_txh = NULL;
@@ -954,12 +1221,12 @@
(void) refcount_add(&dn->dn_tx_holds, tx);
mutex_exit(&dn->dn_mtx);
}
- towrite += txh->txh_space_towrite;
- tofree += txh->txh_space_tofree;
- tooverwrite += txh->txh_space_tooverwrite;
- tounref += txh->txh_space_tounref;
- tohold += txh->txh_memory_tohold;
- fudge += txh->txh_fudge;
+ towrite += refcount_count(&txh->txh_space_towrite);
+ tofree += refcount_count(&txh->txh_space_tofree);
+ tooverwrite += refcount_count(&txh->txh_space_tooverwrite);
+ tounref += refcount_count(&txh->txh_space_tounref);
+ tohold += refcount_count(&txh->txh_memory_tohold);
+ fudge += refcount_count(&txh->txh_fudge);
}
/*
@@ -1042,33 +1309,44 @@
}
/*
- * Assign tx to a transaction group. txg_how can be one of:
+ * Assign tx to a transaction group; txg_how is a bitmask:
*
- * (1) TXG_WAIT. If the current open txg is full, waits until there's
- * a new one. This should be used when you're not holding locks.
- * It will only fail if we're truly out of space (or over quota).
+ * If TXG_WAIT is set and the currently open txg is full, this function
+ * will wait until there's a new txg. This should be used when no locks
+ * are being held. With this bit set, this function will only fail if
+ * we're truly out of space (or over quota).
*
- * (2) TXG_NOWAIT. If we can't assign into the current open txg without
- * blocking, returns immediately with ERESTART. This should be used
- * whenever you're holding locks. On an ERESTART error, the caller
- * should drop locks, do a dmu_tx_wait(tx), and try again.
+ * If TXG_WAIT is *not* set and we can't assign into the currently open
+ * txg without blocking, this function will return immediately with
+ * ERESTART. This should be used whenever locks are being held. On an
+ * ERESTART error, the caller should drop all locks, call dmu_tx_wait(),
+ * and try again.
+ *
+ * If TXG_NOTHROTTLE is set, this indicates that this tx should not be
+ * delayed due to the ZFS Write Throttle (see comments in dsl_pool.c for
+ * details on the throttle). This is used by the VFS operations, after
+ * they have already called dmu_tx_wait() (though most likely on a
+ * different tx).
*/
int
-dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how)
+dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how)
{
int err;
ASSERT(tx->tx_txg == 0);
- ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT);
+ ASSERT0(txg_how & ~(TXG_WAIT | TXG_NOTHROTTLE));
ASSERT(!dsl_pool_sync_context(tx->tx_pool));
/* If we might wait, we must not hold the config lock. */
- ASSERT(txg_how != TXG_WAIT || !dsl_pool_config_held(tx->tx_pool));
+ IMPLY((txg_how & TXG_WAIT), !dsl_pool_config_held(tx->tx_pool));
+ if ((txg_how & TXG_NOTHROTTLE))
+ tx->tx_dirty_delayed = B_TRUE;
+
while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
dmu_tx_unassign(tx);
- if (err != ERESTART || txg_how != TXG_WAIT)
+ if (err != ERESTART || !(txg_how & TXG_WAIT))
return (err);
dmu_tx_wait(tx);
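The TXG_WAIT/ERESTART contract documented above corresponds to the usual caller idiom sketched below (kernel context assumed; dmu_tx_hold_write() and the lock handling are placeholders for whatever the caller actually does, and the TXG_NOTHROTTLE retry mirrors the comment rather than any one in-tree caller).

/*
 * Sketch of the ERESTART retry idiom for lock-holding callers.
 */
static int
write_with_retry(objset_t *os, uint64_t object, uint64_t offset, int length)
{
	uint64_t txg_how = TXG_NOWAIT;
	dmu_tx_t *tx;
	int err;
top:
	tx = dmu_tx_create(os);
	dmu_tx_hold_write(tx, object, offset, length);
	err = dmu_tx_assign(tx, txg_how);
	if (err == ERESTART) {
		/* drop any locks held by the caller before waiting */
		dmu_tx_wait(tx);
		dmu_tx_abort(tx);
		/* the delay was already paid in dmu_tx_wait(); skip it now */
		txg_how |= TXG_NOTHROTTLE;
		goto top;
	}
	if (err != 0) {
		dmu_tx_abort(tx);
		return (err);
	}
	/* ... perform the write under this assigned tx ... */
	dmu_tx_commit(tx);
	return (0);
}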
@@ -1083,18 +1361,48 @@
dmu_tx_wait(dmu_tx_t *tx)
{
spa_t *spa = tx->tx_pool->dp_spa;
+ dsl_pool_t *dp = tx->tx_pool;
ASSERT(tx->tx_txg == 0);
ASSERT(!dsl_pool_config_held(tx->tx_pool));
- /*
- * It's possible that the pool has become active after this thread
- * has tried to obtain a tx. If that's the case then his
- * tx_lasttried_txg would not have been assigned.
- */
- if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
- txg_wait_synced(tx->tx_pool, spa_last_synced_txg(spa) + 1);
+ if (tx->tx_wait_dirty) {
+ /*
+ * dmu_tx_try_assign() has determined that we need to wait
+ * because we've consumed much or all of the dirty buffer
+ * space.
+ */
+ mutex_enter(&dp->dp_lock);
+ while (dp->dp_dirty_total >= zfs_dirty_data_max)
+ cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock);
+ uint64_t dirty = dp->dp_dirty_total;
+ mutex_exit(&dp->dp_lock);
+
+ dmu_tx_delay(tx, dirty);
+
+ tx->tx_wait_dirty = B_FALSE;
+
+ /*
+ * Note: setting tx_dirty_delayed only has effect if the
+ * caller used TXG_WAIT. Otherwise they are going to
+ * destroy this tx and try again. The common case,
+ * zfs_write(), uses TXG_WAIT.
+ */
+ tx->tx_dirty_delayed = B_TRUE;
+ } else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
+ /*
+ * If the pool is suspended we need to wait until it
+ * is resumed. Note that it's possible that the pool
+ * has become active after this thread has tried to
+ * obtain a tx. If that's the case then tx_lasttried_txg
+ * would not have been set.
+ */
+ txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
} else if (tx->tx_needassign_txh) {
+ /*
+ * A dnode is assigned to the quiescing txg. Wait for its
+ * transaction to complete.
+ */
dnode_t *dn = tx->tx_needassign_txh->txh_dnode;
mutex_enter(&dn->dn_mtx);
@@ -1124,11 +1432,46 @@
#endif
}
+static void
+dmu_tx_destroy(dmu_tx_t *tx)
+{
+ dmu_tx_hold_t *txh;
+
+ while ((txh = list_head(&tx->tx_holds)) != NULL) {
+ dnode_t *dn = txh->txh_dnode;
+
+ list_remove(&tx->tx_holds, txh);
+ refcount_destroy_many(&txh->txh_space_towrite,
+ refcount_count(&txh->txh_space_towrite));
+ refcount_destroy_many(&txh->txh_space_tofree,
+ refcount_count(&txh->txh_space_tofree));
+ refcount_destroy_many(&txh->txh_space_tooverwrite,
+ refcount_count(&txh->txh_space_tooverwrite));
+ refcount_destroy_many(&txh->txh_space_tounref,
+ refcount_count(&txh->txh_space_tounref));
+ refcount_destroy_many(&txh->txh_memory_tohold,
+ refcount_count(&txh->txh_memory_tohold));
+ refcount_destroy_many(&txh->txh_fudge,
+ refcount_count(&txh->txh_fudge));
+ kmem_free(txh, sizeof (dmu_tx_hold_t));
+ if (dn != NULL)
+ dnode_rele(dn, tx);
+ }
+
+ list_destroy(&tx->tx_callbacks);
+ list_destroy(&tx->tx_holds);
+#ifdef ZFS_DEBUG
+ refcount_destroy_many(&tx->tx_space_written,
+ refcount_count(&tx->tx_space_written));
+ refcount_destroy_many(&tx->tx_space_freed,
+ refcount_count(&tx->tx_space_freed));
+#endif
+ kmem_free(tx, sizeof (dmu_tx_t));
+}
+
void
dmu_tx_commit(dmu_tx_t *tx)
{
- dmu_tx_hold_t *txh;
-
ASSERT(tx->tx_txg != 0);
/*
@@ -1135,13 +1478,13 @@
* Go through the transaction's hold list and remove holds on
* associated dnodes, notifying waiters if no holds remain.
*/
- while (txh = list_head(&tx->tx_holds)) {
+ for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
+ txh = list_next(&tx->tx_holds, txh)) {
dnode_t *dn = txh->txh_dnode;
- list_remove(&tx->tx_holds, txh);
- kmem_free(txh, sizeof (dmu_tx_hold_t));
if (dn == NULL)
continue;
+
mutex_enter(&dn->dn_mtx);
ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
@@ -1150,7 +1493,6 @@
cv_broadcast(&dn->dn_notxholds);
}
mutex_exit(&dn->dn_mtx);
- dnode_rele(dn, tx);
}
if (tx->tx_tempreserve_cookie)
@@ -1162,36 +1504,19 @@
if (tx->tx_anyobj == FALSE)
txg_rele_to_sync(&tx->tx_txgh);
- list_destroy(&tx->tx_callbacks);
- list_destroy(&tx->tx_holds);
#ifdef ZFS_DEBUG
dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n",
tx->tx_space_towrite, refcount_count(&tx->tx_space_written),
tx->tx_space_tofree, refcount_count(&tx->tx_space_freed));
- refcount_destroy_many(&tx->tx_space_written,
- refcount_count(&tx->tx_space_written));
- refcount_destroy_many(&tx->tx_space_freed,
- refcount_count(&tx->tx_space_freed));
#endif
- kmem_free(tx, sizeof (dmu_tx_t));
+ dmu_tx_destroy(tx);
}
void
dmu_tx_abort(dmu_tx_t *tx)
{
- dmu_tx_hold_t *txh;
-
ASSERT(tx->tx_txg == 0);
- while (txh = list_head(&tx->tx_holds)) {
- dnode_t *dn = txh->txh_dnode;
-
- list_remove(&tx->tx_holds, txh);
- kmem_free(txh, sizeof (dmu_tx_hold_t));
- if (dn != NULL)
- dnode_rele(dn, tx);
- }
-
/*
* Call any registered callbacks with an error code.
*/
@@ -1198,15 +1523,7 @@
if (!list_is_empty(&tx->tx_callbacks))
dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED);
- list_destroy(&tx->tx_callbacks);
- list_destroy(&tx->tx_holds);
-#ifdef ZFS_DEBUG
- refcount_destroy_many(&tx->tx_space_written,
- refcount_count(&tx->tx_space_written));
- refcount_destroy_many(&tx->tx_space_freed,
- refcount_count(&tx->tx_space_freed));
-#endif
- kmem_free(tx, sizeof (dmu_tx_t));
+ dmu_tx_destroy(tx);
}
uint64_t
@@ -1245,7 +1562,7 @@
{
dmu_tx_callback_t *dcb;
- while (dcb = list_head(cb_list)) {
+ while ((dcb = list_head(cb_list)) != NULL) {
list_remove(cb_list, dcb);
dcb->dcb_func(dcb->dcb_data, error);
kmem_free(dcb, sizeof (dmu_tx_callback_t));
@@ -1303,18 +1620,24 @@
/* If blkptr doesn't exist then add space to towrite */
if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
- txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
+ (void) refcount_add_many(&txh->txh_space_towrite,
+ SPA_OLD_MAXBLOCKSIZE, FTAG);
} else {
blkptr_t *bp;
bp = &dn->dn_phys->dn_spill;
if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
- bp, bp->blk_birth))
- txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE;
- else
- txh->txh_space_towrite += SPA_MAXBLOCKSIZE;
- if (!BP_IS_HOLE(bp))
- txh->txh_space_tounref += SPA_MAXBLOCKSIZE;
+ bp, bp->blk_birth)) {
+ (void) refcount_add_many(&txh->txh_space_tooverwrite,
+ SPA_OLD_MAXBLOCKSIZE, FTAG);
+ } else {
+ (void) refcount_add_many(&txh->txh_space_towrite,
+ SPA_OLD_MAXBLOCKSIZE, FTAG);
+ }
+ if (!BP_IS_HOLE(bp)) {
+ (void) refcount_add_many(&txh->txh_space_tounref,
+ SPA_OLD_MAXBLOCKSIZE, FTAG);
+ }
}
}
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -24,6 +24,10 @@
* Use is subject to license terms.
*/
+/*
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
+ */
+
#include <sys/zfs_context.h>
#include <sys/dnode.h>
#include <sys/dmu_objset.h>
@@ -33,19 +37,22 @@
#include <sys/kstat.h>
/*
- * I'm against tune-ables, but these should probably exist as tweakable globals
- * until we can get this working the way we want it to.
+ * This tunable disables predictive prefetch. Note that it leaves "prescient"
+ * prefetch (e.g. prefetch for zfs send) intact. Unlike predictive prefetch,
+ * prescient prefetch never issues i/os that end up not being needed,
+ * so it can't hurt performance.
*/
+boolean_t zfs_prefetch_disable = B_FALSE;
-int zfs_prefetch_disable = 0;
-
/* max # of streams per zfetch */
uint32_t zfetch_max_streams = 8;
/* min time before stream reclaim */
uint32_t zfetch_min_sec_reap = 2;
-/* max number of blocks to fetch at a time */
-uint32_t zfetch_block_cap = 256;
-/* number of bytes in a array_read at which we stop prefetching (1Mb) */
+/* max bytes to prefetch per stream (default 8MB) */
+uint32_t zfetch_max_distance = 8 * 1024 * 1024;
+/* max bytes to prefetch indirects for per stream (default 64MB) */
+uint32_t zfetch_max_idistance = 64 * 1024 * 1024;
+/* max number of bytes in an array_read in which we allow prefetching (1MB) */
uint64_t zfetch_array_rd_sz = 1024 * 1024;
SYSCTL_DECL(_vfs_zfs);
@@ -56,202 +63,36 @@
SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_streams, CTLFLAG_RW,
&zfetch_max_streams, 0, "Max # of streams per zfetch");
TUNABLE_INT("vfs.zfs.zfetch.min_sec_reap", &zfetch_min_sec_reap);
-SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, min_sec_reap, CTLFLAG_RDTUN,
+SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, min_sec_reap, CTLFLAG_RWTUN,
&zfetch_min_sec_reap, 0, "Min time before stream reclaim");
-TUNABLE_INT("vfs.zfs.zfetch.block_cap", &zfetch_block_cap);
-SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, block_cap, CTLFLAG_RDTUN,
- &zfetch_block_cap, 0, "Max number of blocks to fetch at a time");
+TUNABLE_INT("vfs.zfs.zfetch.max_distance", &zfetch_max_distance);
+SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_distance, CTLFLAG_RWTUN,
+ &zfetch_max_distance, 0, "Max bytes to prefetch per stream");
TUNABLE_QUAD("vfs.zfs.zfetch.array_rd_sz", &zfetch_array_rd_sz);
-SYSCTL_UQUAD(_vfs_zfs_zfetch, OID_AUTO, array_rd_sz, CTLFLAG_RDTUN,
+SYSCTL_UQUAD(_vfs_zfs_zfetch, OID_AUTO, array_rd_sz, CTLFLAG_RWTUN,
&zfetch_array_rd_sz, 0,
"Number of bytes in a array_read at which we stop prefetching");
-/* forward decls for static routines */
-static boolean_t dmu_zfetch_colinear(zfetch_t *, zstream_t *);
-static void dmu_zfetch_dofetch(zfetch_t *, zstream_t *);
-static uint64_t dmu_zfetch_fetch(dnode_t *, uint64_t, uint64_t);
-static uint64_t dmu_zfetch_fetchsz(dnode_t *, uint64_t, uint64_t);
-static boolean_t dmu_zfetch_find(zfetch_t *, zstream_t *, int);
-static int dmu_zfetch_stream_insert(zfetch_t *, zstream_t *);
-static zstream_t *dmu_zfetch_stream_reclaim(zfetch_t *);
-static void dmu_zfetch_stream_remove(zfetch_t *, zstream_t *);
-static int dmu_zfetch_streams_equal(zstream_t *, zstream_t *);
-
typedef struct zfetch_stats {
kstat_named_t zfetchstat_hits;
kstat_named_t zfetchstat_misses;
- kstat_named_t zfetchstat_colinear_hits;
- kstat_named_t zfetchstat_colinear_misses;
- kstat_named_t zfetchstat_stride_hits;
- kstat_named_t zfetchstat_stride_misses;
- kstat_named_t zfetchstat_reclaim_successes;
- kstat_named_t zfetchstat_reclaim_failures;
- kstat_named_t zfetchstat_stream_resets;
- kstat_named_t zfetchstat_stream_noresets;
- kstat_named_t zfetchstat_bogus_streams;
+ kstat_named_t zfetchstat_max_streams;
} zfetch_stats_t;
static zfetch_stats_t zfetch_stats = {
{ "hits", KSTAT_DATA_UINT64 },
{ "misses", KSTAT_DATA_UINT64 },
- { "colinear_hits", KSTAT_DATA_UINT64 },
- { "colinear_misses", KSTAT_DATA_UINT64 },
- { "stride_hits", KSTAT_DATA_UINT64 },
- { "stride_misses", KSTAT_DATA_UINT64 },
- { "reclaim_successes", KSTAT_DATA_UINT64 },
- { "reclaim_failures", KSTAT_DATA_UINT64 },
- { "streams_resets", KSTAT_DATA_UINT64 },
- { "streams_noresets", KSTAT_DATA_UINT64 },
- { "bogus_streams", KSTAT_DATA_UINT64 },
+ { "max_streams", KSTAT_DATA_UINT64 },
};
-#define ZFETCHSTAT_INCR(stat, val) \
- atomic_add_64(&zfetch_stats.stat.value.ui64, (val));
+#define ZFETCHSTAT_BUMP(stat) \
+ atomic_inc_64(&zfetch_stats.stat.value.ui64);
-#define ZFETCHSTAT_BUMP(stat) ZFETCHSTAT_INCR(stat, 1);
-
kstat_t *zfetch_ksp;
-/*
- * Given a zfetch structure and a zstream structure, determine whether the
- * blocks to be read are part of a co-linear pair of existing prefetch
- * streams. If a set is found, coalesce the streams, removing one, and
- * configure the prefetch so it looks for a strided access pattern.
- *
- * In other words: if we find two sequential access streams that are
- * the same length and distance N appart, and this read is N from the
- * last stream, then we are probably in a strided access pattern. So
- * combine the two sequential streams into a single strided stream.
- *
- * Returns whether co-linear streams were found.
- */
-static boolean_t
-dmu_zfetch_colinear(zfetch_t *zf, zstream_t *zh)
-{
- zstream_t *z_walk;
- zstream_t *z_comp;
-
- if (! rw_tryenter(&zf->zf_rwlock, RW_WRITER))
- return (0);
-
- if (zh == NULL) {
- rw_exit(&zf->zf_rwlock);
- return (0);
- }
-
- for (z_walk = list_head(&zf->zf_stream); z_walk;
- z_walk = list_next(&zf->zf_stream, z_walk)) {
- for (z_comp = list_next(&zf->zf_stream, z_walk); z_comp;
- z_comp = list_next(&zf->zf_stream, z_comp)) {
- int64_t diff;
-
- if (z_walk->zst_len != z_walk->zst_stride ||
- z_comp->zst_len != z_comp->zst_stride) {
- continue;
- }
-
- diff = z_comp->zst_offset - z_walk->zst_offset;
- if (z_comp->zst_offset + diff == zh->zst_offset) {
- z_walk->zst_offset = zh->zst_offset;
- z_walk->zst_direction = diff < 0 ? -1 : 1;
- z_walk->zst_stride =
- diff * z_walk->zst_direction;
- z_walk->zst_ph_offset =
- zh->zst_offset + z_walk->zst_stride;
- dmu_zfetch_stream_remove(zf, z_comp);
- mutex_destroy(&z_comp->zst_lock);
- kmem_free(z_comp, sizeof (zstream_t));
-
- dmu_zfetch_dofetch(zf, z_walk);
-
- rw_exit(&zf->zf_rwlock);
- return (1);
- }
-
- diff = z_walk->zst_offset - z_comp->zst_offset;
- if (z_walk->zst_offset + diff == zh->zst_offset) {
- z_walk->zst_offset = zh->zst_offset;
- z_walk->zst_direction = diff < 0 ? -1 : 1;
- z_walk->zst_stride =
- diff * z_walk->zst_direction;
- z_walk->zst_ph_offset =
- zh->zst_offset + z_walk->zst_stride;
- dmu_zfetch_stream_remove(zf, z_comp);
- mutex_destroy(&z_comp->zst_lock);
- kmem_free(z_comp, sizeof (zstream_t));
-
- dmu_zfetch_dofetch(zf, z_walk);
-
- rw_exit(&zf->zf_rwlock);
- return (1);
- }
- }
- }
-
- rw_exit(&zf->zf_rwlock);
- return (0);
-}
-
-/*
- * Given a zstream_t, determine the bounds of the prefetch. Then call the
- * routine that actually prefetches the individual blocks.
- */
-static void
-dmu_zfetch_dofetch(zfetch_t *zf, zstream_t *zs)
-{
- uint64_t prefetch_tail;
- uint64_t prefetch_limit;
- uint64_t prefetch_ofst;
- uint64_t prefetch_len;
- uint64_t blocks_fetched;
-
- zs->zst_stride = MAX((int64_t)zs->zst_stride, zs->zst_len);
- zs->zst_cap = MIN(zfetch_block_cap, 2 * zs->zst_cap);
-
- prefetch_tail = MAX((int64_t)zs->zst_ph_offset,
- (int64_t)(zs->zst_offset + zs->zst_stride));
- /*
- * XXX: use a faster division method?
- */
- prefetch_limit = zs->zst_offset + zs->zst_len +
- (zs->zst_cap * zs->zst_stride) / zs->zst_len;
-
- while (prefetch_tail < prefetch_limit) {
- prefetch_ofst = zs->zst_offset + zs->zst_direction *
- (prefetch_tail - zs->zst_offset);
-
- prefetch_len = zs->zst_len;
-
- /*
- * Don't prefetch beyond the end of the file, if working
- * backwards.
- */
- if ((zs->zst_direction == ZFETCH_BACKWARD) &&
- (prefetch_ofst > prefetch_tail)) {
- prefetch_len += prefetch_ofst;
- prefetch_ofst = 0;
- }
-
- /* don't prefetch more than we're supposed to */
- if (prefetch_len > zs->zst_len)
- break;
-
- blocks_fetched = dmu_zfetch_fetch(zf->zf_dnode,
- prefetch_ofst, zs->zst_len);
-
- prefetch_tail += zs->zst_stride;
- /* stop if we've run out of stuff to prefetch */
- if (blocks_fetched < zs->zst_len)
- break;
- }
- zs->zst_ph_offset = prefetch_tail;
- zs->zst_last = ddi_get_lbolt();
-}
-
void
zfetch_init(void)
{
-
zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc",
KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t),
KSTAT_FLAG_VIRTUAL);
@@ -279,284 +120,41 @@
void
dmu_zfetch_init(zfetch_t *zf, dnode_t *dno)
{
- if (zf == NULL) {
+ if (zf == NULL)
return;
- }
zf->zf_dnode = dno;
- zf->zf_stream_cnt = 0;
- zf->zf_alloc_fail = 0;
list_create(&zf->zf_stream, sizeof (zstream_t),
- offsetof(zstream_t, zst_node));
+ offsetof(zstream_t, zs_node));
rw_init(&zf->zf_rwlock, NULL, RW_DEFAULT, NULL);
}
-/*
- * This function computes the actual size, in blocks, that can be prefetched,
- * and fetches it.
- */
-static uint64_t
-dmu_zfetch_fetch(dnode_t *dn, uint64_t blkid, uint64_t nblks)
+static void
+dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
{
- uint64_t fetchsz;
- uint64_t i;
-
- fetchsz = dmu_zfetch_fetchsz(dn, blkid, nblks);
-
- for (i = 0; i < fetchsz; i++) {
- dbuf_prefetch(dn, blkid + i);
- }
-
- return (fetchsz);
+ ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
+ list_remove(&zf->zf_stream, zs);
+ mutex_destroy(&zs->zs_lock);
+ kmem_free(zs, sizeof (*zs));
}
/*
- * this function returns the number of blocks that would be prefetched, based
- * upon the supplied dnode, blockid, and nblks. This is used so that we can
- * update streams in place, and then prefetch with their old value after the
- * fact. This way, we can delay the prefetch, but subsequent accesses to the
- * stream won't result in the same data being prefetched multiple times.
+ * Clean up state associated with a zfetch structure (e.g. destroy the
+ * streams). This doesn't free the zfetch_t itself; that's left to the caller.
*/
-static uint64_t
-dmu_zfetch_fetchsz(dnode_t *dn, uint64_t blkid, uint64_t nblks)
-{
- uint64_t fetchsz;
-
- if (blkid > dn->dn_maxblkid) {
- return (0);
- }
-
- /* compute fetch size */
- if (blkid + nblks + 1 > dn->dn_maxblkid) {
- fetchsz = (dn->dn_maxblkid - blkid) + 1;
- ASSERT(blkid + fetchsz - 1 <= dn->dn_maxblkid);
- } else {
- fetchsz = nblks;
- }
-
-
- return (fetchsz);
-}
-
-/*
- * given a zfetch and a zstream structure, see if there is an associated zstream
- * for this block read. If so, it starts a prefetch for the stream it
- * located and returns true, otherwise it returns false
- */
-static boolean_t
-dmu_zfetch_find(zfetch_t *zf, zstream_t *zh, int prefetched)
-{
- zstream_t *zs;
- int64_t diff;
- int reset = !prefetched;
- int rc = 0;
-
- if (zh == NULL)
- return (0);
-
- /*
- * XXX: This locking strategy is a bit coarse; however, it's impact has
- * yet to be tested. If this turns out to be an issue, it can be
- * modified in a number of different ways.
- */
-
- rw_enter(&zf->zf_rwlock, RW_READER);
-top:
-
- for (zs = list_head(&zf->zf_stream); zs;
- zs = list_next(&zf->zf_stream, zs)) {
-
- /*
- * XXX - should this be an assert?
- */
- if (zs->zst_len == 0) {
- /* bogus stream */
- ZFETCHSTAT_BUMP(zfetchstat_bogus_streams);
- continue;
- }
-
- /*
- * We hit this case when we are in a strided prefetch stream:
- * we will read "len" blocks before "striding".
- */
- if (zh->zst_offset >= zs->zst_offset &&
- zh->zst_offset < zs->zst_offset + zs->zst_len) {
- if (prefetched) {
- /* already fetched */
- ZFETCHSTAT_BUMP(zfetchstat_stride_hits);
- rc = 1;
- goto out;
- } else {
- ZFETCHSTAT_BUMP(zfetchstat_stride_misses);
- }
- }
-
- /*
- * This is the forward sequential read case: we increment
- * len by one each time we hit here, so we will enter this
- * case on every read.
- */
- if (zh->zst_offset == zs->zst_offset + zs->zst_len) {
-
- reset = !prefetched && zs->zst_len > 1;
-
- if (mutex_tryenter(&zs->zst_lock) == 0) {
- rc = 1;
- goto out;
- }
-
- if (zh->zst_offset != zs->zst_offset + zs->zst_len) {
- mutex_exit(&zs->zst_lock);
- goto top;
- }
- zs->zst_len += zh->zst_len;
- diff = zs->zst_len - zfetch_block_cap;
- if (diff > 0) {
- zs->zst_offset += diff;
- zs->zst_len = zs->zst_len > diff ?
- zs->zst_len - diff : 0;
- }
- zs->zst_direction = ZFETCH_FORWARD;
-
- break;
-
- /*
- * Same as above, but reading backwards through the file.
- */
- } else if (zh->zst_offset == zs->zst_offset - zh->zst_len) {
- /* backwards sequential access */
-
- reset = !prefetched && zs->zst_len > 1;
-
- if (mutex_tryenter(&zs->zst_lock) == 0) {
- rc = 1;
- goto out;
- }
-
- if (zh->zst_offset != zs->zst_offset - zh->zst_len) {
- mutex_exit(&zs->zst_lock);
- goto top;
- }
-
- zs->zst_offset = zs->zst_offset > zh->zst_len ?
- zs->zst_offset - zh->zst_len : 0;
- zs->zst_ph_offset = zs->zst_ph_offset > zh->zst_len ?
- zs->zst_ph_offset - zh->zst_len : 0;
- zs->zst_len += zh->zst_len;
-
- diff = zs->zst_len - zfetch_block_cap;
- if (diff > 0) {
- zs->zst_ph_offset = zs->zst_ph_offset > diff ?
- zs->zst_ph_offset - diff : 0;
- zs->zst_len = zs->zst_len > diff ?
- zs->zst_len - diff : zs->zst_len;
- }
- zs->zst_direction = ZFETCH_BACKWARD;
-
- break;
-
- } else if ((zh->zst_offset - zs->zst_offset - zs->zst_stride <
- zs->zst_len) && (zs->zst_len != zs->zst_stride)) {
- /* strided forward access */
-
- if (mutex_tryenter(&zs->zst_lock) == 0) {
- rc = 1;
- goto out;
- }
-
- if ((zh->zst_offset - zs->zst_offset - zs->zst_stride >=
- zs->zst_len) || (zs->zst_len == zs->zst_stride)) {
- mutex_exit(&zs->zst_lock);
- goto top;
- }
-
- zs->zst_offset += zs->zst_stride;
- zs->zst_direction = ZFETCH_FORWARD;
-
- break;
-
- } else if ((zh->zst_offset - zs->zst_offset + zs->zst_stride <
- zs->zst_len) && (zs->zst_len != zs->zst_stride)) {
- /* strided reverse access */
-
- if (mutex_tryenter(&zs->zst_lock) == 0) {
- rc = 1;
- goto out;
- }
-
- if ((zh->zst_offset - zs->zst_offset + zs->zst_stride >=
- zs->zst_len) || (zs->zst_len == zs->zst_stride)) {
- mutex_exit(&zs->zst_lock);
- goto top;
- }
-
- zs->zst_offset = zs->zst_offset > zs->zst_stride ?
- zs->zst_offset - zs->zst_stride : 0;
- zs->zst_ph_offset = (zs->zst_ph_offset >
- (2 * zs->zst_stride)) ?
- (zs->zst_ph_offset - (2 * zs->zst_stride)) : 0;
- zs->zst_direction = ZFETCH_BACKWARD;
-
- break;
- }
- }
-
- if (zs) {
- if (reset) {
- zstream_t *remove = zs;
-
- ZFETCHSTAT_BUMP(zfetchstat_stream_resets);
- rc = 0;
- mutex_exit(&zs->zst_lock);
- rw_exit(&zf->zf_rwlock);
- rw_enter(&zf->zf_rwlock, RW_WRITER);
- /*
- * Relocate the stream, in case someone removes
- * it while we were acquiring the WRITER lock.
- */
- for (zs = list_head(&zf->zf_stream); zs;
- zs = list_next(&zf->zf_stream, zs)) {
- if (zs == remove) {
- dmu_zfetch_stream_remove(zf, zs);
- mutex_destroy(&zs->zst_lock);
- kmem_free(zs, sizeof (zstream_t));
- break;
- }
- }
- } else {
- ZFETCHSTAT_BUMP(zfetchstat_stream_noresets);
- rc = 1;
- dmu_zfetch_dofetch(zf, zs);
- mutex_exit(&zs->zst_lock);
- }
- }
-out:
- rw_exit(&zf->zf_rwlock);
- return (rc);
-}
-
-/*
- * Clean-up state associated with a zfetch structure. This frees allocated
- * structure members, empties the zf_stream tree, and generally makes things
- * nice. This doesn't free the zfetch_t itself, that's left to the caller.
- */
void
-dmu_zfetch_rele(zfetch_t *zf)
+dmu_zfetch_fini(zfetch_t *zf)
{
- zstream_t *zs;
- zstream_t *zs_next;
+ zstream_t *zs;
ASSERT(!RW_LOCK_HELD(&zf->zf_rwlock));
- for (zs = list_head(&zf->zf_stream); zs; zs = zs_next) {
- zs_next = list_next(&zf->zf_stream, zs);
-
- list_remove(&zf->zf_stream, zs);
- mutex_destroy(&zs->zst_lock);
- kmem_free(zs, sizeof (zstream_t));
- }
+ rw_enter(&zf->zf_rwlock, RW_WRITER);
+ while ((zs = list_head(&zf->zf_stream)) != NULL)
+ dmu_zfetch_stream_remove(zf, zs);
+ rw_exit(&zf->zf_rwlock);
list_destroy(&zf->zf_stream);
rw_destroy(&zf->zf_rwlock);
@@ -564,193 +162,190 @@
}
/*
- * Given a zfetch and zstream structure, insert the zstream structure into the
- * AVL tree contained within the zfetch structure. Peform the appropriate
- * book-keeping. It is possible that another thread has inserted a stream which
- * matches one that we are about to insert, so we must be sure to check for this
- * case. If one is found, return failure, and let the caller cleanup the
- * duplicates.
+ * If there aren't too many streams already, create a new stream.
+ * The "blkid" argument is the next block that we expect this stream to access.
+ * While we're here, clean up old streams (which haven't been
+ * accessed for at least zfetch_min_sec_reap seconds).
*/
-static int
-dmu_zfetch_stream_insert(zfetch_t *zf, zstream_t *zs)
+static void
+dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
{
- zstream_t *zs_walk;
- zstream_t *zs_next;
+ zstream_t *zs_next;
+ int numstreams = 0;
ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
- for (zs_walk = list_head(&zf->zf_stream); zs_walk; zs_walk = zs_next) {
- zs_next = list_next(&zf->zf_stream, zs_walk);
-
- if (dmu_zfetch_streams_equal(zs_walk, zs)) {
- return (0);
- }
+ /*
+ * Clean up old streams.
+ */
+ for (zstream_t *zs = list_head(&zf->zf_stream);
+ zs != NULL; zs = zs_next) {
+ zs_next = list_next(&zf->zf_stream, zs);
+ if (((gethrtime() - zs->zs_atime) / NANOSEC) >
+ zfetch_min_sec_reap)
+ dmu_zfetch_stream_remove(zf, zs);
+ else
+ numstreams++;
}
- list_insert_head(&zf->zf_stream, zs);
- zf->zf_stream_cnt++;
- return (1);
-}
-
-
-/*
- * Walk the list of zstreams in the given zfetch, find an old one (by time), and
- * reclaim it for use by the caller.
- */
-static zstream_t *
-dmu_zfetch_stream_reclaim(zfetch_t *zf)
-{
- zstream_t *zs;
-
- if (! rw_tryenter(&zf->zf_rwlock, RW_WRITER))
- return (0);
-
- for (zs = list_head(&zf->zf_stream); zs;
- zs = list_next(&zf->zf_stream, zs)) {
-
- if (((ddi_get_lbolt() - zs->zst_last)/hz) > zfetch_min_sec_reap)
- break;
+ /*
+ * The maximum number of streams is normally zfetch_max_streams,
+ * but for small files we lower it such that it's at least possible
+ * for all the streams to be non-overlapping.
+ *
+ * If we are already at the maximum number of streams for this file,
+ * even after removing old streams, then don't create this stream.
+ */
+ uint32_t max_streams = MAX(1, MIN(zfetch_max_streams,
+ zf->zf_dnode->dn_maxblkid * zf->zf_dnode->dn_datablksz /
+ zfetch_max_distance));
+ if (numstreams >= max_streams) {
+ ZFETCHSTAT_BUMP(zfetchstat_max_streams);
+ return;
}
- if (zs) {
- dmu_zfetch_stream_remove(zf, zs);
- mutex_destroy(&zs->zst_lock);
- bzero(zs, sizeof (zstream_t));
- } else {
- zf->zf_alloc_fail++;
- }
- rw_exit(&zf->zf_rwlock);
+ zstream_t *zs = kmem_zalloc(sizeof (*zs), KM_SLEEP);
+ zs->zs_blkid = blkid;
+ zs->zs_pf_blkid = blkid;
+ zs->zs_ipf_blkid = blkid;
+ zs->zs_atime = gethrtime();
+ mutex_init(&zs->zs_lock, NULL, MUTEX_DEFAULT, NULL);
- return (zs);
+ list_insert_head(&zf->zf_stream, zs);
}
/*
- * Given a zfetch and zstream structure, remove the zstream structure from its
- * container in the zfetch structure. Perform the appropriate book-keeping.
+ * This is the predictive prefetch entry point. It associates dnode access
+ * specified by the blkid and nblks arguments with a prefetch stream, predicts
+ * further accesses based on those statistics, and initiates speculative prefetch.
+ * The fetch_data argument specifies whether actual data blocks should be fetched:
+ * FALSE -- prefetch only indirect blocks for predicted data blocks;
+ * TRUE -- prefetch predicted data blocks plus following indirect blocks.
*/
-static void
-dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
-{
- ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
-
- list_remove(&zf->zf_stream, zs);
- zf->zf_stream_cnt--;
-}
-
-static int
-dmu_zfetch_streams_equal(zstream_t *zs1, zstream_t *zs2)
-{
- if (zs1->zst_offset != zs2->zst_offset)
- return (0);
-
- if (zs1->zst_len != zs2->zst_len)
- return (0);
-
- if (zs1->zst_stride != zs2->zst_stride)
- return (0);
-
- if (zs1->zst_ph_offset != zs2->zst_ph_offset)
- return (0);
-
- if (zs1->zst_cap != zs2->zst_cap)
- return (0);
-
- if (zs1->zst_direction != zs2->zst_direction)
- return (0);
-
- return (1);
-}
-
-/*
- * This is the prefetch entry point. It calls all of the other dmu_zfetch
- * routines to create, delete, find, or operate upon prefetch streams.
- */
void
-dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched)
+dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data)
{
- zstream_t zst;
- zstream_t *newstream;
- boolean_t fetched;
- int inserted;
- unsigned int blkshft;
- uint64_t blksz;
+ zstream_t *zs;
+ int64_t pf_start, ipf_start, ipf_istart, ipf_iend;
+ int64_t pf_ahead_blks, max_blks;
+ int epbs, max_dist_blks, pf_nblks, ipf_nblks;
+ uint64_t end_of_access_blkid = blkid + nblks;
if (zfs_prefetch_disable)
return;
- /* files that aren't ln2 blocksz are only one block -- nothing to do */
- if (!zf->zf_dnode->dn_datablkshift)
+ /*
+ * As a fast path for small (single-block) files, ignore access
+ * to the first block.
+ */
+ if (blkid == 0)
return;
- /* convert offset and size, into blockid and nblocks */
- blkshft = zf->zf_dnode->dn_datablkshift;
- blksz = (1 << blkshft);
+ rw_enter(&zf->zf_rwlock, RW_READER);
- bzero(&zst, sizeof (zstream_t));
- zst.zst_offset = offset >> blkshft;
- zst.zst_len = (P2ROUNDUP(offset + size, blksz) -
- P2ALIGN(offset, blksz)) >> blkshft;
+ for (zs = list_head(&zf->zf_stream); zs != NULL;
+ zs = list_next(&zf->zf_stream, zs)) {
+ if (blkid == zs->zs_blkid) {
+ mutex_enter(&zs->zs_lock);
+ /*
+ * zs_blkid could have changed before we
+			 * acquired zs_lock; re-check it here.
+ */
+ if (blkid != zs->zs_blkid) {
+ mutex_exit(&zs->zs_lock);
+ continue;
+ }
+ break;
+ }
+ }
- fetched = dmu_zfetch_find(zf, &zst, prefetched);
- if (fetched) {
- ZFETCHSTAT_BUMP(zfetchstat_hits);
- } else {
+ if (zs == NULL) {
+ /*
+ * This access is not part of any existing stream. Create
+ * a new stream for it.
+ */
ZFETCHSTAT_BUMP(zfetchstat_misses);
- fetched = dmu_zfetch_colinear(zf, &zst);
- if (fetched) {
- ZFETCHSTAT_BUMP(zfetchstat_colinear_hits);
- } else {
- ZFETCHSTAT_BUMP(zfetchstat_colinear_misses);
- }
+ if (rw_tryupgrade(&zf->zf_rwlock))
+ dmu_zfetch_stream_create(zf, end_of_access_blkid);
+ rw_exit(&zf->zf_rwlock);
+ return;
}
- if (!fetched) {
- newstream = dmu_zfetch_stream_reclaim(zf);
+ /*
+ * This access was to a block that we issued a prefetch for on
+ * behalf of this stream. Issue further prefetches for this stream.
+ *
+ * Normally, we start prefetching where we stopped
+ * prefetching last (zs_pf_blkid). But when we get our first
+	 * hit on this stream (zs_pf_blkid == zs_blkid), we don't
+ * want to prefetch the block we just accessed. In this case,
+ * start just after the block we just accessed.
+ */
+ pf_start = MAX(zs->zs_pf_blkid, end_of_access_blkid);
+ /*
+ * Double our amount of prefetched data, but don't let the
+ * prefetch get further ahead than zfetch_max_distance.
+ */
+ if (fetch_data) {
+ max_dist_blks =
+ zfetch_max_distance >> zf->zf_dnode->dn_datablkshift;
/*
- * we still couldn't find a stream, drop the lock, and allocate
- * one if possible. Otherwise, give up and go home.
+ * Previously, we were (zs_pf_blkid - blkid) ahead. We
+ * want to now be double that, so read that amount again,
+ * plus the amount we are catching up by (i.e. the amount
+ * read just now).
*/
- if (newstream) {
- ZFETCHSTAT_BUMP(zfetchstat_reclaim_successes);
- } else {
- uint64_t maxblocks;
- uint32_t max_streams;
- uint32_t cur_streams;
+ pf_ahead_blks = zs->zs_pf_blkid - blkid + nblks;
+ max_blks = max_dist_blks - (pf_start - end_of_access_blkid);
+ pf_nblks = MIN(pf_ahead_blks, max_blks);
+ } else {
+ pf_nblks = 0;
+ }
- ZFETCHSTAT_BUMP(zfetchstat_reclaim_failures);
- cur_streams = zf->zf_stream_cnt;
- maxblocks = zf->zf_dnode->dn_maxblkid;
+ zs->zs_pf_blkid = pf_start + pf_nblks;
- max_streams = MIN(zfetch_max_streams,
- (maxblocks / zfetch_block_cap));
- if (max_streams == 0) {
- max_streams++;
- }
+ /*
+ * Do the same for indirects, starting from where we stopped last,
+ * or where we will stop reading data blocks (and the indirects
+ * that point to them).
+ */
+ ipf_start = MAX(zs->zs_ipf_blkid, zs->zs_pf_blkid);
+ max_dist_blks = zfetch_max_idistance >> zf->zf_dnode->dn_datablkshift;
+ /*
+ * We want to double our distance ahead of the data prefetch
+ * (or reader, if we are not prefetching data). Previously, we
+ * were (zs_ipf_blkid - blkid) ahead. To double that, we read
+ * that amount again, plus the amount we are catching up by
+ * (i.e. the amount read now + the amount of data prefetched now).
+ */
+ pf_ahead_blks = zs->zs_ipf_blkid - blkid + nblks + pf_nblks;
+ max_blks = max_dist_blks - (ipf_start - end_of_access_blkid);
+ ipf_nblks = MIN(pf_ahead_blks, max_blks);
+ zs->zs_ipf_blkid = ipf_start + ipf_nblks;
- if (cur_streams >= max_streams) {
- return;
- }
- newstream = kmem_zalloc(sizeof (zstream_t), KM_SLEEP);
- }
+ epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
+ ipf_istart = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs;
+ ipf_iend = P2ROUNDUP(zs->zs_ipf_blkid, 1 << epbs) >> epbs;
- newstream->zst_offset = zst.zst_offset;
- newstream->zst_len = zst.zst_len;
- newstream->zst_stride = zst.zst_len;
- newstream->zst_ph_offset = zst.zst_len + zst.zst_offset;
- newstream->zst_cap = zst.zst_len;
- newstream->zst_direction = ZFETCH_FORWARD;
- newstream->zst_last = ddi_get_lbolt();
+ zs->zs_atime = gethrtime();
+ zs->zs_blkid = end_of_access_blkid;
+ mutex_exit(&zs->zs_lock);
+ rw_exit(&zf->zf_rwlock);
- mutex_init(&newstream->zst_lock, NULL, MUTEX_DEFAULT, NULL);
+ /*
+ * dbuf_prefetch() is asynchronous (even when it needs to read
+ * indirect blocks), but we still prefer to drop our locks before
+ * calling it to reduce the time we hold them.
+ */
- rw_enter(&zf->zf_rwlock, RW_WRITER);
- inserted = dmu_zfetch_stream_insert(zf, newstream);
- rw_exit(&zf->zf_rwlock);
-
- if (!inserted) {
- mutex_destroy(&newstream->zst_lock);
- kmem_free(newstream, sizeof (zstream_t));
- }
+ for (int i = 0; i < pf_nblks; i++) {
+ dbuf_prefetch(zf->zf_dnode, 0, pf_start + i,
+ ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH);
}
+ for (int64_t iblk = ipf_istart; iblk < ipf_iend; iblk++) {
+ dbuf_prefetch(zf->zf_dnode, 1, iblk,
+ ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH);
+ }
+ ZFETCHSTAT_BUMP(zfetchstat_hits);
}
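
A note on the arithmetic above: for data blocks the rewritten dmu_zfetch() doubles
the prefetch window on every hit and caps it at zfetch_max_distance. The following
stand-alone, user-space sketch models just that window update; the sample block
numbers and the 64-block cap are invented for illustration, and the real code keeps
this state in zstream_t under zs_lock rather than in local variables.

#include <stdio.h>
#include <stdint.h>

#define	MAX(a, b)	((a) > (b) ? (a) : (b))
#define	MIN(a, b)	((a) < (b) ? (a) : (b))

int
main(void)
{
	uint64_t blkid = 100, nblks = 4;	/* blocks the reader just accessed */
	uint64_t zs_pf_blkid = 104;		/* where prefetch stopped last time */
	int64_t max_dist_blks = 64;		/* zfetch_max_distance, in blocks */

	uint64_t end_of_access = blkid + nblks;
	int64_t pf_start = MAX(zs_pf_blkid, end_of_access);
	/* we were (zs_pf_blkid - blkid) ahead; read that again plus what was just read */
	int64_t pf_ahead_blks = zs_pf_blkid - blkid + nblks;
	int64_t max_blks = max_dist_blks - (pf_start - end_of_access);
	int64_t pf_nblks = MIN(pf_ahead_blks, max_blks);

	printf("prefetch blocks %lld..%lld, next zs_pf_blkid = %lld\n",
	    (long long)pf_start, (long long)(pf_start + pf_nblks - 1),
	    (long long)(pf_start + pf_nblks));
	return (0);
}

With these inputs the window grows from 4 to 8 blocks; repeated hits keep doubling
it until the zfetch_max_distance cap makes max_blks the limiting term.
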
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,7 +21,9 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
*/
#include <sys/zfs_context.h>
@@ -36,9 +38,8 @@
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/dmu_zfetch.h>
+#include <sys/range_tree.h>
-static int free_range_compar(const void *node1, const void *node2);
-
static kmem_cache_t *dnode_cache;
/*
* Define DNODE_STATS to turn on statistic gathering. By default, it is only
@@ -59,10 +60,47 @@
int zfs_default_bs = SPA_MINBLOCKSHIFT;
int zfs_default_ibs = DN_MAX_INDBLKSHIFT;
-#ifdef sun
+#ifdef illumos
static kmem_cbrc_t dnode_move(void *, void *, size_t, void *);
#endif
+static int
+dbuf_compare(const void *x1, const void *x2)
+{
+ const dmu_buf_impl_t *d1 = x1;
+ const dmu_buf_impl_t *d2 = x2;
+
+ if (d1->db_level < d2->db_level) {
+ return (-1);
+ }
+ if (d1->db_level > d2->db_level) {
+ return (1);
+ }
+
+ if (d1->db_blkid < d2->db_blkid) {
+ return (-1);
+ }
+ if (d1->db_blkid > d2->db_blkid) {
+ return (1);
+ }
+
+ if (d1->db_state == DB_SEARCH) {
+ ASSERT3S(d2->db_state, !=, DB_SEARCH);
+ return (-1);
+ } else if (d2->db_state == DB_SEARCH) {
+ ASSERT3S(d1->db_state, !=, DB_SEARCH);
+ return (1);
+ }
+
+ if ((uintptr_t)d1 < (uintptr_t)d2) {
+ return (-1);
+ }
+ if ((uintptr_t)d1 > (uintptr_t)d2) {
+ return (1);
+ }
+ return (0);
+}
+
/* ARGSUSED */
static int
dnode_cons(void *arg, void *unused, int kmflag)
@@ -93,9 +131,7 @@
for (i = 0; i < TXG_SIZE; i++) {
list_link_init(&dn->dn_dirty_link[i]);
- avl_create(&dn->dn_ranges[i], free_range_compar,
- sizeof (free_range_t),
- offsetof(struct free_range, fr_node));
+ dn->dn_free_ranges[i] = NULL;
list_create(&dn->dn_dirty_records[i],
sizeof (dbuf_dirty_record_t),
offsetof(dbuf_dirty_record_t, dr_dirty_node));
@@ -118,7 +154,8 @@
dn->dn_id_flags = 0;
dn->dn_dbufs_count = 0;
- list_create(&dn->dn_dbufs, sizeof (dmu_buf_impl_t),
+ dn->dn_unlisted_l0_blkid = 0;
+ avl_create(&dn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
offsetof(dmu_buf_impl_t, db_link));
dn->dn_moved = 0;
@@ -143,7 +180,7 @@
for (i = 0; i < TXG_SIZE; i++) {
ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
- avl_destroy(&dn->dn_ranges[i]);
+ ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
list_destroy(&dn->dn_dirty_records[i]);
ASSERT0(dn->dn_next_nblkptr[i]);
ASSERT0(dn->dn_next_nlevels[i]);
@@ -171,7 +208,8 @@
ASSERT0(dn->dn_id_flags);
ASSERT0(dn->dn_dbufs_count);
- list_destroy(&dn->dn_dbufs);
+ ASSERT0(dn->dn_unlisted_l0_blkid);
+ avl_destroy(&dn->dn_dbufs);
}
void
@@ -315,19 +353,6 @@
}
}
-static int
-free_range_compar(const void *node1, const void *node2)
-{
- const free_range_t *rp1 = node1;
- const free_range_t *rp2 = node2;
-
- if (rp1->fr_blkid < rp2->fr_blkid)
- return (-1);
- else if (rp1->fr_blkid > rp2->fr_blkid)
- return (1);
- else return (0);
-}
-
void
dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx)
{
@@ -376,7 +401,7 @@
1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8));
dn->dn_datablksz = size;
dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT;
- dn->dn_datablkshift = ISP2(size) ? highbit(size - 1) : 0;
+ dn->dn_datablkshift = ISP2(size) ? highbit64(size - 1) : 0;
}
static dnode_t *
@@ -383,8 +408,9 @@
dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
uint64_t object, dnode_handle_t *dnh)
{
- dnode_t *dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
+ dnode_t *dn;
+ dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
ASSERT(!POINTER_IS_VALID(dn->dn_objset));
dn->dn_moved = 0;
@@ -421,13 +447,31 @@
ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
mutex_enter(&os->os_lock);
- list_insert_head(&os->os_dnodes, dn);
+ if (dnh->dnh_dnode != NULL) {
+ /* Lost the allocation race. */
+ mutex_exit(&os->os_lock);
+ kmem_cache_free(dnode_cache, dn);
+ return (dnh->dnh_dnode);
+ }
+
+ /*
+ * Exclude special dnodes from os_dnodes so an empty os_dnodes
+ * signifies that the special dnodes have no references from
+ * their children (the entries in os_dnodes). This allows
+ * dnode_destroy() to easily determine if the last child has
+ * been removed and then complete eviction of the objset.
+ */
+ if (!DMU_OBJECT_IS_SPECIAL(object))
+ list_insert_head(&os->os_dnodes, dn);
membar_producer();
+
/*
- * Everything else must be valid before assigning dn_objset makes the
- * dnode eligible for dnode_move().
+ * Everything else must be valid before assigning dn_objset
+ * makes the dnode eligible for dnode_move().
*/
dn->dn_objset = os;
+
+ dnh->dnh_dnode = dn;
mutex_exit(&os->os_lock);
arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER);
@@ -441,12 +485,18 @@
dnode_destroy(dnode_t *dn)
{
objset_t *os = dn->dn_objset;
+ boolean_t complete_os_eviction = B_FALSE;
ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0);
mutex_enter(&os->os_lock);
POINTER_INVALIDATE(&dn->dn_objset);
- list_remove(&os->os_dnodes, dn);
+ if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
+ list_remove(&os->os_dnodes, dn);
+ complete_os_eviction =
+ list_is_empty(&os->os_dnodes) &&
+ list_link_active(&os->os_evicting_node);
+ }
mutex_exit(&os->os_lock);
/* the dnode can no longer move, so we can release the handle */
@@ -463,7 +513,7 @@
}
if (dn->dn_bonus != NULL) {
mutex_enter(&dn->dn_bonus->db_mtx);
- dbuf_evict(dn->dn_bonus);
+ dbuf_destroy(dn->dn_bonus);
dn->dn_bonus = NULL;
}
dn->dn_zio = NULL;
@@ -476,10 +526,14 @@
dn->dn_newuid = 0;
dn->dn_newgid = 0;
dn->dn_id_flags = 0;
+ dn->dn_unlisted_l0_blkid = 0;
- dmu_zfetch_rele(&dn->dn_zfetch);
+ dmu_zfetch_fini(&dn->dn_zfetch);
kmem_cache_free(dnode_cache, dn);
arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER);
+
+ if (complete_os_eviction)
+ dmu_objset_evict_done(os);
}
void
@@ -488,10 +542,10 @@
{
int i;
+ ASSERT3U(blocksize, <=,
+ spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
if (blocksize == 0)
blocksize = 1 << zfs_default_bs;
- else if (blocksize > SPA_MAXBLOCKSIZE)
- blocksize = SPA_MAXBLOCKSIZE;
else
blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE);
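
Both dnode_allocate() above and dnode_set_blksz() further down now assert against
spa_maxblocksize() and round the request with P2ROUNDUP() instead of silently
clamping to SPA_MAXBLOCKSIZE. As a reminder of the round-up arithmetic, here is a
tiny user-space sketch; it assumes the conventional power-of-two formulation and a
512-byte minimum block size, neither of which is spelled out in this diff.

#include <stdio.h>
#include <stdint.h>

/*
 * Classic power-of-two round-up; P2ROUNDUP(x, align) behaves like this
 * when align is a power of two.
 */
static uint64_t
roundup_p2(uint64_t x, uint64_t align)
{
	return ((x + align - 1) & ~(align - 1));
}

int
main(void)
{
	/* e.g. a 1000-byte request becomes two 512-byte sectors */
	printf("%llu\n", (unsigned long long)roundup_p2(1000, 512));
	return (0);
}
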
@@ -519,7 +573,7 @@
ASSERT0(dn->dn_assigned_txg);
ASSERT(refcount_is_zero(&dn->dn_tx_holds));
ASSERT3U(refcount_count(&dn->dn_holds), <=, 1);
- ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
+ ASSERT(avl_is_empty(&dn->dn_dbufs));
for (i = 0; i < TXG_SIZE; i++) {
ASSERT0(dn->dn_next_nblkptr[i]);
@@ -531,7 +585,7 @@
ASSERT0(dn->dn_next_blksz[i]);
ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL);
- ASSERT0(avl_numnodes(&dn->dn_ranges[i]));
+ ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
}
dn->dn_type = ot;
@@ -572,7 +626,8 @@
int nblkptr;
ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
- ASSERT3U(blocksize, <=, SPA_MAXBLOCKSIZE);
+ ASSERT3U(blocksize, <=,
+ spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
ASSERT0(blocksize % SPA_MINBLOCKSIZE);
ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
ASSERT(tx->tx_txg != 0);
@@ -695,7 +750,8 @@
list_move_tail(&ndn->dn_dirty_records[i],
&odn->dn_dirty_records[i]);
}
- bcopy(&odn->dn_ranges[0], &ndn->dn_ranges[0], sizeof (odn->dn_ranges));
+ bcopy(&odn->dn_free_ranges[0], &ndn->dn_free_ranges[0],
+ sizeof (odn->dn_free_ranges));
ndn->dn_allocated_txg = odn->dn_allocated_txg;
ndn->dn_free_txg = odn->dn_free_txg;
ndn->dn_assigned_txg = odn->dn_assigned_txg;
@@ -703,9 +759,10 @@
ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset;
ASSERT(refcount_count(&odn->dn_tx_holds) == 0);
refcount_transfer(&ndn->dn_holds, &odn->dn_holds);
- ASSERT(list_is_empty(&ndn->dn_dbufs));
- list_move_tail(&ndn->dn_dbufs, &odn->dn_dbufs);
+ ASSERT(avl_is_empty(&ndn->dn_dbufs));
+ avl_swap(&ndn->dn_dbufs, &odn->dn_dbufs);
ndn->dn_dbufs_count = odn->dn_dbufs_count;
+ ndn->dn_unlisted_l0_blkid = odn->dn_unlisted_l0_blkid;
ndn->dn_bonus = odn->dn_bonus;
ndn->dn_have_spill = odn->dn_have_spill;
ndn->dn_zio = odn->dn_zio;
@@ -719,8 +776,6 @@
dmu_zfetch_init(&ndn->dn_zfetch, NULL);
list_move_tail(&ndn->dn_zfetch.zf_stream, &odn->dn_zfetch.zf_stream);
ndn->dn_zfetch.zf_dnode = odn->dn_zfetch.zf_dnode;
- ndn->dn_zfetch.zf_stream_cnt = odn->dn_zfetch.zf_stream_cnt;
- ndn->dn_zfetch.zf_alloc_fail = odn->dn_zfetch.zf_alloc_fail;
/*
* Update back pointers. Updating the handle fixes the back pointer of
@@ -737,9 +792,10 @@
*/
odn->dn_dbuf = NULL;
odn->dn_handle = NULL;
- list_create(&odn->dn_dbufs, sizeof (dmu_buf_impl_t),
+ avl_create(&odn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
offsetof(dmu_buf_impl_t, db_link));
odn->dn_dbufs_count = 0;
+ odn->dn_unlisted_l0_blkid = 0;
odn->dn_bonus = NULL;
odn->dn_zfetch.zf_dnode = NULL;
@@ -756,8 +812,7 @@
list_create(&odn->dn_dirty_records[i],
sizeof (dbuf_dirty_record_t),
offsetof(dbuf_dirty_record_t, dr_dirty_node));
- odn->dn_ranges[i].avl_root = NULL;
- odn->dn_ranges[i].avl_numnodes = 0;
+ odn->dn_free_ranges[i] = NULL;
odn->dn_next_nlevels[i] = 0;
odn->dn_next_indblkshift[i] = 0;
odn->dn_next_bonustype[i] = 0;
@@ -787,7 +842,7 @@
odn->dn_moved = (uint8_t)-1;
}
-#ifdef sun
+#ifdef illumos
#ifdef _KERNEL
/*ARGSUSED*/
static kmem_cbrc_t
@@ -930,7 +985,7 @@
return (KMEM_CBRC_YES);
}
#endif /* _KERNEL */
-#endif /* sun */
+#endif /* illumos */
void
dnode_special_close(dnode_handle_t *dnh)
@@ -945,6 +1000,8 @@
*/
while (refcount_count(&dn->dn_holds) > 0)
delay(1);
+ ASSERT(dn->dn_dbuf == NULL ||
+ dmu_buf_get_user(&dn->dn_dbuf->db) == NULL);
zrl_add(&dnh->dnh_zrlock);
dnode_destroy(dn); /* implicit zrl_remove() */
zrl_destroy(&dnh->dnh_zrlock);
@@ -951,27 +1008,24 @@
dnh->dnh_dnode = NULL;
}
-dnode_t *
+void
dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object,
dnode_handle_t *dnh)
{
- dnode_t *dn = dnode_create(os, dnp, NULL, object, dnh);
- dnh->dnh_dnode = dn;
+ dnode_t *dn;
+
+ dn = dnode_create(os, dnp, NULL, object, dnh);
zrl_init(&dnh->dnh_zrlock);
DNODE_VERIFY(dn);
- return (dn);
}
static void
-dnode_buf_pageout(dmu_buf_t *db, void *arg)
+dnode_buf_pageout(void *dbu)
{
- dnode_children_t *children_dnodes = arg;
+ dnode_children_t *children_dnodes = dbu;
int i;
- int epb = db->db_size >> DNODE_SHIFT;
- ASSERT(epb == children_dnodes->dnc_count);
-
- for (i = 0; i < epb; i++) {
+ for (i = 0; i < children_dnodes->dnc_count; i++) {
dnode_handle_t *dnh = &children_dnodes->dnc_children[i];
dnode_t *dn;
@@ -1001,7 +1055,7 @@
dnh->dnh_dnode = NULL;
}
kmem_free(children_dnodes, sizeof (dnode_children_t) +
- (epb - 1) * sizeof (dnode_handle_t));
+ children_dnodes->dnc_count * sizeof (dnode_handle_t));
}
/*
@@ -1062,7 +1116,7 @@
drop_struct_lock = TRUE;
}
- blk = dbuf_whichblock(mdn, object * sizeof (dnode_phys_t));
+ blk = dbuf_whichblock(mdn, 0, object * sizeof (dnode_phys_t));
db = dbuf_hold(mdn, blk, FTAG);
if (drop_struct_lock)
@@ -1086,17 +1140,23 @@
int i;
dnode_children_t *winner;
children_dnodes = kmem_zalloc(sizeof (dnode_children_t) +
- (epb - 1) * sizeof (dnode_handle_t), KM_SLEEP);
+ epb * sizeof (dnode_handle_t), KM_SLEEP);
children_dnodes->dnc_count = epb;
dnh = &children_dnodes->dnc_children[0];
for (i = 0; i < epb; i++) {
zrl_init(&dnh[i].dnh_zrlock);
- dnh[i].dnh_dnode = NULL;
}
- if (winner = dmu_buf_set_user(&db->db, children_dnodes, NULL,
- dnode_buf_pageout)) {
+ dmu_buf_init_user(&children_dnodes->dnc_dbu,
+ dnode_buf_pageout, NULL);
+ winner = dmu_buf_set_user(&db->db, &children_dnodes->dnc_dbu);
+ if (winner != NULL) {
+
+ for (i = 0; i < epb; i++) {
+ zrl_destroy(&dnh[i].dnh_zrlock);
+ }
+
kmem_free(children_dnodes, sizeof (dnode_children_t) +
- (epb - 1) * sizeof (dnode_handle_t));
+ epb * sizeof (dnode_handle_t));
children_dnodes = winner;
}
}
@@ -1104,17 +1164,11 @@
dnh = &children_dnodes->dnc_children[idx];
zrl_add(&dnh->dnh_zrlock);
- if ((dn = dnh->dnh_dnode) == NULL) {
+ dn = dnh->dnh_dnode;
+ if (dn == NULL) {
dnode_phys_t *phys = (dnode_phys_t *)db->db.db_data+idx;
- dnode_t *winner;
dn = dnode_create(os, phys, db, object, dnh);
- winner = atomic_cas_ptr(&dnh->dnh_dnode, NULL, dn);
- if (winner != NULL) {
- zrl_add(&dnh->dnh_zrlock);
- dnode_destroy(dn); /* implicit zrl_remove() */
- dn = winner;
- }
}
mutex_enter(&dn->dn_mtx);
@@ -1128,10 +1182,10 @@
dbuf_rele(db, FTAG);
return (type == DMU_OT_NONE ? ENOENT : EEXIST);
}
+ if (refcount_add(&dn->dn_holds, tag) == 1)
+ dbuf_add_ref(db, dnh);
mutex_exit(&dn->dn_mtx);
- if (refcount_add(&dn->dn_holds, tag) == 1)
- dbuf_add_ref(db, dnh);
/* Now we can rely on the hold to prevent the dnode from moving. */
zrl_remove(&dnh->dnh_zrlock);
@@ -1174,12 +1228,18 @@
void
dnode_rele(dnode_t *dn, void *tag)
{
+ mutex_enter(&dn->dn_mtx);
+ dnode_rele_and_unlock(dn, tag);
+}
+
+void
+dnode_rele_and_unlock(dnode_t *dn, void *tag)
+{
uint64_t refs;
/* Get while the hold prevents the dnode from moving. */
dmu_buf_impl_t *db = dn->dn_dbuf;
dnode_handle_t *dnh = dn->dn_handle;
- mutex_enter(&dn->dn_mtx);
refs = refcount_remove(&dn->dn_holds, tag);
mutex_exit(&dn->dn_mtx);
@@ -1243,7 +1303,8 @@
return;
}
- ASSERT(!refcount_is_zero(&dn->dn_holds) || list_head(&dn->dn_dbufs));
+ ASSERT(!refcount_is_zero(&dn->dn_holds) ||
+ !avl_is_empty(&dn->dn_dbufs));
ASSERT(dn->dn_datablksz != 0);
ASSERT0(dn->dn_next_bonuslen[txg&TXG_MASK]);
ASSERT0(dn->dn_next_blksz[txg&TXG_MASK]);
@@ -1316,13 +1377,12 @@
int
dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
{
- dmu_buf_impl_t *db, *db_next;
+ dmu_buf_impl_t *db;
int err;
+ ASSERT3U(size, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
if (size == 0)
size = SPA_MINBLOCKSIZE;
- if (size > SPA_MAXBLOCKSIZE)
- size = SPA_MAXBLOCKSIZE;
else
size = P2ROUNDUP(size, SPA_MINBLOCKSIZE);
@@ -1335,13 +1395,12 @@
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
/* Check for any allocated blocks beyond the first */
- if (dn->dn_phys->dn_maxblkid != 0)
+ if (dn->dn_maxblkid != 0)
goto fail;
mutex_enter(&dn->dn_dbufs_mtx);
- for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
- db_next = list_next(&dn->dn_dbufs, db);
-
+ for (db = avl_first(&dn->dn_dbufs); db != NULL;
+ db = AVL_NEXT(&dn->dn_dbufs, db)) {
if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID &&
db->db_blkid != DMU_SPILL_BLKID) {
mutex_exit(&dn->dn_dbufs_mtx);
@@ -1354,7 +1413,7 @@
goto fail;
/* resize the old block */
- err = dbuf_hold_impl(dn, 0, 0, TRUE, FTAG, &db);
+ err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db);
if (err == 0)
dbuf_new_size(db, size, tx);
else if (err != ENOENT)
@@ -1462,56 +1521,13 @@
rw_downgrade(&dn->dn_struct_rwlock);
}
-void
-dnode_clear_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
+static void
+dnode_dirty_l1(dnode_t *dn, uint64_t l1blkid, dmu_tx_t *tx)
{
- avl_tree_t *tree = &dn->dn_ranges[tx->tx_txg&TXG_MASK];
- avl_index_t where;
- free_range_t *rp;
- free_range_t rp_tofind;
- uint64_t endblk = blkid + nblks;
-
- ASSERT(MUTEX_HELD(&dn->dn_mtx));
- ASSERT(nblks <= UINT64_MAX - blkid); /* no overflow */
-
- dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
- blkid, nblks, tx->tx_txg);
- rp_tofind.fr_blkid = blkid;
- rp = avl_find(tree, &rp_tofind, &where);
- if (rp == NULL)
- rp = avl_nearest(tree, where, AVL_BEFORE);
- if (rp == NULL)
- rp = avl_nearest(tree, where, AVL_AFTER);
-
- while (rp && (rp->fr_blkid <= blkid + nblks)) {
- uint64_t fr_endblk = rp->fr_blkid + rp->fr_nblks;
- free_range_t *nrp = AVL_NEXT(tree, rp);
-
- if (blkid <= rp->fr_blkid && endblk >= fr_endblk) {
- /* clear this entire range */
- avl_remove(tree, rp);
- kmem_free(rp, sizeof (free_range_t));
- } else if (blkid <= rp->fr_blkid &&
- endblk > rp->fr_blkid && endblk < fr_endblk) {
- /* clear the beginning of this range */
- rp->fr_blkid = endblk;
- rp->fr_nblks = fr_endblk - endblk;
- } else if (blkid > rp->fr_blkid && blkid < fr_endblk &&
- endblk >= fr_endblk) {
- /* clear the end of this range */
- rp->fr_nblks = blkid - rp->fr_blkid;
- } else if (blkid > rp->fr_blkid && endblk < fr_endblk) {
- /* clear a chunk out of this range */
- free_range_t *new_rp =
- kmem_alloc(sizeof (free_range_t), KM_SLEEP);
-
- new_rp->fr_blkid = endblk;
- new_rp->fr_nblks = fr_endblk - endblk;
- avl_insert_here(tree, new_rp, rp, AVL_AFTER);
- rp->fr_nblks = blkid - rp->fr_blkid;
- }
- /* there may be no overlap */
- rp = nrp;
+ dmu_buf_impl_t *db = dbuf_hold_level(dn, 1, l1blkid, FTAG);
+ if (db != NULL) {
+ dmu_buf_will_dirty(&db->db, tx);
+ dbuf_rele(db, FTAG);
}
}
@@ -1529,7 +1545,7 @@
blkshift = dn->dn_datablkshift;
epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
- if (len == -1ULL) {
+ if (len == DMU_OBJECT_END) {
len = UINT64_MAX - off;
trunc = TRUE;
}
@@ -1545,7 +1561,13 @@
} else {
ASSERT(dn->dn_maxblkid == 0);
if (off == 0 && len >= blksz) {
- /* Freeing the whole block; fast-track this request */
+ /*
+ * Freeing the whole block; fast-track this request.
+ * Note that we won't dirty any indirect blocks,
+ * which is fine because we will be freeing the entire
+ * file and thus all indirect blocks will be freed
+ * by free_children().
+ */
blkid = 0;
nblks = 1;
goto done;
@@ -1564,8 +1586,8 @@
ASSERT3U(blkoff + head, ==, blksz);
if (len < head)
head = len;
- if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off), TRUE,
- FTAG, &db) == 0) {
+ if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off),
+ TRUE, FALSE, FTAG, &db) == 0) {
caddr_t data;
/* don't dirty if it isn't on disk and isn't dirty */
@@ -1572,7 +1594,7 @@
if (db->db_last_dirty ||
(db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
rw_exit(&dn->dn_struct_rwlock);
- dbuf_will_dirty(db, tx);
+ dmu_buf_will_dirty(&db->db, tx);
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
data = db->db.db_data;
bzero(data + blkoff, head);
@@ -1602,13 +1624,13 @@
if (tail) {
if (len < tail)
tail = len;
- if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len),
- TRUE, FTAG, &db) == 0) {
+ if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off+len),
+ TRUE, FALSE, FTAG, &db) == 0) {
/* don't dirty if not on disk and not dirty */
if (db->db_last_dirty ||
(db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
rw_exit(&dn->dn_struct_rwlock);
- dbuf_will_dirty(db, tx);
+ dmu_buf_will_dirty(&db->db, tx);
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
bzero(db->db.db_data, tail);
}
@@ -1629,45 +1651,70 @@
nblks += 1;
/*
- * Read in and mark all the level-1 indirects dirty,
- * so that they will stay in memory until syncing phase.
- * Always dirty the first and last indirect to make sure
- * we dirty all the partial indirects.
+ * Dirty all the indirect blocks in this range. Note that only
+ * the first and last indirect blocks can actually be written
+ * (if they were partially freed) -- they must be dirtied, even if
+ * they do not exist on disk yet. The interior blocks will
+ * be freed by free_children(), so they will not actually be written.
+ * Even though these interior blocks will not be written, we
+ * dirty them for two reasons:
+ *
+ * - It ensures that the indirect blocks remain in memory until
+ * syncing context. (They have already been prefetched by
+ * dmu_tx_hold_free(), so we don't have to worry about reading
+ * them serially here.)
+ *
+ * - The dirty space accounting will put pressure on the txg sync
+ * mechanism to begin syncing, and to delay transactions if there
+ * is a large amount of freeing. Even though these indirect
+ * blocks will not be written, we could need to write the same
+ * amount of space if we copy the freed BPs into deadlists.
*/
if (dn->dn_nlevels > 1) {
- uint64_t i, first, last;
- int shift = epbs + dn->dn_datablkshift;
+ uint64_t first, last;
first = blkid >> epbs;
- if (db = dbuf_hold_level(dn, 1, first, FTAG)) {
- dbuf_will_dirty(db, tx);
- dbuf_rele(db, FTAG);
- }
+ dnode_dirty_l1(dn, first, tx);
if (trunc)
last = dn->dn_maxblkid >> epbs;
else
last = (blkid + nblks - 1) >> epbs;
- if (last > first && (db = dbuf_hold_level(dn, 1, last, FTAG))) {
- dbuf_will_dirty(db, tx);
- dbuf_rele(db, FTAG);
- }
- for (i = first + 1; i < last; i++) {
+ if (last != first)
+ dnode_dirty_l1(dn, last, tx);
+
+ int shift = dn->dn_datablkshift + dn->dn_indblkshift -
+ SPA_BLKPTRSHIFT;
+ for (uint64_t i = first + 1; i < last; i++) {
+ /*
+ * Set i to the blockid of the next non-hole
+ * level-1 indirect block at or after i. Note
+ * that dnode_next_offset() operates in terms of
+ * level-0-equivalent bytes.
+ */
uint64_t ibyte = i << shift;
- int err;
-
- err = dnode_next_offset(dn,
- DNODE_FIND_HAVELOCK, &ibyte, 1, 1, 0);
+ int err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK,
+ &ibyte, 2, 1, 0);
i = ibyte >> shift;
- if (err == ESRCH || i >= last)
+ if (i >= last)
break;
- ASSERT(err == 0);
- db = dbuf_hold_level(dn, 1, i, FTAG);
- if (db) {
- dbuf_will_dirty(db, tx);
- dbuf_rele(db, FTAG);
- }
+
+ /*
+ * Normally we should not see an error, either
+ * from dnode_next_offset() or dbuf_hold_level()
+ * (except for ESRCH from dnode_next_offset).
+ * If there is an i/o error, then when we read
+ * this block in syncing context, it will use
+ * ZIO_FLAG_MUSTSUCCEED, and thus hang/panic according
+ * to the "failmode" property. dnode_next_offset()
+ * doesn't have a flag to indicate MUSTSUCCEED.
+ */
+ if (err != 0)
+ break;
+
+ dnode_dirty_l1(dn, i, tx);
}
}
+
done:
/*
* Add this range to the dnode range list.
@@ -1674,29 +1721,20 @@
* We will finish up this free operation in the syncing phase.
*/
mutex_enter(&dn->dn_mtx);
- dnode_clear_range(dn, blkid, nblks, tx);
- {
- free_range_t *rp, *found;
- avl_index_t where;
- avl_tree_t *tree = &dn->dn_ranges[tx->tx_txg&TXG_MASK];
-
- /* Add new range to dn_ranges */
- rp = kmem_alloc(sizeof (free_range_t), KM_SLEEP);
- rp->fr_blkid = blkid;
- rp->fr_nblks = nblks;
- found = avl_find(tree, rp, &where);
- ASSERT(found == NULL);
- avl_insert(tree, rp, where);
- dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
- blkid, nblks, tx->tx_txg);
+ int txgoff = tx->tx_txg & TXG_MASK;
+ if (dn->dn_free_ranges[txgoff] == NULL) {
+ dn->dn_free_ranges[txgoff] =
+ range_tree_create(NULL, NULL, &dn->dn_mtx);
}
+ range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks);
+ range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks);
+ dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
+ blkid, nblks, tx->tx_txg);
mutex_exit(&dn->dn_mtx);
dbuf_free_range(dn, blkid, blkid + nblks - 1, tx);
dnode_setdirty(dn, tx);
out:
- if (trunc && dn->dn_maxblkid >= (off >> blkshift))
- dn->dn_maxblkid = (off >> blkshift ? (off >> blkshift) - 1 : 0);
rw_exit(&dn->dn_struct_rwlock);
}
@@ -1719,7 +1757,6 @@
uint64_t
dnode_block_freed(dnode_t *dn, uint64_t blkid)
{
- free_range_t range_tofind;
void *dp = spa_get_dsl(dn->dn_objset->os_spa);
int i;
@@ -1739,21 +1776,11 @@
if (blkid == DMU_SPILL_BLKID)
return (dnode_spill_freed(dn));
- range_tofind.fr_blkid = blkid;
mutex_enter(&dn->dn_mtx);
for (i = 0; i < TXG_SIZE; i++) {
- free_range_t *range_found;
- avl_index_t idx;
-
- range_found = avl_find(&dn->dn_ranges[i], &range_tofind, &idx);
- if (range_found) {
- ASSERT(range_found->fr_nblks > 0);
+ if (dn->dn_free_ranges[i] != NULL &&
+ range_tree_contains(dn->dn_free_ranges[i], blkid, 1))
break;
- }
- range_found = avl_nearest(&dn->dn_ranges[i], idx, AVL_BEFORE);
- if (range_found &&
- range_found->fr_blkid + range_found->fr_nblks > blkid)
- break;
}
mutex_exit(&dn->dn_mtx);
return (i < TXG_SIZE);
@@ -1789,9 +1816,8 @@
}
/*
- * Call when we think we're going to write/free space in open context.
- * Be conservative (ie. OK to write less than this or free more than
- * this, but don't write more or free less).
+ * Call when we think we're going to write/free space in open context to track
+ * the amount of memory in use by the currently open txg.
*/
void
dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx)
@@ -1798,14 +1824,14 @@
{
objset_t *os = dn->dn_objset;
dsl_dataset_t *ds = os->os_dsl_dataset;
+ int64_t aspace = spa_get_asize(os->os_spa, space);
- if (space > 0)
- space = spa_get_asize(os->os_spa, space);
+ if (ds != NULL) {
+ dsl_dir_willuse_space(ds->ds_dir, aspace, tx);
+ dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx);
+ }
- if (ds)
- dsl_dir_willuse_space(ds->ds_dir, space, tx);
-
- dmu_tx_willuse_space(tx, space);
+ dmu_tx_willuse_space(tx, aspace);
}
/*
@@ -1828,7 +1854,7 @@
*/
static int
dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
- int lvl, uint64_t blkfill, uint64_t txg)
+ int lvl, uint64_t blkfill, uint64_t txg)
{
dmu_buf_impl_t *db = NULL;
void *data = NULL;
@@ -1850,8 +1876,8 @@
epb = dn->dn_phys->dn_nblkptr;
data = dn->dn_phys->dn_blkptr;
} else {
- uint64_t blkid = dbuf_whichblock(dn, *offset) >> (epbs * lvl);
- error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FTAG, &db);
+ uint64_t blkid = dbuf_whichblock(dn, lvl, *offset);
+ error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FALSE, FTAG, &db);
if (error) {
if (error != ENOENT)
return (error);
@@ -1874,8 +1900,10 @@
data = db->db.db_data;
}
- if (db && txg &&
- (db->db_blkptr == NULL || db->db_blkptr->blk_birth <= txg)) {
+
+ if (db != NULL && txg != 0 && (db->db_blkptr == NULL ||
+ db->db_blkptr->blk_birth <= txg ||
+ BP_IS_HOLE(db->db_blkptr))) {
/*
* This can only happen when we are searching up the tree
* and these conditions mean that we need to keep climbing.
@@ -1909,8 +1937,8 @@
*offset = *offset >> span;
for (i = BF64_GET(*offset, 0, epbs);
i >= 0 && i < epb; i += inc) {
- if (bp[i].blk_fill >= minfill &&
- bp[i].blk_fill <= maxfill &&
+ if (BP_GET_FILL(&bp[i]) >= minfill &&
+ BP_GET_FILL(&bp[i]) <= maxfill &&
(hole || bp[i].blk_birth > txg))
break;
if (inc > 0 || *offset > 0)
@@ -1997,6 +2025,15 @@
flags, offset, lvl, blkfill, txg);
}
+ /*
+ * There's always a "virtual hole" at the end of the object, even
+ * if all BP's which physically exist are non-holes.
+ */
+ if ((flags & DNODE_FIND_HOLE) && error == ESRCH && txg == 0 &&
+ minlvl == 1 && blkfill == 1 && !(flags & DNODE_FIND_BACKWARDS)) {
+ error = 0;
+ }
+
if (error == 0 && (flags & DNODE_FIND_BACKWARDS ?
initial_offset < *offset : initial_offset > *offset))
error = SET_ERROR(ESRCH);
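
Several hunks above replace the open-coded shift dbuf_whichblock(dn, offset) >>
(epbs * lvl) with a level-aware dbuf_whichblock(dn, lvl, offset). Assuming the
level-0 mapping is simply offset >> dn_datablkshift for power-of-two block sizes,
the arithmetic can be shown in isolation; the shift values and offset below are
invented examples, not fields read from a real dnode.

#include <stdio.h>
#include <stdint.h>

/* Block id containing 'offset' at indirection level 'level'. */
static uint64_t
blkid_at_level(uint64_t offset, int datablkshift, int epbs, int level)
{
	uint64_t l0 = offset >> datablkshift;	/* level-0 block id */
	return (l0 >> (epbs * level));		/* climb 'level' indirect levels */
}

int
main(void)
{
	int datablkshift = 17;		/* 128K data blocks */
	int epbs = 10;			/* example: 1024 block pointers per indirect block */
	uint64_t offset = 5ULL << 30;	/* 5 GiB into the object */

	for (int lvl = 0; lvl <= 2; lvl++)
		printf("level %d blkid = %llu\n", lvl, (unsigned long long)
		    blkid_at_level(offset, datablkshift, epbs, lvl));
	return (0);
}
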
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -22,7 +22,8 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -33,6 +34,8 @@
#include <sys/dmu_objset.h>
#include <sys/dsl_dataset.h>
#include <sys/spa.h>
+#include <sys/range_tree.h>
+#include <sys/zfeature.h>
static void
dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
@@ -58,24 +61,19 @@
dprintf("os=%p obj=%llu, increase to %d\n", dn->dn_objset,
dn->dn_object, dn->dn_phys->dn_nlevels);
- /* check for existing blkptrs in the dnode */
- for (i = 0; i < nblkptr; i++)
- if (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[i]))
- break;
- if (i != nblkptr) {
- /* transfer dnode's block pointers to new indirect block */
- (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED|DB_RF_HAVESTRUCT);
- ASSERT(db->db.db_data);
- ASSERT(arc_released(db->db_buf));
- ASSERT3U(sizeof (blkptr_t) * nblkptr, <=, db->db.db_size);
- bcopy(dn->dn_phys->dn_blkptr, db->db.db_data,
- sizeof (blkptr_t) * nblkptr);
- arc_buf_freeze(db->db_buf);
- }
+ /* transfer dnode's block pointers to new indirect block */
+ (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED|DB_RF_HAVESTRUCT);
+ ASSERT(db->db.db_data);
+ ASSERT(arc_released(db->db_buf));
+ ASSERT3U(sizeof (blkptr_t) * nblkptr, <=, db->db.db_size);
+ bcopy(dn->dn_phys->dn_blkptr, db->db.db_data,
+ sizeof (blkptr_t) * nblkptr);
+ arc_buf_freeze(db->db_buf);
/* set dbuf's parent pointers to new indirect buf */
for (i = 0; i < nblkptr; i++) {
- dmu_buf_impl_t *child = dbuf_find(dn, old_toplvl, i);
+ dmu_buf_impl_t *child =
+ dbuf_find(dn->dn_objset, dn->dn_object, old_toplvl, i);
if (child == NULL)
continue;
@@ -113,26 +111,44 @@
rw_exit(&dn->dn_struct_rwlock);
}
-static int
+static void
free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx)
{
dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
uint64_t bytesfreed = 0;
- int i, blocks_freed = 0;
dprintf("ds=%p obj=%llx num=%d\n", ds, dn->dn_object, num);
- for (i = 0; i < num; i++, bp++) {
+ for (int i = 0; i < num; i++, bp++) {
if (BP_IS_HOLE(bp))
continue;
bytesfreed += dsl_dataset_block_kill(ds, bp, tx, B_FALSE);
ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys));
+
+ /*
+ * Save some useful information on the holes being
+ * punched, including logical size, type, and indirection
+		 * level. Retaining the birth time enables detection of when
+		 * holes are punched, which reduces the number of free
+		 * records transmitted during a zfs send.
+ */
+
+ uint64_t lsize = BP_GET_LSIZE(bp);
+ dmu_object_type_t type = BP_GET_TYPE(bp);
+ uint64_t lvl = BP_GET_LEVEL(bp);
+
bzero(bp, sizeof (blkptr_t));
- blocks_freed += 1;
+
+ if (spa_feature_is_active(dn->dn_objset->os_spa,
+ SPA_FEATURE_HOLE_BIRTH)) {
+ BP_SET_LSIZE(bp, lsize);
+ BP_SET_TYPE(bp, type);
+ BP_SET_LEVEL(bp, lvl);
+ BP_SET_BIRTH(bp, dmu_tx_get_txg(tx), 0);
+ }
}
dnode_diduse_space(dn, -bytesfreed);
- return (blocks_freed);
}
#ifdef ZFS_DEBUG
@@ -167,7 +183,7 @@
rw_enter(&dn->dn_struct_rwlock, RW_READER);
err = dbuf_hold_impl(dn, db->db_level-1,
- (db->db_blkid << epbs) + i, TRUE, FTAG, &child);
+ (db->db_blkid << epbs) + i, TRUE, FALSE, FTAG, &child);
rw_exit(&dn->dn_struct_rwlock);
if (err == ENOENT)
continue;
@@ -214,10 +230,8 @@
}
#endif
-#define ALL -1
-
-static int
-free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
+static void
+free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks,
dmu_tx_t *tx)
{
dnode_t *dn;
@@ -224,22 +238,19 @@
blkptr_t *bp;
dmu_buf_impl_t *subdb;
uint64_t start, end, dbstart, dbend, i;
- int epbs, shift, err;
- int all = TRUE;
- int blocks_freed = 0;
+ int epbs, shift;
/*
* There is a small possibility that this block will not be cached:
* 1 - if level > 1 and there are no children with level <= 1
- * 2 - if we didn't get a dirty hold (because this block had just
- * finished being written -- and so had no holds), and then this
- * block got evicted before we got here.
+ * 2 - if this block was evicted since we read it from
+ * dmu_tx_hold_free().
*/
if (db->db_state != DB_CACHED)
(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
dbuf_release_bp(db);
- bp = (blkptr_t *)db->db.db_data;
+ bp = db->db.db_data;
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
@@ -249,7 +260,6 @@
start = blkid >> shift;
if (dbstart < start) {
bp += start - dbstart;
- all = FALSE;
} else {
start = dbstart;
}
@@ -257,49 +267,46 @@
end = (blkid + nblks - 1) >> shift;
if (dbend <= end)
end = dbend;
- else if (all)
- all = trunc;
+
ASSERT3U(start, <=, end);
if (db->db_level == 1) {
FREE_VERIFY(db, start, end, tx);
- blocks_freed = free_blocks(dn, bp, end-start+1, tx);
- arc_buf_freeze(db->db_buf);
- ASSERT(all || blocks_freed == 0 || db->db_last_dirty);
- DB_DNODE_EXIT(db);
- return (all ? ALL : blocks_freed);
+ free_blocks(dn, bp, end-start+1, tx);
+ } else {
+ for (i = start; i <= end; i++, bp++) {
+ if (BP_IS_HOLE(bp))
+ continue;
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ VERIFY0(dbuf_hold_impl(dn, db->db_level - 1,
+ i, TRUE, FALSE, FTAG, &subdb));
+ rw_exit(&dn->dn_struct_rwlock);
+ ASSERT3P(bp, ==, subdb->db_blkptr);
+
+ free_children(subdb, blkid, nblks, tx);
+ dbuf_rele(subdb, FTAG);
+ }
}
- for (i = start; i <= end; i++, bp++) {
- if (BP_IS_HOLE(bp))
- continue;
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- err = dbuf_hold_impl(dn, db->db_level-1, i, TRUE, FTAG, &subdb);
- ASSERT0(err);
- rw_exit(&dn->dn_struct_rwlock);
+ /* If this whole block is free, free ourself too. */
+ for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) {
+ if (!BP_IS_HOLE(bp))
+ break;
+ }
+ if (i == 1 << epbs) {
+ /* didn't find any non-holes */
+ bzero(db->db.db_data, db->db.db_size);
+ free_blocks(dn, db->db_blkptr, 1, tx);
+ } else {
+ /*
+ * Partial block free; must be marked dirty so that it
+ * will be written out.
+ */
+ ASSERT(db->db_dirtycnt > 0);
+ }
- if (free_children(subdb, blkid, nblks, trunc, tx) == ALL) {
- ASSERT3P(subdb->db_blkptr, ==, bp);
- blocks_freed += free_blocks(dn, bp, 1, tx);
- } else {
- all = FALSE;
- }
- dbuf_rele(subdb, FTAG);
- }
DB_DNODE_EXIT(db);
arc_buf_freeze(db->db_buf);
-#ifdef ZFS_DEBUG
- bp -= (end-start)+1;
- for (i = start; i <= end; i++, bp++) {
- if (i == start && blkid != 0)
- continue;
- else if (i == end && !trunc)
- continue;
- ASSERT0(bp->blk_birth);
- }
-#endif
- ASSERT(all || blocks_freed == 0 || db->db_last_dirty);
- return (all ? ALL : blocks_freed);
}
/*
@@ -307,20 +314,21 @@
* and "free" all the blocks contained there.
*/
static void
-dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
+dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks,
+ dmu_tx_t *tx)
{
blkptr_t *bp = dn->dn_phys->dn_blkptr;
- dmu_buf_impl_t *db;
- int trunc, start, end, shift, i, err;
int dnlevel = dn->dn_phys->dn_nlevels;
+ boolean_t trunc = B_FALSE;
if (blkid > dn->dn_phys->dn_maxblkid)
return;
ASSERT(dn->dn_phys->dn_maxblkid < UINT64_MAX);
- trunc = blkid + nblks > dn->dn_phys->dn_maxblkid;
- if (trunc)
+ if (blkid + nblks > dn->dn_phys->dn_maxblkid) {
nblks = dn->dn_phys->dn_maxblkid - blkid + 1;
+ trunc = B_TRUE;
+ }
/* There are no indirect blocks in the object */
if (dnlevel == 1) {
@@ -329,41 +337,34 @@
return;
}
ASSERT3U(blkid + nblks, <=, dn->dn_phys->dn_nblkptr);
- (void) free_blocks(dn, bp + blkid, nblks, tx);
- if (trunc) {
- uint64_t off = (dn->dn_phys->dn_maxblkid + 1) *
- (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT);
- dn->dn_phys->dn_maxblkid = (blkid ? blkid - 1 : 0);
- ASSERT(off < dn->dn_phys->dn_maxblkid ||
- dn->dn_phys->dn_maxblkid == 0 ||
- dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0);
- }
- return;
- }
+ free_blocks(dn, bp + blkid, nblks, tx);
+ } else {
+ int shift = (dnlevel - 1) *
+ (dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT);
+ int start = blkid >> shift;
+ int end = (blkid + nblks - 1) >> shift;
+ dmu_buf_impl_t *db;
- shift = (dnlevel - 1) * (dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT);
- start = blkid >> shift;
- ASSERT(start < dn->dn_phys->dn_nblkptr);
- end = (blkid + nblks - 1) >> shift;
- bp += start;
- for (i = start; i <= end; i++, bp++) {
- if (BP_IS_HOLE(bp))
- continue;
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- err = dbuf_hold_impl(dn, dnlevel-1, i, TRUE, FTAG, &db);
- ASSERT0(err);
- rw_exit(&dn->dn_struct_rwlock);
+ ASSERT(start < dn->dn_phys->dn_nblkptr);
+ bp += start;
+ for (int i = start; i <= end; i++, bp++) {
+ if (BP_IS_HOLE(bp))
+ continue;
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ VERIFY0(dbuf_hold_impl(dn, dnlevel - 1, i,
+ TRUE, FALSE, FTAG, &db));
+ rw_exit(&dn->dn_struct_rwlock);
- if (free_children(db, blkid, nblks, trunc, tx) == ALL) {
- ASSERT3P(db->db_blkptr, ==, bp);
- (void) free_blocks(dn, bp, 1, tx);
+ free_children(db, blkid, nblks, tx);
+ dbuf_rele(db, FTAG);
}
- dbuf_rele(db, FTAG);
}
+
if (trunc) {
+ dn->dn_phys->dn_maxblkid = blkid == 0 ? 0 : blkid - 1;
+
uint64_t off = (dn->dn_phys->dn_maxblkid + 1) *
(dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT);
- dn->dn_phys->dn_maxblkid = (blkid ? blkid - 1 : 0);
ASSERT(off < dn->dn_phys->dn_maxblkid ||
dn->dn_phys->dn_maxblkid == 0 ||
dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0);
@@ -370,6 +371,22 @@
}
}
+typedef struct dnode_sync_free_range_arg {
+ dnode_t *dsfra_dnode;
+ dmu_tx_t *dsfra_tx;
+} dnode_sync_free_range_arg_t;
+
+static void
+dnode_sync_free_range(void *arg, uint64_t blkid, uint64_t nblks)
+{
+ dnode_sync_free_range_arg_t *dsfra = arg;
+ dnode_t *dn = dsfra->dsfra_dnode;
+
+ mutex_exit(&dn->dn_mtx);
+ dnode_sync_free_range_impl(dn, blkid, nblks, dsfra->dsfra_tx);
+ mutex_enter(&dn->dn_mtx);
+}
+
/*
* Try to kick all the dnode's dbufs out of the cache...
*/
@@ -376,59 +393,54 @@
void
dnode_evict_dbufs(dnode_t *dn)
{
- int progress;
- int pass = 0;
+ dmu_buf_impl_t db_marker;
+ dmu_buf_impl_t *db, *db_next;
- do {
- dmu_buf_impl_t *db, marker;
- int evicting = FALSE;
+ mutex_enter(&dn->dn_dbufs_mtx);
+ for (db = avl_first(&dn->dn_dbufs); db != NULL; db = db_next) {
- progress = FALSE;
- mutex_enter(&dn->dn_dbufs_mtx);
- list_insert_tail(&dn->dn_dbufs, &marker);
- db = list_head(&dn->dn_dbufs);
-		for (; db != &marker; db = list_head(&dn->dn_dbufs)) {
- list_remove(&dn->dn_dbufs, db);
- list_insert_tail(&dn->dn_dbufs, db);
#ifdef DEBUG
- DB_DNODE_ENTER(db);
- ASSERT3P(DB_DNODE(db), ==, dn);
- DB_DNODE_EXIT(db);
+ DB_DNODE_ENTER(db);
+ ASSERT3P(DB_DNODE(db), ==, dn);
+ DB_DNODE_EXIT(db);
#endif /* DEBUG */
- mutex_enter(&db->db_mtx);
- if (db->db_state == DB_EVICTING) {
- progress = TRUE;
- evicting = TRUE;
- mutex_exit(&db->db_mtx);
- } else if (refcount_is_zero(&db->db_holds)) {
- progress = TRUE;
- dbuf_clear(db); /* exits db_mtx for us */
- } else {
- mutex_exit(&db->db_mtx);
- }
+ mutex_enter(&db->db_mtx);
+ if (db->db_state != DB_EVICTING &&
+ refcount_is_zero(&db->db_holds)) {
+ db_marker.db_level = db->db_level;
+ db_marker.db_blkid = db->db_blkid;
+ db_marker.db_state = DB_SEARCH;
+ avl_insert_here(&dn->dn_dbufs, &db_marker, db,
+ AVL_BEFORE);
+ dbuf_destroy(db);
+
+ db_next = AVL_NEXT(&dn->dn_dbufs, &db_marker);
+ avl_remove(&dn->dn_dbufs, &db_marker);
+ } else {
+ db->db_pending_evict = TRUE;
+ mutex_exit(&db->db_mtx);
+ db_next = AVL_NEXT(&dn->dn_dbufs, db);
}
- list_remove(&dn->dn_dbufs, &marker);
- /*
- * NB: we need to drop dn_dbufs_mtx between passes so
- * that any DB_EVICTING dbufs can make progress.
- * Ideally, we would have some cv we could wait on, but
- * since we don't, just wait a bit to give the other
- * thread a chance to run.
- */
- mutex_exit(&dn->dn_dbufs_mtx);
- if (evicting)
- delay(1);
- pass++;
- ASSERT(pass < 100); /* sanity check */
- } while (progress);
+ }
+ mutex_exit(&dn->dn_dbufs_mtx);
+ dnode_evict_bonus(dn);
+}
+
+void
+dnode_evict_bonus(dnode_t *dn)
+{
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
- if (dn->dn_bonus && refcount_is_zero(&dn->dn_bonus->db_holds)) {
- mutex_enter(&dn->dn_bonus->db_mtx);
- dbuf_evict(dn->dn_bonus);
- dn->dn_bonus = NULL;
+ if (dn->dn_bonus != NULL) {
+ if (refcount_is_zero(&dn->dn_bonus->db_holds)) {
+ mutex_enter(&dn->dn_bonus->db_mtx);
+ dbuf_destroy(dn->dn_bonus);
+ dn->dn_bonus = NULL;
+ } else {
+ dn->dn_bonus->db_pending_evict = TRUE;
+ }
}
rw_exit(&dn->dn_struct_rwlock);
}
@@ -456,8 +468,8 @@
dr->dt.dl.dr_data == db->db_buf);
dbuf_unoverride(dr);
} else {
+ mutex_destroy(&dr->dt.di.dr_mtx);
list_destroy(&dr->dt.di.dr_children);
- mutex_destroy(&dr->dt.di.dr_mtx);
}
kmem_free(dr, sizeof (dbuf_dirty_record_t));
dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
@@ -480,8 +492,6 @@
dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]);
dnode_evict_dbufs(dn);
- ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
- ASSERT3P(dn->dn_bonus, ==, NULL);
/*
* XXX - It would be nice to assert this, but we may still
@@ -504,7 +514,7 @@
ASSERT(dn->dn_free_txg > 0);
if (dn->dn_allocated_txg != dn->dn_free_txg)
- dbuf_will_dirty(dn->dn_dbuf, tx);
+ dmu_buf_will_dirty(&dn->dn_dbuf->db, tx);
bzero(dn->dn_phys, sizeof (dnode_phys_t));
mutex_enter(&dn->dn_mtx);
@@ -530,7 +540,6 @@
void
dnode_sync(dnode_t *dn, dmu_tx_t *tx)
{
- free_range_t *rp;
dnode_phys_t *dnp = dn->dn_phys;
int txgoff = tx->tx_txg & TXG_MASK;
list_t *list = &dn->dn_dirty_records[txgoff];
@@ -572,26 +581,34 @@
dnp->dn_bonustype = dn->dn_bonustype;
dnp->dn_bonuslen = dn->dn_bonuslen;
}
-
ASSERT(dnp->dn_nlevels > 1 ||
BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
+ BP_IS_EMBEDDED(&dnp->dn_blkptr[0]) ||
BP_GET_LSIZE(&dnp->dn_blkptr[0]) ==
dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+ ASSERT(dnp->dn_nlevels < 2 ||
+ BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
+ BP_GET_LSIZE(&dnp->dn_blkptr[0]) == 1 << dnp->dn_indblkshift);
- if (dn->dn_next_blksz[txgoff]) {
+ if (dn->dn_next_type[txgoff] != 0) {
+ dnp->dn_type = dn->dn_type;
+ dn->dn_next_type[txgoff] = 0;
+ }
+
+ if (dn->dn_next_blksz[txgoff] != 0) {
ASSERT(P2PHASE(dn->dn_next_blksz[txgoff],
SPA_MINBLOCKSIZE) == 0);
ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[0]) ||
dn->dn_maxblkid == 0 || list_head(list) != NULL ||
- avl_last(&dn->dn_ranges[txgoff]) ||
dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT ==
- dnp->dn_datablkszsec);
+ dnp->dn_datablkszsec ||
+ range_tree_space(dn->dn_free_ranges[txgoff]) != 0);
dnp->dn_datablkszsec =
dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT;
dn->dn_next_blksz[txgoff] = 0;
}
- if (dn->dn_next_bonuslen[txgoff]) {
+ if (dn->dn_next_bonuslen[txgoff] != 0) {
if (dn->dn_next_bonuslen[txgoff] == DN_ZERO_BONUSLEN)
dnp->dn_bonuslen = 0;
else
@@ -600,25 +617,26 @@
dn->dn_next_bonuslen[txgoff] = 0;
}
- if (dn->dn_next_bonustype[txgoff]) {
+ if (dn->dn_next_bonustype[txgoff] != 0) {
ASSERT(DMU_OT_IS_VALID(dn->dn_next_bonustype[txgoff]));
dnp->dn_bonustype = dn->dn_next_bonustype[txgoff];
dn->dn_next_bonustype[txgoff] = 0;
}
+ boolean_t freeing_dnode = dn->dn_free_txg > 0 &&
+ dn->dn_free_txg <= tx->tx_txg;
+
/*
- * We will either remove a spill block when a file is being removed
- * or we have been asked to remove it.
+ * Remove the spill block if we have been explicitly asked to
+ * remove it, or if the object is being removed.
*/
- if (dn->dn_rm_spillblk[txgoff] ||
- ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) &&
- dn->dn_free_txg > 0 && dn->dn_free_txg <= tx->tx_txg)) {
- if ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
+ if (dn->dn_rm_spillblk[txgoff] || freeing_dnode) {
+ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
kill_spill = B_TRUE;
dn->dn_rm_spillblk[txgoff] = 0;
}
- if (dn->dn_next_indblkshift[txgoff]) {
+ if (dn->dn_next_indblkshift[txgoff] != 0) {
ASSERT(dnp->dn_nlevels == 1);
dnp->dn_indblkshift = dn->dn_next_indblkshift[txgoff];
dn->dn_next_indblkshift[txgoff] = 0;
@@ -635,7 +653,7 @@
mutex_exit(&dn->dn_mtx);
if (kill_spill) {
- (void) free_blocks(dn, &dn->dn_phys->dn_spill, 1, tx);
+ free_blocks(dn, &dn->dn_phys->dn_spill, 1, tx);
mutex_enter(&dn->dn_mtx);
dnp->dn_flags &= ~DNODE_FLAG_SPILL_BLKPTR;
mutex_exit(&dn->dn_mtx);
@@ -642,20 +660,28 @@
}
/* process all the "freed" ranges in the file */
- while (rp = avl_last(&dn->dn_ranges[txgoff])) {
- dnode_sync_free_range(dn, rp->fr_blkid, rp->fr_nblks, tx);
- /* grab the mutex so we don't race with dnode_block_freed() */
+ if (dn->dn_free_ranges[txgoff] != NULL) {
+ dnode_sync_free_range_arg_t dsfra;
+ dsfra.dsfra_dnode = dn;
+ dsfra.dsfra_tx = tx;
mutex_enter(&dn->dn_mtx);
- avl_remove(&dn->dn_ranges[txgoff], rp);
+ range_tree_vacate(dn->dn_free_ranges[txgoff],
+ dnode_sync_free_range, &dsfra);
+ range_tree_destroy(dn->dn_free_ranges[txgoff]);
+ dn->dn_free_ranges[txgoff] = NULL;
mutex_exit(&dn->dn_mtx);
- kmem_free(rp, sizeof (free_range_t));
}
- if (dn->dn_free_txg > 0 && dn->dn_free_txg <= tx->tx_txg) {
+ if (freeing_dnode) {
dnode_sync_free(dn, tx);
return;
}
+ if (dn->dn_next_nlevels[txgoff]) {
+ dnode_increase_indirection(dn, tx);
+ dn->dn_next_nlevels[txgoff] = 0;
+ }
+
if (dn->dn_next_nblkptr[txgoff]) {
/* this should only happen on a realloc */
ASSERT(dn->dn_allocated_txg == tx->tx_txg);
@@ -680,13 +706,8 @@
mutex_exit(&dn->dn_mtx);
}
- if (dn->dn_next_nlevels[txgoff]) {
- dnode_increase_indirection(dn, tx);
- dn->dn_next_nlevels[txgoff] = 0;
- }
+ dbuf_sync_list(list, dn->dn_phys->dn_nlevels - 1, tx);
- dbuf_sync_list(list, tx);
-
if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
ASSERT3P(list_head(list), ==, NULL);
dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
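
The free_blocks() change above keeps a punched hole's logical size, type, and
level, plus the freeing txg as its birth time, whenever the hole_birth feature is
active. The sketch below models that bookkeeping with a plain struct and made-up
field values; it deliberately avoids the real blkptr_t layout and BP_SET_* macros.

#include <stdio.h>
#include <stdint.h>
#include <string.h>

struct fake_bp {			/* stand-in for blkptr_t */
	uint64_t lsize;			/* logical size of the freed block */
	int	 type;			/* object type */
	int	 level;			/* indirection level */
	uint64_t birth;			/* txg in which the hole was punched */
};

static void
punch_hole(struct fake_bp *bp, uint64_t freeing_txg, int hole_birth_active)
{
	uint64_t lsize = bp->lsize;	/* remember the block's shape */
	int type = bp->type;
	int level = bp->level;

	memset(bp, 0, sizeof (*bp));	/* the bzero() in free_blocks() */

	if (hole_birth_active) {
		/* a birth-stamped hole: zeroed BP plus retained metadata */
		bp->lsize = lsize;
		bp->type = type;
		bp->level = level;
		bp->birth = freeing_txg;
	}
}

int
main(void)
{
	struct fake_bp bp = { 131072, 42, 0, 1234 };	/* arbitrary values */

	punch_hole(&bp, 2000, 1);
	printf("hole birth txg %llu, lsize %llu\n",
	    (unsigned long long)bp.birth, (unsigned long long)bp.lsize);
	return (0);
}

As the comment in free_blocks() notes, those birth-stamped holes are what allow a
zfs send stream to carry fewer free records.
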
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,9 +21,13 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
- * Copyright (c) 2012, Joyent, Inc. All rights reserved.
* Portions Copyright (c) 2011 Martin Matuska <mm at FreeBSD.org>
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 RackTop Systems.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2016, OmniTI Computer Consulting, Inc. All rights reserved.
*/
#include <sys/dmu_objset.h>
@@ -33,6 +37,7 @@
#include <sys/dsl_synctask.h>
#include <sys/dmu_traverse.h>
#include <sys/dmu_impl.h>
+#include <sys/dmu_send.h>
#include <sys/dmu_tx.h>
#include <sys/arc.h>
#include <sys/zio.h>
@@ -49,7 +54,28 @@
#include <sys/dsl_deadlist.h>
#include <sys/dsl_destroy.h>
#include <sys/dsl_userhold.h>
+#include <sys/dsl_bookmark.h>
+#include <sys/dmu_send.h>
+#include <sys/zio_checksum.h>
+#include <sys/zio_compress.h>
+#include <zfs_fletcher.h>
+SYSCTL_DECL(_vfs_zfs);
+
+/*
+ * The SPA supports block sizes up to 16MB. However, very large blocks
+ * can have an impact on i/o latency (e.g. tying up a spinning disk for
+ * ~300ms), and also potentially on the memory allocator. Therefore,
+ * we do not allow the recordsize to be set larger than zfs_max_recordsize
+ * (default 1MB). Larger blocks can be created by changing this tunable,
+ * and pools with larger blocks can always be imported and used, regardless
+ * of this setting.
+ */
+int zfs_max_recordsize = 1 * 1024 * 1024;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, max_recordsize, CTLFLAG_RWTUN,
+ &zfs_max_recordsize, 0,
+ "Maximum block size. Expect dragons when tuning this.");
+
#define SWITCH64(x, y) \
{ \
uint64_t __tmp = (x); \
@@ -59,8 +85,12 @@
#define DS_REF_MAX (1ULL << 62)
-#define DSL_DEADLIST_BLOCKSIZE SPA_MAXBLOCKSIZE
+extern inline dsl_dataset_phys_t *dsl_dataset_phys(dsl_dataset_t *ds);
+extern int spa_asize_inflation;
+
+static zil_header_t zero_zil;
+
/*
 * Figure out how much of this delta should be propagated to the dsl_dir
* layer. If there's a refreservation, that space has already been
@@ -69,13 +99,15 @@
static int64_t
parent_delta(dsl_dataset_t *ds, int64_t delta)
{
+ dsl_dataset_phys_t *ds_phys;
uint64_t old_bytes, new_bytes;
if (ds->ds_reserved == 0)
return (delta);
- old_bytes = MAX(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
- new_bytes = MAX(ds->ds_phys->ds_unique_bytes + delta, ds->ds_reserved);
+ ds_phys = dsl_dataset_phys(ds);
+ old_bytes = MAX(ds_phys->ds_unique_bytes, ds->ds_reserved);
+ new_bytes = MAX(ds_phys->ds_unique_bytes + delta, ds->ds_reserved);
ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta));
return (new_bytes - old_bytes);
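
parent_delta() above decides how much of a space delta leaks past the refreservation into the dsl_dir accounting: deltas that stay inside the reservation were already charged to it. A self-contained sketch of that arithmetic, with struct ds_demo and parent_delta_demo as made-up stand-ins for the two fields the real function reads:

#include <stdint.h>
#include <stdio.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))

/* Hypothetical stand-ins for the fields parent_delta() consults. */
struct ds_demo {
    uint64_t unique_bytes;  /* dsl_dataset_phys(ds)->ds_unique_bytes */
    uint64_t reserved;      /* ds->ds_reserved (refreservation) */
};

static int64_t
parent_delta_demo(const struct ds_demo *ds, int64_t delta)
{
    if (ds->reserved == 0)
        return (delta);
    uint64_t old_bytes = MAX(ds->unique_bytes, ds->reserved);
    uint64_t new_bytes = MAX(ds->unique_bytes + delta, ds->reserved);
    return ((int64_t)(new_bytes - old_bytes));
}

int
main(void)
{
    /*
     * 80 MB unique, 100 MB refreservation: the first 20 MB of growth is
     * already covered by the reservation, so only 10 MB of a 30 MB delta
     * propagates to the dsl_dir layer, and a 10 MB delta propagates nothing.
     */
    struct ds_demo ds = { .unique_bytes = 80 << 20, .reserved = 100 << 20 };
    printf("%lld\n", (long long)parent_delta_demo(&ds, 30 << 20));  /* 10485760 */
    printf("%lld\n", (long long)parent_delta_demo(&ds, 10 << 20));  /* 0 */
    return (0);
}
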
@@ -102,21 +134,30 @@
used, compressed, uncompressed);
return;
}
+
+ ASSERT3U(bp->blk_birth, >, dsl_dataset_phys(ds)->ds_prev_snap_txg);
dmu_buf_will_dirty(ds->ds_dbuf, tx);
-
- mutex_enter(&ds->ds_dir->dd_lock);
mutex_enter(&ds->ds_lock);
delta = parent_delta(ds, used);
- ds->ds_phys->ds_referenced_bytes += used;
- ds->ds_phys->ds_compressed_bytes += compressed;
- ds->ds_phys->ds_uncompressed_bytes += uncompressed;
- ds->ds_phys->ds_unique_bytes += used;
+ dsl_dataset_phys(ds)->ds_referenced_bytes += used;
+ dsl_dataset_phys(ds)->ds_compressed_bytes += compressed;
+ dsl_dataset_phys(ds)->ds_uncompressed_bytes += uncompressed;
+ dsl_dataset_phys(ds)->ds_unique_bytes += used;
+
+ if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE) {
+ ds->ds_feature_activation_needed[SPA_FEATURE_LARGE_BLOCKS] =
+ B_TRUE;
+ }
+
+ spa_feature_t f = zio_checksum_to_feature(BP_GET_CHECKSUM(bp));
+ if (f != SPA_FEATURE_NONE)
+ ds->ds_feature_activation_needed[f] = B_TRUE;
+
mutex_exit(&ds->ds_lock);
dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta,
compressed, uncompressed, tx);
dsl_dir_transfer_space(ds->ds_dir, used - delta,
- DD_USED_REFRSRV, DD_USED_HEAD, tx);
- mutex_exit(&ds->ds_dir->dd_lock);
+ DD_USED_REFRSRV, DD_USED_HEAD, NULL);
}
int
@@ -123,6 +164,10 @@
dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx,
boolean_t async)
{
+ int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
+ int compressed = BP_GET_PSIZE(bp);
+ int uncompressed = BP_GET_UCSIZE(bp);
+
if (BP_IS_HOLE(bp))
return (0);
@@ -129,11 +174,6 @@
ASSERT(dmu_tx_is_syncing(tx));
ASSERT(bp->blk_birth <= tx->tx_txg);
- int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp);
- int compressed = BP_GET_PSIZE(bp);
- int uncompressed = BP_GET_UCSIZE(bp);
-
- ASSERT(used > 0);
if (ds == NULL) {
dsl_free(tx->tx_pool, tx->tx_txg, bp);
dsl_pool_mos_diduse_space(tx->tx_pool,
@@ -142,27 +182,25 @@
}
ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool);
- ASSERT(!dsl_dataset_is_snapshot(ds));
+ ASSERT(!ds->ds_is_snapshot);
dmu_buf_will_dirty(ds->ds_dbuf, tx);
- if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) {
+ if (bp->blk_birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
int64_t delta;
dprintf_bp(bp, "freeing ds=%llu", ds->ds_object);
dsl_free(tx->tx_pool, tx->tx_txg, bp);
- mutex_enter(&ds->ds_dir->dd_lock);
mutex_enter(&ds->ds_lock);
- ASSERT(ds->ds_phys->ds_unique_bytes >= used ||
+ ASSERT(dsl_dataset_phys(ds)->ds_unique_bytes >= used ||
!DS_UNIQUE_IS_ACCURATE(ds));
delta = parent_delta(ds, -used);
- ds->ds_phys->ds_unique_bytes -= used;
+ dsl_dataset_phys(ds)->ds_unique_bytes -= used;
mutex_exit(&ds->ds_lock);
dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
delta, -compressed, -uncompressed, tx);
dsl_dir_transfer_space(ds->ds_dir, -used - delta,
- DD_USED_REFRSRV, DD_USED_HEAD, tx);
- mutex_exit(&ds->ds_dir->dd_lock);
+ DD_USED_REFRSRV, DD_USED_HEAD, NULL);
} else {
dprintf_bp(bp, "putting on dead list: %s", "");
if (async) {
@@ -178,15 +216,15 @@
dsl_deadlist_insert(&ds->ds_deadlist, bp, tx);
}
ASSERT3U(ds->ds_prev->ds_object, ==,
- ds->ds_phys->ds_prev_snap_obj);
- ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0);
+ dsl_dataset_phys(ds)->ds_prev_snap_obj);
+ ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_num_children > 0);
/* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
- if (ds->ds_prev->ds_phys->ds_next_snap_obj ==
+ if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
ds->ds_object && bp->blk_birth >
- ds->ds_prev->ds_phys->ds_prev_snap_txg) {
+ dsl_dataset_phys(ds->ds_prev)->ds_prev_snap_txg) {
dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
mutex_enter(&ds->ds_prev->ds_lock);
- ds->ds_prev->ds_phys->ds_unique_bytes += used;
+ dsl_dataset_phys(ds->ds_prev)->ds_unique_bytes += used;
mutex_exit(&ds->ds_prev->ds_lock);
}
if (bp->blk_birth > ds->ds_dir->dd_origin_txg) {
@@ -195,12 +233,12 @@
}
}
mutex_enter(&ds->ds_lock);
- ASSERT3U(ds->ds_phys->ds_referenced_bytes, >=, used);
- ds->ds_phys->ds_referenced_bytes -= used;
- ASSERT3U(ds->ds_phys->ds_compressed_bytes, >=, compressed);
- ds->ds_phys->ds_compressed_bytes -= compressed;
- ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, >=, uncompressed);
- ds->ds_phys->ds_uncompressed_bytes -= uncompressed;
+ ASSERT3U(dsl_dataset_phys(ds)->ds_referenced_bytes, >=, used);
+ dsl_dataset_phys(ds)->ds_referenced_bytes -= used;
+ ASSERT3U(dsl_dataset_phys(ds)->ds_compressed_bytes, >=, compressed);
+ dsl_dataset_phys(ds)->ds_compressed_bytes -= compressed;
+ ASSERT3U(dsl_dataset_phys(ds)->ds_uncompressed_bytes, >=, uncompressed);
+ dsl_dataset_phys(ds)->ds_uncompressed_bytes -= uncompressed;
mutex_exit(&ds->ds_lock);
return (used);
@@ -226,7 +264,7 @@
if (ds->ds_trysnap_txg >
spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa))
trysnap = ds->ds_trysnap_txg;
- return (MAX(ds->ds_phys->ds_prev_snap_txg, trysnap));
+ return (MAX(dsl_dataset_phys(ds)->ds_prev_snap_txg, trysnap));
}
boolean_t
@@ -233,7 +271,8 @@
dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp,
uint64_t blk_birth)
{
- if (blk_birth <= dsl_dataset_prev_snap_txg(ds))
+ if (blk_birth <= dsl_dataset_prev_snap_txg(ds) ||
+ (bp != NULL && BP_IS_HOLE(bp)))
return (B_FALSE);
ddt_prefetch(dsl_dataset_get_spa(ds), bp);
@@ -241,14 +280,15 @@
return (B_TRUE);
}
-/* ARGSUSED */
static void
-dsl_dataset_evict(dmu_buf_t *db, void *dsv)
+dsl_dataset_evict(void *dbu)
{
- dsl_dataset_t *ds = dsv;
+ dsl_dataset_t *ds = dbu;
ASSERT(ds->ds_owner == NULL);
+ ds->ds_dbuf = NULL;
+
unique_remove(ds->ds_fsid_guid);
if (ds->ds_objset != NULL)
@@ -260,13 +300,14 @@
}
bplist_destroy(&ds->ds_pending_deadlist);
- if (ds->ds_phys->ds_deadlist_obj != 0)
+ if (ds->ds_deadlist.dl_os != NULL)
dsl_deadlist_close(&ds->ds_deadlist);
if (ds->ds_dir)
- dsl_dir_rele(ds->ds_dir, ds);
+ dsl_dir_async_rele(ds->ds_dir, ds);
ASSERT(!list_link_active(&ds->ds_synced_link));
+ list_destroy(&ds->ds_prop_cbs);
if (mutex_owned(&ds->ds_lock))
mutex_exit(&ds->ds_lock);
mutex_destroy(&ds->ds_lock);
@@ -273,7 +314,9 @@
if (mutex_owned(&ds->ds_opening_lock))
mutex_exit(&ds->ds_opening_lock);
mutex_destroy(&ds->ds_opening_lock);
+ mutex_destroy(&ds->ds_sendstream_lock);
refcount_destroy(&ds->ds_longholds);
+ rrw_destroy(&ds->ds_bp_rwlock);
kmem_free(ds, sizeof (dsl_dataset_t));
}
@@ -289,10 +332,10 @@
if (ds->ds_snapname[0])
return (0);
- if (ds->ds_phys->ds_next_snap_obj == 0)
+ if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0)
return (0);
- err = dmu_bonus_hold(mos, ds->ds_dir->dd_phys->dd_head_dataset_obj,
+ err = dmu_bonus_hold(mos, dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj,
FTAG, &headdbuf);
if (err != 0)
return (err);
@@ -307,11 +350,11 @@
dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value)
{
objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
- uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
+ uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
matchtype_t mt;
int err;
- if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
+ if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
mt = MT_FIRST;
else
mt = MT_EXACT;
@@ -324,16 +367,17 @@
}
int
-dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx)
+dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx,
+ boolean_t adj_cnt)
{
objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
- uint64_t snapobj = ds->ds_phys->ds_snapnames_zapobj;
+ uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
matchtype_t mt;
int err;
dsl_dir_snap_cmtime_update(ds->ds_dir);
- if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
+ if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
mt = MT_FIRST;
else
mt = MT_EXACT;
@@ -341,9 +385,32 @@
err = zap_remove_norm(mos, snapobj, name, mt, tx);
if (err == ENOTSUP && mt == MT_FIRST)
err = zap_remove(mos, snapobj, name, tx);
+
+ if (err == 0 && adj_cnt)
+ dsl_fs_ss_count_adjust(ds->ds_dir, -1,
+ DD_FIELD_SNAPSHOT_COUNT, tx);
+
return (err);
}
+boolean_t
+dsl_dataset_try_add_ref(dsl_pool_t *dp, dsl_dataset_t *ds, void *tag)
+{
+ dmu_buf_t *dbuf = ds->ds_dbuf;
+ boolean_t result = B_FALSE;
+
+ if (dbuf != NULL && dmu_buf_try_add_ref(dbuf, dp->dp_meta_objset,
+ ds->ds_object, DMU_BONUS_BLKID, tag)) {
+
+ if (ds == dmu_buf_get_user(dbuf))
+ result = B_TRUE;
+ else
+ dmu_buf_rele(dbuf, tag);
+ }
+
+ return (result);
+}
+
int
dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
dsl_dataset_t **dsp)
@@ -362,7 +429,7 @@
/* Make sure dsobj has the correct object type. */
dmu_object_info_from_db(dbuf, &doi);
- if (doi.doi_type != DMU_OT_DSL_DATASET) {
+ if (doi.doi_bonus_type != DMU_OT_DSL_DATASET) {
dmu_buf_rele(dbuf, tag);
return (SET_ERROR(EINVAL));
}
@@ -374,27 +441,46 @@
ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP);
ds->ds_dbuf = dbuf;
ds->ds_object = dsobj;
- ds->ds_phys = dbuf->db_data;
+ ds->ds_is_snapshot = dsl_dataset_phys(ds)->ds_num_children != 0;
mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&ds->ds_sendstream_lock, NULL, MUTEX_DEFAULT, NULL);
+ rrw_init(&ds->ds_bp_rwlock, B_FALSE);
refcount_create(&ds->ds_longholds);
bplist_create(&ds->ds_pending_deadlist);
dsl_deadlist_open(&ds->ds_deadlist,
- mos, ds->ds_phys->ds_deadlist_obj);
+ mos, dsl_dataset_phys(ds)->ds_deadlist_obj);
list_create(&ds->ds_sendstreams, sizeof (dmu_sendarg_t),
offsetof(dmu_sendarg_t, dsa_link));
- if (err == 0) {
- err = dsl_dir_hold_obj(dp,
- ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir);
+ list_create(&ds->ds_prop_cbs, sizeof (dsl_prop_cb_record_t),
+ offsetof(dsl_prop_cb_record_t, cbr_ds_node));
+
+ if (doi.doi_type == DMU_OTN_ZAP_METADATA) {
+ for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+ if (!(spa_feature_table[f].fi_flags &
+ ZFEATURE_FLAG_PER_DATASET))
+ continue;
+ err = zap_contains(mos, dsobj,
+ spa_feature_table[f].fi_guid);
+ if (err == 0) {
+ ds->ds_feature_inuse[f] = B_TRUE;
+ } else {
+ ASSERT3U(err, ==, ENOENT);
+ err = 0;
+ }
+ }
}
+
+ err = dsl_dir_hold_obj(dp,
+ dsl_dataset_phys(ds)->ds_dir_obj, NULL, ds, &ds->ds_dir);
if (err != 0) {
mutex_destroy(&ds->ds_lock);
mutex_destroy(&ds->ds_opening_lock);
+ mutex_destroy(&ds->ds_sendstream_lock);
refcount_destroy(&ds->ds_longholds);
bplist_destroy(&ds->ds_pending_deadlist);
dsl_deadlist_close(&ds->ds_deadlist);
@@ -403,25 +489,34 @@
return (err);
}
- if (!dsl_dataset_is_snapshot(ds)) {
+ if (!ds->ds_is_snapshot) {
ds->ds_snapname[0] = '\0';
- if (ds->ds_phys->ds_prev_snap_obj != 0) {
+ if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
err = dsl_dataset_hold_obj(dp,
- ds->ds_phys->ds_prev_snap_obj,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj,
ds, &ds->ds_prev);
}
+ if (doi.doi_type == DMU_OTN_ZAP_METADATA) {
+ int zaperr = zap_lookup(mos, ds->ds_object,
+ DS_FIELD_BOOKMARK_NAMES,
+ sizeof (ds->ds_bookmarks), 1,
+ &ds->ds_bookmarks);
+ if (zaperr != ENOENT)
+ VERIFY0(zaperr);
+ }
} else {
if (zfs_flags & ZFS_DEBUG_SNAPNAMES)
err = dsl_dataset_get_snapname(ds);
- if (err == 0 && ds->ds_phys->ds_userrefs_obj != 0) {
+ if (err == 0 &&
+ dsl_dataset_phys(ds)->ds_userrefs_obj != 0) {
err = zap_count(
ds->ds_dir->dd_pool->dp_meta_objset,
- ds->ds_phys->ds_userrefs_obj,
+ dsl_dataset_phys(ds)->ds_userrefs_obj,
&ds->ds_userrefs);
}
}
- if (err == 0 && !dsl_dataset_is_snapshot(ds)) {
+ if (err == 0 && !ds->ds_is_snapshot) {
err = dsl_prop_get_int_ds(ds,
zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
&ds->ds_reserved);
@@ -434,8 +529,11 @@
ds->ds_reserved = ds->ds_quota = 0;
}
- if (err != 0 || (winner = dmu_buf_set_user_ie(dbuf, ds,
- &ds->ds_phys, dsl_dataset_evict)) != NULL) {
+ dmu_buf_init_user(&ds->ds_dbu, dsl_dataset_evict, &ds->ds_dbuf);
+ if (err == 0)
+ winner = dmu_buf_set_user_ie(dbuf, &ds->ds_dbu);
+
+ if (err != 0 || winner != NULL) {
bplist_destroy(&ds->ds_pending_deadlist);
dsl_deadlist_close(&ds->ds_deadlist);
if (ds->ds_prev)
@@ -443,6 +541,7 @@
dsl_dir_rele(ds->ds_dir, ds);
mutex_destroy(&ds->ds_lock);
mutex_destroy(&ds->ds_opening_lock);
+ mutex_destroy(&ds->ds_sendstream_lock);
refcount_destroy(&ds->ds_longholds);
kmem_free(ds, sizeof (dsl_dataset_t));
if (err != 0) {
@@ -452,12 +551,12 @@
ds = winner;
} else {
ds->ds_fsid_guid =
- unique_insert(ds->ds_phys->ds_fsid_guid);
+ unique_insert(dsl_dataset_phys(ds)->ds_fsid_guid);
}
}
ASSERT3P(ds->ds_dbuf, ==, dbuf);
- ASSERT3P(ds->ds_phys, ==, dbuf->db_data);
- ASSERT(ds->ds_phys->ds_prev_snap_obj != 0 ||
+ ASSERT3P(dsl_dataset_phys(ds), ==, dbuf->db_data);
+ ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0 ||
spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN ||
dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap);
*dsp = ds;
@@ -472,6 +571,7 @@
const char *snapname;
uint64_t obj;
int err = 0;
+ dsl_dataset_t *ds;
err = dsl_dir_hold(dp, name, FTAG, &dd, &snapname);
if (err != 0)
@@ -478,38 +578,39 @@
return (err);
ASSERT(dsl_pool_config_held(dp));
- obj = dd->dd_phys->dd_head_dataset_obj;
+ obj = dsl_dir_phys(dd)->dd_head_dataset_obj;
if (obj != 0)
- err = dsl_dataset_hold_obj(dp, obj, tag, dsp);
+ err = dsl_dataset_hold_obj(dp, obj, tag, &ds);
else
err = SET_ERROR(ENOENT);
/* we may be looking for a snapshot */
if (err == 0 && snapname != NULL) {
- dsl_dataset_t *ds;
+ dsl_dataset_t *snap_ds;
if (*snapname++ != '@') {
- dsl_dataset_rele(*dsp, tag);
+ dsl_dataset_rele(ds, tag);
dsl_dir_rele(dd, FTAG);
return (SET_ERROR(ENOENT));
}
dprintf("looking for snapshot '%s'\n", snapname);
- err = dsl_dataset_snap_lookup(*dsp, snapname, &obj);
+ err = dsl_dataset_snap_lookup(ds, snapname, &obj);
if (err == 0)
- err = dsl_dataset_hold_obj(dp, obj, tag, &ds);
- dsl_dataset_rele(*dsp, tag);
+ err = dsl_dataset_hold_obj(dp, obj, tag, &snap_ds);
+ dsl_dataset_rele(ds, tag);
if (err == 0) {
- mutex_enter(&ds->ds_lock);
- if (ds->ds_snapname[0] == 0)
- (void) strlcpy(ds->ds_snapname, snapname,
- sizeof (ds->ds_snapname));
- mutex_exit(&ds->ds_lock);
- *dsp = ds;
+ mutex_enter(&snap_ds->ds_lock);
+ if (snap_ds->ds_snapname[0] == 0)
+ (void) strlcpy(snap_ds->ds_snapname, snapname,
+ sizeof (snap_ds->ds_snapname));
+ mutex_exit(&snap_ds->ds_lock);
+ ds = snap_ds;
}
}
-
+ if (err == 0)
+ *dsp = ds;
dsl_dir_rele(dd, FTAG);
return (err);
}
@@ -581,7 +682,8 @@
dsl_dir_name(ds->ds_dir, name);
VERIFY0(dsl_dataset_get_snapname(ds));
if (ds->ds_snapname[0]) {
- (void) strcat(name, "@");
+ VERIFY3U(strlcat(name, "@", ZFS_MAX_DATASET_NAME_LEN),
+ <, ZFS_MAX_DATASET_NAME_LEN);
/*
* We use a "recursive" mutex so that we
* can call dprintf_ds() with ds_lock held.
@@ -588,38 +690,27 @@
*/
if (!MUTEX_HELD(&ds->ds_lock)) {
mutex_enter(&ds->ds_lock);
- (void) strcat(name, ds->ds_snapname);
+ VERIFY3U(strlcat(name, ds->ds_snapname,
+ ZFS_MAX_DATASET_NAME_LEN), <,
+ ZFS_MAX_DATASET_NAME_LEN);
mutex_exit(&ds->ds_lock);
} else {
- (void) strcat(name, ds->ds_snapname);
+ VERIFY3U(strlcat(name, ds->ds_snapname,
+ ZFS_MAX_DATASET_NAME_LEN), <,
+ ZFS_MAX_DATASET_NAME_LEN);
}
}
}
}
-static int
+int
dsl_dataset_namelen(dsl_dataset_t *ds)
{
- int result;
-
- if (ds == NULL) {
- result = 3; /* "mos" */
- } else {
- result = dsl_dir_namelen(ds->ds_dir);
- VERIFY0(dsl_dataset_get_snapname(ds));
- if (ds->ds_snapname[0]) {
- ++result; /* adding one for the @-sign */
- if (!MUTEX_HELD(&ds->ds_lock)) {
- mutex_enter(&ds->ds_lock);
- result += strlen(ds->ds_snapname);
- mutex_exit(&ds->ds_lock);
- } else {
- result += strlen(ds->ds_snapname);
- }
- }
- }
-
- return (result);
+ VERIFY0(dsl_dataset_get_snapname(ds));
+ mutex_enter(&ds->ds_lock);
+ int len = dsl_dir_namelen(ds->ds_dir) + 1 + strlen(ds->ds_snapname);
+ mutex_exit(&ds->ds_lock);
+ return (len);
}
void
@@ -631,16 +722,14 @@
void
dsl_dataset_disown(dsl_dataset_t *ds, void *tag)
{
- ASSERT(ds->ds_owner == tag && ds->ds_dbuf != NULL);
+ ASSERT3P(ds->ds_owner, ==, tag);
+ ASSERT(ds->ds_dbuf != NULL);
mutex_enter(&ds->ds_lock);
ds->ds_owner = NULL;
mutex_exit(&ds->ds_lock);
dsl_dataset_long_rele(ds, tag);
- if (ds->ds_dbuf != NULL)
- dsl_dataset_rele(ds, tag);
- else
- dsl_dataset_evict(NULL, ds);
+ dsl_dataset_rele(ds, tag);
}
boolean_t
@@ -648,6 +737,7 @@
{
boolean_t gotit = FALSE;
+ ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
mutex_enter(&ds->ds_lock);
if (ds->ds_owner == NULL && !DS_IS_INCONSISTENT(ds)) {
ds->ds_owner = tag;
@@ -658,6 +748,44 @@
return (gotit);
}
+boolean_t
+dsl_dataset_has_owner(dsl_dataset_t *ds)
+{
+ boolean_t rv;
+ mutex_enter(&ds->ds_lock);
+ rv = (ds->ds_owner != NULL);
+ mutex_exit(&ds->ds_lock);
+ return (rv);
+}
+
+static void
+dsl_dataset_activate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset;
+ uint64_t zero = 0;
+
+ VERIFY(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET);
+
+ spa_feature_incr(spa, f, tx);
+ dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx);
+
+ VERIFY0(zap_add(mos, dsobj, spa_feature_table[f].fi_guid,
+ sizeof (zero), 1, &zero, tx));
+}
+
+void
+dsl_dataset_deactivate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+ objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset;
+
+ VERIFY(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET);
+
+ VERIFY0(zap_remove(mos, dsobj, spa_feature_table[f].fi_guid, tx));
+ spa_feature_decr(spa, f, tx);
+}
+
uint64_t
dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
uint64_t flags, dmu_tx_t *tx)
@@ -672,9 +800,9 @@
origin = dp->dp_origin_snap;
ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp);
- ASSERT(origin == NULL || origin->ds_phys->ds_num_children > 0);
+ ASSERT(origin == NULL || dsl_dataset_phys(origin)->ds_num_children > 0);
ASSERT(dmu_tx_is_syncing(tx));
- ASSERT(dd->dd_phys->dd_head_dataset_obj == 0);
+ ASSERT(dsl_dir_phys(dd)->dd_head_dataset_obj == 0);
dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
@@ -702,46 +830,62 @@
dsphys->ds_prev_snap_obj = origin->ds_object;
dsphys->ds_prev_snap_txg =
- origin->ds_phys->ds_creation_txg;
+ dsl_dataset_phys(origin)->ds_creation_txg;
dsphys->ds_referenced_bytes =
- origin->ds_phys->ds_referenced_bytes;
+ dsl_dataset_phys(origin)->ds_referenced_bytes;
dsphys->ds_compressed_bytes =
- origin->ds_phys->ds_compressed_bytes;
+ dsl_dataset_phys(origin)->ds_compressed_bytes;
dsphys->ds_uncompressed_bytes =
- origin->ds_phys->ds_uncompressed_bytes;
- dsphys->ds_bp = origin->ds_phys->ds_bp;
- dsphys->ds_flags |= origin->ds_phys->ds_flags;
+ dsl_dataset_phys(origin)->ds_uncompressed_bytes;
+ rrw_enter(&origin->ds_bp_rwlock, RW_READER, FTAG);
+ dsphys->ds_bp = dsl_dataset_phys(origin)->ds_bp;
+ rrw_exit(&origin->ds_bp_rwlock, FTAG);
+ /*
+ * Inherit flags that describe the dataset's contents
+ * (INCONSISTENT) or properties (Case Insensitive).
+ */
+ dsphys->ds_flags |= dsl_dataset_phys(origin)->ds_flags &
+ (DS_FLAG_INCONSISTENT | DS_FLAG_CI_DATASET);
+
+ for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+ if (origin->ds_feature_inuse[f])
+ dsl_dataset_activate_feature(dsobj, f, tx);
+ }
+
dmu_buf_will_dirty(origin->ds_dbuf, tx);
- origin->ds_phys->ds_num_children++;
+ dsl_dataset_phys(origin)->ds_num_children++;
VERIFY0(dsl_dataset_hold_obj(dp,
- origin->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ohds));
+ dsl_dir_phys(origin->ds_dir)->dd_head_dataset_obj,
+ FTAG, &ohds));
dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist,
dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx);
dsl_dataset_rele(ohds, FTAG);
if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) {
- if (origin->ds_phys->ds_next_clones_obj == 0) {
- origin->ds_phys->ds_next_clones_obj =
+ if (dsl_dataset_phys(origin)->ds_next_clones_obj == 0) {
+ dsl_dataset_phys(origin)->ds_next_clones_obj =
zap_create(mos,
DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
}
VERIFY0(zap_add_int(mos,
- origin->ds_phys->ds_next_clones_obj, dsobj, tx));
+ dsl_dataset_phys(origin)->ds_next_clones_obj,
+ dsobj, tx));
}
dmu_buf_will_dirty(dd->dd_dbuf, tx);
- dd->dd_phys->dd_origin_obj = origin->ds_object;
+ dsl_dir_phys(dd)->dd_origin_obj = origin->ds_object;
if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
- if (origin->ds_dir->dd_phys->dd_clones == 0) {
+ if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) {
dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
- origin->ds_dir->dd_phys->dd_clones =
+ dsl_dir_phys(origin->ds_dir)->dd_clones =
zap_create(mos,
DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
}
VERIFY0(zap_add_int(mos,
- origin->ds_dir->dd_phys->dd_clones, dsobj, tx));
+ dsl_dir_phys(origin->ds_dir)->dd_clones,
+ dsobj, tx));
}
}
@@ -751,7 +895,7 @@
dmu_buf_rele(dbuf, FTAG);
dmu_buf_will_dirty(dd->dd_dbuf, tx);
- dd->dd_phys->dd_head_dataset_obj = dsobj;
+ dsl_dir_phys(dd)->dd_head_dataset_obj = dsobj;
return (dsobj);
}
@@ -762,8 +906,20 @@
objset_t *os;
VERIFY0(dmu_objset_from_ds(ds, &os));
- bzero(&os->os_zil_header, sizeof (os->os_zil_header));
- dsl_dataset_dirty(ds, tx);
+ if (bcmp(&os->os_zil_header, &zero_zil, sizeof (zero_zil)) != 0) {
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ zio_t *zio;
+
+ bzero(&os->os_zil_header, sizeof (os->os_zil_header));
+
+ zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+ dsl_dataset_sync(ds, zio, tx);
+ VERIFY0(zio_wait(zio));
+
+ /* dsl_dataset_sync_done will drop this reference. */
+ dmu_buf_add_ref(ds->ds_dbuf, ds);
+ dsl_dataset_sync_done(ds, tx);
+ }
}
uint64_t
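
dsl_dataset_zero_zil() now skips the reset entirely when the in-core ZIL header is already all zeros, by bcmp()ing it against the file-scope zero_zil added earlier in this diff. The idiom is the usual compare-against-a-static-zeroed-object test; a standalone sketch (zil_header_demo is an invented layout, not the real zil_header_t):

#include <string.h>
#include <stdio.h>

struct zil_header_demo {                /* made-up stand-in for zil_header_t */
    unsigned long long claim_txg;
    unsigned long long replay_seq;
    unsigned long long log_blk[4];
};

/*
 * Static objects are zero-initialized, so comparing against one is a cheap
 * "is this structure still all zeros?" test, which is what the patch does
 * with zero_zil before bothering to reset and re-sync the dataset.
 */
static const struct zil_header_demo zero_hdr;

int
main(void)
{
    struct zil_header_demo h;

    memset(&h, 0, sizeof (h));
    printf("untouched: %s\n",
        memcmp(&h, &zero_hdr, sizeof (h)) == 0 ? "all zeros" : "dirty");
    h.claim_txg = 42;
    printf("after use: %s\n",
        memcmp(&h, &zero_hdr, sizeof (h)) == 0 ? "all zeros" : "dirty");
    return (0);
}
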
@@ -785,6 +941,21 @@
dsl_deleg_set_create_perms(dd, tx, cr);
+ /*
+ * Since we're creating a new node we know it's a leaf, so we can
+ * initialize the counts if the limit feature is active.
+ */
+ if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) {
+ uint64_t cnt = 0;
+ objset_t *os = dd->dd_pool->dp_meta_objset;
+
+ dsl_dir_zapify(dd, tx);
+ VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
+ sizeof (cnt), 1, &cnt, tx));
+ VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
+ sizeof (cnt), 1, &cnt, tx));
+ }
+
dsl_dir_rele(dd, FTAG);
/*
@@ -856,10 +1027,10 @@
uint64_t mrs_used;
uint64_t dlused, dlcomp, dluncomp;
- ASSERT(!dsl_dataset_is_snapshot(ds));
+ ASSERT(!ds->ds_is_snapshot);
- if (ds->ds_phys->ds_prev_snap_obj != 0)
- mrs_used = ds->ds_prev->ds_phys->ds_referenced_bytes;
+ if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0)
+ mrs_used = dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes;
else
mrs_used = 0;
@@ -866,12 +1037,12 @@
dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp);
ASSERT3U(dlused, <=, mrs_used);
- ds->ds_phys->ds_unique_bytes =
- ds->ds_phys->ds_referenced_bytes - (mrs_used - dlused);
+ dsl_dataset_phys(ds)->ds_unique_bytes =
+ dsl_dataset_phys(ds)->ds_referenced_bytes - (mrs_used - dlused);
if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
SPA_VERSION_UNIQUE_ACCURATE)
- ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
+ dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
}
void
@@ -882,8 +1053,9 @@
uint64_t count;
int err;
- ASSERT(ds->ds_phys->ds_num_children >= 2);
- err = zap_remove_int(mos, ds->ds_phys->ds_next_clones_obj, obj, tx);
+ ASSERT(dsl_dataset_phys(ds)->ds_num_children >= 2);
+ err = zap_remove_int(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
+ obj, tx);
/*
* The err should not be ENOENT, but a bug in a previous version
* of the code could cause upgrade_clones_cb() to not set
@@ -896,9 +1068,9 @@
*/
if (err != ENOENT)
VERIFY0(err);
- ASSERT0(zap_count(mos, ds->ds_phys->ds_next_clones_obj,
+ ASSERT0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
&count));
- ASSERT3U(count, <=, ds->ds_phys->ds_num_children - 2);
+ ASSERT3U(count, <=, dsl_dataset_phys(ds)->ds_num_children - 2);
}
@@ -905,22 +1077,9 @@
blkptr_t *
dsl_dataset_get_blkptr(dsl_dataset_t *ds)
{
- return (&ds->ds_phys->ds_bp);
+ return (&dsl_dataset_phys(ds)->ds_bp);
}
-void
-dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
-{
- ASSERT(dmu_tx_is_syncing(tx));
- /* If it's the meta-objset, set dp_meta_rootbp */
- if (ds == NULL) {
- tx->tx_pool->dp_meta_rootbp = *bp;
- } else {
- dmu_buf_will_dirty(ds->ds_dbuf, tx);
- ds->ds_phys->ds_bp = *bp;
- }
-}
-
spa_t *
dsl_dataset_get_spa(dsl_dataset_t *ds)
{
@@ -937,11 +1096,13 @@
ASSERT(ds->ds_objset != NULL);
- if (ds->ds_phys->ds_next_snap_obj != 0)
+ if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0)
panic("dirtying snapshot!");
+ /* Must not dirty a dataset in the same txg where it got snapshotted. */
+ ASSERT3U(tx->tx_txg, >, dsl_dataset_phys(ds)->ds_prev_snap_txg);
+
dp = ds->ds_dir->dd_pool;
-
if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg)) {
/* up the hold count until we can be written out */
dmu_buf_add_ref(ds->ds_dbuf, ds);
@@ -973,7 +1134,7 @@
* outside of the reservation.
*/
ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds));
- asize = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
+ asize = MIN(dsl_dataset_phys(ds)->ds_unique_bytes, ds->ds_reserved);
if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
return (SET_ERROR(ENOSPC));
@@ -991,11 +1152,12 @@
nvlist_t *ddsa_snaps;
nvlist_t *ddsa_props;
nvlist_t *ddsa_errors;
+ cred_t *ddsa_cr;
} dsl_dataset_snapshot_arg_t;
int
dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname,
- dmu_tx_t *tx)
+ dmu_tx_t *tx, boolean_t recv, uint64_t cnt, cred_t *cr)
{
int error;
uint64_t value;
@@ -1009,7 +1171,7 @@
* We don't allow multiple snapshots of the same txg. If there
* is already one, try again.
*/
- if (ds->ds_phys->ds_prev_snap_txg >= tx->tx_txg)
+ if (dsl_dataset_phys(ds)->ds_prev_snap_txg >= tx->tx_txg)
return (SET_ERROR(EAGAIN));
/*
@@ -1021,6 +1183,30 @@
if (error != ENOENT)
return (error);
+ /*
+ * We don't allow taking snapshots of inconsistent datasets, such as
+ * those into which we are currently receiving. However, if we are
+ * creating this snapshot as part of a receive, this check will be
+ * executed atomically with respect to the completion of the receive
+ * itself but prior to the clearing of DS_FLAG_INCONSISTENT; in this
+ * case we ignore this, knowing it will be fixed up for us shortly in
+ * dmu_recv_end_sync().
+ */
+ if (!recv && DS_IS_INCONSISTENT(ds))
+ return (SET_ERROR(EBUSY));
+
+ /*
+ * Skip the check for temporary snapshots or if we have already checked
+ * the counts in dsl_dataset_snapshot_check. This means we really only
+ * check the count here when we're receiving a stream.
+ */
+ if (cnt != 0 && cr != NULL) {
+ error = dsl_fs_ss_limit_check(ds->ds_dir, cnt,
+ ZFS_PROP_SNAPSHOT_LIMIT, NULL, cr);
+ if (error != 0)
+ return (error);
+ }
+
error = dsl_dataset_snapshot_reserve_space(ds, tx);
if (error != 0)
return (error);
@@ -1036,15 +1222,108 @@
nvpair_t *pair;
int rv = 0;
+ /*
+ * Pre-compute how many total new snapshots will be created for each
+ * level in the tree and below. This is needed for validating the
+ * snapshot limit when either taking a recursive snapshot or when
+ * taking multiple snapshots.
+ *
+ * The problem is that the counts are not actually adjusted when
+ * we are checking, only when we finally sync. For a single snapshot,
+ * this is easy, the count will increase by 1 at each node up the tree,
+ * but its more complicated for the recursive/multiple snapshot case.
+ *
+ * The dsl_fs_ss_limit_check function does recursively check the count
+ * at each level up the tree but since it is validating each snapshot
+ * independently we need to be sure that we are validating the complete
+ * count for the entire set of snapshots. We do this by rolling up the
+ * counts for each component of the name into an nvlist and then
+ * checking each of those cases with the aggregated count.
+ *
+ * This approach properly handles not only the recursive snapshot
+ * case (where we get all of those on the ddsa_snaps list) but also
+ * the sibling case (e.g. snapshot a/b and a/c so that we will also
+ * validate the limit on 'a' using a count of 2).
+ *
+ * We validate the snapshot names in the third loop and only report
+ * name errors once.
+ */
+ if (dmu_tx_is_syncing(tx)) {
+ nvlist_t *cnt_track = NULL;
+ cnt_track = fnvlist_alloc();
+
+ /* Rollup aggregated counts into the cnt_track list */
+ for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
+ pair != NULL;
+ pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
+ char *pdelim;
+ uint64_t val;
+ char nm[MAXPATHLEN];
+
+ (void) strlcpy(nm, nvpair_name(pair), sizeof (nm));
+ pdelim = strchr(nm, '@');
+ if (pdelim == NULL)
+ continue;
+ *pdelim = '\0';
+
+ do {
+ if (nvlist_lookup_uint64(cnt_track, nm,
+ &val) == 0) {
+ /* update existing entry */
+ fnvlist_add_uint64(cnt_track, nm,
+ val + 1);
+ } else {
+ /* add to list */
+ fnvlist_add_uint64(cnt_track, nm, 1);
+ }
+
+ pdelim = strrchr(nm, '/');
+ if (pdelim != NULL)
+ *pdelim = '\0';
+ } while (pdelim != NULL);
+ }
+
+ /* Check aggregated counts at each level */
+ for (pair = nvlist_next_nvpair(cnt_track, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(cnt_track, pair)) {
+ int error = 0;
+ char *name;
+ uint64_t cnt = 0;
+ dsl_dataset_t *ds;
+
+ name = nvpair_name(pair);
+ cnt = fnvpair_value_uint64(pair);
+ ASSERT(cnt > 0);
+
+ error = dsl_dataset_hold(dp, name, FTAG, &ds);
+ if (error == 0) {
+ error = dsl_fs_ss_limit_check(ds->ds_dir, cnt,
+ ZFS_PROP_SNAPSHOT_LIMIT, NULL,
+ ddsa->ddsa_cr);
+ dsl_dataset_rele(ds, FTAG);
+ }
+
+ if (error != 0) {
+ if (ddsa->ddsa_errors != NULL)
+ fnvlist_add_int32(ddsa->ddsa_errors,
+ name, error);
+ rv = error;
+ /* only report one error for this check */
+ break;
+ }
+ }
+ nvlist_free(cnt_track);
+ }
+
for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
int error = 0;
dsl_dataset_t *ds;
char *name, *atp;
- char dsname[MAXNAMELEN];
+ char dsname[ZFS_MAX_DATASET_NAME_LEN];
name = nvpair_name(pair);
- if (strlen(name) >= MAXNAMELEN)
+ if (strlen(name) >= ZFS_MAX_DATASET_NAME_LEN)
error = SET_ERROR(ENAMETOOLONG);
if (error == 0) {
atp = strchr(name, '@');
@@ -1056,8 +1335,9 @@
if (error == 0)
error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
if (error == 0) {
+ /* passing 0/NULL skips dsl_fs_ss_limit_check */
error = dsl_dataset_snapshot_check_impl(ds,
- atp + 1, tx);
+ atp + 1, tx, B_FALSE, 0, NULL);
dsl_dataset_rele(ds, FTAG);
}
@@ -1069,6 +1349,7 @@
rv = error;
}
}
+
return (rv);
}
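
The cnt_track rollup in dsl_dataset_snapshot_check() above can be seen in isolation: strip the '@snapname', then walk the filesystem name up one '/' component at a time, bumping a per-filesystem counter so sibling snapshots aggregate at their common ancestors. A self-contained sketch, with a small fixed table standing in for the cnt_track nvlist (all names and helpers here are made up):

#include <stdio.h>
#include <string.h>

#define MAX_ENTRIES 16

static char names[MAX_ENTRIES][256];
static int  counts[MAX_ENTRIES];
static int  nentries;

static void
bump(const char *fs)
{
    for (int i = 0; i < nentries; i++) {
        if (strcmp(names[i], fs) == 0) {
            counts[i]++;                        /* update existing entry */
            return;
        }
    }
    if (nentries == MAX_ENTRIES)
        return;
    (void) snprintf(names[nentries], sizeof (names[0]), "%s", fs);
    counts[nentries++] = 1;                     /* add to list */
}

int
main(void)
{
    const char *snaps[] = { "pool/a/b@s1", "pool/a/c@s1" };

    for (int i = 0; i < 2; i++) {
        char nm[256];
        char *pdelim;

        (void) snprintf(nm, sizeof (nm), "%s", snaps[i]);
        pdelim = strchr(nm, '@');
        if (pdelim == NULL)
            continue;
        *pdelim = '\0';

        do {                                    /* walk up the hierarchy */
            bump(nm);
            pdelim = strrchr(nm, '/');
            if (pdelim != NULL)
                *pdelim = '\0';
        } while (pdelim != NULL);
    }

    /* Prints pool/a/b:1, pool/a:2, pool:2, pool/a/c:1. */
    for (int i = 0; i < nentries; i++)
        printf("%s: %d\n", names[i], counts[i]);
    return (0);
}

Requesting pool/a/b@s1 and pool/a/c@s1 together leaves pool/a with a pending count of 2, which is exactly the sibling case the comment describes.
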
@@ -1076,8 +1357,6 @@
dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname,
dmu_tx_t *tx)
{
- static zil_header_t zero_zil;
-
dsl_pool_t *dp = ds->ds_dir->dd_pool;
dmu_buf_t *dbuf;
dsl_dataset_phys_t *dsphys;
@@ -1096,7 +1375,12 @@
bcmp(&os->os_phys->os_zil_header, &zero_zil,
sizeof (zero_zil)) == 0);
+ /* Should not snapshot a dirty dataset. */
+ ASSERT(!txg_list_member(&ds->ds_dir->dd_pool->dp_dirty_datasets,
+ ds, tx->tx_txg));
+ dsl_fs_ss_count_adjust(ds->ds_dir, 1, DD_FIELD_SNAPSHOT_COUNT, tx);
+
/*
* The origin's ds_creation_txg has to be < TXG_INITIAL
*/
@@ -1117,32 +1401,42 @@
(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
sizeof (dsphys->ds_guid));
} while (dsphys->ds_guid == 0);
- dsphys->ds_prev_snap_obj = ds->ds_phys->ds_prev_snap_obj;
- dsphys->ds_prev_snap_txg = ds->ds_phys->ds_prev_snap_txg;
+ dsphys->ds_prev_snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
+ dsphys->ds_prev_snap_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
dsphys->ds_next_snap_obj = ds->ds_object;
dsphys->ds_num_children = 1;
dsphys->ds_creation_time = gethrestime_sec();
dsphys->ds_creation_txg = crtxg;
- dsphys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj;
- dsphys->ds_referenced_bytes = ds->ds_phys->ds_referenced_bytes;
- dsphys->ds_compressed_bytes = ds->ds_phys->ds_compressed_bytes;
- dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes;
- dsphys->ds_flags = ds->ds_phys->ds_flags;
- dsphys->ds_bp = ds->ds_phys->ds_bp;
+ dsphys->ds_deadlist_obj = dsl_dataset_phys(ds)->ds_deadlist_obj;
+ dsphys->ds_referenced_bytes = dsl_dataset_phys(ds)->ds_referenced_bytes;
+ dsphys->ds_compressed_bytes = dsl_dataset_phys(ds)->ds_compressed_bytes;
+ dsphys->ds_uncompressed_bytes =
+ dsl_dataset_phys(ds)->ds_uncompressed_bytes;
+ dsphys->ds_flags = dsl_dataset_phys(ds)->ds_flags;
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+ dsphys->ds_bp = dsl_dataset_phys(ds)->ds_bp;
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
dmu_buf_rele(dbuf, FTAG);
- ASSERT3U(ds->ds_prev != 0, ==, ds->ds_phys->ds_prev_snap_obj != 0);
+ for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+ if (ds->ds_feature_inuse[f])
+ dsl_dataset_activate_feature(dsobj, f, tx);
+ }
+
+ ASSERT3U(ds->ds_prev != 0, ==,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj != 0);
if (ds->ds_prev) {
uint64_t next_clones_obj =
- ds->ds_prev->ds_phys->ds_next_clones_obj;
- ASSERT(ds->ds_prev->ds_phys->ds_next_snap_obj ==
+ dsl_dataset_phys(ds->ds_prev)->ds_next_clones_obj;
+ ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
ds->ds_object ||
- ds->ds_prev->ds_phys->ds_num_children > 1);
- if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) {
+ dsl_dataset_phys(ds->ds_prev)->ds_num_children > 1);
+ if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
+ ds->ds_object) {
dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
- ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
- ds->ds_prev->ds_phys->ds_creation_txg);
- ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj;
+ ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, ==,
+ dsl_dataset_phys(ds->ds_prev)->ds_creation_txg);
+ dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj = dsobj;
} else if (next_clones_obj != 0) {
dsl_dataset_remove_from_next_clones(ds->ds_prev,
dsphys->ds_next_snap_obj, tx);
@@ -1159,33 +1453,36 @@
if (ds->ds_reserved) {
int64_t delta;
ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
- delta = MIN(ds->ds_phys->ds_unique_bytes, ds->ds_reserved);
+ delta = MIN(dsl_dataset_phys(ds)->ds_unique_bytes,
+ ds->ds_reserved);
dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV,
delta, 0, 0, tx);
}
dmu_buf_will_dirty(ds->ds_dbuf, tx);
- ds->ds_phys->ds_deadlist_obj = dsl_deadlist_clone(&ds->ds_deadlist,
- UINT64_MAX, ds->ds_phys->ds_prev_snap_obj, tx);
+ dsl_dataset_phys(ds)->ds_deadlist_obj =
+ dsl_deadlist_clone(&ds->ds_deadlist, UINT64_MAX,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj, tx);
dsl_deadlist_close(&ds->ds_deadlist);
- dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
+ dsl_deadlist_open(&ds->ds_deadlist, mos,
+ dsl_dataset_phys(ds)->ds_deadlist_obj);
dsl_deadlist_add_key(&ds->ds_deadlist,
- ds->ds_phys->ds_prev_snap_txg, tx);
+ dsl_dataset_phys(ds)->ds_prev_snap_txg, tx);
- ASSERT3U(ds->ds_phys->ds_prev_snap_txg, <, tx->tx_txg);
- ds->ds_phys->ds_prev_snap_obj = dsobj;
- ds->ds_phys->ds_prev_snap_txg = crtxg;
- ds->ds_phys->ds_unique_bytes = 0;
+ ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, <, tx->tx_txg);
+ dsl_dataset_phys(ds)->ds_prev_snap_obj = dsobj;
+ dsl_dataset_phys(ds)->ds_prev_snap_txg = crtxg;
+ dsl_dataset_phys(ds)->ds_unique_bytes = 0;
if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE)
- ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
+ dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
- VERIFY0(zap_add(mos, ds->ds_phys->ds_snapnames_zapobj,
+ VERIFY0(zap_add(mos, dsl_dataset_phys(ds)->ds_snapnames_zapobj,
snapname, 8, 1, &dsobj, tx));
if (ds->ds_prev)
dsl_dataset_rele(ds->ds_prev, ds);
VERIFY0(dsl_dataset_hold_obj(dp,
- ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev));
+ dsl_dataset_phys(ds)->ds_prev_snap_obj, ds, &ds->ds_prev));
dsl_scan_ds_snapshotted(ds, tx);
@@ -1205,7 +1502,7 @@
pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
dsl_dataset_t *ds;
char *name, *atp;
- char dsname[MAXNAMELEN];
+ char dsname[ZFS_MAX_DATASET_NAME_LEN];
name = nvpair_name(pair);
atp = strchr(name, '@');
@@ -1251,7 +1548,7 @@
suspended = fnvlist_alloc();
for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
pair = nvlist_next_nvpair(snaps, pair)) {
- char fsname[MAXNAMELEN];
+ char fsname[ZFS_MAX_DATASET_NAME_LEN];
char *snapname = nvpair_name(pair);
char *atp;
void *cookie;
@@ -1274,11 +1571,12 @@
ddsa.ddsa_snaps = snaps;
ddsa.ddsa_props = props;
ddsa.ddsa_errors = errors;
+ ddsa.ddsa_cr = CRED();
if (error == 0) {
error = dsl_sync_task(firstname, dsl_dataset_snapshot_check,
dsl_dataset_snapshot_sync, &ddsa,
- fnvlist_num_pairs(snaps) * 3);
+ fnvlist_num_pairs(snaps) * 3, ZFS_SPACE_CHECK_NORMAL);
}
if (suspended != NULL) {
@@ -1323,7 +1621,9 @@
if (error != 0)
return (error);
- error = dsl_dataset_snapshot_check_impl(ds, ddsta->ddsta_snapname, tx);
+ /* NULL cred means no limit check for tmp snapshot */
+ error = dsl_dataset_snapshot_check_impl(ds, ddsta->ddsta_snapname,
+ tx, B_FALSE, 0, NULL);
if (error != 0) {
dsl_dataset_rele(ds, FTAG);
return (error);
@@ -1389,7 +1689,7 @@
}
error = dsl_sync_task(fsname, dsl_dataset_snapshot_tmp_check,
- dsl_dataset_snapshot_tmp_sync, &ddsta, 3);
+ dsl_dataset_snapshot_tmp_sync, &ddsta, 3, ZFS_SPACE_CHECK_RESERVED);
if (needsuspend)
zil_resume(cookie);
@@ -1402,7 +1702,7 @@
{
ASSERT(dmu_tx_is_syncing(tx));
ASSERT(ds->ds_objset != NULL);
- ASSERT(ds->ds_phys->ds_next_snap_obj == 0);
+ ASSERT(dsl_dataset_phys(ds)->ds_next_snap_obj == 0);
/*
* in case we had to change ds_fsid_guid when we opened it,
@@ -1409,11 +1709,56 @@
* sync it out now.
*/
dmu_buf_will_dirty(ds->ds_dbuf, tx);
- ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid;
+ dsl_dataset_phys(ds)->ds_fsid_guid = ds->ds_fsid_guid;
+ if (ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] != 0) {
+ VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,
+ ds->ds_object, DS_FIELD_RESUME_OBJECT, 8, 1,
+ &ds->ds_resume_object[tx->tx_txg & TXG_MASK], tx));
+ VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,
+ ds->ds_object, DS_FIELD_RESUME_OFFSET, 8, 1,
+ &ds->ds_resume_offset[tx->tx_txg & TXG_MASK], tx));
+ VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,
+ ds->ds_object, DS_FIELD_RESUME_BYTES, 8, 1,
+ &ds->ds_resume_bytes[tx->tx_txg & TXG_MASK], tx));
+ ds->ds_resume_object[tx->tx_txg & TXG_MASK] = 0;
+ ds->ds_resume_offset[tx->tx_txg & TXG_MASK] = 0;
+ ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] = 0;
+ }
+
dmu_objset_sync(ds->ds_objset, zio, tx);
+
+ for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+ if (ds->ds_feature_activation_needed[f]) {
+ if (ds->ds_feature_inuse[f])
+ continue;
+ dsl_dataset_activate_feature(ds->ds_object, f, tx);
+ ds->ds_feature_inuse[f] = B_TRUE;
+ }
+ }
}
+static int
+deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+ dsl_deadlist_t *dl = arg;
+ dsl_deadlist_insert(dl, bp, tx);
+ return (0);
+}
+
+void
+dsl_dataset_sync_done(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ objset_t *os = ds->ds_objset;
+
+ bplist_iterate(&ds->ds_pending_deadlist,
+ deadlist_enqueue_cb, &ds->ds_deadlist, tx);
+
+ ASSERT(!dmu_objset_is_dirty(os, dmu_tx_get_txg(tx)));
+
+ dmu_buf_rele(ds->ds_dbuf, ds);
+}
+
static void
get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv)
{
@@ -1422,26 +1767,38 @@
zap_cursor_t zc;
zap_attribute_t za;
nvlist_t *propval = fnvlist_alloc();
- nvlist_t *val = fnvlist_alloc();
+ nvlist_t *val;
ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
/*
+ * We use nvlist_alloc() instead of fnvlist_alloc() because the
+ * latter would allocate the list with NV_UNIQUE_NAME flag.
+ * As a result, every time a clone name is appended to the list
+ * it would be (linearly) searched for a duplicate name.
+ * We already know that all clone names must be unique and we
+ * want to avoid the quadratic complexity of double-checking that
+ * because we can have a large number of clones.
+ */
+ VERIFY0(nvlist_alloc(&val, 0, KM_SLEEP));
+
+ /*
* There may be missing entries in ds_next_clones_obj
* due to a bug in a previous version of the code.
* Only trust it if it has the right number of entries.
*/
- if (ds->ds_phys->ds_next_clones_obj != 0) {
- ASSERT0(zap_count(mos, ds->ds_phys->ds_next_clones_obj,
+ if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
+ VERIFY0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
&count));
}
- if (count != ds->ds_phys->ds_num_children - 1)
+ if (count != dsl_dataset_phys(ds)->ds_num_children - 1)
goto fail;
- for (zap_cursor_init(&zc, mos, ds->ds_phys->ds_next_clones_obj);
+ for (zap_cursor_init(&zc, mos,
+ dsl_dataset_phys(ds)->ds_next_clones_obj);
zap_cursor_retrieve(&zc, &za) == 0;
zap_cursor_advance(&zc)) {
dsl_dataset_t *clone;
- char buf[ZFS_MAXNAMELEN];
+ char buf[ZFS_MAX_DATASET_NAME_LEN];
VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
za.za_first_integer, FTAG, &clone));
dsl_dir_name(clone->ds_dir, buf);
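
The comment above hinges on libnvpair's NV_UNIQUE_NAME semantics: a list allocated with that flag checks every insertion against the names already present, so appending N already-unique clone names costs quadratic work. A minimal userland illustration of the two allocation modes (assumes a system that ships libnvpair; the clone names are made up):

/* Build on a system with libnvpair, e.g. cc demo.c -lnvpair */
#include <libnvpair.h>
#include <stdio.h>

int
main(void)
{
    nvlist_t *unique, *plain;
    char name[32];

    /*
     * NV_UNIQUE_NAME makes every addition scan the list for an existing
     * pair of the same name; allocating with flag 0 skips that scan,
     * which is the point of the comment above.
     */
    if (nvlist_alloc(&unique, NV_UNIQUE_NAME, 0) != 0 ||
        nvlist_alloc(&plain, 0, 0) != 0)
        return (1);

    for (int i = 0; i < 3; i++) {
        (void) snprintf(name, sizeof (name), "pool/clone%d", i);
        (void) nvlist_add_boolean(unique, name);    /* duplicate scan each time */
        (void) nvlist_add_boolean(plain, name);     /* plain append */
    }

    nvlist_free(unique);
    nvlist_free(plain);
    return (0);
}
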
@@ -1456,6 +1813,76 @@
nvlist_free(propval);
}
+static void
+get_receive_resume_stats(dsl_dataset_t *ds, nvlist_t *nv)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+
+ if (dsl_dataset_has_resume_receive_state(ds)) {
+ char *str;
+ void *packed;
+ uint8_t *compressed;
+ uint64_t val;
+ nvlist_t *token_nv = fnvlist_alloc();
+ size_t packed_size, compressed_size;
+
+ if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val) == 0) {
+ fnvlist_add_uint64(token_nv, "fromguid", val);
+ }
+ if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val) == 0) {
+ fnvlist_add_uint64(token_nv, "object", val);
+ }
+ if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val) == 0) {
+ fnvlist_add_uint64(token_nv, "offset", val);
+ }
+ if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_BYTES, sizeof (val), 1, &val) == 0) {
+ fnvlist_add_uint64(token_nv, "bytes", val);
+ }
+ if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val) == 0) {
+ fnvlist_add_uint64(token_nv, "toguid", val);
+ }
+ char buf[256];
+ if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_TONAME, 1, sizeof (buf), buf) == 0) {
+ fnvlist_add_string(token_nv, "toname", buf);
+ }
+ if (zap_contains(dp->dp_meta_objset, ds->ds_object,
+ DS_FIELD_RESUME_EMBEDOK) == 0) {
+ fnvlist_add_boolean(token_nv, "embedok");
+ }
+ packed = fnvlist_pack(token_nv, &packed_size);
+ fnvlist_free(token_nv);
+ compressed = kmem_alloc(packed_size, KM_SLEEP);
+
+ compressed_size = gzip_compress(packed, compressed,
+ packed_size, packed_size, 6);
+
+ zio_cksum_t cksum;
+ fletcher_4_native(compressed, compressed_size, NULL, &cksum);
+
+ str = kmem_alloc(compressed_size * 2 + 1, KM_SLEEP);
+ for (int i = 0; i < compressed_size; i++) {
+ (void) sprintf(str + i * 2, "%02x", compressed[i]);
+ }
+ str[compressed_size * 2] = '\0';
+ char *propval = kmem_asprintf("%u-%llx-%llx-%s",
+ ZFS_SEND_RESUME_TOKEN_VERSION,
+ (longlong_t)cksum.zc_word[0],
+ (longlong_t)packed_size, str);
+ dsl_prop_nvlist_add_string(nv,
+ ZFS_PROP_RECEIVE_RESUME_TOKEN, propval);
+ kmem_free(packed, packed_size);
+ kmem_free(str, compressed_size * 2 + 1);
+ kmem_free(compressed, packed_size);
+ strfree(propval);
+ }
+}
+
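
get_receive_resume_stats() above assembles the receive_resume_token as a version, one fletcher-4 checksum word, the packed-nvlist size, and a hex dump of the gzip-compressed payload, joined with the "%u-%llx-%llx-%s" format. A standalone sketch of just the hex-encoding and formatting step, using dummy values in place of the real compressed nvlist, checksum, and sizes:

#include <stdio.h>
#include <string.h>

int
main(void)
{
    /* Dummy payload standing in for the gzip-compressed packed nvlist. */
    const unsigned char payload[] = { 0xde, 0xad, 0xbe, 0xef };
    char hex[sizeof (payload) * 2 + 1];

    for (size_t i = 0; i < sizeof (payload); i++)
        (void) sprintf(hex + i * 2, "%02x", (unsigned)payload[i]);
    hex[sizeof (payload) * 2] = '\0';

    unsigned version = 1;                       /* stand-in token version */
    unsigned long long cksum_word0 = 0x1234abcdULL;     /* dummy checksum word */
    unsigned long long packed_size = 137;                /* dummy packed size */

    (void) printf("%u-%llx-%llx-%s\n", version, cksum_word0, packed_size, hex);
    return (0);
}
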
void
dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
{
@@ -1464,20 +1891,26 @@
ASSERT(dsl_pool_config_held(dp));
- ratio = ds->ds_phys->ds_compressed_bytes == 0 ? 100 :
- (ds->ds_phys->ds_uncompressed_bytes * 100 /
- ds->ds_phys->ds_compressed_bytes);
+ ratio = dsl_dataset_phys(ds)->ds_compressed_bytes == 0 ? 100 :
+ (dsl_dataset_phys(ds)->ds_uncompressed_bytes * 100 /
+ dsl_dataset_phys(ds)->ds_compressed_bytes);
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO, ratio);
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALREFERENCED,
- ds->ds_phys->ds_uncompressed_bytes);
+ dsl_dataset_phys(ds)->ds_uncompressed_bytes);
- if (dsl_dataset_is_snapshot(ds)) {
+ if (ds->ds_is_snapshot) {
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, ratio);
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
- ds->ds_phys->ds_unique_bytes);
+ dsl_dataset_phys(ds)->ds_unique_bytes);
get_clones_stat(ds, nv);
} else {
+ if (ds->ds_prev != NULL && ds->ds_prev != dp->dp_origin_snap) {
+ char buf[ZFS_MAX_DATASET_NAME_LEN];
+ dsl_dataset_name(ds->ds_prev, buf);
+ dsl_prop_nvlist_add_string(nv, ZFS_PROP_PREV_SNAP, buf);
+ }
+
dsl_dir_stats(ds->ds_dir, nv);
}
@@ -1486,17 +1919,17 @@
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, refd);
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION,
- ds->ds_phys->ds_creation_time);
+ dsl_dataset_phys(ds)->ds_creation_time);
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG,
- ds->ds_phys->ds_creation_txg);
+ dsl_dataset_phys(ds)->ds_creation_txg);
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA,
ds->ds_quota);
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION,
ds->ds_reserved);
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID,
- ds->ds_phys->ds_guid);
+ dsl_dataset_phys(ds)->ds_guid);
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE,
- ds->ds_phys->ds_unique_bytes);
+ dsl_dataset_phys(ds)->ds_unique_bytes);
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID,
ds->ds_object);
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS,
@@ -1504,13 +1937,13 @@
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY,
DS_IS_DEFER_DESTROY(ds) ? 1 : 0);
- if (ds->ds_phys->ds_prev_snap_obj != 0) {
+ if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
uint64_t written, comp, uncomp;
dsl_pool_t *dp = ds->ds_dir->dd_pool;
dsl_dataset_t *prev;
int err = dsl_dataset_hold_obj(dp,
- ds->ds_phys->ds_prev_snap_obj, FTAG, &prev);
+ dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
if (err == 0) {
err = dsl_dataset_space_written(prev, ds, &written,
&comp, &uncomp);
@@ -1521,6 +1954,32 @@
}
}
}
+
+ if (!dsl_dataset_is_snapshot(ds)) {
+ /*
+ * A failed "newfs" (e.g. full) resumable receive leaves
+ * the stats set on this dataset. Check here for the prop.
+ */
+ get_receive_resume_stats(ds, nv);
+
+ /*
+ * A failed incremental resumable receive leaves the
+ * stats set on our child named "%recv". Check the child
+ * for the prop.
+ */
+ /* 6 extra bytes for /%recv */
+ char recvname[ZFS_MAX_DATASET_NAME_LEN + 6];
+ dsl_dataset_t *recv_ds;
+ dsl_dataset_name(ds, recvname);
+ if (strlcat(recvname, "/", sizeof (recvname)) <
+ sizeof (recvname) &&
+ strlcat(recvname, recv_clone_name, sizeof (recvname)) <
+ sizeof (recvname) &&
+ dsl_dataset_hold(dp, recvname, FTAG, &recv_ds) == 0) {
+ get_receive_resume_stats(recv_ds, nv);
+ dsl_dataset_rele(recv_ds, FTAG);
+ }
+ }
}
void
@@ -1529,13 +1988,15 @@
dsl_pool_t *dp = ds->ds_dir->dd_pool;
ASSERT(dsl_pool_config_held(dp));
- stat->dds_creation_txg = ds->ds_phys->ds_creation_txg;
- stat->dds_inconsistent = ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT;
- stat->dds_guid = ds->ds_phys->ds_guid;
+ stat->dds_creation_txg = dsl_dataset_phys(ds)->ds_creation_txg;
+ stat->dds_inconsistent =
+ dsl_dataset_phys(ds)->ds_flags & DS_FLAG_INCONSISTENT;
+ stat->dds_guid = dsl_dataset_phys(ds)->ds_guid;
stat->dds_origin[0] = '\0';
- if (dsl_dataset_is_snapshot(ds)) {
+ if (ds->ds_is_snapshot) {
stat->dds_is_snapshot = B_TRUE;
- stat->dds_num_clones = ds->ds_phys->ds_num_children - 1;
+ stat->dds_num_clones =
+ dsl_dataset_phys(ds)->ds_num_children - 1;
} else {
stat->dds_is_snapshot = B_FALSE;
stat->dds_num_clones = 0;
@@ -1544,7 +2005,8 @@
dsl_dataset_t *ods;
VERIFY0(dsl_dataset_hold_obj(dp,
- ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &ods));
+ dsl_dir_phys(ds->ds_dir)->dd_origin_obj,
+ FTAG, &ods));
dsl_dataset_name(ods, stat->dds_origin);
dsl_dataset_rele(ods, FTAG);
}
@@ -1562,10 +2024,11 @@
uint64_t *refdbytesp, uint64_t *availbytesp,
uint64_t *usedobjsp, uint64_t *availobjsp)
{
- *refdbytesp = ds->ds_phys->ds_referenced_bytes;
+ *refdbytesp = dsl_dataset_phys(ds)->ds_referenced_bytes;
*availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE);
- if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes)
- *availbytesp += ds->ds_reserved - ds->ds_phys->ds_unique_bytes;
+ if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes)
+ *availbytesp +=
+ ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes;
if (ds->ds_quota != 0) {
/*
* Adjust available bytes according to refquota
@@ -1576,21 +2039,26 @@
else
*availbytesp = 0;
}
- *usedobjsp = ds->ds_phys->ds_bp.blk_fill;
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+ *usedobjsp = BP_GET_FILL(&dsl_dataset_phys(ds)->ds_bp);
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
*availobjsp = DN_MAX_OBJECT - *usedobjsp;
}
boolean_t
-dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds)
+dsl_dataset_modified_since_snap(dsl_dataset_t *ds, dsl_dataset_t *snap)
{
dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ uint64_t birth;
ASSERT(dsl_pool_config_held(dp));
- if (ds->ds_prev == NULL)
+ if (snap == NULL)
return (B_FALSE);
- if (ds->ds_phys->ds_bp.blk_birth >
- ds->ds_prev->ds_phys->ds_creation_txg) {
- objset_t *os, *os_prev;
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+ birth = dsl_dataset_get_blkptr(ds)->blk_birth;
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
+ if (birth > dsl_dataset_phys(snap)->ds_creation_txg) {
+ objset_t *os, *os_snap;
/*
* It may be that only the ZIL differs, because it was
* reset in the head. Don't count that as being
@@ -1598,10 +2066,10 @@
*/
if (dmu_objset_from_ds(ds, &os) != 0)
return (B_TRUE);
- if (dmu_objset_from_ds(ds->ds_prev, &os_prev) != 0)
+ if (dmu_objset_from_ds(snap, &os_snap) != 0)
return (B_TRUE);
return (bcmp(&os->os_phys->os_meta_dnode,
- &os_prev->os_phys->os_meta_dnode,
+ &os_snap->os_phys->os_meta_dnode,
sizeof (os->os_phys->os_meta_dnode)) != 0);
}
return (B_FALSE);
@@ -1639,7 +2107,7 @@
/* dataset name + 1 for the "@" + the new snapshot name must fit */
if (dsl_dir_namelen(hds->ds_dir) + 1 +
- strlen(ddrsa->ddrsa_newsnapname) >= MAXNAMELEN)
+ strlen(ddrsa->ddrsa_newsnapname) >= ZFS_MAX_DATASET_NAME_LEN)
error = SET_ERROR(ENAMETOOLONG);
return (error);
@@ -1696,11 +2164,13 @@
spa_history_log_internal_ds(ds, "rename", tx,
"-> @%s", ddrsa->ddrsa_newsnapname);
- VERIFY0(dsl_dataset_snap_remove(hds, ddrsa->ddrsa_oldsnapname, tx));
+ VERIFY0(dsl_dataset_snap_remove(hds, ddrsa->ddrsa_oldsnapname, tx,
+ B_FALSE));
mutex_enter(&ds->ds_lock);
(void) strcpy(ds->ds_snapname, ddrsa->ddrsa_newsnapname);
mutex_exit(&ds->ds_lock);
- VERIFY0(zap_add(dp->dp_meta_objset, hds->ds_phys->ds_snapnames_zapobj,
+ VERIFY0(zap_add(dp->dp_meta_objset,
+ dsl_dataset_phys(hds)->ds_snapnames_zapobj,
ds->ds_snapname, 8, 1, &ds->ds_object, tx));
#ifdef __FreeBSD__
@@ -1753,45 +2223,118 @@
ddrsa.ddrsa_recursive = recursive;
return (dsl_sync_task(fsname, dsl_dataset_rename_snapshot_check,
- dsl_dataset_rename_snapshot_sync, &ddrsa, 1));
+ dsl_dataset_rename_snapshot_sync, &ddrsa,
+ 1, ZFS_SPACE_CHECK_RESERVED));
}
+/*
+ * If we're doing an ownership handoff, we need to make sure that there is
+ * only one long hold on the dataset. We're not allowed to change anything here
+ * so we don't permanently release the long hold or regular hold here. We want
+ * to do this only when syncing to avoid the dataset unexpectedly going away
+ * when we release the long hold.
+ */
static int
+dsl_dataset_handoff_check(dsl_dataset_t *ds, void *owner, dmu_tx_t *tx)
+{
+ boolean_t held;
+
+ if (!dmu_tx_is_syncing(tx))
+ return (0);
+
+ if (owner != NULL) {
+ VERIFY3P(ds->ds_owner, ==, owner);
+ dsl_dataset_long_rele(ds, owner);
+ }
+
+ held = dsl_dataset_long_held(ds);
+
+ if (owner != NULL)
+ dsl_dataset_long_hold(ds, owner);
+
+ if (held)
+ return (SET_ERROR(EBUSY));
+
+ return (0);
+}
+
+typedef struct dsl_dataset_rollback_arg {
+ const char *ddra_fsname;
+ void *ddra_owner;
+ nvlist_t *ddra_result;
+} dsl_dataset_rollback_arg_t;
+
+static int
dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx)
{
- const char *fsname = arg;
+ dsl_dataset_rollback_arg_t *ddra = arg;
dsl_pool_t *dp = dmu_tx_pool(tx);
dsl_dataset_t *ds;
int64_t unused_refres_delta;
int error;
- error = dsl_dataset_hold(dp, fsname, FTAG, &ds);
+ error = dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds);
if (error != 0)
return (error);
/* must not be a snapshot */
- if (dsl_dataset_is_snapshot(ds)) {
+ if (ds->ds_is_snapshot) {
dsl_dataset_rele(ds, FTAG);
return (SET_ERROR(EINVAL));
}
/* must have a most recent snapshot */
- if (ds->ds_phys->ds_prev_snap_txg < TXG_INITIAL) {
+ if (dsl_dataset_phys(ds)->ds_prev_snap_txg < TXG_INITIAL) {
dsl_dataset_rele(ds, FTAG);
return (SET_ERROR(EINVAL));
}
- if (dsl_dataset_long_held(ds)) {
+ /*
+ * No rollback to a snapshot created in the current txg, because
+ * the rollback may dirty the dataset and create blocks that are
+ * not reachable from the rootbp while having a birth txg that
+ * falls into the snapshot's range.
+ */
+ if (dmu_tx_is_syncing(tx) &&
+ dsl_dataset_phys(ds)->ds_prev_snap_txg >= tx->tx_txg) {
dsl_dataset_rele(ds, FTAG);
- return (SET_ERROR(EBUSY));
+ return (SET_ERROR(EAGAIN));
}
+ /* must not have any bookmarks after the most recent snapshot */
+ nvlist_t *proprequest = fnvlist_alloc();
+ fnvlist_add_boolean(proprequest, zfs_prop_to_name(ZFS_PROP_CREATETXG));
+ nvlist_t *bookmarks = fnvlist_alloc();
+ error = dsl_get_bookmarks_impl(ds, proprequest, bookmarks);
+ fnvlist_free(proprequest);
+ if (error != 0)
+ return (error);
+ for (nvpair_t *pair = nvlist_next_nvpair(bookmarks, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(bookmarks, pair)) {
+ nvlist_t *valuenv =
+ fnvlist_lookup_nvlist(fnvpair_value_nvlist(pair),
+ zfs_prop_to_name(ZFS_PROP_CREATETXG));
+ uint64_t createtxg = fnvlist_lookup_uint64(valuenv, "value");
+ if (createtxg > dsl_dataset_phys(ds)->ds_prev_snap_txg) {
+ fnvlist_free(bookmarks);
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(EEXIST));
+ }
+ }
+ fnvlist_free(bookmarks);
+
+ error = dsl_dataset_handoff_check(ds, ddra->ddra_owner, tx);
+ if (error != 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+ }
+
/*
* Check if the snap we are rolling back to uses more than
* the refquota.
*/
if (ds->ds_quota != 0 &&
- ds->ds_prev->ds_phys->ds_referenced_bytes > ds->ds_quota) {
+ dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes > ds->ds_quota) {
dsl_dataset_rele(ds, FTAG);
return (SET_ERROR(EDQUOT));
}
@@ -1804,7 +2347,7 @@
* this space, but the freeing happens over many txg's.
*/
unused_refres_delta = (int64_t)MIN(ds->ds_reserved,
- ds->ds_phys->ds_unique_bytes);
+ dsl_dataset_phys(ds)->ds_unique_bytes);
if (unused_refres_delta > 0 &&
unused_refres_delta >
@@ -1820,13 +2363,17 @@
static void
dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx)
{
- const char *fsname = arg;
+ dsl_dataset_rollback_arg_t *ddra = arg;
dsl_pool_t *dp = dmu_tx_pool(tx);
dsl_dataset_t *ds, *clone;
uint64_t cloneobj;
+ char namebuf[ZFS_MAX_DATASET_NAME_LEN];
- VERIFY0(dsl_dataset_hold(dp, fsname, FTAG, &ds));
+ VERIFY0(dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds));
+ dsl_dataset_name(ds->ds_prev, namebuf);
+ fnvlist_add_string(ddra->ddra_result, "target", namebuf);
+
cloneobj = dsl_dataset_create_sync(ds->ds_dir, "%rollback",
ds->ds_prev, DS_CREATE_FLAG_NODIRTY, kcred, tx);
@@ -1841,11 +2388,31 @@
dsl_dataset_rele(ds, FTAG);
}
+/*
+ * Rolls back the given filesystem or volume to the most recent snapshot.
+ * The name of the most recent snapshot will be returned under key "target"
+ * in the result nvlist.
+ *
+ * If owner != NULL:
+ * - The existing dataset MUST be owned by the specified owner at entry
+ * - Upon return, dataset will still be held by the same owner, whether we
+ * succeed or not.
+ *
+ * This mode is required any time the existing filesystem is mounted. See
+ * notes above zfs_suspend_fs() for further details.
+ */
int
-dsl_dataset_rollback(const char *fsname)
+dsl_dataset_rollback(const char *fsname, void *owner, nvlist_t *result)
{
+ dsl_dataset_rollback_arg_t ddra;
+
+ ddra.ddra_fsname = fsname;
+ ddra.ddra_owner = owner;
+ ddra.ddra_result = result;
+
return (dsl_sync_task(fsname, dsl_dataset_rollback_check,
- dsl_dataset_rollback_sync, (void *)fsname, 1));
+ dsl_dataset_rollback_sync, &ddra,
+ 1, ZFS_SPACE_CHECK_RESERVED));
}
struct promotenode {
@@ -1860,6 +2427,7 @@
dsl_dataset_t *origin_origin; /* origin of the origin */
uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap;
char *err_ds;
+ cred_t *cr;
} dsl_dataset_promote_arg_t;
static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep);
@@ -1877,6 +2445,8 @@
dsl_dataset_t *origin_ds;
int err;
uint64_t unused;
+ uint64_t ss_mv_cnt;
+ size_t max_snap_len;
err = promote_hold(ddpa, dp, FTAG);
if (err != 0)
@@ -1883,8 +2453,9 @@
return (err);
hds = ddpa->ddpa_clone;
+ max_snap_len = MAXNAMELEN - strlen(ddpa->ddpa_clonename) - 1;
- if (hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE) {
+ if (dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE) {
promote_rele(ddpa, FTAG);
return (SET_ERROR(EXDEV));
}
@@ -1903,9 +2474,10 @@
/* compute origin's new unique space */
snap = list_tail(&ddpa->clone_snaps);
- ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object);
+ ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==,
+ origin_ds->ds_object);
dsl_deadlist_space_range(&snap->ds->ds_deadlist,
- origin_ds->ds_phys->ds_prev_snap_txg, UINT64_MAX,
+ dsl_dataset_phys(origin_ds)->ds_prev_snap_txg, UINT64_MAX,
&ddpa->unique, &unused, &unused);
/*
@@ -1923,14 +2495,17 @@
* Note however, if we stop before we reach the ORIGIN we get:
* uN + kN + kN-1 + ... + kM - uM-1
*/
- ddpa->used = origin_ds->ds_phys->ds_referenced_bytes;
- ddpa->comp = origin_ds->ds_phys->ds_compressed_bytes;
- ddpa->uncomp = origin_ds->ds_phys->ds_uncompressed_bytes;
+ ss_mv_cnt = 0;
+ ddpa->used = dsl_dataset_phys(origin_ds)->ds_referenced_bytes;
+ ddpa->comp = dsl_dataset_phys(origin_ds)->ds_compressed_bytes;
+ ddpa->uncomp = dsl_dataset_phys(origin_ds)->ds_uncompressed_bytes;
for (snap = list_head(&ddpa->shared_snaps); snap;
snap = list_next(&ddpa->shared_snaps, snap)) {
uint64_t val, dlused, dlcomp, dluncomp;
dsl_dataset_t *ds = snap->ds;
+ ss_mv_cnt++;
+
/*
* If there are long holds, we won't be able to evict
* the objset.
@@ -1942,6 +2517,10 @@
/* Check that the snapshot name does not conflict */
VERIFY0(dsl_dataset_get_snapname(ds));
+ if (strlen(ds->ds_snapname) >= max_snap_len) {
+ err = SET_ERROR(ENAMETOOLONG);
+ goto out;
+ }
err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val);
if (err == 0) {
(void) strcpy(ddpa->err_ds, snap->ds->ds_snapname);
@@ -1952,7 +2531,7 @@
goto out;
/* The very first snapshot does not have a deadlist */
- if (ds->ds_phys->ds_prev_snap_obj == 0)
+ if (dsl_dataset_phys(ds)->ds_prev_snap_obj == 0)
continue;
dsl_deadlist_space(&ds->ds_deadlist,
@@ -1967,15 +2546,18 @@
* so we need to subtract out the clone origin's used space.
*/
if (ddpa->origin_origin) {
- ddpa->used -= ddpa->origin_origin->ds_phys->ds_referenced_bytes;
- ddpa->comp -= ddpa->origin_origin->ds_phys->ds_compressed_bytes;
+ ddpa->used -=
+ dsl_dataset_phys(ddpa->origin_origin)->ds_referenced_bytes;
+ ddpa->comp -=
+ dsl_dataset_phys(ddpa->origin_origin)->ds_compressed_bytes;
ddpa->uncomp -=
- ddpa->origin_origin->ds_phys->ds_uncompressed_bytes;
+ dsl_dataset_phys(ddpa->origin_origin)->
+ ds_uncompressed_bytes;
}
- /* Check that there is enough space here */
+ /* Check that there is enough space and limit headroom here */
err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir,
- ddpa->used);
+ 0, ss_mv_cnt, ddpa->used, ddpa->cr);
if (err != 0)
goto out;
@@ -1985,7 +2567,7 @@
* it is the amount of space that will be on all of their
* deadlists (that was not born before their new origin).
*/
- if (hds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
+ if (dsl_dir_phys(hds->ds_dir)->dd_flags & DD_FLAG_USED_BREAKDOWN) {
uint64_t space;
/*
@@ -2007,9 +2589,11 @@
goto out;
ddpa->cloneusedsnap += space;
}
- if (origin_ds->ds_dir->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
+ if (dsl_dir_phys(origin_ds->ds_dir)->dd_flags &
+ DD_FLAG_USED_BREAKDOWN) {
err = snaplist_space(&ddpa->origin_snaps,
- origin_ds->ds_phys->ds_creation_txg, &ddpa->originusedsnap);
+ dsl_dataset_phys(origin_ds)->ds_creation_txg,
+ &ddpa->originusedsnap);
if (err != 0)
goto out;
}
@@ -2032,11 +2616,14 @@
dsl_dir_t *odd = NULL;
uint64_t oldnext_obj;
int64_t delta;
+#if defined(__FreeBSD__) && defined(_KERNEL)
+ char *oldname, *newname;
+#endif
VERIFY0(promote_hold(ddpa, dp, FTAG));
hds = ddpa->ddpa_clone;
- ASSERT0(hds->ds_phys->ds_flags & DS_FLAG_NOPROMOTE);
+ ASSERT0(dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE);
snap = list_head(&ddpa->shared_snaps);
origin_ds = snap->ds;
@@ -2054,49 +2641,59 @@
/* change origin's next snap */
dmu_buf_will_dirty(origin_ds->ds_dbuf, tx);
- oldnext_obj = origin_ds->ds_phys->ds_next_snap_obj;
+ oldnext_obj = dsl_dataset_phys(origin_ds)->ds_next_snap_obj;
snap = list_tail(&ddpa->clone_snaps);
- ASSERT3U(snap->ds->ds_phys->ds_prev_snap_obj, ==, origin_ds->ds_object);
- origin_ds->ds_phys->ds_next_snap_obj = snap->ds->ds_object;
+ ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==,
+ origin_ds->ds_object);
+ dsl_dataset_phys(origin_ds)->ds_next_snap_obj = snap->ds->ds_object;
/* change the origin's next clone */
- if (origin_ds->ds_phys->ds_next_clones_obj) {
+ if (dsl_dataset_phys(origin_ds)->ds_next_clones_obj) {
dsl_dataset_remove_from_next_clones(origin_ds,
snap->ds->ds_object, tx);
VERIFY0(zap_add_int(dp->dp_meta_objset,
- origin_ds->ds_phys->ds_next_clones_obj,
+ dsl_dataset_phys(origin_ds)->ds_next_clones_obj,
oldnext_obj, tx));
}
/* change origin */
dmu_buf_will_dirty(dd->dd_dbuf, tx);
- ASSERT3U(dd->dd_phys->dd_origin_obj, ==, origin_ds->ds_object);
- dd->dd_phys->dd_origin_obj = odd->dd_phys->dd_origin_obj;
+ ASSERT3U(dsl_dir_phys(dd)->dd_origin_obj, ==, origin_ds->ds_object);
+ dsl_dir_phys(dd)->dd_origin_obj = dsl_dir_phys(odd)->dd_origin_obj;
dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg;
dmu_buf_will_dirty(odd->dd_dbuf, tx);
- odd->dd_phys->dd_origin_obj = origin_ds->ds_object;
+ dsl_dir_phys(odd)->dd_origin_obj = origin_ds->ds_object;
origin_head->ds_dir->dd_origin_txg =
- origin_ds->ds_phys->ds_creation_txg;
+ dsl_dataset_phys(origin_ds)->ds_creation_txg;
/* change dd_clone entries */
if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
VERIFY0(zap_remove_int(dp->dp_meta_objset,
- odd->dd_phys->dd_clones, hds->ds_object, tx));
+ dsl_dir_phys(odd)->dd_clones, hds->ds_object, tx));
VERIFY0(zap_add_int(dp->dp_meta_objset,
- ddpa->origin_origin->ds_dir->dd_phys->dd_clones,
+ dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones,
hds->ds_object, tx));
VERIFY0(zap_remove_int(dp->dp_meta_objset,
- ddpa->origin_origin->ds_dir->dd_phys->dd_clones,
+ dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones,
origin_head->ds_object, tx));
- if (dd->dd_phys->dd_clones == 0) {
- dd->dd_phys->dd_clones = zap_create(dp->dp_meta_objset,
- DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
+ if (dsl_dir_phys(dd)->dd_clones == 0) {
+ dsl_dir_phys(dd)->dd_clones =
+ zap_create(dp->dp_meta_objset, DMU_OT_DSL_CLONES,
+ DMU_OT_NONE, 0, tx);
}
VERIFY0(zap_add_int(dp->dp_meta_objset,
- dd->dd_phys->dd_clones, origin_head->ds_object, tx));
+ dsl_dir_phys(dd)->dd_clones, origin_head->ds_object, tx));
}
+#if defined(__FreeBSD__) && defined(_KERNEL)
+ /* Take the spa_namespace_lock early so zvol renames don't deadlock. */
+ mutex_enter(&spa_namespace_lock);
+
+ oldname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+ newname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
+#endif
+
/* move snapshots to this dir */
for (snap = list_head(&ddpa->shared_snaps); snap;
snap = list_next(&ddpa->shared_snaps, snap)) {
@@ -2115,28 +2712,36 @@
/* move snap name entry */
VERIFY0(dsl_dataset_get_snapname(ds));
VERIFY0(dsl_dataset_snap_remove(origin_head,
- ds->ds_snapname, tx));
+ ds->ds_snapname, tx, B_TRUE));
VERIFY0(zap_add(dp->dp_meta_objset,
- hds->ds_phys->ds_snapnames_zapobj, ds->ds_snapname,
+ dsl_dataset_phys(hds)->ds_snapnames_zapobj, ds->ds_snapname,
8, 1, &ds->ds_object, tx));
+ dsl_fs_ss_count_adjust(hds->ds_dir, 1,
+ DD_FIELD_SNAPSHOT_COUNT, tx);
/* change containing dsl_dir */
dmu_buf_will_dirty(ds->ds_dbuf, tx);
- ASSERT3U(ds->ds_phys->ds_dir_obj, ==, odd->dd_object);
- ds->ds_phys->ds_dir_obj = dd->dd_object;
+ ASSERT3U(dsl_dataset_phys(ds)->ds_dir_obj, ==, odd->dd_object);
+ dsl_dataset_phys(ds)->ds_dir_obj = dd->dd_object;
ASSERT3P(ds->ds_dir, ==, odd);
dsl_dir_rele(ds->ds_dir, ds);
VERIFY0(dsl_dir_hold_obj(dp, dd->dd_object,
NULL, ds, &ds->ds_dir));
+#if defined(__FreeBSD__) && defined(_KERNEL)
+ dsl_dataset_name(ds, newname);
+ zfsvfs_update_fromname(oldname, newname);
+ zvol_rename_minors(oldname, newname);
+#endif
+
/* move any clone references */
- if (ds->ds_phys->ds_next_clones_obj &&
+ if (dsl_dataset_phys(ds)->ds_next_clones_obj &&
spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
zap_cursor_t zc;
zap_attribute_t za;
for (zap_cursor_init(&zc, dp->dp_meta_objset,
- ds->ds_phys->ds_next_clones_obj);
+ dsl_dataset_phys(ds)->ds_next_clones_obj);
zap_cursor_retrieve(&zc, &za) == 0;
zap_cursor_advance(&zc)) {
dsl_dataset_t *cnds;
@@ -2152,12 +2757,13 @@
VERIFY0(dsl_dataset_hold_obj(dp,
za.za_first_integer, FTAG, &cnds));
- o = cnds->ds_dir->dd_phys->dd_head_dataset_obj;
+ o = dsl_dir_phys(cnds->ds_dir)->
+ dd_head_dataset_obj;
VERIFY0(zap_remove_int(dp->dp_meta_objset,
- odd->dd_phys->dd_clones, o, tx));
+ dsl_dir_phys(odd)->dd_clones, o, tx));
VERIFY0(zap_add_int(dp->dp_meta_objset,
- dd->dd_phys->dd_clones, o, tx));
+ dsl_dir_phys(dd)->dd_clones, o, tx));
dsl_dataset_rele(cnds, FTAG);
}
zap_cursor_fini(&zc);
@@ -2166,6 +2772,12 @@
ASSERT(!dsl_prop_hascb(ds));
}
+#if defined(__FreeBSD__) && defined(_KERNEL)
+ mutex_exit(&spa_namespace_lock);
+
+ kmem_free(newname, MAXPATHLEN);
+ kmem_free(oldname, MAXPATHLEN);
+#endif
/*
* Change space accounting.
* Note, pa->*usedsnap and dd_used_breakdown[SNAP] will either
@@ -2174,7 +2786,7 @@
*/
delta = ddpa->cloneusedsnap -
- dd->dd_phys->dd_used_breakdown[DD_USED_SNAP];
+ dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP];
ASSERT3S(delta, >=, 0);
ASSERT3U(ddpa->used, >=, delta);
dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx);
@@ -2182,7 +2794,7 @@
ddpa->used - delta, ddpa->comp, ddpa->uncomp, tx);
delta = ddpa->originusedsnap -
- odd->dd_phys->dd_used_breakdown[DD_USED_SNAP];
+ dsl_dir_phys(odd)->dd_used_breakdown[DD_USED_SNAP];
ASSERT3S(delta, <=, 0);
ASSERT3U(ddpa->used, >=, -delta);
dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx);
@@ -2189,7 +2801,7 @@
dsl_dir_diduse_space(odd, DD_USED_HEAD,
-ddpa->used - delta, -ddpa->comp, -ddpa->uncomp, tx);
- origin_ds->ds_phys->ds_unique_bytes = ddpa->unique;
+ dsl_dataset_phys(origin_ds)->ds_unique_bytes = ddpa->unique;
/* log history record */
spa_history_log_internal_ds(hds, "promote", tx, "");
@@ -2224,12 +2836,12 @@
return (err);
if (first_obj == 0)
- first_obj = ds->ds_dir->dd_phys->dd_origin_obj;
+ first_obj = dsl_dir_phys(ds->ds_dir)->dd_origin_obj;
snap = kmem_alloc(sizeof (*snap), KM_SLEEP);
snap->ds = ds;
list_insert_tail(l, snap);
- obj = ds->ds_phys->ds_prev_snap_obj;
+ obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
}
return (0);
@@ -2279,13 +2891,13 @@
return (error);
dd = ddpa->ddpa_clone->ds_dir;
- if (dsl_dataset_is_snapshot(ddpa->ddpa_clone) ||
+ if (ddpa->ddpa_clone->ds_is_snapshot ||
!dsl_dir_is_clone(dd)) {
dsl_dataset_rele(ddpa->ddpa_clone, tag);
return (SET_ERROR(EINVAL));
}
- error = snaplist_make(dp, 0, dd->dd_phys->dd_origin_obj,
+ error = snaplist_make(dp, 0, dsl_dir_phys(dd)->dd_origin_obj,
&ddpa->shared_snaps, tag);
if (error != 0)
goto out;
@@ -2296,16 +2908,16 @@
goto out;
snap = list_head(&ddpa->shared_snaps);
- ASSERT3U(snap->ds->ds_object, ==, dd->dd_phys->dd_origin_obj);
- error = snaplist_make(dp, dd->dd_phys->dd_origin_obj,
- snap->ds->ds_dir->dd_phys->dd_head_dataset_obj,
+ ASSERT3U(snap->ds->ds_object, ==, dsl_dir_phys(dd)->dd_origin_obj);
+ error = snaplist_make(dp, dsl_dir_phys(dd)->dd_origin_obj,
+ dsl_dir_phys(snap->ds->ds_dir)->dd_head_dataset_obj,
&ddpa->origin_snaps, tag);
if (error != 0)
goto out;
- if (snap->ds->ds_dir->dd_phys->dd_origin_obj != 0) {
+ if (dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj != 0) {
error = dsl_dataset_hold_obj(dp,
- snap->ds->ds_dir->dd_phys->dd_origin_obj,
+ dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj,
tag, &ddpa->origin_origin);
if (error != 0)
goto out;
@@ -2331,7 +2943,7 @@
* Promote a clone.
*
* If it fails due to a conflicting snapshot name, "conflsnap" will be filled
- * in with the name. (It must be at least MAXNAMELEN bytes long.)
+ * in with the name. (It must be at least ZFS_MAX_DATASET_NAME_LEN bytes long.)
*/
int
dsl_dataset_promote(const char *name, char *conflsnap)
@@ -2349,7 +2961,8 @@
if (error != 0)
return (error);
error = zap_count(dmu_objset_pool(os)->dp_meta_objset,
- dmu_objset_ds(os)->ds_phys->ds_snapnames_zapobj, &numsnaps);
+ dsl_dataset_phys(dmu_objset_ds(os))->ds_snapnames_zapobj,
+ &numsnaps);
dmu_objset_rele(os, FTAG);
if (error != 0)
return (error);
@@ -2356,31 +2969,37 @@
ddpa.ddpa_clonename = name;
ddpa.err_ds = conflsnap;
+ ddpa.cr = CRED();
return (dsl_sync_task(name, dsl_dataset_promote_check,
- dsl_dataset_promote_sync, &ddpa, 2 + numsnaps));
+ dsl_dataset_promote_sync, &ddpa,
+ 2 + numsnaps, ZFS_SPACE_CHECK_RESERVED));
}
int
dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone,
- dsl_dataset_t *origin_head, boolean_t force)
+ dsl_dataset_t *origin_head, boolean_t force, void *owner, dmu_tx_t *tx)
{
+ /*
+ * "slack" factor for received datasets with refquota set on them.
+ * See the bottom of this function for details on its use.
+ */
+ uint64_t refquota_slack = DMU_MAX_ACCESS * spa_asize_inflation;
int64_t unused_refres_delta;
/* they should both be heads */
- if (dsl_dataset_is_snapshot(clone) ||
- dsl_dataset_is_snapshot(origin_head))
+ if (clone->ds_is_snapshot ||
+ origin_head->ds_is_snapshot)
return (SET_ERROR(EINVAL));
- /* the branch point should be just before them */
- if (clone->ds_prev != origin_head->ds_prev)
+ /* if we are not forcing, the branch point should be just before them */
+ if (!force && clone->ds_prev != origin_head->ds_prev)
return (SET_ERROR(EINVAL));
/* clone should be the clone (unless they are unrelated) */
if (clone->ds_prev != NULL &&
clone->ds_prev != clone->ds_dir->dd_pool->dp_origin_snap &&
- origin_head->ds_object !=
- clone->ds_prev->ds_phys->ds_next_snap_obj)
+ origin_head->ds_dir != clone->ds_prev->ds_dir)
return (SET_ERROR(EINVAL));
/* the clone should be a child of the origin */
@@ -2388,19 +3007,20 @@
return (SET_ERROR(EINVAL));
/* origin_head shouldn't be modified unless 'force' */
- if (!force && dsl_dataset_modified_since_lastsnap(origin_head))
+ if (!force &&
+ dsl_dataset_modified_since_snap(origin_head, origin_head->ds_prev))
return (SET_ERROR(ETXTBSY));
/* origin_head should have no long holds (e.g. is not mounted) */
- if (dsl_dataset_long_held(origin_head))
+ if (dsl_dataset_handoff_check(origin_head, owner, tx))
return (SET_ERROR(EBUSY));
/* check amount of any unconsumed refreservation */
unused_refres_delta =
(int64_t)MIN(origin_head->ds_reserved,
- origin_head->ds_phys->ds_unique_bytes) -
+ dsl_dataset_phys(origin_head)->ds_unique_bytes) -
(int64_t)MIN(origin_head->ds_reserved,
- clone->ds_phys->ds_unique_bytes);
+ dsl_dataset_phys(clone)->ds_unique_bytes);
if (unused_refres_delta > 0 &&
unused_refres_delta >
@@ -2407,9 +3027,22 @@
dsl_dir_space_available(origin_head->ds_dir, NULL, 0, TRUE))
return (SET_ERROR(ENOSPC));
- /* clone can't be over the head's refquota */
+ /*
+ * The clone can't be too much over the head's refquota.
+ *
+ * To ensure that the entire refquota can be used, we allow one
+ * transaction to exceed the refquota. Therefore, this check
+ * needs to also allow for the space referenced to be more than the
+ * refquota. The maximum amount of space that one transaction can use
+ * on disk is DMU_MAX_ACCESS * spa_asize_inflation. Allowing this
+ * overage ensures that we are able to receive a filesystem that
+ * exceeds the refquota on the source system.
+ *
+ * So that overage is the refquota_slack we use below.
+ */
if (origin_head->ds_quota != 0 &&
- clone->ds_phys->ds_referenced_bytes > origin_head->ds_quota)
+ dsl_dataset_phys(clone)->ds_referenced_bytes >
+ origin_head->ds_quota + refquota_slack)
return (SET_ERROR(EDQUOT));
return (0);
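To make the headroom concrete: the overage tolerated by the new check is bounded by one worst-case transaction. Assuming the stock tunables of this era (spa_asize_inflation defaults to 24), the condition only trips when roughly:

	/*
	 * referenced(clone) > refquota(origin_head) + 24 * DMU_MAX_ACCESS
	 *
	 * i.e. the received data may overshoot the refquota by at most the
	 * worst-case on-disk footprint of a single transaction; anything
	 * beyond that still returns EDQUOT.
	 */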
@@ -2423,9 +3056,49 @@
int64_t unused_refres_delta;
ASSERT(clone->ds_reserved == 0);
+ /*
+ * NOTE: On DEBUG kernels there could be a race between this and
+ * the check function if spa_asize_inflation is adjusted...
+ */
ASSERT(origin_head->ds_quota == 0 ||
- clone->ds_phys->ds_unique_bytes <= origin_head->ds_quota);
+ dsl_dataset_phys(clone)->ds_unique_bytes <= origin_head->ds_quota +
+ DMU_MAX_ACCESS * spa_asize_inflation);
+ ASSERT3P(clone->ds_prev, ==, origin_head->ds_prev);
+ /*
+ * Swap per-dataset feature flags.
+ */
+ for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+ if (!(spa_feature_table[f].fi_flags &
+ ZFEATURE_FLAG_PER_DATASET)) {
+ ASSERT(!clone->ds_feature_inuse[f]);
+ ASSERT(!origin_head->ds_feature_inuse[f]);
+ continue;
+ }
+
+ boolean_t clone_inuse = clone->ds_feature_inuse[f];
+ boolean_t origin_head_inuse = origin_head->ds_feature_inuse[f];
+
+ if (clone_inuse) {
+ dsl_dataset_deactivate_feature(clone->ds_object, f, tx);
+ clone->ds_feature_inuse[f] = B_FALSE;
+ }
+ if (origin_head_inuse) {
+ dsl_dataset_deactivate_feature(origin_head->ds_object,
+ f, tx);
+ origin_head->ds_feature_inuse[f] = B_FALSE;
+ }
+ if (clone_inuse) {
+ dsl_dataset_activate_feature(origin_head->ds_object,
+ f, tx);
+ origin_head->ds_feature_inuse[f] = B_TRUE;
+ }
+ if (origin_head_inuse) {
+ dsl_dataset_activate_feature(clone->ds_object, f, tx);
+ clone->ds_feature_inuse[f] = B_TRUE;
+ }
+ }
+
dmu_buf_will_dirty(clone->ds_dbuf, tx);
dmu_buf_will_dirty(origin_head->ds_dbuf, tx);
@@ -2441,9 +3114,9 @@
unused_refres_delta =
(int64_t)MIN(origin_head->ds_reserved,
- origin_head->ds_phys->ds_unique_bytes) -
+ dsl_dataset_phys(origin_head)->ds_unique_bytes) -
(int64_t)MIN(origin_head->ds_reserved,
- clone->ds_phys->ds_unique_bytes);
+ dsl_dataset_phys(clone)->ds_unique_bytes);
/*
* Reset origin's unique bytes, if it exists.
@@ -2454,16 +3127,21 @@
dmu_buf_will_dirty(origin->ds_dbuf, tx);
dsl_deadlist_space_range(&clone->ds_deadlist,
- origin->ds_phys->ds_prev_snap_txg, UINT64_MAX,
- &origin->ds_phys->ds_unique_bytes, &comp, &uncomp);
+ dsl_dataset_phys(origin)->ds_prev_snap_txg, UINT64_MAX,
+ &dsl_dataset_phys(origin)->ds_unique_bytes, &comp, &uncomp);
}
/* swap blkptrs */
{
+ rrw_enter(&clone->ds_bp_rwlock, RW_WRITER, FTAG);
+ rrw_enter(&origin_head->ds_bp_rwlock, RW_WRITER, FTAG);
blkptr_t tmp;
- tmp = origin_head->ds_phys->ds_bp;
- origin_head->ds_phys->ds_bp = clone->ds_phys->ds_bp;
- clone->ds_phys->ds_bp = tmp;
+ tmp = dsl_dataset_phys(origin_head)->ds_bp;
+ dsl_dataset_phys(origin_head)->ds_bp =
+ dsl_dataset_phys(clone)->ds_bp;
+ dsl_dataset_phys(clone)->ds_bp = tmp;
+ rrw_exit(&origin_head->ds_bp_rwlock, FTAG);
+ rrw_exit(&clone->ds_bp_rwlock, FTAG);
}
/* set dd_*_bytes */
@@ -2472,7 +3150,7 @@
uint64_t cdl_used, cdl_comp, cdl_uncomp;
uint64_t odl_used, odl_comp, odl_uncomp;
- ASSERT3U(clone->ds_dir->dd_phys->
+ ASSERT3U(dsl_dir_phys(clone->ds_dir)->
dd_used_breakdown[DD_USED_SNAP], ==, 0);
dsl_deadlist_space(&clone->ds_deadlist,
@@ -2480,13 +3158,18 @@
dsl_deadlist_space(&origin_head->ds_deadlist,
&odl_used, &odl_comp, &odl_uncomp);
- dused = clone->ds_phys->ds_referenced_bytes + cdl_used -
- (origin_head->ds_phys->ds_referenced_bytes + odl_used);
- dcomp = clone->ds_phys->ds_compressed_bytes + cdl_comp -
- (origin_head->ds_phys->ds_compressed_bytes + odl_comp);
- duncomp = clone->ds_phys->ds_uncompressed_bytes +
+ dused = dsl_dataset_phys(clone)->ds_referenced_bytes +
+ cdl_used -
+ (dsl_dataset_phys(origin_head)->ds_referenced_bytes +
+ odl_used);
+ dcomp = dsl_dataset_phys(clone)->ds_compressed_bytes +
+ cdl_comp -
+ (dsl_dataset_phys(origin_head)->ds_compressed_bytes +
+ odl_comp);
+ duncomp = dsl_dataset_phys(clone)->ds_uncompressed_bytes +
cdl_uncomp -
- (origin_head->ds_phys->ds_uncompressed_bytes + odl_uncomp);
+ (dsl_dataset_phys(origin_head)->ds_uncompressed_bytes +
+ odl_uncomp);
dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_HEAD,
dused, dcomp, duncomp, tx);
@@ -2506,18 +3189,18 @@
origin_head->ds_dir->dd_origin_txg, UINT64_MAX,
&odl_used, &odl_comp, &odl_uncomp);
dsl_dir_transfer_space(origin_head->ds_dir, cdl_used - odl_used,
- DD_USED_HEAD, DD_USED_SNAP, tx);
+ DD_USED_HEAD, DD_USED_SNAP, NULL);
}
/* swap ds_*_bytes */
- SWITCH64(origin_head->ds_phys->ds_referenced_bytes,
- clone->ds_phys->ds_referenced_bytes);
- SWITCH64(origin_head->ds_phys->ds_compressed_bytes,
- clone->ds_phys->ds_compressed_bytes);
- SWITCH64(origin_head->ds_phys->ds_uncompressed_bytes,
- clone->ds_phys->ds_uncompressed_bytes);
- SWITCH64(origin_head->ds_phys->ds_unique_bytes,
- clone->ds_phys->ds_unique_bytes);
+ SWITCH64(dsl_dataset_phys(origin_head)->ds_referenced_bytes,
+ dsl_dataset_phys(clone)->ds_referenced_bytes);
+ SWITCH64(dsl_dataset_phys(origin_head)->ds_compressed_bytes,
+ dsl_dataset_phys(clone)->ds_compressed_bytes);
+ SWITCH64(dsl_dataset_phys(origin_head)->ds_uncompressed_bytes,
+ dsl_dataset_phys(clone)->ds_uncompressed_bytes);
+ SWITCH64(dsl_dataset_phys(origin_head)->ds_unique_bytes,
+ dsl_dataset_phys(clone)->ds_unique_bytes);
/* apply any parent delta for change in unconsumed refreservation */
dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_REFRSRV,
@@ -2528,12 +3211,12 @@
*/
dsl_deadlist_close(&clone->ds_deadlist);
dsl_deadlist_close(&origin_head->ds_deadlist);
- SWITCH64(origin_head->ds_phys->ds_deadlist_obj,
- clone->ds_phys->ds_deadlist_obj);
+ SWITCH64(dsl_dataset_phys(origin_head)->ds_deadlist_obj,
+ dsl_dataset_phys(clone)->ds_deadlist_obj);
dsl_deadlist_open(&clone->ds_deadlist, dp->dp_meta_objset,
- clone->ds_phys->ds_deadlist_obj);
+ dsl_dataset_phys(clone)->ds_deadlist_obj);
dsl_deadlist_open(&origin_head->ds_deadlist, dp->dp_meta_objset,
- origin_head->ds_phys->ds_deadlist_obj);
+ dsl_dataset_phys(origin_head)->ds_deadlist_obj);
dsl_scan_ds_clone_swapped(origin_head, clone, tx);
@@ -2584,10 +3267,11 @@
/*
* Make a space adjustment for reserved bytes.
*/
- if (ds->ds_reserved > ds->ds_phys->ds_unique_bytes) {
+ if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes) {
ASSERT3U(*used, >=,
- ds->ds_reserved - ds->ds_phys->ds_unique_bytes);
- *used -= (ds->ds_reserved - ds->ds_phys->ds_unique_bytes);
+ ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes);
+ *used -=
+ (ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes);
*ref_rsrv =
asize - MIN(asize, parent_delta(ds, asize + inflight));
}
@@ -2602,9 +3286,10 @@
* on-disk is over quota and there are no pending changes (which
* may free up space for us).
*/
- if (ds->ds_phys->ds_referenced_bytes + inflight >= ds->ds_quota) {
+ if (dsl_dataset_phys(ds)->ds_referenced_bytes + inflight >=
+ ds->ds_quota) {
if (inflight > 0 ||
- ds->ds_phys->ds_referenced_bytes < ds->ds_quota)
+ dsl_dataset_phys(ds)->ds_referenced_bytes < ds->ds_quota)
error = SET_ERROR(ERESTART);
else
error = SET_ERROR(EDQUOT);
@@ -2638,7 +3323,7 @@
if (error != 0)
return (error);
- if (dsl_dataset_is_snapshot(ds)) {
+ if (ds->ds_is_snapshot) {
dsl_dataset_rele(ds, FTAG);
return (SET_ERROR(EINVAL));
}
@@ -2656,7 +3341,7 @@
return (0);
}
- if (newval < ds->ds_phys->ds_referenced_bytes ||
+ if (newval < dsl_dataset_phys(ds)->ds_referenced_bytes ||
newval < ds->ds_reserved) {
dsl_dataset_rele(ds, FTAG);
return (SET_ERROR(ENOSPC));
@@ -2702,7 +3387,7 @@
ddsqra.ddsqra_value = refquota;
return (dsl_sync_task(dsname, dsl_dataset_set_refquota_check,
- dsl_dataset_set_refquota_sync, &ddsqra, 0));
+ dsl_dataset_set_refquota_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE));
}
static int
@@ -2721,7 +3406,7 @@
if (error != 0)
return (error);
- if (dsl_dataset_is_snapshot(ds)) {
+ if (ds->ds_is_snapshot) {
dsl_dataset_rele(ds, FTAG);
return (SET_ERROR(EINVAL));
}
@@ -2746,7 +3431,7 @@
mutex_enter(&ds->ds_lock);
if (!DS_UNIQUE_IS_ACCURATE(ds))
dsl_dataset_recalc_head_uniq(ds);
- unique = ds->ds_phys->ds_unique_bytes;
+ unique = dsl_dataset_phys(ds)->ds_unique_bytes;
mutex_exit(&ds->ds_lock);
if (MAX(unique, newval) > MAX(unique, ds->ds_reserved)) {
@@ -2783,7 +3468,7 @@
mutex_enter(&ds->ds_dir->dd_lock);
mutex_enter(&ds->ds_lock);
ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
- unique = ds->ds_phys->ds_unique_bytes;
+ unique = dsl_dataset_phys(ds)->ds_unique_bytes;
delta = MAX(0, (int64_t)(newval - unique)) -
MAX(0, (int64_t)(ds->ds_reserved - unique));
ds->ds_reserved = newval;
@@ -2817,7 +3502,8 @@
ddsqra.ddsqra_value = refreservation;
return (dsl_sync_task(dsname, dsl_dataset_set_refreservation_check,
- dsl_dataset_set_refreservation_sync, &ddsqra, 0));
+ dsl_dataset_set_refreservation_sync, &ddsqra,
+ 0, ZFS_SPACE_CHECK_NONE));
}
/*
@@ -2848,16 +3534,16 @@
ASSERT(dsl_pool_config_held(dp));
*usedp = 0;
- *usedp += new->ds_phys->ds_referenced_bytes;
- *usedp -= oldsnap->ds_phys->ds_referenced_bytes;
+ *usedp += dsl_dataset_phys(new)->ds_referenced_bytes;
+ *usedp -= dsl_dataset_phys(oldsnap)->ds_referenced_bytes;
*compp = 0;
- *compp += new->ds_phys->ds_compressed_bytes;
- *compp -= oldsnap->ds_phys->ds_compressed_bytes;
+ *compp += dsl_dataset_phys(new)->ds_compressed_bytes;
+ *compp -= dsl_dataset_phys(oldsnap)->ds_compressed_bytes;
*uncompp = 0;
- *uncompp += new->ds_phys->ds_uncompressed_bytes;
- *uncompp -= oldsnap->ds_phys->ds_uncompressed_bytes;
+ *uncompp += dsl_dataset_phys(new)->ds_uncompressed_bytes;
+ *uncompp -= dsl_dataset_phys(oldsnap)->ds_uncompressed_bytes;
snapobj = new->ds_object;
while (snapobj != oldsnap->ds_object) {
@@ -2872,8 +3558,8 @@
break;
}
- if (snap->ds_phys->ds_prev_snap_txg ==
- oldsnap->ds_phys->ds_creation_txg) {
+ if (dsl_dataset_phys(snap)->ds_prev_snap_txg ==
+ dsl_dataset_phys(oldsnap)->ds_creation_txg) {
/*
* The blocks in the deadlist can not be born after
* ds_prev_snap_txg, so get the whole deadlist space,
@@ -2886,7 +3572,7 @@
&used, &comp, &uncomp);
} else {
dsl_deadlist_space_range(&snap->ds_deadlist,
- 0, oldsnap->ds_phys->ds_creation_txg,
+ 0, dsl_dataset_phys(oldsnap)->ds_creation_txg,
&used, &comp, &uncomp);
}
*usedp += used;
@@ -2898,7 +3584,7 @@
* (ds_prev_snap_obj == 0) before oldsnap, then oldsnap
* was not a snapshot of/before new.
*/
- snapobj = snap->ds_phys->ds_prev_snap_obj;
+ snapobj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
if (snap != new)
dsl_dataset_rele(snap, FTAG);
if (snapobj == 0) {
@@ -2934,8 +3620,8 @@
uint64_t snapobj;
dsl_pool_t *dp = firstsnap->ds_dir->dd_pool;
- ASSERT(dsl_dataset_is_snapshot(firstsnap));
- ASSERT(dsl_dataset_is_snapshot(lastsnap));
+ ASSERT(firstsnap->ds_is_snapshot);
+ ASSERT(lastsnap->ds_is_snapshot);
/*
* Check that the snapshots are in the same dsl_dir, and firstsnap
@@ -2942,13 +3628,13 @@
* is before lastsnap.
*/
if (firstsnap->ds_dir != lastsnap->ds_dir ||
- firstsnap->ds_phys->ds_creation_txg >
- lastsnap->ds_phys->ds_creation_txg)
+ dsl_dataset_phys(firstsnap)->ds_creation_txg >
+ dsl_dataset_phys(lastsnap)->ds_creation_txg)
return (SET_ERROR(EINVAL));
*usedp = *compp = *uncompp = 0;
- snapobj = lastsnap->ds_phys->ds_next_snap_obj;
+ snapobj = dsl_dataset_phys(lastsnap)->ds_next_snap_obj;
while (snapobj != firstsnap->ds_object) {
dsl_dataset_t *ds;
uint64_t used, comp, uncomp;
@@ -2958,13 +3644,13 @@
break;
dsl_deadlist_space_range(&ds->ds_deadlist,
- firstsnap->ds_phys->ds_prev_snap_txg, UINT64_MAX,
+ dsl_dataset_phys(firstsnap)->ds_prev_snap_txg, UINT64_MAX,
&used, &comp, &uncomp);
*usedp += used;
*compp += comp;
*uncompp += uncomp;
- snapobj = ds->ds_phys->ds_prev_snap_obj;
+ snapobj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
ASSERT3U(snapobj, !=, 0);
dsl_dataset_rele(ds, FTAG);
}
@@ -2977,9 +3663,12 @@
* 'earlier' is before 'later'. Or 'earlier' could be the origin of
* 'later's filesystem. Or 'earlier' could be an older snapshot in the origin's
* filesystem. Or 'earlier' could be the origin's origin.
+ *
+ * If non-zero, earlier_txg is used instead of earlier's ds_creation_txg.
*/
boolean_t
-dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier)
+dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier,
+ uint64_t earlier_txg)
{
dsl_pool_t *dp = later->ds_dir->dd_pool;
int error;
@@ -2986,9 +3675,13 @@
boolean_t ret;
ASSERT(dsl_pool_config_held(dp));
+ ASSERT(earlier->ds_is_snapshot || earlier_txg != 0);
- if (earlier->ds_phys->ds_creation_txg >=
- later->ds_phys->ds_creation_txg)
+ if (earlier_txg == 0)
+ earlier_txg = dsl_dataset_phys(earlier)->ds_creation_txg;
+
+ if (later->ds_is_snapshot &&
+ earlier_txg >= dsl_dataset_phys(later)->ds_creation_txg)
return (B_FALSE);
if (later->ds_dir == earlier->ds_dir)
@@ -2996,14 +3689,38 @@
if (!dsl_dir_is_clone(later->ds_dir))
return (B_FALSE);
- if (later->ds_dir->dd_phys->dd_origin_obj == earlier->ds_object)
+ if (dsl_dir_phys(later->ds_dir)->dd_origin_obj == earlier->ds_object)
return (B_TRUE);
dsl_dataset_t *origin;
error = dsl_dataset_hold_obj(dp,
- later->ds_dir->dd_phys->dd_origin_obj, FTAG, &origin);
+ dsl_dir_phys(later->ds_dir)->dd_origin_obj, FTAG, &origin);
if (error != 0)
return (B_FALSE);
- ret = dsl_dataset_is_before(origin, earlier);
+ ret = dsl_dataset_is_before(origin, earlier, earlier_txg);
dsl_dataset_rele(origin, FTAG);
return (ret);
}
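A short usage sketch of the extended signature; later_ds, earlier_snap, earlier_head and bookmark_txg are hypothetical variables, not names from this diff:

	boolean_t before;

	/* Snapshot case: pass 0 and earlier's own ds_creation_txg is used. */
	before = dsl_dataset_is_before(later_ds, earlier_snap, 0);

	/* Caller already knows a txg for "earlier" (e.g. from a bookmark). */
	before = dsl_dataset_is_before(later_ds, earlier_head, bookmark_txg);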
+
+void
+dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+ dmu_object_zapify(mos, ds->ds_object, DMU_OT_DSL_DATASET, tx);
+}
+
+boolean_t
+dsl_dataset_is_zapified(dsl_dataset_t *ds)
+{
+ dmu_object_info_t doi;
+
+ dmu_object_info_from_db(ds->ds_dbuf, &doi);
+ return (doi.doi_type == DMU_OTN_ZAP_METADATA);
+}
+
+boolean_t
+dsl_dataset_has_resume_receive_state(dsl_dataset_t *ds)
+{
+ return (dsl_dataset_is_zapified(ds) &&
+ zap_contains(ds->ds_dir->dd_pool->dp_meta_objset,
+ ds->ds_object, DS_FIELD_RESUME_TOGUID) == 0);
+}
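Most of the mechanical churn in this file replaces direct ds_phys/dd_phys dereferences with accessor calls. For reference, the accessors this conversion assumes are thin inline wrappers over the bonus-buffer data, roughly as below; the exact definitions live in dsl_dataset.h and dsl_dir.h and may differ in detail:

	static inline dsl_dataset_phys_t *
	dsl_dataset_phys(dsl_dataset_t *ds)
	{
		/* The on-disk state is kept in the dataset's bonus dbuf. */
		return (ds->ds_dbuf->db_data);
	}

	static inline dsl_dir_phys_t *
	dsl_dir_phys(dsl_dir_t *dd)
	{
		return (dd->dd_dbuf->db_data);
	}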
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -22,6 +22,8 @@
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
*/
#include <sys/dsl_dataset.h>
@@ -120,6 +122,8 @@
void *cookie = NULL;
dsl_deadlist_entry_t *dle;
+ dl->dl_os = NULL;
+
if (dl->dl_oldfmt) {
dl->dl_oldfmt = B_FALSE;
bpobj_close(&dl->dl_bpobj);
@@ -144,7 +148,7 @@
dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx)
{
if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
- return (bpobj_alloc(os, SPA_MAXBLOCKSIZE, tx));
+ return (bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx));
return (zap_create(os, DMU_OT_DEADLIST, DMU_OT_DEADLIST_HDR,
sizeof (dsl_deadlist_phys_t), tx));
}
@@ -181,7 +185,7 @@
{
if (dle->dle_bpobj.bpo_object ==
dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) {
- uint64_t obj = bpobj_alloc(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
+ uint64_t obj = bpobj_alloc(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
bpobj_close(&dle->dle_bpobj);
bpobj_decr_empty(dl->dl_os, tx);
VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
@@ -255,7 +259,7 @@
dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
dle->dle_mintxg = mintxg;
- obj = bpobj_alloc_empty(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
+ obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
avl_add(&dl->dl_tree, dle);
@@ -309,8 +313,9 @@
while (mrs_obj != 0) {
dsl_dataset_t *ds;
VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, mrs_obj, FTAG, &ds));
- dsl_deadlist_add_key(&dl, ds->ds_phys->ds_prev_snap_txg, tx);
- mrs_obj = ds->ds_phys->ds_prev_snap_obj;
+ dsl_deadlist_add_key(&dl,
+ dsl_dataset_phys(ds)->ds_prev_snap_txg, tx);
+ mrs_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
dsl_dataset_rele(ds, FTAG);
}
dsl_deadlist_close(&dl);
@@ -339,7 +344,7 @@
if (dle->dle_mintxg >= maxtxg)
break;
- obj = bpobj_alloc_empty(dl->dl_os, SPA_MAXBLOCKSIZE, tx);
+ obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, newobj,
dle->dle_mintxg, obj, tx));
}
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,7 +21,7 @@
*/
/*
* Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
*/
/*
@@ -165,10 +165,10 @@
VERIFY0(dsl_dir_hold(dp, dda->dda_name, FTAG, &dd, NULL));
- zapobj = dd->dd_phys->dd_deleg_zapobj;
+ zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj;
if (zapobj == 0) {
dmu_buf_will_dirty(dd->dd_dbuf, tx);
- zapobj = dd->dd_phys->dd_deleg_zapobj = zap_create(mos,
+ zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj = zap_create(mos,
DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx);
}
@@ -209,7 +209,7 @@
uint64_t zapobj;
VERIFY0(dsl_dir_hold(dp, dda->dda_name, FTAG, &dd, NULL));
- zapobj = dd->dd_phys->dd_deleg_zapobj;
+ zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj;
if (zapobj == 0) {
dsl_dir_rele(dd, FTAG);
return;
@@ -283,7 +283,7 @@
return (dsl_sync_task(ddname, dsl_deleg_check,
unset ? dsl_deleg_unset_sync : dsl_deleg_set_sync,
- &dda, fnvlist_num_pairs(nvp)));
+ &dda, fnvlist_num_pairs(nvp), ZFS_SPACE_CHECK_RESERVED));
}
/*
@@ -331,16 +331,16 @@
zap_attribute_t baseza;
nvlist_t *sp_nvp;
uint64_t n;
- char source[MAXNAMELEN];
+ char source[ZFS_MAX_DATASET_NAME_LEN];
- if (dd->dd_phys->dd_deleg_zapobj == 0 ||
- zap_count(mos, dd->dd_phys->dd_deleg_zapobj, &n) != 0 ||
- n == 0)
+ if (dsl_dir_phys(dd)->dd_deleg_zapobj == 0 ||
+ zap_count(mos,
+ dsl_dir_phys(dd)->dd_deleg_zapobj, &n) != 0 || n == 0)
continue;
sp_nvp = fnvlist_alloc();
for (zap_cursor_init(&basezc, mos,
- dd->dd_phys->dd_deleg_zapobj);
+ dsl_dir_phys(dd)->dd_deleg_zapobj);
zap_cursor_retrieve(&basezc, &baseza) == 0;
zap_cursor_advance(&basezc)) {
zap_cursor_t zc;
@@ -562,7 +562,7 @@
SPA_VERSION_DELEGATED_PERMS)
return (SET_ERROR(EPERM));
- if (dsl_dataset_is_snapshot(ds)) {
+ if (ds->ds_is_snapshot) {
/*
* Snapshots are treated as descendents only,
* local permissions do not apply.
@@ -595,7 +595,7 @@
if (!zoned)
break;
}
- zapobj = dd->dd_phys->dd_deleg_zapobj;
+ zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj;
if (zapobj == 0)
continue;
@@ -674,7 +674,7 @@
{
objset_t *mos = dd->dd_pool->dp_meta_objset;
uint64_t jumpobj, pjumpobj;
- uint64_t zapobj = dd->dd_phys->dd_deleg_zapobj;
+ uint64_t zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj;
zap_cursor_t zc;
zap_attribute_t za;
char whokey[ZFS_MAX_DELEG_NAME];
@@ -687,7 +687,7 @@
if (zapobj == 0) {
dmu_buf_will_dirty(dd->dd_dbuf, tx);
- zapobj = dd->dd_phys->dd_deleg_zapobj = zap_create(mos,
+ zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj = zap_create(mos,
DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx);
}
@@ -725,7 +725,7 @@
return;
for (dd = sdd->dd_parent; dd != NULL; dd = dd->dd_parent) {
- uint64_t pzapobj = dd->dd_phys->dd_deleg_zapobj;
+ uint64_t pzapobj = dsl_dir_phys(dd)->dd_deleg_zapobj;
if (pzapobj == 0)
continue;
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,8 +21,10 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
* Copyright (c) 2013 Steven Hartland. All rights reserved.
+ * Copyright (c) 2013 by Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
*/
#include <sys/zfs_context.h>
@@ -39,6 +41,7 @@
#include <sys/zfeature.h>
#include <sys/zfs_ioctl.h>
#include <sys/dsl_deleg.h>
+#include <sys/dmu_impl.h>
typedef struct dmu_snapshots_destroy_arg {
nvlist_t *dsda_snaps;
@@ -47,13 +50,10 @@
nvlist_t *dsda_errlist;
} dmu_snapshots_destroy_arg_t;
-/*
- * ds must be owned.
- */
-static int
+int
dsl_destroy_snapshot_check_impl(dsl_dataset_t *ds, boolean_t defer)
{
- if (!dsl_dataset_is_snapshot(ds))
+ if (!ds->ds_is_snapshot)
return (SET_ERROR(EINVAL));
if (dsl_dataset_long_held(ds))
@@ -80,7 +80,7 @@
/*
* Can't delete a branch point.
*/
- if (ds->ds_phys->ds_num_children > 1)
+ if (dsl_dataset_phys(ds)->ds_num_children > 1)
return (SET_ERROR(EEXIST));
return (0);
@@ -147,12 +147,14 @@
struct process_old_arg *poa = arg;
dsl_pool_t *dp = poa->ds->ds_dir->dd_pool;
- if (bp->blk_birth <= poa->ds->ds_phys->ds_prev_snap_txg) {
+ ASSERT(!BP_IS_HOLE(bp));
+
+ if (bp->blk_birth <= dsl_dataset_phys(poa->ds)->ds_prev_snap_txg) {
dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx);
if (poa->ds_prev && !poa->after_branch_point &&
bp->blk_birth >
- poa->ds_prev->ds_phys->ds_prev_snap_txg) {
- poa->ds_prev->ds_phys->ds_unique_bytes +=
+ dsl_dataset_phys(poa->ds_prev)->ds_prev_snap_txg) {
+ dsl_dataset_phys(poa->ds_prev)->ds_unique_bytes +=
bp_get_dsize_sync(dp->dp_spa, bp);
}
} else {
@@ -183,7 +185,7 @@
VERIFY0(bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj,
process_old_cb, &poa, tx));
VERIFY0(zio_wait(poa.pio));
- ASSERT3U(poa.used, ==, ds->ds_phys->ds_unique_bytes);
+ ASSERT3U(poa.used, ==, dsl_dataset_phys(ds)->ds_unique_bytes);
/* change snapused */
dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
@@ -192,12 +194,14 @@
/* swap next's deadlist to our deadlist */
dsl_deadlist_close(&ds->ds_deadlist);
dsl_deadlist_close(&ds_next->ds_deadlist);
- deadlist_obj = ds->ds_phys->ds_deadlist_obj;
- ds->ds_phys->ds_deadlist_obj = ds_next->ds_phys->ds_deadlist_obj;
- ds_next->ds_phys->ds_deadlist_obj = deadlist_obj;
- dsl_deadlist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
+ deadlist_obj = dsl_dataset_phys(ds)->ds_deadlist_obj;
+ dsl_dataset_phys(ds)->ds_deadlist_obj =
+ dsl_dataset_phys(ds_next)->ds_deadlist_obj;
+ dsl_dataset_phys(ds_next)->ds_deadlist_obj = deadlist_obj;
+ dsl_deadlist_open(&ds->ds_deadlist, mos,
+ dsl_dataset_phys(ds)->ds_deadlist_obj);
dsl_deadlist_open(&ds_next->ds_deadlist, mos,
- ds_next->ds_phys->ds_deadlist_obj);
+ dsl_dataset_phys(ds_next)->ds_deadlist_obj);
}
static void
@@ -212,10 +216,10 @@
* find the clones, but dsl_deadlist_remove_key() is a no-op so it
* doesn't matter.
*/
- if (ds->ds_dir->dd_phys->dd_clones == 0)
+ if (dsl_dir_phys(ds->ds_dir)->dd_clones == 0)
return;
- for (zap_cursor_init(&zc, mos, ds->ds_dir->dd_phys->dd_clones);
+ for (zap_cursor_init(&zc, mos, dsl_dir_phys(ds->ds_dir)->dd_clones);
zap_cursor_retrieve(&zc, &za) == 0;
zap_cursor_advance(&zc)) {
dsl_dataset_t *clone;
@@ -243,19 +247,22 @@
uint64_t obj;
ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
- ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg);
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+ ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg);
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
ASSERT(refcount_is_zero(&ds->ds_longholds));
if (defer &&
- (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1)) {
+ (ds->ds_userrefs > 0 ||
+ dsl_dataset_phys(ds)->ds_num_children > 1)) {
ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS);
dmu_buf_will_dirty(ds->ds_dbuf, tx);
- ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY;
+ dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_DEFER_DESTROY;
spa_history_log_internal_ds(ds, "defer_destroy", tx, "");
return;
}
- ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
+ ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);
/* We need to log before removing it from the namespace. */
spa_history_log_internal_ds(ds, "destroy", tx, "");
@@ -264,26 +271,34 @@
obj = ds->ds_object;
- if (ds->ds_phys->ds_prev_snap_obj != 0) {
+ for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+ if (ds->ds_feature_inuse[f]) {
+ dsl_dataset_deactivate_feature(obj, f, tx);
+ ds->ds_feature_inuse[f] = B_FALSE;
+ }
+ }
+ if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
ASSERT3P(ds->ds_prev, ==, NULL);
VERIFY0(dsl_dataset_hold_obj(dp,
- ds->ds_phys->ds_prev_snap_obj, FTAG, &ds_prev));
+ dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &ds_prev));
after_branch_point =
- (ds_prev->ds_phys->ds_next_snap_obj != obj);
+ (dsl_dataset_phys(ds_prev)->ds_next_snap_obj != obj);
dmu_buf_will_dirty(ds_prev->ds_dbuf, tx);
if (after_branch_point &&
- ds_prev->ds_phys->ds_next_clones_obj != 0) {
+ dsl_dataset_phys(ds_prev)->ds_next_clones_obj != 0) {
dsl_dataset_remove_from_next_clones(ds_prev, obj, tx);
- if (ds->ds_phys->ds_next_snap_obj != 0) {
+ if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) {
VERIFY0(zap_add_int(mos,
- ds_prev->ds_phys->ds_next_clones_obj,
- ds->ds_phys->ds_next_snap_obj, tx));
+ dsl_dataset_phys(ds_prev)->
+ ds_next_clones_obj,
+ dsl_dataset_phys(ds)->ds_next_snap_obj,
+ tx));
}
}
if (!after_branch_point) {
- ds_prev->ds_phys->ds_next_snap_obj =
- ds->ds_phys->ds_next_snap_obj;
+ dsl_dataset_phys(ds_prev)->ds_next_snap_obj =
+ dsl_dataset_phys(ds)->ds_next_snap_obj;
}
}
@@ -292,18 +307,18 @@
uint64_t used = 0, comp = 0, uncomp = 0;
VERIFY0(dsl_dataset_hold_obj(dp,
- ds->ds_phys->ds_next_snap_obj, FTAG, &ds_next));
- ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj);
+ dsl_dataset_phys(ds)->ds_next_snap_obj, FTAG, &ds_next));
+ ASSERT3U(dsl_dataset_phys(ds_next)->ds_prev_snap_obj, ==, obj);
- old_unique = ds_next->ds_phys->ds_unique_bytes;
+ old_unique = dsl_dataset_phys(ds_next)->ds_unique_bytes;
dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
- ds_next->ds_phys->ds_prev_snap_obj =
- ds->ds_phys->ds_prev_snap_obj;
- ds_next->ds_phys->ds_prev_snap_txg =
- ds->ds_phys->ds_prev_snap_txg;
- ASSERT3U(ds->ds_phys->ds_prev_snap_txg, ==,
- ds_prev ? ds_prev->ds_phys->ds_creation_txg : 0);
+ dsl_dataset_phys(ds_next)->ds_prev_snap_obj =
+ dsl_dataset_phys(ds)->ds_prev_snap_obj;
+ dsl_dataset_phys(ds_next)->ds_prev_snap_txg =
+ dsl_dataset_phys(ds)->ds_prev_snap_txg;
+ ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, ==,
+ ds_prev ? dsl_dataset_phys(ds_prev)->ds_creation_txg : 0);
if (ds_next->ds_deadlist.dl_oldfmt) {
process_old_deadlist(ds, ds_prev, ds_next,
@@ -312,15 +327,15 @@
/* Adjust prev's unique space. */
if (ds_prev && !after_branch_point) {
dsl_deadlist_space_range(&ds_next->ds_deadlist,
- ds_prev->ds_phys->ds_prev_snap_txg,
- ds->ds_phys->ds_prev_snap_txg,
+ dsl_dataset_phys(ds_prev)->ds_prev_snap_txg,
+ dsl_dataset_phys(ds)->ds_prev_snap_txg,
&used, &comp, &uncomp);
- ds_prev->ds_phys->ds_unique_bytes += used;
+ dsl_dataset_phys(ds_prev)->ds_unique_bytes += used;
}
/* Adjust snapused. */
dsl_deadlist_space_range(&ds_next->ds_deadlist,
- ds->ds_phys->ds_prev_snap_txg, UINT64_MAX,
+ dsl_dataset_phys(ds)->ds_prev_snap_txg, UINT64_MAX,
&used, &comp, &uncomp);
dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
-used, -comp, -uncomp, tx);
@@ -327,7 +342,7 @@
/* Move blocks to be freed to pool's free list. */
dsl_deadlist_move_bpobj(&ds_next->ds_deadlist,
- &dp->dp_free_bpobj, ds->ds_phys->ds_prev_snap_txg,
+ &dp->dp_free_bpobj, dsl_dataset_phys(ds)->ds_prev_snap_txg,
tx);
dsl_dir_diduse_space(tx->tx_pool->dp_free_dir,
DD_USED_HEAD, used, comp, uncomp, tx);
@@ -334,18 +349,18 @@
/* Merge our deadlist into next's and free it. */
dsl_deadlist_merge(&ds_next->ds_deadlist,
- ds->ds_phys->ds_deadlist_obj, tx);
+ dsl_dataset_phys(ds)->ds_deadlist_obj, tx);
}
dsl_deadlist_close(&ds->ds_deadlist);
- dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
+ dsl_deadlist_free(mos, dsl_dataset_phys(ds)->ds_deadlist_obj, tx);
dmu_buf_will_dirty(ds->ds_dbuf, tx);
- ds->ds_phys->ds_deadlist_obj = 0;
+ dsl_dataset_phys(ds)->ds_deadlist_obj = 0;
/* Collapse range in clone heads */
dsl_dataset_remove_clones_key(ds,
- ds->ds_phys->ds_creation_txg, tx);
+ dsl_dataset_phys(ds)->ds_creation_txg, tx);
- if (dsl_dataset_is_snapshot(ds_next)) {
+ if (ds_next->ds_is_snapshot) {
dsl_dataset_t *ds_nextnext;
/*
@@ -358,12 +373,13 @@
* deadlist).
*/
VERIFY0(dsl_dataset_hold_obj(dp,
- ds_next->ds_phys->ds_next_snap_obj, FTAG, &ds_nextnext));
+ dsl_dataset_phys(ds_next)->ds_next_snap_obj,
+ FTAG, &ds_nextnext));
dsl_deadlist_space_range(&ds_nextnext->ds_deadlist,
- ds->ds_phys->ds_prev_snap_txg,
- ds->ds_phys->ds_creation_txg,
+ dsl_dataset_phys(ds)->ds_prev_snap_txg,
+ dsl_dataset_phys(ds)->ds_creation_txg,
&used, &comp, &uncomp);
- ds_next->ds_phys->ds_unique_bytes += used;
+ dsl_dataset_phys(ds_next)->ds_unique_bytes += used;
dsl_dataset_rele(ds_nextnext, FTAG);
ASSERT3P(ds_next->ds_prev, ==, NULL);
@@ -370,9 +386,9 @@
/* Collapse range in this head. */
dsl_dataset_t *hds;
VERIFY0(dsl_dataset_hold_obj(dp,
- ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &hds));
+ dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &hds));
dsl_deadlist_remove_key(&hds->ds_deadlist,
- ds->ds_phys->ds_creation_txg, tx);
+ dsl_dataset_phys(ds)->ds_creation_txg, tx);
dsl_dataset_rele(hds, FTAG);
} else {
@@ -381,7 +397,7 @@
ds_next->ds_prev = NULL;
if (ds_prev) {
VERIFY0(dsl_dataset_hold_obj(dp,
- ds->ds_phys->ds_prev_snap_obj,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj,
ds_next, &ds_next->ds_prev));
}
@@ -395,7 +411,7 @@
if (old_unique < ds_next->ds_reserved) {
int64_t mrsdelta;
uint64_t new_unique =
- ds_next->ds_phys->ds_unique_bytes;
+ dsl_dataset_phys(ds_next)->ds_unique_bytes;
ASSERT(old_unique <= new_unique);
mrsdelta = MIN(new_unique - old_unique,
@@ -417,9 +433,9 @@
/* remove from snapshot namespace */
dsl_dataset_t *ds_head;
- ASSERT(ds->ds_phys->ds_snapnames_zapobj == 0);
+ ASSERT(dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0);
VERIFY0(dsl_dataset_hold_obj(dp,
- ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG, &ds_head));
+ dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &ds_head));
VERIFY0(dsl_dataset_get_snapname(ds));
#ifdef ZFS_DEBUG
{
@@ -431,7 +447,7 @@
ASSERT3U(val, ==, obj);
}
#endif
- VERIFY0(dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx));
+ VERIFY0(dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx, B_TRUE));
dsl_dataset_rele(ds_head, FTAG);
if (ds_prev != NULL)
@@ -439,20 +455,23 @@
spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
- if (ds->ds_phys->ds_next_clones_obj != 0) {
+ if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
uint64_t count;
ASSERT0(zap_count(mos,
- ds->ds_phys->ds_next_clones_obj, &count) && count == 0);
+ dsl_dataset_phys(ds)->ds_next_clones_obj, &count) &&
+ count == 0);
VERIFY0(dmu_object_free(mos,
- ds->ds_phys->ds_next_clones_obj, tx));
+ dsl_dataset_phys(ds)->ds_next_clones_obj, tx));
}
- if (ds->ds_phys->ds_props_obj != 0)
- VERIFY0(zap_destroy(mos, ds->ds_phys->ds_props_obj, tx));
- if (ds->ds_phys->ds_userrefs_obj != 0)
- VERIFY0(zap_destroy(mos, ds->ds_phys->ds_userrefs_obj, tx));
+ if (dsl_dataset_phys(ds)->ds_props_obj != 0)
+ VERIFY0(zap_destroy(mos, dsl_dataset_phys(ds)->ds_props_obj,
+ tx));
+ if (dsl_dataset_phys(ds)->ds_userrefs_obj != 0)
+ VERIFY0(zap_destroy(mos, dsl_dataset_phys(ds)->ds_userrefs_obj,
+ tx));
dsl_dir_rele(ds->ds_dir, ds);
ds->ds_dir = NULL;
- VERIFY0(dmu_object_free(mos, obj, tx));
+ dmu_object_free_zapified(mos, obj, tx);
}
static void
@@ -506,7 +525,7 @@
error = dsl_sync_task(nvpair_name(pair),
dsl_destroy_snapshot_check, dsl_destroy_snapshot_sync,
- &dsda, 0);
+ &dsda, 0, ZFS_SPACE_CHECK_NONE);
fnvlist_free(dsda.dsda_successful_snaps);
return (error);
@@ -534,12 +553,12 @@
/* ARGSUSED */
static int
kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
- const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
+ const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
struct killarg *ka = arg;
dmu_tx_t *tx = ka->tx;
- if (bp == NULL)
+ if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
return (0);
if (zb->zb_level == ZB_ZIL_LEVEL) {
@@ -551,7 +570,8 @@
dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp);
} else {
ASSERT(zilog == NULL);
- ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg);
+ ASSERT3U(bp->blk_birth, >,
+ dsl_dataset_phys(ka->ds)->ds_prev_snap_txg);
(void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE);
}
@@ -573,9 +593,10 @@
ka.ds = ds;
ka.tx = tx;
VERIFY0(traverse_dataset(ds,
- ds->ds_phys->ds_prev_snap_txg, TRAVERSE_POST,
+ dsl_dataset_phys(ds)->ds_prev_snap_txg, TRAVERSE_POST,
kill_blkptr, &ka));
- ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ds->ds_phys->ds_unique_bytes == 0);
+ ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
+ dsl_dataset_phys(ds)->ds_unique_bytes == 0);
}
typedef struct dsl_destroy_head_arg {
@@ -589,7 +610,8 @@
uint64_t count;
objset_t *mos;
- if (dsl_dataset_is_snapshot(ds))
+ ASSERT(!ds->ds_is_snapshot);
+ if (ds->ds_is_snapshot)
return (SET_ERROR(EINVAL));
if (refcount_count(&ds->ds_longholds) != expected_holds)
@@ -603,7 +625,7 @@
* from.)
*/
if (ds->ds_prev != NULL &&
- ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object)
+ dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj == ds->ds_object)
return (SET_ERROR(EBUSY));
/*
@@ -610,7 +632,7 @@
* Can't delete if there are children of this fs.
*/
error = zap_count(mos,
- ds->ds_dir->dd_phys->dd_child_dir_zapobj, &count);
+ dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, &count);
if (error != 0)
return (error);
if (count != 0)
@@ -617,7 +639,7 @@
return (SET_ERROR(EEXIST));
if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev) &&
- ds->ds_prev->ds_phys->ds_num_children == 2 &&
+ dsl_dataset_phys(ds->ds_prev)->ds_num_children == 2 &&
ds->ds_prev->ds_userrefs == 0) {
/* We need to remove the origin snapshot as well. */
if (!refcount_is_zero(&ds->ds_prev->ds_longholds))
@@ -655,27 +677,39 @@
VERIFY0(dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd));
- ASSERT0(dd->dd_phys->dd_head_dataset_obj);
+ ASSERT0(dsl_dir_phys(dd)->dd_head_dataset_obj);
/*
+ * Decrement the filesystem count for all parent filesystems.
+ *
+ * When we receive an incremental stream into a filesystem that already
+ * exists, a temporary clone is created. We never count this temporary
+ * clone, whose name begins with a '%'.
+ */
+ if (dd->dd_myname[0] != '%' && dd->dd_parent != NULL)
+ dsl_fs_ss_count_adjust(dd->dd_parent, -1,
+ DD_FIELD_FILESYSTEM_COUNT, tx);
+
+ /*
* Remove our reservation. The impl() routine avoids setting the
* actual property, which would require the (already destroyed) ds.
*/
dsl_dir_set_reservation_sync_impl(dd, 0, tx);
- ASSERT0(dd->dd_phys->dd_used_bytes);
- ASSERT0(dd->dd_phys->dd_reserved);
+ ASSERT0(dsl_dir_phys(dd)->dd_used_bytes);
+ ASSERT0(dsl_dir_phys(dd)->dd_reserved);
for (t = 0; t < DD_USED_NUM; t++)
- ASSERT0(dd->dd_phys->dd_used_breakdown[t]);
+ ASSERT0(dsl_dir_phys(dd)->dd_used_breakdown[t]);
- VERIFY0(zap_destroy(mos, dd->dd_phys->dd_child_dir_zapobj, tx));
- VERIFY0(zap_destroy(mos, dd->dd_phys->dd_props_zapobj, tx));
- VERIFY0(dsl_deleg_destroy(mos, dd->dd_phys->dd_deleg_zapobj, tx));
+ VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_child_dir_zapobj, tx));
+ VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_props_zapobj, tx));
+ VERIFY0(dsl_deleg_destroy(mos, dsl_dir_phys(dd)->dd_deleg_zapobj, tx));
VERIFY0(zap_remove(mos,
- dd->dd_parent->dd_phys->dd_child_dir_zapobj, dd->dd_myname, tx));
+ dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj,
+ dd->dd_myname, tx));
dsl_dir_rele(dd, FTAG);
- VERIFY0(dmu_object_free(mos, ddobj, tx));
+ dmu_object_free_zapified(mos, ddobj, tx);
}
void
@@ -686,10 +720,12 @@
uint64_t obj, ddobj, prevobj = 0;
boolean_t rmorigin;
- ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
+ ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);
ASSERT(ds->ds_prev == NULL ||
- ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object);
- ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg);
+ dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj != ds->ds_object);
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+ ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg);
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
/* We need to log before removing it from the namespace. */
@@ -697,10 +733,10 @@
rmorigin = (dsl_dir_is_clone(ds->ds_dir) &&
DS_IS_DEFER_DESTROY(ds->ds_prev) &&
- ds->ds_prev->ds_phys->ds_num_children == 2 &&
+ dsl_dataset_phys(ds->ds_prev)->ds_num_children == 2 &&
ds->ds_prev->ds_userrefs == 0);
- /* Remove our reservation */
+ /* Remove our reservation. */
if (ds->ds_reserved != 0) {
dsl_dataset_set_refreservation_sync_impl(ds,
(ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED),
@@ -708,30 +744,34 @@
ASSERT0(ds->ds_reserved);
}
+ obj = ds->ds_object;
+
+ for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
+ if (ds->ds_feature_inuse[f]) {
+ dsl_dataset_deactivate_feature(obj, f, tx);
+ ds->ds_feature_inuse[f] = B_FALSE;
+ }
+ }
+
dsl_scan_ds_destroyed(ds, tx);
- obj = ds->ds_object;
-
- if (ds->ds_phys->ds_prev_snap_obj != 0) {
+ if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
/* This is a clone */
ASSERT(ds->ds_prev != NULL);
- ASSERT3U(ds->ds_prev->ds_phys->ds_next_snap_obj, !=, obj);
- ASSERT0(ds->ds_phys->ds_next_snap_obj);
+ ASSERT3U(dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj, !=,
+ obj);
+ ASSERT0(dsl_dataset_phys(ds)->ds_next_snap_obj);
dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
- if (ds->ds_prev->ds_phys->ds_next_clones_obj != 0) {
+ if (dsl_dataset_phys(ds->ds_prev)->ds_next_clones_obj != 0) {
dsl_dataset_remove_from_next_clones(ds->ds_prev,
obj, tx);
}
- ASSERT3U(ds->ds_prev->ds_phys->ds_num_children, >, 1);
- ds->ds_prev->ds_phys->ds_num_children--;
+ ASSERT3U(dsl_dataset_phys(ds->ds_prev)->ds_num_children, >, 1);
+ dsl_dataset_phys(ds->ds_prev)->ds_num_children--;
}
- zfeature_info_t *async_destroy =
- &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY];
- objset_t *os;
-
/*
* Destroy the deadlist. Unless it's a clone, the
* deadlist should be empty. (If it's a clone, it's
@@ -738,13 +778,14 @@
* safe to ignore the deadlist contents.)
*/
dsl_deadlist_close(&ds->ds_deadlist);
- dsl_deadlist_free(mos, ds->ds_phys->ds_deadlist_obj, tx);
+ dsl_deadlist_free(mos, dsl_dataset_phys(ds)->ds_deadlist_obj, tx);
dmu_buf_will_dirty(ds->ds_dbuf, tx);
- ds->ds_phys->ds_deadlist_obj = 0;
+ dsl_dataset_phys(ds)->ds_deadlist_obj = 0;
+ objset_t *os;
VERIFY0(dmu_objset_from_ds(ds, &os));
- if (!spa_feature_is_enabled(dp->dp_spa, async_destroy)) {
+ if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) {
old_synchronous_dataset_destroy(ds, tx);
} else {
/*
@@ -755,10 +796,11 @@
zil_destroy_sync(dmu_objset_zil(os), tx);
- if (!spa_feature_is_active(dp->dp_spa, async_destroy)) {
+ if (!spa_feature_is_active(dp->dp_spa,
+ SPA_FEATURE_ASYNC_DESTROY)) {
dsl_scan_t *scn = dp->dp_scan;
-
- spa_feature_incr(dp->dp_spa, async_destroy, tx);
+ spa_feature_incr(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY,
+ tx);
dp->dp_bptree_obj = bptree_alloc(mos, tx);
VERIFY0(zap_add(mos,
DMU_POOL_DIRECTORY_OBJECT,
@@ -768,16 +810,19 @@
scn->scn_async_destroying = B_TRUE;
}
- used = ds->ds_dir->dd_phys->dd_used_bytes;
- comp = ds->ds_dir->dd_phys->dd_compressed_bytes;
- uncomp = ds->ds_dir->dd_phys->dd_uncompressed_bytes;
+ used = dsl_dir_phys(ds->ds_dir)->dd_used_bytes;
+ comp = dsl_dir_phys(ds->ds_dir)->dd_compressed_bytes;
+ uncomp = dsl_dir_phys(ds->ds_dir)->dd_uncompressed_bytes;
ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) ||
- ds->ds_phys->ds_unique_bytes == used);
+ dsl_dataset_phys(ds)->ds_unique_bytes == used);
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
bptree_add(mos, dp->dp_bptree_obj,
- &ds->ds_phys->ds_bp, ds->ds_phys->ds_prev_snap_txg,
+ &dsl_dataset_phys(ds)->ds_bp,
+ dsl_dataset_phys(ds)->ds_prev_snap_txg,
used, comp, uncomp, tx);
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD,
-used, -comp, -uncomp, tx);
dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
@@ -787,7 +832,7 @@
if (ds->ds_prev != NULL) {
if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
VERIFY0(zap_remove_int(mos,
- ds->ds_prev->ds_dir->dd_phys->dd_clones,
+ dsl_dir_phys(ds->ds_prev->ds_dir)->dd_clones,
ds->ds_object, tx));
}
prevobj = ds->ds_prev->ds_object;
@@ -806,19 +851,25 @@
/* Erase the link in the dir */
dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
- ds->ds_dir->dd_phys->dd_head_dataset_obj = 0;
+ dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj = 0;
ddobj = ds->ds_dir->dd_object;
- ASSERT(ds->ds_phys->ds_snapnames_zapobj != 0);
- VERIFY0(zap_destroy(mos, ds->ds_phys->ds_snapnames_zapobj, tx));
+ ASSERT(dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0);
+ VERIFY0(zap_destroy(mos,
+ dsl_dataset_phys(ds)->ds_snapnames_zapobj, tx));
+ if (ds->ds_bookmarks != 0) {
+ VERIFY0(zap_destroy(mos, ds->ds_bookmarks, tx));
+ spa_feature_decr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx);
+ }
+
spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx);
- ASSERT0(ds->ds_phys->ds_next_clones_obj);
- ASSERT0(ds->ds_phys->ds_props_obj);
- ASSERT0(ds->ds_phys->ds_userrefs_obj);
+ ASSERT0(dsl_dataset_phys(ds)->ds_next_clones_obj);
+ ASSERT0(dsl_dataset_phys(ds)->ds_props_obj);
+ ASSERT0(dsl_dataset_phys(ds)->ds_userrefs_obj);
dsl_dir_rele(ds->ds_dir, ds);
ds->ds_dir = NULL;
- VERIFY0(dmu_object_free(mos, obj, tx));
+ dmu_object_free_zapified(mos, obj, tx);
dsl_dir_destroy_sync(ddobj, tx);
@@ -853,7 +904,7 @@
/* Mark it as inconsistent on-disk, in case we crash */
dmu_buf_will_dirty(ds->ds_dbuf, tx);
- ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT;
+ dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_INCONSISTENT;
spa_history_log_internal_ds(ds, "destroy begin", tx, "");
dsl_dataset_rele(ds, FTAG);
@@ -874,8 +925,7 @@
error = spa_open(name, &spa, FTAG);
if (error != 0)
return (error);
- isenabled = spa_feature_is_enabled(spa,
- &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY]);
+ isenabled = spa_feature_is_enabled(spa, SPA_FEATURE_ASYNC_DESTROY);
spa_close(spa, FTAG);
ddha.ddha_name = name;
@@ -884,7 +934,8 @@
objset_t *os;
error = dsl_sync_task(name, dsl_destroy_head_check,
- dsl_destroy_head_begin_sync, &ddha, 0);
+ dsl_destroy_head_begin_sync, &ddha,
+ 0, ZFS_SPACE_CHECK_NONE);
if (error != 0)
return (error);
@@ -896,11 +947,12 @@
error = dmu_objset_own(name, DMU_OST_ANY, B_FALSE, FTAG, &os);
if (error == 0) {
uint64_t prev_snap_txg =
- dmu_objset_ds(os)->ds_phys->ds_prev_snap_txg;
+ dsl_dataset_phys(dmu_objset_ds(os))->
+ ds_prev_snap_txg;
for (uint64_t obj = 0; error == 0;
error = dmu_object_next(os, &obj, FALSE,
prev_snap_txg))
- (void) dmu_free_object(os, obj);
+ (void) dmu_free_long_object(os, obj);
/* sync out all frees */
txg_wait_synced(dmu_objset_pool(os), 0);
dmu_objset_disown(os, FTAG);
@@ -908,7 +960,7 @@
}
return (dsl_sync_task(name, dsl_destroy_head_check,
- dsl_destroy_head_sync, &ddha, 0));
+ dsl_destroy_head_sync, &ddha, 0, ZFS_SPACE_CHECK_NONE));
}
/*
@@ -924,9 +976,17 @@
objset_t *os;
if (dmu_objset_hold(dsname, FTAG, &os) == 0) {
- boolean_t inconsistent = DS_IS_INCONSISTENT(dmu_objset_ds(os));
+ boolean_t need_destroy = DS_IS_INCONSISTENT(dmu_objset_ds(os));
+
+ /*
+ * If the dataset is inconsistent because a resumable receive
+ * has failed, then do not destroy it.
+ */
+ if (dsl_dataset_has_resume_receive_state(dmu_objset_ds(os)))
+ need_destroy = B_FALSE;
+
dmu_objset_rele(os, FTAG);
- if (inconsistent)
+ if (need_destroy)
(void) dsl_destroy_head(dsname);
}
return (0);
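For reference, the spa feature checks above all move from the old table-pointer form to the enum-based form; a minimal before/after sketch using the same identifiers as this diff:

	/* old form: pass a pointer into spa_feature_table */
	isenabled = spa_feature_is_enabled(spa,
	    &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY]);

	/* new form: pass the spa_feature_t enum directly */
	isenabled = spa_feature_is_enabled(spa, SPA_FEATURE_ASYNC_DESTROY);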
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -23,7 +23,10 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>.
* All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
*/
#include <sys/dmu.h>
@@ -34,6 +37,7 @@
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/dsl_deleg.h>
+#include <sys/dmu_impl.h>
#include <sys/spa.h>
#include <sys/metaslab.h>
#include <sys/zap.h>
@@ -44,18 +48,100 @@
#ifdef _KERNEL
#include <sys/zfs_vfsops.h>
#endif
+#include <sys/zfeature.h>
+#include <sys/policy.h>
+#include <sys/zfs_znode.h>
#include "zfs_namecheck.h"
+#include "zfs_prop.h"
+/*
+ * Filesystem and Snapshot Limits
+ * ------------------------------
+ *
+ * These limits are used to restrict the number of filesystems and/or snapshots
+ * that can be created at a given level in the tree or below. A typical
+ * use-case is with a delegated dataset where the administrator wants to ensure
+ * that a user within the zone is not creating too many additional filesystems
+ * or snapshots, even though they're not exceeding their space quota.
+ *
+ * The filesystem and snapshot counts are stored as extensible properties. This
+ * capability is controlled by a feature flag and must be enabled to be used.
+ * Once enabled, the feature is not active until the first limit is set. At
+ * that point, future operations to create/destroy filesystems or snapshots
+ * will validate and update the counts.
+ *
+ * Because the count properties will not exist before the feature is active,
+ * the counts are updated when a limit is first set on an uninitialized
+ * dsl_dir node in the tree (The filesystem/snapshot count on a node includes
+ * all of the nested filesystems/snapshots. Thus, a new leaf node has a
+ * filesystem count of 0 and a snapshot count of 0. Non-existent filesystem and
+ * snapshot count properties on a node indicate uninitialized counts on that
+ * node.) When first setting a limit on an uninitialized node, the code starts
+ * at the filesystem with the new limit and descends into all sub-filesystems
+ * to add the count properties.
+ *
+ * In practice this is lightweight since a limit is typically set when the
+ * filesystem is created and thus has no children. Once valid, changing the
+ * limit value won't require a re-traversal since the counts are already valid.
+ * When recursively fixing the counts, if a node with a limit is encountered
+ * during the descent, the counts are known to be valid and there is no need to
+ * descend into that filesystem's children. The counts on filesystems above the
+ * one with the new limit will still be uninitialized, unless a limit is
+ * eventually set on one of those filesystems. The counts are always recursively
+ * updated when a limit is set on a dataset, unless there is already a limit.
+ * When a new limit value is set on a filesystem with an existing limit, it is
+ * possible for the new limit to be less than the current count at that level
+ * since a user who can change the limit is also allowed to exceed the limit.
+ *
+ * Once the feature is active, then whenever a filesystem or snapshot is
+ * created, the code recurses up the tree, validating the new count against the
+ * limit at each initialized level. In practice, most levels will not have a
+ * limit set. If there is a limit at any initialized level up the tree, the
+ * check must pass or the creation will fail. Likewise, when a filesystem or
+ * snapshot is destroyed, the counts are recursively adjusted all the way up
+ * the initialized nodes in the tree. Renaming a filesystem into a different point
+ * in the tree will first validate, then update the counts on each branch up to
+ * the common ancestor. A receive will also validate the counts and then update
+ * them.
+ *
+ * An exception to the above behavior is that the limit is not enforced if the
+ * user has permission to modify the limit. This is primarily so that
+ * recursive snapshots in the global zone always work. We want to prevent a
+ * denial-of-service in which a lower level delegated dataset could max out its
+ * limit and thus block recursive snapshots from being taken in the global zone.
+ * Because of this, it is possible for the snapshot count to be over the limit
+ * and snapshots taken in the global zone could cause a lower level dataset to
+ * hit or exceed its limit. The administrator taking the global zone recursive
+ * snapshot should be aware of this side-effect and behave accordingly.
+ * For consistency, the filesystem limit is also not enforced if the user can
+ * modify the limit.
+ *
+ * The filesystem and snapshot limits are validated by dsl_fs_ss_limit_check()
+ * and updated by dsl_fs_ss_count_adjust(). A new limit value is set up in
+ * dsl_dir_activate_fs_ss_limit() and the counts are adjusted, if necessary, by
+ * dsl_dir_init_fs_ss_count().
+ *
+ * There is a special case when we receive a filesystem that already exists. In
+ * this case a temporary clone name of %X is created (see dmu_recv_begin). We
+ * never update the filesystem counts for temporary clones.
+ *
+ * Likewise, we do not update the snapshot counts for temporary snapshots,
+ * such as those created by zfs diff.
+ */
+ */
+
+extern inline dsl_dir_phys_t *dsl_dir_phys(dsl_dir_t *dd);
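A minimal sketch of how the counts described above are stored and read, using only calls that appear later in this file (error handling omitted; the enclosing sync task supplies tx):

	uint64_t count;
	objset_t *os = dd->dd_pool->dp_meta_objset;

	/* counts live as extensible (ZAP) entries on the dsl_dir object */
	dsl_dir_zapify(dd, tx);
	if (zap_lookup(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
	    sizeof (count), 1, &count) == ENOENT) {
		/* uninitialized: no limit has been set here or above */
	}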
+
static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd);
-/* ARGSUSED */
static void
-dsl_dir_evict(dmu_buf_t *db, void *arg)
+dsl_dir_evict(void *dbu)
{
- dsl_dir_t *dd = arg;
+ dsl_dir_t *dd = dbu;
dsl_pool_t *dp = dd->dd_pool;
int t;
+ dd->dd_dbuf = NULL;
+
for (t = 0; t < TXG_SIZE; t++) {
ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t));
ASSERT(dd->dd_tempreserved[t] == 0);
@@ -63,15 +149,11 @@
}
if (dd->dd_parent)
- dsl_dir_rele(dd->dd_parent, dd);
+ dsl_dir_async_rele(dd->dd_parent, dd);
- spa_close(dd->dd_pool->dp_spa, dd);
+ spa_async_close(dd->dd_pool->dp_spa, dd);
- /*
- * The props callback list should have been cleaned up by
- * objset_evict().
- */
- list_destroy(&dd->dd_prop_cbs);
+ dsl_prop_fini(dd);
mutex_destroy(&dd->dd_lock);
kmem_free(dd, sizeof (dsl_dir_t));
}
@@ -94,7 +176,7 @@
{
dmu_object_info_t doi;
dmu_object_info_from_db(dbuf, &doi);
- ASSERT3U(doi.doi_type, ==, DMU_OT_DSL_DIR);
+ ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_DSL_DIR);
ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t));
}
#endif
@@ -105,17 +187,15 @@
dd->dd_object = ddobj;
dd->dd_dbuf = dbuf;
dd->dd_pool = dp;
- dd->dd_phys = dbuf->db_data;
mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL);
+ dsl_prop_init(dd);
- list_create(&dd->dd_prop_cbs, sizeof (dsl_prop_cb_record_t),
- offsetof(dsl_prop_cb_record_t, cbr_node));
-
dsl_dir_snap_cmtime_update(dd);
- if (dd->dd_phys->dd_parent_obj) {
- err = dsl_dir_hold_obj(dp, dd->dd_phys->dd_parent_obj,
- NULL, dd, &dd->dd_parent);
+ if (dsl_dir_phys(dd)->dd_parent_obj) {
+ err = dsl_dir_hold_obj(dp,
+ dsl_dir_phys(dd)->dd_parent_obj, NULL, dd,
+ &dd->dd_parent);
if (err != 0)
goto errout;
if (tail) {
@@ -123,14 +203,16 @@
uint64_t foundobj;
err = zap_lookup(dp->dp_meta_objset,
- dd->dd_parent->dd_phys->dd_child_dir_zapobj,
- tail, sizeof (foundobj), 1, &foundobj);
+ dsl_dir_phys(dd->dd_parent)->
+ dd_child_dir_zapobj, tail,
+ sizeof (foundobj), 1, &foundobj);
ASSERT(err || foundobj == ddobj);
#endif
(void) strcpy(dd->dd_myname, tail);
} else {
err = zap_value_search(dp->dp_meta_objset,
- dd->dd_parent->dd_phys->dd_child_dir_zapobj,
+ dsl_dir_phys(dd->dd_parent)->
+ dd_child_dir_zapobj,
ddobj, 0, dd->dd_myname);
}
if (err != 0)
@@ -149,7 +231,8 @@
* Just look at its phys directly instead.
*/
err = dmu_bonus_hold(dp->dp_meta_objset,
- dd->dd_phys->dd_origin_obj, FTAG, &origin_bonus);
+ dsl_dir_phys(dd)->dd_origin_obj, FTAG,
+ &origin_bonus);
if (err != 0)
goto errout;
origin_phys = origin_bonus->db_data;
@@ -158,11 +241,12 @@
dmu_buf_rele(origin_bonus, FTAG);
}
- winner = dmu_buf_set_user_ie(dbuf, dd, &dd->dd_phys,
- dsl_dir_evict);
- if (winner) {
+ dmu_buf_init_user(&dd->dd_dbu, dsl_dir_evict, &dd->dd_dbuf);
+ winner = dmu_buf_set_user_ie(dbuf, &dd->dd_dbu);
+ if (winner != NULL) {
if (dd->dd_parent)
dsl_dir_rele(dd->dd_parent, dd);
+ dsl_prop_fini(dd);
mutex_destroy(&dd->dd_lock);
kmem_free(dd, sizeof (dsl_dir_t));
dd = winner;
@@ -190,6 +274,7 @@
errout:
if (dd->dd_parent)
dsl_dir_rele(dd->dd_parent, dd);
+ dsl_prop_fini(dd);
mutex_destroy(&dd->dd_lock);
kmem_free(dd, sizeof (dsl_dir_t));
dmu_buf_rele(dbuf, tag);
@@ -204,13 +289,29 @@
dmu_buf_rele(dd->dd_dbuf, tag);
}
-/* buf must be long enough (MAXNAMELEN + strlen(MOS_DIR_NAME) + 1 should do) */
+/*
+ * Remove a reference to the given dsl dir that is being asynchronously
+ * released. Async releases occur from a taskq performing eviction of
+ * dsl datasets and dirs. This process is identical to a normal release
+ * with the exception of using the async API for releasing the reference on
+ * the spa.
+ */
void
+dsl_dir_async_rele(dsl_dir_t *dd, void *tag)
+{
+ dprintf_dd(dd, "%s\n", "");
+ spa_async_close(dd->dd_pool->dp_spa, tag);
+ dmu_buf_rele(dd->dd_dbuf, tag);
+}
+
+/* buf must be at least ZFS_MAX_DATASET_NAME_LEN bytes */
+void
dsl_dir_name(dsl_dir_t *dd, char *buf)
{
if (dd->dd_parent) {
dsl_dir_name(dd->dd_parent, buf);
- (void) strcat(buf, "/");
+ VERIFY3U(strlcat(buf, "/", ZFS_MAX_DATASET_NAME_LEN), <,
+ ZFS_MAX_DATASET_NAME_LEN);
} else {
buf[0] = '\0';
}
@@ -220,10 +321,12 @@
* dprintf_dd() with dd_lock held
*/
mutex_enter(&dd->dd_lock);
- (void) strcat(buf, dd->dd_myname);
+ VERIFY3U(strlcat(buf, dd->dd_myname, ZFS_MAX_DATASET_NAME_LEN),
+ <, ZFS_MAX_DATASET_NAME_LEN);
mutex_exit(&dd->dd_lock);
} else {
- (void) strcat(buf, dd->dd_myname);
+ VERIFY3U(strlcat(buf, dd->dd_myname, ZFS_MAX_DATASET_NAME_LEN),
+ <, ZFS_MAX_DATASET_NAME_LEN);
}
}
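A caller-side sketch of the buffer contract stated in the comment above (the same constant is used for name buffers elsewhere in this diff):

	char buf[ZFS_MAX_DATASET_NAME_LEN];

	dsl_dir_name(dd, buf);	/* buf must hold ZFS_MAX_DATASET_NAME_LEN bytes */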
@@ -272,12 +375,12 @@
if (p != NULL &&
(p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0'))
return (SET_ERROR(EINVAL));
- if (strlen(path) >= MAXNAMELEN)
+ if (strlen(path) >= ZFS_MAX_DATASET_NAME_LEN)
return (SET_ERROR(ENAMETOOLONG));
(void) strcpy(component, path);
p = NULL;
} else if (p[0] == '/') {
- if (p - path >= MAXNAMELEN)
+ if (p - path >= ZFS_MAX_DATASET_NAME_LEN)
return (SET_ERROR(ENAMETOOLONG));
(void) strncpy(component, path, p - path);
component[p - path] = '\0';
@@ -289,7 +392,7 @@
*/
if (strchr(path, '/'))
return (SET_ERROR(EINVAL));
- if (p - path >= MAXNAMELEN)
+ if (p - path >= ZFS_MAX_DATASET_NAME_LEN)
return (SET_ERROR(ENAMETOOLONG));
(void) strncpy(component, path, p - path);
component[p - path] = '\0';
@@ -311,7 +414,7 @@
dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag,
dsl_dir_t **ddp, const char **tailp)
{
- char buf[MAXNAMELEN];
+ char buf[ZFS_MAX_DATASET_NAME_LEN];
const char *spaname, *next, *nextnext = NULL;
int err;
dsl_dir_t *dd;
@@ -324,7 +427,7 @@
/* Make sure the name is in the specified pool. */
spaname = spa_name(dp->dp_spa);
if (strcmp(buf, spaname) != 0)
- return (SET_ERROR(EINVAL));
+ return (SET_ERROR(EXDEV));
ASSERT(dsl_pool_config_held(dp));
@@ -334,7 +437,7 @@
}
while (next != NULL) {
- dsl_dir_t *child_ds;
+ dsl_dir_t *child_dd;
err = getcomponent(next, buf, &nextnext);
if (err != 0)
break;
@@ -342,10 +445,10 @@
if (next[0] == '@')
break;
dprintf("looking up %s in obj%lld\n",
- buf, dd->dd_phys->dd_child_dir_zapobj);
+ buf, dsl_dir_phys(dd)->dd_child_dir_zapobj);
err = zap_lookup(dp->dp_meta_objset,
- dd->dd_phys->dd_child_dir_zapobj,
+ dsl_dir_phys(dd)->dd_child_dir_zapobj,
buf, sizeof (ddobj), 1, &ddobj);
if (err != 0) {
if (err == ENOENT)
@@ -353,11 +456,11 @@
break;
}
- err = dsl_dir_hold_obj(dp, ddobj, buf, tag, &child_ds);
+ err = dsl_dir_hold_obj(dp, ddobj, buf, tag, &child_dd);
if (err != 0)
break;
dsl_dir_rele(dd, tag);
- dd = child_ds;
+ dd = child_dd;
next = nextnext;
}
@@ -383,6 +486,404 @@
return (err);
}
+/*
+ * If the counts are already initialized for this filesystem and its
+ * descendants then do nothing, otherwise initialize the counts.
+ *
+ * The counts on this filesystem, and those below, may be uninitialized due to
+ * either the use of a pre-existing pool which did not support the
+ * filesystem/snapshot limit feature, or one in which the feature had not yet
+ * been enabled.
+ *
+ * Recursively descend the filesystem tree and update the filesystem/snapshot
+ * counts on each filesystem below, then update the cumulative count on the
+ * current filesystem. If the filesystem already has a count set on it,
+ * then we know that its counts, and the counts on the filesystems below it,
+ * are already correct, so we don't have to update this filesystem.
+ */
+static void
+dsl_dir_init_fs_ss_count(dsl_dir_t *dd, dmu_tx_t *tx)
+{
+ uint64_t my_fs_cnt = 0;
+ uint64_t my_ss_cnt = 0;
+ dsl_pool_t *dp = dd->dd_pool;
+ objset_t *os = dp->dp_meta_objset;
+ zap_cursor_t *zc;
+ zap_attribute_t *za;
+ dsl_dataset_t *ds;
+
+ ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT));
+ ASSERT(dsl_pool_config_held(dp));
+ ASSERT(dmu_tx_is_syncing(tx));
+
+ dsl_dir_zapify(dd, tx);
+
+ /*
+ * If the filesystem count has already been initialized then we
+ * don't need to recurse down any further.
+ */
+ if (zap_contains(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT) == 0)
+ return;
+
+ zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP);
+ za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+
+ /* Iterate my child dirs */
+ for (zap_cursor_init(zc, os, dsl_dir_phys(dd)->dd_child_dir_zapobj);
+ zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) {
+ dsl_dir_t *chld_dd;
+ uint64_t count;
+
+ VERIFY0(dsl_dir_hold_obj(dp, za->za_first_integer, NULL, FTAG,
+ &chld_dd));
+
+ /*
+ * Ignore hidden ($FREE, $MOS & $ORIGIN) objsets and
+ * temporary datasets.
+ */
+ if (chld_dd->dd_myname[0] == '$' ||
+ chld_dd->dd_myname[0] == '%') {
+ dsl_dir_rele(chld_dd, FTAG);
+ continue;
+ }
+
+ my_fs_cnt++; /* count this child */
+
+ dsl_dir_init_fs_ss_count(chld_dd, tx);
+
+ VERIFY0(zap_lookup(os, chld_dd->dd_object,
+ DD_FIELD_FILESYSTEM_COUNT, sizeof (count), 1, &count));
+ my_fs_cnt += count;
+ VERIFY0(zap_lookup(os, chld_dd->dd_object,
+ DD_FIELD_SNAPSHOT_COUNT, sizeof (count), 1, &count));
+ my_ss_cnt += count;
+
+ dsl_dir_rele(chld_dd, FTAG);
+ }
+ zap_cursor_fini(zc);
+ /* Count my snapshots (we counted children's snapshots above) */
+ VERIFY0(dsl_dataset_hold_obj(dd->dd_pool,
+ dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds));
+
+ for (zap_cursor_init(zc, os, dsl_dataset_phys(ds)->ds_snapnames_zapobj);
+ zap_cursor_retrieve(zc, za) == 0;
+ zap_cursor_advance(zc)) {
+ /* Don't count temporary snapshots */
+ if (za->za_name[0] != '%')
+ my_ss_cnt++;
+ }
+ zap_cursor_fini(zc);
+
+ dsl_dataset_rele(ds, FTAG);
+
+ kmem_free(zc, sizeof (zap_cursor_t));
+ kmem_free(za, sizeof (zap_attribute_t));
+
+ /* we're in a sync task, update counts */
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+ VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
+ sizeof (my_fs_cnt), 1, &my_fs_cnt, tx));
+ VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
+ sizeof (my_ss_cnt), 1, &my_ss_cnt, tx));
+}
+
+static int
+dsl_dir_actv_fs_ss_limit_check(void *arg, dmu_tx_t *tx)
+{
+ char *ddname = (char *)arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds;
+ dsl_dir_t *dd;
+ int error;
+
+ error = dsl_dataset_hold(dp, ddname, FTAG, &ds);
+ if (error != 0)
+ return (error);
+
+ if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ dd = ds->ds_dir;
+ if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT) &&
+ dsl_dir_is_zapified(dd) &&
+ zap_contains(dp->dp_meta_objset, dd->dd_object,
+ DD_FIELD_FILESYSTEM_COUNT) == 0) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(EALREADY));
+ }
+
+ dsl_dataset_rele(ds, FTAG);
+ return (0);
+}
+
+static void
+dsl_dir_actv_fs_ss_limit_sync(void *arg, dmu_tx_t *tx)
+{
+ char *ddname = (char *)arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds;
+ spa_t *spa;
+
+ VERIFY0(dsl_dataset_hold(dp, ddname, FTAG, &ds));
+
+ spa = dsl_dataset_get_spa(ds);
+
+ if (!spa_feature_is_active(spa, SPA_FEATURE_FS_SS_LIMIT)) {
+ /*
+ * Since the feature was not active and we're now setting a
+ * limit, increment the feature-active counter so that the
+ * feature becomes active for the first time.
+ *
+ * We are already in a sync task so we can update the MOS.
+ */
+ spa_feature_incr(spa, SPA_FEATURE_FS_SS_LIMIT, tx);
+ }
+
+ /*
+ * Since we are now setting a non-UINT64_MAX limit on the filesystem,
+ * we need to ensure the counts are correct. Descend down the tree from
+ * this point and update all of the counts to be accurate.
+ */
+ dsl_dir_init_fs_ss_count(ds->ds_dir, tx);
+
+ dsl_dataset_rele(ds, FTAG);
+}
+
+/*
+ * Make sure the feature is enabled and activate it if necessary.
+ * Since we're setting a limit, ensure the on-disk counts are valid.
+ * This is only called by the ioctl path when setting a limit value.
+ *
+ * We do not need to validate the new limit, since users who can change the
+ * limit are also allowed to exceed the limit.
+ */
+int
+dsl_dir_activate_fs_ss_limit(const char *ddname)
+{
+ int error;
+
+ error = dsl_sync_task(ddname, dsl_dir_actv_fs_ss_limit_check,
+ dsl_dir_actv_fs_ss_limit_sync, (void *)ddname, 0,
+ ZFS_SPACE_CHECK_RESERVED);
+
+ if (error == EALREADY)
+ error = 0;
+
+ return (error);
+}
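The comment above notes this is only reached from the ioctl path when a limit value is set; a hypothetical caller sketch (the ioctl plumbing itself is not part of this diff):

	/* hypothetical: activate counts before storing a new *_limit value */
	if (prop == ZFS_PROP_FILESYSTEM_LIMIT ||
	    prop == ZFS_PROP_SNAPSHOT_LIMIT) {
		err = dsl_dir_activate_fs_ss_limit(dsname);
		if (err != 0)
			return (err);
	}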
+
+/*
+ * Used to determine if the filesystem_limit or snapshot_limit should be
+ * enforced. We allow the limit to be exceeded if the user has permission to
+ * write the property value. We pass in the creds that we got in the open
+ * context since we will always be the GZ root in syncing context. We also have
+ * to handle the case where we are allowed to change the limit on the current
+ * dataset, but there may be another limit in the tree above.
+ *
+ * We can never modify these two properties within a non-global zone. In
+ * addition, the other checks are modeled on zfs_secpolicy_write_perms. We
+ * can't use that function since we are already holding the dp_config_rwlock.
+ * In addition, we already have the dd and dealing with snapshots is simplified
+ * in this code.
+ */
+
+typedef enum {
+ ENFORCE_ALWAYS,
+ ENFORCE_NEVER,
+ ENFORCE_ABOVE
+} enforce_res_t;
+
+static enforce_res_t
+dsl_enforce_ds_ss_limits(dsl_dir_t *dd, zfs_prop_t prop, cred_t *cr)
+{
+ enforce_res_t enforce = ENFORCE_ALWAYS;
+ uint64_t obj;
+ dsl_dataset_t *ds;
+ uint64_t zoned;
+
+ ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT ||
+ prop == ZFS_PROP_SNAPSHOT_LIMIT);
+
+#ifdef _KERNEL
+#ifdef __FreeBSD__
+ if (jailed(cr))
+#else
+ if (crgetzoneid(cr) != GLOBAL_ZONEID)
+#endif
+ return (ENFORCE_ALWAYS);
+
+ if (secpolicy_zfs(cr) == 0)
+ return (ENFORCE_NEVER);
+#endif
+
+ if ((obj = dsl_dir_phys(dd)->dd_head_dataset_obj) == 0)
+ return (ENFORCE_ALWAYS);
+
+ ASSERT(dsl_pool_config_held(dd->dd_pool));
+
+ if (dsl_dataset_hold_obj(dd->dd_pool, obj, FTAG, &ds) != 0)
+ return (ENFORCE_ALWAYS);
+
+ if (dsl_prop_get_ds(ds, "zoned", 8, 1, &zoned, NULL) || zoned) {
+ /* Only root can access zoned fs's from the GZ */
+ enforce = ENFORCE_ALWAYS;
+ } else {
+ if (dsl_deleg_access_impl(ds, zfs_prop_to_name(prop), cr) == 0)
+ enforce = ENFORCE_ABOVE;
+ }
+
+ dsl_dataset_rele(ds, FTAG);
+ return (enforce);
+}
+
+/*
+ * Check if adding additional child filesystem(s) would exceed any filesystem
+ * limits or adding additional snapshot(s) would exceed any snapshot limits.
+ * The prop argument indicates which limit to check.
+ *
+ * Note that all filesystem limits up to the root (or the highest
+ * initialized) filesystem or the given ancestor must be satisfied.
+ */
+int
+dsl_fs_ss_limit_check(dsl_dir_t *dd, uint64_t delta, zfs_prop_t prop,
+ dsl_dir_t *ancestor, cred_t *cr)
+{
+ objset_t *os = dd->dd_pool->dp_meta_objset;
+ uint64_t limit, count;
+ char *count_prop;
+ enforce_res_t enforce;
+ int err = 0;
+
+ ASSERT(dsl_pool_config_held(dd->dd_pool));
+ ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT ||
+ prop == ZFS_PROP_SNAPSHOT_LIMIT);
+
+ /*
+ * If we're allowed to change the limit, don't enforce the limit
+ * e.g. this can happen if a snapshot is taken by an administrative
+ * user in the global zone (i.e. a recursive snapshot by root).
+ * However, we must handle the case of delegated permissions where we
+ * are allowed to change the limit on the current dataset, but there
+ * is another limit in the tree above.
+ */
+ enforce = dsl_enforce_ds_ss_limits(dd, prop, cr);
+ if (enforce == ENFORCE_NEVER)
+ return (0);
+
+ /*
+ * e.g. if renaming a dataset with no snapshots, count adjustment
+ * is 0.
+ */
+ if (delta == 0)
+ return (0);
+
+ if (prop == ZFS_PROP_SNAPSHOT_LIMIT) {
+ /*
+ * We don't enforce the limit for temporary snapshots. This is
+ * indicated by a NULL cred_t argument.
+ */
+ if (cr == NULL)
+ return (0);
+
+ count_prop = DD_FIELD_SNAPSHOT_COUNT;
+ } else {
+ count_prop = DD_FIELD_FILESYSTEM_COUNT;
+ }
+
+ /*
+ * If an ancestor has been provided, stop checking the limit once we
+ * hit that dir. We need this during rename so that we don't overcount
+ * the check once we recurse up to the common ancestor.
+ */
+ if (ancestor == dd)
+ return (0);
+
+ /*
+ * If we hit an uninitialized node while recursing up the tree, we can
+ * stop since we know there is no limit here (or above). The counts are
+ * not valid on this node and we know we won't touch this node's counts.
+ */
+ if (!dsl_dir_is_zapified(dd) || zap_lookup(os, dd->dd_object,
+ count_prop, sizeof (count), 1, &count) == ENOENT)
+ return (0);
+
+ err = dsl_prop_get_dd(dd, zfs_prop_to_name(prop), 8, 1, &limit, NULL,
+ B_FALSE);
+ if (err != 0)
+ return (err);
+
+ /* Is there a limit which we've hit? */
+ if (enforce == ENFORCE_ALWAYS && (count + delta) > limit)
+ return (SET_ERROR(EDQUOT));
+
+ if (dd->dd_parent != NULL)
+ err = dsl_fs_ss_limit_check(dd->dd_parent, delta, prop,
+ ancestor, cr);
+
+ return (err);
+}
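dsl_dir_transfer_possible() below uses this check for rename; a hypothetical create-path sketch (those callers are outside this file) validates one new filesystem against the parent dir before creating it:

	/* hypothetical caller: fail the create if any limit up the tree is hit */
	err = dsl_fs_ss_limit_check(parent_dd, 1, ZFS_PROP_FILESYSTEM_LIMIT,
	    NULL, cr);
	if (err != 0)
		return (err);	/* EDQUOT when a limit would be exceeded */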
+
+/*
+ * Adjust the filesystem or snapshot count for the specified dsl_dir_t and all
+ * parents. When a new filesystem/snapshot is created, increment the count on
+ * all parents, and when a filesystem/snapshot is destroyed, decrement the
+ * count.
+ */
+void
+dsl_fs_ss_count_adjust(dsl_dir_t *dd, int64_t delta, const char *prop,
+ dmu_tx_t *tx)
+{
+ int err;
+ objset_t *os = dd->dd_pool->dp_meta_objset;
+ uint64_t count;
+
+ ASSERT(dsl_pool_config_held(dd->dd_pool));
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0 ||
+ strcmp(prop, DD_FIELD_SNAPSHOT_COUNT) == 0);
+
+ /*
+ * When we receive an incremental stream into a filesystem that already
+ * exists, a temporary clone is created. We don't count this temporary
+ * clone, whose name begins with a '%'. We also ignore hidden ($FREE,
+ * $MOS & $ORIGIN) objsets.
+ */
+ if ((dd->dd_myname[0] == '%' || dd->dd_myname[0] == '$') &&
+ strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0)
+ return;
+
+ /*
+ * e.g. if renaming a dataset with no snapshots, count adjustment is 0
+ */
+ if (delta == 0)
+ return;
+
+ /*
+ * If we hit an uninitialized node while recursing up the tree, we can
+ * stop since we know the counts are not valid on this node and we
+ * know we shouldn't touch this node's counts. An uninitialized count
+ * on the node indicates that either the feature has not yet been
+ * activated or there are no limits on this part of the tree.
+ */
+ if (!dsl_dir_is_zapified(dd) || (err = zap_lookup(os, dd->dd_object,
+ prop, sizeof (count), 1, &count)) == ENOENT)
+ return;
+ VERIFY0(err);
+
+ count += delta;
+ /* Use a signed verify to make sure we're not neg. */
+ VERIFY3S(count, >=, 0);
+
+ VERIFY0(zap_update(os, dd->dd_object, prop, sizeof (count), 1, &count,
+ tx));
+
+ /* Roll up this additional count into our ancestors */
+ if (dd->dd_parent != NULL)
+ dsl_fs_ss_count_adjust(dd->dd_parent, delta, prop, tx);
+}
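dsl_dir_create_sync() below shows the +1 side of this; the destroy path (not in this hunk) is assumed to mirror it with a negative delta, e.g.:

	/* hypothetical destroy-side counterpart of the +1 in dsl_dir_create_sync() */
	dsl_fs_ss_count_adjust(dd->dd_parent, -1, DD_FIELD_FILESYSTEM_COUNT, tx);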
+
uint64_t
dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name,
dmu_tx_t *tx)
@@ -395,7 +896,7 @@
ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0,
DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx);
if (pds) {
- VERIFY(0 == zap_add(mos, pds->dd_phys->dd_child_dir_zapobj,
+ VERIFY(0 == zap_add(mos, dsl_dir_phys(pds)->dd_child_dir_zapobj,
name, sizeof (uint64_t), 1, &ddobj, tx));
} else {
/* it's the root dir */
@@ -407,8 +908,12 @@
ddphys = dbuf->db_data;
ddphys->dd_creation_time = gethrestime_sec();
- if (pds)
+ if (pds) {
ddphys->dd_parent_obj = pds->dd_object;
+
+ /* update the filesystem counts */
+ dsl_fs_ss_count_adjust(pds, 1, DD_FIELD_FILESYSTEM_COUNT, tx);
+ }
ddphys->dd_props_zapobj = zap_create(mos,
DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
ddphys->dd_child_dir_zapobj = zap_create(mos,
@@ -423,9 +928,9 @@
boolean_t
dsl_dir_is_clone(dsl_dir_t *dd)
{
- return (dd->dd_phys->dd_origin_obj &&
+ return (dsl_dir_phys(dd)->dd_origin_obj &&
(dd->dd_pool->dp_origin_snap == NULL ||
- dd->dd_phys->dd_origin_obj !=
+ dsl_dir_phys(dd)->dd_origin_obj !=
dd->dd_pool->dp_origin_snap->ds_object));
}
@@ -434,35 +939,52 @@
{
mutex_enter(&dd->dd_lock);
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
- dd->dd_phys->dd_used_bytes);
- dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA, dd->dd_phys->dd_quota);
+ dsl_dir_phys(dd)->dd_used_bytes);
+ dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA,
+ dsl_dir_phys(dd)->dd_quota);
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION,
- dd->dd_phys->dd_reserved);
+ dsl_dir_phys(dd)->dd_reserved);
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO,
- dd->dd_phys->dd_compressed_bytes == 0 ? 100 :
- (dd->dd_phys->dd_uncompressed_bytes * 100 /
- dd->dd_phys->dd_compressed_bytes));
+ dsl_dir_phys(dd)->dd_compressed_bytes == 0 ? 100 :
+ (dsl_dir_phys(dd)->dd_uncompressed_bytes * 100 /
+ dsl_dir_phys(dd)->dd_compressed_bytes));
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALUSED,
- dd->dd_phys->dd_uncompressed_bytes);
- if (dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
+ dsl_dir_phys(dd)->dd_uncompressed_bytes);
+ if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) {
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDSNAP,
- dd->dd_phys->dd_used_breakdown[DD_USED_SNAP]);
+ dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP]);
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDDS,
- dd->dd_phys->dd_used_breakdown[DD_USED_HEAD]);
+ dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_HEAD]);
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDREFRESERV,
- dd->dd_phys->dd_used_breakdown[DD_USED_REFRSRV]);
+ dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_REFRSRV]);
dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDCHILD,
- dd->dd_phys->dd_used_breakdown[DD_USED_CHILD] +
- dd->dd_phys->dd_used_breakdown[DD_USED_CHILD_RSRV]);
+ dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD] +
+ dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD_RSRV]);
}
mutex_exit(&dd->dd_lock);
+ if (dsl_dir_is_zapified(dd)) {
+ uint64_t count;
+ objset_t *os = dd->dd_pool->dp_meta_objset;
+
+ if (zap_lookup(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT,
+ sizeof (count), 1, &count) == 0) {
+ dsl_prop_nvlist_add_uint64(nv,
+ ZFS_PROP_FILESYSTEM_COUNT, count);
+ }
+ if (zap_lookup(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT,
+ sizeof (count), 1, &count) == 0) {
+ dsl_prop_nvlist_add_uint64(nv,
+ ZFS_PROP_SNAPSHOT_COUNT, count);
+ }
+ }
+
if (dsl_dir_is_clone(dd)) {
dsl_dataset_t *ds;
- char buf[MAXNAMELEN];
+ char buf[ZFS_MAX_DATASET_NAME_LEN];
VERIFY0(dsl_dataset_hold_obj(dd->dd_pool,
- dd->dd_phys->dd_origin_obj, FTAG, &ds));
+ dsl_dir_phys(dd)->dd_origin_obj, FTAG, &ds));
dsl_dataset_name(ds, buf);
dsl_dataset_rele(ds, FTAG);
dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf);
@@ -474,7 +996,7 @@
{
dsl_pool_t *dp = dd->dd_pool;
- ASSERT(dd->dd_phys);
+ ASSERT(dsl_dir_phys(dd));
if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg)) {
/* up the hold count until we can be written out */
@@ -485,8 +1007,9 @@
static int64_t
parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta)
{
- uint64_t old_accounted = MAX(used, dd->dd_phys->dd_reserved);
- uint64_t new_accounted = MAX(used + delta, dd->dd_phys->dd_reserved);
+ uint64_t old_accounted = MAX(used, dsl_dir_phys(dd)->dd_reserved);
+ uint64_t new_accounted =
+ MAX(used + delta, dsl_dir_phys(dd)->dd_reserved);
return (new_accounted - old_accounted);
}
@@ -545,9 +1068,9 @@
}
mutex_enter(&dd->dd_lock);
- if (dd->dd_phys->dd_quota != 0)
- quota = dd->dd_phys->dd_quota;
- used = dd->dd_phys->dd_used_bytes;
+ if (dsl_dir_phys(dd)->dd_quota != 0)
+ quota = dsl_dir_phys(dd)->dd_quota;
+ used = dsl_dir_phys(dd)->dd_used_bytes;
if (!ondiskonly)
used += dsl_dir_space_towrite(dd);
@@ -556,12 +1079,12 @@
quota = MIN(quota, poolsize);
}
- if (dd->dd_phys->dd_reserved > used && parentspace != UINT64_MAX) {
+ if (dsl_dir_phys(dd)->dd_reserved > used && parentspace != UINT64_MAX) {
/*
* We have some space reserved, in addition to what our
* parent gave us.
*/
- parentspace += dd->dd_phys->dd_reserved - used;
+ parentspace += dsl_dir_phys(dd)->dd_reserved - used;
}
if (dd == ancestor) {
@@ -590,7 +1113,6 @@
struct tempreserve {
list_node_t tr_node;
- dsl_pool_t *tr_dp;
dsl_dir_t *tr_ds;
uint64_t tr_size;
};
@@ -621,7 +1143,7 @@
est_inflight = dsl_dir_space_towrite(dd);
for (i = 0; i < TXG_SIZE; i++)
est_inflight += dd->dd_tempreserved[i];
- used_on_disk = dd->dd_phys->dd_used_bytes;
+ used_on_disk = dsl_dir_phys(dd)->dd_used_bytes;
/*
* On the first iteration, fetch the dataset's used-on-disk and
@@ -644,10 +1166,10 @@
* If this transaction will result in a net free of space,
* we want to let it through.
*/
- if (ignorequota || netfree || dd->dd_phys->dd_quota == 0)
+ if (ignorequota || netfree || dsl_dir_phys(dd)->dd_quota == 0)
quota = UINT64_MAX;
else
- quota = dd->dd_phys->dd_quota;
+ quota = dsl_dir_phys(dd)->dd_quota;
/*
* Adjust the quota against the actual pool size at the root
@@ -701,7 +1223,7 @@
/* see if it's OK with our parent */
if (dd->dd_parent && parent_rsrv) {
- boolean_t ismos = (dd->dd_phys->dd_head_dataset_obj == 0);
+ boolean_t ismos = (dsl_dir_phys(dd)->dd_head_dataset_obj == 0);
return (dsl_dir_tempreserve_impl(dd->dd_parent,
parent_rsrv, netfree, ismos, TRUE, tr_list, tx, FALSE));
@@ -741,24 +1263,24 @@
tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
tr->tr_size = lsize;
list_insert_tail(tr_list, tr);
-
- err = dsl_pool_tempreserve_space(dd->dd_pool, asize, tx);
} else {
if (err == EAGAIN) {
- txg_delay(dd->dd_pool, tx->tx_txg, 1);
+ /*
+ * If arc_memory_throttle() detected that pageout
+ * is running and we are low on memory, we delay new
+ * non-pageout transactions to give pageout an
+ * advantage.
+ *
+ * It is unfortunate to be delaying while the caller's
+ * locks are held.
+ */
+ txg_delay(dd->dd_pool, tx->tx_txg,
+ MSEC2NSEC(10), MSEC2NSEC(10));
err = SET_ERROR(ERESTART);
}
- dsl_pool_memory_pressure(dd->dd_pool);
}
if (err == 0) {
- struct tempreserve *tr;
-
- tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP);
- tr->tr_dp = dd->dd_pool;
- tr->tr_size = asize;
- list_insert_tail(tr_list, tr);
-
err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize,
FALSE, asize > usize, tr_list, tx, TRUE);
}
@@ -787,10 +1309,8 @@
if (tr_cookie == NULL)
return;
- while (tr = list_head(tr_list)) {
- if (tr->tr_dp) {
- dsl_pool_tempreserve_clear(tr->tr_dp, tr->tr_size, tx);
- } else if (tr->tr_ds) {
+ while ((tr = list_head(tr_list)) != NULL) {
+ if (tr->tr_ds) {
mutex_enter(&tr->tr_ds->dd_lock);
ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=,
tr->tr_size);
@@ -806,8 +1326,14 @@
kmem_free(tr_list, sizeof (list_t));
}
-static void
-dsl_dir_willuse_space_impl(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
+/*
+ * This should be called from open context when we think we're going to write
+ * or free space, for example when dirtying data. Be conservative; it's okay
+ * to write less space or free more, but we don't want to write more or free
+ * less than the amount specified.
+ */
+void
+dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
{
int64_t parent_space;
uint64_t est_used;
@@ -816,7 +1342,7 @@
if (space > 0)
dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space;
- est_used = dsl_dir_space_towrite(dd) + dd->dd_phys->dd_used_bytes;
+ est_used = dsl_dir_space_towrite(dd) + dsl_dir_phys(dd)->dd_used_bytes;
parent_space = parent_delta(dd, est_used, space);
mutex_exit(&dd->dd_lock);
@@ -825,21 +1351,9 @@
/* XXX this is potentially expensive and unnecessary... */
if (parent_space && dd->dd_parent)
- dsl_dir_willuse_space_impl(dd->dd_parent, parent_space, tx);
+ dsl_dir_willuse_space(dd->dd_parent, parent_space, tx);
}
-/*
- * Call in open context when we think we're going to write/free space,
- * eg. when dirtying data. Be conservative (ie. OK to write less than
- * this or free more than this, but don't write more or free less).
- */
-void
-dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx)
-{
- dsl_pool_willuse_space(dd->dd_pool, space, tx);
- dsl_dir_willuse_space_impl(dd, space, tx);
-}
-
/* call from syncing context when we actually write/free space for this dd */
void
dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type,
@@ -846,34 +1360,44 @@
int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx)
{
int64_t accounted_delta;
+
+ /*
+ * dsl_dataset_set_refreservation_sync_impl() calls this with
+ * dd_lock held, so that it can atomically update
+ * ds->ds_reserved and the dsl_dir accounting, so that
+ * dsl_dataset_check_quota() can see dataset and dir accounting
+ * consistently.
+ */
boolean_t needlock = !MUTEX_HELD(&dd->dd_lock);
ASSERT(dmu_tx_is_syncing(tx));
ASSERT(type < DD_USED_NUM);
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+
if (needlock)
mutex_enter(&dd->dd_lock);
- accounted_delta = parent_delta(dd, dd->dd_phys->dd_used_bytes, used);
- ASSERT(used >= 0 || dd->dd_phys->dd_used_bytes >= -used);
+ accounted_delta =
+ parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, used);
+ ASSERT(used >= 0 || dsl_dir_phys(dd)->dd_used_bytes >= -used);
ASSERT(compressed >= 0 ||
- dd->dd_phys->dd_compressed_bytes >= -compressed);
+ dsl_dir_phys(dd)->dd_compressed_bytes >= -compressed);
ASSERT(uncompressed >= 0 ||
- dd->dd_phys->dd_uncompressed_bytes >= -uncompressed);
- dmu_buf_will_dirty(dd->dd_dbuf, tx);
- dd->dd_phys->dd_used_bytes += used;
- dd->dd_phys->dd_uncompressed_bytes += uncompressed;
- dd->dd_phys->dd_compressed_bytes += compressed;
+ dsl_dir_phys(dd)->dd_uncompressed_bytes >= -uncompressed);
+ dsl_dir_phys(dd)->dd_used_bytes += used;
+ dsl_dir_phys(dd)->dd_uncompressed_bytes += uncompressed;
+ dsl_dir_phys(dd)->dd_compressed_bytes += compressed;
- if (dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) {
+ if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) {
ASSERT(used > 0 ||
- dd->dd_phys->dd_used_breakdown[type] >= -used);
- dd->dd_phys->dd_used_breakdown[type] += used;
+ dsl_dir_phys(dd)->dd_used_breakdown[type] >= -used);
+ dsl_dir_phys(dd)->dd_used_breakdown[type] += used;
#ifdef DEBUG
dd_used_t t;
uint64_t u = 0;
for (t = 0; t < DD_USED_NUM; t++)
- u += dd->dd_phys->dd_used_breakdown[t];
- ASSERT3U(u, ==, dd->dd_phys->dd_used_bytes);
+ u += dsl_dir_phys(dd)->dd_used_breakdown[t];
+ ASSERT3U(u, ==, dsl_dir_phys(dd)->dd_used_bytes);
#endif
}
if (needlock)
@@ -884,7 +1408,7 @@
accounted_delta, compressed, uncompressed, tx);
dsl_dir_transfer_space(dd->dd_parent,
used - accounted_delta,
- DD_USED_CHILD_RSRV, DD_USED_CHILD, tx);
+ DD_USED_CHILD_RSRV, DD_USED_CHILD, NULL);
}
}
@@ -892,26 +1416,24 @@
dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta,
dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx)
{
- boolean_t needlock = !MUTEX_HELD(&dd->dd_lock);
-
- ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT(tx == NULL || dmu_tx_is_syncing(tx));
ASSERT(oldtype < DD_USED_NUM);
ASSERT(newtype < DD_USED_NUM);
- if (delta == 0 || !(dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN))
+ if (delta == 0 ||
+ !(dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN))
return;
- if (needlock)
- mutex_enter(&dd->dd_lock);
+ if (tx != NULL)
+ dmu_buf_will_dirty(dd->dd_dbuf, tx);
+ mutex_enter(&dd->dd_lock);
ASSERT(delta > 0 ?
- dd->dd_phys->dd_used_breakdown[oldtype] >= delta :
- dd->dd_phys->dd_used_breakdown[newtype] >= -delta);
- ASSERT(dd->dd_phys->dd_used_bytes >= ABS(delta));
- dmu_buf_will_dirty(dd->dd_dbuf, tx);
- dd->dd_phys->dd_used_breakdown[oldtype] -= delta;
- dd->dd_phys->dd_used_breakdown[newtype] += delta;
- if (needlock)
- mutex_exit(&dd->dd_lock);
+ dsl_dir_phys(dd)->dd_used_breakdown[oldtype] >= delta :
+ dsl_dir_phys(dd)->dd_used_breakdown[newtype] >= -delta);
+ ASSERT(dsl_dir_phys(dd)->dd_used_bytes >= ABS(delta));
+ dsl_dir_phys(dd)->dd_used_breakdown[oldtype] -= delta;
+ dsl_dir_phys(dd)->dd_used_breakdown[newtype] += delta;
+ mutex_exit(&dd->dd_lock);
}
typedef struct dsl_dir_set_qr_arg {
@@ -954,8 +1476,8 @@
*/
towrite = dsl_dir_space_towrite(ds->ds_dir);
if ((dmu_tx_is_syncing(tx) || towrite == 0) &&
- (newval < ds->ds_dir->dd_phys->dd_reserved ||
- newval < ds->ds_dir->dd_phys->dd_used_bytes + towrite)) {
+ (newval < dsl_dir_phys(ds->ds_dir)->dd_reserved ||
+ newval < dsl_dir_phys(ds->ds_dir)->dd_used_bytes + towrite)) {
error = SET_ERROR(ENOSPC);
}
mutex_exit(&ds->ds_dir->dd_lock);
@@ -988,7 +1510,7 @@
dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
mutex_enter(&ds->ds_dir->dd_lock);
- ds->ds_dir->dd_phys->dd_quota = newval;
+ dsl_dir_phys(ds->ds_dir)->dd_quota = newval;
mutex_exit(&ds->ds_dir->dd_lock);
dsl_dataset_rele(ds, FTAG);
}
@@ -1003,7 +1525,7 @@
ddsqra.ddsqra_value = quota;
return (dsl_sync_task(ddname, dsl_dir_set_quota_check,
- dsl_dir_set_quota_sync, &ddsqra, 0));
+ dsl_dir_set_quota_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE));
}
int
@@ -1039,7 +1561,7 @@
}
mutex_enter(&dd->dd_lock);
- used = dd->dd_phys->dd_used_bytes;
+ used = dsl_dir_phys(dd)->dd_used_bytes;
mutex_exit(&dd->dd_lock);
if (dd->dd_parent) {
@@ -1049,13 +1571,13 @@
avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used;
}
- if (MAX(used, newval) > MAX(used, dd->dd_phys->dd_reserved)) {
+ if (MAX(used, newval) > MAX(used, dsl_dir_phys(dd)->dd_reserved)) {
uint64_t delta = MAX(used, newval) -
- MAX(used, dd->dd_phys->dd_reserved);
+ MAX(used, dsl_dir_phys(dd)->dd_reserved);
if (delta > avail ||
- (dd->dd_phys->dd_quota > 0 &&
- newval > dd->dd_phys->dd_quota))
+ (dsl_dir_phys(dd)->dd_quota > 0 &&
+ newval > dsl_dir_phys(dd)->dd_quota))
error = SET_ERROR(ENOSPC);
}
@@ -1072,9 +1594,9 @@
dmu_buf_will_dirty(dd->dd_dbuf, tx);
mutex_enter(&dd->dd_lock);
- used = dd->dd_phys->dd_used_bytes;
- delta = MAX(used, value) - MAX(used, dd->dd_phys->dd_reserved);
- dd->dd_phys->dd_reserved = value;
+ used = dsl_dir_phys(dd)->dd_used_bytes;
+ delta = MAX(used, value) - MAX(used, dsl_dir_phys(dd)->dd_reserved);
+ dsl_dir_phys(dd)->dd_reserved = value;
if (dd->dd_parent != NULL) {
/* Roll up this additional usage into our ancestors */
@@ -1124,7 +1646,7 @@
ddsqra.ddsqra_value = reservation;
return (dsl_sync_task(ddname, dsl_dir_set_reservation_check,
- dsl_dir_set_reservation_sync, &ddsqra, 0));
+ dsl_dir_set_reservation_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE));
}
static dsl_dir_t *
@@ -1151,7 +1673,7 @@
return (delta);
mutex_enter(&dd->dd_lock);
- delta = parent_delta(dd, dd->dd_phys->dd_used_bytes, delta);
+ delta = parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, delta);
mutex_exit(&dd->dd_lock);
return (would_change(dd->dd_parent, delta, ancestor));
}
@@ -1159,6 +1681,7 @@
typedef struct dsl_dir_rename_arg {
const char *ddra_oldname;
const char *ddra_newname;
+ cred_t *ddra_cred;
} dsl_dir_rename_arg_t;
/* ARGSUSED */
@@ -1166,11 +1689,11 @@
dsl_valid_rename(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
{
int *deltap = arg;
- char namebuf[MAXNAMELEN];
+ char namebuf[ZFS_MAX_DATASET_NAME_LEN];
dsl_dataset_name(ds, namebuf);
- if (strlen(namebuf) + *deltap >= MAXNAMELEN)
+ if (strlen(namebuf) + *deltap >= ZFS_MAX_DATASET_NAME_LEN)
return (SET_ERROR(ENAMETOOLONG));
return (0);
}
@@ -1202,7 +1725,7 @@
if (dd->dd_pool != newparent->dd_pool) {
dsl_dir_rele(newparent, FTAG);
dsl_dir_rele(dd, FTAG);
- return (SET_ERROR(ENXIO));
+ return (SET_ERROR(EXDEV));
}
/* new name should not already exist */
@@ -1223,11 +1746,58 @@
}
}
+ if (dmu_tx_is_syncing(tx)) {
+ if (spa_feature_is_active(dp->dp_spa,
+ SPA_FEATURE_FS_SS_LIMIT)) {
+ /*
+ * Although this is the check function and we don't
+ * normally make on-disk changes in check functions,
+ * we need to do that here.
+ *
+ * Ensure this portion of the tree's counts have been
+ * initialized in case the new parent has limits set.
+ */
+ dsl_dir_init_fs_ss_count(dd, tx);
+ }
+ }
+
if (newparent != dd->dd_parent) {
/* is there enough space? */
uint64_t myspace =
- MAX(dd->dd_phys->dd_used_bytes, dd->dd_phys->dd_reserved);
+ MAX(dsl_dir_phys(dd)->dd_used_bytes,
+ dsl_dir_phys(dd)->dd_reserved);
+ objset_t *os = dd->dd_pool->dp_meta_objset;
+ uint64_t fs_cnt = 0;
+ uint64_t ss_cnt = 0;
+ if (dsl_dir_is_zapified(dd)) {
+ int err;
+
+ err = zap_lookup(os, dd->dd_object,
+ DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1,
+ &fs_cnt);
+ if (err != ENOENT && err != 0) {
+ dsl_dir_rele(newparent, FTAG);
+ dsl_dir_rele(dd, FTAG);
+ return (err);
+ }
+
+ /*
+ * have to add 1 for the filesystem itself that we're
+ * moving
+ */
+ fs_cnt++;
+
+ err = zap_lookup(os, dd->dd_object,
+ DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1,
+ &ss_cnt);
+ if (err != ENOENT && err != 0) {
+ dsl_dir_rele(newparent, FTAG);
+ dsl_dir_rele(dd, FTAG);
+ return (err);
+ }
+ }
+
/* no rename into our descendant */
if (closest_common_ancestor(dd, newparent) == dd) {
dsl_dir_rele(newparent, FTAG);
@@ -1236,7 +1806,7 @@
}
error = dsl_dir_transfer_possible(dd->dd_parent,
- newparent, myspace);
+ newparent, fs_cnt, ss_cnt, myspace, ddra->ddra_cred);
if (error != 0) {
dsl_dir_rele(newparent, FTAG);
dsl_dir_rele(dd, FTAG);
@@ -1268,18 +1838,50 @@
"-> %s", ddra->ddra_newname);
if (newparent != dd->dd_parent) {
+ objset_t *os = dd->dd_pool->dp_meta_objset;
+ uint64_t fs_cnt = 0;
+ uint64_t ss_cnt = 0;
+
+ /*
+ * We already made sure the dd counts were initialized in the
+ * check function.
+ */
+ if (spa_feature_is_active(dp->dp_spa,
+ SPA_FEATURE_FS_SS_LIMIT)) {
+ VERIFY0(zap_lookup(os, dd->dd_object,
+ DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1,
+ &fs_cnt));
+ /* add 1 for the filesystem itself that we're moving */
+ fs_cnt++;
+
+ VERIFY0(zap_lookup(os, dd->dd_object,
+ DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1,
+ &ss_cnt));
+ }
+
+ dsl_fs_ss_count_adjust(dd->dd_parent, -fs_cnt,
+ DD_FIELD_FILESYSTEM_COUNT, tx);
+ dsl_fs_ss_count_adjust(newparent, fs_cnt,
+ DD_FIELD_FILESYSTEM_COUNT, tx);
+
+ dsl_fs_ss_count_adjust(dd->dd_parent, -ss_cnt,
+ DD_FIELD_SNAPSHOT_COUNT, tx);
+ dsl_fs_ss_count_adjust(newparent, ss_cnt,
+ DD_FIELD_SNAPSHOT_COUNT, tx);
+
dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD,
- -dd->dd_phys->dd_used_bytes,
- -dd->dd_phys->dd_compressed_bytes,
- -dd->dd_phys->dd_uncompressed_bytes, tx);
+ -dsl_dir_phys(dd)->dd_used_bytes,
+ -dsl_dir_phys(dd)->dd_compressed_bytes,
+ -dsl_dir_phys(dd)->dd_uncompressed_bytes, tx);
dsl_dir_diduse_space(newparent, DD_USED_CHILD,
- dd->dd_phys->dd_used_bytes,
- dd->dd_phys->dd_compressed_bytes,
- dd->dd_phys->dd_uncompressed_bytes, tx);
+ dsl_dir_phys(dd)->dd_used_bytes,
+ dsl_dir_phys(dd)->dd_compressed_bytes,
+ dsl_dir_phys(dd)->dd_uncompressed_bytes, tx);
- if (dd->dd_phys->dd_reserved > dd->dd_phys->dd_used_bytes) {
- uint64_t unused_rsrv = dd->dd_phys->dd_reserved -
- dd->dd_phys->dd_used_bytes;
+ if (dsl_dir_phys(dd)->dd_reserved >
+ dsl_dir_phys(dd)->dd_used_bytes) {
+ uint64_t unused_rsrv = dsl_dir_phys(dd)->dd_reserved -
+ dsl_dir_phys(dd)->dd_used_bytes;
dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV,
-unused_rsrv, 0, 0, tx);
@@ -1291,18 +1893,19 @@
dmu_buf_will_dirty(dd->dd_dbuf, tx);
/* remove from old parent zapobj */
- error = zap_remove(mos, dd->dd_parent->dd_phys->dd_child_dir_zapobj,
+ error = zap_remove(mos,
+ dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj,
dd->dd_myname, tx);
ASSERT0(error);
(void) strcpy(dd->dd_myname, mynewname);
dsl_dir_rele(dd->dd_parent, dd);
- dd->dd_phys->dd_parent_obj = newparent->dd_object;
+ dsl_dir_phys(dd)->dd_parent_obj = newparent->dd_object;
VERIFY0(dsl_dir_hold_obj(dp,
newparent->dd_object, NULL, dd, &dd->dd_parent));
/* add to new parent zapobj */
- VERIFY0(zap_add(mos, newparent->dd_phys->dd_child_dir_zapobj,
+ VERIFY0(zap_add(mos, dsl_dir_phys(newparent)->dd_child_dir_zapobj,
dd->dd_myname, 8, 1, &dd->dd_object, tx));
#ifdef __FreeBSD__
@@ -1325,17 +1928,21 @@
ddra.ddra_oldname = oldname;
ddra.ddra_newname = newname;
+ ddra.ddra_cred = CRED();
return (dsl_sync_task(oldname,
- dsl_dir_rename_check, dsl_dir_rename_sync, &ddra, 3));
+ dsl_dir_rename_check, dsl_dir_rename_sync, &ddra,
+ 3, ZFS_SPACE_CHECK_RESERVED));
}
int
-dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space)
+dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd,
+ uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space, cred_t *cr)
{
dsl_dir_t *ancestor;
int64_t adelta;
uint64_t avail;
+ int err;
ancestor = closest_common_ancestor(sdd, tdd);
adelta = would_change(sdd, -space, ancestor);
@@ -1343,6 +1950,15 @@
if (avail < space)
return (SET_ERROR(ENOSPC));
+ err = dsl_fs_ss_limit_check(tdd, fs_cnt, ZFS_PROP_FILESYSTEM_LIMIT,
+ ancestor, cr);
+ if (err != 0)
+ return (err);
+ err = dsl_fs_ss_limit_check(tdd, ss_cnt, ZFS_PROP_SNAPSHOT_LIMIT,
+ ancestor, cr);
+ if (err != 0)
+ return (err);
+
return (0);
}
@@ -1368,3 +1984,19 @@
dd->dd_snap_cmtime = t;
mutex_exit(&dd->dd_lock);
}
+
+void
+dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx)
+{
+ objset_t *mos = dd->dd_pool->dp_meta_objset;
+ dmu_object_zapify(mos, dd->dd_object, DMU_OT_DSL_DIR, tx);
+}
+
+boolean_t
+dsl_dir_is_zapified(dsl_dir_t *dd)
+{
+ dmu_object_info_t doi;
+
+ dmu_object_info_from_db(dd->dd_dbuf, &doi);
+ return (doi.doi_type == DMU_OTN_ZAP_METADATA);
+}
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,8 +21,10 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
* Copyright (c) 2013 Steven Hartland. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
*/
#include <sys/dsl_pool.h>
@@ -47,45 +49,189 @@
#include <sys/zil_impl.h>
#include <sys/dsl_userhold.h>
-int zfs_no_write_throttle = 0;
-int zfs_write_limit_shift = 3; /* 1/8th of physical memory */
-int zfs_txg_synctime_ms = 1000; /* target millisecs to sync a txg */
+#ifdef __FreeBSD__
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#endif
-uint64_t zfs_write_limit_min = 32 << 20; /* min write limit is 32MB */
-uint64_t zfs_write_limit_max = 0; /* max data payload per txg */
-uint64_t zfs_write_limit_inflated = 0;
-uint64_t zfs_write_limit_override = 0;
+/*
+ * ZFS Write Throttle
+ * ------------------
+ *
+ * ZFS must limit the rate of incoming writes to the rate at which it is able
+ * to sync data modifications to the backend storage. Throttling by too much
+ * creates an artificial limit; throttling by too little can only be sustained
+ * for short periods and would lead to highly lumpy performance. On a per-pool
+ * basis, ZFS tracks the amount of modified (dirty) data. As operations change
+ * data, the amount of dirty data increases; as ZFS syncs out data, the amount
+ * of dirty data decreases. When the amount of dirty data exceeds a
+ * predetermined threshold further modifications are blocked until the amount
+ * of dirty data decreases (as data is synced out).
+ *
+ * The limit on dirty data is tunable, and should be adjusted according to
+ * both the IO capacity and available memory of the system. The larger the
+ * window, the more ZFS is able to aggregate and amortize metadata (and data)
+ * changes. However, memory is a limited resource, and allowing for more dirty
+ * data comes at the cost of keeping other useful data in memory (for example
+ * ZFS data cached by the ARC).
+ *
+ * Implementation
+ *
+ * As buffers are modified dsl_pool_willuse_space() increments both the per-
+ * txg (dp_dirty_pertxg[]) and poolwide (dp_dirty_total) accounting of
+ * dirty space used; dsl_pool_dirty_space() decrements those values as data
+ * is synced out from dsl_pool_sync(). While only the poolwide value is
+ * relevant, the per-txg value is useful for debugging. The tunable
+ * zfs_dirty_data_max determines the dirty space limit. Once that value is
+ * exceeded, new writes are halted until space frees up.
+ *
+ * The zfs_dirty_data_sync tunable dictates the threshold at which we
+ * ensure that there is a txg syncing (see the comment in txg.c for a full
+ * description of transaction group stages).
+ *
+ * The IO scheduler uses both the dirty space limit and current amount of
+ * dirty data as inputs. Those values affect the number of concurrent IOs ZFS
+ * issues. See the comment in vdev_queue.c for details of the IO scheduler.
+ *
+ * The delay is also calculated based on the amount of dirty data. See the
+ * comment above dmu_tx_delay() for details.
+ */
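As a worked example of the defaults defined below: with 16 GiB of RAM, zfs_dirty_data_max_percent = 10 yields a dirty-data limit of about 1.6 GiB (well under the 4 GiB zfs_dirty_data_max_max cap); a txg sync is forced once roughly 64 MiB (zfs_dirty_data_sync) is dirty, and dmu_tx_delay() begins delaying writers once dirty data passes 60% of the limit (zfs_delay_min_dirty_percent), i.e. around 983 MiB in this example.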
-kmutex_t zfs_write_limit_lock;
+/*
+ * zfs_dirty_data_max will be set to zfs_dirty_data_max_percent% of all memory,
+ * capped at zfs_dirty_data_max_max. It can also be overridden in /etc/system.
+ */
+uint64_t zfs_dirty_data_max;
+uint64_t zfs_dirty_data_max_max = 4ULL * 1024 * 1024 * 1024;
+int zfs_dirty_data_max_percent = 10;
-static pgcnt_t old_physmem = 0;
+/*
+ * If there is at least this much dirty data, push out a txg.
+ */
+uint64_t zfs_dirty_data_sync = 64 * 1024 * 1024;
+/*
+ * Once there is this amount of dirty data, the dmu_tx_delay() will kick in
+ * and delay each transaction.
+ * This value should be >= zfs_vdev_async_write_active_max_dirty_percent.
+ */
+int zfs_delay_min_dirty_percent = 60;
+
+/*
+ * This controls how quickly the delay approaches infinity.
+ * Larger values cause it to delay more for a given amount of dirty data.
+ * Therefore larger values will cause there to be less dirty data for a
+ * given throughput.
+ *
+ * For the smoothest delay, this value should be about 1 billion divided
+ * by the maximum number of operations per second. This will smoothly
+ * handle between 10x and 1/10th this number.
+ *
+ * Note: zfs_delay_scale * zfs_dirty_data_max must be < 2^64, due to the
+ * multiply in dmu_tx_delay().
+ */
+uint64_t zfs_delay_scale = 1000 * 1000 * 1000 / 2000;
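Worked example: the default above is 1000 * 1000 * 1000 / 2000 = 500,000, i.e. tuned for roughly 2,000 operations per second. The curve itself lives in dmu_tx_delay() (not in this hunk); it is assumed to have the shape described in that function's comment, roughly:

	/* assumed shape of the dmu_tx_delay() curve referenced above */
	delay = zfs_delay_scale * (dirty - delay_min_bytes) /
	    (zfs_dirty_data_max - dirty);

where delay_min_bytes stands for zfs_delay_min_dirty_percent percent of zfs_dirty_data_max.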
+
+
+#ifdef __FreeBSD__
+
+extern int zfs_vdev_async_write_active_max_dirty_percent;
+
SYSCTL_DECL(_vfs_zfs);
-TUNABLE_INT("vfs.zfs.no_write_throttle", &zfs_no_write_throttle);
-SYSCTL_INT(_vfs_zfs, OID_AUTO, no_write_throttle, CTLFLAG_RDTUN,
- &zfs_no_write_throttle, 0, "");
-TUNABLE_INT("vfs.zfs.write_limit_shift", &zfs_write_limit_shift);
-SYSCTL_INT(_vfs_zfs, OID_AUTO, write_limit_shift, CTLFLAG_RDTUN,
- &zfs_write_limit_shift, 0, "2^N of physical memory");
-SYSCTL_DECL(_vfs_zfs_txg);
-TUNABLE_INT("vfs.zfs.txg.synctime_ms", &zfs_txg_synctime_ms);
-SYSCTL_INT(_vfs_zfs_txg, OID_AUTO, synctime_ms, CTLFLAG_RDTUN,
- &zfs_txg_synctime_ms, 0, "Target milliseconds to sync a txg");
-TUNABLE_QUAD("vfs.zfs.write_limit_min", &zfs_write_limit_min);
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, write_limit_min, CTLFLAG_RDTUN,
- &zfs_write_limit_min, 0, "Minimum write limit");
-TUNABLE_QUAD("vfs.zfs.write_limit_max", &zfs_write_limit_max);
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, write_limit_max, CTLFLAG_RDTUN,
- &zfs_write_limit_max, 0, "Maximum data payload per txg");
-TUNABLE_QUAD("vfs.zfs.write_limit_inflated", &zfs_write_limit_inflated);
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, write_limit_inflated, CTLFLAG_RDTUN,
- &zfs_write_limit_inflated, 0, "Maximum size of the dynamic write limit");
-TUNABLE_QUAD("vfs.zfs.write_limit_override", &zfs_write_limit_override);
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, write_limit_override, CTLFLAG_RDTUN,
- &zfs_write_limit_override, 0,
- "Force a txg if dirty buffers exceed this value (bytes)");
+TUNABLE_QUAD("vfs.zfs.dirty_data_max", &zfs_dirty_data_max);
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_max, CTLFLAG_RWTUN,
+ &zfs_dirty_data_max, 0,
+ "The maximum amount of dirty data in bytes after which new writes are "
+ "halted until space becomes available");
+TUNABLE_QUAD("vfs.zfs.dirty_data_max_max", &zfs_dirty_data_max_max);
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_max_max, CTLFLAG_RDTUN,
+ &zfs_dirty_data_max_max, 0,
+ "The absolute cap on dirty_data_max when auto calculating");
+
+TUNABLE_INT("vfs.zfs.dirty_data_max_percent", &zfs_dirty_data_max_percent);
+static int sysctl_zfs_dirty_data_max_percent(SYSCTL_HANDLER_ARGS);
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, dirty_data_max_percent,
+ CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int),
+ sysctl_zfs_dirty_data_max_percent, "I",
+ "The percent of physical memory used to auto calculate dirty_data_max");
+
+TUNABLE_QUAD("vfs.zfs.dirty_data_sync", &zfs_dirty_data_sync);
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_sync, CTLFLAG_RWTUN,
+ &zfs_dirty_data_sync, 0,
+ "Force a txg if the number of dirty buffer bytes exceed this value");
+
+static int sysctl_zfs_delay_min_dirty_percent(SYSCTL_HANDLER_ARGS);
+/* No zfs_delay_min_dirty_percent tunable due to limit requirements */
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, delay_min_dirty_percent,
+ CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(int),
+ sysctl_zfs_delay_min_dirty_percent, "I",
+ "The limit of outstanding dirty data before transactions are delayed");
+
+static int sysctl_zfs_delay_scale(SYSCTL_HANDLER_ARGS);
+/* No zfs_delay_scale tunable due to limit requirements */
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, delay_scale,
+ CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
+ sysctl_zfs_delay_scale, "QU",
+ "Controls how quickly the delay approaches infinity");
+
+static int
+sysctl_zfs_dirty_data_max_percent(SYSCTL_HANDLER_ARGS)
+{
+ int val, err;
+
+ val = zfs_dirty_data_max_percent;
+ err = sysctl_handle_int(oidp, &val, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+
+ if (val < 0 || val > 100)
+ return (EINVAL);
+
+ zfs_dirty_data_max_percent = val;
+
+ return (0);
+}
+
+static int
+sysctl_zfs_delay_min_dirty_percent(SYSCTL_HANDLER_ARGS)
+{
+ int val, err;
+
+ val = zfs_delay_min_dirty_percent;
+ err = sysctl_handle_int(oidp, &val, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+
+ if (val < zfs_vdev_async_write_active_max_dirty_percent)
+ return (EINVAL);
+
+ zfs_delay_min_dirty_percent = val;
+
+ return (0);
+}
+
+static int
+sysctl_zfs_delay_scale(SYSCTL_HANDLER_ARGS)
+{
+ uint64_t val;
+ int err;
+
+ val = zfs_delay_scale;
+ err = sysctl_handle_64(oidp, &val, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+
+ if (val > UINT64_MAX / zfs_dirty_data_max)
+ return (EINVAL);
+
+ zfs_delay_scale = val;
+
+ return (0);
+}
+#endif
+
int
dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
{
@@ -93,7 +239,7 @@
int err;
err = zap_lookup(dp->dp_meta_objset,
- dp->dp_root_dir->dd_phys->dd_child_dir_zapobj,
+ dsl_dir_phys(dp->dp_root_dir)->dd_child_dir_zapobj,
name, sizeof (obj), 1, &obj);
if (err)
return (err);
@@ -111,7 +257,6 @@
dp->dp_spa = spa;
dp->dp_meta_rootbp = *bp;
rrw_init(&dp->dp_config_rwlock, B_TRUE);
- dp->dp_write_limit = zfs_write_limit_min;
txg_init(dp, txg);
txg_list_create(&dp->dp_dirty_datasets,
@@ -124,6 +269,7 @@
offsetof(dsl_sync_task_t, dst_node));
mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL);
dp->dp_vnrele_taskq = taskq_create("zfs_vn_rele_taskq", 1, minclsyspri,
1, 4, 0);
@@ -175,11 +321,11 @@
err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd);
if (err)
goto out;
- err = dsl_dataset_hold_obj(dp, dd->dd_phys->dd_head_dataset_obj,
- FTAG, &ds);
+ err = dsl_dataset_hold_obj(dp,
+ dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds);
if (err == 0) {
err = dsl_dataset_hold_obj(dp,
- ds->ds_phys->ds_prev_snap_obj, dp,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj, dp,
&dp->dp_origin_snap);
dsl_dataset_rele(ds, FTAG);
}
@@ -202,8 +348,14 @@
dp->dp_meta_objset, obj));
}
- if (spa_feature_is_active(dp->dp_spa,
- &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
+ /*
+ * Note: errors ignored, because the leak dir will not exist if we
+ * have not encountered a leak yet.
+ */
+ (void) dsl_pool_open_special_dir(dp, LEAK_DIR_NAME,
+ &dp->dp_leak_dir);
+
+ if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) {
err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
&dp->dp_bptree_obj);
@@ -211,8 +363,7 @@
goto out;
}
- if (spa_feature_is_active(dp->dp_spa,
- &spa_feature_table[SPA_FEATURE_EMPTY_BPOBJ])) {
+ if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMPTY_BPOBJ)) {
err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
&dp->dp_empty_bpobj);
@@ -238,9 +389,9 @@
void
dsl_pool_close(dsl_pool_t *dp)
{
- /* drop our references from dsl_pool_open() */
-
/*
+ * Drop our references from dsl_pool_open().
+ *
* Since we held the origin_snap from "syncing" context (which
* includes pool-opening context), it actually only got a "ref"
* and not a hold, so just drop that here.
@@ -251,6 +402,8 @@
dsl_dir_rele(dp->dp_mos_dir, dp);
if (dp->dp_free_dir)
dsl_dir_rele(dp->dp_free_dir, dp);
+ if (dp->dp_leak_dir)
+ dsl_dir_rele(dp->dp_leak_dir, dp);
if (dp->dp_root_dir)
dsl_dir_rele(dp->dp_root_dir, dp);
@@ -265,9 +418,18 @@
txg_list_destroy(&dp->dp_sync_tasks);
txg_list_destroy(&dp->dp_dirty_dirs);
- arc_flush(dp->dp_spa);
+ /*
+ * We can't set retry to TRUE since we're explicitly specifying
+ * a spa to flush. This is good enough; any missed buffers for
+ * this spa won't cause trouble, and they'll eventually fall
+ * out of the ARC just like any other unused buffer.
+ */
+ arc_flush(dp->dp_spa, FALSE);
+
txg_fini(dp);
dsl_scan_fini(dp);
+ dmu_buf_user_evict_wait();
+
rrw_destroy(&dp->dp_config_rwlock);
mutex_destroy(&dp->dp_lock);
taskq_destroy(dp->dp_vnrele_taskq);
@@ -318,7 +480,7 @@
FREE_DIR_NAME, &dp->dp_free_dir));
/* create and open the free_bplist */
- obj = bpobj_alloc(dp->dp_meta_objset, SPA_MAXBLOCKSIZE, tx);
+ obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx);
VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0);
VERIFY0(bpobj_open(&dp->dp_free_bpobj,
@@ -333,8 +495,10 @@
/* create the root objset */
VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, &ds));
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
os = dmu_objset_create_impl(dp->dp_spa, ds,
dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx);
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
#ifdef _KERNEL
zfs_create_fs(os, kcred, zplprops, tx);
#endif
@@ -362,14 +526,34 @@
mutex_exit(&dp->dp_lock);
}
-static int
-deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+static void
+dsl_pool_sync_mos(dsl_pool_t *dp, dmu_tx_t *tx)
{
- dsl_deadlist_t *dl = arg;
- dsl_deadlist_insert(dl, bp, tx);
- return (0);
+ zio_t *zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+ dmu_objset_sync(dp->dp_meta_objset, zio, tx);
+ VERIFY0(zio_wait(zio));
+ dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
+ spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
}
+static void
+dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta)
+{
+ ASSERT(MUTEX_HELD(&dp->dp_lock));
+
+ if (delta < 0)
+ ASSERT3U(-delta, <=, dp->dp_dirty_total);
+
+ dp->dp_dirty_total += delta;
+
+ /*
+ * Note: we signal even when increasing dp_dirty_total.
+ * This ensures forward progress -- each thread wakes the next waiter.
+ */
+ if (dp->dp_dirty_total < zfs_dirty_data_max)
+ cv_signal(&dp->dp_spaceavail_cv);
+}
+
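dsl_pool_dirty_delta() above adjusts the pool-wide dirty total under dp_lock and signals dp_spaceavail_cv whenever the total is below zfs_dirty_data_max, even on increases, so each woken thread hands the wakeup to the next waiter (the waiting side lives in dmu_tx_wait()). A self-contained POSIX-threads sketch of that handoff; the names and sizes are illustrative, not the kernel cv_* primitives:

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t dp_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  spaceavail_cv = PTHREAD_COND_INITIALIZER;
static int64_t dirty_total;
static const int64_t dirty_max = 100;

/* Analogue of dsl_pool_dirty_delta(): caller holds dp_lock. */
static void
dirty_delta(int64_t delta)
{
	dirty_total += delta;
	/* Signal even on increases; each waiter re-checks and wakes the next. */
	if (dirty_total < dirty_max)
		pthread_cond_signal(&spaceavail_cv);
}

static void *
syncer(void *arg)
{
	(void)arg;
	sleep(1);			/* pretend to write out dirty data */
	pthread_mutex_lock(&dp_lock);
	dirty_delta(-80);		/* undirty: frees space, signals waiter */
	pthread_mutex_unlock(&dp_lock);
	return (NULL);
}

int
main(void)
{
	pthread_t tid;

	pthread_mutex_lock(&dp_lock);
	dirty_delta(100);		/* fill the dirty budget completely */
	pthread_mutex_unlock(&dp_lock);

	pthread_create(&tid, NULL, syncer, NULL);

	/* Analogue of a writer blocking until space is available. */
	pthread_mutex_lock(&dp_lock);
	while (dirty_total >= dirty_max)
		pthread_cond_wait(&spaceavail_cv, &dp_lock);
	dirty_delta(10);		/* now safe to dirty more data */
	pthread_mutex_unlock(&dp_lock);

	pthread_join(tid, NULL);
	printf("dirty_total = %lld\n", (long long)dirty_total);
	return (0);
}

Build with something like cc -std=c11 -lpthread; the example prints dirty_total = 30 after the waiter is released.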
void
dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
{
@@ -378,29 +562,18 @@
dsl_dir_t *dd;
dsl_dataset_t *ds;
objset_t *mos = dp->dp_meta_objset;
- hrtime_t start, write_time;
- uint64_t data_written;
- int err;
list_t synced_datasets;
list_create(&synced_datasets, sizeof (dsl_dataset_t),
offsetof(dsl_dataset_t, ds_synced_link));
+ tx = dmu_tx_create_assigned(dp, txg);
+
/*
- * We need to copy dp_space_towrite() before doing
- * dsl_sync_task_sync(), because
- * dsl_dataset_snapshot_reserve_space() will increase
- * dp_space_towrite but not actually write anything.
+ * Write out all dirty blocks of dirty datasets.
*/
- data_written = dp->dp_space_towrite[txg & TXG_MASK];
-
- tx = dmu_tx_create_assigned(dp, txg);
-
- dp->dp_read_overhead = 0;
- start = gethrtime();
-
zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
- while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) {
+ while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
/*
* We must not sync any non-MOS datasets twice, because
* we may have taken a snapshot of them. However, we
@@ -410,20 +583,25 @@
list_insert_tail(&synced_datasets, ds);
dsl_dataset_sync(ds, zio, tx);
}
- DTRACE_PROBE(pool_sync__1setup);
- err = zio_wait(zio);
+ VERIFY0(zio_wait(zio));
- write_time = gethrtime() - start;
- ASSERT(err == 0);
- DTRACE_PROBE(pool_sync__2rootzio);
+ /*
+ * We have written all of the accounted dirty data, so our
+ * dp_space_towrite should now be zero. However, some seldom-used
+ * code paths do not adhere to this (e.g. dbuf_undirty(), also
+ * rounding error in dbuf_write_physdone).
+ * Shore up the accounting of any dirtied space now.
+ */
+ dsl_pool_undirty_space(dp, dp->dp_dirty_pertxg[txg & TXG_MASK], txg);
/*
* After the data blocks have been written (ensured by the zio_wait()
* above), update the user/group space accounting.
*/
- for (ds = list_head(&synced_datasets); ds;
- ds = list_next(&synced_datasets, ds))
+ for (ds = list_head(&synced_datasets); ds != NULL;
+ ds = list_next(&synced_datasets, ds)) {
dmu_objset_do_userquota_updates(ds->ds_objset, tx);
+ }
/*
* Sync the datasets again to push out the changes due to
@@ -433,12 +611,12 @@
* about which blocks are part of the snapshot).
*/
zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
- while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) {
+ while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) {
ASSERT(list_link_active(&ds->ds_synced_link));
dmu_buf_rele(ds->ds_dbuf, ds);
dsl_dataset_sync(ds, zio, tx);
}
- err = zio_wait(zio);
+ VERIFY0(zio_wait(zio));
/*
* Now that the datasets have been completely synced, we can
@@ -447,18 +625,12 @@
* - move dead blocks from the pending deadlist to the on-disk deadlist
* - release hold from dsl_dataset_dirty()
*/
- while (ds = list_remove_head(&synced_datasets)) {
- objset_t *os = ds->ds_objset;
- bplist_iterate(&ds->ds_pending_deadlist,
- deadlist_enqueue_cb, &ds->ds_deadlist, tx);
- ASSERT(!dmu_objset_is_dirty(os, txg));
- dmu_buf_rele(ds->ds_dbuf, ds);
+ while ((ds = list_remove_head(&synced_datasets)) != NULL) {
+ dsl_dataset_sync_done(ds, tx);
}
-
- start = gethrtime();
- while (dd = txg_list_remove(&dp->dp_dirty_dirs, txg))
+ while ((dd = txg_list_remove(&dp->dp_dirty_dirs, txg)) != NULL) {
dsl_dir_sync(dd, tx);
- write_time += gethrtime() - start;
+ }
/*
* The MOS's space is accounted for in the pool/$MOS
@@ -476,20 +648,10 @@
dp->dp_mos_uncompressed_delta = 0;
}
- start = gethrtime();
if (list_head(&mos->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
list_head(&mos->os_free_dnodes[txg & TXG_MASK]) != NULL) {
- zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
- dmu_objset_sync(mos, zio, tx);
- err = zio_wait(zio);
- ASSERT(err == 0);
- dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
- spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
+ dsl_pool_sync_mos(dp, tx);
}
- write_time += gethrtime() - start;
- DTRACE_PROBE2(pool_sync__4io, hrtime_t, write_time,
- hrtime_t, dp->dp_read_overhead);
- write_time -= dp->dp_read_overhead;
/*
* If we modify a dataset in the same txg that we want to destroy it,
@@ -500,7 +662,6 @@
* The MOS data dirtied by the sync_tasks will be synced on the next
* pass.
*/
- DTRACE_PROBE(pool_sync__3task);
if (!txg_list_empty(&dp->dp_sync_tasks, txg)) {
dsl_sync_task_t *dst;
/*
@@ -507,54 +668,14 @@
* No more sync tasks should have been added while we
* were syncing.
*/
- ASSERT(spa_sync_pass(dp->dp_spa) == 1);
- while (dst = txg_list_remove(&dp->dp_sync_tasks, txg))
+ ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1);
+ while ((dst = txg_list_remove(&dp->dp_sync_tasks, txg)) != NULL)
dsl_sync_task_sync(dst, tx);
}
dmu_tx_commit(tx);
- dp->dp_space_towrite[txg & TXG_MASK] = 0;
- ASSERT(dp->dp_tempreserved[txg & TXG_MASK] == 0);
-
- /*
- * If the write limit max has not been explicitly set, set it
- * to a fraction of available physical memory (default 1/8th).
- * Note that we must inflate the limit because the spa
- * inflates write sizes to account for data replication.
- * Check this each sync phase to catch changing memory size.
- */
- if (physmem != old_physmem && zfs_write_limit_shift) {
- mutex_enter(&zfs_write_limit_lock);
- old_physmem = physmem;
- zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
- zfs_write_limit_inflated = MAX(zfs_write_limit_min,
- spa_get_asize(dp->dp_spa, zfs_write_limit_max));
- mutex_exit(&zfs_write_limit_lock);
- }
-
- /*
- * Attempt to keep the sync time consistent by adjusting the
- * amount of write traffic allowed into each transaction group.
- * Weight the throughput calculation towards the current value:
- * thru = 3/4 old_thru + 1/4 new_thru
- *
- * Note: write_time is in nanosecs, so write_time/MICROSEC
- * yields millisecs
- */
- ASSERT(zfs_write_limit_min > 0);
- if (data_written > zfs_write_limit_min / 8 && write_time > MICROSEC) {
- uint64_t throughput = data_written / (write_time / MICROSEC);
-
- if (dp->dp_throughput)
- dp->dp_throughput = throughput / 4 +
- 3 * dp->dp_throughput / 4;
- else
- dp->dp_throughput = throughput;
- dp->dp_write_limit = MIN(zfs_write_limit_inflated,
- MAX(zfs_write_limit_min,
- dp->dp_throughput * zfs_txg_synctime_ms));
- }
+ DTRACE_PROBE2(dsl_pool_sync__done, dsl_pool_t *dp, dp, uint64_t, txg);
}
void
@@ -561,11 +682,17 @@
dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg)
{
zilog_t *zilog;
- dsl_dataset_t *ds;
- while (zilog = txg_list_remove(&dp->dp_dirty_zilogs, txg)) {
- ds = dmu_objset_ds(zilog->zl_os);
+ while (zilog = txg_list_head(&dp->dp_dirty_zilogs, txg)) {
+ dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
+ /*
+ * We don't remove the zilog from the dp_dirty_zilogs
+ * list until after we've cleaned it. This ensures that
+ * callers of zilog_is_dirty() receive an accurate
+ * answer when they are racing with the spa sync thread.
+ */
zil_clean(zilog, txg);
+ (void) txg_list_remove_this(&dp->dp_dirty_zilogs, zilog, txg);
ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg));
dmu_buf_rele(ds->ds_dbuf, zilog);
}
@@ -589,17 +716,12 @@
uint64_t space, resv;
/*
- * Reserve about 1.6% (1/64), or at least 32MB, for allocation
- * efficiency.
- * XXX The intent log is not accounted for, so it must fit
- * within this slop.
- *
* If we're trying to assess whether it's OK to do a free,
* cut the reservation in half to allow forward progress
* (e.g. make it possible to rm(1) files from a full pool).
*/
space = spa_get_dspace(dp->dp_spa);
- resv = MAX(space >> 6, SPA_MINDEVSIZE >> 1);
+ resv = spa_get_slop_space(dp->dp_spa);
if (netfree)
resv >>= 1;
@@ -606,82 +728,50 @@
return (space - resv);
}
-int
-dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx)
+boolean_t
+dsl_pool_need_dirty_delay(dsl_pool_t *dp)
{
- uint64_t reserved = 0;
- uint64_t write_limit = (zfs_write_limit_override ?
- zfs_write_limit_override : dp->dp_write_limit);
+ uint64_t delay_min_bytes =
+ zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
+ boolean_t rv;
- if (zfs_no_write_throttle) {
- atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK],
- space);
- return (0);
- }
-
- /*
- * Check to see if we have exceeded the maximum allowed IO for
- * this transaction group. We can do this without locks since
- * a little slop here is ok. Note that we do the reserved check
- * with only half the requested reserve: this is because the
- * reserve requests are worst-case, and we really don't want to
- * throttle based off of worst-case estimates.
- */
- if (write_limit > 0) {
- reserved = dp->dp_space_towrite[tx->tx_txg & TXG_MASK]
- + dp->dp_tempreserved[tx->tx_txg & TXG_MASK] / 2;
-
- if (reserved && reserved > write_limit)
- return (SET_ERROR(ERESTART));
- }
-
- atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], space);
-
- /*
- * If this transaction group is over 7/8ths capacity, delay
- * the caller 1 clock tick. This will slow down the "fill"
- * rate until the sync process can catch up with us.
- */
- if (reserved && reserved > (write_limit - (write_limit >> 3)))
- txg_delay(dp, tx->tx_txg, 1);
-
- return (0);
+ mutex_enter(&dp->dp_lock);
+ if (dp->dp_dirty_total > zfs_dirty_data_sync)
+ txg_kick(dp);
+ rv = (dp->dp_dirty_total > delay_min_bytes);
+ mutex_exit(&dp->dp_lock);
+ return (rv);
}
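dsl_pool_need_dirty_delay() kicks a txg once dp_dirty_total passes zfs_dirty_data_sync and reports that callers should be delayed once it passes zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100. A worked example of that threshold under assumed values (4 GiB dirty_data_max, 60 percent delay_min_dirty_percent; both are tunables, so treat the numbers as illustrative):

#include <inttypes.h>
#include <stdio.h>

int
main(void)
{
	/* Assumed/illustrative values; the real ones are sysctl tunables. */
	uint64_t dirty_data_max = 4ULL << 30;		/* 4 GiB */
	uint64_t delay_min_dirty_percent = 60;

	uint64_t delay_min_bytes =
	    dirty_data_max * delay_min_dirty_percent / 100;

	/* With these numbers, delays begin at roughly 2.4 GiB of dirty data. */
	printf("delay_min_bytes = %" PRIu64 " (%.2f GiB)\n",
	    delay_min_bytes, (double)delay_min_bytes / (1ULL << 30));
	return (0);
}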
void
-dsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
+dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
{
- ASSERT(dp->dp_tempreserved[tx->tx_txg & TXG_MASK] >= space);
- atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], -space);
+ if (space > 0) {
+ mutex_enter(&dp->dp_lock);
+ dp->dp_dirty_pertxg[tx->tx_txg & TXG_MASK] += space;
+ dsl_pool_dirty_delta(dp, space);
+ mutex_exit(&dp->dp_lock);
+ }
}
void
-dsl_pool_memory_pressure(dsl_pool_t *dp)
+dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg)
{
- uint64_t space_inuse = 0;
- int i;
-
- if (dp->dp_write_limit == zfs_write_limit_min)
+ ASSERT3S(space, >=, 0);
+ if (space == 0)
return;
-
- for (i = 0; i < TXG_SIZE; i++) {
- space_inuse += dp->dp_space_towrite[i];
- space_inuse += dp->dp_tempreserved[i];
+ mutex_enter(&dp->dp_lock);
+ if (dp->dp_dirty_pertxg[txg & TXG_MASK] < space) {
+ /* XXX writing something we didn't dirty? */
+ space = dp->dp_dirty_pertxg[txg & TXG_MASK];
}
- dp->dp_write_limit = MAX(zfs_write_limit_min,
- MIN(dp->dp_write_limit, space_inuse / 4));
+ ASSERT3U(dp->dp_dirty_pertxg[txg & TXG_MASK], >=, space);
+ dp->dp_dirty_pertxg[txg & TXG_MASK] -= space;
+ ASSERT3U(dp->dp_dirty_total, >=, space);
+ dsl_pool_dirty_delta(dp, -space);
+ mutex_exit(&dp->dp_lock);
}
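dsl_pool_dirty_space()/dsl_pool_undirty_space() keep per-txg dirty byte counts in a small ring indexed by txg & TXG_MASK plus a pool-wide total, clamping an over-undirty rather than letting the bucket underflow. A compact sketch of that bookkeeping, assuming the usual 4-entry ring:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define	TXG_SIZE	4		/* assumed ring depth */
#define	TXG_MASK	(TXG_SIZE - 1)

static uint64_t dirty_pertxg[TXG_SIZE];
static uint64_t dirty_total;

static void
dirty_space(uint64_t txg, uint64_t space)
{
	dirty_pertxg[txg & TXG_MASK] += space;
	dirty_total += space;
}

static void
undirty_space(uint64_t txg, uint64_t space)
{
	/* Clamp, mirroring the "writing something we didn't dirty?" case. */
	if (dirty_pertxg[txg & TXG_MASK] < space)
		space = dirty_pertxg[txg & TXG_MASK];
	dirty_pertxg[txg & TXG_MASK] -= space;
	assert(dirty_total >= space);
	dirty_total -= space;
}

int
main(void)
{
	dirty_space(37, 4096);		/* txg 37 lands in bucket 37 & 3 == 1 */
	undirty_space(37, 8192);	/* over-undirty is clamped to 4096 */
	printf("bucket[1]=%llu total=%llu\n",
	    (unsigned long long)dirty_pertxg[1],
	    (unsigned long long)dirty_total);
	return (0);
}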
-void
-dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
-{
- if (space > 0) {
- mutex_enter(&dp->dp_lock);
- dp->dp_space_towrite[tx->tx_txg & TXG_MASK] += space;
- mutex_exit(&dp->dp_lock);
- }
-}
-
/* ARGSUSED */
static int
upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
@@ -694,15 +784,15 @@
if (err)
return (err);
- while (ds->ds_phys->ds_prev_snap_obj != 0) {
- err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
- FTAG, &prev);
+ while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
+ err = dsl_dataset_hold_obj(dp,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
if (err) {
dsl_dataset_rele(ds, FTAG);
return (err);
}
- if (prev->ds_phys->ds_next_snap_obj != ds->ds_object)
+ if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object)
break;
dsl_dataset_rele(ds, FTAG);
ds = prev;
@@ -716,7 +806,9 @@
* The $ORIGIN can't have any data, or the accounting
* will be wrong.
*/
- ASSERT0(prev->ds_phys->ds_bp.blk_birth);
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+ ASSERT0(dsl_dataset_phys(prev)->ds_bp.blk_birth);
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
/* The origin doesn't get attached to itself */
if (ds->ds_object == prev->ds_object) {
@@ -725,33 +817,35 @@
}
dmu_buf_will_dirty(ds->ds_dbuf, tx);
- ds->ds_phys->ds_prev_snap_obj = prev->ds_object;
- ds->ds_phys->ds_prev_snap_txg = prev->ds_phys->ds_creation_txg;
+ dsl_dataset_phys(ds)->ds_prev_snap_obj = prev->ds_object;
+ dsl_dataset_phys(ds)->ds_prev_snap_txg =
+ dsl_dataset_phys(prev)->ds_creation_txg;
dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
- ds->ds_dir->dd_phys->dd_origin_obj = prev->ds_object;
+ dsl_dir_phys(ds->ds_dir)->dd_origin_obj = prev->ds_object;
dmu_buf_will_dirty(prev->ds_dbuf, tx);
- prev->ds_phys->ds_num_children++;
+ dsl_dataset_phys(prev)->ds_num_children++;
- if (ds->ds_phys->ds_next_snap_obj == 0) {
+ if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0) {
ASSERT(ds->ds_prev == NULL);
VERIFY0(dsl_dataset_hold_obj(dp,
- ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev));
+ dsl_dataset_phys(ds)->ds_prev_snap_obj,
+ ds, &ds->ds_prev));
}
}
- ASSERT3U(ds->ds_dir->dd_phys->dd_origin_obj, ==, prev->ds_object);
- ASSERT3U(ds->ds_phys->ds_prev_snap_obj, ==, prev->ds_object);
+ ASSERT3U(dsl_dir_phys(ds->ds_dir)->dd_origin_obj, ==, prev->ds_object);
+ ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_obj, ==, prev->ds_object);
- if (prev->ds_phys->ds_next_clones_obj == 0) {
+ if (dsl_dataset_phys(prev)->ds_next_clones_obj == 0) {
dmu_buf_will_dirty(prev->ds_dbuf, tx);
- prev->ds_phys->ds_next_clones_obj =
+ dsl_dataset_phys(prev)->ds_next_clones_obj =
zap_create(dp->dp_meta_objset,
DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
}
VERIFY0(zap_add_int(dp->dp_meta_objset,
- prev->ds_phys->ds_next_clones_obj, ds->ds_object, tx));
+ dsl_dataset_phys(prev)->ds_next_clones_obj, ds->ds_object, tx));
dsl_dataset_rele(ds, FTAG);
if (prev != dp->dp_origin_snap)
@@ -766,7 +860,7 @@
ASSERT(dp->dp_origin_snap != NULL);
VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, upgrade_clones_cb,
- tx, DS_FIND_CHILDREN));
+ tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE));
}
/* ARGSUSED */
@@ -776,20 +870,22 @@
dmu_tx_t *tx = arg;
objset_t *mos = dp->dp_meta_objset;
- if (ds->ds_dir->dd_phys->dd_origin_obj != 0) {
+ if (dsl_dir_phys(ds->ds_dir)->dd_origin_obj != 0) {
dsl_dataset_t *origin;
VERIFY0(dsl_dataset_hold_obj(dp,
- ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &origin));
+ dsl_dir_phys(ds->ds_dir)->dd_origin_obj, FTAG, &origin));
- if (origin->ds_dir->dd_phys->dd_clones == 0) {
+ if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) {
dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
- origin->ds_dir->dd_phys->dd_clones = zap_create(mos,
- DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
+ dsl_dir_phys(origin->ds_dir)->dd_clones =
+ zap_create(mos, DMU_OT_DSL_CLONES, DMU_OT_NONE,
+ 0, tx);
}
VERIFY0(zap_add_int(dp->dp_meta_objset,
- origin->ds_dir->dd_phys->dd_clones, ds->ds_object, tx));
+ dsl_dir_phys(origin->ds_dir)->dd_clones,
+ ds->ds_object, tx));
dsl_dataset_rele(origin, FTAG);
}
@@ -812,13 +908,13 @@
* subobj support. So call dmu_object_alloc() directly.
*/
obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ,
- SPA_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx);
+ SPA_OLD_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx);
VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj));
VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
- upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN));
+ upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE));
}
void
@@ -836,7 +932,7 @@
NULL, 0, kcred, tx);
VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
dsl_dataset_snapshot_sync_impl(ds, ORIGIN_DIR_NAME, tx);
- VERIFY0(dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
+ VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj,
dp, &dp->dp_origin_snap));
dsl_dataset_rele(ds, FTAG);
}
@@ -1052,6 +1148,13 @@
}
void
+dsl_pool_config_enter_prio(dsl_pool_t *dp, void *tag)
+{
+ ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER));
+ rrw_enter_read_prio(&dp->dp_config_rwlock, tag);
+}
+
+void
dsl_pool_config_exit(dsl_pool_t *dp, void *tag)
{
rrw_exit(&dp->dp_config_rwlock, tag);
@@ -1062,3 +1165,9 @@
{
return (RRW_LOCK_HELD(&dp->dp_config_rwlock));
}
+
+boolean_t
+dsl_pool_config_held_writer(dsl_pool_t *dp)
+{
+ return (RRW_WRITE_HELD(&dp->dp_config_rwlock));
+}
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,7 +21,8 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright 2015, Joyent, Inc.
*/
#include <sys/zfs_context.h>
@@ -42,20 +43,20 @@
#define ZPROP_RECVD_SUFFIX "$recvd"
static int
-dodefault(const char *propname, int intsz, int numints, void *buf)
+dodefault(zfs_prop_t prop, int intsz, int numints, void *buf)
{
- zfs_prop_t prop;
-
/*
* The setonce properties are read-only, BUT they still
* have a default value that can be used as the initial
* value.
*/
- if ((prop = zfs_name_to_prop(propname)) == ZPROP_INVAL ||
+ if (prop == ZPROP_INVAL ||
(zfs_prop_readonly(prop) && !zfs_prop_setonce(prop)))
return (SET_ERROR(ENOENT));
if (zfs_prop_get_type(prop) == PROP_TYPE_STRING) {
+ if (zfs_prop_default_string(prop) == NULL)
+ return (SET_ERROR(ENOENT));
if (intsz != 1)
return (SET_ERROR(EOVERFLOW));
(void) strncpy(buf, zfs_prop_default_string(prop),
@@ -105,8 +106,8 @@
}
/* Check for a local value. */
- err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj, propname,
- intsz, numints, buf);
+ err = zap_lookup(mos, dsl_dir_phys(dd)->dd_props_zapobj,
+ propname, intsz, numints, buf);
if (err != ENOENT) {
if (setpoint != NULL && err == 0)
dsl_dir_name(dd, setpoint);
@@ -117,7 +118,7 @@
* Skip the check for a received value if there is an explicit
* inheritance entry.
*/
- err = zap_contains(mos, dd->dd_phys->dd_props_zapobj,
+ err = zap_contains(mos, dsl_dir_phys(dd)->dd_props_zapobj,
inheritstr);
if (err != 0 && err != ENOENT)
break;
@@ -124,7 +125,7 @@
if (err == ENOENT) {
/* Check for a received value. */
- err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj,
+ err = zap_lookup(mos, dsl_dir_phys(dd)->dd_props_zapobj,
recvdstr, intsz, numints, buf);
if (err != ENOENT) {
if (setpoint != NULL && err == 0) {
@@ -149,7 +150,7 @@
}
if (err == ENOENT)
- err = dodefault(propname, intsz, numints, buf);
+ err = dodefault(prop, intsz, numints, buf);
strfree(inheritstr);
strfree(recvdstr);
@@ -163,19 +164,17 @@
{
zfs_prop_t prop = zfs_name_to_prop(propname);
boolean_t inheritable;
- boolean_t snapshot;
uint64_t zapobj;
ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
inheritable = (prop == ZPROP_INVAL || zfs_prop_inheritable(prop));
- snapshot = (ds->ds_phys != NULL && dsl_dataset_is_snapshot(ds));
- zapobj = (ds->ds_phys == NULL ? 0 : ds->ds_phys->ds_props_obj);
+ zapobj = dsl_dataset_phys(ds)->ds_props_obj;
if (zapobj != 0) {
objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
int err;
- ASSERT(snapshot);
+ ASSERT(ds->ds_is_snapshot);
/* Check for a local value. */
err = zap_lookup(mos, zapobj, propname, intsz, numints, buf);
@@ -215,9 +214,61 @@
}
return (dsl_prop_get_dd(ds->ds_dir, propname,
- intsz, numints, buf, setpoint, snapshot));
+ intsz, numints, buf, setpoint, ds->ds_is_snapshot));
}
+static dsl_prop_record_t *
+dsl_prop_record_find(dsl_dir_t *dd, const char *propname)
+{
+ dsl_prop_record_t *pr = NULL;
+
+ ASSERT(MUTEX_HELD(&dd->dd_lock));
+
+ for (pr = list_head(&dd->dd_props);
+ pr != NULL; pr = list_next(&dd->dd_props, pr)) {
+ if (strcmp(pr->pr_propname, propname) == 0)
+ break;
+ }
+
+ return (pr);
+}
+
+static dsl_prop_record_t *
+dsl_prop_record_create(dsl_dir_t *dd, const char *propname)
+{
+ dsl_prop_record_t *pr;
+
+ ASSERT(MUTEX_HELD(&dd->dd_lock));
+
+ pr = kmem_alloc(sizeof (dsl_prop_record_t), KM_SLEEP);
+ pr->pr_propname = spa_strdup(propname);
+ list_create(&pr->pr_cbs, sizeof (dsl_prop_cb_record_t),
+ offsetof(dsl_prop_cb_record_t, cbr_pr_node));
+ list_insert_head(&dd->dd_props, pr);
+
+ return (pr);
+}
+
+void
+dsl_prop_init(dsl_dir_t *dd)
+{
+ list_create(&dd->dd_props, sizeof (dsl_prop_record_t),
+ offsetof(dsl_prop_record_t, pr_node));
+}
+
+void
+dsl_prop_fini(dsl_dir_t *dd)
+{
+ dsl_prop_record_t *pr;
+
+ while ((pr = list_remove_head(&dd->dd_props)) != NULL) {
+ list_destroy(&pr->pr_cbs);
+ strfree((char *)pr->pr_propname);
+ kmem_free(pr, sizeof (dsl_prop_record_t));
+ }
+ list_destroy(&dd->dd_props);
+}
+
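The new dsl_prop_record_t hangs one record per property name off the directory, with every callback for that name listed under the record, so a change notification only walks callbacks for the property that actually changed. A userland sketch of the same find-or-create-by-name idiom over a simple singly linked list (plain C structures, not the kernel list_t API):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

typedef struct prop_record {
	struct prop_record *pr_next;
	char *pr_propname;
	/* the per-property callback list would hang off here */
} prop_record_t;

static prop_record_t *records;

static prop_record_t *
record_find(const char *propname)
{
	for (prop_record_t *pr = records; pr != NULL; pr = pr->pr_next)
		if (strcmp(pr->pr_propname, propname) == 0)
			return (pr);
	return (NULL);
}

static prop_record_t *
record_find_or_create(const char *propname)
{
	prop_record_t *pr = record_find(propname);

	if (pr == NULL) {
		pr = malloc(sizeof (*pr));
		pr->pr_propname = strdup(propname);
		pr->pr_next = records;	/* insert at head, like the kernel code */
		records = pr;
	}
	return (pr);
}

int
main(void)
{
	prop_record_t *a = record_find_or_create("compression");
	prop_record_t *b = record_find_or_create("compression");

	printf("same record: %s\n", a == b ? "yes" : "no");
	return (0);
}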
/*
* Register interest in the named property. We'll call the callback
* once to notify it of the current property value, and again each time
@@ -232,6 +283,7 @@
dsl_dir_t *dd = ds->ds_dir;
dsl_pool_t *dp = dd->dd_pool;
uint64_t value;
+ dsl_prop_record_t *pr;
dsl_prop_cb_record_t *cbr;
int err;
@@ -243,12 +295,16 @@
cbr = kmem_alloc(sizeof (dsl_prop_cb_record_t), KM_SLEEP);
cbr->cbr_ds = ds;
- cbr->cbr_propname = kmem_alloc(strlen(propname)+1, KM_SLEEP);
- (void) strcpy((char *)cbr->cbr_propname, propname);
cbr->cbr_func = callback;
cbr->cbr_arg = cbarg;
+
mutex_enter(&dd->dd_lock);
- list_insert_head(&dd->dd_prop_cbs, cbr);
+ pr = dsl_prop_record_find(dd, propname);
+ if (pr == NULL)
+ pr = dsl_prop_record_create(dd, propname);
+ cbr->cbr_pr = pr;
+ list_insert_head(&pr->pr_cbs, cbr);
+ list_insert_head(&ds->ds_prop_cbs, cbr);
mutex_exit(&dd->dd_lock);
cbr->cbr_func(cbr->cbr_arg, value);
@@ -327,7 +383,7 @@
}
mos = dd->dd_pool->dp_meta_objset;
- zapobj = dd->dd_phys->dd_props_zapobj;
+ zapobj = dsl_dir_phys(dd)->dd_props_zapobj;
recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX);
version = spa_version(dd->dd_pool->dp_spa);
@@ -379,56 +435,34 @@
}
/*
- * Unregister this callback. Return 0 on success, ENOENT if ddname is
- * invalid, or ENOMSG if no matching callback registered.
+ * Unregister all callbacks that are registered with the
+ * given callback argument.
*/
-int
-dsl_prop_unregister(dsl_dataset_t *ds, const char *propname,
- dsl_prop_changed_cb_t *callback, void *cbarg)
+void
+dsl_prop_unregister_all(dsl_dataset_t *ds, void *cbarg)
{
+ dsl_prop_cb_record_t *cbr, *next_cbr;
+
dsl_dir_t *dd = ds->ds_dir;
- dsl_prop_cb_record_t *cbr;
mutex_enter(&dd->dd_lock);
- for (cbr = list_head(&dd->dd_prop_cbs);
- cbr; cbr = list_next(&dd->dd_prop_cbs, cbr)) {
- if (cbr->cbr_ds == ds &&
- cbr->cbr_func == callback &&
- cbr->cbr_arg == cbarg &&
- strcmp(cbr->cbr_propname, propname) == 0)
- break;
+ next_cbr = list_head(&ds->ds_prop_cbs);
+ while (next_cbr != NULL) {
+ cbr = next_cbr;
+ next_cbr = list_next(&ds->ds_prop_cbs, cbr);
+ if (cbr->cbr_arg == cbarg) {
+ list_remove(&ds->ds_prop_cbs, cbr);
+ list_remove(&cbr->cbr_pr->pr_cbs, cbr);
+ kmem_free(cbr, sizeof (dsl_prop_cb_record_t));
+ }
}
-
- if (cbr == NULL) {
- mutex_exit(&dd->dd_lock);
- return (SET_ERROR(ENOMSG));
- }
-
- list_remove(&dd->dd_prop_cbs, cbr);
mutex_exit(&dd->dd_lock);
- kmem_free((void*)cbr->cbr_propname, strlen(cbr->cbr_propname)+1);
- kmem_free(cbr, sizeof (dsl_prop_cb_record_t));
-
- return (0);
}
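dsl_prop_unregister_all() walks the dataset's callback list, saving the next entry before it removes and frees any record whose cbr_arg matches. The sketch below shows an equivalent filter-while-iterating idiom using a pointer-to-next link; the data structures are illustrative:

#include <stdio.h>
#include <stdlib.h>

typedef struct cb {
	struct cb *next;
	void *arg;
} cb_t;

/* Remove and free every callback registered with 'arg'. */
static void
unregister_all(cb_t **head, void *arg)
{
	cb_t **link = head;
	cb_t *cur, *next;

	for (cur = *head; cur != NULL; cur = next) {
		next = cur->next;	/* grab next before cur may be freed */
		if (cur->arg == arg) {
			*link = next;	/* unlink */
			free(cur);
		} else {
			link = &cur->next;
		}
	}
}

int
main(void)
{
	int a, b, n = 0;
	cb_t *head = NULL;

	for (int i = 0; i < 4; i++) {
		cb_t *c = malloc(sizeof (*c));
		c->arg = (i % 2) ? &a : &b;
		c->next = head;
		head = c;
	}
	unregister_all(&head, &a);
	for (cb_t *c = head; c != NULL; c = c->next)
		n++;
	printf("%d callbacks left\n", n);	/* prints 2 */
	return (0);
}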
boolean_t
dsl_prop_hascb(dsl_dataset_t *ds)
{
- dsl_dir_t *dd = ds->ds_dir;
- boolean_t rv = B_FALSE;
- dsl_prop_cb_record_t *cbr;
-
- mutex_enter(&dd->dd_lock);
- for (cbr = list_head(&dd->dd_prop_cbs); cbr;
- cbr = list_next(&dd->dd_prop_cbs, cbr)) {
- if (cbr->cbr_ds == ds) {
- rv = B_TRUE;
- break;
- }
- }
- mutex_exit(&dd->dd_lock);
- return (rv);
+ return (!list_is_empty(&ds->ds_prop_cbs));
}
/* ARGSUSED */
@@ -436,16 +470,50 @@
dsl_prop_notify_all_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
{
dsl_dir_t *dd = ds->ds_dir;
+ dsl_prop_record_t *pr;
dsl_prop_cb_record_t *cbr;
mutex_enter(&dd->dd_lock);
- for (cbr = list_head(&dd->dd_prop_cbs); cbr;
- cbr = list_next(&dd->dd_prop_cbs, cbr)) {
- uint64_t value;
+ for (pr = list_head(&dd->dd_props);
+ pr; pr = list_next(&dd->dd_props, pr)) {
+ for (cbr = list_head(&pr->pr_cbs); cbr;
+ cbr = list_next(&pr->pr_cbs, cbr)) {
+ uint64_t value;
- if (dsl_prop_get_ds(cbr->cbr_ds, cbr->cbr_propname,
- sizeof (value), 1, &value, NULL) == 0)
- cbr->cbr_func(cbr->cbr_arg, value);
+ /*
+ * Callback entries do not have holds on their
+ * datasets so that datasets with registered
+ * callbacks are still eligible for eviction.
+ * Unlike operations to update properties on a
+ * single dataset, we are performing a recursive
+ * descent of related head datasets. The caller
+ * of this function only has a dataset hold on
+ * the passed in head dataset, not the snapshots
+ * associated with this dataset. Without a hold,
+ * the dataset pointer within callback records
+ * for snapshots can be invalidated by eviction
+ * at any time.
+ *
+ * Use dsl_dataset_try_add_ref() to verify
+ * that the dataset for a snapshot has not
+ * begun eviction processing and to prevent
+ * eviction from occurring for the duration of
+ * the callback. If the hold attempt fails,
+ * this object is already being evicted and the
+ * callback can be safely ignored.
+ */
+ if (ds != cbr->cbr_ds &&
+ !dsl_dataset_try_add_ref(dp, cbr->cbr_ds, FTAG))
+ continue;
+
+ if (dsl_prop_get_ds(cbr->cbr_ds,
+ cbr->cbr_pr->pr_propname, sizeof (value), 1,
+ &value, NULL) == 0)
+ cbr->cbr_func(cbr->cbr_arg, value);
+
+ if (ds != cbr->cbr_ds)
+ dsl_dataset_rele(cbr->cbr_ds, FTAG);
+ }
}
mutex_exit(&dd->dd_lock);
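The comment block in the hunk above is the key change: callback records hold no reference on their dataset, so the notifier takes a temporary reference with dsl_dataset_try_add_ref() before touching cbr_ds, skips the entry if the dataset is already being evicted, and releases the reference afterwards. A hedged sketch of that try-ref/skip/release pattern with a plain C11 atomic refcount (the real code uses the dataset hold machinery, not this):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

typedef struct obj {
	atomic_int refs;	/* 0 means eviction has already begun */
	int value;
} obj_t;

/* Take a reference only if the object is not already being torn down. */
static bool
try_add_ref(obj_t *o)
{
	int r = atomic_load(&o->refs);

	while (r > 0) {
		if (atomic_compare_exchange_weak(&o->refs, &r, r + 1))
			return (true);
	}
	return (false);		/* already being evicted: caller must skip it */
}

static void
rele(obj_t *o)
{
	atomic_fetch_sub(&o->refs, 1);
}

int
main(void)
{
	obj_t live = { .refs = 1, .value = 42 };
	obj_t dying = { .refs = 0, .value = 7 };

	if (try_add_ref(&live)) {
		printf("safe to use: %d\n", live.value);
		rele(&live);
	}
	if (!try_add_ref(&dying))
		printf("skipped object under eviction\n");
	return (0);
}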
@@ -470,6 +538,7 @@
const char *propname, uint64_t value, int first)
{
dsl_dir_t *dd;
+ dsl_prop_record_t *pr;
dsl_prop_cb_record_t *cbr;
objset_t *mos = dp->dp_meta_objset;
zap_cursor_t zc;
@@ -486,7 +555,8 @@
* If the prop is set here, then this change is not
* being inherited here or below; stop the recursion.
*/
- err = zap_contains(mos, dd->dd_phys->dd_props_zapobj, propname);
+ err = zap_contains(mos, dsl_dir_phys(dd)->dd_props_zapobj,
+ propname);
if (err == 0) {
dsl_dir_rele(dd, FTAG);
return;
@@ -495,27 +565,39 @@
}
mutex_enter(&dd->dd_lock);
- for (cbr = list_head(&dd->dd_prop_cbs); cbr;
- cbr = list_next(&dd->dd_prop_cbs, cbr)) {
- uint64_t propobj = cbr->cbr_ds->ds_phys->ds_props_obj;
+ pr = dsl_prop_record_find(dd, propname);
+ if (pr != NULL) {
+ for (cbr = list_head(&pr->pr_cbs); cbr;
+ cbr = list_next(&pr->pr_cbs, cbr)) {
+ uint64_t propobj;
- if (strcmp(cbr->cbr_propname, propname) != 0)
- continue;
+ /*
+ * cbr->cbr_ds may be invalidated due to eviction,
+ * requiring the use of dsl_dataset_try_add_ref().
+ * See comment block in dsl_prop_notify_all_cb()
+ * for details.
+ */
+ if (!dsl_dataset_try_add_ref(dp, cbr->cbr_ds, FTAG))
+ continue;
- /*
- * If the property is set on this ds, then it is not
- * inherited here; don't call the callback.
- */
- if (propobj && 0 == zap_contains(mos, propobj, propname))
- continue;
+ propobj = dsl_dataset_phys(cbr->cbr_ds)->ds_props_obj;
- cbr->cbr_func(cbr->cbr_arg, value);
+ /*
+ * If the property is not set on this ds, then it is
+ * inherited here; call the callback.
+ */
+ if (propobj == 0 ||
+ zap_contains(mos, propobj, propname) != 0)
+ cbr->cbr_func(cbr->cbr_arg, value);
+
+ dsl_dataset_rele(cbr->cbr_ds, FTAG);
+ }
}
mutex_exit(&dd->dd_lock);
za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
for (zap_cursor_init(&zc, mos,
- dd->dd_phys->dd_child_dir_zapobj);
+ dsl_dir_phys(dd)->dd_child_dir_zapobj);
zap_cursor_retrieve(&zc, za) == 0;
zap_cursor_advance(&zc)) {
dsl_prop_changed_notify(dp, za->za_first_integer,
@@ -542,19 +624,19 @@
int err;
uint64_t version = spa_version(ds->ds_dir->dd_pool->dp_spa);
- isint = (dodefault(propname, 8, 1, &intval) == 0);
+ isint = (dodefault(zfs_name_to_prop(propname), 8, 1, &intval) == 0);
- if (ds->ds_phys != NULL && dsl_dataset_is_snapshot(ds)) {
+ if (ds->ds_is_snapshot) {
ASSERT(version >= SPA_VERSION_SNAP_PROPS);
- if (ds->ds_phys->ds_props_obj == 0) {
+ if (dsl_dataset_phys(ds)->ds_props_obj == 0) {
dmu_buf_will_dirty(ds->ds_dbuf, tx);
- ds->ds_phys->ds_props_obj =
+ dsl_dataset_phys(ds)->ds_props_obj =
zap_create(mos,
DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
}
- zapobj = ds->ds_phys->ds_props_obj;
+ zapobj = dsl_dataset_phys(ds)->ds_props_obj;
} else {
- zapobj = ds->ds_dir->dd_phys->dd_props_zapobj;
+ zapobj = dsl_dir_phys(ds->ds_dir)->dd_props_zapobj;
}
if (version < SPA_VERSION_RECVD_PROPS) {
@@ -641,7 +723,7 @@
if (isint) {
VERIFY0(dsl_prop_get_int_ds(ds, propname, &intval));
- if (ds->ds_phys != NULL && dsl_dataset_is_snapshot(ds)) {
+ if (ds->ds_is_snapshot) {
dsl_prop_cb_record_t *cbr;
/*
* It's a snapshot; nothing can inherit this
@@ -649,10 +731,10 @@
* ds here.
*/
mutex_enter(&ds->ds_dir->dd_lock);
- for (cbr = list_head(&ds->ds_dir->dd_prop_cbs); cbr;
- cbr = list_next(&ds->ds_dir->dd_prop_cbs, cbr)) {
- if (cbr->cbr_ds == ds &&
- strcmp(cbr->cbr_propname, propname) == 0)
+ for (cbr = list_head(&ds->ds_prop_cbs); cbr;
+ cbr = list_next(&ds->ds_prop_cbs, cbr)) {
+ if (strcmp(cbr->cbr_pr->pr_propname,
+ propname) == 0)
cbr->cbr_func(cbr->cbr_arg, intval);
}
mutex_exit(&ds->ds_dir->dd_lock);
@@ -759,7 +841,7 @@
}
}
- if (dsl_dataset_is_snapshot(ds) && version < SPA_VERSION_SNAP_PROPS) {
+ if (ds->ds_is_snapshot && version < SPA_VERSION_SNAP_PROPS) {
dsl_dataset_rele(ds, FTAG);
return (SET_ERROR(ENOTSUP));
}
@@ -835,7 +917,7 @@
nblks = 2 * fnvlist_num_pairs(props);
return (dsl_sync_task(dsname, dsl_props_set_check, dsl_props_set_sync,
- &dpsa, nblks));
+ &dpsa, nblks, ZFS_SPACE_CHECK_RESERVED));
}
typedef enum dsl_prop_getflags {
@@ -978,20 +1060,20 @@
dsl_pool_t *dp = dd->dd_pool;
objset_t *mos = dp->dp_meta_objset;
int err = 0;
- char setpoint[MAXNAMELEN];
+ char setpoint[ZFS_MAX_DATASET_NAME_LEN];
VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
- if (dsl_dataset_is_snapshot(ds))
+ if (ds->ds_is_snapshot)
flags |= DSL_PROP_GET_SNAPSHOT;
ASSERT(dsl_pool_config_held(dp));
- if (ds->ds_phys->ds_props_obj != 0) {
+ if (dsl_dataset_phys(ds)->ds_props_obj != 0) {
ASSERT(flags & DSL_PROP_GET_SNAPSHOT);
dsl_dataset_name(ds, setpoint);
- err = dsl_prop_get_all_impl(mos, ds->ds_phys->ds_props_obj,
- setpoint, flags, *nvp);
+ err = dsl_prop_get_all_impl(mos,
+ dsl_dataset_phys(ds)->ds_props_obj, setpoint, flags, *nvp);
if (err)
goto out;
}
@@ -1004,8 +1086,8 @@
flags |= DSL_PROP_GET_INHERITING;
}
dsl_dir_name(dd, setpoint);
- err = dsl_prop_get_all_impl(mos, dd->dd_phys->dd_props_zapobj,
- setpoint, flags, *nvp);
+ err = dsl_prop_get_all_impl(mos,
+ dsl_dir_phys(dd)->dd_props_zapobj, setpoint, flags, *nvp);
if (err)
break;
}
@@ -1100,7 +1182,7 @@
VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, value) == 0);
/* Indicate the default source if we can. */
- if (dodefault(propname, 8, 1, &default_value) == 0 &&
+ if (dodefault(prop, 8, 1, &default_value) == 0 &&
value == default_value) {
VERIFY(nvlist_add_string(propval, ZPROP_SOURCE, "") == 0);
}
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,7 +21,8 @@
*/
/*
* Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright 2016 Gary Mills
*/
#include <sys/dsl_scan.h>
@@ -51,13 +52,13 @@
#include <sys/zfs_vfsops.h>
#endif
-typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *);
+typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *,
+ const zbookmark_phys_t *);
-static scan_cb_t dsl_scan_defrag_cb;
static scan_cb_t dsl_scan_scrub_cb;
-static scan_cb_t dsl_scan_remove_cb;
static void dsl_scan_cancel_sync(void *, dmu_tx_t *);
-static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *tx);
+static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *);
+static boolean_t dsl_scan_restarting(dsl_scan_t *, dmu_tx_t *);
unsigned int zfs_top_maxinflight = 32; /* maximum I/Os per top-level */
unsigned int zfs_resilver_delay = 2; /* number of ticks to delay resilver */
@@ -69,7 +70,7 @@
unsigned int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver
per txg */
boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
-boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable srub prefetching */
+boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */
SYSCTL_DECL(_vfs_zfs);
TUNABLE_INT("vfs.zfs.top_maxinflight", &zfs_top_maxinflight);
@@ -101,7 +102,12 @@
&zfs_no_scrub_prefetch, 0, "Disable scrub prefetching");
enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
+/* max number of blocks to free in a single TXG */
+uint64_t zfs_free_max_blocks = UINT64_MAX;
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, free_max_blocks, CTLFLAG_RWTUN,
+ &zfs_free_max_blocks, 0, "Maximum number of blocks to free in one TXG");
+
#define DSL_SCAN_IS_SCRUB_RESILVER(scn) \
((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \
(scn)->scn_phys.scn_func == POOL_SCAN_RESILVER)
@@ -108,6 +114,14 @@
extern int zfs_txg_timeout;
+/*
+ * Enable/disable the processing of the free_bpobj object.
+ */
+boolean_t zfs_free_bpobj_enabled = B_TRUE;
+
+SYSCTL_INT(_vfs_zfs, OID_AUTO, free_bpobj_enabled, CTLFLAG_RWTUN,
+ &zfs_free_bpobj_enabled, 0, "Enable free_bpobj processing");
+
/* the order has to match pool_scan_type */
static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = {
NULL,
@@ -133,7 +147,7 @@
*/
ASSERT(!scn->scn_async_destroying);
scn->scn_async_destroying = spa_feature_is_active(dp->dp_spa,
- &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY]);
+ SPA_FEATURE_ASYNC_DESTROY);
err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
"scrub_func", sizeof (uint64_t), 1, &f);
@@ -225,6 +239,7 @@
scn->scn_phys.scn_errors = 0;
scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc;
scn->scn_restart_txg = 0;
+ scn->scn_done_txg = 0;
spa_scan_stat_init(spa);
if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
@@ -316,8 +331,15 @@
else
scn->scn_phys.scn_state = DSS_CANCELED;
- spa_history_log_internal(spa, "scan done", tx,
- "complete=%u", complete);
+ if (dsl_scan_restarting(scn, tx))
+ spa_history_log_internal(spa, "scan aborted, restarting", tx,
+ "errors=%llu", spa_get_errlog_size(spa));
+ else if (!complete)
+ spa_history_log_internal(spa, "scan cancelled", tx,
+ "errors=%llu", spa_get_errlog_size(spa));
+ else
+ spa_history_log_internal(spa, "scan done", tx,
+ "errors=%llu", spa_get_errlog_size(spa));
if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
mutex_enter(&spa->spa_scrub_lock);
@@ -377,16 +399,15 @@
dsl_scan_cancel(dsl_pool_t *dp)
{
return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scan_cancel_check,
- dsl_scan_cancel_sync, NULL, 3));
+ dsl_scan_cancel_sync, NULL, 3, ZFS_SPACE_CHECK_RESERVED));
}
-static void dsl_scan_visitbp(blkptr_t *bp,
- const zbookmark_t *zb, dnode_phys_t *dnp, arc_buf_t *pbuf,
- dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype,
- dmu_tx_t *tx);
+static void dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb,
+ dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn,
+ dmu_objset_type_t ostype, dmu_tx_t *tx);
static void dsl_scan_visitdnode(dsl_scan_t *, dsl_dataset_t *ds,
dmu_objset_type_t ostype,
- dnode_phys_t *dnp, arc_buf_t *buf, uint64_t object, dmu_tx_t *tx);
+ dnode_phys_t *dnp, uint64_t object, dmu_tx_t *tx);
void
dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp)
@@ -406,8 +427,8 @@
dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
{
uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg;
- if (dsl_dataset_is_snapshot(ds))
- return (MIN(smt, ds->ds_phys->ds_creation_txg));
+ if (ds->ds_is_snapshot)
+ return (MIN(smt, dsl_dataset_phys(ds)->ds_creation_txg));
return (smt);
}
@@ -420,12 +441,11 @@
&scn->scn_phys, tx));
}
+extern int zfs_vdev_async_write_active_min_dirty_percent;
+
static boolean_t
-dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_t *zb)
+dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_phys_t *zb)
{
- uint64_t elapsed_nanosecs;
- unsigned int mintime;
-
/* we never skip user/group accounting objects */
if (zb && (int64_t)zb->zb_object < 0)
return (B_FALSE);
@@ -440,12 +460,28 @@
if (zb && zb->zb_level != 0)
return (B_FALSE);
- mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
+ /*
+ * We pause if:
+ * - we have scanned for the maximum time: an entire txg
+ * timeout (default 5 sec)
+ * or
+ * - we have scanned for at least the minimum time (default 1 sec
+ * for scrub, 3 sec for resilver), and either we have sufficient
+ * dirty data that we are starting to write more quickly
+ * (default 30%), or someone is explicitly waiting for this txg
+ * to complete.
+ * or
+ * - the spa is shutting down because this pool is being exported
+ * or the machine is rebooting.
+ */
+ int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
zfs_resilver_min_time_ms : zfs_scan_min_time_ms;
- elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
- if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
- (elapsed_nanosecs / MICROSEC > mintime &&
- txg_sync_waiting(scn->scn_dp)) ||
+ uint64_t elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
+ int dirty_pct = scn->scn_dp->dp_dirty_total * 100 / zfs_dirty_data_max;
+ if (elapsed_nanosecs / NANOSEC >= zfs_txg_timeout ||
+ (NSEC2MSEC(elapsed_nanosecs) > mintime &&
+ (txg_sync_waiting(scn->scn_dp) ||
+ dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent)) ||
spa_shutting_down(scn->scn_dp->dp_spa)) {
if (zb) {
dprintf("pausing at bookmark %llx/%llx/%llx/%llx\n",
@@ -479,9 +515,9 @@
dsl_pool_t *dp = zsa->zsa_dp;
dsl_scan_t *scn = dp->dp_scan;
zil_header_t *zh = zsa->zsa_zh;
- zbookmark_t zb;
+ zbookmark_phys_t zb;
- if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
+ if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
return (0);
/*
@@ -511,9 +547,10 @@
zil_header_t *zh = zsa->zsa_zh;
lr_write_t *lr = (lr_write_t *)lrc;
blkptr_t *bp = &lr->lr_blkptr;
- zbookmark_t zb;
+ zbookmark_phys_t zb;
- if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
+ if (BP_IS_HOLE(bp) ||
+ bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
return (0);
/*
@@ -560,8 +597,8 @@
dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp,
uint64_t objset, uint64_t object, uint64_t blkid)
{
- zbookmark_t czb;
- uint32_t flags = ARC_NOWAIT | ARC_PREFETCH;
+ zbookmark_phys_t czb;
+ arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
if (zfs_no_scrub_prefetch)
return;
@@ -579,7 +616,7 @@
static boolean_t
dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp,
- const zbookmark_t *zb)
+ const zbookmark_phys_t *zb)
{
/*
* We never skip over user/group accounting objects (obj<0)
@@ -590,7 +627,8 @@
* If we already visited this bp & everything below (in
* a prior txg sync), don't bother doing it again.
*/
- if (zbookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark))
+ if (zbookmark_subtree_completed(dnp, zb,
+ &scn->scn_phys.scn_bookmark))
return (B_TRUE);
/*
@@ -619,7 +657,7 @@
static int
dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
dnode_phys_t *dnp, const blkptr_t *bp,
- const zbookmark_t *zb, dmu_tx_t *tx, arc_buf_t **bufp)
+ const zbookmark_phys_t *zb, dmu_tx_t *tx)
{
dsl_pool_t *dp = scn->scn_dp;
int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD;
@@ -626,68 +664,64 @@
int err;
if (BP_GET_LEVEL(bp) > 0) {
- uint32_t flags = ARC_WAIT;
+ arc_flags_t flags = ARC_FLAG_WAIT;
int i;
blkptr_t *cbp;
int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
+ arc_buf_t *buf;
- err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, bufp,
+ err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
if (err) {
scn->scn_phys.scn_errors++;
return (err);
}
- for (i = 0, cbp = (*bufp)->b_data; i < epb; i++, cbp++) {
- dsl_scan_prefetch(scn, *bufp, cbp, zb->zb_objset,
+ for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
+ dsl_scan_prefetch(scn, buf, cbp, zb->zb_objset,
zb->zb_object, zb->zb_blkid * epb + i);
}
- for (i = 0, cbp = (*bufp)->b_data; i < epb; i++, cbp++) {
- zbookmark_t czb;
+ for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
+ zbookmark_phys_t czb;
SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
zb->zb_level - 1,
zb->zb_blkid * epb + i);
dsl_scan_visitbp(cbp, &czb, dnp,
- *bufp, ds, scn, ostype, tx);
+ ds, scn, ostype, tx);
}
- } else if (BP_GET_TYPE(bp) == DMU_OT_USERGROUP_USED) {
- uint32_t flags = ARC_WAIT;
-
- err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, bufp,
- ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
- if (err) {
- scn->scn_phys.scn_errors++;
- return (err);
- }
+ arc_buf_destroy(buf, &buf);
} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
- uint32_t flags = ARC_WAIT;
+ arc_flags_t flags = ARC_FLAG_WAIT;
dnode_phys_t *cdnp;
int i, j;
int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
+ arc_buf_t *buf;
- err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, bufp,
+ err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
if (err) {
scn->scn_phys.scn_errors++;
return (err);
}
- for (i = 0, cdnp = (*bufp)->b_data; i < epb; i++, cdnp++) {
+ for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) {
for (j = 0; j < cdnp->dn_nblkptr; j++) {
blkptr_t *cbp = &cdnp->dn_blkptr[j];
- dsl_scan_prefetch(scn, *bufp, cbp,
+ dsl_scan_prefetch(scn, buf, cbp,
zb->zb_objset, zb->zb_blkid * epb + i, j);
}
}
- for (i = 0, cdnp = (*bufp)->b_data; i < epb; i++, cdnp++) {
+ for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) {
dsl_scan_visitdnode(scn, ds, ostype,
- cdnp, *bufp, zb->zb_blkid * epb + i, tx);
+ cdnp, zb->zb_blkid * epb + i, tx);
}
+ arc_buf_destroy(buf, &buf);
} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
- uint32_t flags = ARC_WAIT;
+ arc_flags_t flags = ARC_FLAG_WAIT;
objset_phys_t *osp;
+ arc_buf_t *buf;
- err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, bufp,
+ err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf,
ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb);
if (err) {
scn->scn_phys.scn_errors++;
@@ -694,12 +728,12 @@
return (err);
}
- osp = (*bufp)->b_data;
+ osp = buf->b_data;
dsl_scan_visitdnode(scn, ds, osp->os_type,
- &osp->os_meta_dnode, *bufp, DMU_META_DNODE_OBJECT, tx);
+ &osp->os_meta_dnode, DMU_META_DNODE_OBJECT, tx);
- if (OBJSET_BUF_HAS_USERUSED(*bufp)) {
+ if (OBJSET_BUF_HAS_USERUSED(buf)) {
/*
* We also always visit user/group accounting
* objects, and never skip them, even if we are
@@ -707,12 +741,13 @@
* deltas from this txg get integrated.
*/
dsl_scan_visitdnode(scn, ds, osp->os_type,
- &osp->os_groupused_dnode, *bufp,
+ &osp->os_groupused_dnode,
DMU_GROUPUSED_OBJECT, tx);
dsl_scan_visitdnode(scn, ds, osp->os_type,
- &osp->os_userused_dnode, *bufp,
+ &osp->os_userused_dnode,
DMU_USERUSED_OBJECT, tx);
}
+ arc_buf_destroy(buf, &buf);
}
return (0);
@@ -720,26 +755,26 @@
static void
dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds,
- dmu_objset_type_t ostype, dnode_phys_t *dnp, arc_buf_t *buf,
+ dmu_objset_type_t ostype, dnode_phys_t *dnp,
uint64_t object, dmu_tx_t *tx)
{
int j;
for (j = 0; j < dnp->dn_nblkptr; j++) {
- zbookmark_t czb;
+ zbookmark_phys_t czb;
SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
dnp->dn_nlevels - 1, j);
dsl_scan_visitbp(&dnp->dn_blkptr[j],
- &czb, dnp, buf, ds, scn, ostype, tx);
+ &czb, dnp, ds, scn, ostype, tx);
}
if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
- zbookmark_t czb;
+ zbookmark_phys_t czb;
SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object,
0, DMU_SPILL_BLKID);
dsl_scan_visitbp(&dnp->dn_spill,
- &czb, dnp, buf, ds, scn, ostype, tx);
+ &czb, dnp, ds, scn, ostype, tx);
}
}
@@ -748,10 +783,9 @@
* first 5; we want them to be useful.
*/
static void
-dsl_scan_visitbp(blkptr_t *bp, const zbookmark_t *zb,
- dnode_phys_t *dnp, arc_buf_t *pbuf,
- dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype,
- dmu_tx_t *tx)
+dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb,
+ dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn,
+ dmu_objset_type_t ostype, dmu_tx_t *tx)
{
dsl_pool_t *dp = scn->scn_dp;
arc_buf_t *buf = NULL;
@@ -765,22 +799,21 @@
if (dsl_scan_check_resume(scn, dnp, zb))
return;
- if (bp->blk_birth == 0)
+ if (BP_IS_HOLE(bp))
return;
scn->scn_visited_this_txg++;
dprintf_bp(bp,
- "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx buf=%p bp=%p",
+ "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx bp=%p",
ds, ds ? ds->ds_object : 0,
zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
- pbuf, bp);
+ bp);
if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
return;
- if (dsl_scan_recurse(scn, ds, ostype, dnp, &bp_toread, zb, tx,
- &buf) != 0)
+ if (dsl_scan_recurse(scn, ds, ostype, dnp, &bp_toread, zb, tx) != 0)
return;
/*
@@ -801,11 +834,9 @@
* Don't scan it now unless we need to because something
* under it was modified.
*/
- if (bp->blk_birth <= scn->scn_phys.scn_cur_max_txg) {
+ if (BP_PHYSICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_max_txg) {
scan_funcs[scn->scn_phys.scn_func](dp, bp, zb);
}
- if (buf)
- (void) arc_buf_remove_ref(buf, &buf);
}
static void
@@ -812,11 +843,11 @@
dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp,
dmu_tx_t *tx)
{
- zbookmark_t zb;
+ zbookmark_phys_t zb;
SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
- dsl_scan_visitbp(bp, &zb, NULL, NULL,
+ dsl_scan_visitbp(bp, &zb, NULL,
ds, scn, DMU_OST_NONE, tx);
dprintf_ds(ds, "finished scan%s", "");
@@ -833,14 +864,24 @@
return;
if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
- if (dsl_dataset_is_snapshot(ds)) {
- /* Note, scn_cur_{min,max}_txg stays the same. */
+ if (ds->ds_is_snapshot) {
+ /*
+ * Note:
+ * - scn_cur_{min,max}_txg stays the same.
+ * - Setting the flag is not really necessary if
+ * scn_cur_max_txg == scn_max_txg, because there
+ * is nothing after this snapshot that we care
+ * about. However, we set it anyway and then
+ * ignore it when we retraverse it in
+ * dsl_scan_visitds().
+ */
scn->scn_phys.scn_bookmark.zb_objset =
- ds->ds_phys->ds_next_snap_obj;
+ dsl_dataset_phys(ds)->ds_next_snap_obj;
zfs_dbgmsg("destroying ds %llu; currently traversing; "
"reset zb_objset to %llu",
(u_longlong_t)ds->ds_object,
- (u_longlong_t)ds->ds_phys->ds_next_snap_obj);
+ (u_longlong_t)dsl_dataset_phys(ds)->
+ ds_next_snap_obj);
scn->scn_phys.scn_flags |= DSF_VISIT_DS_AGAIN;
} else {
SET_BOOKMARK(&scn->scn_phys.scn_bookmark,
@@ -851,10 +892,10 @@
}
} else if (zap_lookup_int_key(dp->dp_meta_objset,
scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) {
- ASSERT3U(ds->ds_phys->ds_num_children, <=, 1);
+ ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);
VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
- if (dsl_dataset_is_snapshot(ds)) {
+ if (ds->ds_is_snapshot) {
/*
* We keep the same mintxg; it could be >
* ds_creation_txg if the previous snapshot was
@@ -862,18 +903,17 @@
*/
VERIFY(zap_add_int_key(dp->dp_meta_objset,
scn->scn_phys.scn_queue_obj,
- ds->ds_phys->ds_next_snap_obj, mintxg, tx) == 0);
+ dsl_dataset_phys(ds)->ds_next_snap_obj,
+ mintxg, tx) == 0);
zfs_dbgmsg("destroying ds %llu; in queue; "
"replacing with %llu",
(u_longlong_t)ds->ds_object,
- (u_longlong_t)ds->ds_phys->ds_next_snap_obj);
+ (u_longlong_t)dsl_dataset_phys(ds)->
+ ds_next_snap_obj);
} else {
zfs_dbgmsg("destroying ds %llu; in queue; removing",
(u_longlong_t)ds->ds_object);
}
- } else {
- zfs_dbgmsg("destroying ds %llu; ignoring",
- (u_longlong_t)ds->ds_object);
}
/*
@@ -893,15 +933,15 @@
if (scn->scn_phys.scn_state != DSS_SCANNING)
return;
- ASSERT(ds->ds_phys->ds_prev_snap_obj != 0);
+ ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0);
if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
scn->scn_phys.scn_bookmark.zb_objset =
- ds->ds_phys->ds_prev_snap_obj;
+ dsl_dataset_phys(ds)->ds_prev_snap_obj;
zfs_dbgmsg("snapshotting ds %llu; currently traversing; "
"reset zb_objset to %llu",
(u_longlong_t)ds->ds_object,
- (u_longlong_t)ds->ds_phys->ds_prev_snap_obj);
+ (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj);
} else if (zap_lookup_int_key(dp->dp_meta_objset,
scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) {
VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
@@ -908,11 +948,11 @@
scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
VERIFY(zap_add_int_key(dp->dp_meta_objset,
scn->scn_phys.scn_queue_obj,
- ds->ds_phys->ds_prev_snap_obj, mintxg, tx) == 0);
+ dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg, tx) == 0);
zfs_dbgmsg("snapshotting ds %llu; in queue; "
"replacing with %llu",
(u_longlong_t)ds->ds_object,
- (u_longlong_t)ds->ds_phys->ds_prev_snap_obj);
+ (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj);
}
dsl_scan_sync_state(scn, tx);
}
@@ -945,8 +985,8 @@
ds1->ds_object, &mintxg) == 0) {
int err;
- ASSERT3U(mintxg, ==, ds1->ds_phys->ds_prev_snap_txg);
- ASSERT3U(mintxg, ==, ds2->ds_phys->ds_prev_snap_txg);
+ ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
+ ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
scn->scn_phys.scn_queue_obj, ds1->ds_object, tx));
err = zap_add_int_key(dp->dp_meta_objset,
@@ -964,8 +1004,8 @@
(u_longlong_t)ds2->ds_object);
} else if (zap_lookup_int_key(dp->dp_meta_objset,
scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg) == 0) {
- ASSERT3U(mintxg, ==, ds1->ds_phys->ds_prev_snap_txg);
- ASSERT3U(mintxg, ==, ds2->ds_phys->ds_prev_snap_txg);
+ ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
+ ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
scn->scn_phys.scn_queue_obj, ds2->ds_object, tx));
VERIFY(0 == zap_add_int_key(dp->dp_meta_objset,
@@ -993,7 +1033,7 @@
int err;
dsl_scan_t *scn = dp->dp_scan;
- if (hds->ds_dir->dd_phys->dd_origin_obj != eca->originobj)
+ if (dsl_dir_phys(hds->ds_dir)->dd_origin_obj != eca->originobj)
return (0);
err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
@@ -1000,10 +1040,10 @@
if (err)
return (err);
- while (ds->ds_phys->ds_prev_snap_obj != eca->originobj) {
+ while (dsl_dataset_phys(ds)->ds_prev_snap_obj != eca->originobj) {
dsl_dataset_t *prev;
err = dsl_dataset_hold_obj(dp,
- ds->ds_phys->ds_prev_snap_obj, FTAG, &prev);
+ dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
dsl_dataset_rele(ds, FTAG);
if (err)
@@ -1012,7 +1052,7 @@
}
VERIFY(zap_add_int_key(dp->dp_meta_objset,
scn->scn_phys.scn_queue_obj, ds->ds_object,
- ds->ds_phys->ds_prev_snap_txg, eca->tx) == 0);
+ dsl_dataset_phys(ds)->ds_prev_snap_txg, eca->tx) == 0);
dsl_dataset_rele(ds, FTAG);
return (0);
}
@@ -1026,6 +1066,46 @@
VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
+ if (scn->scn_phys.scn_cur_min_txg >=
+ scn->scn_phys.scn_max_txg) {
+ /*
+ * This can happen if this snapshot was created after the
+ * scan started, and we already completed a previous snapshot
+ * that was created after the scan started. This snapshot
+ * only references blocks with:
+ *
+ * birth < our ds_creation_txg
+ * cur_min_txg is no less than ds_creation_txg.
+ * We have already visited these blocks.
+ * or
+ * birth > scn_max_txg
+ * The scan requested not to visit these blocks.
+ *
+ * Subsequent snapshots (and clones) can reference our
+ * blocks, or blocks with even higher birth times.
+ * Therefore we do not need to visit them either,
+ * so we do not add them to the work queue.
+ *
+ * Note that checking for cur_min_txg >= cur_max_txg
+ * is not sufficient, because in that case we may need to
+ * visit subsequent snapshots. This happens when min_txg > 0,
+ * which raises cur_min_txg. In this case we will visit
+ * this dataset but skip all of its blocks, because the
+ * rootbp's birth time is < cur_min_txg. Then we will
+ * add the next snapshots/clones to the work queue.
+ */
+ char *dsname = kmem_alloc(MAXNAMELEN, KM_SLEEP);
+ dsl_dataset_name(ds, dsname);
+ zfs_dbgmsg("scanning dataset %llu (%s) is unnecessary because "
+ "cur_min_txg (%llu) >= max_txg (%llu)",
+ dsobj, dsname,
+ scn->scn_phys.scn_cur_min_txg,
+ scn->scn_phys.scn_max_txg);
+ kmem_free(dsname, MAXNAMELEN);
+
+ goto out;
+ }
+
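The skip added above relies on interval reasoning: with scn_cur_min_txg >= scn_max_txg the scan window is empty, so no block birth txg in this dataset can need a visit. A tiny illustration of that window test, with hypothetical txg numbers:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* A block needs visiting only if its birth txg falls inside the scan window. */
static bool
block_needs_visit(uint64_t birth_txg, uint64_t cur_min_txg, uint64_t max_txg)
{
	return (birth_txg > cur_min_txg && birth_txg <= max_txg);
}

int
main(void)
{
	uint64_t cur_min_txg = 120, max_txg = 100;	/* min >= max */

	/* With an empty window no birth txg qualifies, so skip the dataset. */
	for (uint64_t birth = 90; birth <= 130; birth += 10)
		printf("birth %3llu -> %s\n", (unsigned long long)birth,
		    block_needs_visit(birth, cur_min_txg, max_txg) ?
		    "visit" : "skip");
	return (0);
}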
if (dmu_objset_from_ds(ds, &os))
goto out;
@@ -1036,7 +1116,7 @@
* ZIL here, rather than in scan_recurse(), because the regular
* snapshot block-sharing rules don't apply to it.
*/
- if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !dsl_dataset_is_snapshot(ds))
+ if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !ds->ds_is_snapshot)
dsl_scan_zil(dp, &os->os_zil_header);
/*
@@ -1043,9 +1123,11 @@
* Iterate over the bps in this ds.
*/
dmu_buf_will_dirty(ds->ds_dbuf, tx);
- dsl_scan_visit_rootbp(scn, ds, &ds->ds_phys->ds_bp, tx);
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
+ dsl_scan_visit_rootbp(scn, ds, &dsl_dataset_phys(ds)->ds_bp, tx);
+ rrw_exit(&ds->ds_bp_rwlock, FTAG);
- char *dsname = kmem_alloc(ZFS_MAXNAMELEN, KM_SLEEP);
+ char *dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
dsl_dataset_name(ds, dsname);
zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; "
"pausing=%u",
@@ -1053,7 +1135,7 @@
(longlong_t)scn->scn_phys.scn_cur_min_txg,
(longlong_t)scn->scn_phys.scn_cur_max_txg,
(int)scn->scn_pausing);
- kmem_free(dsname, ZFS_MAXNAMELEN);
+ kmem_free(dsname, ZFS_MAX_DATASET_NAME_LEN);
if (scn->scn_pausing)
goto out;
@@ -1077,14 +1159,15 @@
/*
* Add descendent datasets to work queue.
*/
- if (ds->ds_phys->ds_next_snap_obj != 0) {
+ if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) {
VERIFY(zap_add_int_key(dp->dp_meta_objset,
- scn->scn_phys.scn_queue_obj, ds->ds_phys->ds_next_snap_obj,
- ds->ds_phys->ds_creation_txg, tx) == 0);
+ scn->scn_phys.scn_queue_obj,
+ dsl_dataset_phys(ds)->ds_next_snap_obj,
+ dsl_dataset_phys(ds)->ds_creation_txg, tx) == 0);
}
- if (ds->ds_phys->ds_num_children > 1) {
+ if (dsl_dataset_phys(ds)->ds_num_children > 1) {
boolean_t usenext = B_FALSE;
- if (ds->ds_phys->ds_next_clones_obj != 0) {
+ if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
uint64_t count;
/*
* A bug in a previous version of the code could
@@ -1094,17 +1177,17 @@
* next_clones_obj when its count is correct.
*/
int err = zap_count(dp->dp_meta_objset,
- ds->ds_phys->ds_next_clones_obj, &count);
+ dsl_dataset_phys(ds)->ds_next_clones_obj, &count);
if (err == 0 &&
- count == ds->ds_phys->ds_num_children - 1)
+ count == dsl_dataset_phys(ds)->ds_num_children - 1)
usenext = B_TRUE;
}
if (usenext) {
VERIFY0(zap_join_key(dp->dp_meta_objset,
- ds->ds_phys->ds_next_clones_obj,
+ dsl_dataset_phys(ds)->ds_next_clones_obj,
scn->scn_phys.scn_queue_obj,
- ds->ds_phys->ds_creation_txg, tx));
+ dsl_dataset_phys(ds)->ds_creation_txg, tx));
} else {
struct enqueue_clones_arg eca;
eca.tx = tx;
@@ -1132,10 +1215,10 @@
if (err)
return (err);
- while (ds->ds_phys->ds_prev_snap_obj != 0) {
+ while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
dsl_dataset_t *prev;
- err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
- FTAG, &prev);
+ err = dsl_dataset_hold_obj(dp,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
if (err) {
dsl_dataset_rele(ds, FTAG);
return (err);
@@ -1144,7 +1227,7 @@
/*
* If this is a clone, we don't need to worry about it for now.
*/
- if (prev->ds_phys->ds_next_snap_obj != ds->ds_object) {
+ if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object) {
dsl_dataset_rele(ds, FTAG);
dsl_dataset_rele(prev, FTAG);
return (0);
@@ -1154,7 +1237,7 @@
}
VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
- ds->ds_object, ds->ds_phys->ds_prev_snap_txg, tx) == 0);
+ ds->ds_object, dsl_dataset_phys(ds)->ds_prev_snap_txg, tx) == 0);
dsl_dataset_rele(ds, FTAG);
return (0);
}
@@ -1239,7 +1322,7 @@
const ddt_key_t *ddk = &dde->dde_key;
ddt_phys_t *ddp = dde->dde_phys;
blkptr_t bp;
- zbookmark_t zb = { 0 };
+ zbookmark_phys_t zb = { 0 };
if (scn->scn_phys.scn_state != DSS_SCANNING)
return;
@@ -1246,7 +1329,7 @@
for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
if (ddp->ddp_phys_birth == 0 ||
- ddp->ddp_phys_birth > scn->scn_phys.scn_cur_max_txg)
+ ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg)
continue;
ddt_bp_create(checksum, ddk, ddp, &bp);
@@ -1307,7 +1390,7 @@
* In case we were paused right at the end of the ds, zero the
* bookmark so we don't think that we're still trying to resume.
*/
- bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_t));
+ bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_phys_t));
/* keep pulling things out of the zap-object-as-queue */
while (zap_cursor_init(&zc, dp->dp_meta_objset,
@@ -1329,7 +1412,7 @@
} else {
scn->scn_phys.scn_cur_min_txg =
MAX(scn->scn_phys.scn_min_txg,
- ds->ds_phys->ds_prev_snap_txg);
+ dsl_dataset_phys(ds)->ds_prev_snap_txg);
}
scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds);
dsl_dataset_rele(ds, FTAG);
@@ -1347,9 +1430,15 @@
{
uint64_t elapsed_nanosecs;
+ if (zfs_recover)
+ return (B_FALSE);
+
+ if (scn->scn_visited_this_txg >= zfs_free_max_blocks)
+ return (B_TRUE);
+
elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
- (elapsed_nanosecs / MICROSEC > zfs_free_min_time_ms &&
+ (NSEC2MSEC(elapsed_nanosecs) > zfs_free_min_time_ms &&
txg_sync_waiting(scn->scn_dp)) ||
spa_shutting_down(scn->scn_dp->dp_spa));
}
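The reworked pause test combines a hard cap on blocks freed per txg with the existing time-based conditions, now expressed via NSEC2MSEC. A standalone sketch of the same shape, with hypothetical constants standing in for the zfs_txg_timeout, zfs_free_min_time_ms and zfs_free_max_blocks tunables:

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC  1000000000ULL
#define NSEC_PER_MSEC 1000000ULL

/* Hypothetical values; the real limits are loader/sysctl tunables. */
static uint64_t txg_timeout_sec  = 5;
static uint64_t free_min_time_ms = 1000;
static uint64_t free_max_blocks  = 100000;

static int
free_should_pause(uint64_t visited_blocks, uint64_t elapsed_ns,
    int sync_waiting, int shutting_down)
{
    if (visited_blocks >= free_max_blocks)
        return (1);
    return (elapsed_ns / NSEC_PER_SEC > txg_timeout_sec ||
        (elapsed_ns / NSEC_PER_MSEC > free_min_time_ms && sync_waiting) ||
        shutting_down);
}

int
main(void)
{
    /* 1.5 seconds elapsed and a sync is waiting: pause below the block cap. */
    printf("pause: %d\n",
        free_should_pause(1000, 1500 * NSEC_PER_MSEC, 1, 0));
    return (0);
}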
@@ -1384,9 +1473,8 @@
return (B_FALSE);
if (spa_shutting_down(spa))
return (B_FALSE);
-
if (scn->scn_phys.scn_state == DSS_SCANNING ||
- scn->scn_async_destroying)
+ (scn->scn_async_destroying && !scn->scn_async_stalled))
return (B_TRUE);
if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
@@ -1401,7 +1489,7 @@
{
dsl_scan_t *scn = dp->dp_scan;
spa_t *spa = dp->dp_spa;
- int err;
+ int err = 0;
/*
* Check for scn_restart_txg before checking spa_load_state, so
@@ -1408,8 +1496,7 @@
* that we can restart an old-style scan while the pool is being
* imported (see dsl_scan_init).
*/
- if (scn->scn_restart_txg != 0 &&
- scn->scn_restart_txg <= tx->tx_txg) {
+ if (dsl_scan_restarting(scn, tx)) {
pool_scan_func_t func = POOL_SCAN_SCRUB;
dsl_scan_done(scn, B_FALSE, tx);
if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
@@ -1419,10 +1506,26 @@
dsl_scan_setup_sync(&func, tx);
}
- if (!dsl_scan_active(scn) ||
- spa_sync_pass(dp->dp_spa) > 1)
+ /*
+ * Only process scans in sync pass 1.
+ */
+ if (spa_sync_pass(dp->dp_spa) > 1)
return;
+ /*
+ * If the spa is shutting down, then stop scanning. This will
+ * ensure that the scan does not dirty any new data during the
+ * shutdown phase.
+ */
+ if (spa_shutting_down(spa))
+ return;
+
+ /*
+ * If the scan is inactive due to a stalled async destroy, try again.
+ */
+ if (!scn->scn_async_stalled && !dsl_scan_active(scn))
+ return;
+
scn->scn_visited_this_txg = 0;
scn->scn_pausing = B_FALSE;
scn->scn_sync_start_time = gethrtime();
@@ -1429,12 +1532,14 @@
spa->spa_scrub_active = B_TRUE;
/*
- * First process the free list. If we pause the free, don't do
- * any scanning. This ensures that there is no free list when
- * we are scanning, so the scan code doesn't have to worry about
- * traversing it.
+ * First process the async destroys. If we pause, don't do
+ * any scrubbing or resilvering. This ensures that there are no
+ * async destroys while we are scanning, so the scan code doesn't
+ * have to worry about traversing it. It is also faster to free the
+ * blocks than to scrub them.
*/
- if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
+ if (zfs_free_bpobj_enabled &&
+ spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
scn->scn_is_bptree = B_FALSE;
scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
NULL, ZIO_FLAG_MUSTSUCCEED);
@@ -1442,53 +1547,117 @@
dsl_scan_free_block_cb, scn, tx);
VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
- if (err == 0 && spa_feature_is_active(spa,
- &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
- ASSERT(scn->scn_async_destroying);
- scn->scn_is_bptree = B_TRUE;
- scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
- NULL, ZIO_FLAG_MUSTSUCCEED);
- err = bptree_iterate(dp->dp_meta_objset,
- dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb,
- scn, tx);
- VERIFY0(zio_wait(scn->scn_zio_root));
+ if (err != 0 && err != ERESTART)
+ zfs_panic_recover("error %u from bpobj_iterate()", err);
+ }
- if (err == 0) {
- zfeature_info_t *feat = &spa_feature_table
- [SPA_FEATURE_ASYNC_DESTROY];
- /* finished; deactivate async destroy feature */
- spa_feature_decr(spa, feat, tx);
- ASSERT(!spa_feature_is_active(spa, feat));
- VERIFY0(zap_remove(dp->dp_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_BPTREE_OBJ, tx));
- VERIFY0(bptree_free(dp->dp_meta_objset,
- dp->dp_bptree_obj, tx));
- dp->dp_bptree_obj = 0;
- scn->scn_async_destroying = B_FALSE;
- }
+ if (err == 0 && spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) {
+ ASSERT(scn->scn_async_destroying);
+ scn->scn_is_bptree = B_TRUE;
+ scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
+ NULL, ZIO_FLAG_MUSTSUCCEED);
+ err = bptree_iterate(dp->dp_meta_objset,
+ dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, scn, tx);
+ VERIFY0(zio_wait(scn->scn_zio_root));
+
+ if (err == EIO || err == ECKSUM) {
+ err = 0;
+ } else if (err != 0 && err != ERESTART) {
+ zfs_panic_recover("error %u from "
+ "traverse_dataset_destroyed()", err);
}
- if (scn->scn_visited_this_txg) {
- zfs_dbgmsg("freed %llu blocks in %llums from "
- "free_bpobj/bptree txg %llu",
- (longlong_t)scn->scn_visited_this_txg,
- (longlong_t)
- (gethrtime() - scn->scn_sync_start_time) / MICROSEC,
- (longlong_t)tx->tx_txg);
- scn->scn_visited_this_txg = 0;
+
+ if (bptree_is_empty(dp->dp_meta_objset, dp->dp_bptree_obj)) {
+ /* finished; deactivate async destroy feature */
+ spa_feature_decr(spa, SPA_FEATURE_ASYNC_DESTROY, tx);
+ ASSERT(!spa_feature_is_active(spa,
+ SPA_FEATURE_ASYNC_DESTROY));
+ VERIFY0(zap_remove(dp->dp_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_BPTREE_OBJ, tx));
+ VERIFY0(bptree_free(dp->dp_meta_objset,
+ dp->dp_bptree_obj, tx));
+ dp->dp_bptree_obj = 0;
+ scn->scn_async_destroying = B_FALSE;
+ scn->scn_async_stalled = B_FALSE;
+ } else {
/*
- * Re-sync the ddt so that we can further modify
- * it when doing bprewrite.
+ * If we didn't make progress, mark the async
+ * destroy as stalled, so that we will not initiate
+ * a spa_sync() on its behalf. Note that we only
+ * check this if we are not finished, because if the
+ * bptree had no blocks for us to visit, we can
+ * finish without "making progress".
*/
- ddt_sync(spa, tx->tx_txg);
+ scn->scn_async_stalled =
+ (scn->scn_visited_this_txg == 0);
}
- if (err == ERESTART)
- return;
}
+ if (scn->scn_visited_this_txg) {
+ zfs_dbgmsg("freed %llu blocks in %llums from "
+ "free_bpobj/bptree txg %llu; err=%d",
+ (longlong_t)scn->scn_visited_this_txg,
+ (longlong_t)
+ NSEC2MSEC(gethrtime() - scn->scn_sync_start_time),
+ (longlong_t)tx->tx_txg, err);
+ scn->scn_visited_this_txg = 0;
+ /*
+ * Write out changes to the DDT that may be required as a
+ * result of the blocks freed. This ensures that the DDT
+ * is clean when a scrub/resilver runs.
+ */
+ ddt_sync(spa, tx->tx_txg);
+ }
+ if (err != 0)
+ return;
+ if (dp->dp_free_dir != NULL && !scn->scn_async_destroying &&
+ zfs_free_leak_on_eio &&
+ (dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes != 0 ||
+ dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes != 0 ||
+ dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes != 0)) {
+ /*
+ * We have finished background destroying, but there is still
+ * some space left in the dp_free_dir. Transfer this leaked
+ * space to the dp_leak_dir.
+ */
+ if (dp->dp_leak_dir == NULL) {
+ rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
+ (void) dsl_dir_create_sync(dp, dp->dp_root_dir,
+ LEAK_DIR_NAME, tx);
+ VERIFY0(dsl_pool_open_special_dir(dp,
+ LEAK_DIR_NAME, &dp->dp_leak_dir));
+ rrw_exit(&dp->dp_config_rwlock, FTAG);
+ }
+ dsl_dir_diduse_space(dp->dp_leak_dir, DD_USED_HEAD,
+ dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes,
+ dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes,
+ dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx);
+ dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD,
+ -dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes,
+ -dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes,
+ -dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx);
+ }
+ if (dp->dp_free_dir != NULL && !scn->scn_async_destroying) {
+ /* finished; verify that space accounting went to zero */
+ ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes);
+ ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes);
+ ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes);
+ }
+
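The leak-directory handling above credits dp_leak_dir and debits dp_free_dir by the same three deltas, which is what lets the later ASSERT0 checks hold. A small sketch of that paired accounting with hypothetical structures:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for the three per-dir space counters. */
struct dir_space {
    int64_t used;
    int64_t compressed;
    int64_t uncompressed;
};

static void
diduse_space(struct dir_space *d, int64_t used, int64_t comp, int64_t uncomp)
{
    d->used += used;
    d->compressed += comp;
    d->uncompressed += uncomp;
}

int
main(void)
{
    struct dir_space free_dir = { 300, 100, 400 };
    struct dir_space leak_dir = { 0, 0, 0 };

    /* Credit the leak dir, then debit the free dir by the same amounts. */
    diduse_space(&leak_dir, free_dir.used, free_dir.compressed,
        free_dir.uncompressed);
    diduse_space(&free_dir, -free_dir.used, -free_dir.compressed,
        -free_dir.uncompressed);

    assert(free_dir.used == 0 && free_dir.compressed == 0 &&
        free_dir.uncompressed == 0);
    printf("leaked: used=%lld compressed=%lld uncompressed=%lld\n",
        (long long)leak_dir.used, (long long)leak_dir.compressed,
        (long long)leak_dir.uncompressed);
    return (0);
}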
if (scn->scn_phys.scn_state != DSS_SCANNING)
return;
+ if (scn->scn_done_txg == tx->tx_txg) {
+ ASSERT(!scn->scn_pausing);
+ /* finished with scan. */
+ zfs_dbgmsg("txg %llu scan complete", tx->tx_txg);
+ dsl_scan_done(scn, B_TRUE, tx);
+ ASSERT3U(spa->spa_scrub_inflight, ==, 0);
+ dsl_scan_sync_state(scn, tx);
+ return;
+ }
+
if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
scn->scn_phys.scn_ddt_class_max) {
zfs_dbgmsg("doing scan sync txg %llu; "
@@ -1521,12 +1690,12 @@
zfs_dbgmsg("visited %llu blocks in %llums",
(longlong_t)scn->scn_visited_this_txg,
- (longlong_t)(gethrtime() - scn->scn_sync_start_time) / MICROSEC);
+ (longlong_t)NSEC2MSEC(gethrtime() - scn->scn_sync_start_time));
if (!scn->scn_pausing) {
- /* finished with scan. */
- zfs_dbgmsg("finished scan txg %llu", (longlong_t)tx->tx_txg);
- dsl_scan_done(scn, B_TRUE, tx);
+ scn->scn_done_txg = tx->tx_txg + 1;
+ zfs_dbgmsg("txg %llu traversal complete, waiting till txg %llu",
+ tx->tx_txg, scn->scn_done_txg);
}
if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
@@ -1640,7 +1809,7 @@
static int
dsl_scan_scrub_cb(dsl_pool_t *dp,
- const blkptr_t *bp, const zbookmark_t *zb)
+ const blkptr_t *bp, const zbookmark_phys_t *zb)
{
dsl_scan_t *scn = dp->dp_scan;
size_t size = BP_GET_PSIZE(bp);
@@ -1648,7 +1817,6 @@
uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp);
boolean_t needs_io;
int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
- int zio_priority;
unsigned int scan_delay = 0;
if (phys_birth <= scn->scn_phys.scn_min_txg ||
@@ -1657,16 +1825,17 @@
count_block(dp->dp_blkstats, bp);
+ if (BP_IS_EMBEDDED(bp))
+ return (0);
+
ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn));
if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) {
zio_flags |= ZIO_FLAG_SCRUB;
- zio_priority = ZIO_PRIORITY_SCRUB;
needs_io = B_TRUE;
scan_delay = zfs_scrub_delay;
} else {
ASSERT3U(scn->scn_phys.scn_func, ==, POOL_SCAN_RESILVER);
zio_flags |= ZIO_FLAG_RESILVER;
- zio_priority = ZIO_PRIORITY_RESILVER;
needs_io = B_FALSE;
scan_delay = zfs_resilver_delay;
}
@@ -1725,7 +1894,7 @@
delay(MAX((int)scan_delay, 0));
zio_nowait(zio_read(NULL, spa, bp, data, size,
- dsl_scan_scrub_done, NULL, zio_priority,
+ dsl_scan_scrub_done, NULL, ZIO_PRIORITY_SCRUB,
zio_flags, zb));
}
@@ -1752,5 +1921,12 @@
(void) spa_vdev_state_exit(spa, NULL, 0);
return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check,
- dsl_scan_setup_sync, &func, 0));
+ dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_NONE));
}
+
+static boolean_t
+dsl_scan_restarting(dsl_scan_t *scn, dmu_tx_t *tx)
+{
+ return (scn->scn_restart_txg != 0 &&
+ scn->scn_restart_txg <= tx->tx_txg);
+}
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_synctask.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,7 +21,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
*/
#include <sys/dmu.h>
@@ -65,7 +65,8 @@
*/
int
dsl_sync_task(const char *pool, dsl_checkfunc_t *checkfunc,
- dsl_syncfunc_t *syncfunc, void *arg, int blocks_modified)
+ dsl_syncfunc_t *syncfunc, void *arg,
+ int blocks_modified, zfs_space_check_t space_check)
{
spa_t *spa;
dmu_tx_t *tx;
@@ -85,6 +86,7 @@
dst.dst_pool = dp;
dst.dst_txg = dmu_tx_get_txg(tx);
dst.dst_space = blocks_modified << DST_AVG_BLKSHIFT;
+ dst.dst_space_check = space_check;
dst.dst_checkfunc = checkfunc != NULL ? checkfunc : dsl_null_checkfunc;
dst.dst_syncfunc = syncfunc;
dst.dst_arg = arg;
@@ -118,7 +120,7 @@
void
dsl_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg,
- int blocks_modified, dmu_tx_t *tx)
+ int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx)
{
dsl_sync_task_t *dst = kmem_zalloc(sizeof (*dst), KM_SLEEP);
@@ -125,6 +127,7 @@
dst->dst_pool = dp;
dst->dst_txg = dmu_tx_get_txg(tx);
dst->dst_space = blocks_modified << DST_AVG_BLKSHIFT;
+ dst->dst_space_check = space_check;
dst->dst_checkfunc = dsl_null_checkfunc;
dst->dst_syncfunc = syncfunc;
dst->dst_arg = arg;
@@ -141,25 +144,34 @@
dsl_sync_task_sync(dsl_sync_task_t *dst, dmu_tx_t *tx)
{
dsl_pool_t *dp = dst->dst_pool;
- uint64_t quota, used;
ASSERT0(dst->dst_error);
/*
- * Check for sufficient space. We just check against what's
- * on-disk; we don't want any in-flight accounting to get in our
- * way, because open context may have already used up various
- * in-core limits (arc_tempreserve, dsl_pool_tempreserve).
+ * Check for sufficient space.
+ *
+ * When the sync task was created, the caller specified the
+ * type of space checking required. See the comment in
+ * zfs_space_check_t for details on the semantics of each
+ * type of space checking.
+ *
+ * We just check against what's on-disk; we don't want any
+ * in-flight accounting to get in our way, because open context
+ * may have already used up various in-core limits
+ * (arc_tempreserve, dsl_pool_tempreserve).
*/
- quota = dsl_pool_adjustedsize(dp, B_FALSE) -
- metaslab_class_get_deferred(spa_normal_class(dp->dp_spa));
- used = dp->dp_root_dir->dd_phys->dd_used_bytes;
- /* MOS space is triple-dittoed, so we multiply by 3. */
- if (dst->dst_space > 0 && used + dst->dst_space * 3 > quota) {
- dst->dst_error = SET_ERROR(ENOSPC);
- if (dst->dst_nowaiter)
- kmem_free(dst, sizeof (*dst));
- return;
+ if (dst->dst_space_check != ZFS_SPACE_CHECK_NONE) {
+ uint64_t quota = dsl_pool_adjustedsize(dp,
+ dst->dst_space_check == ZFS_SPACE_CHECK_RESERVED) -
+ metaslab_class_get_deferred(spa_normal_class(dp->dp_spa));
+ uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes;
+ /* MOS space is triple-dittoed, so we multiply by 3. */
+ if (dst->dst_space > 0 && used + dst->dst_space * 3 > quota) {
+ dst->dst_error = SET_ERROR(ENOSPC);
+ if (dst->dst_nowaiter)
+ kmem_free(dst, sizeof (*dst));
+ return;
+ }
}
/*
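The space check now runs only when a space_check type was requested, and still multiplies the estimate by 3 because MOS data is triple-dittoed. A standalone sketch of the arithmetic, with hypothetical inputs in place of the pool's quota and usage:

#include <stdint.h>
#include <stdio.h>

enum space_check { SPACE_CHECK_NONE, SPACE_CHECK_NORMAL, SPACE_CHECK_RESERVED };

/*
 * Return nonzero if a sync task that will dirty 'space' bytes of MOS data
 * should be rejected with ENOSPC.  MOS blocks are written with three copies,
 * hence the factor of 3.
 */
static int
sync_task_over_quota(enum space_check check, uint64_t space,
    uint64_t used, uint64_t quota)
{
    if (check == SPACE_CHECK_NONE)
        return (0);
    return (space > 0 && used + space * 3 > quota);
}

int
main(void)
{
    /* 1 MB of dirty MOS data against 90 MB used of a 92 MB quota. */
    printf("reject: %d\n",
        sync_task_over_quota(SPACE_CHECK_NORMAL, 1 << 20,
        90ULL << 20, 92ULL << 20));
    return (0);
}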
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_userhold.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_userhold.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_userhold.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,7 +21,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
* Copyright (c) 2013 Steven Hartland. All rights reserved.
*/
@@ -65,10 +65,10 @@
return (SET_ERROR(E2BIG));
/* tags must be unique (if ds already exists) */
- if (ds != NULL && ds->ds_phys->ds_userrefs_obj != 0) {
+ if (ds != NULL && dsl_dataset_phys(ds)->ds_userrefs_obj != 0) {
uint64_t value;
- error = zap_lookup(mos, ds->ds_phys->ds_userrefs_obj,
+ error = zap_lookup(mos, dsl_dataset_phys(ds)->ds_userrefs_obj,
htag, 8, 1, &value);
if (error == 0)
error = SET_ERROR(EEXIST);
@@ -141,16 +141,16 @@
ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));
- if (ds->ds_phys->ds_userrefs_obj == 0) {
+ if (dsl_dataset_phys(ds)->ds_userrefs_obj == 0) {
/*
* This is the first user hold for this dataset. Create
* the userrefs zap object.
*/
dmu_buf_will_dirty(ds->ds_dbuf, tx);
- zapobj = ds->ds_phys->ds_userrefs_obj =
+ zapobj = dsl_dataset_phys(ds)->ds_userrefs_obj =
zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx);
} else {
- zapobj = ds->ds_phys->ds_userrefs_obj;
+ zapobj = dsl_dataset_phys(ds)->ds_userrefs_obj;
}
ds->ds_userrefs++;
@@ -181,7 +181,7 @@
}
typedef struct zfs_hold_cleanup_arg {
- char zhca_spaname[MAXNAMELEN];
+ char zhca_spaname[ZFS_MAX_DATASET_NAME_LEN];
uint64_t zhca_spa_load_guid;
nvlist_t *zhca_holds;
} zfs_hold_cleanup_arg_t;
@@ -318,7 +318,8 @@
dduha.dduha_minor = cleanup_minor;
ret = dsl_sync_task(nvpair_name(pair), dsl_dataset_user_hold_check,
- dsl_dataset_user_hold_sync, &dduha, fnvlist_num_pairs(holds));
+ dsl_dataset_user_hold_sync, &dduha,
+ fnvlist_num_pairs(holds), ZFS_SPACE_CHECK_RESERVED);
fnvlist_free(dduha.dduha_chkholds);
return (ret);
@@ -352,7 +353,7 @@
objset_t *mos;
int numholds;
- if (!dsl_dataset_is_snapshot(ds))
+ if (!ds->ds_is_snapshot)
return (SET_ERROR(EINVAL));
if (nvlist_empty(holds))
@@ -360,7 +361,7 @@
numholds = 0;
mos = ds->ds_dir->dd_pool->dp_meta_objset;
- zapobj = ds->ds_phys->ds_userrefs_obj;
+ zapobj = dsl_dataset_phys(ds)->ds_userrefs_obj;
holds_found = fnvlist_alloc();
for (nvpair_t *pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
@@ -398,7 +399,8 @@
numholds++;
}
- if (DS_IS_DEFER_DESTROY(ds) && ds->ds_phys->ds_num_children == 1 &&
+ if (DS_IS_DEFER_DESTROY(ds) &&
+ dsl_dataset_phys(ds)->ds_num_children == 1 &&
ds->ds_userrefs == numholds) {
/* we need to destroy the snapshot as well */
if (dsl_dataset_long_held(ds)) {
@@ -484,8 +486,8 @@
error = dsl_pool_user_release(dp, ds->ds_object, holdname, tx);
VERIFY(error == 0 || error == ENOENT);
- VERIFY0(zap_remove(mos, ds->ds_phys->ds_userrefs_obj, holdname,
- tx));
+ VERIFY0(zap_remove(mos, dsl_dataset_phys(ds)->ds_userrefs_obj,
+ holdname, tx));
ds->ds_userrefs--;
spa_history_log_internal_ds(ds, "release", tx,
@@ -514,7 +516,7 @@
fnvpair_value_nvlist(pair), tx);
if (nvlist_exists(ddura->ddura_todelete, name)) {
ASSERT(ds->ds_userrefs == 0 &&
- ds->ds_phys->ds_num_children == 1 &&
+ dsl_dataset_phys(ds)->ds_num_children == 1 &&
DS_IS_DEFER_DESTROY(ds));
dsl_destroy_snapshot_sync_impl(ds, B_FALSE, tx);
}
@@ -565,21 +567,23 @@
ddura.ddura_holdfunc = dsl_dataset_hold_obj_string;
pool = spa_name(tmpdp->dp_spa);
#ifdef _KERNEL
- dsl_pool_config_enter(tmpdp, FTAG);
for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
pair = nvlist_next_nvpair(holds, pair)) {
dsl_dataset_t *ds;
+ dsl_pool_config_enter(tmpdp, FTAG);
error = dsl_dataset_hold_obj_string(tmpdp,
nvpair_name(pair), FTAG, &ds);
if (error == 0) {
- char name[MAXNAMELEN];
+ char name[ZFS_MAX_DATASET_NAME_LEN];
dsl_dataset_name(ds, name);
+ dsl_pool_config_exit(tmpdp, FTAG);
dsl_dataset_rele(ds, FTAG);
(void) zfs_unmount_snap(name);
+ } else {
+ dsl_pool_config_exit(tmpdp, FTAG);
}
}
- dsl_pool_config_exit(tmpdp, FTAG);
#endif
} else {
/* Non-temporary holds are specified by name. */
@@ -599,8 +603,7 @@
ddura.ddura_chkholds = fnvlist_alloc();
error = dsl_sync_task(pool, dsl_dataset_user_release_check,
- dsl_dataset_user_release_sync, &ddura,
- fnvlist_num_pairs(holds));
+ dsl_dataset_user_release_sync, &ddura, 0, ZFS_SPACE_CHECK_NONE);
fnvlist_free(ddura.ddura_todelete);
fnvlist_free(ddura.ddura_chkholds);
@@ -643,13 +646,13 @@
return (err);
}
- if (ds->ds_phys->ds_userrefs_obj != 0) {
+ if (dsl_dataset_phys(ds)->ds_userrefs_obj != 0) {
zap_attribute_t *za;
zap_cursor_t zc;
za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
for (zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset,
- ds->ds_phys->ds_userrefs_obj);
+ dsl_dataset_phys(ds)->ds_userrefs_obj);
zap_cursor_retrieve(&zc, za) == 0;
zap_cursor_advance(&zc)) {
fnvlist_add_uint64(nvl, za->za_name,
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lz4.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lz4.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lz4.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -45,6 +45,8 @@
static int LZ4_compress64kCtx(void *ctx, const char *source, char *dest,
int isize, int osize);
+static kmem_cache_t *lz4_ctx_cache;
+
/*ARGSUSED*/
size_t
lz4_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
@@ -841,7 +843,7 @@
real_LZ4_compress(const char *source, char *dest, int isize, int osize)
{
#if HEAPMODE
- void *ctx = kmem_zalloc(sizeof (struct refTables), KM_NOSLEEP);
+ void *ctx = kmem_cache_alloc(lz4_ctx_cache, KM_NOSLEEP);
int result;
/*
@@ -851,12 +853,13 @@
if (ctx == NULL)
return (0);
+ bzero(ctx, sizeof(struct refTables));
if (isize < LZ4_64KLIMIT)
result = LZ4_compress64kCtx(ctx, source, dest, isize, osize);
else
result = LZ4_compressCtx(ctx, source, dest, isize, osize);
- kmem_free(ctx, sizeof (struct refTables));
+ kmem_cache_free(lz4_ctx_cache, ctx);
return (result);
#else
if (isize < (int)LZ4_64KLIMIT)
@@ -1002,3 +1005,22 @@
_output_error:
return (int)(-(((char *)ip) - source));
}
+
+extern void
+lz4_init(void)
+{
+
+#if HEAPMODE
+ lz4_ctx_cache = kmem_cache_create("lz4_ctx", sizeof(struct refTables),
+ 0, NULL, NULL, NULL, NULL, NULL, 0);
+#endif
+}
+
+extern void
+lz4_fini(void)
+{
+
+#if HEAPMODE
+ kmem_cache_destroy(lz4_ctx_cache);
+#endif
+}
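The lz4.c change swaps per-call allocation of the compression scratch area for a kmem_cache created in lz4_init() and torn down in lz4_fini(), zeroing the buffer on each use. A userland sketch of the same lifecycle with a trivial one-slot cache (hypothetical; the kernel object cache is far more elaborate):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define CTX_SIZE (16 * 1024)    /* roughly the size of the refTables scratch */

static void *ctx_cache;         /* one cached buffer; NULL when empty */

static void
ctx_cache_init(void)
{
    ctx_cache = malloc(CTX_SIZE);
}

static void *
ctx_cache_alloc(void)
{
    void *p = ctx_cache != NULL ? ctx_cache : malloc(CTX_SIZE);

    ctx_cache = NULL;
    if (p != NULL)
        memset(p, 0, CTX_SIZE);     /* callers expect a zeroed scratch area */
    return (p);
}

static void
ctx_cache_free(void *p)
{
    if (ctx_cache == NULL)
        ctx_cache = p;              /* keep one buffer around for reuse */
    else
        free(p);
}

static void
ctx_cache_fini(void)
{
    free(ctx_cache);
    ctx_cache = NULL;
}

int
main(void)
{
    ctx_cache_init();
    void *ctx = ctx_cache_alloc();  /* compression would use this scratch */
    ctx_cache_free(ctx);
    ctx_cache_fini();
    printf("context cached and reused without repeated allocation\n");
    return (0);
}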
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/lzjb.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -118,7 +118,9 @@
src += 2;
if ((cpy = dst - offset) < (uchar_t *)d_start)
return (-1);
- while (--mlen >= 0 && dst < d_end)
+ if (mlen > (d_end - dst))
+ mlen = d_end - dst;
+ while (--mlen >= 0)
*dst++ = *cpy++;
} else {
*dst++ = *src++;
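The lzjb fix clamps the match length to the space left in the destination before the byte copy, so a corrupt stream can no longer run past d_end. A minimal sketch of that clamp with a hypothetical buffer:

#include <stdio.h>

/*
 * Copy 'mlen' bytes from an earlier position in the output buffer,
 * but never past 'd_end'.  Returns the number of bytes actually copied.
 */
static int
bounded_backref_copy(unsigned char *dst, unsigned char *d_end,
    const unsigned char *cpy, int mlen)
{
    int copied = 0;

    if (mlen > (int)(d_end - dst))
        mlen = (int)(d_end - dst);      /* the added bounds clamp */
    while (--mlen >= 0) {
        *dst++ = *cpy++;
        copied++;
    }
    return (copied);
}

int
main(void)
{
    unsigned char out[8] = "abc";

    /* A match length of 10 would overrun; only 5 bytes fit after "abc". */
    int n = bounded_backref_copy(out + 3, out + sizeof (out), out, 10);

    printf("copied %d bytes: %.8s\n", n, out);
    return (0);
}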
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,8 +21,9 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
*/
#include <sys/zfs_context.h>
@@ -32,21 +33,26 @@
#include <sys/metaslab_impl.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
+#include <sys/spa_impl.h>
+#include <sys/zfeature.h>
-/*
- * Allow allocations to switch to gang blocks quickly. We do this to
- * avoid having to load lots of space_maps in a given txg. There are,
- * however, some cases where we want to avoid "fast" ganging and instead
- * we want to do an exhaustive search of all metaslabs on this device.
- * Currently we don't allow any gang, zil, or dump device related allocations
- * to "fast" gang.
- */
-#define CAN_FASTGANG(flags) \
- (!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
- METASLAB_GANG_AVOID)))
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW, 0, "ZFS metaslab");
+#define GANG_ALLOCATION(flags) \
+ ((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))
+
+#define METASLAB_WEIGHT_PRIMARY (1ULL << 63)
+#define METASLAB_WEIGHT_SECONDARY (1ULL << 62)
+#define METASLAB_ACTIVE_MASK \
+ (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
+
uint64_t metaslab_aliquot = 512ULL << 10;
uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */
+TUNABLE_QUAD("vfs.zfs.metaslab.gang_bang", &metaslab_gang_bang);
+SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, gang_bang, CTLFLAG_RWTUN,
+ &metaslab_gang_bang, 0,
+ "Force gang block allocation for blocks larger than or equal to this value");
/*
* The in-core space map representation is more compact than its on-disk form.
@@ -55,32 +61,105 @@
* Values should be greater than or equal to 100.
*/
int zfs_condense_pct = 200;
+TUNABLE_INT("vfs.zfs.condense_pct", &zfs_condense_pct);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, condense_pct, CTLFLAG_RWTUN,
+ &zfs_condense_pct, 0,
+ "Condense on-disk spacemap when it is more than this many percents"
+ " of in-memory counterpart");
/*
- * This value defines the number of allowed allocation failures per vdev.
- * If a device reaches this threshold in a given txg then we consider skipping
- * allocations on that device.
+ * Condensing a metaslab is not guaranteed to actually reduce the amount of
+ * space used on disk. In particular, a space map uses data in increments of
+ * MAX(1 << ashift, space_map_blksize), so a metaslab might use the
+ * same number of blocks after condensing. Since the goal of condensing is to
+ * reduce the number of IOPs required to read the space map, we only want to
+ * condense when we can be sure we will reduce the number of blocks used by the
+ * space map. Unfortunately, we cannot precisely compute whether or not this is
+ * the case in metaslab_should_condense since we are holding ms_lock. Instead,
+ * we apply the following heuristic: do not condense a spacemap unless the
+ * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
+ * blocks.
*/
-int zfs_mg_alloc_failures = 0;
+int zfs_metaslab_condense_block_threshold = 4;
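Per the comment above, condensing only pays off when the on-disk space map already spans more than zfs_metaslab_condense_block_threshold blocks. A toy version of that gate (hypothetical sizes; not the in-tree metaslab_should_condense logic):

#include <stdint.h>
#include <stdio.h>

static int condense_block_threshold = 4;    /* mirrors the new tunable */

/* Blocks consumed by a space map object, rounding up to the block size. */
static uint64_t
spacemap_blocks(uint64_t object_bytes, uint64_t ashift, uint64_t sm_blksize)
{
    uint64_t blksize = sm_blksize > (1ULL << ashift) ?
        sm_blksize : (1ULL << ashift);
    return ((object_bytes + blksize - 1) / blksize);
}

int
main(void)
{
    /* A 40 KB space map with 4 KB blocks spans 10 blocks: worth condensing. */
    uint64_t blocks = spacemap_blocks(40 * 1024, 9, 4096);

    printf("blocks=%llu condense=%s\n", (unsigned long long)blocks,
        blocks > (uint64_t)condense_block_threshold ? "yes" : "no");
    return (0);
}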
-SYSCTL_DECL(_vfs_zfs);
-SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_alloc_failures, CTLFLAG_RDTUN,
- &zfs_mg_alloc_failures, 0,
- "Number of allowed allocation failures per vdev");
-TUNABLE_INT("vfs.zfs.mg_alloc_failures", &zfs_mg_alloc_failures);
+/*
+ * The zfs_mg_noalloc_threshold defines which metaslab groups should
+ * be eligible for allocation. The value is defined as a percentage of
+ * free space. Metaslab groups that have more free space than
+ * zfs_mg_noalloc_threshold are always eligible for allocations. Once
+ * a metaslab group's free space is less than or equal to the
+ * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
+ * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
+ * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
+ * groups are allowed to accept allocations. Gang blocks are always
+ * eligible to allocate on any metaslab group. The default value of 0 means
+ * no metaslab group will be excluded based on this criterion.
+ */
+int zfs_mg_noalloc_threshold = 0;
+TUNABLE_INT("vfs.zfs.mg_noalloc_threshold", &zfs_mg_noalloc_threshold);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_noalloc_threshold, CTLFLAG_RWTUN,
+ &zfs_mg_noalloc_threshold, 0,
+ "Percentage of metaslab group size that should be free"
+ " to make it eligible for allocation");
/*
- * Metaslab debugging: when set, keeps all space maps in core to verify frees.
+ * Metaslab groups are considered eligible for allocations if their
+ * fragmentation metric (measured as a percentage) is less than or equal to
+ * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold
+ * then it will be skipped unless all metaslab groups within the metaslab
+ * class have also crossed this threshold.
*/
-static int metaslab_debug = 0;
+int zfs_mg_fragmentation_threshold = 85;
+TUNABLE_INT("vfs.zfs.mg_fragmentation_threshold", &zfs_mg_fragmentation_threshold);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_fragmentation_threshold, CTLFLAG_RWTUN,
+ &zfs_mg_fragmentation_threshold, 0,
+ "Percentage of metaslab group size that should be considered "
+ "eligible for allocations unless all metaslab groups within the metaslab class "
+ "have also crossed this threshold");
/*
+ * Allow metaslabs to keep their active state as long as their fragmentation
+ * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
+ * active metaslab that exceeds this threshold will no longer keep its active
+ * status allowing better metaslabs to be selected.
+ */
+int zfs_metaslab_fragmentation_threshold = 70;
+TUNABLE_INT("vfs.zfs.metaslab.fragmentation_threshold",
+ &zfs_metaslab_fragmentation_threshold);
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_threshold, CTLFLAG_RWTUN,
+ &zfs_metaslab_fragmentation_threshold, 0,
+ "Maximum percentage of metaslab fragmentation level to keep their active state");
+
+/*
+ * When set will load all metaslabs when pool is first opened.
+ */
+int metaslab_debug_load = 0;
+TUNABLE_INT("vfs.zfs.metaslab.debug_load", &metaslab_debug_load);
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_load, CTLFLAG_RWTUN,
+ &metaslab_debug_load, 0,
+ "Load all metaslabs when pool is first opened");
+
+/*
+ * When set will prevent metaslabs from being unloaded.
+ */
+int metaslab_debug_unload = 0;
+TUNABLE_INT("vfs.zfs.metaslab.debug_unload", &metaslab_debug_unload);
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, debug_unload, CTLFLAG_RWTUN,
+ &metaslab_debug_unload, 0,
+ "Prevent metaslabs from being unloaded");
+
+/*
* Minimum size which forces the dynamic allocator to change
* it's allocation strategy. Once the space map cannot satisfy
* an allocation of this size then it switches to using more
* aggressive strategy (i.e search by size rather than offset).
*/
-uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE;
+uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;
+TUNABLE_QUAD("vfs.zfs.metaslab.df_alloc_threshold",
+ &metaslab_df_alloc_threshold);
+SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, df_alloc_threshold, CTLFLAG_RWTUN,
+ &metaslab_df_alloc_threshold, 0,
+ "Minimum size which forces the dynamic allocator to change it's allocation strategy");
/*
* The minimum free space, in percent, which must be available
@@ -89,6 +168,10 @@
* switch to using best-fit allocations.
*/
int metaslab_df_free_pct = 4;
+TUNABLE_INT("vfs.zfs.metaslab.df_free_pct", &metaslab_df_free_pct);
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, df_free_pct, CTLFLAG_RWTUN,
+ &metaslab_df_free_pct, 0,
+ "The minimum free space, in percent, which must be available in a space map to continue allocations in a first-fit fashion");
/*
* A metaslab is considered "free" if it contains a contiguous
@@ -95,33 +178,89 @@
* segment which is greater than metaslab_min_alloc_size.
*/
uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS;
+TUNABLE_QUAD("vfs.zfs.metaslab.min_alloc_size",
+ &metaslab_min_alloc_size);
+SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, min_alloc_size, CTLFLAG_RWTUN,
+ &metaslab_min_alloc_size, 0,
+ "A metaslab is considered \"free\" if it contains a contiguous segment which is greater than vfs.zfs.metaslab.min_alloc_size");
/*
- * Max number of space_maps to prefetch.
+ * Percentage of all cpus that can be used by the metaslab taskq.
*/
-int metaslab_prefetch_limit = SPA_DVAS_PER_BP;
+int metaslab_load_pct = 50;
+TUNABLE_INT("vfs.zfs.metaslab.load_pct", &metaslab_load_pct);
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, load_pct, CTLFLAG_RWTUN,
+ &metaslab_load_pct, 0,
+ "Percentage of cpus that can be used by the metaslab taskq");
/*
- * Percentage bonus multiplier for metaslabs that are in the bonus area.
+ * Determines how many txgs a metaslab may remain loaded without having any
+ * allocations from it. As long as a metaslab continues to be used we will
+ * keep it loaded.
*/
-int metaslab_smo_bonus_pct = 150;
+int metaslab_unload_delay = TXG_SIZE * 2;
+TUNABLE_INT("vfs.zfs.metaslab.unload_delay", &metaslab_unload_delay);
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, unload_delay, CTLFLAG_RWTUN,
+ &metaslab_unload_delay, 0,
+ "Number of TXGs that an unused metaslab can be kept in memory");
/*
- * Should we be willing to write data to degraded vdevs?
+ * Max number of metaslabs per group to preload.
*/
-boolean_t zfs_write_to_degraded = B_FALSE;
-SYSCTL_INT(_vfs_zfs, OID_AUTO, write_to_degraded, CTLFLAG_RW,
- &zfs_write_to_degraded, 0,
- "Allow writing data to degraded vdevs");
-TUNABLE_INT("vfs.zfs.write_to_degraded", &zfs_write_to_degraded);
+int metaslab_preload_limit = SPA_DVAS_PER_BP;
+TUNABLE_INT("vfs.zfs.metaslab.preload_limit", &metaslab_preload_limit);
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_limit, CTLFLAG_RWTUN,
+ &metaslab_preload_limit, 0,
+ "Max number of metaslabs per group to preload");
/*
+ * Enable/disable preloading of metaslab.
+ */
+boolean_t metaslab_preload_enabled = B_TRUE;
+TUNABLE_INT("vfs.zfs.metaslab.preload_enabled", &metaslab_preload_enabled);
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_enabled, CTLFLAG_RWTUN,
+ &metaslab_preload_enabled, 0,
+ "Max number of metaslabs per group to preload");
+
+/*
+ * Enable/disable fragmentation weighting on metaslabs.
+ */
+boolean_t metaslab_fragmentation_factor_enabled = B_TRUE;
+TUNABLE_INT("vfs.zfs.metaslab_fragmentation_factor_enabled",
+ &metaslab_fragmentation_factor_enabled);
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_factor_enabled, CTLFLAG_RWTUN,
+ &metaslab_fragmentation_factor_enabled, 0,
+ "Enable fragmentation weighting on metaslabs");
+
+/*
+ * Enable/disable lba weighting (i.e. outer tracks are given preference).
+ */
+boolean_t metaslab_lba_weighting_enabled = B_TRUE;
+TUNABLE_INT("vfs.zfs.metaslab.lba_weighting_enabled",
+ &metaslab_lba_weighting_enabled);
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, lba_weighting_enabled, CTLFLAG_RWTUN,
+ &metaslab_lba_weighting_enabled, 0,
+ "Enable LBA weighting (i.e. outer tracks are given preference)");
+
+/*
+ * Enable/disable metaslab group biasing.
+ */
+boolean_t metaslab_bias_enabled = B_TRUE;
+TUNABLE_INT("vfs.zfs.metaslab.bias_enabled",
+ &metaslab_bias_enabled);
+SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, bias_enabled, CTLFLAG_RWTUN,
+ &metaslab_bias_enabled, 0,
+ "Enable metaslab group biasing");
+
+static uint64_t metaslab_fragmentation(metaslab_t *);
+
+/*
* ==========================================================================
* Metaslab classes
* ==========================================================================
*/
metaslab_class_t *
-metaslab_class_create(spa_t *spa, space_map_ops_t *ops)
+metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
{
metaslab_class_t *mc;
@@ -130,6 +269,8 @@
mc->mc_spa = spa;
mc->mc_rotor = NULL;
mc->mc_ops = ops;
+ mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
+ refcount_create_tracked(&mc->mc_alloc_slots);
return (mc);
}
@@ -143,6 +284,8 @@
ASSERT(mc->mc_space == 0);
ASSERT(mc->mc_dspace == 0);
+ refcount_destroy(&mc->mc_alloc_slots);
+ mutex_destroy(&mc->mc_lock);
kmem_free(mc, sizeof (metaslab_class_t));
}
@@ -182,6 +325,27 @@
atomic_add_64(&mc->mc_dspace, dspace_delta);
}
+void
+metaslab_class_minblocksize_update(metaslab_class_t *mc)
+{
+ metaslab_group_t *mg;
+ vdev_t *vd;
+ uint64_t minashift = UINT64_MAX;
+
+ if ((mg = mc->mc_rotor) == NULL) {
+ mc->mc_minblocksize = SPA_MINBLOCKSIZE;
+ return;
+ }
+
+ do {
+ vd = mg->mg_vd;
+ if (vd->vdev_ashift < minashift)
+ minashift = vd->vdev_ashift;
+ } while ((mg = mg->mg_next) != mc->mc_rotor);
+
+ mc->mc_minblocksize = 1ULL << minashift;
+}
+
uint64_t
metaslab_class_get_alloc(metaslab_class_t *mc)
{
@@ -206,7 +370,134 @@
return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space);
}
+uint64_t
+metaslab_class_get_minblocksize(metaslab_class_t *mc)
+{
+ return (mc->mc_minblocksize);
+}
+
+void
+metaslab_class_histogram_verify(metaslab_class_t *mc)
+{
+ vdev_t *rvd = mc->mc_spa->spa_root_vdev;
+ uint64_t *mc_hist;
+ int i;
+
+ if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
+ return;
+
+ mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
+ KM_SLEEP);
+
+ for (int c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+ metaslab_group_t *mg = tvd->vdev_mg;
+
+ /*
+ * Skip any holes, uninitialized top-levels, or
+ * vdevs that are not in this metaslab class.
+ */
+ if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
+ mg->mg_class != mc) {
+ continue;
+ }
+
+ for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
+ mc_hist[i] += mg->mg_histogram[i];
+ }
+
+ for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
+ VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);
+
+ kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
+}
+
/*
+ * Calculate the metaslab class's fragmentation metric. The metric
+ * is weighted based on the space contribution of each metaslab group.
+ * The return value will be a number between 0 and 100 (inclusive), or
+ * ZFS_FRAG_INVALID if the metric has not been set. See comment above the
+ * zfs_frag_table for more information about the metric.
+ */
+uint64_t
+metaslab_class_fragmentation(metaslab_class_t *mc)
+{
+ vdev_t *rvd = mc->mc_spa->spa_root_vdev;
+ uint64_t fragmentation = 0;
+
+ spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
+
+ for (int c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+ metaslab_group_t *mg = tvd->vdev_mg;
+
+ /*
+ * Skip any holes, uninitialized top-levels, or
+ * vdevs that are not in this metaslab class.
+ */
+ if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
+ mg->mg_class != mc) {
+ continue;
+ }
+
+ /*
+ * If a metaslab group does not contain a fragmentation
+ * metric then just bail out.
+ */
+ if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
+ spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
+ return (ZFS_FRAG_INVALID);
+ }
+
+ /*
+ * Determine how much this metaslab_group is contributing
+ * to the overall pool fragmentation metric.
+ */
+ fragmentation += mg->mg_fragmentation *
+ metaslab_group_get_space(mg);
+ }
+ fragmentation /= metaslab_class_get_space(mc);
+
+ ASSERT3U(fragmentation, <=, 100);
+ spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
+ return (fragmentation);
+}
+
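metaslab_class_fragmentation() above is a space-weighted mean of the per-group values; a worked miniature of the arithmetic with hypothetical group sizes:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    /* Two groups: 1 TB at 10% fragmentation, 3 TB at 50%. */
    uint64_t space[]  = { 1ULL << 40, 3ULL << 40 };
    uint64_t frag[]   = { 10, 50 };
    uint64_t weighted = 0, total = 0;

    for (int i = 0; i < 2; i++) {
        weighted += frag[i] * space[i];
        total += space[i];
    }
    /* (10*1 + 50*3) / 4 = 40 */
    printf("class fragmentation = %llu%%\n",
        (unsigned long long)(weighted / total));
    return (0);
}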
+/*
+ * Calculate the amount of expandable space that is available in
+ * this metaslab class. If a device is expanded then its expandable
+ * space will be the amount of allocatable space that is currently not
+ * part of this metaslab class.
+ */
+uint64_t
+metaslab_class_expandable_space(metaslab_class_t *mc)
+{
+ vdev_t *rvd = mc->mc_spa->spa_root_vdev;
+ uint64_t space = 0;
+
+ spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
+ for (int c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+ metaslab_group_t *mg = tvd->vdev_mg;
+
+ if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 ||
+ mg->mg_class != mc) {
+ continue;
+ }
+
+ /*
+ * Calculate if we have enough space to add additional
+ * metaslabs. We report the expandable space in terms
+ * of the metaslab size since that's the unit of expansion.
+ */
+ space += P2ALIGN(tvd->vdev_max_asize - tvd->vdev_asize,
+ 1ULL << tvd->vdev_ms_shift);
+ }
+ spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
+ return (space);
+}
+
+/*
* ==========================================================================
* Metaslab groups
* ==========================================================================
@@ -225,9 +516,9 @@
/*
* If the weights are identical, use the offset to force uniqueness.
*/
- if (m1->ms_map->sm_start < m2->ms_map->sm_start)
+ if (m1->ms_start < m2->ms_start)
return (-1);
- if (m1->ms_map->sm_start > m2->ms_map->sm_start)
+ if (m1->ms_start > m2->ms_start)
return (1);
ASSERT3P(m1, ==, m2);
@@ -235,6 +526,87 @@
return (0);
}
+/*
+ * Update the allocatable flag and the metaslab group's capacity.
+ * The allocatable flag is set to true if the capacity is below
+ * the zfs_mg_noalloc_threshold or has a fragmentation value that is
+ * greater than zfs_mg_fragmentation_threshold. If a metaslab group
+ * transitions from allocatable to non-allocatable or vice versa then the
+ * metaslab group's class is updated to reflect the transition.
+ */
+static void
+metaslab_group_alloc_update(metaslab_group_t *mg)
+{
+ vdev_t *vd = mg->mg_vd;
+ metaslab_class_t *mc = mg->mg_class;
+ vdev_stat_t *vs = &vd->vdev_stat;
+ boolean_t was_allocatable;
+ boolean_t was_initialized;
+
+ ASSERT(vd == vd->vdev_top);
+
+ mutex_enter(&mg->mg_lock);
+ was_allocatable = mg->mg_allocatable;
+ was_initialized = mg->mg_initialized;
+
+ mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) /
+ (vs->vs_space + 1);
+
+ mutex_enter(&mc->mc_lock);
+
+ /*
+ * If the metaslab group was just added then it won't
+ * have any space until we finish syncing out this txg.
+ * At that point we will consider it initialized and available
+ * for allocations. We also don't consider non-activated
+ * metaslab groups (e.g. vdevs that are in the middle of being removed)
+ * to be initialized, because they can't be used for allocation.
+ */
+ mg->mg_initialized = metaslab_group_initialized(mg);
+ if (!was_initialized && mg->mg_initialized) {
+ mc->mc_groups++;
+ } else if (was_initialized && !mg->mg_initialized) {
+ ASSERT3U(mc->mc_groups, >, 0);
+ mc->mc_groups--;
+ }
+ if (mg->mg_initialized)
+ mg->mg_no_free_space = B_FALSE;
+
+ /*
+ * A metaslab group is considered allocatable if it has plenty
+ * of free space or is not heavily fragmented. We only take
+ * fragmentation into account if the metaslab group has a valid
+ * fragmentation metric (i.e. a value between 0 and 100).
+ */
+ mg->mg_allocatable = (mg->mg_activation_count > 0 &&
+ mg->mg_free_capacity > zfs_mg_noalloc_threshold &&
+ (mg->mg_fragmentation == ZFS_FRAG_INVALID ||
+ mg->mg_fragmentation <= zfs_mg_fragmentation_threshold));
+
+ /*
+ * The mc_alloc_groups maintains a count of the number of
+ * groups in this metaslab class that are still above the
+ * zfs_mg_noalloc_threshold. This is used by the allocating
+ * threads to determine if they should avoid allocations to
+ * a given group. The allocator will avoid allocations to a group
+ * if that group has reached or is below the zfs_mg_noalloc_threshold
+ * and there are still other groups that are above the threshold.
+ * When a group transitions from allocatable to non-allocatable or
+ * vice versa we update the metaslab class to reflect that change.
+ * When the mc_alloc_groups value drops to 0 that means that all
+ * groups have reached the zfs_mg_noalloc_threshold making all groups
+ * eligible for allocations. This effectively means that all devices
+ * are balanced again.
+ */
+ if (was_allocatable && !mg->mg_allocatable)
+ mc->mc_alloc_groups--;
+ else if (!was_allocatable && mg->mg_allocatable)
+ mc->mc_alloc_groups++;
+ mutex_exit(&mc->mc_lock);
+
+ mutex_exit(&mg->mg_lock);
+}
+
metaslab_group_t *
metaslab_group_create(metaslab_class_t *mc, vdev_t *vd)
{
@@ -247,7 +619,13 @@
mg->mg_vd = vd;
mg->mg_class = mc;
mg->mg_activation_count = 0;
+ mg->mg_initialized = B_FALSE;
+ mg->mg_no_free_space = B_TRUE;
+ refcount_create_tracked(&mg->mg_alloc_queue_depth);
+ mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct,
+ minclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT);
+
return (mg);
}
@@ -263,8 +641,10 @@
*/
ASSERT(mg->mg_activation_count <= 0);
+ taskq_destroy(mg->mg_taskq);
avl_destroy(&mg->mg_metaslab_tree);
mutex_destroy(&mg->mg_lock);
+ refcount_destroy(&mg->mg_alloc_queue_depth);
kmem_free(mg, sizeof (metaslab_group_t));
}
@@ -285,6 +665,7 @@
return;
mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children);
+ metaslab_group_alloc_update(mg);
if ((mgprev = mc->mc_rotor) == NULL) {
mg->mg_prev = mg;
@@ -297,6 +678,7 @@
mgnext->mg_prev = mg;
}
mc->mc_rotor = mg;
+ metaslab_class_minblocksize_update(mc);
}
void
@@ -315,6 +697,9 @@
return;
}
+ taskq_wait(mg->mg_taskq);
+ metaslab_group_alloc_update(mg);
+
mgprev = mg->mg_prev;
mgnext = mg->mg_next;
@@ -328,22 +713,125 @@
mg->mg_prev = NULL;
mg->mg_next = NULL;
+ metaslab_class_minblocksize_update(mc);
}
+boolean_t
+metaslab_group_initialized(metaslab_group_t *mg)
+{
+ vdev_t *vd = mg->mg_vd;
+ vdev_stat_t *vs = &vd->vdev_stat;
+
+ return (vs->vs_space != 0 && mg->mg_activation_count > 0);
+}
+
+uint64_t
+metaslab_group_get_space(metaslab_group_t *mg)
+{
+ return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count);
+}
+
+void
+metaslab_group_histogram_verify(metaslab_group_t *mg)
+{
+ uint64_t *mg_hist;
+ vdev_t *vd = mg->mg_vd;
+ uint64_t ashift = vd->vdev_ashift;
+ int i;
+
+ if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
+ return;
+
+ mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
+ KM_SLEEP);
+
+ ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=,
+ SPACE_MAP_HISTOGRAM_SIZE + ashift);
+
+ for (int m = 0; m < vd->vdev_ms_count; m++) {
+ metaslab_t *msp = vd->vdev_ms[m];
+
+ if (msp->ms_sm == NULL)
+ continue;
+
+ for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
+ mg_hist[i + ashift] +=
+ msp->ms_sm->sm_phys->smp_histogram[i];
+ }
+
+ for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++)
+ VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]);
+
+ kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
+}
+
static void
+metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp)
+{
+ metaslab_class_t *mc = mg->mg_class;
+ uint64_t ashift = mg->mg_vd->vdev_ashift;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ if (msp->ms_sm == NULL)
+ return;
+
+ mutex_enter(&mg->mg_lock);
+ for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
+ mg->mg_histogram[i + ashift] +=
+ msp->ms_sm->sm_phys->smp_histogram[i];
+ mc->mc_histogram[i + ashift] +=
+ msp->ms_sm->sm_phys->smp_histogram[i];
+ }
+ mutex_exit(&mg->mg_lock);
+}
+
+void
+metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp)
+{
+ metaslab_class_t *mc = mg->mg_class;
+ uint64_t ashift = mg->mg_vd->vdev_ashift;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ if (msp->ms_sm == NULL)
+ return;
+
+ mutex_enter(&mg->mg_lock);
+ for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
+ ASSERT3U(mg->mg_histogram[i + ashift], >=,
+ msp->ms_sm->sm_phys->smp_histogram[i]);
+ ASSERT3U(mc->mc_histogram[i + ashift], >=,
+ msp->ms_sm->sm_phys->smp_histogram[i]);
+
+ mg->mg_histogram[i + ashift] -=
+ msp->ms_sm->sm_phys->smp_histogram[i];
+ mc->mc_histogram[i + ashift] -=
+ msp->ms_sm->sm_phys->smp_histogram[i];
+ }
+ mutex_exit(&mg->mg_lock);
+}
+
+static void
metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp)
{
+ ASSERT(msp->ms_group == NULL);
mutex_enter(&mg->mg_lock);
- ASSERT(msp->ms_group == NULL);
msp->ms_group = mg;
msp->ms_weight = 0;
avl_add(&mg->mg_metaslab_tree, msp);
mutex_exit(&mg->mg_lock);
+
+ mutex_enter(&msp->ms_lock);
+ metaslab_group_histogram_add(mg, msp);
+ mutex_exit(&msp->ms_lock);
}
static void
metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp)
{
+ mutex_enter(&msp->ms_lock);
+ metaslab_group_histogram_remove(mg, msp);
+ mutex_exit(&msp->ms_lock);
+
mutex_enter(&mg->mg_lock);
ASSERT(msp->ms_group == mg);
avl_remove(&mg->mg_metaslab_tree, msp);
@@ -356,9 +844,9 @@
{
/*
* Although in principle the weight can be any value, in
- * practice we do not use values in the range [1, 510].
+ * practice we do not use values in the range [1, 511].
*/
- ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0);
+ ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0);
ASSERT(MUTEX_HELD(&msp->ms_lock));
mutex_enter(&mg->mg_lock);
@@ -370,26 +858,162 @@
}
/*
+ * Calculate the fragmentation for a given metaslab group. We can use
+ * a simple average here since all metaslabs within the group must have
+ * the same size. The return value will be a value between 0 and 100
+ * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslabs in this
+ * group have a fragmentation metric.
+ */
+uint64_t
+metaslab_group_fragmentation(metaslab_group_t *mg)
+{
+ vdev_t *vd = mg->mg_vd;
+ uint64_t fragmentation = 0;
+ uint64_t valid_ms = 0;
+
+ for (int m = 0; m < vd->vdev_ms_count; m++) {
+ metaslab_t *msp = vd->vdev_ms[m];
+
+ if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
+ continue;
+
+ valid_ms++;
+ fragmentation += msp->ms_fragmentation;
+ }
+
+ if (valid_ms <= vd->vdev_ms_count / 2)
+ return (ZFS_FRAG_INVALID);
+
+ fragmentation /= valid_ms;
+ ASSERT3U(fragmentation, <=, 100);
+ return (fragmentation);
+}
+
+/*
+ * Determine if a given metaslab group should skip allocations. A metaslab
+ * group should avoid allocations if its free capacity is less than the
+ * zfs_mg_noalloc_threshold or its fragmentation metric is greater than
+ * zfs_mg_fragmentation_threshold and there is at least one metaslab group
+ * that can still handle allocations. If the allocation throttle is enabled
+ * then we skip allocations to devices that have reached their maximum
+ * allocation queue depth unless the selected metaslab group is the only
+ * eligible group remaining.
+ */
+static boolean_t
+metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
+ uint64_t psize)
+{
+ spa_t *spa = mg->mg_vd->vdev_spa;
+ metaslab_class_t *mc = mg->mg_class;
+
+ /*
+ * We can only consider skipping this metaslab group if it's
+ * in the normal metaslab class and there are other metaslab
+ * groups to select from. Otherwise, we always consider it eligible
+ * for allocations.
+ */
+ if (mc != spa_normal_class(spa) || mc->mc_groups <= 1)
+ return (B_TRUE);
+
+ /*
+ * If the metaslab group's mg_allocatable flag is set (see comments
+ * in metaslab_group_alloc_update() for more information) and
+ * the allocation throttle is disabled then allow allocations to this
+ * device. However, if the allocation throttle is enabled then
+ * check if we have reached our allocation limit (mg_alloc_queue_depth)
+ * to determine if we should allow allocations to this metaslab group.
+ * If all metaslab groups are no longer considered allocatable
+ * (mc_alloc_groups == 0) or we're trying to allocate the smallest
+ * gang block size then we allow allocations on this metaslab group
+ * regardless of the mg_allocatable or throttle settings.
+ */
+ if (mg->mg_allocatable) {
+ metaslab_group_t *mgp;
+ int64_t qdepth;
+ uint64_t qmax = mg->mg_max_alloc_queue_depth;
+
+ if (!mc->mc_alloc_throttle_enabled)
+ return (B_TRUE);
+
+ /*
+ * If this metaslab group does not have any free space, then
+ * there is no point in looking further.
+ */
+ if (mg->mg_no_free_space)
+ return (B_FALSE);
+
+ qdepth = refcount_count(&mg->mg_alloc_queue_depth);
+
+ /*
+ * If this metaslab group is below its qmax or it's
+ * the only allocatable metaslab group, then attempt
+ * to allocate from it.
+ */
+ if (qdepth < qmax || mc->mc_alloc_groups == 1)
+ return (B_TRUE);
+ ASSERT3U(mc->mc_alloc_groups, >, 1);
+
+ /*
+ * Since this metaslab group is at or over its qmax, we
+ * need to determine if there are metaslab groups after this
+ * one that might be able to handle this allocation. This is
+ * racy since we can't hold the locks for all metaslab
+ * groups at the same time when we make this check.
+ */
+ for (mgp = mg->mg_next; mgp != rotor; mgp = mgp->mg_next) {
+ qmax = mgp->mg_max_alloc_queue_depth;
+
+ qdepth = refcount_count(&mgp->mg_alloc_queue_depth);
+
+ /*
+ * If there is another metaslab group that
+ * might be able to handle the allocation, then
+ * we return false so that we skip this group.
+ */
+ if (qdepth < qmax && !mgp->mg_no_free_space)
+ return (B_FALSE);
+ }
+
+ /*
+ * We didn't find another group to handle the allocation
+ * so we can't skip this metaslab group even though
+ * we are at or over our qmax.
+ */
+ return (B_TRUE);
+
+ } else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) {
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
+
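The throttle only skips a group that is at its queue-depth limit when some later group in the rotor still has headroom; a compact sketch of that decision over a plain array (hypothetical fields, no locking):

#include <stdio.h>

struct group {
    long qdepth;        /* outstanding allocations */
    long qmax;          /* allowed queue depth */
    int  no_free_space;
};

/*
 * Return 1 if group 'i' should accept the allocation, 0 to skip it:
 * accept if under qmax, otherwise skip only when another group
 * could take the allocation instead.
 */
static int
group_allocatable(struct group *g, int n, int i)
{
    if (g[i].qdepth < g[i].qmax)
        return (1);
    for (int j = (i + 1) % n; j != i; j = (j + 1) % n) {
        if (g[j].qdepth < g[j].qmax && !g[j].no_free_space)
            return (0);     /* someone else has headroom; skip this group */
    }
    return (1);             /* nowhere better to go; don't skip */
}

int
main(void)
{
    struct group g[] = { { 10, 10, 0 }, { 2, 10, 0 } };

    printf("group 0 allocatable: %d\n", group_allocatable(g, 2, 0));
    return (0);
}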
+/*
* ==========================================================================
- * Common allocator routines
+ * Range tree callbacks
* ==========================================================================
*/
+
+/*
+ * Comparison function for the private size-ordered tree. Tree is sorted
+ * by size, larger sizes at the end of the tree.
+ */
static int
-metaslab_segsize_compare(const void *x1, const void *x2)
+metaslab_rangesize_compare(const void *x1, const void *x2)
{
- const space_seg_t *s1 = x1;
- const space_seg_t *s2 = x2;
- uint64_t ss_size1 = s1->ss_end - s1->ss_start;
- uint64_t ss_size2 = s2->ss_end - s2->ss_start;
+ const range_seg_t *r1 = x1;
+ const range_seg_t *r2 = x2;
+ uint64_t rs_size1 = r1->rs_end - r1->rs_start;
+ uint64_t rs_size2 = r2->rs_end - r2->rs_start;
- if (ss_size1 < ss_size2)
+ if (rs_size1 < rs_size2)
return (-1);
- if (ss_size1 > ss_size2)
+ if (rs_size1 > rs_size2)
return (1);
- if (s1->ss_start < s2->ss_start)
+ if (r1->rs_start < r2->rs_start)
return (-1);
- if (s1->ss_start > s2->ss_start)
+
+ if (r1->rs_start > r2->rs_start)
return (1);
return (0);
@@ -396,137 +1020,195 @@
}
/*
- * This is a helper function that can be used by the allocator to find
- * a suitable block to allocate. This will search the specified AVL
- * tree looking for a block that matches the specified criteria.
+ * Create any block allocator specific components. The current allocators
+ * rely on using both a size-ordered range_tree_t and an array of uint64_t's.
*/
-static uint64_t
-metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
- uint64_t align)
+static void
+metaslab_rt_create(range_tree_t *rt, void *arg)
{
- space_seg_t *ss, ssearch;
- avl_index_t where;
+ metaslab_t *msp = arg;
- ssearch.ss_start = *cursor;
- ssearch.ss_end = *cursor + size;
+ ASSERT3P(rt->rt_arg, ==, msp);
+ ASSERT(msp->ms_tree == NULL);
- ss = avl_find(t, &ssearch, &where);
- if (ss == NULL)
- ss = avl_nearest(t, where, AVL_AFTER);
-
- while (ss != NULL) {
- uint64_t offset = P2ROUNDUP(ss->ss_start, align);
-
- if (offset + size <= ss->ss_end) {
- *cursor = offset + size;
- return (offset);
- }
- ss = AVL_NEXT(t, ss);
- }
-
- /*
- * If we know we've searched the whole map (*cursor == 0), give up.
- * Otherwise, reset the cursor to the beginning and try again.
- */
- if (*cursor == 0)
- return (-1ULL);
-
- *cursor = 0;
- return (metaslab_block_picker(t, cursor, size, align));
+ avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
+ sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
}
+/*
+ * Destroy the block allocator specific components.
+ */
static void
-metaslab_pp_load(space_map_t *sm)
+metaslab_rt_destroy(range_tree_t *rt, void *arg)
{
- space_seg_t *ss;
+ metaslab_t *msp = arg;
- ASSERT(sm->sm_ppd == NULL);
- sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
+ ASSERT3P(rt->rt_arg, ==, msp);
+ ASSERT3P(msp->ms_tree, ==, rt);
+ ASSERT0(avl_numnodes(&msp->ms_size_tree));
- sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
- avl_create(sm->sm_pp_root, metaslab_segsize_compare,
- sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node));
-
- for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
- avl_add(sm->sm_pp_root, ss);
+ avl_destroy(&msp->ms_size_tree);
}
static void
-metaslab_pp_unload(space_map_t *sm)
+metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
{
- void *cookie = NULL;
+ metaslab_t *msp = arg;
- kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
- sm->sm_ppd = NULL;
-
- while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) {
- /* tear down the tree */
- }
-
- avl_destroy(sm->sm_pp_root);
- kmem_free(sm->sm_pp_root, sizeof (avl_tree_t));
- sm->sm_pp_root = NULL;
+ ASSERT3P(rt->rt_arg, ==, msp);
+ ASSERT3P(msp->ms_tree, ==, rt);
+ VERIFY(!msp->ms_condensing);
+ avl_add(&msp->ms_size_tree, rs);
}
-/* ARGSUSED */
static void
-metaslab_pp_claim(space_map_t *sm, uint64_t start, uint64_t size)
+metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
{
- /* No need to update cursor */
+ metaslab_t *msp = arg;
+
+ ASSERT3P(rt->rt_arg, ==, msp);
+ ASSERT3P(msp->ms_tree, ==, rt);
+ VERIFY(!msp->ms_condensing);
+ avl_remove(&msp->ms_size_tree, rs);
}
-/* ARGSUSED */
static void
-metaslab_pp_free(space_map_t *sm, uint64_t start, uint64_t size)
+metaslab_rt_vacate(range_tree_t *rt, void *arg)
{
- /* No need to update cursor */
+ metaslab_t *msp = arg;
+
+ ASSERT3P(rt->rt_arg, ==, msp);
+ ASSERT3P(msp->ms_tree, ==, rt);
+
+ /*
+ * Normally one would walk the tree freeing nodes along the way.
+ * Since the nodes are shared with the range trees we can avoid
+ * walking all nodes and just reinitialize the avl tree. The nodes
+ * will be freed by the range tree, so we don't want to free them here.
+ */
+ avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
+ sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
}
+static range_tree_ops_t metaslab_rt_ops = {
+ metaslab_rt_create,
+ metaslab_rt_destroy,
+ metaslab_rt_add,
+ metaslab_rt_remove,
+ metaslab_rt_vacate
+};
+
/*
+ * ==========================================================================
+ * Metaslab block operations
+ * ==========================================================================
+ */
+
+/*
* Return the maximum contiguous segment within the metaslab.
*/
uint64_t
-metaslab_pp_maxsize(space_map_t *sm)
+metaslab_block_maxsize(metaslab_t *msp)
{
- avl_tree_t *t = sm->sm_pp_root;
- space_seg_t *ss;
+ avl_tree_t *t = &msp->ms_size_tree;
+ range_seg_t *rs;
- if (t == NULL || (ss = avl_last(t)) == NULL)
+ if (t == NULL || (rs = avl_last(t)) == NULL)
return (0ULL);
- return (ss->ss_end - ss->ss_start);
+ return (rs->rs_end - rs->rs_start);
}
+uint64_t
+metaslab_block_alloc(metaslab_t *msp, uint64_t size)
+{
+ uint64_t start;
+ range_tree_t *rt = msp->ms_tree;
+
+ VERIFY(!msp->ms_condensing);
+
+ start = msp->ms_ops->msop_alloc(msp, size);
+ if (start != -1ULL) {
+ vdev_t *vd = msp->ms_group->mg_vd;
+
+ VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift));
+ VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
+ VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size);
+ range_tree_remove(rt, start, size);
+ }
+ return (start);
+}
+
/*
* ==========================================================================
- * The first-fit block allocator
+ * Common allocator routines
* ==========================================================================
*/
+
+/*
+ * This is a helper function that can be used by the allocator to find
+ * a suitable block to allocate. This will search the specified AVL
+ * tree looking for a block that matches the specified criteria.
+ */
static uint64_t
-metaslab_ff_alloc(space_map_t *sm, uint64_t size)
+metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
+ uint64_t align)
{
- avl_tree_t *t = &sm->sm_root;
- uint64_t align = size & -size;
- uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
+ range_seg_t *rs, rsearch;
+ avl_index_t where;
+ rsearch.rs_start = *cursor;
+ rsearch.rs_end = *cursor + size;
+
+ rs = avl_find(t, &rsearch, &where);
+ if (rs == NULL)
+ rs = avl_nearest(t, where, AVL_AFTER);
+
+ while (rs != NULL) {
+ uint64_t offset = P2ROUNDUP(rs->rs_start, align);
+
+ if (offset + size <= rs->rs_end) {
+ *cursor = offset + size;
+ return (offset);
+ }
+ rs = AVL_NEXT(t, rs);
+ }
+
+ /*
+ * If we know we've searched the whole map (*cursor == 0), give up.
+ * Otherwise, reset the cursor to the beginning and try again.
+ */
+ if (*cursor == 0)
+ return (-1ULL);
+
+ *cursor = 0;
return (metaslab_block_picker(t, cursor, size, align));
}
-/* ARGSUSED */
-boolean_t
-metaslab_ff_fragmented(space_map_t *sm)
+/*
+ * ==========================================================================
+ * The first-fit block allocator
+ * ==========================================================================
+ */
+static uint64_t
+metaslab_ff_alloc(metaslab_t *msp, uint64_t size)
{
- return (B_TRUE);
+ /*
+ * Find the largest power of 2 block size that evenly divides the
+ * requested size. This is used to try to allocate blocks with similar
+ * alignment from the same area of the metaslab (i.e. same cursor
+ * bucket), but it does not guarantee that allocations of other
+ * sizes will not occur in the same region.
+ */
+ uint64_t align = size & -size;
+ uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
+ avl_tree_t *t = &msp->ms_tree->rt_root;
+
+ return (metaslab_block_picker(t, cursor, size, align));
}
-static space_map_ops_t metaslab_ff_ops = {
- metaslab_pp_load,
- metaslab_pp_unload,
- metaslab_ff_alloc,
- metaslab_pp_claim,
- metaslab_pp_free,
- metaslab_pp_maxsize,
- metaslab_ff_fragmented
+static metaslab_ops_t metaslab_ff_ops = {
+ metaslab_ff_alloc
};
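
As a worked example of the cursor-bucket selection used by metaslab_ff_alloc() above: for a 24K request, size & -size yields an 8K alignment, so the allocation is steered to the 8K cursor bucket. A small stand-alone sketch with assumed numbers (highbit64() is approximated here with a plain loop):

#include <stdio.h>
#include <stdint.h>

/* Portable stand-in for highbit64(): 1-based index of the highest set bit. */
static int
highbit64_sketch(uint64_t x)
{
	int h = 0;

	while (x != 0) {
		h++;
		x >>= 1;
	}
	return (h);
}

int
main(void)
{
	uint64_t size = 24ULL * 1024;		/* 24K request */
	uint64_t align = size & -size;		/* largest power of 2 dividing size: 8K */
	int bucket = highbit64_sketch(align) - 1; /* index into the cursor array */

	printf("align=%llu bucket=%d\n", (unsigned long long)align, bucket);
	return (0);
}
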
/*
@@ -538,16 +1220,24 @@
* ==========================================================================
*/
static uint64_t
-metaslab_df_alloc(space_map_t *sm, uint64_t size)
+metaslab_df_alloc(metaslab_t *msp, uint64_t size)
{
- avl_tree_t *t = &sm->sm_root;
+ /*
+ * Find the largest power of 2 block size that evenly divides the
+ * requested size. This is used to try to allocate blocks with similar
+ * alignment from the same area of the metaslab (i.e. same cursor
+ * bucket), but it does not guarantee that allocations of other
+ * sizes will not occur in the same region.
+ */
uint64_t align = size & -size;
- uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
- uint64_t max_size = metaslab_pp_maxsize(sm);
- int free_pct = sm->sm_space * 100 / sm->sm_size;
+ uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
+ range_tree_t *rt = msp->ms_tree;
+ avl_tree_t *t = &rt->rt_root;
+ uint64_t max_size = metaslab_block_maxsize(msp);
+ int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
- ASSERT(MUTEX_HELD(sm->sm_lock));
- ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
if (max_size < size)
return (-1ULL);
@@ -558,7 +1248,7 @@
*/
if (max_size < metaslab_df_alloc_threshold ||
free_pct < metaslab_df_free_pct) {
- t = sm->sm_pp_root;
+ t = &msp->ms_size_tree;
*cursor = 0;
}
@@ -565,158 +1255,113 @@
return (metaslab_block_picker(t, cursor, size, 1ULL));
}
-static boolean_t
-metaslab_df_fragmented(space_map_t *sm)
-{
- uint64_t max_size = metaslab_pp_maxsize(sm);
- int free_pct = sm->sm_space * 100 / sm->sm_size;
-
- if (max_size >= metaslab_df_alloc_threshold &&
- free_pct >= metaslab_df_free_pct)
- return (B_FALSE);
-
- return (B_TRUE);
-}
-
-static space_map_ops_t metaslab_df_ops = {
- metaslab_pp_load,
- metaslab_pp_unload,
- metaslab_df_alloc,
- metaslab_pp_claim,
- metaslab_pp_free,
- metaslab_pp_maxsize,
- metaslab_df_fragmented
+static metaslab_ops_t metaslab_df_ops = {
+ metaslab_df_alloc
};
/*
* ==========================================================================
- * Other experimental allocators
+ * Cursor fit block allocator -
+ * Select the largest region in the metaslab, set the cursor to the beginning
+ * of the range and the cursor_end to the end of the range. As allocations
+ * are made advance the cursor. Continue allocating from the cursor until
+ * the range is exhausted and then find a new range.
* ==========================================================================
*/
static uint64_t
-metaslab_cdf_alloc(space_map_t *sm, uint64_t size)
+metaslab_cf_alloc(metaslab_t *msp, uint64_t size)
{
- avl_tree_t *t = &sm->sm_root;
- uint64_t *cursor = (uint64_t *)sm->sm_ppd;
- uint64_t *extent_end = (uint64_t *)sm->sm_ppd + 1;
- uint64_t max_size = metaslab_pp_maxsize(sm);
- uint64_t rsize = size;
+ range_tree_t *rt = msp->ms_tree;
+ avl_tree_t *t = &msp->ms_size_tree;
+ uint64_t *cursor = &msp->ms_lbas[0];
+ uint64_t *cursor_end = &msp->ms_lbas[1];
uint64_t offset = 0;
- ASSERT(MUTEX_HELD(sm->sm_lock));
- ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&rt->rt_root));
- if (max_size < size)
- return (-1ULL);
+ ASSERT3U(*cursor_end, >=, *cursor);
- ASSERT3U(*extent_end, >=, *cursor);
+ if ((*cursor + size) > *cursor_end) {
+ range_seg_t *rs;
- /*
- * If we're running low on space switch to using the size
- * sorted AVL tree (best-fit).
- */
- if ((*cursor + size) > *extent_end) {
+ rs = avl_last(&msp->ms_size_tree);
+ if (rs == NULL || (rs->rs_end - rs->rs_start) < size)
+ return (-1ULL);
- t = sm->sm_pp_root;
- *cursor = *extent_end = 0;
-
- if (max_size > 2 * SPA_MAXBLOCKSIZE)
- rsize = MIN(metaslab_min_alloc_size, max_size);
- offset = metaslab_block_picker(t, extent_end, rsize, 1ULL);
- if (offset != -1)
- *cursor = offset + size;
- } else {
- offset = metaslab_block_picker(t, cursor, rsize, 1ULL);
+ *cursor = rs->rs_start;
+ *cursor_end = rs->rs_end;
}
- ASSERT3U(*cursor, <=, *extent_end);
- return (offset);
-}
-static boolean_t
-metaslab_cdf_fragmented(space_map_t *sm)
-{
- uint64_t max_size = metaslab_pp_maxsize(sm);
+ offset = *cursor;
+ *cursor += size;
- if (max_size > (metaslab_min_alloc_size * 10))
- return (B_FALSE);
- return (B_TRUE);
+ return (offset);
}
-static space_map_ops_t metaslab_cdf_ops = {
- metaslab_pp_load,
- metaslab_pp_unload,
- metaslab_cdf_alloc,
- metaslab_pp_claim,
- metaslab_pp_free,
- metaslab_pp_maxsize,
- metaslab_cdf_fragmented
+static metaslab_ops_t metaslab_cf_ops = {
+ metaslab_cf_alloc
};
+/*
+ * ==========================================================================
+ * New dynamic fit allocator -
+ * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift
+ * contiguous blocks. If no region is found then just use the largest segment
+ * that remains.
+ * ==========================================================================
+ */
+
+/*
+ * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift)
+ * to request from the allocator.
+ */
uint64_t metaslab_ndf_clump_shift = 4;
static uint64_t
-metaslab_ndf_alloc(space_map_t *sm, uint64_t size)
+metaslab_ndf_alloc(metaslab_t *msp, uint64_t size)
{
- avl_tree_t *t = &sm->sm_root;
+ avl_tree_t *t = &msp->ms_tree->rt_root;
avl_index_t where;
- space_seg_t *ss, ssearch;
- uint64_t hbit = highbit(size);
- uint64_t *cursor = (uint64_t *)sm->sm_ppd + hbit - 1;
- uint64_t max_size = metaslab_pp_maxsize(sm);
+ range_seg_t *rs, rsearch;
+ uint64_t hbit = highbit64(size);
+ uint64_t *cursor = &msp->ms_lbas[hbit - 1];
+ uint64_t max_size = metaslab_block_maxsize(msp);
- ASSERT(MUTEX_HELD(sm->sm_lock));
- ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ ASSERT3U(avl_numnodes(t), ==, avl_numnodes(&msp->ms_size_tree));
if (max_size < size)
return (-1ULL);
- ssearch.ss_start = *cursor;
- ssearch.ss_end = *cursor + size;
+ rsearch.rs_start = *cursor;
+ rsearch.rs_end = *cursor + size;
- ss = avl_find(t, &ssearch, &where);
- if (ss == NULL || (ss->ss_start + size > ss->ss_end)) {
- t = sm->sm_pp_root;
+ rs = avl_find(t, &rsearch, &where);
+ if (rs == NULL || (rs->rs_end - rs->rs_start) < size) {
+ t = &msp->ms_size_tree;
- ssearch.ss_start = 0;
- ssearch.ss_end = MIN(max_size,
+ rsearch.rs_start = 0;
+ rsearch.rs_end = MIN(max_size,
1ULL << (hbit + metaslab_ndf_clump_shift));
- ss = avl_find(t, &ssearch, &where);
- if (ss == NULL)
- ss = avl_nearest(t, where, AVL_AFTER);
- ASSERT(ss != NULL);
+ rs = avl_find(t, &rsearch, &where);
+ if (rs == NULL)
+ rs = avl_nearest(t, where, AVL_AFTER);
+ ASSERT(rs != NULL);
}
- if (ss != NULL) {
- if (ss->ss_start + size <= ss->ss_end) {
- *cursor = ss->ss_start + size;
- return (ss->ss_start);
- }
+ if ((rs->rs_end - rs->rs_start) >= size) {
+ *cursor = rs->rs_start + size;
+ return (rs->rs_start);
}
return (-1ULL);
}
-static boolean_t
-metaslab_ndf_fragmented(space_map_t *sm)
-{
- uint64_t max_size = metaslab_pp_maxsize(sm);
-
- if (max_size > (metaslab_min_alloc_size << metaslab_ndf_clump_shift))
- return (B_FALSE);
- return (B_TRUE);
-}
-
-
-static space_map_ops_t metaslab_ndf_ops = {
- metaslab_pp_load,
- metaslab_pp_unload,
- metaslab_ndf_alloc,
- metaslab_pp_claim,
- metaslab_pp_free,
- metaslab_pp_maxsize,
- metaslab_ndf_fragmented
+static metaslab_ops_t metaslab_ndf_ops = {
+ metaslab_ndf_alloc
};
-space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
+metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
/*
* ==========================================================================
@@ -723,38 +1368,109 @@
* Metaslabs
* ==========================================================================
*/
-metaslab_t *
-metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
- uint64_t start, uint64_t size, uint64_t txg)
+
+/*
+ * Wait for any in-progress metaslab loads to complete.
+ */
+void
+metaslab_load_wait(metaslab_t *msp)
{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ while (msp->ms_loading) {
+ ASSERT(!msp->ms_loaded);
+ cv_wait(&msp->ms_load_cv, &msp->ms_lock);
+ }
+}
+
+int
+metaslab_load(metaslab_t *msp)
+{
+ int error = 0;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ ASSERT(!msp->ms_loaded);
+ ASSERT(!msp->ms_loading);
+
+ msp->ms_loading = B_TRUE;
+
+ /*
+ * If the space map has not been allocated yet, then treat
+ * all the space in the metaslab as free and add it to the
+ * ms_tree.
+ */
+ if (msp->ms_sm != NULL)
+ error = space_map_load(msp->ms_sm, msp->ms_tree, SM_FREE);
+ else
+ range_tree_add(msp->ms_tree, msp->ms_start, msp->ms_size);
+
+ msp->ms_loaded = (error == 0);
+ msp->ms_loading = B_FALSE;
+
+ if (msp->ms_loaded) {
+ for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+ range_tree_walk(msp->ms_defertree[t],
+ range_tree_remove, msp->ms_tree);
+ }
+ }
+ cv_broadcast(&msp->ms_load_cv);
+ return (error);
+}
+
+void
+metaslab_unload(metaslab_t *msp)
+{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ range_tree_vacate(msp->ms_tree, NULL, NULL);
+ msp->ms_loaded = B_FALSE;
+ msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
+}
+
+int
+metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
+ metaslab_t **msp)
+{
vdev_t *vd = mg->mg_vd;
- metaslab_t *msp;
+ objset_t *mos = vd->vdev_spa->spa_meta_objset;
+ metaslab_t *ms;
+ int error;
- msp = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
- mutex_init(&msp->ms_lock, NULL, MUTEX_DEFAULT, NULL);
+ ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
+ mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
+ ms->ms_id = id;
+ ms->ms_start = id << vd->vdev_ms_shift;
+ ms->ms_size = 1ULL << vd->vdev_ms_shift;
- msp->ms_smo_syncing = *smo;
+ /*
+ * We only open space map objects that already exist. All others
+ * will be opened when we finally allocate an object for them.
+ */
+ if (object != 0) {
+ error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
+ ms->ms_size, vd->vdev_ashift, &ms->ms_lock);
+ if (error != 0) {
+ kmem_free(ms, sizeof (metaslab_t));
+ return (error);
+ }
+
+ ASSERT(ms->ms_sm != NULL);
+ }
+
/*
- * We create the main space map here, but we don't create the
- * allocmaps and freemaps until metaslab_sync_done(). This serves
+ * We create the main range tree here, but we don't create the
+ * alloctree and freetree until metaslab_sync_done(). This serves
* two purposes: it allows metaslab_sync_done() to detect the
* addition of new space; and for debugging, it ensures that we'd
* data fault on any attempt to use this metaslab before it's ready.
*/
- msp->ms_map = kmem_zalloc(sizeof (space_map_t), KM_SLEEP);
- space_map_create(msp->ms_map, start, size,
- vd->vdev_ashift, &msp->ms_lock);
+ ms->ms_tree = range_tree_create(&metaslab_rt_ops, ms, &ms->ms_lock);
+ metaslab_group_add(mg, ms);
- metaslab_group_add(mg, msp);
+ ms->ms_fragmentation = metaslab_fragmentation(ms);
+ ms->ms_ops = mg->mg_class->mc_ops;
- if (metaslab_debug && smo->smo_object != 0) {
- mutex_enter(&msp->ms_lock);
- VERIFY(space_map_load(msp->ms_map, mg->mg_class->mc_ops,
- SM_FREE, smo, spa_meta_objset(vd->vdev_spa)) == 0);
- mutex_exit(&msp->ms_lock);
- }
-
/*
* If we're opening an existing pool (txg == 0) or creating
* a new one (txg == TXG_INITIAL), all space is available now.
@@ -762,14 +1478,27 @@
* does not become available until after this txg has synced.
*/
if (txg <= TXG_INITIAL)
- metaslab_sync_done(msp, 0);
+ metaslab_sync_done(ms, 0);
+ /*
+ * If metaslab_debug_load is set and we're initializing a metaslab
+ * that has an allocated space_map object, then load its space
+ * map so that we can verify frees.
+ */
+ if (metaslab_debug_load && ms->ms_sm != NULL) {
+ mutex_enter(&ms->ms_lock);
+ VERIFY0(metaslab_load(ms));
+ mutex_exit(&ms->ms_lock);
+ }
+
if (txg != 0) {
vdev_dirty(vd, 0, NULL, txg);
- vdev_dirty(vd, VDD_METASLAB, msp, txg);
+ vdev_dirty(vd, VDD_METASLAB, ms, txg);
}
- return (msp);
+ *msp = ms;
+
+ return (0);
}
void
@@ -777,48 +1506,149 @@
{
metaslab_group_t *mg = msp->ms_group;
- vdev_space_update(mg->mg_vd,
- -msp->ms_smo.smo_alloc, 0, -msp->ms_map->sm_size);
-
metaslab_group_remove(mg, msp);
mutex_enter(&msp->ms_lock);
- space_map_unload(msp->ms_map);
- space_map_destroy(msp->ms_map);
- kmem_free(msp->ms_map, sizeof (*msp->ms_map));
+ VERIFY(msp->ms_group == NULL);
+ vdev_space_update(mg->mg_vd, -space_map_allocated(msp->ms_sm),
+ 0, -msp->ms_size);
+ space_map_close(msp->ms_sm);
+ metaslab_unload(msp);
+ range_tree_destroy(msp->ms_tree);
+
for (int t = 0; t < TXG_SIZE; t++) {
- space_map_destroy(msp->ms_allocmap[t]);
- space_map_destroy(msp->ms_freemap[t]);
- kmem_free(msp->ms_allocmap[t], sizeof (*msp->ms_allocmap[t]));
- kmem_free(msp->ms_freemap[t], sizeof (*msp->ms_freemap[t]));
+ range_tree_destroy(msp->ms_alloctree[t]);
+ range_tree_destroy(msp->ms_freetree[t]);
}
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
- space_map_destroy(msp->ms_defermap[t]);
- kmem_free(msp->ms_defermap[t], sizeof (*msp->ms_defermap[t]));
+ range_tree_destroy(msp->ms_defertree[t]);
}
ASSERT0(msp->ms_deferspace);
mutex_exit(&msp->ms_lock);
+ cv_destroy(&msp->ms_load_cv);
mutex_destroy(&msp->ms_lock);
kmem_free(msp, sizeof (metaslab_t));
}
-#define METASLAB_WEIGHT_PRIMARY (1ULL << 63)
-#define METASLAB_WEIGHT_SECONDARY (1ULL << 62)
-#define METASLAB_ACTIVE_MASK \
- (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY)
+#define FRAGMENTATION_TABLE_SIZE 17
+/*
+ * This table defines a segment size based fragmentation metric that will
+ * allow each metaslab to derive its own fragmentation value. This is done
+ * by calculating the space in each bucket of the spacemap histogram and
+ * multiplying that by the fragmentation metric in this table. Doing
+ * this for all buckets and dividing it by the total amount of free
+ * space in this metaslab (i.e. the total free space in all buckets) gives
+ * us the fragmentation metric. This means that a high fragmentation metric
+ * equates to most of the free space being comprised of small segments.
+ * Conversely, if the metric is low, then most of the free space is in
+ * large segments. A 10% change in fragmentation equates to approximately
+ * double the number of segments.
+ *
+ * This table defines 0% fragmented space using 16MB segments. Testing has
+ * shown that segments that are greater than or equal to 16MB do not suffer
+ * from drastic performance problems. Using this value, we derive the rest
+ * of the table. Since the fragmentation value is never stored on disk, it
+ * is possible to change these calculations in the future.
+ */
+int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
+ 100, /* 512B */
+ 100, /* 1K */
+ 98, /* 2K */
+ 95, /* 4K */
+ 90, /* 8K */
+ 80, /* 16K */
+ 70, /* 32K */
+ 60, /* 64K */
+ 50, /* 128K */
+ 40, /* 256K */
+ 30, /* 512K */
+ 20, /* 1M */
+ 15, /* 2M */
+ 10, /* 4M */
+ 5, /* 8M */
+ 0 /* 16M */
+};
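
A rough worked example of the metric this table defines, using assumed histogram contents: if a metaslab has 60MB of free space in 128K segments (factor 50) and 40MB in 2M segments (factor 15), its fragmentation is (60*50 + 40*15) / 100 = 36%. A minimal sketch of that arithmetic:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	/* Assumed buckets: bytes of free space and the matching table factor. */
	uint64_t space[2] = { 60ULL << 20, 40ULL << 20 };	/* 60MB @ 128K, 40MB @ 2M */
	uint64_t factor[2] = { 50, 15 };			/* from zfs_frag_table */
	uint64_t frag = 0, total = 0;

	for (int i = 0; i < 2; i++) {
		frag += space[i] * factor[i];
		total += space[i];
	}
	printf("fragmentation = %llu%%\n",
	    (unsigned long long)(total != 0 ? frag / total : 0));
	return (0);
}
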
+
+/*
+ * Calculate the metaslab's fragmentation metric. A return value
+ * of ZFS_FRAG_INVALID means that the metaslab has not been upgraded and does
+ * not support this metric. Otherwise, the return value should be in the
+ * range [0, 100].
+ */
static uint64_t
+metaslab_fragmentation(metaslab_t *msp)
+{
+ spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+ uint64_t fragmentation = 0;
+ uint64_t total = 0;
+ boolean_t feature_enabled = spa_feature_is_enabled(spa,
+ SPA_FEATURE_SPACEMAP_HISTOGRAM);
+
+ if (!feature_enabled)
+ return (ZFS_FRAG_INVALID);
+
+ /*
+ * A null space map means that the entire metaslab is free
+ * and thus is not fragmented.
+ */
+ if (msp->ms_sm == NULL)
+ return (0);
+
+ /*
+ * If this metaslab's space_map has not been upgraded, flag it
+ * so that we upgrade next time we encounter it.
+ */
+ if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) {
+ uint64_t txg = spa_syncing_txg(spa);
+ vdev_t *vd = msp->ms_group->mg_vd;
+
+ if (spa_writeable(spa)) {
+ msp->ms_condense_wanted = B_TRUE;
+ vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
+ spa_dbgmsg(spa, "txg %llu, requesting force condense: "
+ "msp %p, vd %p", txg, msp, vd);
+ }
+ return (ZFS_FRAG_INVALID);
+ }
+
+ for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
+ uint64_t space = 0;
+ uint8_t shift = msp->ms_sm->sm_shift;
+ int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
+ FRAGMENTATION_TABLE_SIZE - 1);
+
+ if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
+ continue;
+
+ space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift);
+ total += space;
+
+ ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE);
+ fragmentation += space * zfs_frag_table[idx];
+ }
+
+ if (total > 0)
+ fragmentation /= total;
+ ASSERT3U(fragmentation, <=, 100);
+ return (fragmentation);
+}
+
+/*
+ * Compute a weight -- a selection preference value -- for the given metaslab.
+ * This is based on the amount of free space, the level of fragmentation,
+ * the LBA range, and whether the metaslab is loaded.
+ */
+static uint64_t
metaslab_weight(metaslab_t *msp)
{
metaslab_group_t *mg = msp->ms_group;
- space_map_t *sm = msp->ms_map;
- space_map_obj_t *smo = &msp->ms_smo;
vdev_t *vd = mg->mg_vd;
uint64_t weight, space;
@@ -829,7 +1659,7 @@
* for us to do here.
*/
if (vd->vdev_removing) {
- ASSERT0(smo->smo_alloc);
+ ASSERT0(space_map_allocated(msp->ms_sm));
ASSERT0(vd->vdev_ms_shift);
return (0);
}
@@ -837,7 +1667,30 @@
/*
* The baseline weight is the metaslab's free space.
*/
- space = sm->sm_size - smo->smo_alloc;
+ space = msp->ms_size - space_map_allocated(msp->ms_sm);
+
+ msp->ms_fragmentation = metaslab_fragmentation(msp);
+ if (metaslab_fragmentation_factor_enabled &&
+ msp->ms_fragmentation != ZFS_FRAG_INVALID) {
+ /*
+ * Use the fragmentation information to inversely scale
+ * down the baseline weight. We need to ensure that we
+ * don't exclude this metaslab completely when it's 100%
+ * fragmented. To avoid this we reduce the fragmented value
+ * by 1.
+ */
+ space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;
+
+ /*
+ * If space < SPA_MINBLOCKSIZE, then we will not allocate from
+ * this metaslab again. The fragmentation metric may have
+ * decreased the space to something smaller than
+ * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE
+ * so that we can consume any remaining space.
+ */
+ if (space > 0 && space < SPA_MINBLOCKSIZE)
+ space = SPA_MINBLOCKSIZE;
+ }
weight = space;
/*
@@ -849,100 +1702,44 @@
* In effect, this means that we'll select the metaslab with the most
* free bandwidth rather than simply the one with the most free space.
*/
- weight = 2 * weight -
- ((sm->sm_start >> vd->vdev_ms_shift) * weight) / vd->vdev_ms_count;
- ASSERT(weight >= space && weight <= 2 * space);
+ if (metaslab_lba_weighting_enabled) {
+ weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
+ ASSERT(weight >= space && weight <= 2 * space);
+ }
/*
- * For locality, assign higher weight to metaslabs which have
- * a lower offset than what we've already activated.
+ * If this metaslab is one we're actively using, adjust its
+ * weight to make it preferable to any inactive metaslab so
+ * we'll polish it off. If the fragmentation on this metaslab
+ * has exceeded our threshold, then don't mark it active.
*/
- if (sm->sm_start <= mg->mg_bonus_area)
- weight *= (metaslab_smo_bonus_pct / 100);
- ASSERT(weight >= space &&
- weight <= 2 * (metaslab_smo_bonus_pct / 100) * space);
-
- if (sm->sm_loaded && !sm->sm_ops->smop_fragmented(sm)) {
- /*
- * If this metaslab is one we're actively using, adjust its
- * weight to make it preferable to any inactive metaslab so
- * we'll polish it off.
- */
+ if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID &&
+ msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) {
weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
}
+
return (weight);
}
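
For illustration, the LBA weighting applied above, weight = 2*weight - (ms_id*weight)/ms_count, gives the lowest-numbered metaslab roughly twice the weight of the highest-numbered one. A small sketch with assumed values:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t space = 1ULL << 30;	/* 1GB free: the baseline weight */
	uint64_t ms_count = 200;	/* assumed vdev_ms_count */

	/* First and last metaslab ids only. */
	for (uint64_t id = 0; id < ms_count; id += ms_count - 1) {
		uint64_t weight = 2 * space - (id * space) / ms_count;

		printf("ms_id=%llu weight=%llu\n",
		    (unsigned long long)id, (unsigned long long)weight);
	}
	return (0);
}
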
-static void
-metaslab_prefetch(metaslab_group_t *mg)
-{
- spa_t *spa = mg->mg_vd->vdev_spa;
- metaslab_t *msp;
- avl_tree_t *t = &mg->mg_metaslab_tree;
- int m;
-
- mutex_enter(&mg->mg_lock);
-
- /*
- * Prefetch the next potential metaslabs
- */
- for (msp = avl_first(t), m = 0; msp; msp = AVL_NEXT(t, msp), m++) {
- space_map_t *sm = msp->ms_map;
- space_map_obj_t *smo = &msp->ms_smo;
-
- /* If we have reached our prefetch limit then we're done */
- if (m >= metaslab_prefetch_limit)
- break;
-
- if (!sm->sm_loaded && smo->smo_object != 0) {
- mutex_exit(&mg->mg_lock);
- dmu_prefetch(spa_meta_objset(spa), smo->smo_object,
- 0ULL, smo->smo_objsize);
- mutex_enter(&mg->mg_lock);
- }
- }
- mutex_exit(&mg->mg_lock);
-}
-
static int
metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
{
- metaslab_group_t *mg = msp->ms_group;
- space_map_t *sm = msp->ms_map;
- space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops;
-
ASSERT(MUTEX_HELD(&msp->ms_lock));
if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
- space_map_load_wait(sm);
- if (!sm->sm_loaded) {
- space_map_obj_t *smo = &msp->ms_smo;
-
- int error = space_map_load(sm, sm_ops, SM_FREE, smo,
- spa_meta_objset(msp->ms_group->mg_vd->vdev_spa));
- if (error) {
+ metaslab_load_wait(msp);
+ if (!msp->ms_loaded) {
+ int error = metaslab_load(msp);
+ if (error) {
metaslab_group_sort(msp->ms_group, msp, 0);
return (error);
}
- for (int t = 0; t < TXG_DEFER_SIZE; t++)
- space_map_walk(msp->ms_defermap[t],
- space_map_claim, sm);
-
}
- /*
- * Track the bonus area as we activate new metaslabs.
- */
- if (sm->sm_start > mg->mg_bonus_area) {
- mutex_enter(&mg->mg_lock);
- mg->mg_bonus_area = sm->sm_start;
- mutex_exit(&mg->mg_lock);
- }
-
metaslab_group_sort(msp->ms_group, msp,
msp->ms_weight | activation_weight);
}
- ASSERT(sm->sm_loaded);
+ ASSERT(msp->ms_loaded);
ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
return (0);
@@ -956,26 +1753,102 @@
* this metaslab again. In that case, it had better be empty,
* or we would be leaving space on the table.
*/
- ASSERT(size >= SPA_MINBLOCKSIZE || msp->ms_map->sm_space == 0);
+ ASSERT(size >= SPA_MINBLOCKSIZE || range_tree_space(msp->ms_tree) == 0);
metaslab_group_sort(msp->ms_group, msp, MIN(msp->ms_weight, size));
ASSERT((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0);
}
+static void
+metaslab_preload(void *arg)
+{
+ metaslab_t *msp = arg;
+ spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+
+ ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
+
+ mutex_enter(&msp->ms_lock);
+ metaslab_load_wait(msp);
+ if (!msp->ms_loaded)
+ (void) metaslab_load(msp);
+
+ /*
+ * Set the ms_access_txg value so that we don't unload it right away.
+ */
+ msp->ms_access_txg = spa_syncing_txg(spa) + metaslab_unload_delay + 1;
+ mutex_exit(&msp->ms_lock);
+}
+
+static void
+metaslab_group_preload(metaslab_group_t *mg)
+{
+ spa_t *spa = mg->mg_vd->vdev_spa;
+ metaslab_t *msp;
+ avl_tree_t *t = &mg->mg_metaslab_tree;
+ int m = 0;
+
+ if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
+ taskq_wait(mg->mg_taskq);
+ return;
+ }
+
+ mutex_enter(&mg->mg_lock);
+ /*
+ * Load the next potential metaslabs
+ */
+ msp = avl_first(t);
+ while (msp != NULL) {
+ metaslab_t *msp_next = AVL_NEXT(t, msp);
+
+ /*
+ * We preload only the maximum number of metaslabs specified
+ * by metaslab_preload_limit. If a metaslab is being forced
+ * to condense then we preload it too. This will ensure
+ * that force condensing happens in the next txg.
+ */
+ if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) {
+ msp = msp_next;
+ continue;
+ }
+
+ /*
+ * We must drop the metaslab group lock here to preserve
+ * lock ordering with the ms_lock (when grabbing both
+ * the mg_lock and the ms_lock, the ms_lock must be taken
+ * first). As a result, it is possible that the ordering
+ * of the metaslabs within the avl tree may change before
+ * we reacquire the lock. The metaslab cannot be removed from
+ * the tree while we're in syncing context so it is safe to
+ * drop the mg_lock here. If the metaslabs are reordered
+ * nothing will break -- we just may end up loading a
+ * less than optimal one.
+ */
+ mutex_exit(&mg->mg_lock);
+ VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
+ msp, TQ_SLEEP) != 0);
+ mutex_enter(&mg->mg_lock);
+ msp = msp_next;
+ }
+ mutex_exit(&mg->mg_lock);
+}
+
/*
- * Determine if the in-core space map representation can be condensed on-disk.
- * We would like to use the following criteria to make our decision:
+ * Determine if the space map's on-disk footprint is past our tolerance
+ * for inefficiency. We would like to use the following criteria to make
+ * our decision:
*
* 1. The size of the space map object should not dramatically increase as a
- * result of writing out our in-core free map.
+ * result of writing out the free space range tree.
*
* 2. The minimal on-disk space map representation is zfs_condense_pct/100
- * times the size than the in-core representation (i.e. zfs_condense_pct = 110
- * and in-core = 1MB, minimal = 1.1.MB).
+ * times the size of the free space range tree representation
+ * (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB).
*
+ * 3. The on-disk size of the space map should actually decrease.
+ *
* Checking the first condition is tricky since we don't want to walk
* the entire AVL tree calculating the estimated on-disk size. Instead we
- * use the size-ordered AVL tree in the space map and calculate the
- * size required for the largest segment in our in-core free map. If the
+ * use the size-ordered range tree in the metaslab and calculate the
+ * size required to write out the largest segment in our free tree. If the
* size required to represent that segment on disk is larger than the space
* map object then we avoid condensing this map.
*
@@ -982,25 +1855,33 @@
* To determine the second criterion we use a best-case estimate and assume
* each segment can be represented on-disk as a single 64-bit entry. We refer
* to this best-case estimate as the space map's minimal form.
+ *
+ * Unfortunately, we cannot compute the on-disk size of the space map in this
+ * context because we cannot accurately compute the effects of compression, etc.
+ * Instead, we apply the heuristic described in the block comment for
+ * zfs_metaslab_condense_block_threshold - we only condense if the space used
+ * is greater than a threshold number of blocks.
*/
static boolean_t
metaslab_should_condense(metaslab_t *msp)
{
- space_map_t *sm = msp->ms_map;
- space_map_obj_t *smo = &msp->ms_smo_syncing;
- space_seg_t *ss;
- uint64_t size, entries, segsz;
+ space_map_t *sm = msp->ms_sm;
+ range_seg_t *rs;
+ uint64_t size, entries, segsz, object_size, optimal_size, record_size;
+ dmu_object_info_t doi;
+ uint64_t vdev_blocksize = 1 << msp->ms_group->mg_vd->vdev_ashift;
ASSERT(MUTEX_HELD(&msp->ms_lock));
- ASSERT(sm->sm_loaded);
+ ASSERT(msp->ms_loaded);
/*
- * Use the sm_pp_root AVL tree, which is ordered by size, to obtain
- * the largest segment in the in-core free map. If the tree is
- * empty then we should condense the map.
+ * Use the ms_size_tree range tree, which is ordered by size, to
+ * obtain the largest segment in the free tree. We always condense
+ * metaslabs that are empty and metaslabs for which a condense
+ * request has been made.
*/
- ss = avl_last(sm->sm_pp_root);
- if (ss == NULL)
+ rs = avl_last(&msp->ms_size_tree);
+ if (rs == NULL || msp->ms_condense_wanted)
return (B_TRUE);
/*
@@ -1009,102 +1890,106 @@
* larger on-disk than the entire current on-disk structure, then
* clearly condensing will increase the on-disk structure size.
*/
- size = (ss->ss_end - ss->ss_start) >> sm->sm_shift;
+ size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
entries = size / (MIN(size, SM_RUN_MAX));
segsz = entries * sizeof (uint64_t);
- return (segsz <= smo->smo_objsize &&
- smo->smo_objsize >= (zfs_condense_pct *
- sizeof (uint64_t) * avl_numnodes(&sm->sm_root)) / 100);
+ optimal_size = sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root);
+ object_size = space_map_length(msp->ms_sm);
+
+ dmu_object_info_from_db(sm->sm_dbuf, &doi);
+ record_size = MAX(doi.doi_data_block_size, vdev_blocksize);
+
+ return (segsz <= object_size &&
+ object_size >= (optimal_size * zfs_condense_pct / 100) &&
+ object_size > zfs_metaslab_condense_block_threshold * record_size);
}
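
A simplified, user-space sketch of the three-part test above; the sizes and tunable values are assumed, and only mirror zfs_condense_pct and zfs_metaslab_condense_block_threshold by name:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

static bool
should_condense_sketch(uint64_t segsz, uint64_t object_size,
    uint64_t optimal_size, uint64_t record_size)
{
	const uint64_t condense_pct = 110;	/* on-disk must be >= 1.1x minimal form */
	const uint64_t block_threshold = 16;	/* assumed threshold in blocks */

	return (segsz <= object_size &&
	    object_size >= (optimal_size * condense_pct / 100) &&
	    object_size > block_threshold * record_size);
}

int
main(void)
{
	/* 1MB on-disk object, 512K minimal form, 4K records, 64K largest segment. */
	printf("condense: %d\n", should_condense_sketch(64ULL << 10,
	    1ULL << 20, 512ULL << 10, 4ULL << 10));
	return (0);
}
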
/*
* Condense the on-disk space map representation to its minimized form.
* The minimized form consists of a small number of allocations followed by
- * the in-core free map.
+ * the entries of the free range tree.
*/
static void
metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx)
{
spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
- space_map_t *freemap = msp->ms_freemap[txg & TXG_MASK];
- space_map_t condense_map;
- space_map_t *sm = msp->ms_map;
- objset_t *mos = spa_meta_objset(spa);
- space_map_obj_t *smo = &msp->ms_smo_syncing;
+ range_tree_t *freetree = msp->ms_freetree[txg & TXG_MASK];
+ range_tree_t *condense_tree;
+ space_map_t *sm = msp->ms_sm;
ASSERT(MUTEX_HELD(&msp->ms_lock));
ASSERT3U(spa_sync_pass(spa), ==, 1);
- ASSERT(sm->sm_loaded);
+ ASSERT(msp->ms_loaded);
- spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, "
- "smo size %llu, segments %lu", txg,
- (msp->ms_map->sm_start / msp->ms_map->sm_size), msp,
- smo->smo_objsize, avl_numnodes(&sm->sm_root));
+ spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, vdev id %llu, "
+ "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg,
+ msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id,
+ msp->ms_group->mg_vd->vdev_spa->spa_name,
+ space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root),
+ msp->ms_condense_wanted ? "TRUE" : "FALSE");
+
+ msp->ms_condense_wanted = B_FALSE;
+
/*
- * Create an map that is a 100% allocated map. We remove segments
+ * Create a range tree that is 100% allocated. We remove segments
* that have been freed in this txg, any deferred frees that exist,
* and any allocation in the future. Removing segments should be
- * a relatively inexpensive operation since we expect these maps to
- * a small number of nodes.
+ * a relatively inexpensive operation since we expect these trees to
+ * have a small number of nodes.
*/
- space_map_create(&condense_map, sm->sm_start, sm->sm_size,
- sm->sm_shift, sm->sm_lock);
- space_map_add(&condense_map, condense_map.sm_start,
- condense_map.sm_size);
+ condense_tree = range_tree_create(NULL, NULL, &msp->ms_lock);
+ range_tree_add(condense_tree, msp->ms_start, msp->ms_size);
/*
- * Remove what's been freed in this txg from the condense_map.
+ * Remove what's been freed in this txg from the condense_tree.
* Since we're in sync_pass 1, we know that all the frees from
- * this txg are in the freemap.
+ * this txg are in the freetree.
*/
- space_map_walk(freemap, space_map_remove, &condense_map);
+ range_tree_walk(freetree, range_tree_remove, condense_tree);
- for (int t = 0; t < TXG_DEFER_SIZE; t++)
- space_map_walk(msp->ms_defermap[t],
- space_map_remove, &condense_map);
+ for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+ range_tree_walk(msp->ms_defertree[t],
+ range_tree_remove, condense_tree);
+ }
- for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
- space_map_walk(msp->ms_allocmap[(txg + t) & TXG_MASK],
- space_map_remove, &condense_map);
+ for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
+ range_tree_walk(msp->ms_alloctree[(txg + t) & TXG_MASK],
+ range_tree_remove, condense_tree);
+ }
/*
* We're about to drop the metaslab's lock thus allowing
 * other consumers to change its content. Set the
- * space_map's sm_condensing flag to ensure that
+ * metaslab's ms_condensing flag to ensure that
* allocations on this metaslab do not occur while we're
* in the middle of committing it to disk. This is only critical
- * for the ms_map as all other space_maps use per txg
+ * for the ms_tree as all other range trees use per txg
* views of their content.
*/
- sm->sm_condensing = B_TRUE;
+ msp->ms_condensing = B_TRUE;
mutex_exit(&msp->ms_lock);
- space_map_truncate(smo, mos, tx);
+ space_map_truncate(sm, tx);
mutex_enter(&msp->ms_lock);
/*
* While we would ideally like to create a space_map representation
* that consists only of allocation records, doing so can be
- * prohibitively expensive because the in-core free map can be
+ * prohibitively expensive because the in-core free tree can be
* large, and therefore computationally expensive to subtract
- * from the condense_map. Instead we sync out two maps, a cheap
- * allocation only map followed by the in-core free map. While not
+ * from the condense_tree. Instead we sync out two trees, a cheap
+ * allocation only tree followed by the in-core free tree. While not
* optimal, this is typically close to optimal, and much cheaper to
* compute.
*/
- space_map_sync(&condense_map, SM_ALLOC, smo, mos, tx);
- space_map_vacate(&condense_map, NULL, NULL);
- space_map_destroy(&condense_map);
+ space_map_write(sm, condense_tree, SM_ALLOC, tx);
+ range_tree_vacate(condense_tree, NULL, NULL);
+ range_tree_destroy(condense_tree);
- space_map_sync(sm, SM_FREE, smo, mos, tx);
- sm->sm_condensing = B_FALSE;
-
- spa_dbgmsg(spa, "condensed: txg %llu, msp[%llu] %p, "
- "smo size %llu", txg,
- (msp->ms_map->sm_start / msp->ms_map->sm_size), msp,
- smo->smo_objsize);
+ space_map_write(sm, msp->ms_tree, SM_FREE, tx);
+ msp->ms_condensing = B_FALSE;
}
/*
@@ -1113,16 +1998,16 @@
void
metaslab_sync(metaslab_t *msp, uint64_t txg)
{
- vdev_t *vd = msp->ms_group->mg_vd;
+ metaslab_group_t *mg = msp->ms_group;
+ vdev_t *vd = mg->mg_vd;
spa_t *spa = vd->vdev_spa;
objset_t *mos = spa_meta_objset(spa);
- space_map_t *allocmap = msp->ms_allocmap[txg & TXG_MASK];
- space_map_t **freemap = &msp->ms_freemap[txg & TXG_MASK];
- space_map_t **freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
- space_map_t *sm = msp->ms_map;
- space_map_obj_t *smo = &msp->ms_smo_syncing;
- dmu_buf_t *db;
+ range_tree_t *alloctree = msp->ms_alloctree[txg & TXG_MASK];
+ range_tree_t **freetree = &msp->ms_freetree[txg & TXG_MASK];
+ range_tree_t **freed_tree =
+ &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK];
dmu_tx_t *tx;
+ uint64_t object = space_map_object(msp->ms_sm);
ASSERT(!vd->vdev_ishole);
@@ -1129,78 +2014,114 @@
/*
* This metaslab has just been added so there's no work to do now.
*/
- if (*freemap == NULL) {
- ASSERT3P(allocmap, ==, NULL);
+ if (*freetree == NULL) {
+ ASSERT3P(alloctree, ==, NULL);
return;
}
- ASSERT3P(allocmap, !=, NULL);
- ASSERT3P(*freemap, !=, NULL);
- ASSERT3P(*freed_map, !=, NULL);
+ ASSERT3P(alloctree, !=, NULL);
+ ASSERT3P(*freetree, !=, NULL);
+ ASSERT3P(*freed_tree, !=, NULL);
- if (allocmap->sm_space == 0 && (*freemap)->sm_space == 0)
+ /*
+ * Normally, we don't want to process a metaslab if there
+ * are no allocations or frees to perform. However, if the metaslab
+ * is being forced to condense we need to let it through.
+ */
+ if (range_tree_space(alloctree) == 0 &&
+ range_tree_space(*freetree) == 0 &&
+ !msp->ms_condense_wanted)
return;
/*
* The only state that can actually be changing concurrently with
- * metaslab_sync() is the metaslab's ms_map. No other thread can
- * be modifying this txg's allocmap, freemap, freed_map, or smo.
- * Therefore, we only hold ms_lock to satify space_map ASSERTs.
- * We drop it whenever we call into the DMU, because the DMU
- * can call down to us (e.g. via zio_free()) at any time.
+ * metaslab_sync() is the metaslab's ms_tree. No other thread can
+ * be modifying this txg's alloctree, freetree, freed_tree, or
+ * space_map_phys_t. Therefore, we only hold ms_lock to satisfy
+ * space_map ASSERTs. We drop it whenever we call into the DMU,
+ * because the DMU can call down to us (e.g. via zio_free()) at
+ * any time.
*/
tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
- if (smo->smo_object == 0) {
- ASSERT(smo->smo_objsize == 0);
- ASSERT(smo->smo_alloc == 0);
- smo->smo_object = dmu_object_alloc(mos,
- DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
- DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
- ASSERT(smo->smo_object != 0);
- dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
- (sm->sm_start >> vd->vdev_ms_shift),
- sizeof (uint64_t), &smo->smo_object, tx);
+ if (msp->ms_sm == NULL) {
+ uint64_t new_object;
+
+ new_object = space_map_alloc(mos, tx);
+ VERIFY3U(new_object, !=, 0);
+
+ VERIFY0(space_map_open(&msp->ms_sm, mos, new_object,
+ msp->ms_start, msp->ms_size, vd->vdev_ashift,
+ &msp->ms_lock));
+ ASSERT(msp->ms_sm != NULL);
}
mutex_enter(&msp->ms_lock);
- if (sm->sm_loaded && spa_sync_pass(spa) == 1 &&
+ /*
+ * Note: metaslab_condense() clears the space_map's histogram.
+ * Therefore we must verify and remove this histogram before
+ * condensing.
+ */
+ metaslab_group_histogram_verify(mg);
+ metaslab_class_histogram_verify(mg->mg_class);
+ metaslab_group_histogram_remove(mg, msp);
+
+ if (msp->ms_loaded && spa_sync_pass(spa) == 1 &&
metaslab_should_condense(msp)) {
metaslab_condense(msp, txg, tx);
} else {
- space_map_sync(allocmap, SM_ALLOC, smo, mos, tx);
- space_map_sync(*freemap, SM_FREE, smo, mos, tx);
+ space_map_write(msp->ms_sm, alloctree, SM_ALLOC, tx);
+ space_map_write(msp->ms_sm, *freetree, SM_FREE, tx);
}
- space_map_vacate(allocmap, NULL, NULL);
+ if (msp->ms_loaded) {
+ /*
+ * When the space map is loaded, we have an accurate
+ * histogram in the range tree. This gives us an opportunity
+ * to bring the space map's histogram up-to-date so we clear
+ * it first before updating it.
+ */
+ space_map_histogram_clear(msp->ms_sm);
+ space_map_histogram_add(msp->ms_sm, msp->ms_tree, tx);
+ } else {
+ /*
+ * Since the space map is not loaded we simply update the
+ * existing histogram with what was freed in this txg. This
+ * means that the on-disk histogram may not have an accurate
+ * view of the free space but it's close enough to allow
+ * us to make allocation decisions.
+ */
+ space_map_histogram_add(msp->ms_sm, *freetree, tx);
+ }
+ metaslab_group_histogram_add(mg, msp);
+ metaslab_group_histogram_verify(mg);
+ metaslab_class_histogram_verify(mg->mg_class);
/*
- * For sync pass 1, we avoid walking the entire space map and
- * instead will just swap the pointers for freemap and
- * freed_map. We can safely do this since the freed_map is
+ * For sync pass 1, we avoid traversing this txg's free range tree
+ * and instead will just swap the pointers for freetree and
+ * freed_tree. We can safely do this since the freed_tree is
* guaranteed to be empty on the initial pass.
*/
if (spa_sync_pass(spa) == 1) {
- ASSERT0((*freed_map)->sm_space);
- ASSERT0(avl_numnodes(&(*freed_map)->sm_root));
- space_map_swap(freemap, freed_map);
+ range_tree_swap(freetree, freed_tree);
} else {
- space_map_vacate(*freemap, space_map_add, *freed_map);
+ range_tree_vacate(*freetree, range_tree_add, *freed_tree);
}
+ range_tree_vacate(alloctree, NULL, NULL);
- ASSERT0(msp->ms_allocmap[txg & TXG_MASK]->sm_space);
- ASSERT0(msp->ms_freemap[txg & TXG_MASK]->sm_space);
+ ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
+ ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK]));
mutex_exit(&msp->ms_lock);
- VERIFY0(dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
- dmu_buf_will_dirty(db, tx);
- ASSERT3U(db->db_size, >=, sizeof (*smo));
- bcopy(smo, db->db_data, sizeof (*smo));
- dmu_buf_rele(db, FTAG);
-
+ if (object != space_map_object(msp->ms_sm)) {
+ object = space_map_object(msp->ms_sm);
+ dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) *
+ msp->ms_id, sizeof (uint64_t), &object, tx);
+ }
dmu_tx_commit(tx);
}
@@ -1211,13 +2132,10 @@
void
metaslab_sync_done(metaslab_t *msp, uint64_t txg)
{
- space_map_obj_t *smo = &msp->ms_smo;
- space_map_obj_t *smosync = &msp->ms_smo_syncing;
- space_map_t *sm = msp->ms_map;
- space_map_t **freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
- space_map_t **defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE];
metaslab_group_t *mg = msp->ms_group;
vdev_t *vd = mg->mg_vd;
+ range_tree_t **freed_tree;
+ range_tree_t **defer_tree;
int64_t alloc_delta, defer_delta;
ASSERT(!vd->vdev_ishole);
@@ -1226,63 +2144,63 @@
/*
* If this metaslab is just becoming available, initialize its
- * allocmaps, freemaps, and defermap and add its capacity to the vdev.
+ * alloctrees, freetrees, and defertree and add its capacity to
+ * the vdev.
*/
- if (*freed_map == NULL) {
- ASSERT(*defer_map == NULL);
+ if (msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK] == NULL) {
for (int t = 0; t < TXG_SIZE; t++) {
- msp->ms_allocmap[t] = kmem_zalloc(sizeof (space_map_t),
- KM_SLEEP);
- space_map_create(msp->ms_allocmap[t], sm->sm_start,
- sm->sm_size, sm->sm_shift, sm->sm_lock);
- msp->ms_freemap[t] = kmem_zalloc(sizeof (space_map_t),
- KM_SLEEP);
- space_map_create(msp->ms_freemap[t], sm->sm_start,
- sm->sm_size, sm->sm_shift, sm->sm_lock);
+ ASSERT(msp->ms_alloctree[t] == NULL);
+ ASSERT(msp->ms_freetree[t] == NULL);
+
+ msp->ms_alloctree[t] = range_tree_create(NULL, msp,
+ &msp->ms_lock);
+ msp->ms_freetree[t] = range_tree_create(NULL, msp,
+ &msp->ms_lock);
}
for (int t = 0; t < TXG_DEFER_SIZE; t++) {
- msp->ms_defermap[t] = kmem_zalloc(sizeof (space_map_t),
- KM_SLEEP);
- space_map_create(msp->ms_defermap[t], sm->sm_start,
- sm->sm_size, sm->sm_shift, sm->sm_lock);
+ ASSERT(msp->ms_defertree[t] == NULL);
+
+ msp->ms_defertree[t] = range_tree_create(NULL, msp,
+ &msp->ms_lock);
}
- freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK];
- defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE];
-
- vdev_space_update(vd, 0, 0, sm->sm_size);
+ vdev_space_update(vd, 0, 0, msp->ms_size);
}
- alloc_delta = smosync->smo_alloc - smo->smo_alloc;
- defer_delta = (*freed_map)->sm_space - (*defer_map)->sm_space;
+ freed_tree = &msp->ms_freetree[TXG_CLEAN(txg) & TXG_MASK];
+ defer_tree = &msp->ms_defertree[txg % TXG_DEFER_SIZE];
+ alloc_delta = space_map_alloc_delta(msp->ms_sm);
+ defer_delta = range_tree_space(*freed_tree) -
+ range_tree_space(*defer_tree);
+
vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0);
- ASSERT(msp->ms_allocmap[txg & TXG_MASK]->sm_space == 0);
- ASSERT(msp->ms_freemap[txg & TXG_MASK]->sm_space == 0);
+ ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK]));
+ ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK]));
/*
- * If there's a space_map_load() in progress, wait for it to complete
+ * If there's a metaslab_load() in progress, wait for it to complete
* so that we have a consistent view of the in-core space map.
*/
- space_map_load_wait(sm);
+ metaslab_load_wait(msp);
/*
- * Move the frees from the defer_map to this map (if it's loaded).
- * Swap the freed_map and the defer_map -- this is safe to do
- * because we've just emptied out the defer_map.
+ * Move the frees from the defer_tree back to the free
+ * range tree (if it's loaded). Swap the freed_tree and the
+ * defer_tree -- this is safe to do because we've just emptied out
+ * the defer_tree.
*/
- space_map_vacate(*defer_map, sm->sm_loaded ? space_map_free : NULL, sm);
- ASSERT0((*defer_map)->sm_space);
- ASSERT0(avl_numnodes(&(*defer_map)->sm_root));
- space_map_swap(freed_map, defer_map);
+ range_tree_vacate(*defer_tree,
+ msp->ms_loaded ? range_tree_add : NULL, msp->ms_tree);
+ range_tree_swap(freed_tree, defer_tree);
- *smo = *smosync;
+ space_map_update(msp->ms_sm);
msp->ms_deferspace += defer_delta;
ASSERT3S(msp->ms_deferspace, >=, 0);
- ASSERT3S(msp->ms_deferspace, <=, sm->sm_size);
+ ASSERT3S(msp->ms_deferspace, <=, msp->ms_size);
if (msp->ms_deferspace != 0) {
/*
* Keep syncing this metaslab until all deferred frees
@@ -1291,24 +2209,17 @@
vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
}
- /*
- * If the map is loaded but no longer active, evict it as soon as all
- * future allocations have synced. (If we unloaded it now and then
- * loaded a moment later, the map wouldn't reflect those allocations.)
- */
- if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
- int evictable = 1;
+ if (msp->ms_loaded && msp->ms_access_txg < txg) {
+ for (int t = 1; t < TXG_CONCURRENT_STATES; t++) {
+ VERIFY0(range_tree_space(
+ msp->ms_alloctree[(txg + t) & TXG_MASK]));
+ }
- for (int t = 1; t < TXG_CONCURRENT_STATES; t++)
- if (msp->ms_allocmap[(txg + t) & TXG_MASK]->sm_space)
- evictable = 0;
-
- if (evictable && !metaslab_debug)
- space_map_unload(sm);
+ if (!metaslab_debug_unload)
+ metaslab_unload(msp);
}
metaslab_group_sort(mg, msp, metaslab_weight(msp));
-
mutex_exit(&msp->ms_lock);
}
@@ -1315,30 +2226,13 @@
void
metaslab_sync_reassess(metaslab_group_t *mg)
{
- vdev_t *vd = mg->mg_vd;
- int64_t failures = mg->mg_alloc_failures;
+ metaslab_group_alloc_update(mg);
+ mg->mg_fragmentation = metaslab_group_fragmentation(mg);
/*
- * Re-evaluate all metaslabs which have lower offsets than the
- * bonus area.
+ * Preload the next potential metaslabs
*/
- for (int m = 0; m < vd->vdev_ms_count; m++) {
- metaslab_t *msp = vd->vdev_ms[m];
-
- if (msp->ms_map->sm_start > mg->mg_bonus_area)
- break;
-
- mutex_enter(&msp->ms_lock);
- metaslab_group_sort(mg, msp, metaslab_weight(msp));
- mutex_exit(&msp->ms_lock);
- }
-
- atomic_add_64(&mg->mg_alloc_failures, -failures);
-
- /*
- * Prefetch the next potential metaslabs
- */
- metaslab_prefetch(mg);
+ metaslab_group_preload(mg);
}
static uint64_t
@@ -1346,7 +2240,7 @@
{
uint64_t ms_shift = msp->ms_group->mg_vd->vdev_ms_shift;
uint64_t offset = DVA_GET_OFFSET(dva) >> ms_shift;
- uint64_t start = msp->ms_map->sm_start >> ms_shift;
+ uint64_t start = msp->ms_id;
if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva))
return (1ULL << 63);
@@ -1358,9 +2252,58 @@
return (0);
}
+/*
+ * ==========================================================================
+ * Metaslab block operations
+ * ==========================================================================
+ */
+
+static void
+metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags)
+{
+ if (!(flags & METASLAB_ASYNC_ALLOC) ||
+ flags & METASLAB_DONT_THROTTLE)
+ return;
+
+ metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
+ if (!mg->mg_class->mc_alloc_throttle_enabled)
+ return;
+
+ (void) refcount_add(&mg->mg_alloc_queue_depth, tag);
+}
+
+void
+metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags)
+{
+ if (!(flags & METASLAB_ASYNC_ALLOC) ||
+ flags & METASLAB_DONT_THROTTLE)
+ return;
+
+ metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
+ if (!mg->mg_class->mc_alloc_throttle_enabled)
+ return;
+
+ (void) refcount_remove(&mg->mg_alloc_queue_depth, tag);
+}
+
+void
+metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag)
+{
+#ifdef ZFS_DEBUG
+ const dva_t *dva = bp->blk_dva;
+ int ndvas = BP_GET_NDVAS(bp);
+
+ for (int d = 0; d < ndvas; d++) {
+ uint64_t vdev = DVA_GET_VDEV(&dva[d]);
+ metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg;
+ VERIFY(refcount_not_held(&mg->mg_alloc_queue_depth, tag));
+ }
+#endif
+}
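
Conceptually, callers are expected to bracket each throttled asynchronous allocation with the increment/decrement helpers above (the real call sites live in the zio pipeline, outside this diff); a toy model of that bookkeeping, with illustrative names only:

#include <stdio.h>
#include <stdint.h>

/* Simplified model of a group's allocation queue depth. */
struct group_model {
	uint64_t alloc_queue_depth;
};

static void
alloc_increment(struct group_model *g)
{
	g->alloc_queue_depth++;		/* analogue of metaslab_group_alloc_increment() */
}

static void
alloc_decrement(struct group_model *g)
{
	g->alloc_queue_depth--;		/* analogue of metaslab_group_alloc_decrement() */
}

int
main(void)
{
	struct group_model g = { 0 };

	alloc_increment(&g);		/* async allocation issued */
	printf("in flight: %llu\n", (unsigned long long)g.alloc_queue_depth);
	alloc_decrement(&g);		/* allocation completed */
	printf("in flight: %llu\n", (unsigned long long)g.alloc_queue_depth);
	return (0);
}
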
+
static uint64_t
-metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
- uint64_t txg, uint64_t min_distance, dva_t *dva, int d, int flags)
+metaslab_group_alloc(metaslab_group_t *mg, uint64_t asize,
+ uint64_t txg, uint64_t min_distance, dva_t *dva, int d)
{
spa_t *spa = mg->mg_vd->vdev_spa;
metaslab_t *msp = NULL;
@@ -1386,11 +2329,10 @@
if (msp->ms_weight < asize) {
spa_dbgmsg(spa, "%s: failed to meet weight "
"requirement: vdev %llu, txg %llu, mg %p, "
- "msp %p, psize %llu, asize %llu, "
- "failures %llu, weight %llu",
- spa_name(spa), mg->mg_vd->vdev_id, txg,
- mg, msp, psize, asize,
- mg->mg_alloc_failures, msp->ms_weight);
+ "msp %p, asize %llu, "
+ "weight %llu", spa_name(spa),
+ mg->mg_vd->vdev_id, txg,
+ mg, msp, asize, msp->ms_weight);
mutex_exit(&mg->mg_lock);
return (-1ULL);
}
@@ -1398,7 +2340,7 @@
/*
* If the selected metaslab is condensing, skip it.
*/
- if (msp->ms_map->sm_condensing)
+ if (msp->ms_condensing)
continue;
was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
@@ -1406,7 +2348,8 @@
break;
target_distance = min_distance +
- (msp->ms_smo.smo_alloc ? 0 : min_distance >> 1);
+ (space_map_allocated(msp->ms_sm) != 0 ? 0 :
+ min_distance >> 1);
for (i = 0; i < d; i++)
if (metaslab_distance(msp, &dva[i]) <
@@ -1419,25 +2362,6 @@
if (msp == NULL)
return (-1ULL);
- /*
- * If we've already reached the allowable number of failed
- * allocation attempts on this metaslab group then we
- * consider skipping it. We skip it only if we're allowed
- * to "fast" gang, the physical size is larger than
- * a gang block, and we're attempting to allocate from
- * the primary metaslab.
- */
- if (mg->mg_alloc_failures > zfs_mg_alloc_failures &&
- CAN_FASTGANG(flags) && psize > SPA_GANGBLOCKSIZE &&
- activation_weight == METASLAB_WEIGHT_PRIMARY) {
- spa_dbgmsg(spa, "%s: skipping metaslab group: "
- "vdev %llu, txg %llu, mg %p, psize %llu, "
- "asize %llu, failures %llu", spa_name(spa),
- mg->mg_vd->vdev_id, txg, mg, psize, asize,
- mg->mg_alloc_failures);
- return (-1ULL);
- }
-
mutex_enter(&msp->ms_lock);
/*
@@ -1471,28 +2395,25 @@
* we can't manipulate this metaslab until it's committed
* to disk.
*/
- if (msp->ms_map->sm_condensing) {
+ if (msp->ms_condensing) {
mutex_exit(&msp->ms_lock);
continue;
}
- if ((offset = space_map_alloc(msp->ms_map, asize)) != -1ULL)
+ if ((offset = metaslab_block_alloc(msp, asize)) != -1ULL)
break;
- atomic_inc_64(&mg->mg_alloc_failures);
-
- metaslab_passivate(msp, space_map_maxsize(msp->ms_map));
-
+ metaslab_passivate(msp, metaslab_block_maxsize(msp));
mutex_exit(&msp->ms_lock);
}
- if (msp->ms_allocmap[txg & TXG_MASK]->sm_space == 0)
+ if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
- space_map_add(msp->ms_allocmap[txg & TXG_MASK], offset, asize);
+ range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, asize);
+ msp->ms_access_txg = txg + metaslab_unload_delay;
mutex_exit(&msp->ms_lock);
-
return (offset);
}
@@ -1509,7 +2430,6 @@
int all_zero;
int zio_lock = B_FALSE;
boolean_t allocatable;
- uint64_t offset = -1ULL;
uint64_t asize;
uint64_t distance;
@@ -1579,7 +2499,6 @@
all_zero = B_TRUE;
do {
ASSERT(mg->mg_activation_count == 1);
-
vd = mg->mg_vd;
/*
@@ -1592,18 +2511,30 @@
} else {
allocatable = vdev_allocatable(vd);
}
+
+ /*
+ * Determine if the selected metaslab group is eligible
+ * for allocations. If we're ganging then don't allow
+ * this metaslab group to skip allocations since that would
+ * inadvertently return ENOSPC and suspend the pool
+ * even though space is still available.
+ */
+ if (allocatable && !GANG_ALLOCATION(flags) && !zio_lock) {
+ allocatable = metaslab_group_allocatable(mg, rotor,
+ psize);
+ }
+
if (!allocatable)
goto next;
+ ASSERT(mg->mg_initialized);
+
/*
- * Avoid writing single-copy data to a failing vdev
- * unless the user instructs us that it is okay.
+ * Avoid writing single-copy data to a failing vdev.
*/
if ((vd->vdev_stat.vs_write_errors > 0 ||
vd->vdev_state < VDEV_STATE_HEALTHY) &&
- d == 0 && dshift == 3 &&
- !(zfs_write_to_degraded && vd->vdev_state ==
- VDEV_STATE_DEGRADED)) {
+ d == 0 && dshift == 3 && vd->vdev_children == 0) {
all_zero = B_FALSE;
goto next;
}
@@ -1619,8 +2550,32 @@
asize = vdev_psize_to_asize(vd, psize);
ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
- offset = metaslab_group_alloc(mg, psize, asize, txg, distance,
- dva, d, flags);
+ uint64_t offset = metaslab_group_alloc(mg, asize, txg,
+ distance, dva, d);
+
+ mutex_enter(&mg->mg_lock);
+ if (offset == -1ULL) {
+ mg->mg_failed_allocations++;
+ if (asize == SPA_GANGBLOCKSIZE) {
+ /*
+ * This metaslab group was unable to allocate
+ * the minimum gang block size so it must be
+ * out of space. We must notify the allocation
+ * throttle to start skipping allocation
+ * attempts to this metaslab group until more
+ * space becomes available.
+ *
+ * Note: this failure cannot be caused by the
+ * allocation throttle since the allocation
+ * throttle is only responsible for skipping
+ * devices and not failing block allocations.
+ */
+ mg->mg_no_free_space = B_TRUE;
+ }
+ }
+ mg->mg_allocations++;
+ mutex_exit(&mg->mg_lock);
+
if (offset != -1ULL) {
/*
* If we've just selected this metaslab group,
@@ -1628,7 +2583,7 @@
* over- or under-used relative to the pool,
* and set an allocation bias to even it out.
*/
- if (mc->mc_aliquot == 0) {
+ if (mc->mc_aliquot == 0 && metaslab_bias_enabled) {
vdev_stat_t *vs = &vd->vdev_stat;
int64_t vu, cu;
@@ -1650,6 +2605,8 @@
*/
mg->mg_bias = ((cu - vu) *
(int64_t)mg->mg_aliquot) / 100;
+ } else if (!metaslab_bias_enabled) {
+ mg->mg_bias = 0;
}
if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
@@ -1721,13 +2678,22 @@
mutex_enter(&msp->ms_lock);
if (now) {
- space_map_remove(msp->ms_allocmap[txg & TXG_MASK],
+ range_tree_remove(msp->ms_alloctree[txg & TXG_MASK],
offset, size);
- space_map_free(msp->ms_map, offset, size);
+
+ VERIFY(!msp->ms_condensing);
+ VERIFY3U(offset, >=, msp->ms_start);
+ VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size);
+ VERIFY3U(range_tree_space(msp->ms_tree) + size, <=,
+ msp->ms_size);
+ VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
+ VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
+ range_tree_add(msp->ms_tree, offset, size);
} else {
- if (msp->ms_freemap[txg & TXG_MASK]->sm_space == 0)
+ if (range_tree_space(msp->ms_freetree[txg & TXG_MASK]) == 0)
vdev_dirty(vd, VDD_METASLAB, msp, txg);
- space_map_add(msp->ms_freemap[txg & TXG_MASK], offset, size);
+ range_tree_add(msp->ms_freetree[txg & TXG_MASK],
+ offset, size);
}
mutex_exit(&msp->ms_lock);
@@ -1762,10 +2728,10 @@
mutex_enter(&msp->ms_lock);
- if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map->sm_loaded)
+ if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded)
error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
- if (error == 0 && !space_map_contains(msp->ms_map, offset, size))
+ if (error == 0 && !range_tree_contains(msp->ms_tree, offset, size))
error = SET_ERROR(ENOENT);
if (error || txg == 0) { /* txg == 0 indicates dry run */
@@ -1773,12 +2739,16 @@
return (error);
}
- space_map_claim(msp->ms_map, offset, size);
+ VERIFY(!msp->ms_condensing);
+ VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift));
+ VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift));
+ VERIFY3U(range_tree_space(msp->ms_tree) - size, <=, msp->ms_size);
+ range_tree_remove(msp->ms_tree, offset, size);
if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */
- if (msp->ms_allocmap[txg & TXG_MASK]->sm_space == 0)
+ if (range_tree_space(msp->ms_alloctree[txg & TXG_MASK]) == 0)
vdev_dirty(vd, VDD_METASLAB, msp, txg);
- space_map_add(msp->ms_allocmap[txg & TXG_MASK], offset, size);
+ range_tree_add(msp->ms_alloctree[txg & TXG_MASK], offset, size);
}
mutex_exit(&msp->ms_lock);
@@ -1786,9 +2756,57 @@
return (0);
}
+/*
+ * Reserve some allocation slots. The reservation system must be called
+ * before we call into the allocator. If there aren't any available slots
+ * then the I/O will be throttled until an I/O completes and its slots are
+ * freed up. The function returns true if it was successful in placing
+ * the reservation.
+ */
+boolean_t
+metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, zio_t *zio,
+ int flags)
+{
+ uint64_t available_slots = 0;
+ boolean_t slot_reserved = B_FALSE;
+
+ ASSERT(mc->mc_alloc_throttle_enabled);
+ mutex_enter(&mc->mc_lock);
+
+ uint64_t reserved_slots = refcount_count(&mc->mc_alloc_slots);
+ if (reserved_slots < mc->mc_alloc_max_slots)
+ available_slots = mc->mc_alloc_max_slots - reserved_slots;
+
+ if (slots <= available_slots || GANG_ALLOCATION(flags)) {
+ /*
+ * We reserve the slots individually so that we can unreserve
+ * them individually when an I/O completes.
+ */
+ for (int d = 0; d < slots; d++) {
+ reserved_slots = refcount_add(&mc->mc_alloc_slots, zio);
+ }
+ zio->io_flags |= ZIO_FLAG_IO_ALLOCATING;
+ slot_reserved = B_TRUE;
+ }
+
+ mutex_exit(&mc->mc_lock);
+ return (slot_reserved);
+}
+
+void
+metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, zio_t *zio)
+{
+ ASSERT(mc->mc_alloc_throttle_enabled);
+ mutex_enter(&mc->mc_lock);
+ for (int d = 0; d < slots; d++) {
+ (void) refcount_remove(&mc->mc_alloc_slots, zio);
+ }
+ mutex_exit(&mc->mc_lock);
+}
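As a rough illustration of the reservation pair added above, a caller reserves slots before entering the allocator and gives them back if the allocation fails. This is a hedged sketch, not part of this commit: my_alloc_throttled and its requeue-on-EAGAIN behavior are hypothetical; only metaslab_class_throttle_reserve/unreserve, metaslab_alloc and SET_ERROR come from the sources (in the real pipeline the unreserve normally happens when the zio completes).

/*
 * Hedged sketch: reserve one slot per DVA we intend to allocate, then
 * release the slots on allocation failure.
 */
static int
my_alloc_throttled(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
    blkptr_t *bp, blkptr_t *hintbp, int ndvas, uint64_t txg, zio_t *zio,
    int flags)
{
	if (!metaslab_class_throttle_reserve(mc, ndvas, zio, flags))
		return (SET_ERROR(EAGAIN));	/* caller requeues the I/O */

	int error = metaslab_alloc(spa, mc, psize, bp, ndvas, txg, hintbp,
	    flags, zio);
	if (error != 0)
		metaslab_class_throttle_unreserve(mc, ndvas, zio);
	return (error);
}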
+
int
metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp,
- int ndvas, uint64_t txg, blkptr_t *hintbp, int flags)
+ int ndvas, uint64_t txg, blkptr_t *hintbp, int flags, zio_t *zio)
{
dva_t *dva = bp->blk_dva;
dva_t *hintdva = hintbp->blk_dva;
@@ -1811,14 +2829,24 @@
for (int d = 0; d < ndvas; d++) {
error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva,
txg, flags);
- if (error) {
+ if (error != 0) {
for (d--; d >= 0; d--) {
metaslab_free_dva(spa, &dva[d], txg, B_TRUE);
+ metaslab_group_alloc_decrement(spa,
+ DVA_GET_VDEV(&dva[d]), zio, flags);
bzero(&dva[d], sizeof (dva_t));
}
spa_config_exit(spa, SCL_ALLOC, FTAG);
return (error);
+ } else {
+ /*
+ * Update the metaslab group's queue depth
+ * based on the newly allocated dva.
+ */
+ metaslab_group_alloc_increment(spa,
+ DVA_GET_VDEV(&dva[d]), zio, flags);
}
+
}
ASSERT(error == 0);
ASSERT(BP_GET_NDVAS(bp) == ndvas);
@@ -1878,19 +2906,6 @@
return (error);
}
-static void
-checkmap(space_map_t *sm, uint64_t off, uint64_t size)
-{
- space_seg_t *ss;
- avl_index_t where;
-
- mutex_enter(sm->sm_lock);
- ss = space_map_find(sm, off, size, &where);
- if (ss != NULL)
- panic("freeing free block; ss=%p", (void *)ss);
- mutex_exit(sm->sm_lock);
-}
-
void
metaslab_check_free(spa_t *spa, const blkptr_t *bp)
{
@@ -1899,19 +2914,19 @@
spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
- uint64_t vdid = DVA_GET_VDEV(&bp->blk_dva[i]);
- vdev_t *vd = vdev_lookup_top(spa, vdid);
- uint64_t off = DVA_GET_OFFSET(&bp->blk_dva[i]);
+ uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]);
+ vdev_t *vd = vdev_lookup_top(spa, vdev);
+ uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]);
- metaslab_t *ms = vd->vdev_ms[off >> vd->vdev_ms_shift];
+ metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift];
- if (ms->ms_map->sm_loaded)
- checkmap(ms->ms_map, off, size);
+ if (msp->ms_loaded)
+ range_tree_verify(msp->ms_tree, offset, size);
for (int j = 0; j < TXG_SIZE; j++)
- checkmap(ms->ms_freemap[j], off, size);
+ range_tree_verify(msp->ms_freetree[j], offset, size);
for (int j = 0; j < TXG_DEFER_SIZE; j++)
- checkmap(ms->ms_defermap[j], off, size);
+ range_tree_verify(msp->ms_defertree[j], offset, size);
}
spa_config_exit(spa, SCL_VDEV, FTAG);
}
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/refcount.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,7 +21,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -31,6 +31,11 @@
#ifdef _KERNEL
int reference_tracking_enable = FALSE; /* runs out of memory too easily */
+SYSCTL_DECL(_vfs_zfs);
+TUNABLE_INT("vfs.zfs.reference_tracking_enable", &reference_tracking_enable);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, reference_tracking_enable, CTLFLAG_RDTUN,
+ &reference_tracking_enable, 0,
+ "Track reference holders to refcount_t objects, used mostly by ZFS");
#else
int reference_tracking_enable = TRUE;
#endif
@@ -70,6 +75,13 @@
}
void
+refcount_create_tracked(refcount_t *rc)
+{
+ refcount_create(rc);
+ rc->rc_tracked = B_TRUE;
+}
+
+void
refcount_create_untracked(refcount_t *rc)
{
refcount_create(rc);
@@ -228,4 +240,84 @@
list_destroy(&removed);
}
+void
+refcount_transfer_ownership(refcount_t *rc, void *current_holder,
+ void *new_holder)
+{
+ reference_t *ref;
+ boolean_t found = B_FALSE;
+
+ mutex_enter(&rc->rc_mtx);
+ if (!rc->rc_tracked) {
+ mutex_exit(&rc->rc_mtx);
+ return;
+ }
+
+ for (ref = list_head(&rc->rc_list); ref;
+ ref = list_next(&rc->rc_list, ref)) {
+ if (ref->ref_holder == current_holder) {
+ ref->ref_holder = new_holder;
+ found = B_TRUE;
+ break;
+ }
+ }
+ ASSERT(found);
+ mutex_exit(&rc->rc_mtx);
+}
+
+/*
+ * If tracking is enabled, return true if a reference exists that matches
+ * the "holder" tag. If tracking is disabled, then return true if a reference
+ * might be held.
+ */
+boolean_t
+refcount_held(refcount_t *rc, void *holder)
+{
+ reference_t *ref;
+
+ mutex_enter(&rc->rc_mtx);
+
+ if (!rc->rc_tracked) {
+ mutex_exit(&rc->rc_mtx);
+ return (rc->rc_count > 0);
+ }
+
+ for (ref = list_head(&rc->rc_list); ref;
+ ref = list_next(&rc->rc_list, ref)) {
+ if (ref->ref_holder == holder) {
+ mutex_exit(&rc->rc_mtx);
+ return (B_TRUE);
+ }
+ }
+ mutex_exit(&rc->rc_mtx);
+ return (B_FALSE);
+}
+
+/*
+ * If tracking is enabled, return true if a reference does not exist that
+ * matches the "holder" tag. If tracking is disabled, always return true
+ * since the reference might not be held.
+ */
+boolean_t
+refcount_not_held(refcount_t *rc, void *holder)
+{
+ reference_t *ref;
+
+ mutex_enter(&rc->rc_mtx);
+
+ if (!rc->rc_tracked) {
+ mutex_exit(&rc->rc_mtx);
+ return (B_TRUE);
+ }
+
+ for (ref = list_head(&rc->rc_list); ref;
+ ref = list_next(&rc->rc_list, ref)) {
+ if (ref->ref_holder == holder) {
+ mutex_exit(&rc->rc_mtx);
+ return (B_FALSE);
+ }
+ }
+ mutex_exit(&rc->rc_mtx);
+ return (B_TRUE);
+}
#endif /* ZFS_DEBUG */
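The two helpers above exist to back assertions about reference ownership. A minimal usage sketch follows, assuming a hypothetical structure with a tracked refcount; refcount_held, refcount_not_held, refcount_remove and ASSERT are from the sources, while everything prefixed my_ is made up.

/*
 * Hedged sketch, not part of this commit: assert that the caller really
 * holds a reference under "tag" before releasing it (assuming the tag
 * holds a single reference).
 */
typedef struct my_buf {
	refcount_t	mb_holds;	/* created with refcount_create_tracked() */
} my_buf_t;

static void
my_buf_rele(my_buf_t *mb, void *tag)
{
	ASSERT(refcount_held(&mb->mb_holds, tag));
	(void) refcount_remove(&mb->mb_holds, tag);
	ASSERT(refcount_not_held(&mb->mb_holds, tag));
}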
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/rrwlock.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/rrwlock.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/rrwlock.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -160,8 +160,8 @@
refcount_destroy(&rrl->rr_linked_rcount);
}
-void
-rrw_enter_read(rrwlock_t *rrl, void *tag)
+static void
+rrw_enter_read_impl(rrwlock_t *rrl, boolean_t prio, void *tag)
{
mutex_enter(&rrl->rr_lock);
#if !defined(DEBUG) && defined(_KERNEL)
@@ -177,7 +177,7 @@
ASSERT(refcount_count(&rrl->rr_anon_rcount) >= 0);
while (rrl->rr_writer != NULL || (rrl->rr_writer_wanted &&
- refcount_is_zero(&rrl->rr_anon_rcount) &&
+ refcount_is_zero(&rrl->rr_anon_rcount) && !prio &&
rrn_find(rrl) == NULL))
cv_wait(&rrl->rr_cv, &rrl->rr_lock);
@@ -193,6 +193,25 @@
}
void
+rrw_enter_read(rrwlock_t *rrl, void *tag)
+{
+ rrw_enter_read_impl(rrl, B_FALSE, tag);
+}
+
+/*
+ * Take a read lock even if there are pending write lock requests.  If we want
+ * to take a lock reentrantly, but from different threads (that have a
+ * relationship to each other), the normal detection mechanism to overrule
+ * the pending writer does not work, so we have to give an explicit hint here.
+ */
+void
+rrw_enter_read_prio(rrwlock_t *rrl, void *tag)
+{
+ rrw_enter_read_impl(rrl, B_TRUE, tag);
+}
+
+
+void
rrw_enter_write(rrwlock_t *rrl)
{
mutex_enter(&rrl->rr_lock);
@@ -287,3 +306,91 @@
(void *)curthread, (void *)rn->rn_rrl);
}
}
+
+/*
+ * A reader-mostly lock implementation, tuned for highly parallel read
+ * acquisitions at the expense of writes, relative to the reader-writer
+ * locks above.
+ *
+ * The idea is to split a single busy lock into an array of locks, so that
+ * each reader locks only one of them for read, depending on the result of
+ * a simple hash function.  That proportionally reduces lock congestion.
+ * A writer, in turn, has to sequentially acquire write on all the locks.
+ * That makes write acquisition proportionally slower, but in the places
+ * where it is used (filesystem unmount) performance is not critical.
+ *
+ * All the functions below are direct wrappers around functions above.
+ */
+void
+rrm_init(rrmlock_t *rrl, boolean_t track_all)
+{
+ int i;
+
+ for (i = 0; i < RRM_NUM_LOCKS; i++)
+ rrw_init(&rrl->locks[i], track_all);
+}
+
+void
+rrm_destroy(rrmlock_t *rrl)
+{
+ int i;
+
+ for (i = 0; i < RRM_NUM_LOCKS; i++)
+ rrw_destroy(&rrl->locks[i]);
+}
+
+void
+rrm_enter(rrmlock_t *rrl, krw_t rw, void *tag)
+{
+ if (rw == RW_READER)
+ rrm_enter_read(rrl, tag);
+ else
+ rrm_enter_write(rrl);
+}
+
+/*
+ * This maps the current thread to a specific lock. Note that the lock
+ * must be released by the same thread that acquired it. We do this
+ * mapping by taking the thread pointer mod a prime number. We examine
+ * only the low 32 bits of the thread pointer, because 32-bit division
+ * is faster than 64-bit division, and the high 32 bits have little
+ * entropy anyway.
+ */
+#define RRM_TD_LOCK() (((uint32_t)(uintptr_t)(curthread)) % RRM_NUM_LOCKS)
+
+void
+rrm_enter_read(rrmlock_t *rrl, void *tag)
+{
+ rrw_enter_read(&rrl->locks[RRM_TD_LOCK()], tag);
+}
+
+void
+rrm_enter_write(rrmlock_t *rrl)
+{
+ int i;
+
+ for (i = 0; i < RRM_NUM_LOCKS; i++)
+ rrw_enter_write(&rrl->locks[i]);
+}
+
+void
+rrm_exit(rrmlock_t *rrl, void *tag)
+{
+ int i;
+
+ if (rrl->locks[0].rr_writer == curthread) {
+ for (i = 0; i < RRM_NUM_LOCKS; i++)
+ rrw_exit(&rrl->locks[i], tag);
+ } else {
+ rrw_exit(&rrl->locks[RRM_TD_LOCK()], tag);
+ }
+}
+
+boolean_t
+rrm_held(rrmlock_t *rrl, krw_t rw)
+{
+ if (rw == RW_WRITER) {
+ return (rrw_held(&rrl->locks[0], rw));
+ } else {
+ return (rrw_held(&rrl->locks[RRM_TD_LOCK()], rw));
+ }
+}
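The core trick in the rrm lock above is the per-thread slot selection. Below is a tiny standalone sketch of that hashing; the value 17 is an assumption meant to mirror RRM_NUM_LOCKS from the accompanying header and is not taken from this diff.

/*
 * Hedged sketch, not part of this commit: hash the current thread pointer
 * to one of N read locks.  Readers on different threads usually land on
 * different slots; a writer must take all N slots in order.
 */
#include <sys/stdint.h>

#define	MY_RRM_NUM_LOCKS	17	/* assumed to mirror RRM_NUM_LOCKS */

static inline unsigned
my_rrm_slot(const void *td)
{
	/* Use only the low 32 bits: cheaper division, same entropy. */
	return ((unsigned)((uint32_t)(uintptr_t)td % MY_RRM_NUM_LOCKS));
}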
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -24,6 +24,8 @@
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
* Portions Copyright 2011 iXsystems, Inc
* Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
*/
#include <sys/zfs_context.h>
@@ -211,12 +213,6 @@
{
sa_handle_t *hdl = buf;
- hdl->sa_bonus_tab = NULL;
- hdl->sa_spill_tab = NULL;
- hdl->sa_os = NULL;
- hdl->sa_userp = NULL;
- hdl->sa_bonus = NULL;
- hdl->sa_spill = NULL;
mutex_init(&hdl->sa_lock, NULL, MUTEX_DEFAULT, NULL);
return (0);
}
@@ -501,7 +497,7 @@
if (size == 0) {
blocksize = SPA_MINBLOCKSIZE;
- } else if (size > SPA_MAXBLOCKSIZE) {
+ } else if (size > SPA_OLD_MAXBLOCKSIZE) {
ASSERT(0);
return (SET_ERROR(EFBIG));
} else {
@@ -553,10 +549,9 @@
{
int var_size = 0;
int i;
- int j = -1;
int full_space;
int hdrsize;
- boolean_t done = B_FALSE;
+ int extra_hdrsize;
if (buftype == SA_BONUS && sa->sa_force_spill) {
*total = 0;
@@ -567,10 +562,9 @@
*index = -1;
*total = 0;
+ *will_spill = B_FALSE;
- if (buftype == SA_BONUS)
- *will_spill = B_FALSE;
-
+ extra_hdrsize = 0;
hdrsize = (SA_BONUSTYPE_FROM_DB(db) == DMU_OT_ZNODE) ? 0 :
sizeof (sa_hdr_phys_t);
@@ -582,8 +576,8 @@
*total = P2ROUNDUP(*total, 8);
*total += attr_desc[i].sa_length;
- if (done)
- goto next;
+ if (*will_spill)
+ continue;
is_var_sz = (SA_REGISTERED_LEN(sa, attr_desc[i].sa_attr) == 0);
if (is_var_sz) {
@@ -591,21 +585,28 @@
}
if (is_var_sz && var_size > 1) {
- if (P2ROUNDUP(hdrsize + sizeof (uint16_t), 8) +
+ /*
+ * Don't worry that the spill block might overflow.
+ * It will be resized if needed in sa_build_layouts().
+ */
+ if (buftype == SA_SPILL ||
+ P2ROUNDUP(hdrsize + sizeof (uint16_t), 8) +
*total < full_space) {
/*
* Account for header space used by array of
* optional sizes of variable-length attributes.
- * Record the index in case this increase needs
- * to be reversed due to spill-over.
+ * Record the extra header size in case this
+ * increase needs to be reversed due to
+ * spill-over.
*/
hdrsize += sizeof (uint16_t);
- j = i;
+ if (*index != -1)
+ extra_hdrsize += sizeof (uint16_t);
} else {
- done = B_TRUE;
- *index = i;
- if (buftype == SA_BONUS)
- *will_spill = B_TRUE;
+ ASSERT(buftype == SA_BONUS);
+ if (*index == -1)
+ *index = i;
+ *will_spill = B_TRUE;
continue;
}
}
@@ -620,22 +621,15 @@
(*total + P2ROUNDUP(hdrsize, 8)) >
(full_space - sizeof (blkptr_t))) {
*index = i;
- done = B_TRUE;
}
-next:
if ((*total + P2ROUNDUP(hdrsize, 8)) > full_space &&
buftype == SA_BONUS)
*will_spill = B_TRUE;
}
- /*
- * j holds the index of the last variable-sized attribute for
- * which hdrsize was increased. Reverse the increase if that
- * attribute will be relocated to the spill block.
- */
- if (*will_spill && j == *index)
- hdrsize -= sizeof (uint16_t);
+ if (*will_spill)
+ hdrsize -= extra_hdrsize;
hdrsize = P2ROUNDUP(hdrsize, 8);
return (hdrsize);
@@ -676,7 +670,7 @@
hdrsize = sa_find_sizes(sa, attr_desc, attr_count, hdl->sa_bonus,
SA_BONUS, &i, &used, &spilling);
- if (used > SPA_MAXBLOCKSIZE)
+ if (used > SPA_OLD_MAXBLOCKSIZE)
return (SET_ERROR(EFBIG));
VERIFY(0 == dmu_set_bonus(hdl->sa_bonus, spilling ?
@@ -700,7 +694,7 @@
attr_count - i, hdl->sa_spill, SA_SPILL, &i,
&spill_used, &dummy);
- if (spill_used > SPA_MAXBLOCKSIZE)
+ if (spill_used > SPA_OLD_MAXBLOCKSIZE)
return (SET_ERROR(EFBIG));
buf_space = hdl->sa_spill->db_size - spillhdrsize;
@@ -1112,6 +1106,9 @@
if (sa->sa_user_table)
kmem_free(sa->sa_user_table, sa->sa_user_table_sz);
mutex_exit(&sa->sa_lock);
+ avl_destroy(&sa->sa_layout_hash_tree);
+ avl_destroy(&sa->sa_layout_num_tree);
+ mutex_destroy(&sa->sa_lock);
kmem_free(sa, sizeof (sa_os_t));
return ((error == ECKSUM) ? EIO : error);
}
@@ -1147,6 +1144,7 @@
avl_destroy(&sa->sa_layout_hash_tree);
avl_destroy(&sa->sa_layout_num_tree);
+ mutex_destroy(&sa->sa_lock);
kmem_free(sa, sizeof (sa_os_t));
os->os_sa = NULL;
@@ -1301,10 +1299,10 @@
}
/*ARGSUSED*/
-void
-sa_evict(dmu_buf_t *db, void *sap)
+static void
+sa_evict(void *dbu)
{
- panic("evicting sa dbuf %p\n", (void *)db);
+ panic("evicting sa dbuf\n");
}
static void
@@ -1343,18 +1341,16 @@
void
sa_handle_destroy(sa_handle_t *hdl)
{
+ dmu_buf_t *db = hdl->sa_bonus;
+
mutex_enter(&hdl->sa_lock);
- (void) dmu_buf_update_user((dmu_buf_t *)hdl->sa_bonus, hdl,
- NULL, NULL, NULL);
+ (void) dmu_buf_remove_user(db, &hdl->sa_dbu);
- if (hdl->sa_bonus_tab) {
+ if (hdl->sa_bonus_tab)
sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab);
- hdl->sa_bonus_tab = NULL;
- }
- if (hdl->sa_spill_tab) {
+
+ if (hdl->sa_spill_tab)
sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab);
- hdl->sa_spill_tab = NULL;
- }
dmu_buf_rele(hdl->sa_bonus, NULL);
@@ -1371,7 +1367,7 @@
{
int error = 0;
dmu_object_info_t doi;
- sa_handle_t *handle;
+ sa_handle_t *handle = NULL;
#ifdef ZFS_DEBUG
dmu_object_info_from_db(db, &doi);
@@ -1381,23 +1377,31 @@
/* find handle, if it exists */
/* if one doesn't exist then create a new one, and initialize it */
- handle = (hdl_type == SA_HDL_SHARED) ? dmu_buf_get_user(db) : NULL;
+ if (hdl_type == SA_HDL_SHARED)
+ handle = dmu_buf_get_user(db);
+
if (handle == NULL) {
- sa_handle_t *newhandle;
+ sa_handle_t *winner = NULL;
+
handle = kmem_cache_alloc(sa_cache, KM_SLEEP);
+ handle->sa_dbu.dbu_evict_func = NULL;
handle->sa_userp = userp;
handle->sa_bonus = db;
handle->sa_os = os;
handle->sa_spill = NULL;
+ handle->sa_bonus_tab = NULL;
+ handle->sa_spill_tab = NULL;
error = sa_build_index(handle, SA_BONUS);
- newhandle = (hdl_type == SA_HDL_SHARED) ?
- dmu_buf_set_user_ie(db, handle,
- NULL, sa_evict) : NULL;
- if (newhandle != NULL) {
+ if (hdl_type == SA_HDL_SHARED) {
+ dmu_buf_init_user(&handle->sa_dbu, sa_evict, NULL);
+ winner = dmu_buf_set_user_ie(db, &handle->sa_dbu);
+ }
+
+ if (winner != NULL) {
kmem_cache_free(sa_cache, handle);
- handle = newhandle;
+ handle = winner;
}
}
*handlepp = handle;
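The shared-handle branch above uses a winner/loser race: build a candidate handle, try to publish it on the dbuf, and free the candidate if another thread published one first. A generic sketch of that pattern follows with entirely hypothetical names; it is not the DMU user-callback API itself.

/*
 * Hedged sketch, not part of this commit: my_attach_user() is assumed to
 * install "cand" atomically and return NULL on success, or the object that
 * was already installed if it lost the race.
 */
typedef struct my_buf my_buf_t;
typedef struct my_handle my_handle_t;

extern my_handle_t *my_lookup_user(my_buf_t *);
extern my_handle_t *my_attach_user(my_buf_t *, my_handle_t *);
extern my_handle_t *my_handle_alloc(my_buf_t *);
extern void my_handle_free(my_handle_t *);

static my_handle_t *
my_get_shared_handle(my_buf_t *db)
{
	my_handle_t *h = my_lookup_user(db);

	if (h != NULL)
		return (h);		/* already attached; reuse it */

	my_handle_t *cand = my_handle_alloc(db);
	my_handle_t *winner = my_attach_user(db, cand);
	if (winner != NULL) {
		my_handle_free(cand);	/* lost the race; use the winner's */
		return (winner);
	}
	return (cand);
}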
@@ -1650,7 +1654,7 @@
int spill_data_size = 0;
int spill_attr_count = 0;
int error;
- uint16_t length;
+ uint16_t length, reg_length;
int i, j, k, length_idx;
sa_hdr_phys_t *hdr;
sa_idx_tab_t *idx_tab;
@@ -1710,34 +1714,50 @@
hdr = SA_GET_HDR(hdl, SA_BONUS);
idx_tab = SA_IDX_TAB_GET(hdl, SA_BONUS);
for (; k != 2; k++) {
- /* iterate over each attribute in layout */
+ /*
+ * Iterate over each attribute in layout. Fetch the
+ * size of variable-length attributes needing rewrite
+ * from sa_lengths[].
+ */
for (i = 0, length_idx = 0; i != count; i++) {
sa_attr_type_t attr;
attr = idx_tab->sa_layout->lot_attrs[i];
+ reg_length = SA_REGISTERED_LEN(sa, attr);
+ if (reg_length == 0) {
+ length = hdr->sa_lengths[length_idx];
+ length_idx++;
+ } else {
+ length = reg_length;
+ }
if (attr == newattr) {
- /* duplicate attributes are not allowed */
- ASSERT(action == SA_REPLACE ||
- action == SA_REMOVE);
- /* must be variable-sized to be replaced here */
- if (action == SA_REPLACE) {
- ASSERT(SA_REGISTERED_LEN(sa, attr) == 0);
- SA_ADD_BULK_ATTR(attr_desc, j, attr,
- locator, datastart, buflen);
- }
- } else {
- length = SA_REGISTERED_LEN(sa, attr);
- if (length == 0) {
- length = hdr->sa_lengths[length_idx];
- }
+ /*
+ * There is nothing to do for SA_REMOVE,
+ * so it is just skipped.
+ */
+ if (action == SA_REMOVE)
+ continue;
+ /*
+ * Duplicate attributes are not allowed, so the
+ * action can not be SA_ADD here.
+ */
+ ASSERT3S(action, ==, SA_REPLACE);
+
+ /*
+ * Only a variable-sized attribute can be
+ * replaced here, and its size must be changing.
+ */
+ ASSERT3U(reg_length, ==, 0);
+ ASSERT3U(length, !=, buflen);
SA_ADD_BULK_ATTR(attr_desc, j, attr,
+ locator, datastart, buflen);
+ } else {
+ SA_ADD_BULK_ATTR(attr_desc, j, attr,
NULL, (void *)
(TOC_OFF(idx_tab->sa_idx_tab[attr]) +
(uintptr_t)old_data[k]), length);
}
- if (SA_REGISTERED_LEN(sa, attr) == 0)
- length_idx++;
}
if (k == 0 && hdl->sa_spill) {
hdr = SA_GET_HDR(hdl, SA_SPILL);
@@ -1748,10 +1768,8 @@
}
}
if (action == SA_ADD) {
- length = SA_REGISTERED_LEN(sa, newattr);
- if (length == 0) {
- length = buflen;
- }
+ reg_length = SA_REGISTERED_LEN(sa, newattr);
+ IMPLY(reg_length != 0, reg_length == buflen);
SA_ADD_BULK_ATTR(attr_desc, j, newattr, locator,
datastart, buflen);
}
@@ -1915,14 +1933,6 @@
}
void
-sa_update_user(sa_handle_t *newhdl, sa_handle_t *oldhdl)
-{
- (void) dmu_buf_update_user((dmu_buf_t *)newhdl->sa_bonus,
- oldhdl, newhdl, NULL, sa_evict);
- oldhdl->sa_bonus = NULL;
-}
-
-void
sa_set_userp(sa_handle_t *hdl, void *ptr)
{
hdl->sa_userp = ptr;
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sha256.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -23,16 +23,21 @@
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
+/*
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ */
#include <sys/zfs_context.h>
#include <sys/zio.h>
#ifdef _KERNEL
-#include <crypto/sha2/sha2.h>
+#include <crypto/sha2/sha256.h>
#else
#include <sha256.h>
#endif
+/*ARGSUSED*/
void
-zio_checksum_SHA256(const void *buf, uint64_t size, zio_cksum_t *zcp)
+zio_checksum_SHA256(const void *buf, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
{
SHA256_CTX ctx;
zio_cksum_t tmp;
@@ -53,3 +58,31 @@
zcp->zc_word[2] = BE_64(tmp.zc_word[2]);
zcp->zc_word[3] = BE_64(tmp.zc_word[3]);
}
+
+#ifdef illumos
+/*ARGSUSED*/
+void
+zio_checksum_SHA512_native(const void *buf, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ SHA2_CTX ctx;
+
+ SHA2Init(SHA512_256, &ctx);
+ SHA2Update(&ctx, buf, size);
+ SHA2Final(zcp, &ctx);
+}
+
+/*ARGSUSED*/
+void
+zio_checksum_SHA512_byteswap(const void *buf, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
+{
+ zio_cksum_t tmp;
+
+ zio_checksum_SHA512_native(buf, size, ctx_template, &tmp);
+ zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]);
+ zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]);
+ zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]);
+ zcp->zc_word[3] = BSWAP_64(tmp.zc_word[3]);
+}
+#endif
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -22,9 +22,12 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
- * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2015, Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2013 Martin Matuska <mm at FreeBSD.org>. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
*/
/*
@@ -84,11 +87,6 @@
/* Check hostid on import? */
static int check_hostid = 1;
-SYSCTL_DECL(_vfs_zfs);
-TUNABLE_INT("vfs.zfs.check_hostid", &check_hostid);
-SYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RW, &check_hostid, 0,
- "Check hostid on import?");
-
/*
* The interval, in seconds, at which failed configuration cache file writes
* should be retried.
@@ -95,24 +93,33 @@
*/
static int zfs_ccw_retry_interval = 300;
+SYSCTL_DECL(_vfs_zfs);
+TUNABLE_INT("vfs.zfs.check_hostid", &check_hostid);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RWTUN, &check_hostid, 0,
+ "Check hostid on import?");
+TUNABLE_INT("vfs.zfs.ccw_retry_interval", &zfs_ccw_retry_interval);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, ccw_retry_interval, CTLFLAG_RW,
+ &zfs_ccw_retry_interval, 0,
+ "Configuration cache file write, retry after failure, interval (seconds)");
+
typedef enum zti_modes {
- zti_mode_fixed, /* value is # of threads (min 1) */
- zti_mode_online_percent, /* value is % of online CPUs */
- zti_mode_batch, /* cpu-intensive; value is ignored */
- zti_mode_null, /* don't create a taskq */
- zti_nmodes
+ ZTI_MODE_FIXED, /* value is # of threads (min 1) */
+ ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */
+ ZTI_MODE_NULL, /* don't create a taskq */
+ ZTI_NMODES
} zti_modes_t;
-#define ZTI_FIX(n) { zti_mode_fixed, (n) }
-#define ZTI_PCT(n) { zti_mode_online_percent, (n) }
-#define ZTI_BATCH { zti_mode_batch, 0 }
-#define ZTI_NULL { zti_mode_null, 0 }
+#define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) }
+#define ZTI_BATCH { ZTI_MODE_BATCH, 0, 1 }
+#define ZTI_NULL { ZTI_MODE_NULL, 0, 0 }
-#define ZTI_ONE ZTI_FIX(1)
+#define ZTI_N(n) ZTI_P(n, 1)
+#define ZTI_ONE ZTI_N(1)
typedef struct zio_taskq_info {
- enum zti_modes zti_mode;
+ zti_modes_t zti_mode;
uint_t zti_value;
+ uint_t zti_count;
} zio_taskq_info_t;
static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
@@ -120,19 +127,34 @@
};
/*
- * Define the taskq threads for the following I/O types:
- * NULL, READ, WRITE, FREE, CLAIM, and IOCTL
+ * This table defines the taskq settings for each ZFS I/O type. When
+ * initializing a pool, we use this table to create an appropriately sized
+ * taskq. Some operations are low volume and therefore have a small, static
+ * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
+ * macros. Other operations process a large amount of data; the ZTI_BATCH
+ * macro causes us to create a taskq oriented for throughput. Some operations
+ * are so high-frequency and short-lived that the taskq itself can become a
+ * point of lock contention.  The ZTI_P(#, #) macro indicates that we need an
+ * additional degree of parallelism, specified by the number of threads per
+ * taskq and the number of taskqs; when dispatching an event in this case, the
+ * particular taskq is chosen at random.
+ *
+ * The different taskq priorities are to handle the different contexts (issue
+ * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
+ * need to be handled with minimum delay.
*/
const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
/* ISSUE ISSUE_HIGH INTR INTR_HIGH */
- { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL },
- { ZTI_FIX(8), ZTI_NULL, ZTI_BATCH, ZTI_NULL },
- { ZTI_BATCH, ZTI_FIX(5), ZTI_FIX(8), ZTI_FIX(5) },
- { ZTI_FIX(100), ZTI_NULL, ZTI_ONE, ZTI_NULL },
- { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL },
- { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL },
+ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */
+ { ZTI_N(8), ZTI_NULL, ZTI_P(12, 8), ZTI_NULL }, /* READ */
+ { ZTI_BATCH, ZTI_N(5), ZTI_N(8), ZTI_N(5) }, /* WRITE */
+ { ZTI_P(12, 8), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */
+ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */
+ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */
};
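Read horizontally, each row of the table above is one I/O type and each column one taskq context. The short illustration below is hedged and not part of this commit: print_zti is made up; zio_taskqs, zio_taskq_types and ZTI_MODE_NULL appear in this file, and zio_type_name is assumed to be the usual name array from zio.c.

/*
 * Hedged sketch: dump how many taskqs and threads (or the percentage, for
 * ZTI_MODE_BATCH) a given table entry asks for.  For example, the READ/INTR
 * entry ZTI_P(12, 8) requests 8 taskqs of 12 threads each, and dispatch
 * later picks one of the 8 at random.
 */
static void
print_zti(zio_type_t t, zio_taskq_type_t q)
{
	const zio_taskq_info_t *ztip = &zio_taskqs[t][q];

	if (ztip->zti_mode == ZTI_MODE_NULL) {
		printf("%s_%s: no taskq\n",
		    zio_type_name[t], zio_taskq_types[q]);
		return;
	}
	printf("%s_%s: %u taskq(s), value %u\n",
	    zio_type_name[t], zio_taskq_types[q],
	    ztip->zti_count, ztip->zti_value);
}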
+static sysevent_t *spa_event_create(spa_t *spa, vdev_t *vd, const char *name);
+static void spa_event_post(sysevent_t *ev);
static void spa_sync_version(void *arg, dmu_tx_t *tx);
static void spa_sync_props(void *arg, dmu_tx_t *tx);
static boolean_t spa_has_active_shared_spare(spa_t *spa);
@@ -141,22 +163,18 @@
char **ereport);
static void spa_vdev_resilver_done(spa_t *spa);
-uint_t zio_taskq_batch_pct = 100; /* 1 thread per cpu in pset */
+uint_t zio_taskq_batch_pct = 75; /* 1 thread per cpu in pset */
#ifdef PSRSET_BIND
id_t zio_taskq_psrset_bind = PS_NONE;
#endif
#ifdef SYSDC
boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */
+uint_t zio_taskq_basedc = 80; /* base duty cycle */
#endif
-uint_t zio_taskq_basedc = 80; /* base duty cycle */
boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */
extern int zfs_sync_pass_deferred_free;
-#ifndef illumos
-extern void spa_deadman(void *arg);
-#endif
-
/*
* This (illegal) pool name is used when temporarily importing a spa_t in order
* to get the vdev stats associated with the imported devices.
@@ -199,12 +217,10 @@
{
vdev_t *rvd = spa->spa_root_vdev;
dsl_pool_t *pool = spa->spa_dsl_pool;
- uint64_t size;
- uint64_t alloc;
- uint64_t space;
- uint64_t cap, version;
+ uint64_t size, alloc, cap, version;
zprop_source_t src = ZPROP_SRC_NONE;
spa_config_dirent_t *dp;
+ metaslab_class_t *mc = spa_normal_class(spa);
ASSERT(MUTEX_HELD(&spa->spa_props_lock));
@@ -217,14 +233,10 @@
spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
size - alloc, src);
- space = 0;
- for (int c = 0; c < rvd->vdev_children; c++) {
- vdev_t *tvd = rvd->vdev_child[c];
- space += tvd->vdev_max_asize - tvd->vdev_asize;
- }
- spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, space,
- src);
-
+ spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL,
+ metaslab_class_fragmentation(mc), src);
+ spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL,
+ metaslab_class_expandable_space(mc), src);
spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
(spa_mode(spa) == FREAD), src);
@@ -246,19 +258,27 @@
}
if (pool != NULL) {
- dsl_dir_t *freedir = pool->dp_free_dir;
-
/*
* The $FREE directory was introduced in SPA_VERSION_DEADLISTS,
* when opening pools before this version freedir will be NULL.
*/
- if (freedir != NULL) {
+ if (pool->dp_free_dir != NULL) {
spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
- freedir->dd_phys->dd_used_bytes, src);
+ dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes,
+ src);
} else {
spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
NULL, 0, src);
}
+
+ if (pool->dp_leak_dir != NULL) {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL,
+ dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes,
+ src);
+ } else {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED,
+ NULL, 0, src);
+ }
}
spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);
@@ -272,6 +292,14 @@
spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
0, ZPROP_SRC_LOCAL);
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
+ MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE);
+ } else {
+ spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
+ SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE);
+ }
+
if ((dp = list_head(&spa->spa_config_list)) != NULL) {
if (dp->scd_path == NULL) {
spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
@@ -342,8 +370,7 @@
break;
}
- strval = kmem_alloc(
- MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
+ strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN,
KM_SLEEP);
dsl_dataset_name(ds, strval);
dsl_dataset_rele(ds, FTAG);
@@ -356,8 +383,7 @@
spa_prop_add_list(*nvp, prop, strval, intval, src);
if (strval != NULL)
- kmem_free(strval,
- MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);
+ kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN);
break;
@@ -486,7 +512,7 @@
if (!error) {
objset_t *os;
- uint64_t compress;
+ uint64_t propval;
if (strval == NULL || strval[0] == '\0') {
objnum = zpool_prop_default_numeric(
@@ -497,7 +523,11 @@
if (error = dmu_objset_hold(strval, FTAG, &os))
break;
- /* Must be ZPL and not gzip compressed. */
+ /*
+ * Must be ZPL, and its property settings
+ * must be supported by GRUB (compression
+ * is not gzip, and large blocks are not used).
+ */
if (dmu_objset_type(os) != DMU_OST_ZFS) {
error = SET_ERROR(ENOTSUP);
@@ -504,9 +534,15 @@
} else if ((error =
dsl_prop_get_int_ds(dmu_objset_ds(os),
zfs_prop_to_name(ZFS_PROP_COMPRESSION),
- &compress)) == 0 &&
- !BOOTFS_COMPRESS_VALID(compress)) {
+ &propval)) == 0 &&
+ !BOOTFS_COMPRESS_VALID(propval)) {
error = SET_ERROR(ENOTSUP);
+ } else if ((error =
+ dsl_prop_get_int_ds(dmu_objset_ds(os),
+ zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
+ &propval)) == 0 &&
+ propval > SPA_OLD_MAXBLOCKSIZE) {
+ error = SET_ERROR(ENOTSUP);
} else {
objnum = dmu_objset_id(os);
}
@@ -573,7 +609,6 @@
error = SET_ERROR(EINVAL);
break;
}
- check++;
}
if (strlen(strval) > ZPROP_MAX_COMMENT)
error = E2BIG;
@@ -672,7 +707,8 @@
* feature descriptions object.
*/
error = dsl_sync_task(spa->spa_name, NULL,
- spa_sync_version, &ver, 6);
+ spa_sync_version, &ver,
+ 6, ZFS_SPACE_CHECK_RESERVED);
if (error)
return (error);
continue;
@@ -684,7 +720,7 @@
if (need_sync) {
return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props,
- nvp, 6));
+ nvp, 6, ZFS_SPACE_CHECK_RESERVED));
}
return (0);
@@ -760,11 +796,12 @@
int error;
uint64_t guid;
+ mutex_enter(&spa->spa_vdev_top_lock);
mutex_enter(&spa_namespace_lock);
guid = spa_generate_guid(NULL);
error = dsl_sync_task(spa->spa_name, spa_change_guid_check,
- spa_change_guid_sync, &guid, 5);
+ spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED);
if (error == 0) {
spa_config_sync(spa, B_FALSE, B_TRUE);
@@ -772,6 +809,7 @@
}
mutex_exit(&spa_namespace_lock);
+ mutex_exit(&spa->spa_vdev_top_lock);
return (error);
}
@@ -790,7 +828,7 @@
int ret;
ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
- sizeof (zbookmark_t));
+ sizeof (zbookmark_phys_t));
if (ret < 0)
return (-1);
@@ -820,50 +858,141 @@
offsetof(spa_error_entry_t, se_avl));
}
-static taskq_t *
-spa_taskq_create(spa_t *spa, const char *name, enum zti_modes mode,
- uint_t value)
+static void
+spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
{
- uint_t flags = TASKQ_PREPOPULATE;
+ const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
+ enum zti_modes mode = ztip->zti_mode;
+ uint_t value = ztip->zti_value;
+ uint_t count = ztip->zti_count;
+ spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
+ char name[32];
+ uint_t flags = 0;
boolean_t batch = B_FALSE;
+ if (mode == ZTI_MODE_NULL) {
+ tqs->stqs_count = 0;
+ tqs->stqs_taskq = NULL;
+ return;
+ }
+
+ ASSERT3U(count, >, 0);
+
+ tqs->stqs_count = count;
+ tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);
+
switch (mode) {
- case zti_mode_null:
- return (NULL); /* no taskq needed */
-
- case zti_mode_fixed:
+ case ZTI_MODE_FIXED:
ASSERT3U(value, >=, 1);
value = MAX(value, 1);
break;
- case zti_mode_batch:
+ case ZTI_MODE_BATCH:
batch = B_TRUE;
flags |= TASKQ_THREADS_CPU_PCT;
value = zio_taskq_batch_pct;
break;
- case zti_mode_online_percent:
- flags |= TASKQ_THREADS_CPU_PCT;
- break;
-
default:
- panic("unrecognized mode for %s taskq (%u:%u) in "
+ panic("unrecognized mode for %s_%s taskq (%u:%u) in "
"spa_activate()",
- name, mode, value);
+ zio_type_name[t], zio_taskq_types[q], mode, value);
break;
}
+ for (uint_t i = 0; i < count; i++) {
+ taskq_t *tq;
+
+ if (count > 1) {
+ (void) snprintf(name, sizeof (name), "%s_%s_%u",
+ zio_type_name[t], zio_taskq_types[q], i);
+ } else {
+ (void) snprintf(name, sizeof (name), "%s_%s",
+ zio_type_name[t], zio_taskq_types[q]);
+ }
+
#ifdef SYSDC
- if (zio_taskq_sysdc && spa->spa_proc != &p0) {
- if (batch)
- flags |= TASKQ_DC_BATCH;
+ if (zio_taskq_sysdc && spa->spa_proc != &p0) {
+ if (batch)
+ flags |= TASKQ_DC_BATCH;
- return (taskq_create_sysdc(name, value, 50, INT_MAX,
- spa->spa_proc, zio_taskq_basedc, flags));
+ tq = taskq_create_sysdc(name, value, 50, INT_MAX,
+ spa->spa_proc, zio_taskq_basedc, flags);
+ } else {
+#endif
+ pri_t pri = maxclsyspri;
+ /*
+ * The write issue taskq can be extremely CPU
+ * intensive. Run it at slightly lower priority
+ * than the other taskqs.
+ * FreeBSD notes:
+ * - numerically higher priorities are lower priorities;
+ * - if priorities divided by four (RQ_PPQ) are equal
+ * then a difference between them is insignificant.
+ */
+ if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE)
+#ifdef illumos
+ pri--;
+#else
+ pri += 4;
+#endif
+
+ tq = taskq_create_proc(name, value, pri, 50,
+ INT_MAX, spa->spa_proc, flags);
+#ifdef SYSDC
+ }
+#endif
+
+ tqs->stqs_taskq[i] = tq;
}
+}
+
+static void
+spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
+{
+ spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
+
+ if (tqs->stqs_taskq == NULL) {
+ ASSERT0(tqs->stqs_count);
+ return;
+ }
+
+ for (uint_t i = 0; i < tqs->stqs_count; i++) {
+ ASSERT3P(tqs->stqs_taskq[i], !=, NULL);
+ taskq_destroy(tqs->stqs_taskq[i]);
+ }
+
+ kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *));
+ tqs->stqs_taskq = NULL;
+}
+
+/*
+ * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
+ * Note that a type may have multiple discrete taskqs to avoid lock contention
+ * on the taskq itself.  In that case we choose which taskq to use at random,
+ * using the low bits of gethrtime().
+ */
+void
+spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
+ task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent)
+{
+ spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
+ taskq_t *tq;
+
+ ASSERT3P(tqs->stqs_taskq, !=, NULL);
+ ASSERT3U(tqs->stqs_count, !=, 0);
+
+ if (tqs->stqs_count == 1) {
+ tq = tqs->stqs_taskq[0];
+ } else {
+#ifdef _KERNEL
+ tq = tqs->stqs_taskq[cpu_ticks() % tqs->stqs_count];
+#else
+ tq = tqs->stqs_taskq[gethrtime() % tqs->stqs_count];
#endif
- return (taskq_create_proc(name, value, maxclsyspri, 50, INT_MAX,
- spa->spa_proc, flags));
+ }
+
+ taskq_dispatch_ent(tq, func, arg, flags, ent);
}
static void
@@ -871,16 +1000,7 @@
{
for (int t = 0; t < ZIO_TYPES; t++) {
for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
- const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
- enum zti_modes mode = ztip->zti_mode;
- uint_t value = ztip->zti_value;
- char name[32];
-
- (void) snprintf(name, sizeof (name),
- "%s_%s", zio_type_name[t], zio_taskq_types[q]);
-
- spa->spa_zio_taskq[t][q] =
- spa_taskq_create(spa, name, mode, value);
+ spa_taskqs_init(spa, t, q);
}
}
}
@@ -1018,6 +1138,8 @@
list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
offsetof(vdev_t, vdev_config_dirty_node));
+ list_create(&spa->spa_evicting_os_list, sizeof (objset_t),
+ offsetof(objset_t, os_evicting_node));
list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
offsetof(vdev_t, vdev_state_dirty_node));
@@ -1050,16 +1172,17 @@
*/
trim_thread_destroy(spa);
+ spa_evicting_os_wait(spa);
+
txg_list_destroy(&spa->spa_vdev_txg_list);
list_destroy(&spa->spa_config_dirty_list);
+ list_destroy(&spa->spa_evicting_os_list);
list_destroy(&spa->spa_state_dirty_list);
for (int t = 0; t < ZIO_TYPES; t++) {
for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
- if (spa->spa_zio_taskq[t][q] != NULL)
- taskq_destroy(spa->spa_zio_taskq[t][q]);
- spa->spa_zio_taskq[t][q] = NULL;
+ spa_taskqs_fini(spa, t, q);
}
}
@@ -1187,13 +1310,24 @@
* Wait for any outstanding async I/O to complete.
*/
if (spa->spa_async_zio_root != NULL) {
- (void) zio_wait(spa->spa_async_zio_root);
+ for (int i = 0; i < max_ncpus; i++)
+ (void) zio_wait(spa->spa_async_zio_root[i]);
+ kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *));
spa->spa_async_zio_root = NULL;
}
bpobj_close(&spa->spa_deferred_bpobj);
+ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
+
/*
+ * Close all vdevs.
+ */
+ if (spa->spa_root_vdev)
+ vdev_free(spa->spa_root_vdev);
+ ASSERT(spa->spa_root_vdev == NULL);
+
+ /*
* Close the dsl pool.
*/
if (spa->spa_dsl_pool) {
@@ -1204,20 +1338,11 @@
ddt_unload(spa);
- spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
-
/*
* Drop and purge level 2 cache
*/
spa_l2cache_drop(spa);
- /*
- * Close all vdevs.
- */
- if (spa->spa_root_vdev)
- vdev_free(spa->spa_root_vdev);
- ASSERT(spa->spa_root_vdev == NULL);
-
for (i = 0; i < spa->spa_spares.sav_count; i++)
vdev_free(spa->spa_spares.sav_vdevs[i]);
if (spa->spa_spares.sav_vdevs) {
@@ -1511,7 +1636,10 @@
int error;
*value = NULL;
- VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
+ error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db);
+ if (error != 0)
+ return (error);
+
nvsize = *(uint64_t *)db->db_data;
dmu_buf_rele(db, FTAG);
@@ -1671,13 +1799,14 @@
spa_check_logs(spa_t *spa)
{
boolean_t rv = B_FALSE;
+ dsl_pool_t *dp = spa_get_dsl(spa);
switch (spa->spa_log_state) {
case SPA_LOG_MISSING:
/* need to recheck in case slog has been restored */
case SPA_LOG_UNKNOWN:
- rv = (dmu_objset_find(spa->spa_name, zil_check_log_chain,
- NULL, DS_FIND_CHILDREN) != 0);
+ rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
+ zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0);
if (rv)
spa_set_log_state(spa, SPA_LOG_MISSING);
break;
@@ -1778,35 +1907,88 @@
spa_load_error_t *sle = zio->io_private;
dmu_object_type_t type = BP_GET_TYPE(bp);
int error = zio->io_error;
+ spa_t *spa = zio->io_spa;
if (error) {
if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
type != DMU_OT_INTENT_LOG)
- atomic_add_64(&sle->sle_meta_count, 1);
+ atomic_inc_64(&sle->sle_meta_count);
else
- atomic_add_64(&sle->sle_data_count, 1);
+ atomic_inc_64(&sle->sle_data_count);
}
zio_data_buf_free(zio->io_data, zio->io_size);
+
+ mutex_enter(&spa->spa_scrub_lock);
+ spa->spa_scrub_inflight--;
+ cv_broadcast(&spa->spa_scrub_io_cv);
+ mutex_exit(&spa->spa_scrub_lock);
}
+/*
+ * Maximum number of concurrent scrub i/os to create while verifying
+ * a pool while importing it.
+ */
+int spa_load_verify_maxinflight = 10000;
+boolean_t spa_load_verify_metadata = B_TRUE;
+boolean_t spa_load_verify_data = B_TRUE;
+
+SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_maxinflight, CTLFLAG_RWTUN,
+ &spa_load_verify_maxinflight, 0,
+ "Maximum number of concurrent scrub I/Os to create while verifying a "
+ "pool while importing it");
+
+SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_metadata, CTLFLAG_RWTUN,
+ &spa_load_verify_metadata, 0,
+ "Check metadata on import?");
+
+SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_data, CTLFLAG_RWTUN,
+ &spa_load_verify_data, 0,
+ "Check user data on import?");
+
/*ARGSUSED*/
static int
spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
- const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
+ const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
- if (bp != NULL) {
- zio_t *rio = arg;
- size_t size = BP_GET_PSIZE(bp);
- void *data = zio_data_buf_alloc(size);
+ if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
+ return (0);
+ /*
+ * Note: normally this routine will not be called if
+ * spa_load_verify_metadata is not set. However, it may be useful
+ * to manually set the flag after the traversal has begun.
+ */
+ if (!spa_load_verify_metadata)
+ return (0);
+ if (BP_GET_BUFC_TYPE(bp) == ARC_BUFC_DATA && !spa_load_verify_data)
+ return (0);
- zio_nowait(zio_read(rio, spa, bp, data, size,
- spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
- ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
- ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
- }
+ zio_t *rio = arg;
+ size_t size = BP_GET_PSIZE(bp);
+ void *data = zio_data_buf_alloc(size);
+
+ mutex_enter(&spa->spa_scrub_lock);
+ while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight)
+ cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
+ spa->spa_scrub_inflight++;
+ mutex_exit(&spa->spa_scrub_lock);
+
+ zio_nowait(zio_read(rio, spa, bp, data, size,
+ spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
+ ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
return (0);
}
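spa_load_verify_cb above bounds the number of scrub reads in flight with a mutex/condvar pair. A minimal standalone sketch of that throttle pattern follows; every my_-prefixed name is hypothetical, while mutex_enter, cv_wait and cv_broadcast are the usual Solaris/FreeBSD primitives used elsewhere in these files.

/*
 * Hedged sketch, not part of this commit: an issuer waits while the
 * in-flight count is at the limit; each completion decrements the count
 * and wakes the waiters.
 */
static kmutex_t my_lock;
static kcondvar_t my_cv;
static int my_inflight;
static int my_maxinflight = 10000;	/* mirrors spa_load_verify_maxinflight */

static void
my_issue_one(void)
{
	mutex_enter(&my_lock);
	while (my_inflight >= my_maxinflight)
		cv_wait(&my_cv, &my_lock);
	my_inflight++;
	mutex_exit(&my_lock);
	/* ... fire off the asynchronous read here ... */
}

static void
my_read_done(void)
{
	mutex_enter(&my_lock);
	my_inflight--;
	cv_broadcast(&my_cv);
	mutex_exit(&my_lock);
}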
+/* ARGSUSED */
+int
+verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
+{
+ if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN)
+ return (SET_ERROR(ENAMETOOLONG));
+
+ return (0);
+}
+
static int
spa_load_verify(spa_t *spa)
{
@@ -1814,7 +1996,7 @@
spa_load_error_t sle = { 0 };
zpool_rewind_policy_t policy;
boolean_t verify_ok = B_FALSE;
- int error;
+ int error = 0;
zpool_get_rewind_policy(spa->spa_config, &policy);
@@ -1821,11 +2003,22 @@
if (policy.zrp_request & ZPOOL_NEVER_REWIND)
return (0);
+ dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);
+ error = dmu_objset_find_dp(spa->spa_dsl_pool,
+ spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL,
+ DS_FIND_CHILDREN);
+ dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
+ if (error != 0)
+ return (error);
+
rio = zio_root(spa, NULL, &sle,
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
- error = traverse_pool(spa, spa->spa_verify_min_txg,
- TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio);
+ if (spa_load_verify_metadata) {
+ error = traverse_pool(spa, spa->spa_verify_min_txg,
+ TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA,
+ spa_load_verify_cb, rio);
+ }
(void) zio_wait(rio);
@@ -2018,6 +2211,11 @@
mosconfig, &ereport);
}
+ /*
+ * Don't count references from objsets that are already closed
+ * and are making their way through the eviction process.
+ */
+ spa_evicting_os_wait(spa);
spa->spa_minref = refcount_count(&spa->spa_refcount);
if (error) {
if (error != EEXIST) {
@@ -2074,8 +2272,13 @@
/*
* Create "The Godfather" zio to hold all async IOs
*/
- spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
+ spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *),
+ KM_SLEEP);
+ for (int i = 0; i < max_ncpus; i++) {
+ spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
+ ZIO_FLAG_GODFATHER);
+ }
/*
* Parse the configuration into a vdev tree. We explicitly set the
@@ -2090,6 +2293,8 @@
return (error);
ASSERT(spa->spa_root_vdev == rvd);
+ ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
+ ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT);
if (type != SPA_IMPORT_ASSEMBLE) {
ASSERT(spa_guid(spa) == pool_guid);
@@ -2267,14 +2472,12 @@
enabled_feat = fnvlist_alloc();
unsup_feat = fnvlist_alloc();
- if (!feature_is_supported(spa->spa_meta_objset,
- spa->spa_feat_for_read_obj, spa->spa_feat_desc_obj,
+ if (!spa_features_check(spa, B_FALSE,
unsup_feat, enabled_feat))
missing_feat_read = B_TRUE;
if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) {
- if (!feature_is_supported(spa->spa_meta_objset,
- spa->spa_feat_for_write_obj, spa->spa_feat_desc_obj,
+ if (!spa_features_check(spa, B_TRUE,
unsup_feat, enabled_feat)) {
missing_feat_write = B_TRUE;
}
@@ -2320,8 +2523,34 @@
return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
ENOTSUP));
}
+
+ /*
+ * Load refcounts for ZFS features from disk into an in-memory
+ * cache during SPA initialization.
+ */
+ for (spa_feature_t i = 0; i < SPA_FEATURES; i++) {
+ uint64_t refcount;
+
+ error = feature_get_refcount_from_disk(spa,
+ &spa_feature_table[i], &refcount);
+ if (error == 0) {
+ spa->spa_feat_refcount_cache[i] = refcount;
+ } else if (error == ENOTSUP) {
+ spa->spa_feat_refcount_cache[i] =
+ SPA_FEATURE_DISABLED;
+ } else {
+ return (spa_vdev_err(rvd,
+ VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+ }
}
+ if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) {
+ if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG,
+ &spa->spa_feat_enabled_txg_obj) != 0)
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+
spa->spa_is_initializing = B_TRUE;
error = dsl_pool_open(spa->spa_dsl_pool);
spa->spa_is_initializing = B_FALSE;
@@ -2377,6 +2606,19 @@
return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE));
}
+ /* Grab the secret checksum salt from the MOS. */
+ error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_CHECKSUM_SALT, 1,
+ sizeof (spa->spa_cksum_salt.zcs_bytes),
+ spa->spa_cksum_salt.zcs_bytes);
+ if (error == ENOENT) {
+ /* Generate a new salt for subsequent use */
+ (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
+ sizeof (spa->spa_cksum_salt.zcs_bytes));
+ } else if (error != 0) {
+ return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+ }
+
if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0)
return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj);
@@ -2552,7 +2794,7 @@
if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
return (SET_ERROR(ENXIO));
- if (spa_check_logs(spa)) {
+ if (spa_writeable(spa) && spa_check_logs(spa)) {
*ereport = FM_EREPORT_ZFS_LOG_REPLAY;
return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO));
}
@@ -2583,6 +2825,7 @@
spa->spa_load_max_txg == UINT64_MAX)) {
dmu_tx_t *tx;
int need_update = B_FALSE;
+ dsl_pool_t *dp = spa_get_dsl(spa);
ASSERT(state != SPA_LOAD_TRYIMPORT);
@@ -2595,9 +2838,8 @@
*/
spa->spa_claiming = B_TRUE;
- tx = dmu_tx_create_assigned(spa_get_dsl(spa),
- spa_first_txg(spa));
- (void) dmu_objset_find(spa_name(spa),
+ tx = dmu_tx_create_assigned(dp, spa_first_txg(spa));
+ (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
zil_claim, tx, DS_FIND_CHILDREN);
dmu_tx_commit(tx);
@@ -2676,7 +2918,7 @@
spa_unload(spa);
spa_deactivate(spa);
- spa->spa_load_max_txg--;
+ spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1;
spa_activate(spa, mode);
spa_async_suspend(spa);
@@ -2706,6 +2948,8 @@
spa_set_log_state(spa, SPA_LOG_CLEAR);
} else {
spa->spa_load_max_txg = max_request;
+ if (max_request != UINT64_MAX)
+ spa->spa_extreme_rewind = B_TRUE;
}
load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING,
@@ -3061,6 +3305,10 @@
ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ /* We may be unable to read features if pool is suspended. */
+ if (spa_suspended(spa))
+ goto out;
+
if (spa->spa_feat_for_read_obj != 0) {
for (zap_cursor_init(&zc, spa->spa_meta_objset,
spa->spa_feat_for_read_obj);
@@ -3087,6 +3335,7 @@
zap_cursor_fini(&zc);
}
+out:
VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS,
features) == 0);
nvlist_free(features);
@@ -3351,12 +3600,18 @@
uint_t nspares, nl2cache;
uint64_t version, obj;
boolean_t has_features;
+ char *poolname;
+ nvlist_t *nvl;
+ if (nvlist_lookup_string(props,
+ zpool_prop_to_name(ZPOOL_PROP_TNAME), &poolname) != 0)
+ poolname = (char *)pool;
+
/*
* If this pool already exists, return failure.
*/
mutex_enter(&spa_namespace_lock);
- if (spa_lookup(pool) != NULL) {
+ if (spa_lookup(poolname) != NULL) {
mutex_exit(&spa_namespace_lock);
return (SET_ERROR(EEXIST));
}
@@ -3364,9 +3619,12 @@
/*
* Allocate a new spa_t structure.
*/
+ nvl = fnvlist_alloc();
+ fnvlist_add_string(nvl, ZPOOL_CONFIG_POOL_NAME, pool);
(void) nvlist_lookup_string(props,
zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
- spa = spa_add(pool, NULL, altroot);
+ spa = spa_add(poolname, nvl, altroot);
+ fnvlist_free(nvl);
spa_activate(spa, spa_mode_global);
if (props && (error = spa_prop_validate(spa, props))) {
@@ -3376,6 +3634,12 @@
return (error);
}
+ /*
+ * Temporary pool names should never be written to disk.
+ */
+ if (poolname != pool)
+ spa->spa_import_flags |= ZFS_IMPORT_TEMP_NAME;
+
has_features = B_FALSE;
for (nvpair_t *elem = nvlist_next_nvpair(props, NULL);
elem != NULL; elem = nvlist_next_nvpair(props, elem)) {
@@ -3393,12 +3657,18 @@
spa->spa_uberblock.ub_txg = txg - 1;
spa->spa_uberblock.ub_version = version;
spa->spa_ubsync = spa->spa_uberblock;
+ spa->spa_load_state = SPA_LOAD_CREATE;
/*
* Create "The Godfather" zio to hold all async IOs
*/
- spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
+ spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *),
+ KM_SLEEP);
+ for (int i = 0; i < max_ncpus; i++) {
+ spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
+ ZIO_FLAG_GODFATHER);
+ }
/*
* Create the root vdev.
@@ -3418,6 +3688,7 @@
(error = spa_validate_aux(spa, nvroot, txg,
VDEV_ALLOC_ADD)) == 0) {
for (int c = 0; c < rvd->vdev_children; c++) {
+ vdev_ashift_optimize(rvd->vdev_child[c]);
vdev_metaslab_set_size(rvd->vdev_child[c]);
vdev_expand(rvd->vdev_child[c], txg);
}
@@ -3532,6 +3803,12 @@
spa_history_create_obj(spa, tx);
/*
+ * Generate some random noise for salted checksums to operate on.
+ */
+ (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
+ sizeof (spa->spa_cksum_salt.zcs_bytes));
+
+ /*
* Set pool properties.
*/
spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
@@ -3556,10 +3833,17 @@
txg_wait_synced(spa->spa_dsl_pool, txg);
spa_config_sync(spa, B_FALSE, B_TRUE);
+ spa_event_notify(spa, NULL, ESC_ZFS_POOL_CREATE);
spa_history_log_version(spa, "create");
+ /*
+ * Don't count references from objsets that are already closed
+ * and are making their way through the eviction process.
+ */
+ spa_evicting_os_wait(spa);
spa->spa_minref = refcount_count(&spa->spa_refcount);
+ spa->spa_load_state = SPA_LOAD_NONE;
mutex_exit(&spa_namespace_lock);
@@ -3567,7 +3851,7 @@
}
#ifdef _KERNEL
-#if defined(sun)
+#ifdef illumos
/*
* Get the root pool information from the root disk, then import the root pool
* during the system boot up time.
@@ -3768,7 +4052,7 @@
return (error);
}
-#else
+#else /* !illumos */
extern int vdev_geom_read_pool_label(const char *name, nvlist_t ***configs,
uint64_t *count);
@@ -3911,6 +4195,16 @@
if ((spa = spa_lookup(pname)) != NULL) {
/*
+ * The pool could already be imported,
+ * e.g., after reboot -r.
+ */
+ if (spa->spa_state == POOL_STATE_ACTIVE) {
+ mutex_exit(&spa_namespace_lock);
+ nvlist_free(config);
+ return (0);
+ }
+
+ /*
* Remove the existing root pool from the namespace so
* that we can replace it with the correct config
* we just read in.
@@ -3927,6 +4221,8 @@
&spa->spa_ubsync.ub_version) != 0)
spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
} else if ((spa = spa_lookup(name)) == NULL) {
+ mutex_exit(&spa_namespace_lock);
+ nvlist_free(config);
cmn_err(CE_NOTE, "Cannot find the pool label for '%s'",
name);
return (EIO);
@@ -3962,8 +4258,8 @@
return (0);
}
-#endif /* sun */
-#endif
+#endif /* illumos */
+#endif /* _KERNEL */
/*
* Import a non-root pool into the system.
@@ -4012,10 +4308,9 @@
spa_configfile_set(spa, props, B_FALSE);
spa_config_sync(spa, B_FALSE, B_TRUE);
+ spa_event_notify(spa, NULL, ESC_ZFS_POOL_IMPORT);
mutex_exit(&spa_namespace_lock);
- spa_history_log_version(spa, "import");
-
return (0);
}
@@ -4144,9 +4439,12 @@
*/
spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
- mutex_exit(&spa_namespace_lock);
spa_history_log_version(spa, "import");
+ spa_event_notify(spa, NULL, ESC_ZFS_POOL_IMPORT);
+
+ mutex_exit(&spa_namespace_lock);
+
#ifdef __FreeBSD__
#ifdef _KERNEL
zvol_create_minors(pool);
@@ -4294,6 +4592,7 @@
* have to force it to sync before checking spa_refcnt.
*/
txg_wait_synced(spa->spa_dsl_pool, 0);
+ spa_evicting_os_wait(spa);
/*
* A pool cannot be exported or destroyed if there are active
@@ -4491,6 +4790,7 @@
mutex_enter(&spa_namespace_lock);
spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
+ spa_event_notify(spa, NULL, ESC_ZFS_VDEV_ADD);
mutex_exit(&spa_namespace_lock);
return (0);
@@ -4627,7 +4927,7 @@
}
/* mark the device being resilvered */
- newvd->vdev_resilvering = B_TRUE;
+ newvd->vdev_resilver_txg = txg;
/*
* If the parent is not a mirror, or if we're replacing, insert the new
@@ -4679,10 +4979,17 @@
vdev_dirty(tvd, VDD_DTL, newvd, txg);
/*
- * Restart the resilver
+ * Schedule the resilver to restart in the future. We do this to
+ * ensure that dmu_sync-ed blocks have been stitched into the
+ * respective datasets.
*/
dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);
+ if (spa->spa_bootfs)
+ spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH);
+
+ spa_event_notify(spa, newvd, ESC_ZFS_VDEV_ATTACH);
+
/*
* Commit the config
*/
@@ -4697,9 +5004,6 @@
spa_strfree(oldvdpath);
spa_strfree(newvdpath);
- if (spa->spa_bootfs)
- spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH);
-
return (0);
}
@@ -4857,7 +5161,6 @@
if (pvd->vdev_ops == &vdev_spare_ops)
cvd->vdev_unspare = B_FALSE;
vdev_remove_parent(cvd);
- cvd->vdev_resilvering = B_FALSE;
}
@@ -5132,13 +5435,13 @@
spa_activate(newspa, spa_mode_global);
spa_async_suspend(newspa);
-#ifndef sun
+#ifndef illumos
/* mark that we are creating new spa by splitting */
newspa->spa_splitting_newspa = B_TRUE;
#endif
/* create the new pool from the disks of the original pool */
error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE);
-#ifndef sun
+#ifndef illumos
newspa->spa_splitting_newspa = B_FALSE;
#endif
if (error)
@@ -5250,7 +5553,7 @@
static void
spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
- nvlist_t *dev_to_remove)
+ nvlist_t *dev_to_remove)
{
nvlist_t **newdev = NULL;
@@ -5309,7 +5612,7 @@
ASSERT0(vd->vdev_stat.vs_alloc);
txg = spa_vdev_config_enter(spa);
vd->vdev_removing = B_TRUE;
- vdev_dirty(vd, 0, NULL, txg);
+ vdev_dirty_leaves(vd, VDD_DTL, txg);
vdev_config_dirty(vd);
spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
@@ -5375,6 +5678,7 @@
spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
{
vdev_t *vd;
+ sysevent_t *ev = NULL;
metaslab_group_t *mg;
nvlist_t **spares, **l2cache, *nv;
uint64_t txg = 0;
@@ -5398,6 +5702,9 @@
* in this pool.
*/
if (vd == NULL || unspare) {
+ if (vd == NULL)
+ vd = spa_lookup_by_guid(spa, guid, B_TRUE);
+ ev = spa_event_create(spa, vd, ESC_ZFS_VDEV_REMOVE_AUX);
spa_vdev_remove_aux(spa->spa_spares.sav_config,
ZPOOL_CONFIG_SPARES, spares, nspares, nv);
spa_load_spares(spa);
@@ -5412,6 +5719,8 @@
/*
* Cache devices can always be removed.
*/
+ vd = spa_lookup_by_guid(spa, guid, B_TRUE);
+ ev = spa_event_create(spa, vd, ESC_ZFS_VDEV_REMOVE_AUX);
spa_vdev_remove_aux(spa->spa_l2cache.sav_config,
ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv);
spa_load_l2cache(spa);
@@ -5420,11 +5729,6 @@
ASSERT(!locked);
ASSERT(vd == vd->vdev_top);
- /*
- * XXX - Once we have bp-rewrite this should
- * become the common case.
- */
-
mg = vd->vdev_mg;
/*
@@ -5457,6 +5761,7 @@
/*
* Clean up the vdev namespace.
*/
+ ev = spa_event_create(spa, vd, ESC_ZFS_VDEV_REMOVE_DEV);
spa_vdev_remove_from_namespace(spa, vd);
} else if (vd != NULL) {
@@ -5472,8 +5777,11 @@
}
if (!locked)
- return (spa_vdev_exit(spa, NULL, txg, error));
+ error = spa_vdev_exit(spa, NULL, txg, error);
+ if (ev)
+ spa_event_post(ev);
+
return (error);
}
@@ -5581,6 +5889,8 @@
ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
sguid = ppvd->vdev_child[1]->vdev_guid;
}
+ ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd));
+
spa_config_exit(spa, SCL_ALL, FTAG);
if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
return;
@@ -5705,6 +6015,8 @@
vd->vdev_stat.vs_checksum_errors = 0;
vdev_state_dirty(vd->vdev_top);
+ /* Tell userspace that the vdev is gone. */
+ zfs_post_remove(spa, vd);
}
for (int c = 0; c < vd->vdev_children; c++)
@@ -5764,7 +6076,7 @@
mutex_enter(&spa->spa_async_lock);
tasks = spa->spa_async_tasks;
- spa->spa_async_tasks = 0;
+ spa->spa_async_tasks &= SPA_ASYNC_REMOVE;
mutex_exit(&spa->spa_async_lock);
/*
@@ -5790,19 +6102,6 @@
}
}
- /*
- * See if any devices need to be marked REMOVED.
- */
- if (tasks & SPA_ASYNC_REMOVE) {
- spa_vdev_state_enter(spa, SCL_NONE);
- spa_async_remove(spa, spa->spa_root_vdev);
- for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
- spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
- for (int i = 0; i < spa->spa_spares.sav_count; i++)
- spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]);
- (void) spa_vdev_state_exit(spa, NULL, 0);
- }
-
if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) {
spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
spa_async_autoexpand(spa, spa->spa_root_vdev);
@@ -5840,12 +6139,51 @@
thread_exit();
}
+static void
+spa_async_thread_vd(void *arg)
+{
+ spa_t *spa = arg;
+ int tasks;
+
+ mutex_enter(&spa->spa_async_lock);
+ tasks = spa->spa_async_tasks;
+retry:
+ spa->spa_async_tasks &= ~SPA_ASYNC_REMOVE;
+ mutex_exit(&spa->spa_async_lock);
+
+ /*
+ * See if any devices need to be marked REMOVED.
+ */
+ if (tasks & SPA_ASYNC_REMOVE) {
+ spa_vdev_state_enter(spa, SCL_NONE);
+ spa_async_remove(spa, spa->spa_root_vdev);
+ for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
+ spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
+ for (int i = 0; i < spa->spa_spares.sav_count; i++)
+ spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]);
+ (void) spa_vdev_state_exit(spa, NULL, 0);
+ }
+
+ /*
+ * Let the world know that we're done.
+ */
+ mutex_enter(&spa->spa_async_lock);
+ tasks = spa->spa_async_tasks;
+ if ((tasks & SPA_ASYNC_REMOVE) != 0)
+ goto retry;
+ spa->spa_async_thread_vd = NULL;
+ cv_broadcast(&spa->spa_async_cv);
+ mutex_exit(&spa->spa_async_lock);
+ thread_exit();
+}
+
void
spa_async_suspend(spa_t *spa)
{
mutex_enter(&spa->spa_async_lock);
spa->spa_async_suspended++;
- while (spa->spa_async_thread != NULL)
+ while (spa->spa_async_thread != NULL &&
+ spa->spa_async_thread_vd != NULL)
cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
mutex_exit(&spa->spa_async_lock);
}
@@ -5866,7 +6204,8 @@
uint_t config_task;
boolean_t config_task_suspended;
- non_config_tasks = spa->spa_async_tasks & ~SPA_ASYNC_CONFIG_UPDATE;
+ non_config_tasks = spa->spa_async_tasks & ~(SPA_ASYNC_CONFIG_UPDATE |
+ SPA_ASYNC_REMOVE);
config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE;
if (spa->spa_ccw_fail_time == 0) {
config_task_suspended = B_FALSE;
@@ -5892,6 +6231,19 @@
mutex_exit(&spa->spa_async_lock);
}
+static void
+spa_async_dispatch_vd(spa_t *spa)
+{
+ mutex_enter(&spa->spa_async_lock);
+ if ((spa->spa_async_tasks & SPA_ASYNC_REMOVE) != 0 &&
+ !spa->spa_async_suspended &&
+ spa->spa_async_thread_vd == NULL &&
+ rootdir != NULL)
+ spa->spa_async_thread_vd = thread_create(NULL, 0,
+ spa_async_thread_vd, spa, 0, &p0, TS_RUN, maxclsyspri);
+ mutex_exit(&spa->spa_async_lock);
+}
+
void
spa_async_request(spa_t *spa, int task)
{
@@ -5899,6 +6251,7 @@
mutex_enter(&spa->spa_async_lock);
spa->spa_async_tasks |= task;
mutex_exit(&spa->spa_async_lock);
+ spa_async_dispatch_vd(spa);
}
/*
@@ -5925,7 +6278,33 @@
return (0);
}
+/*
+ * Note: this simple function is not inlined to make it easier to dtrace the
+ * amount of time spent syncing frees.
+ */
static void
+spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx)
+{
+ zio_t *zio = zio_root(spa, NULL, NULL, 0);
+ bplist_iterate(bpl, spa_free_sync_cb, zio, tx);
+ VERIFY(zio_wait(zio) == 0);
+}
+
+/*
+ * Note: this simple function is not inlined to make it easier to dtrace the
+ * amount of time spent syncing deferred frees.
+ */
+static void
+spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx)
+{
+ zio_t *zio = zio_root(spa, NULL, NULL, 0);
+ VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj,
+ spa_free_sync_cb, zio, tx), ==, 0);
+ VERIFY0(zio_wait(zio));
+}
+
+
+static void
spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
{
char *packed = NULL;
@@ -5937,7 +6316,7 @@
/*
* Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration
- * information. This avoids the dbuf_will_dirty() path and
+ * information. This avoids the dmu_buf_will_dirty() path and
* saves us a pre-read to get data we don't actually care about.
*/
bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE);
@@ -6026,8 +6405,7 @@
spa_config_exit(spa, SCL_STATE, FTAG);
- if (spa->spa_config_syncing)
- nvlist_free(spa->spa_config_syncing);
+ nvlist_free(spa->spa_config_syncing);
spa->spa_config_syncing = config;
spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
@@ -6072,7 +6450,7 @@
zpool_prop_t prop;
const char *propname;
zprop_type_t proptype;
- zfeature_info_t *feature;
+ spa_feature_t fid;
switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
case ZPROP_INVAL:
@@ -6082,15 +6460,15 @@
ASSERT(zpool_prop_feature(nvpair_name(elem)));
fname = strchr(nvpair_name(elem), '@') + 1;
- VERIFY3U(0, ==, zfeature_lookup_name(fname, &feature));
+ VERIFY0(zfeature_lookup_name(fname, &fid));
- spa_feature_enable(spa, feature, tx);
+ spa_feature_enable(spa, fid, tx);
spa_history_log_internal(spa, "set", tx,
"%s=enabled", nvpair_name(elem));
break;
case ZPOOL_PROP_VERSION:
- VERIFY(nvpair_value_uint64(elem, &intval) == 0);
+ intval = fnvpair_value_uint64(elem);
/*
* The version is synced separately before other
* properties and should be correct by now.
@@ -6114,7 +6492,7 @@
*/
break;
case ZPOOL_PROP_COMMENT:
- VERIFY(nvpair_value_string(elem, &strval) == 0);
+ strval = fnvpair_value_string(elem);
if (spa->spa_comment != NULL)
spa_strfree(spa->spa_comment);
spa->spa_comment = spa_strdup(strval);
@@ -6146,23 +6524,23 @@
if (nvpair_type(elem) == DATA_TYPE_STRING) {
ASSERT(proptype == PROP_TYPE_STRING);
- VERIFY(nvpair_value_string(elem, &strval) == 0);
- VERIFY(zap_update(mos,
+ strval = fnvpair_value_string(elem);
+ VERIFY0(zap_update(mos,
spa->spa_pool_props_object, propname,
- 1, strlen(strval) + 1, strval, tx) == 0);
+ 1, strlen(strval) + 1, strval, tx));
spa_history_log_internal(spa, "set", tx,
"%s=%s", nvpair_name(elem), strval);
} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
- VERIFY(nvpair_value_uint64(elem, &intval) == 0);
+ intval = fnvpair_value_uint64(elem);
if (proptype == PROP_TYPE_INDEX) {
const char *unused;
- VERIFY(zpool_prop_index_to_string(
- prop, intval, &unused) == 0);
+ VERIFY0(zpool_prop_index_to_string(
+ prop, intval, &unused));
}
- VERIFY(zap_update(mos,
+ VERIFY0(zap_update(mos,
spa->spa_pool_props_object, propname,
- 8, 1, &intval, tx) == 0);
+ 8, 1, &intval, tx));
spa_history_log_internal(spa, "set", tx,
"%s=%lld", nvpair_name(elem), intval);
} else {
@@ -6239,6 +6617,36 @@
spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
spa_feature_create_zap_objects(spa, tx);
}
+
+ /*
+ * The LZ4_COMPRESS feature's behaviour was changed to activate_on_enable
+ * when the possibility to use lz4 compression for metadata was added.
+ * Old pools that have this feature enabled must be upgraded to have
+ * this feature active.
+ */
+ if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
+ boolean_t lz4_en = spa_feature_is_enabled(spa,
+ SPA_FEATURE_LZ4_COMPRESS);
+ boolean_t lz4_ac = spa_feature_is_active(spa,
+ SPA_FEATURE_LZ4_COMPRESS);
+
+ if (lz4_en && !lz4_ac)
+ spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx);
+ }
+
+ /*
+ * If we haven't written the salt, do so now. Note that the
+ * feature may not be activated yet, but that's fine since
+ * the presence of this ZAP entry is backwards compatible.
+ */
+ if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_CHECKSUM_SALT) == ENOENT) {
+ VERIFY0(zap_add(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1,
+ sizeof (spa->spa_cksum_salt.zcs_bytes),
+ spa->spa_cksum_salt.zcs_bytes, tx));
+ }
+
rrw_exit(&dp->dp_config_rwlock, FTAG);
}
@@ -6251,12 +6659,13 @@
{
dsl_pool_t *dp = spa->spa_dsl_pool;
objset_t *mos = spa->spa_meta_objset;
- bpobj_t *defer_bpo = &spa->spa_deferred_bpobj;
bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
vdev_t *rvd = spa->spa_root_vdev;
vdev_t *vd;
dmu_tx_t *tx;
int error;
+ uint32_t max_queue_depth = zfs_vdev_async_write_max_active *
+ zfs_vdev_queue_depth_pct / 100;
VERIFY(spa_writeable(spa));
@@ -6268,6 +6677,10 @@
spa->spa_syncing_txg = txg;
spa->spa_sync_pass = 0;
+ mutex_enter(&spa->spa_alloc_lock);
+ VERIFY0(avl_numnodes(&spa->spa_alloc_tree));
+ mutex_exit(&spa->spa_alloc_lock);
+
/*
* If there are any pending vdev state changes, convert them
* into config changes that go out with this transaction group.
@@ -6298,12 +6711,12 @@
#ifdef illumos
VERIFY(cyclic_reprogram(spa->spa_deadman_cycid,
spa->spa_sync_starttime + spa->spa_deadman_synctime));
-#else /* FreeBSD */
+#else /* !illumos */
#ifdef _KERNEL
- callout_reset(&spa->spa_deadman_cycid,
- hz * spa->spa_deadman_synctime / NANOSEC, spa_deadman, spa);
+ callout_schedule(&spa->spa_deadman_cycid,
+ hz * spa->spa_deadman_synctime / NANOSEC);
#endif
-#endif
+#endif /* illumos */
/*
* If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
@@ -6327,23 +6740,37 @@
}
/*
- * If anything has changed in this txg, or if someone is waiting
- * for this txg to sync (eg, spa_vdev_remove()), push the
- * deferred frees from the previous txg. If not, leave them
- * alone so that we don't generate work on an otherwise idle
- * system.
+ * Set the top-level vdev's max queue depth. Evaluate each
+ * top-level's async write queue depth in case it changed.
+ * The max queue depth will not change in the middle of syncing
+ * out this txg.
*/
- if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
- !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
- !txg_list_empty(&dp->dp_sync_tasks, txg) ||
- ((dsl_scan_active(dp->dp_scan) ||
- txg_sync_waiting(dp)) && !spa_shutting_down(spa))) {
- zio_t *zio = zio_root(spa, NULL, NULL, 0);
- VERIFY3U(bpobj_iterate(defer_bpo,
- spa_free_sync_cb, zio, tx), ==, 0);
- VERIFY0(zio_wait(zio));
+ uint64_t queue_depth_total = 0;
+ for (int c = 0; c < rvd->vdev_children; c++) {
+ vdev_t *tvd = rvd->vdev_child[c];
+ metaslab_group_t *mg = tvd->vdev_mg;
+
+ if (mg == NULL || mg->mg_class != spa_normal_class(spa) ||
+ !metaslab_group_initialized(mg))
+ continue;
+
+ /*
+ * It is safe to do a lock-free check here because only async
+ * allocations look at mg_max_alloc_queue_depth, and async
+ * allocations all happen from spa_sync().
+ */
+ ASSERT0(refcount_count(&mg->mg_alloc_queue_depth));
+ mg->mg_max_alloc_queue_depth = max_queue_depth;
+ queue_depth_total += mg->mg_max_alloc_queue_depth;
}
+ metaslab_class_t *mc = spa_normal_class(spa);
+ ASSERT0(refcount_count(&mc->mc_alloc_slots));
+ mc->mc_alloc_max_slots = queue_depth_total;
+ mc->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
+ ASSERT3U(mc->mc_alloc_max_slots, <=,
+ max_queue_depth * rvd->vdev_children);
+
/*
* Iterate to convergence.
*/
@@ -6359,13 +6786,15 @@
dsl_pool_sync(dp, txg);
if (pass < zfs_sync_pass_deferred_free) {
- zio_t *zio = zio_root(spa, NULL, NULL, 0);
- bplist_iterate(free_bpl, spa_free_sync_cb,
- zio, tx);
- VERIFY(zio_wait(zio) == 0);
+ spa_sync_frees(spa, free_bpl, tx);
} else {
+ /*
+ * We can not defer frees in pass 1, because
+ * we sync the deferred frees later in pass 1.
+ */
+ ASSERT3U(pass, >, 1);
bplist_iterate(free_bpl, bpobj_enqueue_cb,
- defer_bpo, tx);
+ &spa->spa_deferred_bpobj, tx);
}
ddt_sync(spa, txg);
@@ -6374,8 +6803,37 @@
while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
vdev_sync(vd, txg);
- if (pass == 1)
+ if (pass == 1) {
spa_sync_upgrades(spa, tx);
+ ASSERT3U(txg, >=,
+ spa->spa_uberblock.ub_rootbp.blk_birth);
+ /*
+ * Note: We need to check if the MOS is dirty
+ * because we could have marked the MOS dirty
+ * without updating the uberblock (e.g. if we
+ * have sync tasks but no dirty user data). We
+ * need to check the uberblock's rootbp because
+ * it is updated if we have synced out dirty
+ * data (though in this case the MOS will most
+ * likely also be dirty due to second order
+ * effects, we don't want to rely on that here).
+ */
+ if (spa->spa_uberblock.ub_rootbp.blk_birth < txg &&
+ !dmu_objset_is_dirty(mos, txg)) {
+ /*
+ * Nothing changed on the first pass,
+ * therefore this TXG is a no-op. Avoid
+ * syncing deferred frees, so that we
+ * can keep this TXG as a no-op.
+ */
+ ASSERT(txg_list_empty(&dp->dp_dirty_datasets,
+ txg));
+ ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
+ ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg));
+ break;
+ }
+ spa_sync_deferred_frees(spa, tx);
+ }
} while (dmu_objset_is_dirty(mos, txg));
@@ -6409,16 +6867,10 @@
if (svdcount == SPA_DVAS_PER_BP)
break;
}
- error = vdev_config_sync(svd, svdcount, txg, B_FALSE);
- if (error != 0)
- error = vdev_config_sync(svd, svdcount, txg,
- B_TRUE);
+ error = vdev_config_sync(svd, svdcount, txg);
} else {
error = vdev_config_sync(rvd->vdev_child,
- rvd->vdev_children, txg, B_FALSE);
- if (error != 0)
- error = vdev_config_sync(rvd->vdev_child,
- rvd->vdev_children, txg, B_TRUE);
+ rvd->vdev_children, txg);
}
if (error == 0)
@@ -6435,11 +6887,11 @@
#ifdef illumos
VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY));
-#else /* FreeBSD */
+#else /* !illumos */
#ifdef _KERNEL
callout_drain(&spa->spa_deadman_cycid);
#endif
-#endif
+#endif /* illumos */
/*
* Clear the dirty config list.
@@ -6457,10 +6909,12 @@
spa->spa_config_syncing = NULL;
}
- spa->spa_ubsync = spa->spa_uberblock;
-
dsl_pool_sync_done(dp, txg);
+ mutex_enter(&spa->spa_alloc_lock);
+ VERIFY0(avl_numnodes(&spa->spa_alloc_tree));
+ mutex_exit(&spa->spa_alloc_lock);
+
/*
* Update usable space statistics.
*/
@@ -6479,6 +6933,13 @@
spa->spa_sync_pass = 0;
+ /*
+ * Update the last synced uberblock here. We want to do this at
+ * the end of spa_sync() so that consumers of spa_last_synced_txg()
+ * will be guaranteed that all the processing associated with
+ * that txg has been completed.
+ */
+ spa->spa_ubsync = spa->spa_uberblock;
spa_config_exit(spa, SCL_CONFIG, FTAG);
spa_handle_ignored_writes(spa);
@@ -6487,6 +6948,7 @@
* If any async tasks have been requested, kick them off.
*/
spa_async_dispatch(spa);
+ spa_async_dispatch_vd(spa);
}
/*
@@ -6591,7 +7053,7 @@
* possible.
*/
ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version));
- ASSERT(version >= spa->spa_uberblock.ub_version);
+ ASSERT3U(version, >=, spa->spa_uberblock.ub_version);
spa->spa_uberblock.ub_version = version;
vdev_config_dirty(spa->spa_root_vdev);
@@ -6642,24 +7104,17 @@
return (B_FALSE);
}
-/*
- * Post a sysevent corresponding to the given event. The 'name' must be one of
- * the event definitions in sys/sysevent/eventdefs.h. The payload will be
- * filled in from the spa and (optionally) the vdev. This doesn't do anything
- * in the userland libzpool, as we don't want consumers to misinterpret ztest
- * or zdb as real changes.
- */
-void
-spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
+static sysevent_t *
+spa_event_create(spa_t *spa, vdev_t *vd, const char *name)
{
+ sysevent_t *ev = NULL;
#ifdef _KERNEL
- sysevent_t *ev;
sysevent_attr_list_t *attr = NULL;
sysevent_value_t value;
- sysevent_id_t eid;
ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
SE_SLEEP);
+ ASSERT(ev != NULL);
value.value_type = SE_DATA_TYPE_STRING;
value.value.sv_string = spa_name(spa);
@@ -6691,11 +7146,34 @@
goto done;
attr = NULL;
- (void) log_sysevent(ev, SE_SLEEP, &eid);
-
done:
if (attr)
sysevent_free_attr(attr);
+
+#endif
+ return (ev);
+}
+
+static void
+spa_event_post(sysevent_t *ev)
+{
+#ifdef _KERNEL
+ sysevent_id_t eid;
+
+ (void) log_sysevent(ev, SE_SLEEP, &eid);
sysevent_free(ev);
#endif
}
+
+/*
+ * Post a sysevent corresponding to the given event. The 'name' must be one of
+ * the event definitions in sys/sysevent/eventdefs.h. The payload will be
+ * filled in from the spa and (optionally) the vdev. This doesn't do anything
+ * in the userland libzpool, as we don't want consumers to misinterpret ztest
+ * or zdb as real changes.
+ */
+void
+spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
+{
+ spa_event_post(spa_event_create(spa, vd, name));
+}
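
The spa_sync() hunk above derives the pool-wide allocation throttle limit from zfs_vdev_async_write_max_active and zfs_vdev_queue_depth_pct. The following standalone sketch only mirrors that arithmetic; the tunable values and the four-vdev pool are assumptions for illustration, not values taken from this commit.

    /*
     * Sketch of the per-pool allocation throttle limit computed in
     * spa_sync().  The kernel reads the tunables and walks the pool's
     * real top-level vdevs; here both are hypothetical.
     */
    #include <stdint.h>
    #include <stdio.h>

    int
    main(void)
    {
        uint32_t zfs_vdev_async_write_max_active = 10;   /* assumed default */
        uint32_t zfs_vdev_queue_depth_pct = 1000;        /* assumed default */
        int top_level_vdevs = 4;                         /* hypothetical pool */

        uint32_t max_queue_depth = zfs_vdev_async_write_max_active *
            zfs_vdev_queue_depth_pct / 100;
        uint64_t queue_depth_total = 0;

        /* Each eligible top-level vdev contributes max_queue_depth slots. */
        for (int c = 0; c < top_level_vdevs; c++)
            queue_depth_total += max_queue_depth;

        printf("mg_max_alloc_queue_depth = %u\n", max_queue_depth);
        printf("mc_alloc_max_slots       = %ju\n",
            (uintmax_t)queue_depth_total);
        return (0);
    }

Because every eligible top-level vdev contributes the same per-vdev depth, the total can never exceed max_queue_depth * rvd->vdev_children, which is what the ASSERT3U in the hunk checks.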
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -23,7 +23,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -141,6 +141,26 @@
kobj_close_file(file);
}
+static void
+spa_config_clean(nvlist_t *nvl)
+{
+ nvlist_t **child;
+ nvlist_t *nvroot = NULL;
+ uint_t c, children;
+
+ if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child,
+ &children) == 0) {
+ for (c = 0; c < children; c++)
+ spa_config_clean(child[c]);
+ }
+
+ if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0)
+ spa_config_clean(nvroot);
+
+ nvlist_remove(nvl, ZPOOL_CONFIG_VDEV_STATS, DATA_TYPE_UINT64_ARRAY);
+ nvlist_remove(nvl, ZPOOL_CONFIG_SCAN_STATS, DATA_TYPE_UINT64_ARRAY);
+}
+
static int
spa_config_write(spa_config_dirent_t *dp, nvlist_t *nvl)
{
@@ -197,7 +217,12 @@
/*
* Synchronize pool configuration to disk. This must be called with the
- * namespace lock held.
+ * namespace lock held. Synchronizing the pool cache is typically done after
+ * the configuration has been synced to the MOS. This exposes a window where
+ * the MOS config will have been updated but the cache file has not. If
+ * the system were to crash at that instant then the cached config may not
+ * contain the correct information to open the pool and an explicit import
+ * would be required.
*/
void
spa_config_sync(spa_t *target, boolean_t removing, boolean_t postsysevent)
@@ -206,6 +231,7 @@
nvlist_t *nvl;
boolean_t ccw_failure;
int error;
+ char *pool_name;
ASSERT(MUTEX_HELD(&spa_namespace_lock));
@@ -229,6 +255,7 @@
*/
nvl = NULL;
while ((spa = spa_next(spa)) != NULL) {
+ nvlist_t *nvroot = NULL;
/*
* Skip over our own pool if we're about to remove
* ourselves from the spa namespace or any pool that
@@ -237,7 +264,8 @@
* we don't allow them to be written to the cache file.
*/
if ((spa == target && removing) ||
- !spa_writeable(spa))
+ (spa_state(spa) == POOL_STATE_ACTIVE &&
+ !spa_writeable(spa)))
continue;
mutex_enter(&spa->spa_props_lock);
@@ -253,9 +281,19 @@
VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME,
KM_SLEEP) == 0);
- VERIFY(nvlist_add_nvlist(nvl, spa->spa_name,
- spa->spa_config) == 0);
+ if (spa->spa_import_flags & ZFS_IMPORT_TEMP_NAME) {
+ pool_name = fnvlist_lookup_string(spa->spa_config,
+ ZPOOL_CONFIG_POOL_NAME);
+ } else {
+ pool_name = spa_name(spa);
+ }
+
+ fnvlist_add_nvlist(nvl, pool_name,
+ spa->spa_config);
mutex_exit(&spa->spa_props_lock);
+
+ if (nvlist_lookup_nvlist(nvl, pool_name, &nvroot) == 0)
+ spa_config_clean(nvroot);
}
error = spa_config_write(dp, nvl);
@@ -338,8 +376,7 @@
spa_config_set(spa_t *spa, nvlist_t *config)
{
mutex_enter(&spa->spa_props_lock);
- if (spa->spa_config != NULL)
- nvlist_free(spa->spa_config);
+ nvlist_free(spa->spa_config);
spa->spa_config = config;
mutex_exit(&spa->spa_props_lock);
}
@@ -358,6 +395,7 @@
unsigned long hostid = 0;
boolean_t locked = B_FALSE;
uint64_t split_guid;
+ char *pool_name;
if (vd == NULL) {
vd = rvd;
@@ -374,22 +412,36 @@
if (txg == -1ULL)
txg = spa->spa_config_txg;
- VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ /*
+ * Originally, users had to handle spa namespace collisions by either
+ * exporting the already imported pool or by specifying a new name for
+ * the pool with a conflicting name. In the case of root pools from
+ * virtual guests, neither approach to collision resolution is
+ * reasonable. This is addressed by extending the new name syntax with
+ * an option to specify that the new name is temporary. When specified,
+ * ZFS_IMPORT_TEMP_NAME will be set in spa->spa_import_flags to tell us
+ * to use the previous name, which we do below.
+ */
+ if (spa->spa_import_flags & ZFS_IMPORT_TEMP_NAME) {
+ pool_name = fnvlist_lookup_string(spa->spa_config,
+ ZPOOL_CONFIG_POOL_NAME);
+ } else {
+ pool_name = spa_name(spa);
+ }
- VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
- spa_version(spa)) == 0);
- VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
- spa_name(spa)) == 0);
- VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
- spa_state(spa)) == 0);
- VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG,
- txg) == 0);
- VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
- spa_guid(spa)) == 0);
- VERIFY(spa->spa_comment == NULL || nvlist_add_string(config,
- ZPOOL_CONFIG_COMMENT, spa->spa_comment) == 0);
+ config = fnvlist_alloc();
+ fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa_version(spa));
+ fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, pool_name);
+ fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, spa_state(spa));
+ fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, txg);
+ fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa));
+ if (spa->spa_comment != NULL) {
+ fnvlist_add_string(config, ZPOOL_CONFIG_COMMENT,
+ spa->spa_comment);
+ }
+
#ifdef _KERNEL
hostid = zone_get_hostid(NULL);
#else /* _KERNEL */
@@ -515,8 +567,10 @@
*/
for (c = 0; c < rvd->vdev_children; c++) {
vdev_t *tvd = rvd->vdev_child[c];
- if (tvd->vdev_ms_array == 0)
+ if (tvd->vdev_ms_array == 0) {
+ vdev_ashift_optimize(tvd);
vdev_metaslab_set_size(tvd);
+ }
vdev_expand(tvd, txg);
}
}
@@ -530,8 +584,7 @@
/*
* Update the global config cache to reflect the new mosconfig.
*/
- if (!spa->spa_is_root)
- spa_config_sync(spa, B_FALSE, what != SPA_CONFIG_UPDATE_POOL);
+ spa_config_sync(spa, B_FALSE, what != SPA_CONFIG_UPDATE_POOL);
if (what == SPA_CONFIG_UPDATE_POOL)
spa_config_update(spa, SPA_CONFIG_UPDATE_VDEVS);
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_errlog.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,7 +21,7 @@
*/
/*
* Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
*/
/*
@@ -36,7 +36,7 @@
* deleted from the log when the scrub completes.
*
* The log is stored using a ZAP object whose key is a string form of the
- * zbookmark tuple (objset, object, level, blkid), and whose contents is an
+ * zbookmark_phys tuple (objset, object, level, blkid), and whose contents is an
* optional 'objset:object' human-readable string describing the data. When an
* error is first logged, this string will be empty, indicating that no name is
* known. This prevents us from having to issue a potentially large amount of
@@ -60,7 +60,7 @@
* Convert a bookmark to a string.
*/
static void
-bookmark_to_name(zbookmark_t *zb, char *buf, size_t len)
+bookmark_to_name(zbookmark_phys_t *zb, char *buf, size_t len)
{
(void) snprintf(buf, len, "%llx:%llx:%llx:%llx",
(u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object,
@@ -72,7 +72,7 @@
*/
#ifdef _KERNEL
static void
-name_to_bookmark(char *buf, zbookmark_t *zb)
+name_to_bookmark(char *buf, zbookmark_phys_t *zb)
{
zb->zb_objset = strtonum(buf, &buf);
ASSERT(*buf == ':');
@@ -93,7 +93,7 @@
void
spa_log_error(spa_t *spa, zio_t *zio)
{
- zbookmark_t *zb = &zio->io_logical->io_bookmark;
+ zbookmark_phys_t *zb = &zio->io_logical->io_bookmark;
spa_error_entry_t search;
spa_error_entry_t *new;
avl_tree_t *tree;
@@ -166,7 +166,7 @@
{
zap_cursor_t zc;
zap_attribute_t za;
- zbookmark_t zb;
+ zbookmark_phys_t zb;
if (obj == 0)
return (0);
@@ -183,8 +183,8 @@
name_to_bookmark(za.za_name, &zb);
if (copyout(&zb, (char *)addr +
- (*count - 1) * sizeof (zbookmark_t),
- sizeof (zbookmark_t)) != 0) {
+ (*count - 1) * sizeof (zbookmark_phys_t),
+ sizeof (zbookmark_phys_t)) != 0) {
zap_cursor_fini(&zc);
return (SET_ERROR(EFAULT));
}
@@ -208,8 +208,8 @@
return (SET_ERROR(ENOMEM));
if (copyout(&se->se_bookmark, (char *)addr +
- (*count - 1) * sizeof (zbookmark_t),
- sizeof (zbookmark_t)) != 0)
+ (*count - 1) * sizeof (zbookmark_phys_t),
+ sizeof (zbookmark_phys_t)) != 0)
return (SET_ERROR(EFAULT));
*count -= 1;
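
The error log above keys its ZAP entries on the textual form of a zbookmark_phys tuple, rendered as "objset:object:level:blkid" in hex. A small userland sketch of that round trip follows; the struct and parsing helper are local stand-ins for the kernel's zbookmark_phys_t and strtonum(), not the kernel definitions themselves.

    #include <inttypes.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct bookmark {
        uint64_t objset, object;
        int64_t level;
        uint64_t blkid;
    };

    /* Format a bookmark the way spa_log_error() stores its ZAP key. */
    static void
    bookmark_to_name(const struct bookmark *zb, char *buf, size_t len)
    {
        (void) snprintf(buf, len, "%llx:%llx:%llx:%llx",
            (unsigned long long)zb->objset, (unsigned long long)zb->object,
            (unsigned long long)zb->level, (unsigned long long)zb->blkid);
    }

    /* Parse the key back into its four fields. */
    static void
    name_to_bookmark(const char *buf, struct bookmark *zb)
    {
        char *end;

        zb->objset = strtoull(buf, &end, 16);
        zb->object = strtoull(end + 1, &end, 16);
        zb->level = (int64_t)strtoull(end + 1, &end, 16);
        zb->blkid = strtoull(end + 1, &end, 16);
    }

    int
    main(void)
    {
        struct bookmark zb = { 0x36, 0x1d, 0x0, 0x4f2 }, out;
        char name[64];

        bookmark_to_name(&zb, name, sizeof (name));
        name_to_bookmark(name, &out);
        printf("%s -> objset %" PRIx64 " blkid %" PRIx64 "\n",
            name, out.objset, out.blkid);
        return (0);
    }

Run as-is it prints the same hex key format that spa_log_error() writes and spa_get_errlog() copies back out to userland.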
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_history.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -22,7 +22,8 @@
/*
* Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
*/
#include <sys/spa.h>
@@ -91,7 +92,7 @@
ASSERT(spa->spa_history == 0);
spa->spa_history = dmu_object_alloc(mos, DMU_OT_SPA_HISTORY,
- SPA_MAXBLOCKSIZE, DMU_OT_SPA_HISTORY_OFFSETS,
+ SPA_OLD_MAXBLOCKSIZE, DMU_OT_SPA_HISTORY_OFFSETS,
sizeof (spa_history_phys_t), tx);
VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
@@ -325,7 +326,7 @@
/* Kick this off asynchronously; errors are ignored. */
dsl_sync_task_nowait(spa_get_dsl(spa), spa_history_log_sync,
- nvarg, 0, tx);
+ nvarg, 0, ZFS_SPACE_CHECK_NONE, tx);
dmu_tx_commit(tx);
/* spa_history_log_sync will free nvl */
@@ -466,7 +467,7 @@
spa_history_log_sync(nvl, tx);
} else {
dsl_sync_task_nowait(spa_get_dsl(spa),
- spa_history_log_sync, nvl, 0, tx);
+ spa_history_log_sync, nvl, 0, ZFS_SPACE_CHECK_NONE, tx);
}
/* spa_history_log_sync() will free nvl */
}
@@ -501,7 +502,7 @@
dmu_tx_t *tx, const char *fmt, ...)
{
va_list adx;
- char namebuf[MAXNAMELEN];
+ char namebuf[ZFS_MAX_DATASET_NAME_LEN];
nvlist_t *nvl = fnvlist_alloc();
ASSERT(tx != NULL);
@@ -520,7 +521,7 @@
dmu_tx_t *tx, const char *fmt, ...)
{
va_list adx;
- char namebuf[MAXNAMELEN];
+ char namebuf[ZFS_MAX_DATASET_NAME_LEN];
nvlist_t *nvl = fnvlist_alloc();
ASSERT(tx != NULL);
@@ -528,7 +529,7 @@
dsl_dir_name(dd, namebuf);
fnvlist_add_string(nvl, ZPOOL_HIST_DSNAME, namebuf);
fnvlist_add_uint64(nvl, ZPOOL_HIST_DSID,
- dd->dd_phys->dd_head_dataset_obj);
+ dsl_dir_phys(dd)->dd_head_dataset_obj);
va_start(adx, fmt);
log_internal(nvl, operation, dd->dd_pool->dp_spa, tx, fmt, adx);
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,9 +21,12 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
- * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
* Copyright 2013 Martin Matuska <mm at FreeBSD.org>. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
*/
#include <sys/zfs_context.h>
@@ -37,6 +40,7 @@
#include <sys/zap.h>
#include <sys/zil.h>
#include <sys/vdev_impl.h>
+#include <sys/vdev_file.h>
#include <sys/metaslab.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
@@ -51,7 +55,7 @@
#include <sys/arc.h>
#include <sys/ddt.h>
#include "zfs_prop.h"
-#include "zfeature_common.h"
+#include <sys/zfeature.h>
/*
* SPA locking
@@ -244,37 +248,111 @@
#else
int zfs_flags = 0;
#endif
+SYSCTL_DECL(_debug);
+TUNABLE_INT("debug.zfs_flags", &zfs_flags);
+SYSCTL_INT(_debug, OID_AUTO, zfs_flags, CTLFLAG_RWTUN, &zfs_flags, 0,
+ "ZFS debug flags.");
/*
* zfs_recover can be set to nonzero to attempt to recover from
* otherwise-fatal errors, typically caused by on-disk corruption. When
* set, calls to zfs_panic_recover() will turn into warning messages.
+ * This should only be used as a last resort, as it typically results
+ * in leaked space, or worse.
*/
-int zfs_recover = 0;
+boolean_t zfs_recover = B_FALSE;
SYSCTL_DECL(_vfs_zfs);
TUNABLE_INT("vfs.zfs.recover", &zfs_recover);
-SYSCTL_INT(_vfs_zfs, OID_AUTO, recover, CTLFLAG_RDTUN, &zfs_recover, 0,
+SYSCTL_INT(_vfs_zfs, OID_AUTO, recover, CTLFLAG_RWTUN, &zfs_recover, 0,
"Try to recover from otherwise-fatal errors.");
-extern int zfs_txg_synctime_ms;
+static int
+sysctl_vfs_zfs_debug_flags(SYSCTL_HANDLER_ARGS)
+{
+ int err, val;
+ val = zfs_flags;
+ err = sysctl_handle_int(oidp, &val, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+
+ /*
+ * ZFS_DEBUG_MODIFY must be enabled prior to boot so all
+ * arc buffers in the system have the necessary additional
+ * checksum data. However, it is safe to disable at any
+ * time.
+ */
+ if (!(zfs_flags & ZFS_DEBUG_MODIFY))
+ val &= ~ZFS_DEBUG_MODIFY;
+ zfs_flags = val;
+
+ return (0);
+}
+TUNABLE_INT("vfs.zfs.debugflags", &zfs_flags);
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, debugflags,
+ CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(int),
+ sysctl_vfs_zfs_debug_flags, "IU", "Debug flags for ZFS testing.");
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, debug_flags,
+ CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(int),
+ sysctl_vfs_zfs_debug_flags, "IU",
+ "Debug flags for ZFS testing (deprecated, see vfs.zfs.debugflags).");
+
/*
- * Expiration time in units of zfs_txg_synctime_ms. This value has two
- * meanings. First it is used to determine when the spa_deadman logic
- * should fire. By default the spa_deadman will fire if spa_sync has
- * not completed in 1000 * zfs_txg_synctime_ms (i.e. 1000 seconds).
- * Secondly, the value determines if an I/O is considered "hung".
- * Any I/O that has not completed in zfs_deadman_synctime is considered
- * "hung" resulting in a system panic.
- * 1000 zfs_txg_synctime_ms (i.e. 1000 seconds).
+ * If destroy encounters an EIO while reading metadata (e.g. indirect
+ * blocks), space referenced by the missing metadata can not be freed.
+ * Normally this causes the background destroy to become "stalled", as
+ * it is unable to make forward progress. While in this stalled state,
+ * all remaining space to free from the error-encountering filesystem is
+ * "temporarily leaked". Set this flag to cause it to ignore the EIO,
+ * permanently leak the space from indirect blocks that can not be read,
+ * and continue to free everything else that it can.
+ *
+ * The default, "stalling" behavior is useful if the storage partially
+ * fails (i.e. some but not all i/os fail), and then later recovers. In
+ * this case, we will be able to continue pool operations while it is
+ * partially failed, and when it recovers, we can continue to free the
+ * space, with no leaks. However, note that this case is actually
+ * fairly rare.
+ *
+ * Typically pools either (a) fail completely (but perhaps temporarily,
+ * e.g. a top-level vdev going offline), or (b) have localized,
+ * permanent errors (e.g. disk returns the wrong data due to bit flip or
+ * firmware bug). In case (a), this setting does not matter because the
+ * pool will be suspended and the sync thread will not be able to make
+ * forward progress regardless. In case (b), because the error is
+ * permanent, the best we can do is leak the minimum amount of space,
+ * which is what setting this flag will do. Therefore, it is reasonable
+ * for this flag to normally be set, but we chose the more conservative
+ * approach of not setting it, so that there is no possibility of
+ * leaking space in the "partial temporary" failure case.
*/
-uint64_t zfs_deadman_synctime = 1000ULL;
-TUNABLE_QUAD("vfs.zfs.deadman_synctime", &zfs_deadman_synctime);
-SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, deadman_synctime, CTLFLAG_RDTUN,
- &zfs_deadman_synctime, 0,
- "Stalled ZFS I/O expiration time in units of vfs.zfs.txg_synctime_ms");
+boolean_t zfs_free_leak_on_eio = B_FALSE;
/*
+ * Expiration time in milliseconds. This value has two meanings. First it is
+ * used to determine when the spa_deadman() logic should fire. By default the
+ * spa_deadman() will fire if spa_sync() has not completed in 1000 seconds.
+ * Secondly, the value determines if an I/O is considered "hung". Any I/O that
+ * has not completed in zfs_deadman_synctime_ms is considered "hung" resulting
+ * in a system panic.
+ */
+uint64_t zfs_deadman_synctime_ms = 1000000ULL;
+TUNABLE_QUAD("vfs.zfs.deadman_synctime_ms", &zfs_deadman_synctime_ms);
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, deadman_synctime_ms, CTLFLAG_RDTUN,
+ &zfs_deadman_synctime_ms, 0,
+ "Stalled ZFS I/O expiration time in milliseconds");
+
+/*
+ * Check time in milliseconds. This defines the frequency at which we check
+ * for hung I/O.
+ */
+uint64_t zfs_deadman_checktime_ms = 5000ULL;
+TUNABLE_QUAD("vfs.zfs.deadman_checktime_ms", &zfs_deadman_checktime_ms);
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, deadman_checktime_ms, CTLFLAG_RDTUN,
+ &zfs_deadman_checktime_ms, 0,
+ "Period of checks for stalled ZFS I/O in milliseconds");
+
+/*
* Default value of -1 for zfs_deadman_enabled is resolved in
* zfs_deadman_init()
*/
@@ -283,6 +361,20 @@
SYSCTL_INT(_vfs_zfs, OID_AUTO, deadman_enabled, CTLFLAG_RDTUN,
&zfs_deadman_enabled, 0, "Kernel panic on stalled ZFS I/O");
+/*
+ * The worst case is single-sector max-parity RAID-Z blocks, in which
+ * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1)
+ * times the size; so just assume that. Add to this the fact that
+ * we can have up to 3 DVAs per bp, and one more factor of 2 because
+ * the block may be dittoed with up to 3 DVAs by ddt_sync(). All together,
+ * the worst case is:
+ * (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2 == 24
+ */
+int spa_asize_inflation = 24;
+TUNABLE_INT("vfs.zfs.spa_asize_inflation", &spa_asize_inflation);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_asize_inflation, CTLFLAG_RWTUN,
+ &spa_asize_inflation, 0, "Worst case inflation factor for single sector writes");
+
#ifndef illumos
#ifdef _KERNEL
static void
@@ -304,6 +396,43 @@
#endif /* !illumos */
/*
+ * Normally, we don't allow the last 3.2% (1/(2^spa_slop_shift)) of space in
+ * the pool to be consumed. This ensures that we don't run the pool
+ * completely out of space, due to unaccounted changes (e.g. to the MOS).
+ * It also limits the worst-case time to allocate space. If we have
+ * less than this amount of free space, most ZPL operations (e.g. write,
+ * create) will return ENOSPC.
+ *
+ * Certain operations (e.g. file removal, most administrative actions) can
+ * use half the slop space. They will only return ENOSPC if less than half
+ * the slop space is free. Typically, once the pool has less than the slop
+ * space free, the user will use these operations to free up space in the pool.
+ * These are the operations that call dsl_pool_adjustedsize() with the netfree
+ * argument set to TRUE.
+ *
+ * A very restricted set of operations are always permitted, regardless of
+ * the amount of free space. These are the operations that call
+ * dsl_sync_task(ZFS_SPACE_CHECK_NONE), e.g. "zfs destroy". If these
+ * operations result in a net increase in the amount of space used,
+ * it is possible to run the pool completely out of space, causing it to
+ * be permanently read-only.
+ *
+ * Note that on very small pools, the slop space will be larger than
+ * 3.2%, in an effort to have it be at least spa_min_slop (128MB),
+ * but we never allow it to be more than half the pool size.
+ *
+ * See also the comments in zfs_space_check_t.
+ */
+int spa_slop_shift = 5;
+SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_slop_shift, CTLFLAG_RWTUN,
+ &spa_slop_shift, 0,
+ "Shift value of reserved space (1/(2^spa_slop_shift)).");
+uint64_t spa_min_slop = 128 * 1024 * 1024;
+SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, spa_min_slop, CTLFLAG_RWTUN,
+ &spa_min_slop, 0,
+ "Minimal value of reserved space");
+
+/*
* ==========================================================================
* SPA config locking
* ==========================================================================
@@ -345,7 +474,8 @@
if (rw == RW_READER) {
if (scl->scl_writer || scl->scl_write_wanted) {
mutex_exit(&scl->scl_lock);
- spa_config_exit(spa, locks ^ (1 << i), tag);
+ spa_config_exit(spa, locks & ((1 << i) - 1),
+ tag);
return (0);
}
} else {
@@ -352,7 +482,8 @@
ASSERT(scl->scl_writer != curthread);
if (!refcount_is_zero(&scl->scl_count)) {
mutex_exit(&scl->scl_lock);
- spa_config_exit(spa, locks ^ (1 << i), tag);
+ spa_config_exit(spa, locks & ((1 << i) - 1),
+ tag);
return (0);
}
scl->scl_writer = curthread;
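
The spa_config_tryenter() change just above releases only the locks actually acquired before the failing index, instead of every requested lock except the failing one. A minimal demonstration of the difference between the two masks, with made-up values:

    #include <stdio.h>

    int
    main(void)
    {
        int locks = 0x15;   /* hypothetical request: locks 0, 2 and 4 */
        int i = 2;          /* acquisition failed at lock 2 */

        printf("old mask (locks ^ (1 << i))       = 0x%x\n",
            locks ^ (1 << i));
        printf("new mask (locks & ((1 << i) - 1)) = 0x%x\n",
            locks & ((1 << i) - 1));
        /*
         * The old mask (0x11) still includes lock 4, which was never
         * acquired; the new mask (0x1) releases only lock 0, the one
         * actually held when the try failed.
         */
        return (0);
    }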
@@ -458,7 +589,7 @@
* If it's a full dataset name, figure out the pool name and
* just use that.
*/
- cp = strpbrk(search.spa_name, "/@");
+ cp = strpbrk(search.spa_name, "/@#");
if (cp != NULL)
*cp = '\0';
@@ -472,18 +603,46 @@
* If the zfs_deadman_enabled flag is set then it inspects all vdev queues
* looking for potentially hung I/Os.
*/
-void
-spa_deadman(void *arg)
+static void
+spa_deadman(void *arg, int pending)
{
spa_t *spa = arg;
+ /*
+ * Disable the deadman timer if the pool is suspended.
+ */
+ if (spa_suspended(spa)) {
+#ifdef illumos
+ VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY));
+#else
+ /* Nothing; just don't schedule any future callouts. */
+#endif
+ return;
+ }
+
zfs_dbgmsg("slow spa_sync: started %llu seconds ago, calls %llu",
(gethrtime() - spa->spa_sync_starttime) / NANOSEC,
++spa->spa_deadman_calls);
if (zfs_deadman_enabled)
vdev_deadman(spa->spa_root_vdev);
+#ifdef __FreeBSD__
+#ifdef _KERNEL
+ callout_schedule(&spa->spa_deadman_cycid,
+ hz * zfs_deadman_checktime_ms / MILLISEC);
+#endif
+#endif
}
+#if defined(__FreeBSD__) && defined(_KERNEL)
+static void
+spa_deadman_timeout(void *arg)
+{
+ spa_t *spa = arg;
+
+ taskqueue_enqueue(taskqueue_thread, &spa->spa_deadman_task);
+}
+#endif
+
/*
* Create an uninitialized spa_t with the given name. Requires
* spa_namespace_lock. The caller must ensure that the spa_t doesn't already
@@ -506,14 +665,18 @@
mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_evicting_os_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_cksum_tmpls_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa->spa_alloc_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL);
cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL);
cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL);
cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL);
@@ -535,8 +698,7 @@
hdlr.cyh_level = CY_LOW_LEVEL;
#endif
- spa->spa_deadman_synctime = zfs_deadman_synctime *
- zfs_txg_synctime_ms * MICROSEC;
+ spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms);
#ifdef illumos
/*
@@ -543,9 +705,9 @@
* This determines how often we need to check for hung I/Os after
* the cyclic has already fired. Since checking for hung I/Os is
* an expensive operation we don't want to check too frequently.
- * Instead wait for 5 synctimes before checking again.
+ * Instead wait for 5 seconds before checking again.
*/
- when.cyt_interval = 5ULL * zfs_txg_synctime_ms * MICROSEC;
+ when.cyt_interval = MSEC2NSEC(zfs_deadman_checktime_ms);
when.cyt_when = CY_INFINITY;
mutex_enter(&cpu_lock);
spa->spa_deadman_cycid = cyclic_add(&hdlr, &when);
@@ -552,7 +714,23 @@
mutex_exit(&cpu_lock);
#else /* !illumos */
#ifdef _KERNEL
- callout_init(&spa->spa_deadman_cycid, CALLOUT_MPSAFE);
+ /*
+ * callout(9) does not provide a way to initialize a callout with
+ * a function and an argument, so we use callout_reset() to schedule
+ * the callout in the very distant future. Even if that event ever
+ * fires, it should be okay as we won't have any active zio-s.
+ * But normally spa_sync() will reschedule the callout with a proper
+ * timeout.
+ * callout(9) does not allow the callback function to sleep but
+ * vdev_deadman() needs to acquire vq_lock and illumos mutexes are
+ * emulated using sx(9). For this reason spa_deadman_timeout()
+ * will schedule spa_deadman() as task on a taskqueue that allows
+ * sleeping.
+ */
+ TASK_INIT(&spa->spa_deadman_task, 0, spa_deadman, spa);
+ callout_init(&spa->spa_deadman_cycid, 1);
+ callout_reset_sbt(&spa->spa_deadman_cycid, SBT_MAX, 0,
+ spa_deadman_timeout, spa, 0);
#endif
#endif
refcount_create(&spa->spa_refcount);
@@ -568,6 +746,9 @@
spa_active_count++;
}
+ avl_create(&spa->spa_alloc_tree, zio_timestamp_compare,
+ sizeof (zio_t), offsetof(zio_t, io_alloc_node));
+
/*
* Every pool starts with the default cachefile
*/
@@ -600,6 +781,18 @@
spa->spa_debug = ((zfs_flags & ZFS_DEBUG_SPA) != 0);
+ spa->spa_min_ashift = INT_MAX;
+ spa->spa_max_ashift = 0;
+
+ /*
+ * As a pool is being created, treat all features as disabled by
+ * setting SPA_FEATURE_DISABLED for all entries in the feature
+ * refcount cache.
+ */
+ for (int i = 0; i < SPA_FEATURES; i++) {
+ spa->spa_feat_refcount_cache[i] = SPA_FEATURE_DISABLED;
+ }
+
return (spa);
}
@@ -615,6 +808,7 @@
ASSERT(MUTEX_HELD(&spa_namespace_lock));
ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
+ ASSERT3U(refcount_count(&spa->spa_refcount), ==, 0);
nvlist_free(spa->spa_config_splitting);
@@ -633,6 +827,7 @@
kmem_free(dp, sizeof (spa_config_dirent_t));
}
+ avl_destroy(&spa->spa_alloc_tree);
list_destroy(&spa->spa_config_list);
nvlist_free(spa->spa_label_features);
@@ -648,6 +843,7 @@
#else /* !illumos */
#ifdef _KERNEL
callout_drain(&spa->spa_deadman_cycid);
+ taskqueue_drain(taskqueue_thread, &spa->spa_deadman_task);
#endif
#endif
@@ -658,17 +854,23 @@
for (int t = 0; t < TXG_SIZE; t++)
bplist_destroy(&spa->spa_free_bplist[t]);
+ zio_checksum_templates_free(spa);
+
cv_destroy(&spa->spa_async_cv);
+ cv_destroy(&spa->spa_evicting_os_cv);
cv_destroy(&spa->spa_proc_cv);
cv_destroy(&spa->spa_scrub_io_cv);
cv_destroy(&spa->spa_suspend_cv);
+ mutex_destroy(&spa->spa_alloc_lock);
mutex_destroy(&spa->spa_async_lock);
mutex_destroy(&spa->spa_errlist_lock);
mutex_destroy(&spa->spa_errlog_lock);
+ mutex_destroy(&spa->spa_evicting_os_lock);
mutex_destroy(&spa->spa_history_lock);
mutex_destroy(&spa->spa_proc_lock);
mutex_destroy(&spa->spa_props_lock);
+ mutex_destroy(&spa->spa_cksum_tmpls_lock);
mutex_destroy(&spa->spa_scrub_lock);
mutex_destroy(&spa->spa_suspend_lock);
mutex_destroy(&spa->spa_vdev_top_lock);
@@ -722,6 +924,20 @@
}
/*
+ * Remove a reference to the given spa_t held by a dsl dir that is
+ * being asynchronously released. Async releases occur from a taskq
+ * performing eviction of dsl datasets and dirs. The namespace lock
+ * isn't held and the hold by the object being evicted may contribute to
+ * spa_minref (e.g. dataset or directory released during pool export),
+ * so the asserts in spa_close() do not apply.
+ */
+void
+spa_async_close(spa_t *spa, void *tag)
+{
+ (void) refcount_remove(&spa->spa_refcount, tag);
+}
+
+/*
* Check to see if the spa refcount is zero. Must be called with
* spa_namespace_lock held. We really compare against spa_minref, which is the
* number of references acquired when opening a pool
@@ -1049,7 +1265,7 @@
txg_wait_synced(spa->spa_dsl_pool, txg);
if (vd != NULL) {
- ASSERT(!vd->vdev_detached || vd->vdev_dtl_smo.smo_object == 0);
+ ASSERT(!vd->vdev_detached || vd->vdev_dtl_sm == NULL);
spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
vdev_free(vd);
spa_config_exit(spa, SCL_ALL, spa);
@@ -1157,17 +1373,27 @@
*/
void
-spa_activate_mos_feature(spa_t *spa, const char *feature)
+spa_activate_mos_feature(spa_t *spa, const char *feature, dmu_tx_t *tx)
{
- (void) nvlist_add_boolean(spa->spa_label_features, feature);
- vdev_config_dirty(spa->spa_root_vdev);
+ if (!nvlist_exists(spa->spa_label_features, feature)) {
+ fnvlist_add_boolean(spa->spa_label_features, feature);
+ /*
+ * When we are creating the pool (tx_txg==TXG_INITIAL), we can't
+ * dirty the vdev config because lock SCL_CONFIG is not held.
+ * Thankfully, in this case we don't need to dirty the config
+ * because it will be written out anyway when we finish
+ * creating the pool.
+ */
+ if (tx->tx_txg != TXG_INITIAL)
+ vdev_config_dirty(spa->spa_root_vdev);
+ }
}
void
spa_deactivate_mos_feature(spa_t *spa, const char *feature)
{
- (void) nvlist_remove_all(spa->spa_label_features, feature);
- vdev_config_dirty(spa->spa_root_vdev);
+ if (nvlist_remove_all(spa->spa_label_features, feature) == 0)
+ vdev_config_dirty(spa->spa_root_vdev);
}
/*
@@ -1318,7 +1544,7 @@
}
void
-sprintf_blkptr(char *buf, const blkptr_t *bp)
+snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp)
{
char type[256];
char *checksum = NULL;
@@ -1336,11 +1562,15 @@
(void) strlcpy(type, dmu_ot[BP_GET_TYPE(bp)].ot_name,
sizeof (type));
}
- checksum = zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name;
+ if (!BP_IS_EMBEDDED(bp)) {
+ checksum =
+ zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name;
+ }
compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name;
}
- SPRINTF_BLKPTR(snprintf, ' ', buf, bp, type, checksum, compress);
+ SNPRINTF_BLKPTR(snprintf, ' ', buf, buflen, bp, type, checksum,
+ compress);
}
void
@@ -1535,17 +1765,24 @@
uint64_t
spa_get_asize(spa_t *spa, uint64_t lsize)
{
- /*
- * The worst case is single-sector max-parity RAID-Z blocks, in which
- * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1)
- * times the size; so just assume that. Add to this the fact that
- * we can have up to 3 DVAs per bp, and one more factor of 2 because
- * the block may be dittoed with up to 3 DVAs by ddt_sync().
- */
- return (lsize * (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2);
+ return (lsize * spa_asize_inflation);
}
+/*
+ * Return the amount of slop space in bytes. It is 1/32 of the pool (3.2%),
+ * or at least 128MB, unless that would cause it to be more than half the
+ * pool size.
+ *
+ * See the comment above spa_slop_shift for details.
+ */
uint64_t
+spa_get_slop_space(spa_t *spa)
+{
+ uint64_t space = spa_get_dspace(spa);
+ return (MAX(space >> spa_slop_shift, MIN(space >> 1, spa_min_slop)));
+}
+
+uint64_t
spa_get_dspace(spa_t *spa)
{
return (spa->spa_dspace);
@@ -1598,6 +1835,34 @@
return (spa->spa_log_class);
}
+void
+spa_evicting_os_register(spa_t *spa, objset_t *os)
+{
+ mutex_enter(&spa->spa_evicting_os_lock);
+ list_insert_head(&spa->spa_evicting_os_list, os);
+ mutex_exit(&spa->spa_evicting_os_lock);
+}
+
+void
+spa_evicting_os_deregister(spa_t *spa, objset_t *os)
+{
+ mutex_enter(&spa->spa_evicting_os_lock);
+ list_remove(&spa->spa_evicting_os_list, os);
+ cv_broadcast(&spa->spa_evicting_os_cv);
+ mutex_exit(&spa->spa_evicting_os_lock);
+}
+
+void
+spa_evicting_os_wait(spa_t *spa)
+{
+ mutex_enter(&spa->spa_evicting_os_lock);
+ while (!list_is_empty(&spa->spa_evicting_os_list))
+ cv_wait(&spa->spa_evicting_os_cv, &spa->spa_evicting_os_lock);
+ mutex_exit(&spa->spa_evicting_os_lock);
+
+ dmu_buf_user_evict_wait();
+}
+
int
spa_max_replication(spa_t *spa)
{
@@ -1632,7 +1897,13 @@
ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
if (asize != 0 && spa->spa_deflate) {
- vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
+ uint64_t vdev = DVA_GET_VDEV(dva);
+ vdev_t *vd = vdev_lookup_top(spa, vdev);
+ if (vd == NULL) {
+ panic(
+ "dva_get_dsize_sync(): bad DVA %llu:%llu",
+ (u_longlong_t)vdev, (u_longlong_t)asize);
+ }
dsize = (asize >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio;
}
@@ -1644,7 +1915,7 @@
{
uint64_t dsize = 0;
- for (int d = 0; d < SPA_DVAS_PER_BP; d++)
+ for (int d = 0; d < BP_GET_NDVAS(bp); d++)
dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
return (dsize);
@@ -1657,7 +1928,7 @@
spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
- for (int d = 0; d < SPA_DVAS_PER_BP; d++)
+ for (int d = 0; d < BP_GET_NDVAS(bp); d++)
dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]);
spa_config_exit(spa, SCL_VDEV, FTAG);
@@ -1698,6 +1969,10 @@
spa_config_load();
}
+#ifdef _KERNEL
+EVENTHANDLER_DEFINE(mountroot, spa_boot_init, NULL, 0);
+#endif
+
void
spa_init(int mode)
{
@@ -1734,11 +2009,13 @@
#endif /* illumos */
refcount_sysinit();
unique_init();
- space_map_init();
+ range_tree_init();
zio_init();
+ lz4_init();
dmu_init();
zil_init();
vdev_cache_stat_init();
+ vdev_file_init();
zfs_prop_init();
zpool_prop_init();
zpool_feature_init();
@@ -1758,11 +2035,13 @@
spa_evict_all();
+ vdev_file_fini();
vdev_cache_stat_fini();
zil_fini();
dmu_fini();
+ lz4_fini();
zio_fini();
- space_map_fini();
+ range_tree_fini();
unique_fini();
refcount_fini();
@@ -1811,6 +2090,16 @@
return (!!(spa->spa_mode & FWRITE));
}
+/*
+ * Returns true if there is a pending sync task in any of the current
+ * syncing txg, the current quiescing txg, or the current open txg.
+ */
+boolean_t
+spa_has_pending_synctask(spa_t *spa)
+{
+ return (!txg_all_lists_empty(&spa->spa_dsl_pool->dp_sync_tasks));
+}
+
int
spa_mode(spa_t *spa)
{
@@ -1888,3 +2177,12 @@
{
return (spa->spa_debug);
}
+
+int
+spa_maxblocksize(spa_t *spa)
+{
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS))
+ return (SPA_MAXBLOCKSIZE);
+ else
+ return (SPA_OLD_MAXBLOCKSIZE);
+}
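
spa_get_slop_space() above reserves MAX(space >> spa_slop_shift, MIN(space >> 1, spa_min_slop)) bytes. A worked sketch of that formula for a few arbitrary pool sizes, using the defaults set in this file (spa_slop_shift = 5, spa_min_slop = 128MB):

    #include <inttypes.h>
    #include <stdio.h>

    #define MAX(a, b)   ((a) > (b) ? (a) : (b))
    #define MIN(a, b)   ((a) < (b) ? (a) : (b))

    int
    main(void)
    {
        int spa_slop_shift = 5;                          /* 1/32 ~= 3.2% */
        uint64_t spa_min_slop = 128ULL * 1024 * 1024;    /* 128MB floor */
        uint64_t pools[] = {
            64ULL << 20,    /* 64MB pool: capped at half the pool */
            1ULL << 30,     /* 1GB pool: the 128MB floor applies */
            10ULL << 40,    /* 10TB pool: 1/32 of the pool wins */
        };

        for (int i = 0; i < 3; i++) {
            uint64_t space = pools[i];
            uint64_t slop = MAX(space >> spa_slop_shift,
                MIN(space >> 1, spa_min_slop));
            printf("pool %14" PRIu64 " bytes -> slop %12" PRIu64 " bytes\n",
                space, slop);
        }
        return (0);
    }

The tiny pool is capped at half its size, the mid-sized pool hits the 128MB floor, and the large pool reserves roughly 3.2%, matching the comment added above spa_slop_shift.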
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -24,323 +24,66 @@
* Use is subject to license terms.
*/
/*
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/dnode.h>
+#include <sys/dsl_pool.h>
#include <sys/zio.h>
#include <sys/space_map.h>
+#include <sys/refcount.h>
+#include <sys/zfeature.h>
-static kmem_cache_t *space_seg_cache;
+SYSCTL_DECL(_vfs_zfs);
-void
-space_map_init(void)
-{
- ASSERT(space_seg_cache == NULL);
- space_seg_cache = kmem_cache_create("space_seg_cache",
- sizeof (space_seg_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
-}
-
-void
-space_map_fini(void)
-{
- kmem_cache_destroy(space_seg_cache);
- space_seg_cache = NULL;
-}
-
/*
- * Space map routines.
- * NOTE: caller is responsible for all locking.
+ * The data for a given space map can be kept on blocks of any size.
+ * Larger blocks entail fewer i/o operations, but they also cause the
+ * DMU to keep more data in-core, and also to waste more i/o bandwidth
+ * when only a few blocks have changed since the last transaction group.
*/
-static int
-space_map_seg_compare(const void *x1, const void *x2)
-{
- const space_seg_t *s1 = x1;
- const space_seg_t *s2 = x2;
+int space_map_blksz = (1 << 12);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, space_map_blksz, CTLFLAG_RDTUN, &space_map_blksz, 0,
+ "Maximum block size for space map. Must be power of 2 and greater than 4096.");
- if (s1->ss_start < s2->ss_start) {
- if (s1->ss_end > s2->ss_start)
- return (0);
- return (-1);
- }
- if (s1->ss_start > s2->ss_start) {
- if (s1->ss_start < s2->ss_end)
- return (0);
- return (1);
- }
- return (0);
-}
-
-void
-space_map_create(space_map_t *sm, uint64_t start, uint64_t size, uint8_t shift,
- kmutex_t *lp)
-{
- bzero(sm, sizeof (*sm));
-
- cv_init(&sm->sm_load_cv, NULL, CV_DEFAULT, NULL);
-
- avl_create(&sm->sm_root, space_map_seg_compare,
- sizeof (space_seg_t), offsetof(struct space_seg, ss_node));
-
- sm->sm_start = start;
- sm->sm_size = size;
- sm->sm_shift = shift;
- sm->sm_lock = lp;
-}
-
-void
-space_map_destroy(space_map_t *sm)
-{
- ASSERT(!sm->sm_loaded && !sm->sm_loading);
- VERIFY0(sm->sm_space);
- avl_destroy(&sm->sm_root);
- cv_destroy(&sm->sm_load_cv);
-}
-
-void
-space_map_add(space_map_t *sm, uint64_t start, uint64_t size)
-{
- avl_index_t where;
- space_seg_t *ss_before, *ss_after, *ss;
- uint64_t end = start + size;
- int merge_before, merge_after;
-
- ASSERT(MUTEX_HELD(sm->sm_lock));
- VERIFY(!sm->sm_condensing);
- VERIFY(size != 0);
- VERIFY3U(start, >=, sm->sm_start);
- VERIFY3U(end, <=, sm->sm_start + sm->sm_size);
- VERIFY(sm->sm_space + size <= sm->sm_size);
- VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0);
- VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0);
-
- ss = space_map_find(sm, start, size, &where);
- if (ss != NULL) {
- zfs_panic_recover("zfs: allocating allocated segment"
- "(offset=%llu size=%llu)\n",
- (longlong_t)start, (longlong_t)size);
- return;
- }
-
- /* Make sure we don't overlap with either of our neighbors */
- VERIFY(ss == NULL);
-
- ss_before = avl_nearest(&sm->sm_root, where, AVL_BEFORE);
- ss_after = avl_nearest(&sm->sm_root, where, AVL_AFTER);
-
- merge_before = (ss_before != NULL && ss_before->ss_end == start);
- merge_after = (ss_after != NULL && ss_after->ss_start == end);
-
- if (merge_before && merge_after) {
- avl_remove(&sm->sm_root, ss_before);
- if (sm->sm_pp_root) {
- avl_remove(sm->sm_pp_root, ss_before);
- avl_remove(sm->sm_pp_root, ss_after);
- }
- ss_after->ss_start = ss_before->ss_start;
- kmem_cache_free(space_seg_cache, ss_before);
- ss = ss_after;
- } else if (merge_before) {
- ss_before->ss_end = end;
- if (sm->sm_pp_root)
- avl_remove(sm->sm_pp_root, ss_before);
- ss = ss_before;
- } else if (merge_after) {
- ss_after->ss_start = start;
- if (sm->sm_pp_root)
- avl_remove(sm->sm_pp_root, ss_after);
- ss = ss_after;
- } else {
- ss = kmem_cache_alloc(space_seg_cache, KM_SLEEP);
- ss->ss_start = start;
- ss->ss_end = end;
- avl_insert(&sm->sm_root, ss, where);
- }
-
- if (sm->sm_pp_root)
- avl_add(sm->sm_pp_root, ss);
-
- sm->sm_space += size;
-}
-
-void
-space_map_remove(space_map_t *sm, uint64_t start, uint64_t size)
-{
-#ifdef illumos
- avl_index_t where;
-#endif
- space_seg_t *ss, *newseg;
- uint64_t end = start + size;
- int left_over, right_over;
-
- VERIFY(!sm->sm_condensing);
-#ifdef illumos
- ss = space_map_find(sm, start, size, &where);
-#else
- ss = space_map_find(sm, start, size, NULL);
-#endif
-
- /* Make sure we completely overlap with someone */
- if (ss == NULL) {
- zfs_panic_recover("zfs: freeing free segment "
- "(offset=%llu size=%llu)",
- (longlong_t)start, (longlong_t)size);
- return;
- }
- VERIFY3U(ss->ss_start, <=, start);
- VERIFY3U(ss->ss_end, >=, end);
- VERIFY(sm->sm_space - size < sm->sm_size);
-
- left_over = (ss->ss_start != start);
- right_over = (ss->ss_end != end);
-
- if (sm->sm_pp_root)
- avl_remove(sm->sm_pp_root, ss);
-
- if (left_over && right_over) {
- newseg = kmem_cache_alloc(space_seg_cache, KM_SLEEP);
- newseg->ss_start = end;
- newseg->ss_end = ss->ss_end;
- ss->ss_end = start;
- avl_insert_here(&sm->sm_root, newseg, ss, AVL_AFTER);
- if (sm->sm_pp_root)
- avl_add(sm->sm_pp_root, newseg);
- } else if (left_over) {
- ss->ss_end = start;
- } else if (right_over) {
- ss->ss_start = end;
- } else {
- avl_remove(&sm->sm_root, ss);
- kmem_cache_free(space_seg_cache, ss);
- ss = NULL;
- }
-
- if (sm->sm_pp_root && ss != NULL)
- avl_add(sm->sm_pp_root, ss);
-
- sm->sm_space -= size;
-}
-
-space_seg_t *
-space_map_find(space_map_t *sm, uint64_t start, uint64_t size,
- avl_index_t *wherep)
-{
- space_seg_t ssearch, *ss;
-
- ASSERT(MUTEX_HELD(sm->sm_lock));
- VERIFY(size != 0);
- VERIFY(P2PHASE(start, 1ULL << sm->sm_shift) == 0);
- VERIFY(P2PHASE(size, 1ULL << sm->sm_shift) == 0);
-
- ssearch.ss_start = start;
- ssearch.ss_end = start + size;
- ss = avl_find(&sm->sm_root, &ssearch, wherep);
-
- if (ss != NULL && ss->ss_start <= start && ss->ss_end >= start + size)
- return (ss);
- return (NULL);
-}
-
-boolean_t
-space_map_contains(space_map_t *sm, uint64_t start, uint64_t size)
-{
- avl_index_t where;
-
- return (space_map_find(sm, start, size, &where) != 0);
-}
-
-void
-space_map_swap(space_map_t **msrc, space_map_t **mdst)
-{
- space_map_t *sm;
-
- ASSERT(MUTEX_HELD((*msrc)->sm_lock));
- ASSERT0((*mdst)->sm_space);
- ASSERT0(avl_numnodes(&(*mdst)->sm_root));
-
- sm = *msrc;
- *msrc = *mdst;
- *mdst = sm;
-}
-
-void
-space_map_vacate(space_map_t *sm, space_map_func_t *func, space_map_t *mdest)
-{
- space_seg_t *ss;
- void *cookie = NULL;
-
- ASSERT(MUTEX_HELD(sm->sm_lock));
-
- while ((ss = avl_destroy_nodes(&sm->sm_root, &cookie)) != NULL) {
- if (func != NULL)
- func(mdest, ss->ss_start, ss->ss_end - ss->ss_start);
- kmem_cache_free(space_seg_cache, ss);
- }
- sm->sm_space = 0;
-}
-
-void
-space_map_walk(space_map_t *sm, space_map_func_t *func, space_map_t *mdest)
-{
- space_seg_t *ss;
-
- ASSERT(MUTEX_HELD(sm->sm_lock));
-
- for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
- func(mdest, ss->ss_start, ss->ss_end - ss->ss_start);
-}
-
/*
- * Wait for any in-progress space_map_load() to complete.
- */
-void
-space_map_load_wait(space_map_t *sm)
-{
- ASSERT(MUTEX_HELD(sm->sm_lock));
-
- while (sm->sm_loading) {
- ASSERT(!sm->sm_loaded);
- cv_wait(&sm->sm_load_cv, sm->sm_lock);
- }
-}
-
-/*
+ * Load the space map disk into the specified range tree. Segments of maptype
+ * are added to the range tree, other segment types are removed.
+ *
* Note: space_map_load() will drop sm_lock across dmu_read() calls.
* The caller must be OK with this.
*/
int
-space_map_load(space_map_t *sm, space_map_ops_t *ops, uint8_t maptype,
- space_map_obj_t *smo, objset_t *os)
+space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype)
{
uint64_t *entry, *entry_map, *entry_map_end;
uint64_t bufsize, size, offset, end, space;
- uint64_t mapstart = sm->sm_start;
int error = 0;
ASSERT(MUTEX_HELD(sm->sm_lock));
- ASSERT(!sm->sm_loaded);
- ASSERT(!sm->sm_loading);
- sm->sm_loading = B_TRUE;
- end = smo->smo_objsize;
- space = smo->smo_alloc;
+ end = space_map_length(sm);
+ space = space_map_allocated(sm);
- ASSERT(sm->sm_ops == NULL);
- VERIFY0(sm->sm_space);
+ VERIFY0(range_tree_space(rt));
if (maptype == SM_FREE) {
- space_map_add(sm, sm->sm_start, sm->sm_size);
+ range_tree_add(rt, sm->sm_start, sm->sm_size);
space = sm->sm_size - space;
}
- bufsize = 1ULL << SPACE_MAP_BLOCKSHIFT;
+ bufsize = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE);
entry_map = zio_buf_alloc(bufsize);
mutex_exit(sm->sm_lock);
- if (end > bufsize)
- dmu_prefetch(os, smo->smo_object, bufsize, end - bufsize);
+ if (end > bufsize) {
+ dmu_prefetch(sm->sm_os, space_map_object(sm), 0, bufsize,
+ end - bufsize, ZIO_PRIORITY_SYNC_READ);
+ }
mutex_enter(sm->sm_lock);
for (offset = 0; offset < end; offset += bufsize) {
@@ -347,13 +90,14 @@
size = MIN(end - offset, bufsize);
VERIFY(P2PHASE(size, sizeof (uint64_t)) == 0);
VERIFY(size != 0);
+ ASSERT3U(sm->sm_blksz, !=, 0);
dprintf("object=%llu offset=%llx size=%llx\n",
- smo->smo_object, offset, size);
+ space_map_object(sm), offset, size);
mutex_exit(sm->sm_lock);
- error = dmu_read(os, smo->smo_object, offset, size, entry_map,
- DMU_READ_PREFETCH);
+ error = dmu_read(sm->sm_os, space_map_object(sm), offset, size,
+ entry_map, DMU_READ_PREFETCH);
mutex_enter(sm->sm_lock);
if (error != 0)
break;
@@ -361,115 +105,177 @@
entry_map_end = entry_map + (size / sizeof (uint64_t));
for (entry = entry_map; entry < entry_map_end; entry++) {
uint64_t e = *entry;
+ uint64_t offset, size;
if (SM_DEBUG_DECODE(e)) /* Skip debug entries */
continue;
- (SM_TYPE_DECODE(e) == maptype ?
- space_map_add : space_map_remove)(sm,
- (SM_OFFSET_DECODE(e) << sm->sm_shift) + mapstart,
- SM_RUN_DECODE(e) << sm->sm_shift);
+ offset = (SM_OFFSET_DECODE(e) << sm->sm_shift) +
+ sm->sm_start;
+ size = SM_RUN_DECODE(e) << sm->sm_shift;
+
+ VERIFY0(P2PHASE(offset, 1ULL << sm->sm_shift));
+ VERIFY0(P2PHASE(size, 1ULL << sm->sm_shift));
+ VERIFY3U(offset, >=, sm->sm_start);
+ VERIFY3U(offset + size, <=, sm->sm_start + sm->sm_size);
+ if (SM_TYPE_DECODE(e) == maptype) {
+ VERIFY3U(range_tree_space(rt) + size, <=,
+ sm->sm_size);
+ range_tree_add(rt, offset, size);
+ } else {
+ range_tree_remove(rt, offset, size);
+ }
}
}
- if (error == 0) {
- VERIFY3U(sm->sm_space, ==, space);
+ if (error == 0)
+ VERIFY3U(range_tree_space(rt), ==, space);
+ else
+ range_tree_vacate(rt, NULL, NULL);
- sm->sm_loaded = B_TRUE;
- sm->sm_ops = ops;
- if (ops != NULL)
- ops->smop_load(sm);
- } else {
- space_map_vacate(sm, NULL, NULL);
- }
-
zio_buf_free(entry_map, bufsize);
+ return (error);
+}
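
To make the entry decoding above concrete, here is a standalone sketch that rebuilds an absolute offset and size the same way the loop in space_map_load() does. The bit layout (1 debug bit, 47-bit offset, 1-bit type, 15-bit run) is assumed from the legacy space map format, the macros are simplified stand-ins rather than the real sys/space_map.h definitions, and the sample values are invented:

#include <stdint.h>
#include <stdio.h>

/*
 * Assumed legacy entry layout: bit 63 debug flag, bits 62..16 offset in
 * sm_shift units, bit 15 type (alloc/free), bits 14..0 run length.
 */
#define	SM_DEBUG_DECODE(x)	(((x) >> 63) & 0x1ULL)
#define	SM_OFFSET_DECODE(x)	(((x) >> 16) & 0x7fffffffffffULL)
#define	SM_TYPE_DECODE(x)	(((x) >> 15) & 0x1ULL)
#define	SM_RUN_DECODE(x)	((x) & 0x7fffULL)

int
main(void)
{
	uint64_t sm_start = 0;		/* hypothetical metaslab start */
	int sm_shift = 9;		/* 512-byte units */
	/* Invented entry: offset 0x1000 units, type 0 (alloc), run 64. */
	uint64_t e = (0x1000ULL << 16) | (0ULL << 15) | 64ULL;

	if (!SM_DEBUG_DECODE(e)) {
		uint64_t offset = (SM_OFFSET_DECODE(e) << sm_shift) + sm_start;
		uint64_t size = SM_RUN_DECODE(e) << sm_shift;

		printf("type=%llu offset=0x%llx size=0x%llx\n",
		    (unsigned long long)SM_TYPE_DECODE(e),
		    (unsigned long long)offset, (unsigned long long)size);
	}
	return (0);
}
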
- sm->sm_loading = B_FALSE;
+void
+space_map_histogram_clear(space_map_t *sm)
+{
+ if (sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
+ return;
- cv_broadcast(&sm->sm_load_cv);
+ bzero(sm->sm_phys->smp_histogram, sizeof (sm->sm_phys->smp_histogram));
+}
- return (error);
+boolean_t
+space_map_histogram_verify(space_map_t *sm, range_tree_t *rt)
+{
+ /*
+ * Verify that the in-core range tree does not have any
+ * ranges smaller than our sm_shift size.
+ */
+ for (int i = 0; i < sm->sm_shift; i++) {
+ if (rt->rt_histogram[i] != 0)
+ return (B_FALSE);
+ }
+ return (B_TRUE);
}
void
-space_map_unload(space_map_t *sm)
+space_map_histogram_add(space_map_t *sm, range_tree_t *rt, dmu_tx_t *tx)
{
- ASSERT(MUTEX_HELD(sm->sm_lock));
+ int idx = 0;
- if (sm->sm_loaded && sm->sm_ops != NULL)
- sm->sm_ops->smop_unload(sm);
+ ASSERT(MUTEX_HELD(rt->rt_lock));
+ ASSERT(dmu_tx_is_syncing(tx));
+ VERIFY3U(space_map_object(sm), !=, 0);
- sm->sm_loaded = B_FALSE;
- sm->sm_ops = NULL;
+ if (sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
+ return;
- space_map_vacate(sm, NULL, NULL);
-}
+ dmu_buf_will_dirty(sm->sm_dbuf, tx);
-uint64_t
-space_map_maxsize(space_map_t *sm)
-{
- ASSERT(sm->sm_ops != NULL);
- return (sm->sm_ops->smop_max(sm));
+ ASSERT(space_map_histogram_verify(sm, rt));
+
+ /*
+ * Transfer the content of the range tree histogram to the space
+ * map histogram. The space map histogram contains 32 buckets ranging
+ * from 2^sm_shift to 2^(32+sm_shift-1). The range tree,
+ * however, can represent ranges from 2^0 to 2^63. Since the space
+ * map only cares about allocatable blocks (minimum of 2^sm_shift) we
+ * can safely ignore all ranges in the range tree smaller than 2^sm_shift.
+ */
+ for (int i = sm->sm_shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
+
+ /*
+ * Since the largest histogram bucket in the space map is
+ * 2^(32+sm_shift-1), we need to normalize the values in
+ * the range tree for any bucket larger than that size. For
+ * example given an sm_shift of 9, ranges larger than 2^40
+ * would get normalized as if they were 1TB ranges. Assume
+ * the range tree had a count of 5 in the 2^44 (16TB) bucket,
+ * the calculation below would normalize this to 5 * 2^4 (80).
+ */
+ ASSERT3U(i, >=, idx + sm->sm_shift);
+ sm->sm_phys->smp_histogram[idx] +=
+ rt->rt_histogram[i] << (i - idx - sm->sm_shift);
+
+ /*
+ * Increment the space map's index as long as we haven't
+ * reached the maximum bucket size. Accumulate all ranges
+ * larger than the max bucket size into the last bucket.
+ */
+ if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) {
+ ASSERT3U(idx + sm->sm_shift, ==, i);
+ idx++;
+ ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE);
+ }
+ }
}
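
The normalization described in the comment above is just a shift by (i - idx - sm_shift). A standalone sketch of the worked example (sm_shift = 9, a count of 5 in the 2^44 range tree bucket) shows the count folding into the last space map bucket as 5 << 4 = 80:

#include <stdio.h>
#include <stdint.h>

#define	SPACE_MAP_HISTOGRAM_SIZE	32	/* 32 buckets, per the comment above */

int
main(void)
{
	int sm_shift = 9;			/* 512-byte allocation unit */
	int i = 44;				/* range tree bucket for 16TB ranges */
	uint64_t count = 5;			/* hypothetical bucket count */
	int idx = SPACE_MAP_HISTOGRAM_SIZE - 1;	/* last space map bucket: 31 */

	/* Same shift as space_map_histogram_add(): i - idx - sm_shift = 4. */
	uint64_t contribution = count << (i - idx - sm_shift);

	printf("rt bucket %d adds %llu to sm bucket %d\n",
	    i, (unsigned long long)contribution, idx);	/* prints 80 */
	return (0);
}
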
uint64_t
-space_map_alloc(space_map_t *sm, uint64_t size)
+space_map_entries(space_map_t *sm, range_tree_t *rt)
{
- uint64_t start;
+ avl_tree_t *t = &rt->rt_root;
+ range_seg_t *rs;
+ uint64_t size, entries;
- start = sm->sm_ops->smop_alloc(sm, size);
- if (start != -1ULL)
- space_map_remove(sm, start, size);
- return (start);
-}
+ /*
+ * All space_maps always have a debug entry so account for it here.
+ */
+ entries = 1;
-void
-space_map_claim(space_map_t *sm, uint64_t start, uint64_t size)
-{
- sm->sm_ops->smop_claim(sm, start, size);
- space_map_remove(sm, start, size);
+ /*
+ * Traverse the range tree and calculate the number of space map
+ * entries that would be required to write out the range tree.
+ */
+ for (rs = avl_first(t); rs != NULL; rs = AVL_NEXT(t, rs)) {
+ size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
+ entries += howmany(size, SM_RUN_MAX);
+ }
+ return (entries);
}
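
As a worked example of this calculation: assuming the legacy 15-bit run field (SM_RUN_MAX = 32767) and sm_shift = 9, a single 1 GiB segment spans 2097152 sectors and therefore needs howmany(2097152, 32767) = 65 entries, plus the mandatory debug entry. A standalone sketch:

#include <stdio.h>
#include <stdint.h>

/* Assumed legacy limit: the largest run a 15-bit field can encode. */
#define	SM_RUN_MAX	32767ULL
#define	howmany(x, y)	(((x) + ((y) - 1)) / (y))

int
main(void)
{
	int sm_shift = 9;
	uint64_t seg_bytes = 1ULL << 30;		/* one 1 GiB segment */
	uint64_t size = seg_bytes >> sm_shift;		/* 2097152 sectors */
	uint64_t entries = 1;				/* the debug entry */

	entries += howmany(size, SM_RUN_MAX);		/* 65 more entries */
	printf("%llu entries\n", (unsigned long long)entries);	/* prints 66 */
	return (0);
}
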
-void
-space_map_free(space_map_t *sm, uint64_t start, uint64_t size)
-{
- space_map_add(sm, start, size);
- sm->sm_ops->smop_free(sm, start, size);
-}
-
/*
- * Note: space_map_sync() will drop sm_lock across dmu_write() calls.
+ * Note: space_map_write() will drop sm_lock across dmu_write() calls.
*/
void
-space_map_sync(space_map_t *sm, uint8_t maptype,
- space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx)
+space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
+ dmu_tx_t *tx)
{
+ objset_t *os = sm->sm_os;
spa_t *spa = dmu_objset_spa(os);
- avl_tree_t *t = &sm->sm_root;
- space_seg_t *ss;
- uint64_t bufsize, start, size, run_len, total, sm_space, nodes;
+ avl_tree_t *t = &rt->rt_root;
+ range_seg_t *rs;
+ uint64_t size, total, rt_space, nodes;
uint64_t *entry, *entry_map, *entry_map_end;
+ uint64_t expected_entries, actual_entries = 1;
- ASSERT(MUTEX_HELD(sm->sm_lock));
+ ASSERT(MUTEX_HELD(rt->rt_lock));
+ ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
+ VERIFY3U(space_map_object(sm), !=, 0);
+ dmu_buf_will_dirty(sm->sm_dbuf, tx);
- if (sm->sm_space == 0)
+ /*
+ * This field is no longer necessary since the in-core space map
+ * now contains the object number but is maintained for backwards
+ * compatibility.
+ */
+ sm->sm_phys->smp_object = sm->sm_object;
+
+ if (range_tree_space(rt) == 0) {
+ VERIFY3U(sm->sm_object, ==, sm->sm_phys->smp_object);
return;
+ }
- dprintf("object %4llu, txg %llu, pass %d, %c, count %lu, space %llx\n",
- smo->smo_object, dmu_tx_get_txg(tx), spa_sync_pass(spa),
- maptype == SM_ALLOC ? 'A' : 'F', avl_numnodes(&sm->sm_root),
- sm->sm_space);
-
if (maptype == SM_ALLOC)
- smo->smo_alloc += sm->sm_space;
+ sm->sm_phys->smp_alloc += range_tree_space(rt);
else
- smo->smo_alloc -= sm->sm_space;
+ sm->sm_phys->smp_alloc -= range_tree_space(rt);
- bufsize = (8 + avl_numnodes(&sm->sm_root)) * sizeof (uint64_t);
- bufsize = MIN(bufsize, 1ULL << SPACE_MAP_BLOCKSHIFT);
- entry_map = zio_buf_alloc(bufsize);
- entry_map_end = entry_map + (bufsize / sizeof (uint64_t));
+ expected_entries = space_map_entries(sm, rt);
+
+ entry_map = zio_buf_alloc(sm->sm_blksz);
+ entry_map_end = entry_map + (sm->sm_blksz / sizeof (uint64_t));
entry = entry_map;
*entry++ = SM_DEBUG_ENCODE(1) |
@@ -478,24 +284,28 @@
SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx));
total = 0;
- nodes = avl_numnodes(&sm->sm_root);
- sm_space = sm->sm_space;
- for (ss = avl_first(t); ss != NULL; ss = AVL_NEXT(t, ss)) {
- size = ss->ss_end - ss->ss_start;
- start = (ss->ss_start - sm->sm_start) >> sm->sm_shift;
+ nodes = avl_numnodes(&rt->rt_root);
+ rt_space = range_tree_space(rt);
+ for (rs = avl_first(t); rs != NULL; rs = AVL_NEXT(t, rs)) {
+ uint64_t start;
- total += size;
- size >>= sm->sm_shift;
+ size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
+ start = (rs->rs_start - sm->sm_start) >> sm->sm_shift;
- while (size) {
+ total += size << sm->sm_shift;
+
+ while (size != 0) {
+ uint64_t run_len;
+
run_len = MIN(size, SM_RUN_MAX);
if (entry == entry_map_end) {
- mutex_exit(sm->sm_lock);
- dmu_write(os, smo->smo_object, smo->smo_objsize,
- bufsize, entry_map, tx);
- mutex_enter(sm->sm_lock);
- smo->smo_objsize += bufsize;
+ mutex_exit(rt->rt_lock);
+ dmu_write(os, space_map_object(sm),
+ sm->sm_phys->smp_objsize, sm->sm_blksz,
+ entry_map, tx);
+ mutex_enter(rt->rt_lock);
+ sm->sm_phys->smp_objsize += sm->sm_blksz;
entry = entry_map;
}
@@ -505,162 +315,234 @@
start += run_len;
size -= run_len;
+ actual_entries++;
}
}
if (entry != entry_map) {
size = (entry - entry_map) * sizeof (uint64_t);
- mutex_exit(sm->sm_lock);
- dmu_write(os, smo->smo_object, smo->smo_objsize,
+ mutex_exit(rt->rt_lock);
+ dmu_write(os, space_map_object(sm), sm->sm_phys->smp_objsize,
size, entry_map, tx);
- mutex_enter(sm->sm_lock);
- smo->smo_objsize += size;
+ mutex_enter(rt->rt_lock);
+ sm->sm_phys->smp_objsize += size;
}
+ ASSERT3U(expected_entries, ==, actual_entries);
/*
* Ensure that the space_map's accounting wasn't changed
* while we were in the middle of writing it out.
*/
- VERIFY3U(nodes, ==, avl_numnodes(&sm->sm_root));
- VERIFY3U(sm->sm_space, ==, sm_space);
- VERIFY3U(sm->sm_space, ==, total);
+ VERIFY3U(nodes, ==, avl_numnodes(&rt->rt_root));
+ VERIFY3U(range_tree_space(rt), ==, rt_space);
+ VERIFY3U(range_tree_space(rt), ==, total);
- zio_buf_free(entry_map, bufsize);
+ zio_buf_free(entry_map, sm->sm_blksz);
}
-void
-space_map_truncate(space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx)
+static int
+space_map_open_impl(space_map_t *sm)
{
- VERIFY(dmu_free_range(os, smo->smo_object, 0, -1ULL, tx) == 0);
+ int error;
+ u_longlong_t blocks;
- smo->smo_objsize = 0;
- smo->smo_alloc = 0;
+ error = dmu_bonus_hold(sm->sm_os, sm->sm_object, sm, &sm->sm_dbuf);
+ if (error)
+ return (error);
+
+ dmu_object_size_from_db(sm->sm_dbuf, &sm->sm_blksz, &blocks);
+ sm->sm_phys = sm->sm_dbuf->db_data;
+ return (0);
}
-/*
- * Space map reference trees.
- *
- * A space map is a collection of integers. Every integer is either
- * in the map, or it's not. A space map reference tree generalizes
- * the idea: it allows its members to have arbitrary reference counts,
- * as opposed to the implicit reference count of 0 or 1 in a space map.
- * This representation comes in handy when computing the union or
- * intersection of multiple space maps. For example, the union of
- * N space maps is the subset of the reference tree with refcnt >= 1.
- * The intersection of N space maps is the subset with refcnt >= N.
- *
- * [It's very much like a Fourier transform. Unions and intersections
- * are hard to perform in the 'space map domain', so we convert the maps
- * into the 'reference count domain', where it's trivial, then invert.]
- *
- * vdev_dtl_reassess() uses computations of this form to determine
- * DTL_MISSING and DTL_OUTAGE for interior vdevs -- e.g. a RAID-Z vdev
- * has an outage wherever refcnt >= vdev_nparity + 1, and a mirror vdev
- * has an outage wherever refcnt >= vdev_children.
- */
-static int
-space_map_ref_compare(const void *x1, const void *x2)
+int
+space_map_open(space_map_t **smp, objset_t *os, uint64_t object,
+ uint64_t start, uint64_t size, uint8_t shift, kmutex_t *lp)
{
- const space_ref_t *sr1 = x1;
- const space_ref_t *sr2 = x2;
+ space_map_t *sm;
+ int error;
- if (sr1->sr_offset < sr2->sr_offset)
- return (-1);
- if (sr1->sr_offset > sr2->sr_offset)
- return (1);
+ ASSERT(*smp == NULL);
+ ASSERT(os != NULL);
+ ASSERT(object != 0);
- if (sr1 < sr2)
- return (-1);
- if (sr1 > sr2)
- return (1);
+ sm = kmem_zalloc(sizeof (space_map_t), KM_SLEEP);
+ sm->sm_start = start;
+ sm->sm_size = size;
+ sm->sm_shift = shift;
+ sm->sm_lock = lp;
+ sm->sm_os = os;
+ sm->sm_object = object;
+
+ error = space_map_open_impl(sm);
+ if (error != 0) {
+ space_map_close(sm);
+ return (error);
+ }
+
+ *smp = sm;
+
return (0);
}
void
-space_map_ref_create(avl_tree_t *t)
+space_map_close(space_map_t *sm)
{
- avl_create(t, space_map_ref_compare,
- sizeof (space_ref_t), offsetof(space_ref_t, sr_node));
+ if (sm == NULL)
+ return;
+
+ if (sm->sm_dbuf != NULL)
+ dmu_buf_rele(sm->sm_dbuf, sm);
+ sm->sm_dbuf = NULL;
+ sm->sm_phys = NULL;
+
+ kmem_free(sm, sizeof (*sm));
}
void
-space_map_ref_destroy(avl_tree_t *t)
+space_map_truncate(space_map_t *sm, dmu_tx_t *tx)
{
- space_ref_t *sr;
- void *cookie = NULL;
+ objset_t *os = sm->sm_os;
+ spa_t *spa = dmu_objset_spa(os);
+ dmu_object_info_t doi;
- while ((sr = avl_destroy_nodes(t, &cookie)) != NULL)
- kmem_free(sr, sizeof (*sr));
+ ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
+ ASSERT(dmu_tx_is_syncing(tx));
- avl_destroy(t);
-}
+ dmu_object_info_from_db(sm->sm_dbuf, &doi);
-static void
-space_map_ref_add_node(avl_tree_t *t, uint64_t offset, int64_t refcnt)
-{
- space_ref_t *sr;
+ /*
+ * If the space map has the wrong bonus size (because
+ * SPA_FEATURE_SPACEMAP_HISTOGRAM has recently been enabled), or
+ * the wrong block size (because space_map_blksz has changed),
+ * free and re-allocate its object with the updated sizes.
+ *
+ * Otherwise, just truncate the current object.
+ */
+ if ((spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
+ doi.doi_bonus_size != sizeof (space_map_phys_t)) ||
+ doi.doi_data_block_size != space_map_blksz) {
+ zfs_dbgmsg("txg %llu, spa %s, reallocating: "
+ "old bonus %u, old blocksz %u", dmu_tx_get_txg(tx),
+ spa_name(spa), doi.doi_bonus_size, doi.doi_data_block_size);
- sr = kmem_alloc(sizeof (*sr), KM_SLEEP);
- sr->sr_offset = offset;
- sr->sr_refcnt = refcnt;
+ space_map_free(sm, tx);
+ dmu_buf_rele(sm->sm_dbuf, sm);
- avl_add(t, sr);
-}
+ sm->sm_object = space_map_alloc(sm->sm_os, tx);
+ VERIFY0(space_map_open_impl(sm));
+ } else {
+ VERIFY0(dmu_free_range(os, space_map_object(sm), 0, -1ULL, tx));
-void
-space_map_ref_add_seg(avl_tree_t *t, uint64_t start, uint64_t end,
- int64_t refcnt)
-{
- space_map_ref_add_node(t, start, refcnt);
- space_map_ref_add_node(t, end, -refcnt);
+ /*
+ * If the spacemap is reallocated, its histogram
+ * will be reset. Do the same in the common case so that
+ * bugs related to the uncommon case do not go unnoticed.
+ */
+ bzero(sm->sm_phys->smp_histogram,
+ sizeof (sm->sm_phys->smp_histogram));
+ }
+
+ dmu_buf_will_dirty(sm->sm_dbuf, tx);
+ sm->sm_phys->smp_objsize = 0;
+ sm->sm_phys->smp_alloc = 0;
}
/*
- * Convert (or add) a space map into a reference tree.
+ * Update the in-core space_map allocation and length values.
*/
void
-space_map_ref_add_map(avl_tree_t *t, space_map_t *sm, int64_t refcnt)
+space_map_update(space_map_t *sm)
{
- space_seg_t *ss;
+ if (sm == NULL)
+ return;
ASSERT(MUTEX_HELD(sm->sm_lock));
- for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
- space_map_ref_add_seg(t, ss->ss_start, ss->ss_end, refcnt);
+ sm->sm_alloc = sm->sm_phys->smp_alloc;
+ sm->sm_length = sm->sm_phys->smp_objsize;
}
-/*
- * Convert a reference tree into a space map. The space map will contain
- * all members of the reference tree for which refcnt >= minref.
- */
+uint64_t
+space_map_alloc(objset_t *os, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_objset_spa(os);
+ uint64_t object;
+ int bonuslen;
+
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
+ spa_feature_incr(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM, tx);
+ bonuslen = sizeof (space_map_phys_t);
+ ASSERT3U(bonuslen, <=, dmu_bonus_max());
+ } else {
+ bonuslen = SPACE_MAP_SIZE_V0;
+ }
+
+ object = dmu_object_alloc(os,
+ DMU_OT_SPACE_MAP, space_map_blksz,
+ DMU_OT_SPACE_MAP_HEADER, bonuslen, tx);
+
+ return (object);
+}
+
void
-space_map_ref_generate_map(avl_tree_t *t, space_map_t *sm, int64_t minref)
+space_map_free(space_map_t *sm, dmu_tx_t *tx)
{
- uint64_t start = -1ULL;
- int64_t refcnt = 0;
- space_ref_t *sr;
+ spa_t *spa;
- ASSERT(MUTEX_HELD(sm->sm_lock));
+ if (sm == NULL)
+ return;
- space_map_vacate(sm, NULL, NULL);
+ spa = dmu_objset_spa(sm->sm_os);
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
+ dmu_object_info_t doi;
- for (sr = avl_first(t); sr != NULL; sr = AVL_NEXT(t, sr)) {
- refcnt += sr->sr_refcnt;
- if (refcnt >= minref) {
- if (start == -1ULL) {
- start = sr->sr_offset;
- }
- } else {
- if (start != -1ULL) {
- uint64_t end = sr->sr_offset;
- ASSERT(start <= end);
- if (end > start)
- space_map_add(sm, start, end - start);
- start = -1ULL;
- }
+ dmu_object_info_from_db(sm->sm_dbuf, &doi);
+ if (doi.doi_bonus_size != SPACE_MAP_SIZE_V0) {
+ VERIFY(spa_feature_is_active(spa,
+ SPA_FEATURE_SPACEMAP_HISTOGRAM));
+ spa_feature_decr(spa,
+ SPA_FEATURE_SPACEMAP_HISTOGRAM, tx);
}
}
- ASSERT(refcnt == 0);
- ASSERT(start == -1ULL);
+
+ VERIFY3U(dmu_object_free(sm->sm_os, space_map_object(sm), tx), ==, 0);
+ sm->sm_object = 0;
}
+
+uint64_t
+space_map_object(space_map_t *sm)
+{
+ return (sm != NULL ? sm->sm_object : 0);
+}
+
+/*
+ * Returns the already synced, on-disk allocated space.
+ */
+uint64_t
+space_map_allocated(space_map_t *sm)
+{
+ return (sm != NULL ? sm->sm_alloc : 0);
+}
+
+/*
+ * Returns the already synced, on-disk length.
+ */
+uint64_t
+space_map_length(space_map_t *sm)
+{
+ return (sm != NULL ? sm->sm_length : 0);
+}
+
+/*
+ * Returns the allocated space that is currently syncing.
+ */
+int64_t
+space_map_alloc_delta(space_map_t *sm)
+{
+ if (sm == NULL)
+ return (0);
+ ASSERT(sm->sm_dbuf != NULL);
+ return (sm->sm_phys->smp_alloc - space_map_allocated(sm));
+}
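
One possible way for a consumer to combine the new accessors, shown only as a hypothetical helper that is not part of this change: the current allocation of an open space map is the already synced on-disk value plus the delta that is still being synced.

/* Hypothetical helper; sm is an open space_map_t, e.g. a metaslab's. */
static uint64_t
space_map_current_alloc(space_map_t *sm)
{
	return (space_map_allocated(sm) + space_map_alloc_delta(sm));
}
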
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/arc.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -20,7 +21,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
*/
@@ -37,38 +38,104 @@
#include <sys/dmu.h>
#include <sys/spa.h>
+/*
+ * Used by arc_flush() to inform arc_evict_state() that it should evict
+ * all available buffers from the arc state being passed in.
+ */
+#define ARC_EVICT_ALL -1ULL
+
+#define HDR_SET_LSIZE(hdr, x) do { \
+ ASSERT(IS_P2ALIGNED(x, 1U << SPA_MINBLOCKSHIFT)); \
+ (hdr)->b_lsize = ((x) >> SPA_MINBLOCKSHIFT); \
+_NOTE(CONSTCOND) } while (0)
+
+#define HDR_SET_PSIZE(hdr, x) do { \
+ ASSERT(IS_P2ALIGNED((x), 1U << SPA_MINBLOCKSHIFT)); \
+ (hdr)->b_psize = ((x) >> SPA_MINBLOCKSHIFT); \
+_NOTE(CONSTCOND) } while (0)
+
+#define HDR_GET_LSIZE(hdr) ((hdr)->b_lsize << SPA_MINBLOCKSHIFT)
+#define HDR_GET_PSIZE(hdr) ((hdr)->b_psize << SPA_MINBLOCKSHIFT)
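
The HDR_SET/GET macros above store sizes in SPA_MINBLOCKSHIFT (512-byte) units so the header fields stay small. A standalone sketch of the round trip, with the header reduced to a two-field stand-in for illustration:

#include <stdio.h>
#include <stdint.h>

#define	SPA_MINBLOCKSHIFT	9	/* 512-byte minimum block, as in ZFS */

/* Stripped-down stand-in for arc_buf_hdr_t, only for this illustration. */
typedef struct {
	uint16_t b_lsize;	/* logical size, in 512-byte units */
	uint16_t b_psize;	/* physical size, in 512-byte units */
} hdr_t;

int
main(void)
{
	hdr_t hdr;

	hdr.b_lsize = 131072 >> SPA_MINBLOCKSHIFT;	/* like HDR_SET_LSIZE(hdr, 128K) */
	hdr.b_psize = 8192 >> SPA_MINBLOCKSHIFT;	/* like HDR_SET_PSIZE(hdr, 8K) */
	printf("lsize=%u psize=%u\n",
	    hdr.b_lsize << SPA_MINBLOCKSHIFT,		/* 131072 */
	    hdr.b_psize << SPA_MINBLOCKSHIFT);		/* 8192 */
	return (0);
}
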
+
typedef struct arc_buf_hdr arc_buf_hdr_t;
typedef struct arc_buf arc_buf_t;
typedef void arc_done_func_t(zio_t *zio, arc_buf_t *buf, void *priv);
-typedef int arc_evict_func_t(void *priv);
/* generic arc_done_func_t's which you can use */
arc_done_func_t arc_bcopy_func;
arc_done_func_t arc_getbuf_func;
+extern int zfs_arc_num_sublists_per_state;
+
+typedef enum arc_flags
+{
+ /*
+ * Public flags that can be passed into the ARC by external consumers.
+ */
+ ARC_FLAG_WAIT = 1 << 0, /* perform sync I/O */
+ ARC_FLAG_NOWAIT = 1 << 1, /* perform async I/O */
+ ARC_FLAG_PREFETCH = 1 << 2, /* I/O is a prefetch */
+ ARC_FLAG_CACHED = 1 << 3, /* I/O was in cache */
+ ARC_FLAG_L2CACHE = 1 << 4, /* cache in L2ARC */
+ ARC_FLAG_PREDICTIVE_PREFETCH = 1 << 5, /* I/O from zfetch */
+
+ /*
+ * Private ARC flags. These flags are private ARC only flags that
+ * will show up in b_flags in the arc_buf_hdr_t. These flags should
+ * only be set by ARC code.
+ */
+ ARC_FLAG_IN_HASH_TABLE = 1 << 6, /* buffer is hashed */
+ ARC_FLAG_IO_IN_PROGRESS = 1 << 7, /* I/O in progress */
+ ARC_FLAG_IO_ERROR = 1 << 8, /* I/O failed for buf */
+ ARC_FLAG_INDIRECT = 1 << 9, /* indirect block */
+ /* Indicates that block was read with ASYNC priority. */
+ ARC_FLAG_PRIO_ASYNC_READ = 1 << 10,
+ ARC_FLAG_L2_WRITING = 1 << 11, /* write in progress */
+ ARC_FLAG_L2_EVICTED = 1 << 12, /* evicted during I/O */
+ ARC_FLAG_L2_WRITE_HEAD = 1 << 13, /* head of write list */
+ /* indicates that the buffer contains metadata (otherwise, data) */
+ ARC_FLAG_BUFC_METADATA = 1 << 14,
+
+ /* Flags specifying whether optional hdr struct fields are defined */
+ ARC_FLAG_HAS_L1HDR = 1 << 15,
+ ARC_FLAG_HAS_L2HDR = 1 << 16,
+
+ /*
+ * Indicates the arc_buf_hdr_t's b_pdata matches the on-disk data.
+ * This allows the l2arc to use the blkptr's checksum to verify
+ * the data without having to store the checksum in the hdr.
+ */
+ ARC_FLAG_COMPRESSED_ARC = 1 << 17,
+ ARC_FLAG_SHARED_DATA = 1 << 18,
+
+ /*
+ * The arc buffer's compression mode is stored in the top 7 bits of the
+ * flags field, so these dummy flags are included so that MDB can
+ * interpret the enum properly.
+ */
+ ARC_FLAG_COMPRESS_0 = 1 << 24,
+ ARC_FLAG_COMPRESS_1 = 1 << 25,
+ ARC_FLAG_COMPRESS_2 = 1 << 26,
+ ARC_FLAG_COMPRESS_3 = 1 << 27,
+ ARC_FLAG_COMPRESS_4 = 1 << 28,
+ ARC_FLAG_COMPRESS_5 = 1 << 29,
+ ARC_FLAG_COMPRESS_6 = 1 << 30
+
+} arc_flags_t;
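
A sketch of how the public flags above are passed to the reworked arc_read() (declared later in this header) for a synchronous read. It is patterned after typical in-tree callers; spa, os_rootbp and the bookmark values stand in for real caller state and are not taken from this change:

	/* Sketch only: spa and os_rootbp come from the surrounding caller. */
	arc_flags_t aflags = ARC_FLAG_WAIT;	/* synchronous: wait for the read */
	arc_buf_t *buf = NULL;
	zbookmark_phys_t zb;
	int err;

	SET_BOOKMARK(&zb, DMU_META_OBJSET, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL,
	    ZB_ROOT_BLKID);
	aflags |= ARC_FLAG_L2CACHE;		/* also eligible for the L2ARC */

	err = arc_read(NULL, spa, os_rootbp, arc_getbuf_func, &buf,
	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
	if (err == 0) {
		/* ... consume buf->b_data ... */
		arc_buf_destroy(buf, &buf);	/* release the hold taken for us */
	}
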
+
struct arc_buf {
arc_buf_hdr_t *b_hdr;
arc_buf_t *b_next;
kmutex_t b_evict_lock;
void *b_data;
- arc_evict_func_t *b_efunc;
- void *b_private;
};
typedef enum arc_buf_contents {
+ ARC_BUFC_INVALID, /* invalid type */
ARC_BUFC_DATA, /* buffer contains data */
ARC_BUFC_METADATA, /* buffer contains metadata */
ARC_BUFC_NUMTYPES
} arc_buf_contents_t;
-/*
- * These are the flags we pass into calls to the arc
- */
-#define ARC_WAIT (1 << 1) /* perform I/O synchronously */
-#define ARC_NOWAIT (1 << 2) /* perform I/O asynchronously */
-#define ARC_PREFETCH (1 << 3) /* I/O is a prefetch */
-#define ARC_CACHED (1 << 4) /* I/O was already in cache */
-#define ARC_L2CACHE (1 << 5) /* cache in L2ARC */
-#define ARC_L2COMPRESS (1 << 6) /* compress in L2ARC */
/*
* The following breakdowns of arc_size exist for kstat only.
@@ -75,6 +142,7 @@
*/
typedef enum arc_space_type {
ARC_SPACE_DATA,
+ ARC_SPACE_META,
ARC_SPACE_HDRS,
ARC_SPACE_L2HDRS,
ARC_SPACE_OTHER,
@@ -83,42 +151,37 @@
void arc_space_consume(uint64_t space, arc_space_type_t type);
void arc_space_return(uint64_t space, arc_space_type_t type);
-void *arc_data_buf_alloc(uint64_t space);
-void arc_data_buf_free(void *buf, uint64_t space);
-arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag,
+arc_buf_t *arc_alloc_buf(spa_t *spa, int32_t size, void *tag,
arc_buf_contents_t type);
arc_buf_t *arc_loan_buf(spa_t *spa, int size);
void arc_return_buf(arc_buf_t *buf, void *tag);
void arc_loan_inuse_buf(arc_buf_t *buf, void *tag);
-void arc_buf_add_ref(arc_buf_t *buf, void *tag);
-boolean_t arc_buf_remove_ref(arc_buf_t *buf, void *tag);
+void arc_buf_destroy(arc_buf_t *buf, void *tag);
int arc_buf_size(arc_buf_t *buf);
void arc_release(arc_buf_t *buf, void *tag);
int arc_released(arc_buf_t *buf);
-int arc_has_callback(arc_buf_t *buf);
void arc_buf_freeze(arc_buf_t *buf);
void arc_buf_thaw(arc_buf_t *buf);
-boolean_t arc_buf_eviction_needed(arc_buf_t *buf);
#ifdef ZFS_DEBUG
int arc_referenced(arc_buf_t *buf);
#endif
int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
- arc_done_func_t *done, void *priv, int priority, int flags,
- uint32_t *arc_flags, const zbookmark_t *zb);
+ arc_done_func_t *done, void *priv, zio_priority_t priority, int flags,
+ arc_flags_t *arc_flags, const zbookmark_phys_t *zb);
zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
- blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
- const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *done,
- void *priv, int priority, int zio_flags, const zbookmark_t *zb);
+ blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp,
+ arc_done_func_t *ready, arc_done_func_t *child_ready,
+ arc_done_func_t *physdone, arc_done_func_t *done,
+ void *priv, zio_priority_t priority, int zio_flags,
+ const zbookmark_phys_t *zb);
void arc_freed(spa_t *spa, const blkptr_t *bp);
-void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *priv);
-int arc_buf_evict(arc_buf_t *buf);
-
-void arc_flush(spa_t *spa);
+void arc_flush(spa_t *spa, boolean_t retry);
void arc_tempreserve_clear(uint64_t reserve);
int arc_tempreserve_space(uint64_t reserve, uint64_t txg);
+uint64_t arc_max_bytes(void);
void arc_init(void);
void arc_fini(void);
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bplist.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bpobj.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bpobj.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bpobj.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -20,7 +21,7 @@
*/
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
*/
#ifndef _SYS_BPOBJ_H
@@ -77,7 +78,6 @@
int bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx);
int bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *, dmu_tx_t *);
-int bpobj_iterate_dbg(bpobj_t *bpo, uint64_t *itorp, blkptr_t *bp);
void bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx);
void bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx);
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bptree.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bptree.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/bptree.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -19,7 +20,7 @@
* CDDL HEADER END
*/
/*
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
*/
#ifndef _SYS_BPTREE_H
@@ -43,7 +44,7 @@
typedef struct bptree_entry_phys {
blkptr_t be_bp;
uint64_t be_birth_txg; /* only delete blocks born after this txg */
- zbookmark_t be_zb; /* holds traversal resume point if needed */
+ zbookmark_phys_t be_zb; /* holds traversal resume point if needed */
} bptree_entry_phys_t;
typedef int bptree_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
@@ -50,6 +51,7 @@
uint64_t bptree_alloc(objset_t *os, dmu_tx_t *tx);
int bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx);
+boolean_t bptree_is_empty(objset_t *os, uint64_t obj);
void bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg,
uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx);
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -20,8 +21,9 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/
#ifndef _SYS_DBUF_H
@@ -35,6 +37,7 @@
#include <sys/zfs_context.h>
#include <sys/refcount.h>
#include <sys/zrlock.h>
+#include <sys/multilist.h>
#ifdef __cplusplus
extern "C" {
@@ -66,8 +69,13 @@
* | |
* | |
* +--------> NOFILL -------+
+ *
+ * DB_SEARCH is an invalid state for a dbuf. It is used by dbuf_free_range
+ * to find all dbufs in a range of a dnode and must be less than any other
+ * dbuf_states_t (see comment on dn_dbufs in dnode.h).
*/
typedef enum dbuf_states {
+ DB_SEARCH = -1,
DB_UNCACHED,
DB_FILL,
DB_NOFILL,
@@ -112,6 +120,12 @@
/* pointer to parent dirty record */
struct dbuf_dirty_record *dr_parent;
+ /* How much space was charged to dsl_pool_dirty_space() for this? */
+ unsigned int dr_accounted;
+
+ /* A copy of the bp that points to us */
+ blkptr_t dr_bp_copy;
+
union dirty_types {
struct dirty_indirect {
@@ -214,18 +228,37 @@
* Our link on the owner dnode's dn_dbufs list.
* Protected by its dn_dbufs_mtx.
*/
- list_node_t db_link;
+ avl_node_t db_link;
+ /*
+ * Link in dbuf_cache.
+ */
+ multilist_node_t db_cache_link;
+
/* Data which is unique to data (leaf) blocks: */
- /* stuff we store for the user (see dmu_buf_set_user) */
- void *db_user_ptr;
- void **db_user_data_ptr_ptr;
- dmu_buf_evict_func_t *db_evict_func;
+ /* User callback information. */
+ dmu_buf_user_t *db_user;
- uint8_t db_immediate_evict;
+ /*
+ * Evict user data as soon as the dirty and reference
+ * counts are equal.
+ */
+ uint8_t db_user_immediate_evict;
+
+ /*
+ * This block was freed while a read or write was
+ * active.
+ */
uint8_t db_freed_in_flight;
+ /*
+ * dnode_evict_dbufs() or dnode_evict_bonus() tried to
+ * evict this dbuf, but couldn't due to outstanding
+ * references. Evict once the refcount drops to 0.
+ */
+ uint8_t db_pending_evict;
+
uint8_t db_dirtycnt;
} dmu_buf_impl_t;
@@ -238,9 +271,8 @@
kmutex_t hash_mutexes[DBUF_MUTEXES];
} dbuf_hash_table_t;
+uint64_t dbuf_whichblock(struct dnode *di, int64_t level, uint64_t offset);
-uint64_t dbuf_whichblock(struct dnode *di, uint64_t offset);
-
dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data);
void dbuf_create_bonus(struct dnode *dn);
int dbuf_spill_set_blksz(dmu_buf_t *db, uint64_t blksz, dmu_tx_t *tx);
@@ -251,22 +283,25 @@
dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag);
dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid,
void *tag);
-int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid, int create,
+int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid,
+ boolean_t fail_sparse, boolean_t fail_uncached,
void *tag, dmu_buf_impl_t **dbp);
-void dbuf_prefetch(struct dnode *dn, uint64_t blkid);
+void dbuf_prefetch(struct dnode *dn, int64_t level, uint64_t blkid,
+ zio_priority_t prio, arc_flags_t aflags);
void dbuf_add_ref(dmu_buf_impl_t *db, void *tag);
+boolean_t dbuf_try_add_ref(dmu_buf_t *db, objset_t *os, uint64_t obj,
+ uint64_t blkid, void *tag);
uint64_t dbuf_refcount(dmu_buf_impl_t *db);
void dbuf_rele(dmu_buf_impl_t *db, void *tag);
void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag);
-dmu_buf_impl_t *dbuf_find(struct dnode *dn, uint8_t level, uint64_t blkid);
+dmu_buf_impl_t *dbuf_find(struct objset *os, uint64_t object, uint8_t level,
+ uint64_t blkid);
int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags);
-void dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
-void dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx);
void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx);
void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);
void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx);
@@ -273,13 +308,15 @@
void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx);
dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db);
+void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
+ bp_embedded_type_t etype, enum zio_compress comp,
+ int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx);
-void dbuf_clear(dmu_buf_impl_t *db);
-void dbuf_evict(dmu_buf_impl_t *db);
+void dbuf_destroy(dmu_buf_impl_t *db);
void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
void dbuf_unoverride(dbuf_dirty_record_t *dr);
-void dbuf_sync_list(list_t *list, dmu_tx_t *tx);
+void dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx);
void dbuf_release_bp(dmu_buf_impl_t *db);
void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end,
@@ -292,20 +329,6 @@
#define DB_DNODE_ENTER(_db) (zrl_add(&DB_DNODE_LOCK(_db)))
#define DB_DNODE_EXIT(_db) (zrl_remove(&DB_DNODE_LOCK(_db)))
#define DB_DNODE_HELD(_db) (!zrl_is_zero(&DB_DNODE_LOCK(_db)))
-#define DB_GET_SPA(_spa_p, _db) { \
- dnode_t *__dn; \
- DB_DNODE_ENTER(_db); \
- __dn = DB_DNODE(_db); \
- *(_spa_p) = __dn->dn_objset->os_spa; \
- DB_DNODE_EXIT(_db); \
-}
-#define DB_GET_OBJSET(_os_p, _db) { \
- dnode_t *__dn; \
- DB_DNODE_ENTER(_db); \
- __dn = DB_DNODE(_db); \
- *(_os_p) = __dn->dn_objset; \
- DB_DNODE_EXIT(_db); \
-}
void dbuf_init(void);
void dbuf_fini(void);
@@ -325,8 +348,11 @@
(dbuf_is_metadata(_db) && \
((_db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA)))
-#define DBUF_IS_L2COMPRESSIBLE(_db) \
- ((_db)->db_objset->os_compress != ZIO_COMPRESS_OFF)
+#define DNODE_LEVEL_IS_L2CACHEABLE(_dn, _level) \
+ ((_dn)->dn_objset->os_secondary_cache == ZFS_CACHE_ALL || \
+ (((_level) > 0 || \
+ DMU_OT_IS_METADATA((_dn)->dn_handle->dnh_dnode->dn_type)) && \
+ ((_dn)->dn_objset->os_secondary_cache == ZFS_CACHE_METADATA)))
#ifdef ZFS_DEBUG
@@ -354,7 +380,7 @@
#define dprintf_dbuf_bp(db, bp, fmt, ...) do { \
if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \
- sprintf_blkptr(__blkbuf, bp); \
+ snprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, bp); \
dprintf_dbuf(db, fmt " %s\n", __VA_ARGS__, __blkbuf); \
kmem_free(__blkbuf, BP_SPRINTF_LEN); \
} \
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/ddt.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/ddt.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/ddt.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -172,7 +173,7 @@
extern int ddt_object_walk(ddt_t *ddt, enum ddt_type type,
enum ddt_class cls, uint64_t *walk, ddt_entry_t *dde);
extern int ddt_object_count(ddt_t *ddt, enum ddt_type type,
- enum ddt_class class, uint64_t *count);
+ enum ddt_class cls, uint64_t *count);
extern int ddt_object_info(ddt_t *ddt, enum ddt_type type,
enum ddt_class cls, dmu_object_info_t *);
extern boolean_t ddt_object_exists(ddt_t *ddt, enum ddt_type type,
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -21,9 +22,14 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright 2013 DEY Storage Systems, Inc.
+ * Copyright 2014 HybridCluster. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
*/
/* Portions Copyright 2010 Robert Milkowski */
@@ -39,11 +45,10 @@
* dmu_spa.h.
*/
-#include <sys/types.h>
-#include <sys/param.h>
+#include <sys/zfs_context.h>
#include <sys/cred.h>
-#include <sys/time.h>
#include <sys/fs/zfs.h>
+#include <sys/zio_priority.h>
#ifdef __cplusplus
extern "C" {
@@ -63,7 +68,7 @@
struct dnode;
struct drr_begin;
struct drr_end;
-struct zbookmark;
+struct zbookmark_phys;
struct spa;
struct nvlist;
struct arc_buf;
@@ -74,6 +79,7 @@
typedef struct objset objset_t;
typedef struct dmu_tx dmu_tx_t;
typedef struct dsl_dir dsl_dir_t;
+typedef struct dnode dnode_t;
typedef enum dmu_object_byteswap {
DMU_BSWAP_UINT8,
@@ -118,6 +124,14 @@
((ot) & DMU_OT_METADATA) : \
dmu_ot[(ot)].ot_metadata)
+/*
+ * These object types use bp_fill != 1 for their L0 bp's. Therefore they can't
+ * have their data embedded (i.e. use a BP_IS_EMBEDDED() bp), because bp_fill
+ * is repurposed for embedded BPs.
+ */
+#define DMU_OT_HAS_FILL(ot) \
+ ((ot) == DMU_OT_DNODE || (ot) == DMU_OT_OBJSET)
+
#define DMU_OT_BYTESWAP(ot) (((ot) & DMU_OT_NEWTYPE) ? \
((ot) & DMU_OT_BYTESWAP_MASK) : \
dmu_ot[(ot)].ot_byteswap)
@@ -217,10 +231,14 @@
DMU_OTN_ZAP_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE),
} dmu_object_type_t;
-typedef enum txg_how {
- TXG_WAIT = 1,
- TXG_NOWAIT,
-} txg_how_t;
+/*
+ * These flags are intended to be used to specify the "txg_how"
+ * parameter when calling the dmu_tx_assign() function. See the comment
+ * above dmu_tx_assign() for more details on the meaning of these flags.
+ */
+#define TXG_NOWAIT (0ULL)
+#define TXG_WAIT (1ULL<<0)
+#define TXG_NOTHROTTLE (1ULL<<1)
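
For illustration, the usual transaction pattern built around these flags, with os, object, offset, size and buf as placeholders; TXG_WAIT makes dmu_tx_assign() block instead of failing when the pool is busy:

	/* Placeholders: os, object, offset, size, buf come from the caller. */
	dmu_tx_t *tx = dmu_tx_create(os);
	int err;

	dmu_tx_hold_write(tx, object, offset, size);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_tx_abort(tx);	/* quota/space errors can still occur */
		return (err);
	}
	dmu_write(os, object, offset, size, buf, tx);
	dmu_tx_commit(tx);
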
void byteswap_uint64_array(void *buf, size_t size);
void byteswap_uint32_array(void *buf, size_t size);
@@ -233,17 +251,17 @@
#define DS_FIND_SNAPSHOTS (1<<0)
#define DS_FIND_CHILDREN (1<<1)
+#define DS_FIND_SERIALIZE (1<<2)
/*
* The maximum number of bytes that can be accessed as part of one
* operation, including metadata.
*/
-#define DMU_MAX_ACCESS (10<<20) /* 10MB */
+#define DMU_MAX_ACCESS (32 * 1024 * 1024) /* 32MB */
#define DMU_MAX_DELETEBLKCNT (20480) /* ~5MB of indirect blocks */
#define DMU_USERUSED_OBJECT (-1ULL)
#define DMU_GROUPUSED_OBJECT (-2ULL)
-#define DMU_DEADLIST_OBJECT (-3ULL)
/*
* artificial blkids for bonus buffer and spill blocks
@@ -283,8 +301,6 @@
void *db_data; /* data in buffer */
} dmu_buf_t;
-typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr);
-
/*
* The names of zap entries in the DIRECTORY_OBJECT of the MOS.
*/
@@ -293,6 +309,7 @@
#define DMU_POOL_FEATURES_FOR_WRITE "features_for_write"
#define DMU_POOL_FEATURES_FOR_READ "features_for_read"
#define DMU_POOL_FEATURE_DESCRIPTIONS "feature_descriptions"
+#define DMU_POOL_FEATURE_ENABLED_TXG "feature_enabled_txg"
#define DMU_POOL_ROOT_DATASET "root_dataset"
#define DMU_POOL_SYNC_BPOBJ "sync_bplist"
#define DMU_POOL_ERRLOG_SCRUB "errlog_scrub"
@@ -310,6 +327,7 @@
#define DMU_POOL_FREE_BPOBJ "free_bpobj"
#define DMU_POOL_BPTREE_OBJ "bptree_obj"
#define DMU_POOL_EMPTY_BPOBJ "empty_bpobj"
+#define DMU_POOL_CHECKSUM_SALT "org.illumos:checksum_salt"
/*
* Allocate an object from this objset. The range of object numbers
@@ -331,7 +349,7 @@
int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
- int blocksize, dmu_object_type_t bonustype, int bonuslen);
+ int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *txp);
/*
* Free an object from this objset.
@@ -393,6 +411,11 @@
void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
dmu_tx_t *tx);
+void
+dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
+ void *data, uint8_t etype, uint8_t comp, int uncompressed_size,
+ int compressed_size, int byteorder, dmu_tx_t *tx);
+
/*
* Decide how to write a block: checksum, compression, number of copies, etc.
*/
@@ -400,7 +423,7 @@
#define WP_DMU_SYNC 0x2
#define WP_SPILL 0x4
-void dmu_write_policy(objset_t *os, struct dnode *dn, int level, int wp,
+void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp,
struct zio_prop *zp);
/*
* The bonus data is accessed more or less like a regular buffer.
@@ -426,7 +449,7 @@
*/
int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp);
-int dmu_spill_hold_by_dnode(struct dnode *dn, uint32_t flags,
+int dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags,
void *tag, dmu_buf_t **dbp);
int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp);
@@ -446,7 +469,25 @@
*/
int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
void *tag, dmu_buf_t **, int flags);
+int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
+ void *tag, dmu_buf_t **dbp, int flags);
+
+/*
+ * Add a reference to a dmu buffer that has already been held via
+ * dmu_buf_hold() in the current context.
+ */
void dmu_buf_add_ref(dmu_buf_t *db, void* tag);
+
+/*
+ * Attempt to add a reference to a dmu buffer that is in an unknown state,
+ * using a pointer that may have been invalidated by eviction processing.
+ * The request will succeed if the passed in dbuf still represents the
+ * same os/object/blkid, is ineligible for eviction, and has at least
+ * one hold by a user other than the syncer.
+ */
+boolean_t dmu_buf_try_add_ref(dmu_buf_t *, objset_t *os, uint64_t object,
+ uint64_t blkid, void *tag);
+
void dmu_buf_rele(dmu_buf_t *db, void *tag);
uint64_t dmu_buf_refcount(dmu_buf_t *db);
@@ -461,46 +502,134 @@
* individually with dmu_buf_rele.
*/
int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset,
- uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp);
+ uint64_t length, boolean_t read, void *tag,
+ int *numbufsp, dmu_buf_t ***dbpp);
void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag);
+typedef void dmu_buf_evict_func_t(void *user_ptr);
+
/*
- * Returns NULL on success, or the existing user ptr if it's already
- * been set.
+ * A DMU buffer user object may be associated with a dbuf for the
+ * duration of its lifetime. This allows the user of a dbuf (client)
+ * to attach private data to a dbuf (e.g. in-core only data such as a
+ * dnode_children_t, zap_t, or zap_leaf_t) and be optionally notified
+ * when that dbuf has been evicted. Clients typically respond to the
+ * eviction notification by freeing their private data, thus ensuring
+ * the same lifetime for both dbuf and private data.
*
- * user_ptr is for use by the user and can be obtained via dmu_buf_get_user().
+ * The mapping from a dmu_buf_user_t to any client private data is the
+ * client's responsibility. All current consumers of the API with private
+ * data embed a dmu_buf_user_t as the first member of the structure for
+ * their private data. This allows conversions between the two types
+ * with a simple cast. Since the DMU buf user API never needs access
+ * to the private data, other strategies can be employed if necessary
+ * or convenient for the client (e.g. using container_of() to do the
+ * conversion for private data that cannot have the dmu_buf_user_t as
+ * its first member).
*
- * user_data_ptr_ptr should be NULL, or a pointer to a pointer which
- * will be set to db->db_data when you are allowed to access it. Note
- * that db->db_data (the pointer) can change when you do dmu_buf_read(),
- * dmu_buf_tryupgrade(), dmu_buf_will_dirty(), or dmu_buf_will_fill().
- * *user_data_ptr_ptr will be set to the new value when it changes.
+ * Eviction callbacks are executed without the dbuf mutex held or any
+ * other type of mechanism to guarantee that the dbuf is still available.
+ * For this reason, users must assume the dbuf has already been freed
+ * and not reference the dbuf from the callback context.
*
- * If non-NULL, pageout func will be called when this buffer is being
- * excised from the cache, so that you can clean up the data structure
- * pointed to by user_ptr.
+ * Users requesting "immediate eviction" are notified as soon as the dbuf
+ * is only referenced by dirty records (dirties == holds). Otherwise the
+ * notification occurs after eviction processing for the dbuf begins.
+ */
+typedef struct dmu_buf_user {
+ /*
+ * Asynchronous user eviction callback state.
+ */
+ taskq_ent_t dbu_tqent;
+
+ /* This instance's eviction function pointer. */
+ dmu_buf_evict_func_t *dbu_evict_func;
+#ifdef ZFS_DEBUG
+ /*
+ * Pointer to user's dbuf pointer. NULL for clients that do
+ * not associate a dbuf with their user data.
+ *
+ * The dbuf pointer is cleared upon eviction so as to catch
+ * use-after-evict bugs in clients.
+ */
+ dmu_buf_t **dbu_clear_on_evict_dbufp;
+#endif
+} dmu_buf_user_t;
+
+/*
+ * Initialize the given dmu_buf_user_t instance with the eviction function
+ * evict_func, to be called when the user is evicted.
*
- * dmu_evict_user() will call the pageout func for all buffers in a
- * objset with a given pageout func.
+ * NOTE: This function should only be called once on a given dmu_buf_user_t.
+ * To allow enforcement of this, dbu must already be zeroed on entry.
*/
-void *dmu_buf_set_user(dmu_buf_t *db, void *user_ptr, void *user_data_ptr_ptr,
- dmu_buf_evict_func_t *pageout_func);
+#ifdef __lint
+/* Very ugly, but it beats issuing suppression directives in many Makefiles. */
+extern void
+dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func,
+ dmu_buf_t **clear_on_evict_dbufp);
+#else /* __lint */
+static inline void
+dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func,
+ dmu_buf_t **clear_on_evict_dbufp)
+{
+ ASSERT(dbu->dbu_evict_func == NULL);
+ ASSERT(evict_func != NULL);
+ dbu->dbu_evict_func = evict_func;
+#ifdef ZFS_DEBUG
+ dbu->dbu_clear_on_evict_dbufp = clear_on_evict_dbufp;
+#endif
+}
+#endif /* __lint */
+
/*
- * set_user_ie is the same as set_user, but request immediate eviction
- * when hold count goes to zero.
+ * Attach user data to a dbuf and mark it for normal (when the dbuf's
+ * data is cleared or its reference count goes to zero) eviction processing.
+ *
+ * Returns NULL on success, or the existing user if another user currently
+ * owns the buffer.
*/
-void *dmu_buf_set_user_ie(dmu_buf_t *db, void *user_ptr,
- void *user_data_ptr_ptr, dmu_buf_evict_func_t *pageout_func);
-void *dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr,
- void *user_ptr, void *user_data_ptr_ptr,
- dmu_buf_evict_func_t *pageout_func);
-void dmu_evict_user(objset_t *os, dmu_buf_evict_func_t *func);
+void *dmu_buf_set_user(dmu_buf_t *db, dmu_buf_user_t *user);
/*
- * Returns the user_ptr set with dmu_buf_set_user(), or NULL if not set.
+ * Attach user data to a dbuf and mark it for immediate (its dirty and
+ * reference counts are equal) eviction processing.
+ *
+ * Returns NULL on success, or the existing user if another user currently
+ * owns the buffer.
*/
+void *dmu_buf_set_user_ie(dmu_buf_t *db, dmu_buf_user_t *user);
+
+/*
+ * Replace the current user of a dbuf.
+ *
+ * If given the current user of a dbuf, replaces the dbuf's user with
+ * "new_user" and returns the user data pointer that was replaced.
+ * Otherwise returns the current, and unmodified, dbuf user pointer.
+ */
+void *dmu_buf_replace_user(dmu_buf_t *db,
+ dmu_buf_user_t *old_user, dmu_buf_user_t *new_user);
+
+/*
+ * Remove the specified user data for a DMU buffer.
+ *
+ * Returns the user that was removed on success, or the current user if
+ * another user currently owns the buffer.
+ */
+void *dmu_buf_remove_user(dmu_buf_t *db, dmu_buf_user_t *user);
+
+/*
+ * Returns the user data (dmu_buf_user_t *) associated with this dbuf.
+ */
void *dmu_buf_get_user(dmu_buf_t *db);
+objset_t *dmu_buf_get_objset(dmu_buf_t *db);
+dnode_t *dmu_buf_dnode_enter(dmu_buf_t *db);
+void dmu_buf_dnode_exit(dmu_buf_t *db);
+
+/* Block until any in-progress dmu buf user evictions complete. */
+void dmu_buf_user_evict_wait(void);
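
A sketch of the usage pattern described above: the client embeds a dmu_buf_user_t as the first member of its private data, registers an eviction callback with dmu_buf_init_user(), and attaches it to a held dbuf with dmu_buf_set_user(). The my_data_t type and its functions are hypothetical:

/* Hypothetical client private data; the dmu_buf_user_t must come first. */
typedef struct my_data {
	dmu_buf_user_t	md_dbu;
	uint64_t	md_object;
} my_data_t;

static void
my_data_evict(void *arg)
{
	my_data_t *md = arg;	/* valid cast: md_dbu is the first member */

	/* The dbuf may already be gone; only tear down private state. */
	kmem_free(md, sizeof (*md));
}

/* Attach private data to a dbuf that the caller already holds. */
static my_data_t *
my_data_attach(dmu_buf_t *db, uint64_t object)
{
	my_data_t *md = kmem_zalloc(sizeof (*md), KM_SLEEP);
	my_data_t *existing;

	md->md_object = object;
	dmu_buf_init_user(&md->md_dbu, my_data_evict, NULL);
	existing = dmu_buf_set_user(db, &md->md_dbu);
	if (existing != NULL) {
		/* Another thread attached first; discard ours, use theirs. */
		kmem_free(md, sizeof (*md));
		return (existing);
	}
	return (md);
}
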
+
/*
* Returns the blkptr associated with this dbuf, or NULL if not set.
*/
@@ -551,9 +680,10 @@
void dmu_tx_hold_sa(dmu_tx_t *tx, struct sa_handle *hdl, boolean_t may_grow);
void dmu_tx_hold_sa_create(dmu_tx_t *tx, int total_size);
void dmu_tx_abort(dmu_tx_t *tx);
-int dmu_tx_assign(dmu_tx_t *tx, enum txg_how txg_how);
+int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
void dmu_tx_wait(dmu_tx_t *tx);
void dmu_tx_commit(dmu_tx_t *tx);
+void dmu_tx_mark_netfree(dmu_tx_t *tx);
/*
* To register a commit callback, dmu_tx_callback_register() must be called.
@@ -583,7 +713,7 @@
uint64_t size, dmu_tx_t *tx);
int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset,
uint64_t size);
-int dmu_free_object(objset_t *os, uint64_t object);
+int dmu_free_long_object(objset_t *os, uint64_t object);
/*
* Convenience functions.
@@ -600,12 +730,20 @@
void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
dmu_tx_t *tx);
int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size);
+int dmu_read_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size);
int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size,
dmu_tx_t *tx);
int dmu_write_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size,
dmu_tx_t *tx);
+#ifdef _KERNEL
+#ifdef illumos
int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset,
uint64_t size, struct page *pp, dmu_tx_t *tx);
+#else
+int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset,
+ uint64_t size, struct vm_page **ppa, dmu_tx_t *tx);
+#endif
+#endif
struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size);
void dmu_return_arcbuf(struct arc_buf *buf);
void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf,
@@ -620,13 +758,14 @@
void xuio_stat_wbuf_copied();
void xuio_stat_wbuf_nocopy();
-extern int zfs_prefetch_disable;
+extern boolean_t zfs_prefetch_disable;
+extern int zfs_max_recordsize;
/*
* Asynchronously try to read in the data.
*/
-void dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset,
- uint64_t len);
+void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
+ uint64_t len, enum zio_priority pri);
typedef struct dmu_object_info {
/* All sizes are in bytes unless otherwise indicated. */
@@ -638,7 +777,8 @@
uint8_t doi_indirection; /* 2 = dnode->indirect->data */
uint8_t doi_checksum;
uint8_t doi_compress;
- uint8_t doi_pad[5];
+ uint8_t doi_nblkptr;
+ uint8_t doi_pad[4];
uint64_t doi_physical_blocks_512; /* data + metadata, 512b blks */
uint64_t doi_max_offset;
uint64_t doi_fill_count; /* number of non-empty blocks */
@@ -669,7 +809,7 @@
*/
int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi);
/* Like dmu_object_info, but faster if you have a held dnode in hand. */
-void dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi);
+void dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi);
/* Like dmu_object_info, but faster if you have a held dbuf in hand. */
void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi);
/*
@@ -686,7 +826,7 @@
dmu_objset_type_t dds_type;
uint8_t dds_is_snapshot;
uint8_t dds_inconsistent;
- char dds_origin[MAXNAMELEN];
+ char dds_origin[ZFS_MAX_DATASET_NAME_LEN];
} dmu_objset_stats_t;
/*
@@ -736,8 +876,8 @@
extern void dmu_objset_name(objset_t *os, char *buf);
extern dmu_objset_type_t dmu_objset_type(objset_t *os);
extern uint64_t dmu_objset_id(objset_t *os);
-extern uint64_t dmu_objset_syncprop(objset_t *os);
-extern uint64_t dmu_objset_logbias(objset_t *os);
+extern zfs_sync_type_t dmu_objset_syncprop(objset_t *os);
+extern zfs_logbias_op_t dmu_objset_logbias(objset_t *os);
extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
uint64_t *id, uint64_t *offp, boolean_t *case_conflict);
extern int dmu_snapshot_realname(objset_t *os, char *name, char *real,
@@ -790,6 +930,15 @@
uint64_t *off);
/*
+ * Check if a DMU object has any dirty blocks. If so, sync out
+ * all pending transaction groups. Otherwise, this function
+ * does not alter DMU state. This could be improved to only sync
+ * out the necessary transaction groups for this particular
+ * object.
+ */
+int dmu_object_wait_synced(objset_t *os, uint64_t object);
+
+/*
* Initial setup and final teardown.
*/
extern void dmu_init(void);
@@ -806,6 +955,8 @@
#define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */
extern uint64_t zfs_crc64_table[256];
+extern int zfs_mdcomp_disable;
+
#ifdef __cplusplus
}
#endif
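
The dmu_tx_assign() prototype above now takes a plain uint64_t for txg_how. For reference, the usual caller-side assign/wait/commit loop looks roughly like the sketch below; TXG_NOWAIT and the ERESTART retry convention are assumed to keep their traditional meanings and are not defined in this header.

static int
change_something(objset_t *os)
{
	dmu_tx_t *tx;
	int err;
top:
	tx = dmu_tx_create(os);
	/* ... dmu_tx_hold_*() calls describing what will be dirtied ... */
	err = dmu_tx_assign(tx, TXG_NOWAIT);	/* TXG_NOWAIT is assumed */
	if (err == ERESTART) {
		dmu_tx_wait(tx);	/* wait for dirty data to drain */
		dmu_tx_abort(tx);
		goto top;		/* rebuild the holds and retry */
	} else if (err != 0) {
		dmu_tx_abort(tx);
		return (err);
	}
	/* ... modify the held objects under this tx ... */
	dmu_tx_commit(tx);
	return (0);
}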
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_impl.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -21,8 +22,11 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ */
+/*
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
* Copyright (c) 2012, Martin Matuska <mm at FreeBSD.org>. All rights reserved.
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
*/
#ifndef _SYS_DMU_IMPL_H
@@ -293,8 +297,19 @@
uint64_t dsa_toguid;
int dsa_err;
dmu_pendop_t dsa_pending_op;
+ uint64_t dsa_featureflags;
+ uint64_t dsa_last_data_object;
+ uint64_t dsa_last_data_offset;
+ uint64_t dsa_resume_object;
+ uint64_t dsa_resume_offset;
+ boolean_t dsa_sent_begin;
+ boolean_t dsa_sent_end;
} dmu_sendarg_t;
+void dmu_object_zapify(objset_t *, uint64_t, dmu_object_type_t, dmu_tx_t *);
+void dmu_object_free_zapified(objset_t *, uint64_t, dmu_tx_t *);
+int dmu_buf_hold_noread(objset_t *, uint64_t, uint64_t,
+ void *, dmu_buf_t **);
#ifdef __cplusplus
}
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -20,8 +21,10 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
*/
/* Portions Copyright 2010 Robert Milkowski */
@@ -74,10 +77,10 @@
arc_buf_t *os_phys_buf;
objset_phys_t *os_phys;
/*
- * The following "special" dnodes have no parent and are exempt from
- * dnode_move(), but they root their descendents in this objset using
- * handles anyway, so that all access to dnodes from dbufs consistently
- * uses handles.
+ * The following "special" dnodes have no parent, are exempt
+ * from dnode_move(), and are not recorded in os_dnodes, but they
+ * root their descendents in this objset using handles anyway, so
+ * that all access to dnodes from dbufs consistently uses handles.
*/
dnode_handle_t os_meta_dnode;
dnode_handle_t os_userused_dnode;
@@ -84,20 +87,29 @@
dnode_handle_t os_groupused_dnode;
zilog_t *os_zil;
+ list_node_t os_evicting_node;
+
/* can change, under dsl_dir's locks: */
- uint8_t os_checksum;
- uint8_t os_compress;
+ enum zio_checksum os_checksum;
+ enum zio_compress os_compress;
uint8_t os_copies;
- uint8_t os_dedup_checksum;
- uint8_t os_dedup_verify;
- uint8_t os_logbias;
- uint8_t os_primary_cache;
- uint8_t os_secondary_cache;
- uint8_t os_sync;
+ enum zio_checksum os_dedup_checksum;
+ boolean_t os_dedup_verify;
+ zfs_logbias_op_t os_logbias;
+ zfs_cache_type_t os_primary_cache;
+ zfs_cache_type_t os_secondary_cache;
+ zfs_sync_type_t os_sync;
+ zfs_redundant_metadata_type_t os_redundant_metadata;
+ int os_recordsize;
+ /*
+ * Pointer is constant; the blkptr it points to is protected by
+ * os_dsl_dataset->ds_bp_rwlock
+ */
+ blkptr_t *os_rootbp;
+
/* no lock needed: */
struct dmu_tx *os_synctx; /* XXX sketchy */
- blkptr_t *os_rootbp;
zil_header_t os_zil_header;
list_t os_synced_dnodes;
uint64_t os_flags;
@@ -130,12 +142,16 @@
((os)->os_secondary_cache == ZFS_CACHE_ALL || \
(os)->os_secondary_cache == ZFS_CACHE_METADATA)
-#define DMU_OS_IS_L2COMPRESSIBLE(os) ((os)->os_compress != ZIO_COMPRESS_OFF)
+#define DMU_OS_IS_L2COMPRESSIBLE(os) (zfs_mdcomp_disable == B_FALSE)
/* called from zpl */
int dmu_objset_hold(const char *name, void *tag, objset_t **osp);
int dmu_objset_own(const char *name, dmu_objset_type_t type,
boolean_t readonly, void *tag, objset_t **osp);
+int dmu_objset_own_obj(struct dsl_pool *dp, uint64_t obj,
+ dmu_objset_type_t type, boolean_t readonly, void *tag, objset_t **osp);
+void dmu_objset_refresh_ownership(struct dsl_dataset *ds,
+ struct dsl_dataset **newds, void *tag);
void dmu_objset_rele(objset_t *os, void *tag);
void dmu_objset_disown(objset_t *os, void *tag);
int dmu_objset_from_ds(struct dsl_dataset *ds, objset_t **osp);
@@ -167,6 +183,8 @@
boolean_t dmu_objset_userspace_present(objset_t *os);
int dmu_fsname(const char *snapname, char *buf);
+void dmu_objset_evict_done(objset_t *os);
+
void dmu_objset_init(void);
void dmu_objset_fini(void);
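
For context, dmu_objset_hold()/dmu_objset_rele() bracket read-only access to an objset, and the hold must be dropped with the same tag. A minimal sketch using only declarations visible in this diff (FTAG, the conventional caller tag macro, is assumed):

static int
get_objset_name(const char *name, char *buf /* ZFS_MAX_DATASET_NAME_LEN */)
{
	objset_t *os;
	int err;

	err = dmu_objset_hold(name, FTAG, &os);	/* FTAG is assumed */
	if (err != 0)
		return (err);
	dmu_objset_name(os, buf);	/* copy the dataset name out */
	dmu_objset_rele(os, FTAG);	/* release with the same tag */
	return (0);
}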
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_send.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -21,9 +22,10 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
*/
#ifndef _DMU_SEND_H
@@ -35,8 +37,12 @@
struct dsl_dataset;
struct drr_begin;
struct avl_tree;
+struct dmu_replay_record;
-int dmu_send(const char *tosnap, const char *fromsnap, int outfd,
+extern const char *recv_clone_name;
+
+int dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
+ boolean_t large_block_ok, int outfd, uint64_t resumeobj, uint64_t resumeoff,
#ifdef illumos
struct vnode *vp, offset_t *off);
#else
@@ -44,7 +50,10 @@
#endif
int dmu_send_estimate(struct dsl_dataset *ds, struct dsl_dataset *fromds,
uint64_t *sizep);
+int dmu_send_estimate_from_txg(struct dsl_dataset *ds, uint64_t fromtxg,
+ uint64_t *sizep);
int dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
+ boolean_t embedok, boolean_t large_block_ok,
#ifdef illumos
int outfd, struct vnode *vp, offset_t *off);
#else
@@ -53,6 +62,7 @@
typedef struct dmu_recv_cookie {
struct dsl_dataset *drc_ds;
+ struct dmu_replay_record *drc_drr_begin;
struct drr_begin *drc_drrb;
const char *drc_tofs;
const char *drc_tosnap;
@@ -59,13 +69,17 @@
boolean_t drc_newfs;
boolean_t drc_byteswap;
boolean_t drc_force;
+ boolean_t drc_resumable;
struct avl_tree *drc_guid_to_ds_map;
zio_cksum_t drc_cksum;
uint64_t drc_newsnapobj;
+ void *drc_owner;
+ cred_t *drc_cred;
} dmu_recv_cookie_t;
-int dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb,
- boolean_t force, char *origin, dmu_recv_cookie_t *drc);
+int dmu_recv_begin(char *tofs, char *tosnap,
+ struct dmu_replay_record *drr_begin,
+ boolean_t force, boolean_t resumable, char *origin, dmu_recv_cookie_t *drc);
#ifdef illumos
int dmu_recv_stream(dmu_recv_cookie_t *drc, struct vnode *vp, offset_t *voffp,
#else
@@ -72,6 +86,7 @@
int dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp,
#endif
int cleanup_fd, uint64_t *action_handlep);
-int dmu_recv_end(dmu_recv_cookie_t *drc);
+int dmu_recv_end(dmu_recv_cookie_t *drc, void *owner);
+boolean_t dmu_objset_is_receiving(objset_t *os);
#endif /* _DMU_SEND_H */
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_traverse.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -20,7 +21,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
*/
#ifndef _SYS_DMU_TRAVERSE_H
@@ -40,7 +41,7 @@
struct arc_buf;
typedef int (blkptr_cb_t)(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
- const zbookmark_t *zb, const struct dnode_phys *dnp, void *arg);
+ const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg);
#define TRAVERSE_PRE (1<<0)
#define TRAVERSE_POST (1<<1)
@@ -54,8 +55,10 @@
int traverse_dataset(struct dsl_dataset *ds,
uint64_t txg_start, int flags, blkptr_cb_t func, void *arg);
+int traverse_dataset_resume(struct dsl_dataset *ds, uint64_t txg_start,
+ zbookmark_phys_t *resume, int flags, blkptr_cb_t func, void *arg);
int traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr,
- uint64_t txg_start, zbookmark_t *resume, int flags,
+ uint64_t txg_start, zbookmark_phys_t *resume, int flags,
blkptr_cb_t func, void *arg);
int traverse_pool(spa_t *spa,
uint64_t txg_start, int flags, blkptr_cb_t func, void *arg);
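
The callback type switched from zbookmark_t to zbookmark_phys_t. A callback conforming to the new blkptr_cb_t signature is a plain function; a sketch (BP_IS_HOLE() is assumed to be available from the blkptr headers):

/* Count every non-hole block pointer visited during a traversal. */
static int
count_blocks_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg)
{
	uint64_t *countp = arg;

	if (bp != NULL && !BP_IS_HOLE(bp))	/* BP_IS_HOLE is assumed */
		(*countp)++;
	return (0);	/* a non-zero return aborts the traversal */
}

/* e.g.: err = traverse_dataset(ds, 0, TRAVERSE_PRE, count_blocks_cb, &count); */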
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -23,7 +24,7 @@
* Use is subject to license terms.
*/
/*
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
*/
#ifndef _SYS_DMU_TX_H
@@ -59,8 +60,22 @@
txg_handle_t tx_txgh;
void *tx_tempreserve_cookie;
struct dmu_tx_hold *tx_needassign_txh;
- list_t tx_callbacks; /* list of dmu_tx_callback_t on this dmu_tx */
- uint8_t tx_anyobj;
+
+ /* list of dmu_tx_callback_t on this dmu_tx */
+ list_t tx_callbacks;
+
+ /* placeholder for syncing context, doesn't need specific holds */
+ boolean_t tx_anyobj;
+
+ /* time this transaction was created */
+ hrtime_t tx_start;
+
+ /* need to wait for sufficient dirty space */
+ boolean_t tx_wait_dirty;
+
+ /* has this transaction already been delayed? */
+ boolean_t tx_dirty_delayed;
+
int tx_err;
#ifdef ZFS_DEBUG
uint64_t tx_space_towrite;
@@ -87,12 +102,12 @@
dmu_tx_t *txh_tx;
list_node_t txh_node;
struct dnode *txh_dnode;
- uint64_t txh_space_towrite;
- uint64_t txh_space_tofree;
- uint64_t txh_space_tooverwrite;
- uint64_t txh_space_tounref;
- uint64_t txh_memory_tohold;
- uint64_t txh_fudge;
+ refcount_t txh_space_towrite;
+ refcount_t txh_space_tofree;
+ refcount_t txh_space_tooverwrite;
+ refcount_t txh_space_tounref;
+ refcount_t txh_memory_tohold;
+ refcount_t txh_fudge;
#ifdef ZFS_DEBUG
enum dmu_tx_hold_type txh_type;
uint64_t txh_arg1;
@@ -110,7 +125,7 @@
* These routines are defined in dmu.h, and are called by the user.
*/
dmu_tx_t *dmu_tx_create(objset_t *dd);
-int dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how);
+int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
void dmu_tx_commit(dmu_tx_t *tx);
void dmu_tx_abort(dmu_tx_t *tx);
uint64_t dmu_tx_get_txg(dmu_tx_t *tx);
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_zfetch.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_zfetch.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_zfetch.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -23,9 +24,13 @@
* Use is subject to license terms.
*/
-#ifndef _DFETCH_H
-#define _DFETCH_H
+/*
+ * Copyright (c) 2014 by Delphix. All rights reserved.
+ */
+#ifndef _DMU_ZFETCH_H
+#define _DMU_ZFETCH_H
+
#include <sys/zfs_context.h>
#ifdef __cplusplus
@@ -36,29 +41,25 @@
struct dnode; /* so we can reference dnode */
-typedef enum zfetch_dirn {
- ZFETCH_FORWARD = 1, /* prefetch increasing block numbers */
- ZFETCH_BACKWARD = -1 /* prefetch decreasing block numbers */
-} zfetch_dirn_t;
+typedef struct zstream {
+ uint64_t zs_blkid; /* expect next access at this blkid */
+ uint64_t zs_pf_blkid; /* next block to prefetch */
-typedef struct zstream {
- uint64_t zst_offset; /* offset of starting block in range */
- uint64_t zst_len; /* length of range, in blocks */
- zfetch_dirn_t zst_direction; /* direction of prefetch */
- uint64_t zst_stride; /* length of stride, in blocks */
- uint64_t zst_ph_offset; /* prefetch offset, in blocks */
- uint64_t zst_cap; /* prefetch limit (cap), in blocks */
- kmutex_t zst_lock; /* protects stream */
- clock_t zst_last; /* lbolt of last prefetch */
- avl_node_t zst_node; /* embed avl node here */
+ /*
+ * We will next prefetch the L1 indirect block of this level-0
+ * block id.
+ */
+ uint64_t zs_ipf_blkid;
+
+ kmutex_t zs_lock; /* protects stream */
+ hrtime_t zs_atime; /* time last prefetch issued */
+ list_node_t zs_node; /* link for zf_stream */
} zstream_t;
typedef struct zfetch {
krwlock_t zf_rwlock; /* protects zfetch structure */
- list_t zf_stream; /* AVL tree of zstream_t's */
+ list_t zf_stream; /* list of zstream_t's */
struct dnode *zf_dnode; /* dnode that owns this zfetch */
- uint32_t zf_stream_cnt; /* # of active streams */
- uint64_t zf_alloc_fail; /* # of failed attempts to alloc strm */
} zfetch_t;
void zfetch_init(void);
@@ -65,8 +66,8 @@
void zfetch_fini(void);
void dmu_zfetch_init(zfetch_t *, struct dnode *);
-void dmu_zfetch_rele(zfetch_t *);
-void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, int);
+void dmu_zfetch_fini(zfetch_t *);
+void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, boolean_t);
#ifdef __cplusplus
@@ -73,4 +74,4 @@
}
#endif
-#endif /* _DFETCH_H */
+#endif /* _DMU_ZFETCH_H */
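
The prefetch interface is now stream based: dmu_zfetch_init()/dmu_zfetch_fini() bracket the life of the embedded struct zfetch, and dmu_zfetch() is called per access. The argument meanings below (starting level-0 blkid, number of blocks, whether the data blocks themselves should be prefetched) are inferred from the callers and should be treated as an assumption:

static void
dnode_read_hint(dnode_t *dn, uint64_t blkid, uint64_t nblks)
{
	/* dn_zfetch is the struct zfetch embedded in the dnode (see dnode.h). */
	dmu_zfetch(&dn->dn_zfetch, blkid, nblks, B_TRUE /* fetch data, assumed */);
}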
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -20,7 +21,8 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/
#ifndef _SYS_DNODE_H
@@ -56,8 +58,8 @@
* Fixed constants.
*/
#define DNODE_SHIFT 9 /* 512 bytes */
-#define DN_MIN_INDBLKSHIFT 10 /* 1k */
-#define DN_MAX_INDBLKSHIFT 14 /* 16k */
+#define DN_MIN_INDBLKSHIFT 12 /* 4k */
+#define DN_MAX_INDBLKSHIFT 17 /* 128k */
#define DNODE_BLOCK_SHIFT 14 /* 16k */
#define DNODE_CORE_SIZE 64 /* 64 bytes for dnode sans blkptrs */
#define DN_MAX_OBJECT_SHIFT 48 /* 256 trillion (zfs_fid_t limit) */
@@ -87,6 +89,11 @@
#define DNODES_PER_BLOCK_SHIFT (DNODE_BLOCK_SHIFT - DNODE_SHIFT)
#define DNODES_PER_BLOCK (1ULL << DNODES_PER_BLOCK_SHIFT)
+
+/*
+ * This is inaccurate if the indblkshift of the particular object is not the
+ * max. But it's only used by userland to calculate the zvol reservation.
+ */
#define DNODES_PER_LEVEL_SHIFT (DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT)
#define DNODES_PER_LEVEL (1ULL << DNODES_PER_LEVEL_SHIFT)
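
The DN_MAX_INDBLKSHIFT bump from 14 (16k) to 17 (128k) widens the per-level fan-out accordingly. Assuming the usual 128-byte block pointer (SPA_BLKPTRSHIFT == 7, defined elsewhere), the arithmetic works out as:

enum {
	/* SPA_BLKPTRSHIFT == 7 (128-byte blkptr_t) is an assumption here */
	OLD_INDBLK_FANOUT = 1 << (14 - 7),	/* 128 blkptrs per 16k indirect */
	NEW_INDBLK_FANOUT = 1 << (17 - 7)	/* 1024 blkptrs per 128k indirect */
};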
@@ -143,7 +150,7 @@
blkptr_t dn_spill;
} dnode_phys_t;
-typedef struct dnode {
+struct dnode {
/*
* Protects the structure of the dnode, including the number of levels
* of indirection (dn_nlevels), dn_maxblkid, and dn_next_*
@@ -178,6 +185,7 @@
uint16_t dn_datablkszsec; /* in 512b sectors */
uint32_t dn_datablksz; /* in bytes */
uint64_t dn_maxblkid;
+ uint8_t dn_next_type[TXG_SIZE];
uint8_t dn_next_nblkptr[TXG_SIZE];
uint8_t dn_next_nlevels[TXG_SIZE];
uint8_t dn_next_indblkshift[TXG_SIZE];
@@ -188,6 +196,8 @@
/* protected by dn_dbufs_mtx; declared here to fill 32-bit hole */
uint32_t dn_dbufs_count; /* count of dn_dbufs */
+ /* There are no level-0 blocks of this blkid or higher in dn_dbufs */
+ uint64_t dn_unlisted_l0_blkid;
/* protected by os_lock: */
list_node_t dn_dirty_link[TXG_SIZE]; /* next on dataset's dirty */
@@ -195,7 +205,7 @@
/* protected by dn_mtx: */
kmutex_t dn_mtx;
list_t dn_dirty_records[TXG_SIZE];
- avl_tree_t dn_ranges[TXG_SIZE];
+ struct range_tree *dn_free_ranges[TXG_SIZE];
uint64_t dn_allocated_txg;
uint64_t dn_free_txg;
uint64_t dn_assigned_txg;
@@ -208,7 +218,18 @@
refcount_t dn_holds;
kmutex_t dn_dbufs_mtx;
- list_t dn_dbufs; /* descendent dbufs */
+ /*
+ * Descendent dbufs, ordered by dbuf_compare. Note that dn_dbufs
+ * can contain multiple dbufs of the same (level, blkid) when a
+ * dbuf is marked DB_EVICTING without being removed from
+ * dn_dbufs. To maintain the avl invariant that there cannot be
+ * duplicate entries, we order the dbufs by an arbitrary value -
+ * their address in memory. This means that dn_dbufs cannot be used to
+ * directly look up a dbuf. Instead, callers must use avl_walk, have
+ * a reference to the dbuf, or look up a non-existent node with
+ * a reference to the dbuf, or look up a non-existent node with
+ * db_state = DB_SEARCH (see dbuf_free_range for an example).
+ */
+ avl_tree_t dn_dbufs;
/* protected by dn_struct_rwlock */
struct dmu_buf_impl *dn_bonus; /* bonus buffer dbuf */
@@ -227,7 +248,7 @@
/* holds prefetch structure */
struct zfetch dn_zfetch;
-} dnode_t;
+};
/*
* Adds a level of indirection between the dbuf and the dnode to avoid
@@ -241,8 +262,9 @@
} dnode_handle_t;
typedef struct dnode_children {
+ dmu_buf_user_t dnc_dbu; /* User evict data */
size_t dnc_count; /* number of children */
- dnode_handle_t dnc_children[1]; /* sized dynamically */
+ dnode_handle_t dnc_children[]; /* sized dynamically */
} dnode_children_t;
typedef struct free_range {
@@ -251,7 +273,7 @@
uint64_t fr_nblks;
} free_range_t;
-dnode_t *dnode_special_open(struct objset *dd, dnode_phys_t *dnp,
+void dnode_special_open(struct objset *dd, dnode_phys_t *dnp,
uint64_t object, dnode_handle_t *dnh);
void dnode_special_close(dnode_handle_t *dnh);
@@ -265,6 +287,7 @@
void *ref, dnode_t **dnp);
boolean_t dnode_add_ref(dnode_t *dn, void *ref);
void dnode_rele(dnode_t *dn, void *ref);
+void dnode_rele_and_unlock(dnode_t *dn, void *tag);
void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx);
void dnode_sync(dnode_t *dn, dmu_tx_t *tx);
void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
@@ -277,8 +300,6 @@
void dnode_verify(dnode_t *dn);
int dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx);
void dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx);
-void dnode_clear_range(dnode_t *dn, uint64_t blkid,
- uint64_t nblks, dmu_tx_t *tx);
void dnode_diduse_space(dnode_t *dn, int64_t space);
void dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx);
void dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t);
@@ -288,7 +309,17 @@
int dnode_next_offset(dnode_t *dn, int flags, uint64_t *off,
int minlvl, uint64_t blkfill, uint64_t txg);
void dnode_evict_dbufs(dnode_t *dn);
+void dnode_evict_bonus(dnode_t *dn);
+#define DNODE_IS_CACHEABLE(_dn) \
+ ((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL || \
+ (DMU_OT_IS_METADATA((_dn)->dn_type) && \
+ (_dn)->dn_objset->os_primary_cache == ZFS_CACHE_METADATA))
+
+#define DNODE_META_IS_CACHEABLE(_dn) \
+ ((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL || \
+ (_dn)->dn_objset->os_primary_cache == ZFS_CACHE_METADATA)
+
#ifdef ZFS_DEBUG
/*
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -20,9 +21,11 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
- * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
* Copyright (c) 2013 Steven Hartland. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
*/
#ifndef _SYS_DSL_DATASET_H
@@ -37,6 +40,8 @@
#include <sys/zfs_context.h>
#include <sys/dsl_deadlist.h>
#include <sys/refcount.h>
+#include <sys/rrwlock.h>
+#include <zfeature_common.h>
#ifdef __cplusplus
extern "C" {
@@ -48,10 +53,10 @@
#define DS_FLAG_INCONSISTENT (1ULL<<0)
#define DS_IS_INCONSISTENT(ds) \
- ((ds)->ds_phys->ds_flags & DS_FLAG_INCONSISTENT)
+ (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_INCONSISTENT)
+
/*
- * Note: nopromote can not yet be set, but we want support for it in this
- * on-disk version, so that we don't need to upgrade for it later.
+ * Do not allow this dataset to be promoted.
*/
#define DS_FLAG_NOPROMOTE (1ULL<<1)
@@ -68,9 +73,33 @@
*/
#define DS_FLAG_DEFER_DESTROY (1ULL<<3)
#define DS_IS_DEFER_DESTROY(ds) \
- ((ds)->ds_phys->ds_flags & DS_FLAG_DEFER_DESTROY)
+ (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_DEFER_DESTROY)
/*
+ * DS_FIELD_* are strings that are used in the "extensified" dataset zap object.
+ * They should be of the format <reverse-dns>:<field>.
+ */
+
+/*
+ * This field's value is the object ID of a zap object which contains the
+ * bookmarks of this dataset. If it is present, then this dataset is counted
+ * in the refcount of the SPA_FEATURES_BOOKMARKS feature.
+ */
+#define DS_FIELD_BOOKMARK_NAMES "com.delphix:bookmarks"
+
+/*
+ * These fields are set on datasets that are in the middle of a resumable
+ * receive, and allow the sender to resume the send if it is interrupted.
+ */
+#define DS_FIELD_RESUME_FROMGUID "com.delphix:resume_fromguid"
+#define DS_FIELD_RESUME_TONAME "com.delphix:resume_toname"
+#define DS_FIELD_RESUME_TOGUID "com.delphix:resume_toguid"
+#define DS_FIELD_RESUME_OBJECT "com.delphix:resume_object"
+#define DS_FIELD_RESUME_OFFSET "com.delphix:resume_offset"
+#define DS_FIELD_RESUME_BYTES "com.delphix:resume_bytes"
+#define DS_FIELD_RESUME_EMBEDOK "com.delphix:resume_embedok"
+
+/*
* DS_FLAG_CI_DATASET is set if the dataset contains a file system whose
* name lookups should be performed case-insensitively.
*/
@@ -113,15 +142,19 @@
} dsl_dataset_phys_t;
typedef struct dsl_dataset {
+ dmu_buf_user_t ds_dbu;
+ rrwlock_t ds_bp_rwlock; /* Protects ds_phys->ds_bp */
+
/* Immutable: */
struct dsl_dir *ds_dir;
- dsl_dataset_phys_t *ds_phys;
dmu_buf_t *ds_dbuf;
uint64_t ds_object;
uint64_t ds_fsid_guid;
+ boolean_t ds_is_snapshot;
/* only used in syncing context, only valid for non-snapshots: */
struct dsl_dataset *ds_prev;
+ uint64_t ds_bookmarks; /* DMU_OTN_ZAP_METADATA */
/* has internal locking: */
dsl_deadlist_t ds_deadlist;
@@ -160,10 +193,39 @@
kmutex_t ds_sendstream_lock;
list_t ds_sendstreams;
+ /*
+ * When in the middle of a resumable receive, tracks how much
+ * progress we have made.
+ */
+ uint64_t ds_resume_object[TXG_SIZE];
+ uint64_t ds_resume_offset[TXG_SIZE];
+ uint64_t ds_resume_bytes[TXG_SIZE];
+
+ /* Protected by our dsl_dir's dd_lock */
+ list_t ds_prop_cbs;
+
+ /*
+ * For ZFEATURE_FLAG_PER_DATASET features, set if this dataset
+ * uses this feature.
+ */
+ uint8_t ds_feature_inuse[SPA_FEATURES];
+
+ /*
+ * Set if we need to activate the feature on this dataset this txg
+ * (used only in syncing context).
+ */
+ uint8_t ds_feature_activation_needed[SPA_FEATURES];
+
/* Protected by ds_lock; keep at end of struct for better locality */
- char ds_snapname[MAXNAMELEN];
+ char ds_snapname[ZFS_MAX_DATASET_NAME_LEN];
} dsl_dataset_t;
+inline dsl_dataset_phys_t *
+dsl_dataset_phys(dsl_dataset_t *ds)
+{
+ return (ds->ds_dbuf->db_data);
+}
+
/*
* The max length of a temporary tag prefix is the number of hex digits
* required to express UINT64_MAX plus one for the hyphen.
@@ -171,13 +233,15 @@
#define MAX_TAG_PREFIX_LEN 17
#define dsl_dataset_is_snapshot(ds) \
- ((ds)->ds_phys->ds_num_children != 0)
+ (dsl_dataset_phys(ds)->ds_num_children != 0)
#define DS_UNIQUE_IS_ACCURATE(ds) \
- (((ds)->ds_phys->ds_flags & DS_FLAG_UNIQUE_ACCURATE) != 0)
+ ((dsl_dataset_phys(ds)->ds_flags & DS_FLAG_UNIQUE_ACCURATE) != 0)
int dsl_dataset_hold(struct dsl_pool *dp, const char *name, void *tag,
dsl_dataset_t **dsp);
+boolean_t dsl_dataset_try_add_ref(struct dsl_pool *dp, dsl_dataset_t *ds,
+ void *tag);
int dsl_dataset_hold_obj(struct dsl_pool *dp, uint64_t dsobj, void *tag,
dsl_dataset_t **);
void dsl_dataset_rele(dsl_dataset_t *ds, void *tag);
@@ -188,6 +252,8 @@
void dsl_dataset_disown(dsl_dataset_t *ds, void *tag);
void dsl_dataset_name(dsl_dataset_t *ds, char *name);
boolean_t dsl_dataset_tryown(dsl_dataset_t *ds, void *tag);
+int dsl_dataset_namelen(dsl_dataset_t *ds);
+boolean_t dsl_dataset_has_owner(dsl_dataset_t *ds);
uint64_t dsl_dataset_create_sync(dsl_dir_t *pds, const char *lastname,
dsl_dataset_t *origin, uint64_t flags, cred_t *, dmu_tx_t *);
uint64_t dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
@@ -202,13 +268,14 @@
minor_t cleanup_minor, const char *htag);
blkptr_t *dsl_dataset_get_blkptr(dsl_dataset_t *ds);
-void dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx);
spa_t *dsl_dataset_get_spa(dsl_dataset_t *ds);
-boolean_t dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds);
+boolean_t dsl_dataset_modified_since_snap(dsl_dataset_t *ds,
+ dsl_dataset_t *snap);
void dsl_dataset_sync(dsl_dataset_t *os, zio_t *zio, dmu_tx_t *tx);
+void dsl_dataset_sync_done(dsl_dataset_t *os, dmu_tx_t *tx);
void dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp,
dmu_tx_t *tx);
@@ -241,17 +308,18 @@
int dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source,
uint64_t reservation);
-boolean_t dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier);
+boolean_t dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier,
+ uint64_t earlier_txg);
void dsl_dataset_long_hold(dsl_dataset_t *ds, void *tag);
void dsl_dataset_long_rele(dsl_dataset_t *ds, void *tag);
boolean_t dsl_dataset_long_held(dsl_dataset_t *ds);
int dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone,
- dsl_dataset_t *origin_head, boolean_t force);
+ dsl_dataset_t *origin_head, boolean_t force, void *owner, dmu_tx_t *tx);
void dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone,
dsl_dataset_t *origin_head, dmu_tx_t *tx);
int dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname,
- dmu_tx_t *tx);
+ dmu_tx_t *tx, boolean_t recv, uint64_t cnt, cred_t *cr);
void dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname,
dmu_tx_t *tx);
@@ -261,18 +329,25 @@
int dsl_dataset_get_snapname(dsl_dataset_t *ds);
int dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name,
uint64_t *value);
-int dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx);
+int dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx,
+ boolean_t adj_cnt);
void dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds,
zprop_source_t source, uint64_t value, dmu_tx_t *tx);
-int dsl_dataset_rollback(const char *fsname);
+void dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx);
+boolean_t dsl_dataset_is_zapified(dsl_dataset_t *ds);
+boolean_t dsl_dataset_has_resume_receive_state(dsl_dataset_t *ds);
+int dsl_dataset_rollback(const char *fsname, void *owner, nvlist_t *result);
+void dsl_dataset_deactivate_feature(uint64_t dsobj,
+ spa_feature_t f, dmu_tx_t *tx);
+
#ifdef ZFS_DEBUG
#define dprintf_ds(ds, fmt, ...) do { \
if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
- char *__ds_name = kmem_alloc(MAXNAMELEN, KM_SLEEP); \
+ char *__ds_name = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); \
dsl_dataset_name(ds, __ds_name); \
dprintf("ds=%s " fmt, __ds_name, __VA_ARGS__); \
- kmem_free(__ds_name, MAXNAMELEN); \
+ kmem_free(__ds_name, ZFS_MAX_DATASET_NAME_LEN); \
} \
_NOTE(CONSTCOND) } while (0)
#else
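
With ds_phys gone, every field access goes through the new dsl_dataset_phys() accessor, as the reworked DS_IS_INCONSISTENT and DS_UNIQUE_IS_ACCURATE macros above already show. The caller-side pattern, sketched with ds_creation_txg as one example field of dsl_dataset_phys_t:

/* Before: ds->ds_phys->ds_creation_txg */
static uint64_t
example_creation_txg(dsl_dataset_t *ds)
{
	return (dsl_dataset_phys(ds)->ds_creation_txg);
}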
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deadlist.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deadlist.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deadlist.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_deleg.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -20,7 +21,7 @@
*/
/*
* Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
*/
#ifndef _SYS_DSL_DELEG_H
@@ -56,6 +57,7 @@
#define ZFS_DELEG_PERM_HOLD "hold"
#define ZFS_DELEG_PERM_RELEASE "release"
#define ZFS_DELEG_PERM_DIFF "diff"
+#define ZFS_DELEG_PERM_BOOKMARK "bookmark"
/*
* Note: the names of properties that are marked delegatable are also
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_destroy.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_destroy.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_destroy.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -20,7 +21,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
*/
@@ -35,15 +36,16 @@
struct dsl_dataset;
struct dmu_tx;
-int dsl_destroy_snapshots_nvl(struct nvlist *snaps, boolean_t defer,
- struct nvlist *errlist);
-int dsl_destroy_snapshot(const char *name, boolean_t defer);
-int dsl_destroy_head(const char *name);
-int dsl_destroy_head_check_impl(struct dsl_dataset *ds, int expected_holds);
-void dsl_destroy_head_sync_impl(struct dsl_dataset *ds, struct dmu_tx *tx);
-int dsl_destroy_inconsistent(const char *dsname, void *arg);
-void dsl_destroy_snapshot_sync_impl(struct dsl_dataset *ds,
- boolean_t defer, struct dmu_tx *tx);
+int dsl_destroy_snapshots_nvl(struct nvlist *, boolean_t,
+ struct nvlist *);
+int dsl_destroy_snapshot(const char *, boolean_t);
+int dsl_destroy_head(const char *);
+int dsl_destroy_head_check_impl(struct dsl_dataset *, int);
+void dsl_destroy_head_sync_impl(struct dsl_dataset *, struct dmu_tx *);
+int dsl_destroy_inconsistent(const char *, void *);
+int dsl_destroy_snapshot_check_impl(struct dsl_dataset *, boolean_t);
+void dsl_destroy_snapshot_sync_impl(struct dsl_dataset *,
+ boolean_t, struct dmu_tx *);
#ifdef __cplusplus
}
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -20,7 +21,9 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/
#ifndef _SYS_DSL_DIR_H
@@ -38,6 +41,14 @@
struct dsl_dataset;
+/*
+ * DD_FIELD_* are strings that are used in the "extensified" dsl_dir zap object.
+ * They should be of the format <reverse-dns>:<field>.
+ */
+
+#define DD_FIELD_FILESYSTEM_COUNT "com.joyent:filesystem_count"
+#define DD_FIELD_SNAPSHOT_COUNT "com.joyent:snapshot_count"
+
typedef enum dd_used {
DD_USED_HEAD,
DD_USED_SNAP,
@@ -75,12 +86,15 @@
} dsl_dir_phys_t;
struct dsl_dir {
+ dmu_buf_user_t dd_dbu;
+
/* These are immutable; no lock needed: */
uint64_t dd_object;
- dsl_dir_phys_t *dd_phys;
- dmu_buf_t *dd_dbuf;
dsl_pool_t *dd_pool;
+ /* Stable until user eviction; no lock needed: */
+ dmu_buf_t *dd_dbuf;
+
/* protected by lock on pool's dp_dirty_dirs list */
txg_node_t dd_dirty_link;
@@ -89,7 +103,7 @@
/* Protected by dd_lock */
kmutex_t dd_lock;
- list_t dd_prop_cbs; /* list of dsl_prop_cb_record_t's */
+ list_t dd_props; /* list of dsl_prop_record_t's */
timestruc_t dd_snap_cmtime; /* last time snapshot namespace changed */
uint64_t dd_origin_txg;
@@ -99,10 +113,17 @@
int64_t dd_space_towrite[TXG_SIZE];
/* protected by dd_lock; keep at end of struct for better locality */
- char dd_myname[MAXNAMELEN];
+ char dd_myname[ZFS_MAX_DATASET_NAME_LEN];
};
+inline dsl_dir_phys_t *
+dsl_dir_phys(dsl_dir_t *dd)
+{
+ return (dd->dd_dbuf->db_data);
+}
+
void dsl_dir_rele(dsl_dir_t *dd, void *tag);
+void dsl_dir_async_rele(dsl_dir_t *dd, void *tag);
int dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag,
dsl_dir_t **, const char **tail);
int dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
@@ -129,8 +150,13 @@
uint64_t quota);
int dsl_dir_set_reservation(const char *ddname, zprop_source_t source,
uint64_t reservation);
+int dsl_dir_activate_fs_ss_limit(const char *);
+int dsl_fs_ss_limit_check(dsl_dir_t *, uint64_t, zfs_prop_t, dsl_dir_t *,
+ cred_t *);
+void dsl_fs_ss_count_adjust(dsl_dir_t *, int64_t, const char *, dmu_tx_t *);
int dsl_dir_rename(const char *oldname, const char *newname);
-int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space);
+int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd,
+ uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space, cred_t *);
boolean_t dsl_dir_is_clone(dsl_dir_t *dd);
void dsl_dir_new_refreservation(dsl_dir_t *dd, struct dsl_dataset *ds,
uint64_t reservation, cred_t *cr, dmu_tx_t *tx);
@@ -138,6 +164,8 @@
timestruc_t dsl_dir_snap_cmtime(dsl_dir_t *dd);
void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value,
dmu_tx_t *tx);
+void dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx);
+boolean_t dsl_dir_is_zapified(dsl_dir_t *dd);
/* internal reserved dir name */
#define MOS_DIR_NAME "$MOS"
@@ -144,15 +172,15 @@
#define ORIGIN_DIR_NAME "$ORIGIN"
#define XLATION_DIR_NAME "$XLATION"
#define FREE_DIR_NAME "$FREE"
+#define LEAK_DIR_NAME "$LEAK"
#ifdef ZFS_DEBUG
#define dprintf_dd(dd, fmt, ...) do { \
if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
- char *__ds_name = kmem_alloc(MAXNAMELEN + strlen(MOS_DIR_NAME) + 1, \
- KM_SLEEP); \
+ char *__ds_name = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); \
dsl_dir_name(dd, __ds_name); \
dprintf("dd=%s " fmt, __ds_name, __VA_ARGS__); \
- kmem_free(__ds_name, MAXNAMELEN + strlen(MOS_DIR_NAME) + 1); \
+ kmem_free(__ds_name, ZFS_MAX_DATASET_NAME_LEN); \
} \
_NOTE(CONSTCOND) } while (0)
#else
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_pool.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -20,7 +21,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
*/
#ifndef _SYS_DSL_POOL_H
@@ -49,6 +50,13 @@
struct dmu_tx;
struct dsl_scan;
+extern uint64_t zfs_dirty_data_max;
+extern uint64_t zfs_dirty_data_max_max;
+extern uint64_t zfs_dirty_data_sync;
+extern int zfs_dirty_data_max_percent;
+extern int zfs_delay_min_dirty_percent;
+extern uint64_t zfs_delay_scale;
+
/* These macros are for indexing into the zfs_all_blkstats_t. */
#define DMU_OT_DEFERRED DMU_OT_NONE
#define DMU_OT_OTHER DMU_OT_NUMTYPES /* place holder for DMU_OT() types */
@@ -77,6 +85,7 @@
struct dsl_dir *dp_root_dir;
struct dsl_dir *dp_mos_dir;
struct dsl_dir *dp_free_dir;
+ struct dsl_dir *dp_leak_dir;
struct dsl_dataset *dp_origin_snap;
uint64_t dp_root_dir_obj;
struct taskq *dp_vnrele_taskq;
@@ -83,9 +92,6 @@
/* No lock needed - sync context only */
blkptr_t dp_meta_rootbp;
- hrtime_t dp_read_overhead;
- uint64_t dp_throughput; /* bytes per millisec */
- uint64_t dp_write_limit;
uint64_t dp_tmp_userrefs_obj;
bpobj_t dp_free_bpobj;
uint64_t dp_bptree_obj;
@@ -95,12 +101,19 @@
/* Uses dp_lock */
kmutex_t dp_lock;
- uint64_t dp_space_towrite[TXG_SIZE];
- uint64_t dp_tempreserved[TXG_SIZE];
+ kcondvar_t dp_spaceavail_cv;
+ uint64_t dp_dirty_pertxg[TXG_SIZE];
+ uint64_t dp_dirty_total;
uint64_t dp_mos_used_delta;
uint64_t dp_mos_compressed_delta;
uint64_t dp_mos_uncompressed_delta;
+ /*
+ * Time of most recently scheduled (furthest in the future)
+ * wakeup for delayed transactions.
+ */
+ hrtime_t dp_last_wakeup;
+
/* Has its own locking */
tx_state_t dp_tx;
txg_list_t dp_dirty_datasets;
@@ -129,10 +142,8 @@
int dsl_pool_sync_context(dsl_pool_t *dp);
uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree);
uint64_t dsl_pool_adjustedfree(dsl_pool_t *dp, boolean_t netfree);
-int dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx);
-void dsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx);
-void dsl_pool_memory_pressure(dsl_pool_t *dp);
-void dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx);
+void dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx);
+void dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg);
void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp);
void dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg,
const blkptr_t *bpp);
@@ -142,8 +153,11 @@
void dsl_pool_mos_diduse_space(dsl_pool_t *dp,
int64_t used, int64_t comp, int64_t uncomp);
void dsl_pool_config_enter(dsl_pool_t *dp, void *tag);
+void dsl_pool_config_enter_prio(dsl_pool_t *dp, void *tag);
void dsl_pool_config_exit(dsl_pool_t *dp, void *tag);
boolean_t dsl_pool_config_held(dsl_pool_t *dp);
+boolean_t dsl_pool_config_held_writer(dsl_pool_t *dp);
+boolean_t dsl_pool_need_dirty_delay(dsl_pool_t *dp);
taskq_t *dsl_pool_vnrele_taskq(dsl_pool_t *dp);
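
The old per-txg tempreserve accounting is replaced by global dirty-data tracking (dp_dirty_total measured against zfs_dirty_data_max). A rough sketch of the delay decision implied by the new fields; the real dsl_pool_need_dirty_delay() lives in dsl_pool.c, and the exact threshold and locking details here are assumptions:

static boolean_t
need_dirty_delay_sketch(dsl_pool_t *dp)
{
	uint64_t delay_min_bytes =
	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
	boolean_t rv;

	mutex_enter(&dp->dp_lock);	/* dp_dirty_total is under dp_lock */
	rv = (dp->dp_dirty_total > delay_min_bytes);
	mutex_exit(&dp->dp_lock);
	return (rv);
}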
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_prop.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -41,10 +42,17 @@
/* The callback func may not call into the DMU or DSL! */
typedef void (dsl_prop_changed_cb_t)(void *arg, uint64_t newval);
+typedef struct dsl_prop_record {
+ list_node_t pr_node; /* link on dd_props */
+ const char *pr_propname;
+ list_t pr_cbs;
+} dsl_prop_record_t;
+
typedef struct dsl_prop_cb_record {
- list_node_t cbr_node; /* link on dd_prop_cbs */
+ list_node_t cbr_pr_node; /* link on pr_cbs */
+ list_node_t cbr_ds_node; /* link on ds_prop_cbs */
+ dsl_prop_record_t *cbr_pr;
struct dsl_dataset *cbr_ds;
- const char *cbr_propname;
dsl_prop_changed_cb_t *cbr_func;
void *cbr_arg;
} dsl_prop_cb_record_t;
@@ -54,10 +62,11 @@
zprop_source_t pa_source;
} dsl_props_arg_t;
+void dsl_prop_init(dsl_dir_t *dd);
+void dsl_prop_fini(dsl_dir_t *dd);
int dsl_prop_register(struct dsl_dataset *ds, const char *propname,
dsl_prop_changed_cb_t *callback, void *cbarg);
-int dsl_prop_unregister(struct dsl_dataset *ds, const char *propname,
- dsl_prop_changed_cb_t *callback, void *cbarg);
+void dsl_prop_unregister_all(struct dsl_dataset *ds, void *cbarg);
void dsl_prop_notify_all(struct dsl_dir *dd);
boolean_t dsl_prop_hascb(struct dsl_dataset *ds);
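
Per-property unregistration is gone; callbacks are still registered one property at a time but are torn down in bulk with dsl_prop_unregister_all(), keyed by the callback argument. A minimal sketch (the "checksum" property name and the cached-value scheme are illustrative only):

static void
checksum_changed_cb(void *arg, uint64_t newval)
{
	/* Callbacks may not call back into the DMU or DSL (see above). */
	uint64_t *cachedval = arg;

	*cachedval = newval;
}

static uint64_t my_checksum;

static int
watch_checksum(struct dsl_dataset *ds)
{
	return (dsl_prop_register(ds, "checksum",
	    checksum_changed_cb, &my_checksum));
}

static void
unwatch_all(struct dsl_dataset *ds)
{
	/* Drops every callback registered with &my_checksum as cbarg. */
	dsl_prop_unregister_all(ds, &my_checksum);
}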
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_scan.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -20,7 +21,7 @@
*/
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
*/
#ifndef _SYS_DSL_SCAN_H
@@ -62,7 +63,7 @@
uint64_t scn_errors; /* scan I/O error count */
uint64_t scn_ddt_class_max;
ddt_bookmark_t scn_ddt_bookmark;
- zbookmark_t scn_bookmark;
+ zbookmark_phys_t scn_bookmark;
uint64_t scn_flags; /* dsl_scan_flags_t */
} dsl_scan_phys_t;
@@ -72,11 +73,42 @@
DSF_VISIT_DS_AGAIN = 1<<0,
} dsl_scan_flags_t;
+/*
+ * Every pool will have one dsl_scan_t and this structure will contain
+ * in-memory information about the scan and a pointer to the on-disk
+ * representation (i.e. dsl_scan_phys_t). Most of the state of the scan
+ * is contained on-disk to allow the scan to resume in the event of a reboot
+ * or panic. This structure maintains information about the behavior of a
+ * running scan, some caching information, and how it should traverse the pool.
+ *
+ * The following members of this structure direct the behavior of the scan:
+ *
+ * scn_pausing - a scan that cannot be completed in a single txg or
+ * has exceeded its allotted time will need to pause.
+ * When this flag is set the scanner will stop traversing
+ * the pool and write out the current state to disk.
+ *
+ * scn_restart_txg - directs the scanner to either restart or start
+ * a scan at the specified txg value.
+ *
+ * scn_done_txg - when a scan completes its traversal it will set
+ * the completion txg to the next txg. This is necessary
+ * to ensure that any blocks that were freed during
+ * the scan but have not yet been processed (i.e. deferred
+ * frees) are accounted for.
+ *
+ * This structure also maintains information about deferred frees which are
+ * a special kind of traversal. Deferred free can exist in either a bptree or
+ * a bpobj structure. The scn_is_bptree flag will indicate the type of
+ * deferred free that is in progress. If the deferred free is part of an
+ * asynchronous destroy then the scn_async_destroying flag will be set.
+ */
typedef struct dsl_scan {
struct dsl_pool *scn_dp;
boolean_t scn_pausing;
uint64_t scn_restart_txg;
+ uint64_t scn_done_txg;
uint64_t scn_sync_start_time;
zio_t *scn_zio_root;
@@ -83,6 +115,7 @@
/* for freeing blocks */
boolean_t scn_is_bptree;
boolean_t scn_async_destroying;
+ boolean_t scn_async_stalled;
/* for debugging / information */
uint64_t scn_visited_this_txg;
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_synctask.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -20,7 +21,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
*/
#ifndef _SYS_DSL_SYNCTASK_H
@@ -38,11 +39,41 @@
typedef int (dsl_checkfunc_t)(void *, dmu_tx_t *);
typedef void (dsl_syncfunc_t)(void *, dmu_tx_t *);
+typedef enum zfs_space_check {
+ /*
+ * Normal space check: if there is less than 3.2% free space,
+ * the operation will fail. Operations which are logically
+ * creating things should use this (e.g. "zfs create", "zfs snapshot").
+ * User writes (via the ZPL / ZVOL) also fail at this point.
+ */
+ ZFS_SPACE_CHECK_NORMAL,
+
+ /*
+ * Space check allows use of half the slop space. If there
+ * is less than 1.6% free space, the operation will fail. Most
+ * operations should use this (e.g. "zfs set", "zfs rename"),
+ * because we want them to succeed even after user writes are failing,
+ * so that they can be used as part of the space recovery process.
+ */
+ ZFS_SPACE_CHECK_RESERVED,
+
+ /*
+ * No space check is performed. Only operations which we expect to
+ * result in a net reduction in space should use this
+ * (e.g. "zfs destroy". Setting quotas & reservations also uses
+ * this because it needs to circumvent the quota/reservation checks).
+ *
+ * See also the comments above spa_slop_shift.
+ */
+ ZFS_SPACE_CHECK_NONE,
+} zfs_space_check_t;
+
typedef struct dsl_sync_task {
txg_node_t dst_node;
struct dsl_pool *dst_pool;
uint64_t dst_txg;
int dst_space;
+ zfs_space_check_t dst_space_check;
dsl_checkfunc_t *dst_checkfunc;
dsl_syncfunc_t *dst_syncfunc;
void *dst_arg;
@@ -50,11 +81,11 @@
boolean_t dst_nowaiter;
} dsl_sync_task_t;
-void dsl_sync_task_sync(dsl_sync_task_t *dst, dmu_tx_t *tx);
-int dsl_sync_task(const char *pool, dsl_checkfunc_t *checkfunc,
- dsl_syncfunc_t *syncfunc, void *arg, int blocks_modified);
-void dsl_sync_task_nowait(struct dsl_pool *dp, dsl_syncfunc_t *syncfunc,
- void *arg, int blocks_modified, dmu_tx_t *tx);
+void dsl_sync_task_sync(dsl_sync_task_t *, dmu_tx_t *);
+int dsl_sync_task(const char *, dsl_checkfunc_t *,
+ dsl_syncfunc_t *, void *, int, zfs_space_check_t);
+void dsl_sync_task_nowait(struct dsl_pool *, dsl_syncfunc_t *,
+ void *, int, zfs_space_check_t, dmu_tx_t *);
#ifdef __cplusplus
}
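
The extra zfs_space_check_t argument threads the policy above into every sync task. A caller-side sketch of the new dsl_sync_task() signature (the check/sync pair and the block count are illustrative):

static int
myop_check(void *arg, dmu_tx_t *tx)
{
	return (0);	/* validate; runs again in syncing context */
}

static void
myop_sync(void *arg, dmu_tx_t *tx)
{
	/* mutate on-disk state here */
}

static int
myop(const char *pool)
{
	/* A logically-creating operation uses ZFS_SPACE_CHECK_NORMAL. */
	return (dsl_sync_task(pool, myop_check, myop_sync, NULL,
	    3 /* approx. blocks modified */, ZFS_SPACE_CHECK_NORMAL));
}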
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_userhold.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_userhold.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_userhold.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -20,7 +21,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
*/
#ifndef _SYS_METASLAB_H
@@ -36,47 +37,68 @@
extern "C" {
#endif
-extern space_map_ops_t *zfs_metaslab_ops;
+typedef struct metaslab_ops {
+ uint64_t (*msop_alloc)(metaslab_t *msp, uint64_t size);
+} metaslab_ops_t;
-extern metaslab_t *metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo,
- uint64_t start, uint64_t size, uint64_t txg);
-extern void metaslab_fini(metaslab_t *msp);
-extern void metaslab_sync(metaslab_t *msp, uint64_t txg);
-extern void metaslab_sync_done(metaslab_t *msp, uint64_t txg);
-extern void metaslab_sync_reassess(metaslab_group_t *mg);
+extern metaslab_ops_t *zfs_metaslab_ops;
-#define METASLAB_HINTBP_FAVOR 0x0
-#define METASLAB_HINTBP_AVOID 0x1
-#define METASLAB_GANG_HEADER 0x2
-#define METASLAB_GANG_CHILD 0x4
-#define METASLAB_GANG_AVOID 0x8
+int metaslab_init(metaslab_group_t *, uint64_t, uint64_t, uint64_t,
+ metaslab_t **);
+void metaslab_fini(metaslab_t *);
-extern int metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
- blkptr_t *bp, int ncopies, uint64_t txg, blkptr_t *hintbp, int flags);
-extern void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg,
- boolean_t now);
-extern int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg);
-extern void metaslab_check_free(spa_t *spa, const blkptr_t *bp);
+void metaslab_load_wait(metaslab_t *);
+int metaslab_load(metaslab_t *);
+void metaslab_unload(metaslab_t *);
-extern metaslab_class_t *metaslab_class_create(spa_t *spa,
- space_map_ops_t *ops);
-extern void metaslab_class_destroy(metaslab_class_t *mc);
-extern int metaslab_class_validate(metaslab_class_t *mc);
+void metaslab_sync(metaslab_t *, uint64_t);
+void metaslab_sync_done(metaslab_t *, uint64_t);
+void metaslab_sync_reassess(metaslab_group_t *);
+uint64_t metaslab_block_maxsize(metaslab_t *);
-extern void metaslab_class_space_update(metaslab_class_t *mc,
- int64_t alloc_delta, int64_t defer_delta,
- int64_t space_delta, int64_t dspace_delta);
-extern uint64_t metaslab_class_get_alloc(metaslab_class_t *mc);
-extern uint64_t metaslab_class_get_space(metaslab_class_t *mc);
-extern uint64_t metaslab_class_get_dspace(metaslab_class_t *mc);
-extern uint64_t metaslab_class_get_deferred(metaslab_class_t *mc);
+#define METASLAB_HINTBP_FAVOR 0x0
+#define METASLAB_HINTBP_AVOID 0x1
+#define METASLAB_GANG_HEADER 0x2
+#define METASLAB_GANG_CHILD 0x4
+#define METASLAB_ASYNC_ALLOC 0x8
+#define METASLAB_DONT_THROTTLE 0x10
-extern metaslab_group_t *metaslab_group_create(metaslab_class_t *mc,
- vdev_t *vd);
-extern void metaslab_group_destroy(metaslab_group_t *mg);
-extern void metaslab_group_activate(metaslab_group_t *mg);
-extern void metaslab_group_passivate(metaslab_group_t *mg);
+int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t,
+ blkptr_t *, int, uint64_t, blkptr_t *, int, zio_t *);
+void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t);
+int metaslab_claim(spa_t *, const blkptr_t *, uint64_t);
+void metaslab_check_free(spa_t *, const blkptr_t *);
+metaslab_class_t *metaslab_class_create(spa_t *, metaslab_ops_t *);
+void metaslab_class_destroy(metaslab_class_t *);
+int metaslab_class_validate(metaslab_class_t *);
+void metaslab_class_histogram_verify(metaslab_class_t *);
+uint64_t metaslab_class_fragmentation(metaslab_class_t *);
+uint64_t metaslab_class_expandable_space(metaslab_class_t *);
+boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int,
+ zio_t *, int);
+void metaslab_class_throttle_unreserve(metaslab_class_t *, int, zio_t *);
+
+void metaslab_class_space_update(metaslab_class_t *, int64_t, int64_t,
+ int64_t, int64_t);
+uint64_t metaslab_class_get_alloc(metaslab_class_t *);
+uint64_t metaslab_class_get_space(metaslab_class_t *);
+uint64_t metaslab_class_get_dspace(metaslab_class_t *);
+uint64_t metaslab_class_get_deferred(metaslab_class_t *);
+uint64_t metaslab_class_get_minblocksize(metaslab_class_t *mc);
+
+metaslab_group_t *metaslab_group_create(metaslab_class_t *, vdev_t *);
+void metaslab_group_destroy(metaslab_group_t *);
+void metaslab_group_activate(metaslab_group_t *);
+void metaslab_group_passivate(metaslab_group_t *);
+boolean_t metaslab_group_initialized(metaslab_group_t *);
+uint64_t metaslab_group_get_space(metaslab_group_t *);
+void metaslab_group_histogram_verify(metaslab_group_t *);
+uint64_t metaslab_group_fragmentation(metaslab_group_t *);
+void metaslab_group_histogram_remove(metaslab_group_t *, metaslab_t *);
+void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int);
+void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, void *);
+
#ifdef __cplusplus
}
#endif
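
The old space_map_ops_t vtable shrinks to metaslab_ops_t, a single allocation hook selected through zfs_metaslab_ops. Plugging in an alternative low-level allocator would look roughly like this sketch (the -1ULL failure convention and the pool-wide selection are assumptions):

/* A pluggable low-level allocator: return an offset, or -1ULL on failure. */
static uint64_t
my_first_fit_alloc(metaslab_t *msp, uint64_t size)
{
	/* ... walk the metaslab's free ranges for a segment of `size' ... */
	return (-1ULL);		/* sketch: always fail */
}

static metaslab_ops_t my_metaslab_ops = {
	my_first_fit_alloc
};

/* e.g. zfs_metaslab_ops = &my_metaslab_ops; would select it (assumed). */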
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -24,7 +25,7 @@
*/
/*
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
*/
#ifndef _SYS_METASLAB_IMPL_H
@@ -32,6 +33,7 @@
#include <sys/metaslab.h>
#include <sys/space_map.h>
+#include <sys/range_tree.h>
#include <sys/vdev.h>
#include <sys/txg.h>
#include <sys/avl.h>
@@ -40,66 +42,216 @@
extern "C" {
#endif
+/*
+ * A metaslab class encompasses a category of allocatable top-level vdevs.
+ * Each top-level vdev is associated with a metaslab group which defines
+ * the allocatable region for that vdev. Examples of these categories include
+ * "normal" for data block allocations (i.e. main pool allocations) or "log"
+ * for allocations designated for intent log devices (i.e. slog devices).
+ * When a block allocation is requested from the SPA it is associated with a
+ * metaslab_class_t, and only top-level vdevs (i.e. metaslab groups) belonging
+ * to the class can be used to satisfy that request. Allocations are done
+ * by traversing the metaslab groups that are linked off of the mc_rotor field.
+ * This rotor points to the next metaslab group where allocations will be
+ * attempted. Allocating a block is a 3 step process -- select the metaslab
+ * group, select the metaslab, and then allocate the block. The metaslab
+ * class defines the low-level block allocator that will be used as the
+ * final step in allocation. These allocators are pluggable allowing each class
+ * to use a block allocator that best suits that class.
+ */
struct metaslab_class {
+ kmutex_t mc_lock;
spa_t *mc_spa;
metaslab_group_t *mc_rotor;
- space_map_ops_t *mc_ops;
+ metaslab_ops_t *mc_ops;
uint64_t mc_aliquot;
+
+ /*
+ * Track the number of metaslab groups that have been initialized
+ * and can accept allocations. An initialized metaslab group is
+ * one that has been completely added to the config (i.e. we have
+ * updated the MOS config and the space has been added to the pool).
+ */
+ uint64_t mc_groups;
+
+ /*
+ * Toggle to enable/disable the allocation throttle.
+ */
+ boolean_t mc_alloc_throttle_enabled;
+
+ /*
+ * The allocation throttle works on a reservation system. Whenever
+ * an asynchronous zio wants to perform an allocation it must
+ * first reserve the number of blocks that it wants to allocate.
+ * If there aren't sufficient slots available for the pending zio
+ * then that I/O is throttled until more slots free up. The current
+ * number of reserved allocations is maintained by the mc_alloc_slots
+ * refcount. The mc_alloc_max_slots value determines the maximum
+ * number of allocations that the system allows. Gang blocks are
+ * allowed to reserve slots even if we've reached the maximum
+ * number of allocations allowed.
+ */
+ uint64_t mc_alloc_max_slots;
+ refcount_t mc_alloc_slots;
+
+ uint64_t mc_alloc_groups; /* # of allocatable groups */
+
uint64_t mc_alloc; /* total allocated space */
uint64_t mc_deferred; /* total deferred frees */
uint64_t mc_space; /* total space (alloc + free) */
uint64_t mc_dspace; /* total deflated space */
+ uint64_t mc_minblocksize;
+ uint64_t mc_histogram[RANGE_TREE_HISTOGRAM_SIZE];
};
+/*
+ * Metaslab groups encapsulate all the allocatable regions (i.e. metaslabs)
+ * of a top-level vdev. They are linked together to form a circular linked
+ * list and can belong to only one metaslab class. Metaslab groups may become
+ * ineligible for allocations for a number of reasons such as limited free
+ * space, fragmentation, or going offline. When this happens the allocator will
+ * simply find the next metaslab group in the linked list and attempt
+ * to allocate from that group instead.
+ */
struct metaslab_group {
kmutex_t mg_lock;
avl_tree_t mg_metaslab_tree;
uint64_t mg_aliquot;
- uint64_t mg_bonus_area;
- uint64_t mg_alloc_failures;
+ boolean_t mg_allocatable; /* can we allocate? */
+
+ /*
+ * A metaslab group is considered to be initialized only after
+ * we have updated the MOS config and added the space to the pool.
+ * We only allow allocation attempts to a metaslab group if it
+ * has been initialized.
+ */
+ boolean_t mg_initialized;
+
+ uint64_t mg_free_capacity; /* percentage free */
int64_t mg_bias;
int64_t mg_activation_count;
metaslab_class_t *mg_class;
vdev_t *mg_vd;
+ taskq_t *mg_taskq;
metaslab_group_t *mg_prev;
metaslab_group_t *mg_next;
+
+ /*
+ * Each metaslab group can handle mg_max_alloc_queue_depth allocations
+ * which are tracked by mg_alloc_queue_depth. It's possible for a
+ * metaslab group to handle more allocations than its max. This
+ * can occur when gang blocks are required or when other groups
+ * are unable to handle their share of allocations.
+ */
+ uint64_t mg_max_alloc_queue_depth;
+ refcount_t mg_alloc_queue_depth;
+
+ /*
+	 * A metaslab group that can no longer allocate the minimum block
+ * size will set mg_no_free_space. Once a metaslab group is out
+ * of space then its share of work must be distributed to other
+ * groups.
+ */
+ boolean_t mg_no_free_space;
+
+ uint64_t mg_allocations;
+ uint64_t mg_failed_allocations;
+ uint64_t mg_fragmentation;
+ uint64_t mg_histogram[RANGE_TREE_HISTOGRAM_SIZE];
};
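
The mg_max_alloc_queue_depth/mg_alloc_queue_depth fields above describe a
reservation scheme. The following is an illustrative sketch only, using a
plain counter instead of a refcount_t; the "force" flag stands in for the
gang-block case in which a group may exceed its maximum.

	#include <stdbool.h>
	#include <stdint.h>

	typedef struct mg_throttle_ex {
		uint64_t	mgt_max;	/* cf. mg_max_alloc_queue_depth */
		uint64_t	mgt_cur;	/* cf. mg_alloc_queue_depth */
	} mg_throttle_ex_t;

	/* Reserve one allocation slot; fail if the group is saturated. */
	static bool
	mg_reserve_ex(mg_throttle_ex_t *t, bool force)
	{
		if (!force && t->mgt_cur >= t->mgt_max)
			return (false);	/* caller's I/O waits for a free slot */
		t->mgt_cur++;
		return (true);
	}

	/* Release the slot once the allocation completes. */
	static void
	mg_unreserve_ex(mg_throttle_ex_t *t)
	{
		t->mgt_cur--;
	}
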
/*
- * Each metaslab maintains an in-core free map (ms_map) that contains the
- * current list of free segments. As blocks are allocated, the allocated
- * segment is removed from the ms_map and added to a per txg allocation map.
- * As blocks are freed, they are added to the per txg free map. These per
- * txg maps allow us to process all allocations and frees in syncing context
- * where it is safe to update the on-disk space maps.
+ * This value defines the number of elements in the ms_lbas array. The value
+ * of 64 was chosen as it covers all power of 2 buckets up to UINT64_MAX.
+ * This is the equivalent of highbit(UINT64_MAX).
+ */
+#define MAX_LBAS 64
+
+/*
+ * Each metaslab maintains a set of in-core trees to track metaslab operations.
+ * The in-core free tree (ms_tree) contains the current list of free segments.
+ * As blocks are allocated, the allocated segments are removed from the ms_tree
+ * and added to a per txg allocation tree (ms_alloctree). As blocks are freed,
+ * they are added to the per txg free tree (ms_freetree). These per txg
+ * trees allow us to process all allocations and frees in syncing context
+ * where it is safe to update the on-disk space maps. One additional in-core
+ * tree is maintained to track deferred frees (ms_defertree). Once a block
+ * is freed it will move from the ms_freetree to the ms_defertree. A deferred
+ * free means that a block has been freed but cannot be used by the pool
+ * until TXG_DEFER_SIZE transaction groups later. For example, a block
+ * that is freed in txg 50 will not be available for reallocation until
+ * txg 52 (50 + TXG_DEFER_SIZE). This provides a safety net for uberblock
+ * rollback. A pool could be safely rolled back TXG_DEFER_SIZE
+ * transaction groups while ensuring that no block has been reallocated.
*
- * Each metaslab's free space is tracked in a space map object in the MOS,
+ * The simplified transition diagram looks like this:
+ *
+ *
+ * ALLOCATE
+ * |
+ * V
+ * free segment (ms_tree) --------> ms_alloctree ----> (write to space map)
+ * ^
+ * |
+ * | ms_freetree <--- FREE
+ * | |
+ * | |
+ * | |
+ * +----------- ms_defertree <-------+---------> (write to space map)
+ *
+ *
+ * Each metaslab's space is tracked in a single space map in the MOS,
* which is only updated in syncing context. Each time we sync a txg,
- * we append the allocs and frees from that txg to the space map object.
- * When the txg is done syncing, metaslab_sync_done() updates ms_smo
- * to ms_smo_syncing. Everything in ms_smo is always safe to allocate.
+ * we append the allocs and frees from that txg to the space map.
+ * The pool space is only updated once all metaslabs have finished syncing.
*
- * To load the in-core free map we read the space map object from disk.
+ * To load the in-core free tree we read the space map from disk.
* This object contains a series of alloc and free records that are
* combined to make up the list of all free segments in this metaslab. These
- * segments are represented in-core by the ms_map and are stored in an
+ * segments are represented in-core by the ms_tree and are stored in an
* AVL tree.
*
- * As the space map objects grows (as a result of the appends) it will
- * eventually become space-inefficient. When the space map object is
- * zfs_condense_pct/100 times the size of the minimal on-disk representation,
- * we rewrite it in its minimized form.
+ * As the space map grows (as a result of the appends) it will
+ * eventually become space-inefficient. When the metaslab's in-core free tree
+ * is zfs_condense_pct/100 times the size of the minimal on-disk
+ * representation, we rewrite it in its minimized form. If a metaslab
+ * needs to condense then we must set the ms_condensing flag to ensure
+ * that allocations are not performed on the metaslab that is being written.
*/
struct metaslab {
- kmutex_t ms_lock; /* metaslab lock */
- space_map_obj_t ms_smo; /* synced space map object */
- space_map_obj_t ms_smo_syncing; /* syncing space map object */
- space_map_t *ms_allocmap[TXG_SIZE]; /* allocated this txg */
- space_map_t *ms_freemap[TXG_SIZE]; /* freed this txg */
- space_map_t *ms_defermap[TXG_DEFER_SIZE]; /* deferred frees */
- space_map_t *ms_map; /* in-core free space map */
+ kmutex_t ms_lock;
+ kcondvar_t ms_load_cv;
+ space_map_t *ms_sm;
+ metaslab_ops_t *ms_ops;
+ uint64_t ms_id;
+ uint64_t ms_start;
+ uint64_t ms_size;
+ uint64_t ms_fragmentation;
+
+ range_tree_t *ms_alloctree[TXG_SIZE];
+ range_tree_t *ms_freetree[TXG_SIZE];
+ range_tree_t *ms_defertree[TXG_DEFER_SIZE];
+ range_tree_t *ms_tree;
+
+ boolean_t ms_condensing; /* condensing? */
+ boolean_t ms_condense_wanted;
+ boolean_t ms_loaded;
+ boolean_t ms_loading;
+
int64_t ms_deferspace; /* sum of ms_defermap[] space */
uint64_t ms_weight; /* weight vs. others in group */
+ uint64_t ms_access_txg;
+
+ /*
+ * The metaslab block allocators can optionally use a size-ordered
+ * range tree and/or an array of LBAs. Not all allocators use
+ * this functionality. The ms_size_tree should always contain the
+ * same number of segments as the ms_tree. The only difference
+ * is that the ms_size_tree is ordered by segment sizes.
+ */
+ avl_tree_t ms_size_tree;
+ uint64_t ms_lbas[MAX_LBAS];
+
metaslab_group_t *ms_group; /* metaslab group */
avl_node_t ms_group_node; /* node in metaslab group tree */
txg_node_t ms_txg_node; /* per-txg dirty metaslab links */
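
The deferred-free behavior described in the comment above can be checked with
a small standalone program. This is an illustrative sketch only; the constants
mirror TXG_SIZE and TXG_DEFER_SIZE, and the modulo arithmetic shows which slot
of the per-txg tree arrays a given txg maps to.

	#include <stdint.h>
	#include <stdio.h>

	#define	TXG_SIZE_EX	4	/* cf. TXG_SIZE */
	#define	TXG_DEFER_EX	2	/* cf. TXG_DEFER_SIZE */

	int
	main(void)
	{
		uint64_t freed_txg = 50;	/* the example used above */

		/* A block freed in txg 50 is reallocatable in txg 52. */
		printf("freed in txg %llu, reallocatable in txg %llu\n",
		    (unsigned long long)freed_txg,
		    (unsigned long long)(freed_txg + TXG_DEFER_EX));
		/* Slot indices into the per-txg tree arrays. */
		printf("ms_freetree slot %llu, ms_defertree slot %llu\n",
		    (unsigned long long)(freed_txg % TXG_SIZE_EX),
		    (unsigned long long)(freed_txg % TXG_DEFER_EX));
		return (0);
	}
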
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -20,7 +21,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
*/
#ifndef _SYS_REFCOUNT_H
@@ -64,6 +65,7 @@
void refcount_create(refcount_t *rc);
void refcount_create_untracked(refcount_t *rc);
+void refcount_create_tracked(refcount_t *rc);
void refcount_destroy(refcount_t *rc);
void refcount_destroy_many(refcount_t *rc, uint64_t number);
int refcount_is_zero(refcount_t *rc);
@@ -73,6 +75,9 @@
int64_t refcount_add_many(refcount_t *rc, uint64_t number, void *holder_tag);
int64_t refcount_remove_many(refcount_t *rc, uint64_t number, void *holder_tag);
void refcount_transfer(refcount_t *dst, refcount_t *src);
+void refcount_transfer_ownership(refcount_t *, void *, void *);
+boolean_t refcount_held(refcount_t *, void *);
+boolean_t refcount_not_held(refcount_t *, void *);
void refcount_sysinit(void);
void refcount_fini(void);
@@ -85,12 +90,13 @@
#define refcount_create(rc) ((rc)->rc_count = 0)
#define refcount_create_untracked(rc) ((rc)->rc_count = 0)
+#define refcount_create_tracked(rc) ((rc)->rc_count = 0)
#define refcount_destroy(rc) ((rc)->rc_count = 0)
#define refcount_destroy_many(rc, number) ((rc)->rc_count = 0)
#define refcount_is_zero(rc) ((rc)->rc_count == 0)
#define refcount_count(rc) ((rc)->rc_count)
-#define refcount_add(rc, holder) atomic_add_64_nv(&(rc)->rc_count, 1)
-#define refcount_remove(rc, holder) atomic_add_64_nv(&(rc)->rc_count, -1)
+#define refcount_add(rc, holder) atomic_inc_64_nv(&(rc)->rc_count)
+#define refcount_remove(rc, holder) atomic_dec_64_nv(&(rc)->rc_count)
#define refcount_add_many(rc, number, holder) \
atomic_add_64_nv(&(rc)->rc_count, number)
#define refcount_remove_many(rc, number, holder) \
@@ -100,6 +106,9 @@
atomic_add_64(&(src)->rc_count, -__tmp); \
atomic_add_64(&(dst)->rc_count, __tmp); \
}
+#define refcount_transfer_ownership(rc, current_holder, new_holder)
+#define refcount_held(rc, holder) ((rc)->rc_count > 0)
+#define refcount_not_held(rc, holder) (B_TRUE)
#define refcount_sysinit()
#define refcount_fini()
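
In the non-debug build shown above, refcount_add()/refcount_remove() collapse
to atomic increments and decrements and the holder tag is ignored. The sketch
below re-expresses that behavior with C11 atomics for a user-space
demonstration; it is not the in-tree implementation.

	#include <assert.h>
	#include <stdatomic.h>
	#include <stdint.h>

	typedef struct refcount_ex {
		_Atomic uint64_t rc_count;
	} refcount_ex_t;

	/* Like atomic_inc_64_nv(): returns the new count. */
	static uint64_t
	refcount_add_ex(refcount_ex_t *rc, void *holder)
	{
		(void)holder;		/* untracked, like the macro form */
		return (atomic_fetch_add(&rc->rc_count, 1) + 1);
	}

	/* Like atomic_dec_64_nv(): returns the new count. */
	static uint64_t
	refcount_remove_ex(refcount_ex_t *rc, void *holder)
	{
		(void)holder;
		return (atomic_fetch_sub(&rc->rc_count, 1) - 1);
	}

	int
	main(void)
	{
		refcount_ex_t rc = { 0 };
		int tag;

		refcount_add_ex(&rc, &tag);
		assert(refcount_remove_ex(&rc, &tag) == 0);
		return (0);
	}
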
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/rrwlock.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/rrwlock.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/rrwlock.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -69,6 +70,7 @@
void rrw_destroy(rrwlock_t *rrl);
void rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag);
void rrw_enter_read(rrwlock_t *rrl, void *tag);
+void rrw_enter_read_prio(rrwlock_t *rrl, void *tag);
void rrw_enter_write(rrwlock_t *rrl);
void rrw_exit(rrwlock_t *rrl, void *tag);
boolean_t rrw_held(rrwlock_t *rrl, krw_t rw);
@@ -79,6 +81,31 @@
#define RRW_LOCK_HELD(x) \
(rrw_held(x, RW_WRITER) || rrw_held(x, RW_READER))
+/*
+ * A reader-mostly lock implementation, layered on top of reader-writer
+ * locks and tuned for highly parallel read acquisitions at the expense of
+ * write acquisitions.
+ *
+ * This should be a prime number. See comment in rrwlock.c near
+ * RRM_TD_LOCK() for details.
+ */
+#define RRM_NUM_LOCKS 17
+typedef struct rrmlock {
+ rrwlock_t locks[RRM_NUM_LOCKS];
+} rrmlock_t;
+
+void rrm_init(rrmlock_t *rrl, boolean_t track_all);
+void rrm_destroy(rrmlock_t *rrl);
+void rrm_enter(rrmlock_t *rrl, krw_t rw, void *tag);
+void rrm_enter_read(rrmlock_t *rrl, void *tag);
+void rrm_enter_write(rrmlock_t *rrl);
+void rrm_exit(rrmlock_t *rrl, void *tag);
+boolean_t rrm_held(rrmlock_t *rrl, krw_t rw);
+
+#define RRM_READ_HELD(x) rrm_held(x, RW_READER)
+#define RRM_WRITE_HELD(x) rrm_held(x, RW_WRITER)
+#define RRM_LOCK_HELD(x) \
+ (rrm_held(x, RW_WRITER) || rrm_held(x, RW_READER))
+
#ifdef __cplusplus
}
#endif
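
A user-space sketch of the reader-mostly pattern suggested above, assuming
readers hash their thread identity to one of the RRM_NUM_LOCKS underlying
locks while writers must take all of them. The real selection logic lives
near RRM_TD_LOCK() in rrwlock.c; this is not that code, and the pthread-based
types here are purely illustrative (locks are assumed to have been set up
with pthread_rwlock_init()).

	#include <pthread.h>
	#include <stdint.h>

	#define	RRM_NUM_LOCKS_EX	17	/* prime, cf. RRM_NUM_LOCKS */

	typedef struct rrmlock_ex {
		pthread_rwlock_t locks[RRM_NUM_LOCKS_EX];
	} rrmlock_ex_t;

	/* Hash the calling thread to a lock slot; a prime count spreads these. */
	static unsigned
	rrm_self_ex(void)
	{
		return ((unsigned)((uintptr_t)pthread_self() >> 4) %
		    RRM_NUM_LOCKS_EX);
	}

	/* Readers touch only their own slot, so they rarely contend. */
	static void
	rrm_enter_read_ex(rrmlock_ex_t *rrm)
	{
		(void) pthread_rwlock_rdlock(&rrm->locks[rrm_self_ex()]);
	}

	/* Writers pay the cost: every slot must be taken. */
	static void
	rrm_enter_write_ex(rrmlock_ex_t *rrm)
	{
		for (int i = 0; i < RRM_NUM_LOCKS_EX; i++)
			(void) pthread_rwlock_wrlock(&rrm->locks[i]);
	}
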
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -134,7 +135,6 @@
uint32_t buflen, sa_data_locator_t *, void *userdata, dmu_tx_t *);
void sa_object_info(sa_handle_t *, dmu_object_info_t *);
void sa_object_size(sa_handle_t *, uint32_t *, u_longlong_t *);
-void sa_update_user(sa_handle_t *, sa_handle_t *);
void *sa_get_userdata(sa_handle_t *);
void sa_set_userp(sa_handle_t *, void *);
dmu_buf_t *sa_get_db(sa_handle_t *);
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa_impl.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa_impl.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa_impl.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -20,7 +21,8 @@
*/
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/
#ifndef _SYS_SA_IMPL_H
@@ -153,12 +155,13 @@
*
* The header has a fixed portion with a variable number
* of "lengths" depending on the number of variable sized
- * attribues which are determined by the "layout number"
+ * attributes which are determined by the "layout number"
*/
#define SA_MAGIC 0x2F505A /* ZFS SA */
typedef struct sa_hdr_phys {
uint32_t sa_magic;
+ /* BEGIN CSTYLED */
/*
* Encoded with hdrsize and layout number as follows:
* 16 10 0
@@ -175,6 +178,7 @@
* 2 ==> 16 byte header
*
*/
+ /* END CSTYLED */
uint16_t sa_layout_info;
uint16_t sa_lengths[1]; /* optional sizes for variable length attrs */
/* ... Data follows the lengths. */
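
The sa_layout_info encoding sketched in the comment above (layout number in
the low 10 bits, header size in the bits above it, expressed so that a value
of 2 means a 16 byte header) can be decoded as below. This is an illustrative
sketch only and mirrors, but is not, the in-tree SA_HDR_LAYOUT_NUM() and
SA_HDR_SIZE() macros.

	#include <stdint.h>
	#include <stdio.h>

	int
	main(void)
	{
		/* hdrsize code = 2, layout number = 3 (hypothetical values) */
		uint16_t sa_layout_info = (2 << 10) | 3;

		unsigned layout_num = sa_layout_info & 0x3ff;		/* bits 0..9 */
		unsigned hdr_size = ((sa_layout_info >> 10) & 0x3f) * 8; /* 8-byte units */

		printf("layout %u, header size %u bytes\n", layout_num, hdr_size);
		return (0);
	}
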
@@ -208,11 +212,12 @@
*/
struct sa_handle {
+ dmu_buf_user_t sa_dbu;
kmutex_t sa_lock;
dmu_buf_t *sa_bonus;
dmu_buf_t *sa_spill;
objset_t *sa_os;
- void *sa_userp;
+ void *sa_userp;
sa_idx_tab_t *sa_bonus_tab; /* idx of bonus */
sa_idx_tab_t *sa_spill_tab; /* only present if spill activated */
};
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -20,8 +21,11 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
*/
#ifndef _SYS_SPA_H
@@ -65,34 +69,72 @@
#define BF32_GET(x, low, len) BF32_DECODE(x, low, len)
#define BF64_GET(x, low, len) BF64_DECODE(x, low, len)
-#define BF32_SET(x, low, len, val) \
- ((x) ^= BF32_ENCODE((x >> low) ^ (val), low, len))
-#define BF64_SET(x, low, len, val) \
- ((x) ^= BF64_ENCODE((x >> low) ^ (val), low, len))
+#define BF32_SET(x, low, len, val) do { \
+ ASSERT3U(val, <, 1U << (len)); \
+ ASSERT3U(low + len, <=, 32); \
+ (x) ^= BF32_ENCODE((x >> low) ^ (val), low, len); \
+_NOTE(CONSTCOND) } while (0)
+#define BF64_SET(x, low, len, val) do { \
+ ASSERT3U(val, <, 1ULL << (len)); \
+ ASSERT3U(low + len, <=, 64); \
+ ((x) ^= BF64_ENCODE((x >> low) ^ (val), low, len)); \
+_NOTE(CONSTCOND) } while (0)
+
#define BF32_GET_SB(x, low, len, shift, bias) \
((BF32_GET(x, low, len) + (bias)) << (shift))
#define BF64_GET_SB(x, low, len, shift, bias) \
((BF64_GET(x, low, len) + (bias)) << (shift))
-#define BF32_SET_SB(x, low, len, shift, bias, val) \
- BF32_SET(x, low, len, ((val) >> (shift)) - (bias))
-#define BF64_SET_SB(x, low, len, shift, bias, val) \
- BF64_SET(x, low, len, ((val) >> (shift)) - (bias))
+#define BF32_SET_SB(x, low, len, shift, bias, val) do { \
+ ASSERT(IS_P2ALIGNED(val, 1U << shift)); \
+ ASSERT3S((val) >> (shift), >=, bias); \
+ BF32_SET(x, low, len, ((val) >> (shift)) - (bias)); \
+_NOTE(CONSTCOND) } while (0)
+#define BF64_SET_SB(x, low, len, shift, bias, val) do { \
+ ASSERT(IS_P2ALIGNED(val, 1ULL << shift)); \
+ ASSERT3S((val) >> (shift), >=, bias); \
+ BF64_SET(x, low, len, ((val) >> (shift)) - (bias)); \
+_NOTE(CONSTCOND) } while (0)
/*
- * We currently support nine block sizes, from 512 bytes to 128K.
- * We could go higher, but the benefits are near-zero and the cost
- * of COWing a giant block to modify one byte would become excessive.
+ * We currently support block sizes from 512 bytes to 16MB.
+ * The benefits of larger blocks, and thus larger IO, need to be weighed
+ * against the cost of COWing a giant block to modify one byte, and the
+ * large latency of reading or writing a large block.
+ *
+ * Note that although blocks up to 16MB are supported, the recordsize
+ * property can not be set larger than zfs_max_recordsize (default 1MB).
+ * See the comment near zfs_max_recordsize in dsl_dataset.c for details.
+ *
+ * Note that although the LSIZE field of the blkptr_t can store sizes up
+ * to 32MB, the dnode's dn_datablkszsec can only store sizes up to
+ * 32MB - 512 bytes. Therefore, we limit SPA_MAXBLOCKSIZE to 16MB.
*/
#define SPA_MINBLOCKSHIFT 9
-#define SPA_MAXBLOCKSHIFT 17
+#define SPA_OLD_MAXBLOCKSHIFT 17
+#define SPA_MAXBLOCKSHIFT 24
#define SPA_MINBLOCKSIZE (1ULL << SPA_MINBLOCKSHIFT)
+#define SPA_OLD_MAXBLOCKSIZE (1ULL << SPA_OLD_MAXBLOCKSHIFT)
#define SPA_MAXBLOCKSIZE (1ULL << SPA_MAXBLOCKSHIFT)
-#define SPA_BLOCKSIZES (SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1)
+/*
+ * Default maximum supported logical ashift.
+ *
+ * The current 8k allocation block size limit is due to the 8k
+ * aligned/sized operations performed by vdev_probe() on
+ * vdev_label->vl_pad2. Using another "safe region" for these tests
+ * would allow the limit to be raised to 16k, at the expense of
+ * only having 8 available uberblocks in the label area.
+ */
+#define SPA_MAXASHIFT 13
/*
+ * Default minimum supported logical ashift.
+ */
+#define SPA_MINASHIFT SPA_MINBLOCKSHIFT
+
+/*
* Size of block to hold the configuration data (a packed nvlist)
*/
#define SPA_CONFIG_BLOCKSIZE (1ULL << 14)
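
The guarded setter pattern introduced above (range-check the value, then
splice the field into a 64-bit word) and the new block-size shifts can be
demonstrated with the following sketch. It is not the in-tree BF64_SET()
macro; bf64_set_ex() is an invented, plain-C equivalent.

	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	static void
	bf64_set_ex(uint64_t *x, unsigned low, unsigned len, uint64_t val)
	{
		uint64_t mask = (len == 64) ? ~0ULL : ((1ULL << len) - 1);

		assert(val <= mask);		/* cf. ASSERT3U(val, <, 1ULL << len) */
		assert(low + len <= 64);	/* cf. ASSERT3U(low + len, <=, 64) */
		*x = (*x & ~(mask << low)) | (val << low);
	}

	int
	main(void)
	{
		uint64_t word = 0;

		bf64_set_ex(&word, 32, 7, 15);	/* e.g. a 7-bit compress field */
		printf("field = %llu\n",
		    (unsigned long long)((word >> 32) & 0x7f));
		printf("old max block %llu bytes, new max block %llu bytes\n",
		    (unsigned long long)(1ULL << 17),	/* SPA_OLD_MAXBLOCKSHIFT */
		    (unsigned long long)(1ULL << 24));	/* SPA_MAXBLOCKSHIFT */
		return (0);
	}
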
@@ -108,6 +150,8 @@
#define SPA_PSIZEBITS 16 /* PSIZE up to 32M (2^16 * 512) */
#define SPA_ASIZEBITS 24 /* ASIZE up to 64 times larger */
+#define SPA_COMPRESSBITS 7
+
/*
* All SPA data is represented by 128-bit data virtual addresses (DVAs).
* The members of the dva_t should be considered opaque outside the SPA.
@@ -124,6 +168,14 @@
} zio_cksum_t;
/*
+ * Some checksums/hashes need a 256-bit initialization salt. This salt is kept
+ * secret and is suitable for use in MAC algorithms as the key.
+ */
+typedef struct zio_cksum_salt {
+ uint8_t zcs_bytes[32];
+} zio_cksum_salt_t;
+
+/*
* Each block is described by its DVAs, time of birth, checksum, etc.
* The word-by-word, bit-by-bit layout of the blkptr is as follows:
*
@@ -141,7 +193,7 @@
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 5 |G| offset3 |
* +-------+-------+-------+-------+-------+-------+-------+-------+
- * 6 |BDX|lvl| type | cksum | comp | PSIZE | LSIZE |
+ * 6 |BDX|lvl| type | cksum |E| comp| PSIZE | LSIZE |
* +-------+-------+-------+-------+-------+-------+-------+-------+
* 7 | padding |
* +-------+-------+-------+-------+-------+-------+-------+-------+
@@ -175,7 +227,8 @@
* G gang block indicator
* B byteorder (endianness)
* D dedup
- * X unused
+ * X encryption (on version 30, which is not supported)
+ * E blkptr_t contains embedded data (see below)
* lvl level of indirection
* type DMU object type
* phys birth txg of block allocation; zero if same as logical birth txg
@@ -183,9 +236,112 @@
* fill count number of non-zero blocks under this bp
* checksum[4] 256-bit checksum of the data this bp describes
*/
+
+/*
+ * "Embedded" blkptr_t's don't actually point to a block, instead they
+ * have a data payload embedded in the blkptr_t itself. See the comment
+ * in blkptr.c for more details.
+ *
+ * The blkptr_t is laid out as follows:
+ *
+ * 64 56 48 40 32 24 16 8 0
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 0 | payload |
+ * 1 | payload |
+ * 2 | payload |
+ * 3 | payload |
+ * 4 | payload |
+ * 5 | payload |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 6 |BDX|lvl| type | etype |E| comp| PSIZE| LSIZE |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * 7 | payload |
+ * 8 | payload |
+ * 9 | payload |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * a | logical birth txg |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * b | payload |
+ * c | payload |
+ * d | payload |
+ * e | payload |
+ * f | payload |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ *
+ * Legend:
+ *
+ * payload contains the embedded data
+ * B (byteorder) byteorder (endianness)
+ * D (dedup) padding (set to zero)
+ * X encryption (set to zero; see above)
+ * E (embedded) set to one
+ * lvl indirection level
+ * type DMU object type
+ * etype how to interpret embedded data (BP_EMBEDDED_TYPE_*)
+ * comp compression function of payload
+ * PSIZE size of payload after compression, in bytes
+ * LSIZE logical size of payload, in bytes
+ * note that 25 bits is enough to store the largest
+ * "normal" BP's LSIZE (2^16 * 2^9) in bytes
+ * log. birth transaction group in which the block was logically born
+ *
+ * Note that LSIZE and PSIZE are stored in bytes, whereas for non-embedded
+ * bp's they are stored in units of SPA_MINBLOCKSHIFT.
+ * Generally, the generic BP_GET_*() macros can be used on embedded BP's.
+ * The B, D, X, lvl, type, and comp fields are stored the same as with normal
+ * BP's so the BP_SET_* macros can be used with them. etype, PSIZE, LSIZE must
+ * be set with the BPE_SET_* macros. BP_SET_EMBEDDED() should be called before
+ * other macros, as they assert that they are only used on BP's of the correct
+ * "embedded-ness".
+ */
+
+#define BPE_GET_ETYPE(bp) \
+ (ASSERT(BP_IS_EMBEDDED(bp)), \
+ BF64_GET((bp)->blk_prop, 40, 8))
+#define BPE_SET_ETYPE(bp, t) do { \
+ ASSERT(BP_IS_EMBEDDED(bp)); \
+ BF64_SET((bp)->blk_prop, 40, 8, t); \
+_NOTE(CONSTCOND) } while (0)
+
+#define BPE_GET_LSIZE(bp) \
+ (ASSERT(BP_IS_EMBEDDED(bp)), \
+ BF64_GET_SB((bp)->blk_prop, 0, 25, 0, 1))
+#define BPE_SET_LSIZE(bp, x) do { \
+ ASSERT(BP_IS_EMBEDDED(bp)); \
+ BF64_SET_SB((bp)->blk_prop, 0, 25, 0, 1, x); \
+_NOTE(CONSTCOND) } while (0)
+
+#define BPE_GET_PSIZE(bp) \
+ (ASSERT(BP_IS_EMBEDDED(bp)), \
+ BF64_GET_SB((bp)->blk_prop, 25, 7, 0, 1))
+#define BPE_SET_PSIZE(bp, x) do { \
+ ASSERT(BP_IS_EMBEDDED(bp)); \
+ BF64_SET_SB((bp)->blk_prop, 25, 7, 0, 1, x); \
+_NOTE(CONSTCOND) } while (0)
+
+typedef enum bp_embedded_type {
+ BP_EMBEDDED_TYPE_DATA,
+ BP_EMBEDDED_TYPE_RESERVED, /* Reserved for an unintegrated feature. */
+ NUM_BP_EMBEDDED_TYPES = BP_EMBEDDED_TYPE_RESERVED
+} bp_embedded_type_t;
+
+#define BPE_NUM_WORDS 14
+#define BPE_PAYLOAD_SIZE (BPE_NUM_WORDS * sizeof (uint64_t))
+#define BPE_IS_PAYLOADWORD(bp, wp) \
+ ((wp) != &(bp)->blk_prop && (wp) != &(bp)->blk_birth)
+
#define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */
#define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */
+/*
+ * A block is a hole when it either 1) has never been written to, or
+ * 2) is zero-filled. In both cases, ZFS can return all zeroes for all reads
+ * without physically allocating disk space. Holes are represented in the
+ * blkptr_t structure by zeroed blk_dva. Correct checking for holes is
+ * done through the BP_IS_HOLE macro. The logical size, level, DMU object
+ * type, and birth times are also stored for holes that were written to at
+ * some point (i.e. were punched after having been filled).
+ */
typedef struct blkptr {
dva_t blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */
uint64_t blk_prop; /* size, compression, type, etc */
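
The embedded-blkptr payload arithmetic from the layout above can be checked
with the sketch below. It is illustrative only: the constant mirrors
BPE_NUM_WORDS, and the trailing "0, 1" arguments to the BF64_*_SB macros mean
the LSIZE/PSIZE fields are stored with a bias of 1, so a raw field value of 0
decodes to a size of 1 byte.

	#include <stdint.h>
	#include <stdio.h>

	#define	BPE_NUM_WORDS_EX	14	/* cf. BPE_NUM_WORDS */

	int
	main(void)
	{
		uint64_t payload_bytes = BPE_NUM_WORDS_EX * sizeof (uint64_t);
		uint64_t lsize_field = 511;	/* hypothetical raw 25-bit value */

		printf("embedded payload capacity: %llu bytes\n",
		    (unsigned long long)payload_bytes);		/* 112 */
		printf("decoded LSIZE: %llu bytes\n",
		    (unsigned long long)(lsize_field + 1));	/* bias of 1 */
		return (0);
	}
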
@@ -200,9 +356,10 @@
* Macros to get and set fields in a bp or DVA.
*/
#define DVA_GET_ASIZE(dva) \
- BF64_GET_SB((dva)->dva_word[0], 0, 24, SPA_MINBLOCKSHIFT, 0)
+ BF64_GET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, SPA_MINBLOCKSHIFT, 0)
#define DVA_SET_ASIZE(dva, x) \
- BF64_SET_SB((dva)->dva_word[0], 0, 24, SPA_MINBLOCKSHIFT, 0, x)
+ BF64_SET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, \
+ SPA_MINBLOCKSHIFT, 0, x)
#define DVA_GET_GRID(dva) BF64_GET((dva)->dva_word[0], 24, 8)
#define DVA_SET_GRID(dva, x) BF64_SET((dva)->dva_word[0], 24, 8, x)
@@ -219,21 +376,40 @@
#define DVA_SET_GANG(dva, x) BF64_SET((dva)->dva_word[1], 63, 1, x)
#define BP_GET_LSIZE(bp) \
- BF64_GET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1)
-#define BP_SET_LSIZE(bp, x) \
- BF64_SET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x)
+ (BP_IS_EMBEDDED(bp) ? \
+ (BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA ? BPE_GET_LSIZE(bp) : 0): \
+ BF64_GET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1))
+#define BP_SET_LSIZE(bp, x) do { \
+ ASSERT(!BP_IS_EMBEDDED(bp)); \
+ BF64_SET_SB((bp)->blk_prop, \
+ 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \
+_NOTE(CONSTCOND) } while (0)
#define BP_GET_PSIZE(bp) \
- BF64_GET_SB((bp)->blk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1)
-#define BP_SET_PSIZE(bp, x) \
- BF64_SET_SB((bp)->blk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1, x)
+ (BP_IS_EMBEDDED(bp) ? 0 : \
+ BF64_GET_SB((bp)->blk_prop, 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1))
+#define BP_SET_PSIZE(bp, x) do { \
+ ASSERT(!BP_IS_EMBEDDED(bp)); \
+ BF64_SET_SB((bp)->blk_prop, \
+ 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \
+_NOTE(CONSTCOND) } while (0)
-#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 8)
-#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 8, x)
+#define BP_GET_COMPRESS(bp) \
+ BF64_GET((bp)->blk_prop, 32, SPA_COMPRESSBITS)
+#define BP_SET_COMPRESS(bp, x) \
+ BF64_SET((bp)->blk_prop, 32, SPA_COMPRESSBITS, x)
-#define BP_GET_CHECKSUM(bp) BF64_GET((bp)->blk_prop, 40, 8)
-#define BP_SET_CHECKSUM(bp, x) BF64_SET((bp)->blk_prop, 40, 8, x)
+#define BP_IS_EMBEDDED(bp) BF64_GET((bp)->blk_prop, 39, 1)
+#define BP_SET_EMBEDDED(bp, x) BF64_SET((bp)->blk_prop, 39, 1, x)
+#define BP_GET_CHECKSUM(bp) \
+ (BP_IS_EMBEDDED(bp) ? ZIO_CHECKSUM_OFF : \
+ BF64_GET((bp)->blk_prop, 40, 8))
+#define BP_SET_CHECKSUM(bp, x) do { \
+ ASSERT(!BP_IS_EMBEDDED(bp)); \
+ BF64_SET((bp)->blk_prop, 40, 8, x); \
+_NOTE(CONSTCOND) } while (0)
+
#define BP_GET_TYPE(bp) BF64_GET((bp)->blk_prop, 48, 8)
#define BP_SET_TYPE(bp, x) BF64_SET((bp)->blk_prop, 48, 8, x)
@@ -240,27 +416,30 @@
#define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5)
#define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x)
-#define BP_GET_PROP_BIT_61(bp) BF64_GET((bp)->blk_prop, 61, 1)
-#define BP_SET_PROP_BIT_61(bp, x) BF64_SET((bp)->blk_prop, 61, 1, x)
-
#define BP_GET_DEDUP(bp) BF64_GET((bp)->blk_prop, 62, 1)
#define BP_SET_DEDUP(bp, x) BF64_SET((bp)->blk_prop, 62, 1, x)
-#define BP_GET_BYTEORDER(bp) (0 - BF64_GET((bp)->blk_prop, 63, 1))
+#define BP_GET_BYTEORDER(bp) BF64_GET((bp)->blk_prop, 63, 1)
#define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x)
#define BP_PHYSICAL_BIRTH(bp) \
- ((bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth)
+ (BP_IS_EMBEDDED(bp) ? 0 : \
+ (bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth)
#define BP_SET_BIRTH(bp, logical, physical) \
{ \
+ ASSERT(!BP_IS_EMBEDDED(bp)); \
(bp)->blk_birth = (logical); \
(bp)->blk_phys_birth = ((logical) == (physical) ? 0 : (physical)); \
}
+#define BP_GET_FILL(bp) (BP_IS_EMBEDDED(bp) ? 1 : (bp)->blk_fill)
+
#define BP_GET_ASIZE(bp) \
- (DVA_GET_ASIZE(&(bp)->blk_dva[0]) + DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
- DVA_GET_ASIZE(&(bp)->blk_dva[2]))
+ (BP_IS_EMBEDDED(bp) ? 0 : \
+ DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
+ DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
+ DVA_GET_ASIZE(&(bp)->blk_dva[2]))
#define BP_GET_UCSIZE(bp) \
((BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) ? \
@@ -267,14 +446,16 @@
BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp))
#define BP_GET_NDVAS(bp) \
- (!!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
+ (BP_IS_EMBEDDED(bp) ? 0 : \
+ !!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
!!DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
!!DVA_GET_ASIZE(&(bp)->blk_dva[2]))
#define BP_COUNT_GANG(bp) \
+ (BP_IS_EMBEDDED(bp) ? 0 : \
(DVA_GET_GANG(&(bp)->blk_dva[0]) + \
DVA_GET_GANG(&(bp)->blk_dva[1]) + \
- DVA_GET_GANG(&(bp)->blk_dva[2]))
+ DVA_GET_GANG(&(bp)->blk_dva[2])))
#define DVA_EQUAL(dva1, dva2) \
((dva1)->dva_word[1] == (dva2)->dva_word[1] && \
@@ -282,6 +463,7 @@
#define BP_EQUAL(bp1, bp2) \
(BP_PHYSICAL_BIRTH(bp1) == BP_PHYSICAL_BIRTH(bp2) && \
+ (bp1)->blk_birth == (bp2)->blk_birth && \
DVA_EQUAL(&(bp1)->blk_dva[0], &(bp2)->blk_dva[0]) && \
DVA_EQUAL(&(bp1)->blk_dva[1], &(bp2)->blk_dva[1]) && \
DVA_EQUAL(&(bp1)->blk_dva[2], &(bp2)->blk_dva[2]))
@@ -292,6 +474,19 @@
((zc1).zc_word[2] - (zc2).zc_word[2]) | \
((zc1).zc_word[3] - (zc2).zc_word[3])))
+#define ZIO_CHECKSUM_IS_ZERO(zc) \
+ (0 == ((zc)->zc_word[0] | (zc)->zc_word[1] | \
+ (zc)->zc_word[2] | (zc)->zc_word[3]))
+
+#define ZIO_CHECKSUM_BSWAP(zcp) \
+{ \
+ (zcp)->zc_word[0] = BSWAP_64((zcp)->zc_word[0]); \
+ (zcp)->zc_word[1] = BSWAP_64((zcp)->zc_word[1]); \
+ (zcp)->zc_word[2] = BSWAP_64((zcp)->zc_word[2]); \
+ (zcp)->zc_word[3] = BSWAP_64((zcp)->zc_word[3]); \
+}
+
+
#define DVA_IS_VALID(dva) (DVA_GET_ASIZE(dva) != 0)
#define ZIO_SET_CHECKSUM(zcp, w0, w1, w2, w3) \
@@ -302,9 +497,13 @@
(zcp)->zc_word[3] = w3; \
}
-#define BP_IDENTITY(bp) (&(bp)->blk_dva[0])
-#define BP_IS_GANG(bp) DVA_GET_GANG(BP_IDENTITY(bp))
-#define BP_IS_HOLE(bp) ((bp)->blk_birth == 0)
+#define BP_IDENTITY(bp) (ASSERT(!BP_IS_EMBEDDED(bp)), &(bp)->blk_dva[0])
+#define BP_IS_GANG(bp) \
+ (BP_IS_EMBEDDED(bp) ? B_FALSE : DVA_GET_GANG(BP_IDENTITY(bp)))
+#define DVA_IS_EMPTY(dva) ((dva)->dva_word[0] == 0ULL && \
+ (dva)->dva_word[1] == 0ULL)
+#define BP_IS_HOLE(bp) \
+ (!BP_IS_EMBEDDED(bp) && DVA_IS_EMPTY(BP_IDENTITY(bp)))
/* BP_IS_RAIDZ(bp) assumes no block compression */
#define BP_IS_RAIDZ(bp) (DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \
@@ -327,14 +526,10 @@
ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \
}
-/*
- * Note: the byteorder is either 0 or -1, both of which are palindromes.
- * This simplifies the endianness handling a bit.
- */
#if BYTE_ORDER == _BIG_ENDIAN
#define ZFS_HOST_BYTEORDER (0ULL)
#else
-#define ZFS_HOST_BYTEORDER (-1ULL)
+#define ZFS_HOST_BYTEORDER (1ULL)
#endif
#define BP_SHOULD_BYTESWAP(bp) (BP_GET_BYTEORDER(bp) != ZFS_HOST_BYTEORDER)
@@ -346,18 +541,34 @@
* 'func' is either snprintf() or mdb_snprintf().
* 'ws' (whitespace) can be ' ' for single-line format, '\n' for multi-line.
*/
-#define SPRINTF_BLKPTR(func, ws, buf, bp, type, checksum, compress) \
+#define SNPRINTF_BLKPTR(func, ws, buf, size, bp, type, checksum, compress) \
{ \
static const char *copyname[] = \
{ "zero", "single", "double", "triple" }; \
- int size = BP_SPRINTF_LEN; \
int len = 0; \
int copies = 0; \
\
if (bp == NULL) { \
- len = func(buf + len, size - len, "<NULL>"); \
+ len += func(buf + len, size - len, "<NULL>"); \
} else if (BP_IS_HOLE(bp)) { \
- len = func(buf + len, size - len, "<hole>"); \
+ len += func(buf + len, size - len, \
+ "HOLE [L%llu %s] " \
+ "size=%llxL birth=%lluL", \
+ (u_longlong_t)BP_GET_LEVEL(bp), \
+ type, \
+ (u_longlong_t)BP_GET_LSIZE(bp), \
+ (u_longlong_t)bp->blk_birth); \
+ } else if (BP_IS_EMBEDDED(bp)) { \
+ len = func(buf + len, size - len, \
+ "EMBEDDED [L%llu %s] et=%u %s " \
+ "size=%llxL/%llxP birth=%lluL", \
+ (u_longlong_t)BP_GET_LEVEL(bp), \
+ type, \
+ (int)BPE_GET_ETYPE(bp), \
+ compress, \
+ (u_longlong_t)BPE_GET_LSIZE(bp), \
+ (u_longlong_t)BPE_GET_PSIZE(bp), \
+ (u_longlong_t)bp->blk_birth); \
} else { \
for (int d = 0; d < BP_GET_NDVAS(bp); d++) { \
const dva_t *dva = &bp->blk_dva[d]; \
@@ -391,7 +602,7 @@
(u_longlong_t)BP_GET_PSIZE(bp), \
(u_longlong_t)bp->blk_birth, \
(u_longlong_t)BP_PHYSICAL_BIRTH(bp), \
- (u_longlong_t)bp->blk_fill, \
+ (u_longlong_t)BP_GET_FILL(bp), \
ws, \
(u_longlong_t)bp->blk_cksum.zc_word[0], \
(u_longlong_t)bp->blk_cksum.zc_word[1], \
@@ -420,7 +631,7 @@
size_t buflen);
extern int spa_create(const char *pool, nvlist_t *config, nvlist_t *props,
nvlist_t *zplprops);
-#if defined(sun)
+#ifdef illumos
extern int spa_import_rootpool(char *devpath, char *devid);
#else
extern int spa_import_rootpool(const char *name);
@@ -521,6 +732,7 @@
/* Refcount functions */
extern void spa_open_ref(spa_t *spa, void *tag);
extern void spa_close(spa_t *spa, void *tag);
+extern void spa_async_close(spa_t *spa, void *tag);
extern boolean_t spa_refcount_zero(spa_t *spa);
#define SCL_NONE 0x00
@@ -587,11 +799,15 @@
extern uint64_t spa_freeze_txg(spa_t *spa);
extern uint64_t spa_get_asize(spa_t *spa, uint64_t lsize);
extern uint64_t spa_get_dspace(spa_t *spa);
+extern uint64_t spa_get_slop_space(spa_t *spa);
extern void spa_update_dspace(spa_t *spa);
extern uint64_t spa_version(spa_t *spa);
extern boolean_t spa_deflate(spa_t *spa);
extern metaslab_class_t *spa_normal_class(spa_t *spa);
extern metaslab_class_t *spa_log_class(spa_t *spa);
+extern void spa_evicting_os_register(spa_t *, objset_t *os);
+extern void spa_evicting_os_deregister(spa_t *, objset_t *os);
+extern void spa_evicting_os_wait(spa_t *spa);
extern int spa_max_replication(spa_t *spa);
extern int spa_prev_software_version(spa_t *spa);
extern int spa_busy(void);
@@ -603,7 +819,8 @@
extern uint64_t spa_deadman_synctime(spa_t *spa);
/* Miscellaneous support routines */
-extern void spa_activate_mos_feature(spa_t *spa, const char *feature);
+extern void spa_activate_mos_feature(spa_t *spa, const char *feature,
+ dmu_tx_t *tx);
extern void spa_deactivate_mos_feature(spa_t *spa, const char *feature);
extern int spa_rename(const char *oldname, const char *newname);
extern spa_t *spa_by_guid(uint64_t pool_guid, uint64_t device_guid);
@@ -612,7 +829,7 @@
extern void spa_strfree(char *);
extern uint64_t spa_get_random(uint64_t range);
extern uint64_t spa_generate_guid(spa_t *spa);
-extern void sprintf_blkptr(char *buf, const blkptr_t *bp);
+extern void snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp);
extern void spa_freeze(spa_t *spa);
extern int spa_change_guid(spa_t *spa);
extern void spa_upgrade(spa_t *spa, uint64_t version);
@@ -626,6 +843,9 @@
extern boolean_t spa_has_slogs(spa_t *spa);
extern boolean_t spa_is_root(spa_t *spa);
extern boolean_t spa_writeable(spa_t *spa);
+extern boolean_t spa_has_pending_synctask(spa_t *spa);
+extern int spa_maxblocksize(spa_t *spa);
+extern void zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp);
extern int spa_mode(spa_t *spa);
extern uint64_t zfs_strtonum(const char *str, char **nptr);
@@ -647,7 +867,7 @@
dmu_tx_t *tx, const char *fmt, ...);
/* error handling */
-struct zbookmark;
+struct zbookmark_phys;
extern void spa_log_error(spa_t *spa, zio_t *zio);
extern void zfs_ereport_post(const char *cls, spa_t *spa, vdev_t *vd,
zio_t *zio, uint64_t stateoroffset, uint64_t length);
@@ -681,9 +901,9 @@
#ifdef ZFS_DEBUG
#define dprintf_bp(bp, fmt, ...) do { \
- if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
+ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \
- sprintf_blkptr(__blkbuf, (bp)); \
+ snprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, (bp)); \
dprintf(fmt " %s\n", __VA_ARGS__, __blkbuf); \
kmem_free(__blkbuf, BP_SPRINTF_LEN); \
} \
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_boot.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_boot.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_boot.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -20,9 +21,11 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright 2013 Martin Matuska <mm at FreeBSD.org>. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
*/
#ifndef _SYS_SPA_IMPL_H
@@ -39,6 +42,8 @@
#include <sys/refcount.h>
#include <sys/bplist.h>
#include <sys/bpobj.h>
+#include <sys/zfeature.h>
+#include <zfeature_common.h>
#ifdef __cplusplus
extern "C" {
@@ -45,9 +50,9 @@
#endif
typedef struct spa_error_entry {
- zbookmark_t se_bookmark;
- char *se_name;
- avl_node_t se_avl;
+ zbookmark_phys_t se_bookmark;
+ char *se_name;
+ avl_node_t se_avl;
} spa_error_entry_t;
typedef struct spa_history_phys {
@@ -81,16 +86,16 @@
char *scd_path;
} spa_config_dirent_t;
-enum zio_taskq_type {
+typedef enum zio_taskq_type {
ZIO_TASKQ_ISSUE = 0,
ZIO_TASKQ_ISSUE_HIGH,
ZIO_TASKQ_INTERRUPT,
ZIO_TASKQ_INTERRUPT_HIGH,
ZIO_TASKQ_TYPES
-};
+} zio_taskq_type_t;
/*
- * State machine for the zpool-pooname process. The states transitions
+ * State machine for the zpool-poolname process. The state transitions
* are done as follows:
*
* From To Routine
@@ -108,11 +113,16 @@
SPA_PROC_GONE /* spa_thread() is exiting, spa_proc = &p0 */
} spa_proc_state_t;
+typedef struct spa_taskqs {
+ uint_t stqs_count;
+ taskq_t **stqs_taskq;
+} spa_taskqs_t;
+
struct spa {
/*
* Fields protected by spa_namespace_lock.
*/
- char spa_name[MAXNAMELEN]; /* pool name */
+ char spa_name[ZFS_MAX_DATASET_NAME_LEN]; /* pool name */
char *spa_comment; /* comment */
avl_node_t spa_avl; /* node in spa_namespace_avl */
nvlist_t *spa_config; /* last synced config */
@@ -126,7 +136,7 @@
uint8_t spa_sync_on; /* sync threads are running */
spa_load_state_t spa_load_state; /* current load operation */
uint64_t spa_import_flags; /* import specific flags */
- taskq_t *spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES];
+ spa_taskqs_t spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES];
dsl_pool_t *spa_dsl_pool;
boolean_t spa_is_initializing; /* true while opening pool */
metaslab_class_t *spa_normal_class; /* normal data class */
@@ -138,13 +148,20 @@
uint64_t spa_claim_max_txg; /* highest claimed birth txg */
timespec_t spa_loaded_ts; /* 1st successful open time */
objset_t *spa_meta_objset; /* copy of dp->dp_meta_objset */
+ kmutex_t spa_evicting_os_lock; /* Evicting objset list lock */
+ list_t spa_evicting_os_list; /* Objsets being evicted. */
+ kcondvar_t spa_evicting_os_cv; /* Objset Eviction Completion */
txg_list_t spa_vdev_txg_list; /* per-txg dirty vdev list */
vdev_t *spa_root_vdev; /* top-level vdev container */
+ int spa_min_ashift; /* of vdevs in normal class */
+ int spa_max_ashift; /* of vdevs in normal class */
uint64_t spa_config_guid; /* config pool guid */
uint64_t spa_load_guid; /* spa_load initialized guid */
uint64_t spa_last_synced_guid; /* last synced guid */
list_t spa_config_dirty_list; /* vdevs with dirty config */
list_t spa_state_dirty_list; /* vdevs with dirty state */
+ kmutex_t spa_alloc_lock;
+ avl_tree_t spa_alloc_tree;
spa_aux_vdev_t spa_spares; /* hot spares */
spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */
nvlist_t *spa_label_features; /* Features for reading MOS */
@@ -153,6 +170,10 @@
uint64_t spa_syncing_txg; /* txg currently syncing */
bpobj_t spa_deferred_bpobj; /* deferred-free bplist */
bplist_t spa_free_bplist[TXG_SIZE]; /* bplist of stuff to free */
+ zio_cksum_salt_t spa_cksum_salt; /* secret salt for cksum */
+ /* checksum context templates */
+ kmutex_t spa_cksum_tmpls_lock;
+ void *spa_cksum_tmpls[ZIO_CHECKSUM_FUNCTIONS];
uberblock_t spa_ubsync; /* last synced uberblock */
uberblock_t spa_uberblock; /* current uberblock */
boolean_t spa_extreme_rewind; /* rewind past deferred frees */
@@ -169,6 +190,7 @@
uint64_t spa_scan_pass_exam; /* examined bytes per pass */
kmutex_t spa_async_lock; /* protect async state */
kthread_t *spa_async_thread; /* thread doing async task */
+ kthread_t *spa_async_thread_vd; /* thread doing vd async task */
int spa_async_suspended; /* async tasks suspended */
kcondvar_t spa_async_cv; /* wait for thread_exit() */
uint16_t spa_async_tasks; /* async task mask */
@@ -198,7 +220,8 @@
uint64_t spa_failmode; /* failure mode for the pool */
uint64_t spa_delegation; /* delegation on/off */
list_t spa_config_list; /* previous cache file(s) */
- zio_t *spa_async_zio_root; /* root of all async I/O */
+ /* per-CPU array of root of async I/O: */
+ zio_t **spa_async_zio_root;
zio_t *spa_suspend_zio_root; /* root of all suspended I/O */
kmutex_t spa_suspend_lock; /* protects suspend_zio_root */
kcondvar_t spa_suspend_cv; /* notification of resume */
@@ -231,17 +254,34 @@
uint64_t spa_feat_for_write_obj; /* required to write to pool */
uint64_t spa_feat_for_read_obj; /* required to read from pool */
uint64_t spa_feat_desc_obj; /* Feature descriptions */
+ uint64_t spa_feat_enabled_txg_obj; /* Feature enabled txg */
+ /* cache feature refcounts */
+ uint64_t spa_feat_refcount_cache[SPA_FEATURES];
#ifdef illumos
cyclic_id_t spa_deadman_cycid; /* cyclic id */
-#else /* FreeBSD */
+#else /* !illumos */
#ifdef _KERNEL
struct callout spa_deadman_cycid; /* callout id */
+ struct task spa_deadman_task;
#endif
#endif /* illumos */
uint64_t spa_deadman_calls; /* number of deadman calls */
- uint64_t spa_sync_starttime; /* starting time fo spa_sync */
+	hrtime_t	spa_sync_starttime;	/* starting time of spa_sync */
uint64_t spa_deadman_synctime; /* deadman expiration timer */
+#ifdef illumos
+ /*
+ * spa_iokstat_lock protects spa_iokstat and
+ * spa_queue_stats[].
+ */
+ kmutex_t spa_iokstat_lock;
+ struct kstat *spa_iokstat; /* kstat of io to this pool */
+ struct {
+ int spa_active;
+ int spa_queued;
+ } spa_queue_stats[ZIO_PRIORITY_NUM_QUEUEABLE];
+#endif
hrtime_t spa_ccw_fail_time; /* Conf cache write fail time */
+
/*
* spa_refcount & spa_config_lock must be the last elements
* because refcount_t changes size based on compilation options.
@@ -250,7 +290,7 @@
*/
spa_config_lock_t spa_config_lock[SCL_LOCKS]; /* config changes */
refcount_t spa_refcount; /* number of opens */
-#ifndef sun
+#ifndef illumos
boolean_t spa_splitting_newspa; /* creating new spa in split */
#endif
};
@@ -257,6 +297,9 @@
extern const char *spa_config_path;
+extern void spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
+ task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent);
+
#ifdef __cplusplus
}
#endif
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -24,7 +25,7 @@
*/
/*
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
*/
#ifndef _SYS_SPACE_MAP_H
@@ -31,6 +32,7 @@
#define _SYS_SPACE_MAP_H
#include <sys/avl.h>
+#include <sys/range_tree.h>
#include <sys/dmu.h>
#ifdef __cplusplus
@@ -37,53 +39,56 @@
extern "C" {
#endif
-typedef struct space_map_ops space_map_ops_t;
+/*
+ * The size of the space map object has increased to include a histogram.
+ * The SPACE_MAP_SIZE_V0 designates the original size and is used to
+ * maintain backward compatibility.
+ */
+#define SPACE_MAP_SIZE_V0 (3 * sizeof (uint64_t))
+#define SPACE_MAP_HISTOGRAM_SIZE 32
+/*
+ * The space_map_phys is the on-disk representation of the space map.
+ * Consumers of space maps should never reference any of the members of this
+ * structure directly. These members may only be updated in syncing context.
+ *
+ * Note the smp_object is no longer used but remains in the structure
+ * for backward compatibility.
+ */
+typedef struct space_map_phys {
+ uint64_t smp_object; /* on-disk space map object */
+ uint64_t smp_objsize; /* size of the object */
+ uint64_t smp_alloc; /* space allocated from the map */
+ uint64_t smp_pad[5]; /* reserved */
+
+ /*
+ * The smp_histogram maintains a histogram of free regions. Each
+ * bucket, smp_histogram[i], contains the number of free regions
+ * whose size is:
+ * 2^(i+sm_shift) <= size of free region in bytes < 2^(i+sm_shift+1)
+ */
+ uint64_t smp_histogram[SPACE_MAP_HISTOGRAM_SIZE];
+} space_map_phys_t;
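
The bucket bounds described in the smp_histogram comment above can be printed
with the following sketch. It is illustrative only and assumes a metaslab with
sm_shift = 9 (512 byte units): bucket i counts free regions whose size falls
in [2^(i+shift), 2^(i+shift+1)).

	#include <stdio.h>
	#include <stdint.h>

	int
	main(void)
	{
		unsigned sm_shift = 9;	/* hypothetical unit shift */

		for (unsigned i = 0; i < 4; i++) {
			printf("bucket %u: [%llu, %llu) bytes\n", i,
			    (unsigned long long)(1ULL << (i + sm_shift)),
			    (unsigned long long)(1ULL << (i + sm_shift + 1)));
		}
		return (0);
	}
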
+
+/*
+ * The space map object defines a region of space, its size, how much is
+ * allocated, and the on-disk object that stores this information.
+ * Consumers of space maps may only access the members of this structure.
+ */
typedef struct space_map {
- avl_tree_t sm_root; /* offset-ordered segment AVL tree */
- uint64_t sm_space; /* sum of all segments in the map */
uint64_t sm_start; /* start of map */
uint64_t sm_size; /* size of map */
uint8_t sm_shift; /* unit shift */
- uint8_t sm_loaded; /* map loaded? */
- uint8_t sm_loading; /* map loading? */
- uint8_t sm_condensing; /* map condensing? */
- kcondvar_t sm_load_cv; /* map load completion */
- space_map_ops_t *sm_ops; /* space map block picker ops vector */
- avl_tree_t *sm_pp_root; /* size-ordered, picker-private tree */
- void *sm_ppd; /* picker-private data */
+ uint64_t sm_length; /* synced length */
+ uint64_t sm_alloc; /* synced space allocated */
+ objset_t *sm_os; /* objset for this map */
+ uint64_t sm_object; /* object id for this map */
+ uint32_t sm_blksz; /* block size for space map */
+ dmu_buf_t *sm_dbuf; /* space_map_phys_t dbuf */
+ space_map_phys_t *sm_phys; /* on-disk space map */
kmutex_t *sm_lock; /* pointer to lock that protects map */
} space_map_t;
-typedef struct space_seg {
- avl_node_t ss_node; /* AVL node */
- avl_node_t ss_pp_node; /* AVL picker-private node */
- uint64_t ss_start; /* starting offset of this segment */
- uint64_t ss_end; /* ending offset (non-inclusive) */
-} space_seg_t;
-
-typedef struct space_ref {
- avl_node_t sr_node; /* AVL node */
- uint64_t sr_offset; /* offset (start or end) */
- int64_t sr_refcnt; /* associated reference count */
-} space_ref_t;
-
-typedef struct space_map_obj {
- uint64_t smo_object; /* on-disk space map object */
- uint64_t smo_objsize; /* size of the object */
- uint64_t smo_alloc; /* space allocated from the map */
-} space_map_obj_t;
-
-struct space_map_ops {
- void (*smop_load)(space_map_t *sm);
- void (*smop_unload)(space_map_t *sm);
- uint64_t (*smop_alloc)(space_map_t *sm, uint64_t size);
- void (*smop_claim)(space_map_t *sm, uint64_t start, uint64_t size);
- void (*smop_free)(space_map_t *sm, uint64_t start, uint64_t size);
- uint64_t (*smop_max)(space_map_t *sm);
- boolean_t (*smop_fragmented)(space_map_t *sm);
-};
-
/*
* debug entry
*
@@ -124,61 +129,34 @@
#define SM_RUN_MAX SM_RUN_DECODE(~0ULL)
-#define SM_ALLOC 0x0
-#define SM_FREE 0x1
+typedef enum {
+ SM_ALLOC,
+ SM_FREE
+} maptype_t;
-/*
- * The data for a given space map can be kept on blocks of any size.
- * Larger blocks entail fewer i/o operations, but they also cause the
- * DMU to keep more data in-core, and also to waste more i/o bandwidth
- * when only a few blocks have changed since the last transaction group.
- * This could use a lot more research, but for now, set the freelist
- * block size to 4k (2^12).
- */
-#define SPACE_MAP_BLOCKSHIFT 12
+int space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype);
-typedef void space_map_func_t(space_map_t *sm, uint64_t start, uint64_t size);
+void space_map_histogram_clear(space_map_t *sm);
+void space_map_histogram_add(space_map_t *sm, range_tree_t *rt,
+ dmu_tx_t *tx);
-extern void space_map_init(void);
-extern void space_map_fini(void);
-extern void space_map_create(space_map_t *sm, uint64_t start, uint64_t size,
- uint8_t shift, kmutex_t *lp);
-extern void space_map_destroy(space_map_t *sm);
-extern void space_map_add(space_map_t *sm, uint64_t start, uint64_t size);
-extern void space_map_remove(space_map_t *sm, uint64_t start, uint64_t size);
-extern boolean_t space_map_contains(space_map_t *sm,
- uint64_t start, uint64_t size);
-extern space_seg_t *space_map_find(space_map_t *sm, uint64_t start,
- uint64_t size, avl_index_t *wherep);
-extern void space_map_swap(space_map_t **msrc, space_map_t **mdest);
-extern void space_map_vacate(space_map_t *sm,
- space_map_func_t *func, space_map_t *mdest);
-extern void space_map_walk(space_map_t *sm,
- space_map_func_t *func, space_map_t *mdest);
+void space_map_update(space_map_t *sm);
-extern void space_map_load_wait(space_map_t *sm);
-extern int space_map_load(space_map_t *sm, space_map_ops_t *ops,
- uint8_t maptype, space_map_obj_t *smo, objset_t *os);
-extern void space_map_unload(space_map_t *sm);
+uint64_t space_map_object(space_map_t *sm);
+uint64_t space_map_allocated(space_map_t *sm);
+uint64_t space_map_length(space_map_t *sm);
-extern uint64_t space_map_alloc(space_map_t *sm, uint64_t size);
-extern void space_map_claim(space_map_t *sm, uint64_t start, uint64_t size);
-extern void space_map_free(space_map_t *sm, uint64_t start, uint64_t size);
-extern uint64_t space_map_maxsize(space_map_t *sm);
+void space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
+ dmu_tx_t *tx);
+void space_map_truncate(space_map_t *sm, dmu_tx_t *tx);
+uint64_t space_map_alloc(objset_t *os, dmu_tx_t *tx);
+void space_map_free(space_map_t *sm, dmu_tx_t *tx);
-extern void space_map_sync(space_map_t *sm, uint8_t maptype,
- space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx);
-extern void space_map_truncate(space_map_obj_t *smo,
- objset_t *os, dmu_tx_t *tx);
+int space_map_open(space_map_t **smp, objset_t *os, uint64_t object,
+ uint64_t start, uint64_t size, uint8_t shift, kmutex_t *lp);
+void space_map_close(space_map_t *sm);
-extern void space_map_ref_create(avl_tree_t *t);
-extern void space_map_ref_destroy(avl_tree_t *t);
-extern void space_map_ref_add_seg(avl_tree_t *t,
- uint64_t start, uint64_t end, int64_t refcnt);
-extern void space_map_ref_add_map(avl_tree_t *t,
- space_map_t *sm, int64_t refcnt);
-extern void space_map_ref_generate_map(avl_tree_t *t,
- space_map_t *sm, int64_t minref);
+int64_t space_map_alloc_delta(space_map_t *sm);
#ifdef __cplusplus
}
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/trim_map.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/trim_map.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/trim_map.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -23,7 +24,7 @@
* Use is subject to license terms.
*/
/*
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
*/
#ifndef _SYS_TXG_H
@@ -74,13 +75,9 @@
extern void txg_rele_to_sync(txg_handle_t *txghp);
extern void txg_register_callbacks(txg_handle_t *txghp, list_t *tx_callbacks);
-/*
- * Delay the caller by the specified number of ticks or until
- * the txg closes (whichever comes first). This is intended
- * to be used to throttle writers when the system nears its
- * capacity.
- */
-extern void txg_delay(struct dsl_pool *dp, uint64_t txg, int ticks);
+extern void txg_delay(struct dsl_pool *dp, uint64_t txg, hrtime_t delta,
+ hrtime_t resolution);
+extern void txg_kick(struct dsl_pool *dp);
/*
* Wait until the given transaction group has finished syncing.
@@ -116,6 +113,7 @@
extern void txg_list_create(txg_list_t *tl, size_t offset);
extern void txg_list_destroy(txg_list_t *tl);
extern boolean_t txg_list_empty(txg_list_t *tl, uint64_t txg);
+extern boolean_t txg_all_lists_empty(txg_list_t *tl);
extern boolean_t txg_list_add(txg_list_t *tl, void *p, uint64_t txg);
extern boolean_t txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg);
extern void *txg_list_remove(txg_list_t *tl, uint64_t txg);
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -18,6 +19,7 @@
*
* CDDL HEADER END
*/
+
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
@@ -70,7 +72,7 @@
kmutex_t tc_open_lock; /* protects tx_open_txg */
kmutex_t tc_lock; /* protects the rest of this struct */
kcondvar_t tc_cv[TXG_SIZE];
- uint64_t tc_count[TXG_SIZE];
+ uint64_t tc_count[TXG_SIZE]; /* tx hold count on each txg */
list_t tc_callbacks[TXG_SIZE]; /* commit cb list */
char tc_pad[8]; /* pad to fill 3 cache lines */
};
@@ -87,13 +89,16 @@
* every cpu (see txg_quiesce()).
*/
typedef struct tx_state {
- tx_cpu_t *tx_cpu; /* protects right to enter txg */
- kmutex_t tx_sync_lock; /* protects tx_state_t */
+ tx_cpu_t *tx_cpu; /* protects access to tx_open_txg */
+ kmutex_t tx_sync_lock; /* protects the rest of this struct */
+
uint64_t tx_open_txg; /* currently open txg id */
uint64_t tx_quiesced_txg; /* quiesced txg waiting for sync */
uint64_t tx_syncing_txg; /* currently syncing txg id */
uint64_t tx_synced_txg; /* last synced txg id */
+ hrtime_t tx_open_time; /* start time of tx_open_txg */
+
uint64_t tx_sync_txg_waiting; /* txg we're waiting to sync */
uint64_t tx_quiesce_txg_waiting; /* txg we're waiting to open */
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -22,6 +23,9 @@
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
+/*
+ * Copyright (c) 2014 by Delphix. All rights reserved.
+ */
#ifndef _SYS_UBERBLOCK_H
#define _SYS_UBERBLOCK_H
@@ -36,8 +40,8 @@
typedef struct uberblock uberblock_t;
-extern int uberblock_verify(uberblock_t *ub);
-extern int uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg);
+extern int uberblock_verify(uberblock_t *);
+extern boolean_t uberblock_update(uberblock_t *, vdev_t *, uint64_t);
#ifdef __cplusplus
}
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/uberblock_impl.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -20,6 +21,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2017 by Delphix. All rights reserved.
*/
#ifndef _SYS_UBERBLOCK_IMPL_H
@@ -54,6 +56,12 @@
/* highest SPA_VERSION supported by software that wrote this txg */
uint64_t ub_software_version;
+
+ /* These fields are reserved for features that are under development: */
+ uint64_t ub_mmp_magic;
+ uint64_t ub_mmp_delay;
+ uint64_t ub_mmp_seq;
+ uint64_t ub_checkpoint_txg;
};
#ifdef __cplusplus
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/unique.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/unique.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/unique.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -21,7 +22,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
*/
#ifndef _SYS_VDEV_H
@@ -61,6 +62,7 @@
extern boolean_t vdev_is_bootable(vdev_t *vd);
extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev);
extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid);
+extern int vdev_count_leaves(spa_t *spa);
extern void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t d,
uint64_t txg, uint64_t size);
extern boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t d,
@@ -78,6 +80,7 @@
extern int vdev_metaslab_init(vdev_t *vd, uint64_t txg);
extern void vdev_metaslab_fini(vdev_t *vd);
extern void vdev_metaslab_set_size(vdev_t *);
+extern void vdev_ashift_optimize(vdev_t *);
extern void vdev_expand(vdev_t *vd, uint64_t txg);
extern void vdev_split(vdev_t *vd);
extern void vdev_deadman(vdev_t *vd);
@@ -111,7 +114,7 @@
extern void vdev_cache_init(vdev_t *vd);
extern void vdev_cache_fini(vdev_t *vd);
-extern int vdev_cache_read(zio_t *zio);
+extern boolean_t vdev_cache_read(zio_t *zio);
extern void vdev_cache_write(zio_t *zio);
extern void vdev_cache_purge(vdev_t *vd);
@@ -119,11 +122,13 @@
extern void vdev_queue_fini(vdev_t *vd);
extern zio_t *vdev_queue_io(zio_t *zio);
extern void vdev_queue_io_done(zio_t *zio);
+extern int vdev_queue_length(vdev_t *vd);
+extern uint64_t vdev_queue_lastoffset(vdev_t *vd);
+extern void vdev_queue_register_lastoffset(vdev_t *vd, zio_t *zio);
extern void vdev_config_dirty(vdev_t *vd);
extern void vdev_config_clean(vdev_t *vd);
-extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg,
- boolean_t);
+extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg);
extern void vdev_state_dirty(vdev_t *vd);
extern void vdev_state_clean(vdev_t *vd);
@@ -158,6 +163,8 @@
extern int vdev_label_init(vdev_t *vd, uint64_t txg, vdev_labeltype_t reason);
+extern int vdev_label_write_pad2(vdev_t *vd, const char *buf, size_t size);
+
#ifdef __cplusplus
}
#endif
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_disk.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -21,13 +22,13 @@
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright (c) 2013 Joyent, Inc. All rights reserved.
+ * Copyright 2012 Nexenta Systems, Inc. All rights reserved.
*/
#ifndef _SYS_VDEV_DISK_H
#define _SYS_VDEV_DISK_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/vdev.h>
#ifdef _KERNEL
#include <sys/buf.h>
@@ -40,14 +41,25 @@
extern "C" {
#endif
+#ifdef _KERNEL
typedef struct vdev_disk {
ddi_devid_t vd_devid;
char *vd_minor;
ldi_handle_t vd_lh;
+ list_t vd_ldi_cbs;
+ boolean_t vd_ldi_offline;
} vdev_disk_t;
+#endif
+extern int vdev_disk_physio(vdev_t *,
+ caddr_t, size_t, uint64_t, int, boolean_t);
+
+/*
+ * Since vdev_disk.c is not compiled into libzpool, this function should only be
+ * defined in the zfs kernel module.
+ */
#ifdef _KERNEL
-extern int vdev_disk_physio(ldi_handle_t, caddr_t, size_t, uint64_t, int);
+extern int vdev_disk_ldi_physio(ldi_handle_t, caddr_t, size_t, uint64_t, int);
#endif
#ifdef __cplusplus
}
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_file.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_file.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_file.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -39,6 +40,9 @@
vnode_t *vf_vnode;
} vdev_file_t;
+extern void vdev_file_init(void);
+extern void vdev_file_fini(void);
+
#ifdef __cplusplus
}
#endif
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -20,7 +21,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
*/
#ifndef _SYS_VDEV_IMPL_H
@@ -53,14 +54,17 @@
typedef struct vdev_cache vdev_cache_t;
typedef struct vdev_cache_entry vdev_cache_entry_t;
+extern int zfs_vdev_queue_depth_pct;
+extern uint32_t zfs_vdev_async_write_max_active;
+
/*
* Virtual device operations
*/
typedef int vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *max_size,
- uint64_t *ashift);
+ uint64_t *logical_ashift, uint64_t *physical_ashift);
typedef void vdev_close_func_t(vdev_t *vd);
typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize);
-typedef int vdev_io_start_func_t(zio_t *zio);
+typedef void vdev_io_start_func_t(zio_t *zio);
typedef void vdev_io_done_func_t(zio_t *zio);
typedef void vdev_state_change_func_t(vdev_t *vd, int, int);
typedef void vdev_hold_func_t(vdev_t *vd);
@@ -99,13 +103,26 @@
kmutex_t vc_lock;
};
+typedef struct vdev_queue_class {
+ uint32_t vqc_active;
+
+ /*
+	 * Sorted by offset or timestamp, depending on whether the queue is
+	 * LBA-ordered vs. FIFO.
+ */
+ avl_tree_t vqc_queued_tree;
+} vdev_queue_class_t;
+
struct vdev_queue {
- avl_tree_t vq_deadline_tree;
- avl_tree_t vq_read_tree;
- avl_tree_t vq_write_tree;
- avl_tree_t vq_pending_tree;
- hrtime_t vq_io_complete_ts;
+ vdev_t *vq_vdev;
+ vdev_queue_class_t vq_class[ZIO_PRIORITY_NUM_QUEUEABLE];
+ avl_tree_t vq_active_tree;
+ avl_tree_t vq_read_offset_tree;
+ avl_tree_t vq_write_offset_tree;
+ uint64_t vq_last_offset;
+ hrtime_t vq_io_complete_ts; /* time last i/o completed */
kmutex_t vq_lock;
+ uint64_t vq_lastoffset;
};
/*
@@ -123,6 +140,24 @@
uint64_t vdev_min_asize; /* min acceptable asize */
uint64_t vdev_max_asize; /* max acceptable asize */
uint64_t vdev_ashift; /* block alignment shift */
+ /*
+ * Logical block alignment shift
+ *
+ * The smallest sized/aligned I/O supported by the device.
+ */
+ uint64_t vdev_logical_ashift;
+ /*
+ * Physical block alignment shift
+ *
+ * The device supports logical I/Os with vdev_logical_ashift
+ * size/alignment, but optimum performance will be achieved by
+ * aligning/sizing requests to vdev_physical_ashift. Smaller
+ * requests may be inflated or incur device level read-modify-write
+ * operations.
+ *
+ * May be 0 to indicate no preference (i.e. use vdev_logical_ashift).
+ */
+ uint64_t vdev_physical_ashift;
uint64_t vdev_state; /* see VDEV_STATE_* #defines */
uint64_t vdev_prevstate; /* used when reopening a vdev */
vdev_ops_t *vdev_ops; /* vdev operations */
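The logical/physical ashift split added above boils down to: the device accepts any I/O sized and aligned to 1 << vdev_logical_ashift, but performs best at 1 << vdev_physical_ashift. A hedged sketch of rounding a request size up to the preferred alignment (vdev_pref_roundup_sketch is illustrative, not allocator code; it uses only the two fields introduced in this hunk):

/*
 * Illustrative only: round a size up to the vdev's preferred (physical)
 * alignment when one is advertised, otherwise fall back to the logical
 * alignment.
 */
static uint64_t
vdev_pref_roundup_sketch(const vdev_t *vd, uint64_t size)
{
	uint64_t ashift = vd->vdev_physical_ashift != 0 ?
	    vd->vdev_physical_ashift : vd->vdev_logical_ashift;
	uint64_t align = 1ULL << ashift;

	return ((size + align - 1) & ~(align - 1));
}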
@@ -134,7 +169,6 @@
vdev_t *vdev_parent; /* parent vdev */
vdev_t **vdev_child; /* array of children */
uint64_t vdev_children; /* number of children */
- space_map_t vdev_dtl[DTL_TYPES]; /* in-core dirty time logs */
vdev_stat_t vdev_stat; /* virtual device statistics */
boolean_t vdev_expanding; /* expand the vdev? */
boolean_t vdev_reopening; /* reopen in progress? */
@@ -155,25 +189,38 @@
txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */
boolean_t vdev_remove_wanted; /* async remove wanted? */
boolean_t vdev_probe_wanted; /* async probe wanted? */
- uint64_t vdev_removing; /* device is being removed? */
list_node_t vdev_config_dirty_node; /* config dirty list */
list_node_t vdev_state_dirty_node; /* state dirty list */
uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */
uint64_t vdev_islog; /* is an intent log device */
- uint64_t vdev_ishole; /* is a hole in the namespace */
+ uint64_t vdev_removing; /* device is being removed? */
+ boolean_t vdev_ishole; /* is a hole in the namespace */
+ kmutex_t vdev_queue_lock; /* protects vdev_queue_depth */
/*
+ * The queue depth parameters determine how many async writes are
+ * still pending (i.e. allocated by net yet issued to disk) per
+ * top-level (vdev_async_write_queue_depth) and the maximum allowed
+ * (vdev_max_async_write_queue_depth). These values only apply to
+ * top-level vdevs.
+ */
+ uint64_t vdev_async_write_queue_depth;
+ uint64_t vdev_max_async_write_queue_depth;
+
+ /*
* Leaf vdev state.
*/
+ range_tree_t *vdev_dtl[DTL_TYPES]; /* dirty time logs */
+ space_map_t *vdev_dtl_sm; /* dirty time log space map */
+ txg_node_t vdev_dtl_node; /* per-txg dirty DTL linkage */
+ uint64_t vdev_dtl_object; /* DTL object */
uint64_t vdev_psize; /* physical device capacity */
- space_map_obj_t vdev_dtl_smo; /* dirty time log space map obj */
- txg_node_t vdev_dtl_node; /* per-txg dirty DTL linkage */
uint64_t vdev_wholedisk; /* true if this is a whole disk */
uint64_t vdev_offline; /* persistent offline state */
uint64_t vdev_faulted; /* persistent faulted state */
uint64_t vdev_degraded; /* persistent degraded state */
uint64_t vdev_removed; /* persistent removed state */
- uint64_t vdev_resilvering; /* persistent resilvering state */
+ uint64_t vdev_resilver_txg; /* persistent resilvering state */
uint64_t vdev_nparity; /* number of parity devices for raidz */
char *vdev_path; /* vdev path (if any) */
char *vdev_devid; /* vdev devid (if any) */
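Roughly, vdev_async_write_queue_depth counts async writes that have been allocated to this top-level vdev but not yet issued, and vdev_max_async_write_queue_depth is the ceiling the write throttle works against. A minimal sketch of the kind of check that ceiling enables, taken under vdev_queue_lock as documented above (vdev_async_write_has_room_sketch is hypothetical, not the actual throttle code):

/*
 * Sketch only: does this top-level vdev still have room for more
 * queued async writes?
 */
static boolean_t
vdev_async_write_has_room_sketch(vdev_t *tvd)
{
	boolean_t room;

	mutex_enter(&tvd->vdev_queue_lock);	/* protects vdev_queue_depth */
	room = tvd->vdev_async_write_queue_depth <
	    tvd->vdev_max_async_write_queue_depth;
	mutex_exit(&tvd->vdev_queue_lock);

	return (room);
}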
@@ -181,7 +228,6 @@
char *vdev_fru; /* physical FRU location */
uint64_t vdev_not_present; /* not present during import */
uint64_t vdev_unspare; /* unspare when resilvering done */
- hrtime_t vdev_last_try; /* last reopen time */
boolean_t vdev_nowritecache; /* true if flushwritecache failed */
boolean_t vdev_notrim; /* true if trim failed */
boolean_t vdev_checkremove; /* temporary online test */
@@ -188,18 +234,21 @@
boolean_t vdev_forcefault; /* force online fault */
boolean_t vdev_splitting; /* split or repair in progress */
boolean_t vdev_delayed_close; /* delayed device close? */
- uint8_t vdev_tmpoffline; /* device taken offline temporarily? */
- uint8_t vdev_detached; /* device detached? */
- uint8_t vdev_cant_read; /* vdev is failing all reads */
- uint8_t vdev_cant_write; /* vdev is failing all writes */
- uint64_t vdev_isspare; /* was a hot spare */
- uint64_t vdev_isl2cache; /* was a l2cache device */
+ boolean_t vdev_tmpoffline; /* device taken offline temporarily? */
+ boolean_t vdev_detached; /* device detached? */
+ boolean_t vdev_cant_read; /* vdev is failing all reads */
+ boolean_t vdev_cant_write; /* vdev is failing all writes */
+ boolean_t vdev_isspare; /* was a hot spare */
+ boolean_t vdev_isl2cache; /* was a l2cache device */
vdev_queue_t vdev_queue; /* I/O deadline schedule queue */
vdev_cache_t vdev_cache; /* physical block cache */
- spa_aux_vdev_t *vdev_aux; /* for l2cache vdevs */
+ spa_aux_vdev_t *vdev_aux; /* for l2cache and spares vdevs */
zio_t *vdev_probe_zio; /* root of current probe */
vdev_aux_t vdev_label_aux; /* on-disk aux state */
- struct trim_map *vdev_trimmap;
+ struct trim_map *vdev_trimmap; /* map on outstanding trims */
+ uint16_t vdev_rotation_rate; /* rotational rate of the media */
+#define VDEV_RATE_UNKNOWN 0
+#define VDEV_RATE_NON_ROTATING 1
/*
* For DTrace to work in userland (libzpool) context, these fields must
@@ -221,8 +270,11 @@
#define VDEV_PHYS_SIZE (112 << 10)
#define VDEV_UBERBLOCK_RING (128 << 10)
+/* The largest uberblock we support is 8k. */
+#define MAX_UBERBLOCK_SHIFT (13)
#define VDEV_UBERBLOCK_SHIFT(vd) \
- MAX((vd)->vdev_top->vdev_ashift, UBERBLOCK_SHIFT)
+ MIN(MAX((vd)->vdev_top->vdev_ashift, UBERBLOCK_SHIFT), \
+ MAX_UBERBLOCK_SHIFT)
#define VDEV_UBERBLOCK_COUNT(vd) \
(VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT(vd))
#define VDEV_UBERBLOCK_OFFSET(vd, n) \
@@ -295,9 +347,11 @@
extern void vdev_load_log_state(vdev_t *nvd, vdev_t *ovd);
extern boolean_t vdev_log_state_valid(vdev_t *vd);
extern void vdev_load(vdev_t *vd);
+extern int vdev_dtl_load(vdev_t *vd);
extern void vdev_sync(vdev_t *vd, uint64_t txg);
extern void vdev_sync_done(vdev_t *vd, uint64_t txg);
extern void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg);
+extern void vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg);
/*
* Available vdev types.
@@ -328,7 +382,18 @@
*/
/* zdb uses this tunable, so it must be declared here to make lint happy. */
extern int zfs_vdev_cache_size;
+extern uint_t zfs_geom_probe_vdev_key;
+#ifdef illumos
+/*
+ * The vdev_buf_t is used to translate between zio_t and buf_t, and back again.
+ */
+typedef struct vdev_buf {
+ buf_t vb_buf; /* buffer that describes the io */
+ zio_t *vb_io; /* pointer back to the original zio_t */
+} vdev_buf_t;
+#endif
+
#ifdef __cplusplus
}
#endif
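To see what the new MAX_UBERBLOCK_SHIFT clamp buys, assume UBERBLOCK_SHIFT is 10 (1 KB slots, as in uberblock_impl.h) and recall that VDEV_UBERBLOCK_RING is 128 KB: a 4 KB-sector top-level vdev (ashift 12) gets VDEV_UBERBLOCK_SHIFT = MIN(MAX(12, 10), 13) = 12, i.e. 4 KB slots and 128 KB >> 12 = 32 uberblock copies per label, while a hypothetical 16 KB-sector vdev (ashift 14) is clamped to 13, keeping 8 KB slots and 16 copies where the unclamped formula would have left only 8.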
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -20,7 +21,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
*/
#ifndef _SYS_ZAP_H
@@ -80,6 +81,7 @@
*/
#include <sys/dmu.h>
+#include <sys/refcount.h>
#ifdef __cplusplus
extern "C" {
@@ -141,6 +143,12 @@
uint64_t parent_obj, const char *name, dmu_tx_t *tx);
/*
+ * Initialize an already-allocated object.
+ */
+void mzap_create_impl(objset_t *os, uint64_t obj, int normflags,
+ zap_flags_t flags, dmu_tx_t *tx);
+
+/*
* Create a new zapobj with no attributes from the given (unallocated)
* object number.
*/
@@ -209,9 +217,15 @@
int zap_contains(objset_t *ds, uint64_t zapobj, const char *name);
int zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
int key_numints);
+int zap_lookup_by_dnode(dnode_t *dn, const char *name,
+ uint64_t integer_size, uint64_t num_integers, void *buf);
+int zap_lookup_norm_by_dnode(dnode_t *dn, const char *name,
+ uint64_t integer_size, uint64_t num_integers, void *buf,
+ matchtype_t mt, char *realname, int rn_len,
+ boolean_t *ncp);
-int zap_count_write(objset_t *os, uint64_t zapobj, const char *name,
- int add, uint64_t *towrite, uint64_t *tooverwrite);
+int zap_count_write_by_dnode(dnode_t *dn, const char *name,
+ int add, refcount_t *towrite, refcount_t *tooverwrite);
/*
* Create an attribute with the given name and value.
@@ -337,7 +351,7 @@
boolean_t za_normalization_conflict;
uint64_t za_num_integers;
uint64_t za_first_integer; /* no sign extension for <8byte ints */
- char za_name[MAXNAMELEN];
+ char za_name[ZAP_MAXNAMELEN];
} zap_attribute_t;
/*
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -20,6 +21,9 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
*/
#ifndef _SYS_ZAP_IMPL_H
@@ -41,8 +45,7 @@
#define MZAP_ENT_LEN 64
#define MZAP_NAME_LEN (MZAP_ENT_LEN - 8 - 4 - 2)
-#define MZAP_MAX_BLKSHIFT SPA_MAXBLOCKSHIFT
-#define MZAP_MAX_BLKSZ (1 << MZAP_MAX_BLKSHIFT)
+#define MZAP_MAX_BLKSZ SPA_OLD_MAXBLOCKSIZE
#define ZAP_NEED_CD (-1U)
@@ -70,7 +73,7 @@
} mzap_ent_t;
#define MZE_PHYS(zap, mze) \
- (&(zap)->zap_m.zap_phys->mz_chunk[(mze)->mze_chunkid])
+ (&zap_m_phys(zap)->mz_chunk[(mze)->mze_chunkid])
/*
* The (fat) zap is stored in one object. It is an array of
@@ -104,7 +107,7 @@
* word number (1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)).
*/
#define ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) \
- ((uint64_t *)(zap)->zap_f.zap_phys) \
+ ((uint64_t *)zap_f_phys(zap)) \
[(idx) + (1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap))]
/*
@@ -140,6 +143,7 @@
typedef struct zap_table_phys zap_table_phys_t;
typedef struct zap {
+ dmu_buf_user_t zap_dbu;
objset_t *zap_objset;
uint64_t zap_object;
struct dmu_buf *zap_dbuf;
@@ -149,8 +153,6 @@
uint64_t zap_salt;
union {
struct {
- zap_phys_t *zap_phys;
-
/*
* zap_num_entries_mtx protects
* zap_num_entries
@@ -159,7 +161,6 @@
int zap_block_shift;
} zap_fat;
struct {
- mzap_phys_t *zap_phys;
int16_t zap_num_entries;
int16_t zap_num_chunks;
int16_t zap_alloc_next;
@@ -168,6 +169,18 @@
} zap_u;
} zap_t;
+inline zap_phys_t *
+zap_f_phys(zap_t *zap)
+{
+ return (zap->zap_dbuf->db_data);
+}
+
+inline mzap_phys_t *
+zap_m_phys(zap_t *zap)
+{
+ return (zap->zap_dbuf->db_data);
+}
+
typedef struct zap_name {
zap_t *zn_zap;
int zn_key_intlen;
@@ -185,9 +198,9 @@
boolean_t zap_match(zap_name_t *zn, const char *matchname);
int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
- krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp);
-void zap_unlockdir(zap_t *zap);
-void zap_evict(dmu_buf_t *db, void *vmzap);
+ krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp);
+void zap_unlockdir(zap_t *zap, void *tag);
+void zap_evict(void *dbu);
zap_name_t *zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt);
void zap_name_free(zap_name_t *zn);
int zap_hashbits(zap_t *zap);
@@ -202,12 +215,13 @@
uint64_t integer_size, uint64_t num_integers, void *buf,
char *realname, int rn_len, boolean_t *normalization_conflictp);
void fzap_prefetch(zap_name_t *zn);
-int fzap_count_write(zap_name_t *zn, int add, uint64_t *towrite,
- uint64_t *tooverwrite);
+int fzap_count_write(zap_name_t *zn, int add, refcount_t *towrite,
+ refcount_t *tooverwrite);
int fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers,
- const void *val, dmu_tx_t *tx);
+ const void *val, void *tag, dmu_tx_t *tx);
int fzap_update(zap_name_t *zn,
- int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx);
+ int integer_size, uint64_t num_integers, const void *val,
+ void *tag, dmu_tx_t *tx);
int fzap_length(zap_name_t *zn,
uint64_t *integer_size, uint64_t *num_integers);
int fzap_remove(zap_name_t *zn, dmu_tx_t *tx);
@@ -217,7 +231,7 @@
int fzap_add_cd(zap_name_t *zn,
uint64_t integer_size, uint64_t num_integers,
- const void *val, uint32_t cd, dmu_tx_t *tx);
+ const void *val, uint32_t cd, void *tag, dmu_tx_t *tx);
void fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags);
int fzap_cursor_move_to_key(zap_cursor_t *zc, zap_name_t *zn);
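The zap_dbu field and the new zap_evict(void *dbu) signature above follow the DMU's dbuf-user pattern: instead of caching zap_phys pointers, the zap_t registers itself as the user of its dbuf, zap_f_phys()/zap_m_phys() read db_data on demand, and on eviction the DMU hands back the embedded dmu_buf_user_t, from which the containing zap_t is recovered. A sketch of that last step (zap_from_dbu_sketch is illustrative; the registration itself goes through the dmu_buf_set_user() family, which is not shown in this header):

/*
 * Sketch: recover the zap_t from the dmu_buf_user_t pointer passed to
 * the eviction callback.  zap_dbu is the first member of zap_t (see the
 * struct above), so a cast is sufficient.
 */
static zap_t *
zap_from_dbu_sketch(void *dbu)
{
	return ((zap_t *)dbu);
}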
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -20,6 +21,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/
#ifndef _SYS_ZAP_LEAF_H
@@ -83,7 +85,7 @@
*/
#define ZAP_LEAF_CHUNK(l, idx) \
((zap_leaf_chunk_t *) \
- ((l)->l_phys->l_hash + ZAP_LEAF_HASH_NUMENTRIES(l)))[idx]
+ (zap_leaf_phys(l)->l_hash + ZAP_LEAF_HASH_NUMENTRIES(l)))[idx]
#define ZAP_LEAF_ENTRY(l, idx) (&ZAP_LEAF_CHUNK(l, idx).l_entry)
typedef enum zap_chunk_type {
@@ -152,13 +154,18 @@
} zap_leaf_chunk_t;
typedef struct zap_leaf {
+ dmu_buf_user_t l_dbu;
krwlock_t l_rwlock;
uint64_t l_blkid; /* 1<<ZAP_BLOCK_SHIFT byte block off */
int l_bs; /* block size shift */
dmu_buf_t *l_dbuf;
- zap_leaf_phys_t *l_phys;
} zap_leaf_t;
+inline zap_leaf_phys_t *
+zap_leaf_phys(zap_leaf_t *l)
+{
+ return (l->l_dbuf->db_data);
+}
typedef struct zap_entry_handle {
/* Set by zap_leaf and public to ZAP */
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfeature.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfeature.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfeature.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -20,7 +21,7 @@
*/
/*
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
*/
#ifndef _SYS_ZFEATURE_H
@@ -27,6 +28,7 @@
#define _SYS_ZFEATURE_H
#include <sys/nvpair.h>
+#include <sys/txg.h>
#include "zfeature_common.h"
#ifdef __cplusplus
@@ -33,21 +35,38 @@
extern "C" {
#endif
+#define VALID_FEATURE_FID(fid) ((fid) >= 0 && (fid) < SPA_FEATURES)
+#define VALID_FEATURE_OR_NONE(fid) ((fid) == SPA_FEATURE_NONE || \
+ VALID_FEATURE_FID(fid))
+
struct spa;
struct dmu_tx;
struct objset;
-extern boolean_t feature_is_supported(struct objset *os, uint64_t obj,
- uint64_t desc_obj, nvlist_t *unsup_feat, nvlist_t *enabled_feat);
-
extern void spa_feature_create_zap_objects(struct spa *, struct dmu_tx *);
-extern void spa_feature_enable(struct spa *, zfeature_info_t *,
+extern void spa_feature_enable(struct spa *, spa_feature_t,
struct dmu_tx *);
-extern void spa_feature_incr(struct spa *, zfeature_info_t *, struct dmu_tx *);
-extern void spa_feature_decr(struct spa *, zfeature_info_t *, struct dmu_tx *);
-extern boolean_t spa_feature_is_enabled(struct spa *, zfeature_info_t *);
-extern boolean_t spa_feature_is_active(struct spa *, zfeature_info_t *);
+extern void spa_feature_incr(struct spa *, spa_feature_t, struct dmu_tx *);
+extern void spa_feature_decr(struct spa *, spa_feature_t, struct dmu_tx *);
+extern boolean_t spa_feature_is_enabled(struct spa *, spa_feature_t);
+extern boolean_t spa_feature_is_active(struct spa *, spa_feature_t);
+extern boolean_t spa_feature_enabled_txg(spa_t *spa, spa_feature_t fid,
+ uint64_t *txg);
+extern uint64_t spa_feature_refcount(spa_t *, spa_feature_t, uint64_t);
+extern boolean_t spa_features_check(spa_t *, boolean_t, nvlist_t *, nvlist_t *);
+/*
+ * These functions are only exported for zhack and zdb; normal callers should
+ * use the above interfaces.
+ */
+extern int feature_get_refcount(struct spa *, zfeature_info_t *, uint64_t *);
+extern int feature_get_refcount_from_disk(spa_t *spa, zfeature_info_t *feature,
+ uint64_t *res);
+extern void feature_enable_sync(struct spa *, zfeature_info_t *,
+ struct dmu_tx *);
+extern void feature_sync(struct spa *, zfeature_info_t *, uint64_t,
+ struct dmu_tx *);
+
#ifdef __cplusplus
}
#endif
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_acl.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_context.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -24,7 +25,7 @@
*/
/*
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
*/
#ifndef _SYS_ZFS_CONTEXT_H
@@ -94,16 +95,14 @@
#include <sys/sunddi.h>
#ifdef illumos
#include <sys/cyclic.h>
-#else /* FreeBSD */
-#include <sys/callout.h>
#endif
-
+#include <sys/callo.h>
+#include <sys/disp.h>
#include <machine/stdarg.h>
#include <vm/vm.h>
#include <vm/vm_page.h>
#include <vm/vm_object.h>
-#include <vm/vm_pager.h>
#include <vm/vm_kern.h>
#include <vm/vm_map.h>
/* There is clash. vm_map.h defines the two below and vdev_cache.c use them. */
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ctldir.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -44,7 +45,7 @@
void zfsctl_create(zfsvfs_t *);
void zfsctl_destroy(zfsvfs_t *);
-vnode_t *zfsctl_root(znode_t *);
+int zfsctl_root(zfsvfs_t *, int, vnode_t **);
void zfsctl_init(void);
void zfsctl_fini(void);
boolean_t zfsctl_is_node(vnode_t *);
@@ -53,15 +54,10 @@
int zfsctl_destroy_snapshot(const char *snapname, int force);
int zfsctl_umount_snapshots(vfs_t *, int, cred_t *);
-int zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
- int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
- int *direntflags, pathname_t *realpnp);
-
int zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp);
#define ZFSCTL_INO_ROOT 0x1
#define ZFSCTL_INO_SNAPDIR 0x2
-#define ZFSCTL_INO_SHARES 0x3
#ifdef __cplusplus
}
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -20,7 +21,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
*/
#ifndef _SYS_ZFS_DEBUG_H
@@ -49,14 +50,17 @@
#endif
extern int zfs_flags;
+extern boolean_t zfs_recover;
+extern boolean_t zfs_free_leak_on_eio;
-#define ZFS_DEBUG_DPRINTF (1<<0)
-#define ZFS_DEBUG_DBUF_VERIFY (1<<1)
-#define ZFS_DEBUG_DNODE_VERIFY (1<<2)
-#define ZFS_DEBUG_SNAPNAMES (1<<3)
-#define ZFS_DEBUG_MODIFY (1<<4)
-#define ZFS_DEBUG_SPA (1<<5)
-#define ZFS_DEBUG_ZIO_FREE (1<<6)
+#define ZFS_DEBUG_DPRINTF (1<<0)
+#define ZFS_DEBUG_DBUF_VERIFY (1<<1)
+#define ZFS_DEBUG_DNODE_VERIFY (1<<2)
+#define ZFS_DEBUG_SNAPNAMES (1<<3)
+#define ZFS_DEBUG_MODIFY (1<<4)
+#define ZFS_DEBUG_SPA (1<<5)
+#define ZFS_DEBUG_ZIO_FREE (1<<6)
+#define ZFS_DEBUG_HISTOGRAM_VERIFY (1<<7)
#ifdef ZFS_DEBUG
extern void __dprintf(const char *file, const char *func,
@@ -79,6 +83,7 @@
extern void zfs_dbgmsg_init(void);
extern void zfs_dbgmsg_fini(void);
extern void zfs_dbgmsg(const char *fmt, ...);
+extern void zfs_dbgmsg_print(const char *tag);
#ifdef illumos
#ifndef _KERNEL
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_dir.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -48,18 +49,18 @@
#define IS_ROOT_NODE 0x01 /* create a root node */
#define IS_XATTR 0x02 /* create an extended attribute node */
-extern int zfs_dirent_lock(zfs_dirlock_t **, znode_t *, char *, znode_t **,
- int, int *, pathname_t *);
-extern void zfs_dirent_unlock(zfs_dirlock_t *);
-extern int zfs_link_create(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int);
-extern int zfs_link_destroy(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int,
+extern int zfs_dirent_lookup(znode_t *, const char *, znode_t **, int);
+extern int zfs_link_create(znode_t *, const char *, znode_t *, dmu_tx_t *, int);
+extern int zfs_link_destroy(znode_t *, const char *, znode_t *, dmu_tx_t *, int,
boolean_t *);
-extern int zfs_dirlook(znode_t *, char *, vnode_t **, int, int *,
- pathname_t *);
+#if 0
+extern int zfs_dirlook(vnode_t *, const char *, vnode_t **, int);
+#else
+extern int zfs_dirlook(znode_t *, const char *name, znode_t **);
+#endif
extern void zfs_mknode(znode_t *, vattr_t *, dmu_tx_t *, cred_t *,
uint_t, znode_t **, zfs_acl_ids_t *);
extern void zfs_rmnode(znode_t *);
-extern void zfs_dl_name_switch(zfs_dirlock_t *dl, char *new, char **old);
extern boolean_t zfs_dirempty(znode_t *);
extern void zfs_unlinked_add(znode_t *, dmu_tx_t *);
extern void zfs_unlinked_drain(zfsvfs_t *zfsvfs);
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_fuid.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_fuid.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_fuid.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_ioctl.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -20,7 +21,10 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2011-2012 Pawel Jakub Dawidek. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright 2016 RackTop Systems.
+ * Copyright (c) 2014 Integros [integros.com]
*/
#ifndef _SYS_ZFS_IOCTL_H
@@ -79,19 +83,32 @@
* Feature flags for zfs send streams (flags in drr_versioninfo)
*/
-#define DMU_BACKUP_FEATURE_DEDUP (0x1)
-#define DMU_BACKUP_FEATURE_DEDUPPROPS (0x2)
-#define DMU_BACKUP_FEATURE_SA_SPILL (0x4)
+#define DMU_BACKUP_FEATURE_DEDUP (1 << 0)
+#define DMU_BACKUP_FEATURE_DEDUPPROPS (1 << 1)
+#define DMU_BACKUP_FEATURE_SA_SPILL (1 << 2)
+/* flags #3 - #15 are reserved for incompatible closed-source implementations */
+#define DMU_BACKUP_FEATURE_EMBED_DATA (1 << 16)
+#define DMU_BACKUP_FEATURE_EMBED_DATA_LZ4 (1 << 17)
+/* flag #18 is reserved for a Delphix feature */
+#define DMU_BACKUP_FEATURE_LARGE_BLOCKS (1 << 19)
+#define DMU_BACKUP_FEATURE_RESUMING (1 << 20)
/*
* Mask of all supported backup features
*/
#define DMU_BACKUP_FEATURE_MASK (DMU_BACKUP_FEATURE_DEDUP | \
- DMU_BACKUP_FEATURE_DEDUPPROPS | DMU_BACKUP_FEATURE_SA_SPILL)
+ DMU_BACKUP_FEATURE_DEDUPPROPS | DMU_BACKUP_FEATURE_SA_SPILL | \
+ DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_EMBED_DATA_LZ4 | \
+ DMU_BACKUP_FEATURE_RESUMING | \
+ DMU_BACKUP_FEATURE_LARGE_BLOCKS)
/* Are all features in the given flag word currently supported? */
#define DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK))
+typedef enum dmu_send_resume_token_version {
+ ZFS_SEND_RESUME_TOKEN_VERSION = 1
+} dmu_send_resume_token_version_t;
+
/*
* The drr_versioninfo field of the dmu_replay_record has the
* following layout:
@@ -111,8 +128,22 @@
#define DMU_BACKUP_MAGIC 0x2F5bacbacULL
+/*
+ * Send stream flags. Bits 24-31 are reserved for vendor-specific
+ * implementations and should not be used.
+ */
#define DRR_FLAG_CLONE (1<<0)
#define DRR_FLAG_CI_DATA (1<<1)
+/*
+ * This send stream, if it is a full send, includes the FREE and FREEOBJECT
+ * records that are created by the sending process. This means that the send
+ * stream can be received as a clone, even though it is not an incremental.
+ * This is not implemented as a feature flag, because the receiving side does
+ * not need to have implemented it to receive this stream; it is fully backwards
+ * compatible. We need a flag, though, because full send streams without it
+ * cannot necessarily be received as a clone correctly.
+ */
+#define DRR_FLAG_FREERECORDS (1<<2)
/*
* flags in the drr_checksumflags field in the DRR_WRITE and
@@ -210,7 +241,7 @@
enum {
DRR_BEGIN, DRR_OBJECT, DRR_FREEOBJECTS,
DRR_WRITE, DRR_FREE, DRR_END, DRR_WRITE_BYREF,
- DRR_SPILL, DRR_NUMTYPES
+ DRR_SPILL, DRR_WRITE_EMBEDDED, DRR_NUMTYPES
} drr_type;
uint32_t drr_payloadlen;
union {
@@ -222,6 +253,35 @@
struct drr_free drr_free;
struct drr_write_byref drr_write_byref;
struct drr_spill drr_spill;
+ struct drr_write_embedded {
+ uint64_t drr_object;
+ uint64_t drr_offset;
+ /* logical length, should equal blocksize */
+ uint64_t drr_length;
+ uint64_t drr_toguid;
+ uint8_t drr_compression;
+ uint8_t drr_etype;
+ uint8_t drr_pad[6];
+ uint32_t drr_lsize; /* uncompressed size of payload */
+ uint32_t drr_psize; /* compr. (real) size of payload */
+ /* (possibly compressed) content follows */
+ } drr_write_embedded;
+
+ /*
+ * Note: drr_checksum is overlaid with all record types
+ * except DRR_BEGIN. Therefore its (non-pad) members
+ * must not overlap with members from the other structs.
+ * We accomplish this by putting its members at the very
+ * end of the struct.
+ */
+ struct drr_checksum {
+ uint64_t drr_pad[34];
+ /*
+ * fletcher-4 checksum of everything preceding the
+ * checksum.
+ */
+ zio_cksum_t drr_checksum;
+ } drr_checksum;
} drr_u;
} dmu_replay_record_t;
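The drr_checksum note above relies on drr_pad[34] pushing the trailing checksum words past the end of every other union member, so overlaying it on any non-BEGIN record cannot clobber real fields. A hedged compile-time check of that rule for the new record type, assuming the kernel's usual CTASSERT macro (the header itself carries no such assertion):

/*
 * Sketch: the embedded-write body must end before the overlaid
 * checksum words begin.  Illustrative only.
 */
CTASSERT(sizeof (struct drr_write_embedded) <=
    offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum));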
@@ -256,6 +316,7 @@
uint32_t zi_iotype;
int32_t zi_duration;
uint64_t zi_timer;
+ uint64_t zi_nlanes;
uint32_t zi_cmd;
uint32_t zi_pad;
} zinject_record_t;
@@ -293,6 +354,12 @@
ZFS_CASE_MIXED
} zfs_case_t;
+/*
+ * Note: this struct must have the same layout in 32-bit and 64-bit, so
+ * that 32-bit processes (like /sbin/zfs) can pass it to the 64-bit
+ * kernel. Therefore, we add padding to it so that no "hidden" padding
+ * is automatically added on 64-bit (but not on 32-bit).
+ */
typedef struct zfs_cmd {
char zc_name[MAXPATHLEN]; /* name of pool or dataset */
uint64_t zc_nvlist_src; /* really (char *) */
@@ -322,14 +389,16 @@
zfs_share_t zc_share;
uint64_t zc_jailid;
dmu_objset_stats_t zc_objset_stats;
- struct drr_begin zc_begin_record;
+ dmu_replay_record_t zc_begin_record;
zinject_record_t zc_inject_record;
- boolean_t zc_defer_destroy;
- boolean_t zc_temphold;
+ uint32_t zc_defer_destroy;
+ uint32_t zc_flags;
uint64_t zc_action_handle;
int zc_cleanup_fd;
uint8_t zc_simple;
- uint8_t zc_pad[3]; /* alignment */
+ uint8_t zc_pad3[3];
+ boolean_t zc_resumable;
+ uint32_t zc_pad4;
uint64_t zc_sendobj;
uint64_t zc_fromobj;
uint64_t zc_createtxg;
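The layout note above is the reason this hunk spells out zc_pad3[3] and zc_pad4: any padding the 64-bit compiler would insert silently must be made explicit so a 32-bit /sbin/zfs and the 64-bit kernel agree on every offset. A hypothetical illustration of the rule (these structs are not from the header):

/*
 * On LP64, "bad" gains 4 hidden bytes before b_value and is 16 bytes,
 * while a 32-bit build places b_value at offset 4 for a 12-byte struct,
 * so the two ABIs disagree.  "good" makes the pad explicit and has the
 * same 16-byte layout everywhere.
 */
struct bad {
	uint32_t	b_flag;
	uint64_t	b_value;
};

struct good {
	uint32_t	g_flag;
	uint32_t	g_pad;		/* explicit padding */
	uint64_t	g_value;
};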
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_onexit.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_onexit.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_onexit.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_rlock.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_rlock.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_rlock.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_sa.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_sa.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_sa.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_stat.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_stat.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_stat.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_vfsops.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -64,11 +65,11 @@
int z_norm; /* normalization flags */
boolean_t z_atime; /* enable atimes mount option */
boolean_t z_unmounted; /* unmounted */
- rrwlock_t z_teardown_lock;
+ rrmlock_t z_teardown_lock;
krwlock_t z_teardown_inactive_lock;
list_t z_all_znodes; /* all vnodes in the fs */
kmutex_t z_znodes_lock; /* lock for z_all_znodes */
- vnode_t *z_ctldir; /* .zfs directory pointer */
+ struct zfsctl_root *z_ctldir; /* .zfs directory pointer */
boolean_t z_show_ctldir; /* expose .zfs in the root dir */
boolean_t z_issnap; /* true if this is a snapshot */
boolean_t z_vscan; /* virus scan on/off */
@@ -75,6 +76,7 @@
boolean_t z_use_fuids; /* version allows fuids */
boolean_t z_replay; /* set during ZIL replay */
boolean_t z_use_sa; /* version allow system attributes */
+ boolean_t z_use_namecache;/* make use of FreeBSD name cache */
uint64_t z_version; /* ZPL version */
uint64_t z_shares_dir; /* hidden shares dir */
kmutex_t z_lock;
@@ -110,7 +112,7 @@
} zfid_short_t;
/*
- * Filesystems under .zfs/snapshot have a total file ID size of 22 bytes
+ * Filesystems under .zfs/snapshot have a total file ID size of 22[*] bytes
* (including the length field). This makes files under .zfs/snapshot
* accessible by NFSv3 and NFSv4, but not NFSv2.
*
@@ -120,10 +122,13 @@
* 6 bytes object number (48 bits)
* 4 bytes generation number (32 bits)
* 6 bytes objset id (48 bits)
- * 4 bytes currently just zero (32 bits)
+ * 4 bytes[**] currently just zero (32 bits)
*
* We reserve only 48 bits for the object number and objset id, as these are
* the limits currently defined and imposed by the DMU.
+ *
+ * [*] 20 bytes on FreeBSD to fit into the size of struct fid.
+ * [**] 2 bytes on FreeBSD for the above reason.
*/
typedef struct zfid_long {
zfid_short_t z_fid;
@@ -138,7 +143,7 @@
extern int zfs_super_owner;
extern int zfs_suspend_fs(zfsvfs_t *zfsvfs);
-extern int zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname);
+extern int zfs_resume_fs(zfsvfs_t *zfsvfs, struct dsl_dataset *ds);
extern int zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
const char *domain, uint64_t rid, uint64_t *valuep);
extern int zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
@@ -153,7 +158,6 @@
extern int zfsvfs_create(const char *name, zfsvfs_t **zfvp);
extern void zfsvfs_free(zfsvfs_t *zfsvfs);
extern int zfs_check_global_label(const char *dsname, const char *hexsl);
-extern int zfs_vnode_lock(vnode_t *vp, int flags);
#ifdef _KERNEL
extern void zfsvfs_update_fromname(const char *oldname, const char *newname);
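The [*]/[**] notes above shrink the long fid to 20 bytes on FreeBSD but keep the 48-bit object number and objset id packing. A sketch of how a 48-bit object number fills the six object bytes, assuming the zf_object[6] array that zfid_short_t conventionally declares (the helper is illustrative, not zfs_fid() itself):

/*
 * Sketch: store a 48-bit object number least-significant byte first
 * into the fid's six object bytes.
 */
static void
zfid_pack_object_sketch(zfid_short_t *zfid, uint64_t object)
{
	for (int i = 0; i < (int)sizeof (zfid->zf_object); i++)
		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
}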
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_znode.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -20,7 +21,8 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
*/
#ifndef _SYS_FS_ZFS_ZNODE_H
@@ -133,20 +135,7 @@
#define ZFS_SHARES_DIR "SHARES"
#define ZFS_SA_ATTRS "SA_ATTRS"
-#define ZFS_MAX_BLOCKSIZE (SPA_MAXBLOCKSIZE)
-
/*
- * Path component length
- *
- * The generic fs code uses MAXNAMELEN to represent
- * what the largest component length is. Unfortunately,
- * this length includes the terminating NULL. ZFS needs
- * to tell the users via pathconf() and statvfs() what the
- * true maximum length of a component is, excluding the NULL.
- */
-#define ZFS_MAXNAMELEN (MAXNAMELEN - 1)
-
-/*
* Convert mode bits (zp_mode) to BSD-style DT_* values for storing in
* the directory entries.
*/
@@ -182,10 +171,12 @@
struct zfsvfs *z_zfsvfs;
vnode_t *z_vnode;
uint64_t z_id; /* object ID for this znode */
+#ifdef illumos
kmutex_t z_lock; /* znode modification lock */
krwlock_t z_parent_lock; /* parent lock for directories */
krwlock_t z_name_lock; /* "master" lock for dirent locks */
zfs_dirlock_t *z_dirlocks; /* directory entry lock list */
+#endif
kmutex_t z_range_lock; /* protects changes to z_range_avl */
avl_tree_t z_range_avl; /* avl tree of file range locks */
uint8_t z_unlinked; /* file has been unlinked */
@@ -237,7 +228,7 @@
{
vnode_t *vp = zp->z_vnode;
- ASSERT(vp == NULL || vp->v_data == NULL || vp->v_data == zp);
+ ASSERT(vp != NULL && vp->v_data == zp);
return (vp);
}
static __inline znode_t *
@@ -245,7 +236,7 @@
{
znode_t *zp = (znode_t *)vp->v_data;
- ASSERT(zp == NULL || zp->z_vnode == NULL || zp->z_vnode == vp);
+ ASSERT(zp != NULL && zp->z_vnode == vp);
return (zp);
}
#else
@@ -256,7 +247,7 @@
/* Called on entry to each ZFS vnode and vfs operation */
#define ZFS_ENTER(zfsvfs) \
{ \
- rrw_enter_read(&(zfsvfs)->z_teardown_lock, FTAG); \
+ rrm_enter_read(&(zfsvfs)->z_teardown_lock, FTAG); \
if ((zfsvfs)->z_unmounted) { \
ZFS_EXIT(zfsvfs); \
return (EIO); \
@@ -263,12 +254,8 @@
} \
}
-/* Called on entry to each ZFS vnode and vfs operation that can not return EIO */
-#define ZFS_ENTER_NOERROR(zfsvfs) \
- rrw_enter(&(zfsvfs)->z_teardown_lock, RW_READER, FTAG)
-
/* Must be called before exiting the vop */
-#define ZFS_EXIT(zfsvfs) rrw_exit(&(zfsvfs)->z_teardown_lock, FTAG)
+#define ZFS_EXIT(zfsvfs) rrm_exit(&(zfsvfs)->z_teardown_lock, FTAG)
/* Verifies the znode is valid */
#define ZFS_VERIFY_ZP(zp) \
@@ -369,6 +356,8 @@
extern zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE];
extern int zfsfstype;
+extern int zfs_znode_parent_and_name(znode_t *zp, znode_t **dzpp, char *buf);
+
#endif /* _KERNEL */
extern int zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len);
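With the teardown lock now an rrmlock_t, the bracketing discipline stays the same: every ZPL entry point wraps its body in ZFS_ENTER()/ZFS_EXIT() so unmount and rollback can drain in-flight operations, and validates the znode with ZFS_VERIFY_ZP(). A sketch of that calling convention (zfs_example_getsize() and its use of z_size are illustrative, not an existing VOP):

/*
 * Sketch: the ZFS_ENTER/ZFS_EXIT bracketing every ZPL operation uses.
 */
static int
zfs_example_getsize(znode_t *zp, uint64_t *sizep)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	ZFS_ENTER(zfsvfs);	/* reader hold on z_teardown_lock, EIO if unmounted */
	ZFS_VERIFY_ZP(zp);	/* bail out if the znode has been torn down */

	*sizep = zp->z_size;

	ZFS_EXIT(zfsvfs);
	return (0);
}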
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -21,6 +22,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
*/
/* Portions Copyright 2010 Robert Milkowski */
@@ -37,6 +39,9 @@
extern "C" {
#endif
+struct dsl_pool;
+struct dsl_dataset;
+
/*
* Intent log format:
*
@@ -90,7 +95,6 @@
} zil_chain_t;
#define ZIL_MIN_BLKSZ 4096ULL
-#define ZIL_MAX_BLKSZ SPA_MAXBLOCKSIZE
/*
* The words of a log block checksum.
@@ -366,7 +370,6 @@
void *itx_private; /* type-specific opaque data */
itx_wr_state_t itx_wr_state; /* write state */
uint8_t itx_sync; /* synchronous transaction */
- uint64_t itx_sod; /* record size on disk */
uint64_t itx_oid; /* object id */
lr_t itx_lr; /* common part of log record */
/* followed by type-specific part of lr_xx_t and its immediate data */
@@ -402,11 +405,14 @@
extern void zil_itx_destroy(itx_t *itx);
extern void zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx);
+extern void zil_async_to_sync(zilog_t *zilog, uint64_t oid);
extern void zil_commit(zilog_t *zilog, uint64_t oid);
extern int zil_vdev_offline(const char *osname, void *txarg);
-extern int zil_claim(const char *osname, void *txarg);
-extern int zil_check_log_chain(const char *osname, void *txarg);
+extern int zil_claim(struct dsl_pool *dp,
+ struct dsl_dataset *ds, void *txarg);
+extern int zil_check_log_chain(struct dsl_pool *dp,
+ struct dsl_dataset *ds, void *tx);
extern void zil_sync(zilog_t *zilog, dmu_tx_t *tx);
extern void zil_clean(zilog_t *zilog, uint64_t synced_txg);
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zil_impl.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -21,6 +22,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
*/
/* Portions Copyright 2010 Robert Milkowski */
@@ -41,6 +43,7 @@
typedef struct lwb {
zilog_t *lwb_zilog; /* back pointer to log struct */
blkptr_t lwb_blk; /* on disk address of this log blk */
+ boolean_t lwb_slog; /* lwb_blk is on SLOG device */
int lwb_nused; /* # used bytes in buffer */
int lwb_sz; /* size of block and buffer */
char *lwb_buf; /* log write buffer */
@@ -61,7 +64,6 @@
typedef struct itxg {
kmutex_t itxg_lock; /* lock for this structure */
uint64_t itxg_txg; /* txg for this chain */
- uint64_t itxg_sod; /* total size on disk for this txg */
itxs_t *itxg_itxs; /* sync and async itxs */
} itxg_t;
@@ -119,7 +121,6 @@
kcondvar_t zl_cv_batch[2]; /* batch condition variables */
itxg_t zl_itxg[TXG_SIZE]; /* intent log txg chains */
list_t zl_itx_commit_list; /* itx list to be committed */
- uint64_t zl_itx_list_sz; /* total size of records on list */
uint64_t zl_cur_used; /* current commit log size used */
list_t zl_lwb_list; /* in-flight log write list */
kmutex_t zl_vdev_lock; /* protects zl_vdev_tree */
@@ -139,8 +140,10 @@
avl_node_t zn_node;
} zil_bp_node_t;
-#define ZIL_MAX_LOG_DATA (SPA_MAXBLOCKSIZE - sizeof (zil_chain_t) - \
+#define ZIL_MAX_LOG_DATA (SPA_OLD_MAXBLOCKSIZE - sizeof (zil_chain_t) - \
sizeof (lr_write_t))
+#define ZIL_MAX_COPIED_DATA \
+ ((SPA_OLD_MAXBLOCKSIZE - sizeof (zil_chain_t)) / 2 - sizeof (lr_write_t))
#ifdef __cplusplus
}
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -21,7 +22,8 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
*/
@@ -28,6 +30,7 @@
#ifndef _ZIO_H
#define _ZIO_H
+#include <sys/zio_priority.h>
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/txg.h>
@@ -79,9 +82,21 @@
ZIO_CHECKSUM_FLETCHER_4,
ZIO_CHECKSUM_SHA256,
ZIO_CHECKSUM_ZILOG2,
+ ZIO_CHECKSUM_NOPARITY,
+#ifdef illumos
+ ZIO_CHECKSUM_SHA512,
+ ZIO_CHECKSUM_SKEIN,
+ ZIO_CHECKSUM_EDONR,
+#endif
ZIO_CHECKSUM_FUNCTIONS
};
+/*
+ * The number of "legacy" compression functions which can be set on individual
+ * objects.
+ */
+#define ZIO_CHECKSUM_LEGACY_FUNCTIONS ZIO_CHECKSUM_ZILOG2
+
#define ZIO_CHECKSUM_ON_VALUE ZIO_CHECKSUM_FLETCHER_4
#define ZIO_CHECKSUM_DEFAULT ZIO_CHECKSUM_ON
@@ -111,15 +126,25 @@
ZIO_COMPRESS_FUNCTIONS
};
-/* N.B. when altering this value, also change BOOTFS_COMPRESS_VALID below */
-#define ZIO_COMPRESS_ON_VALUE ZIO_COMPRESS_LZJB
-#define ZIO_COMPRESS_DEFAULT ZIO_COMPRESS_OFF
+/*
+ * The number of "legacy" compression functions which can be set on individual
+ * objects.
+ */
+#define ZIO_COMPRESS_LEGACY_FUNCTIONS ZIO_COMPRESS_LZ4
+/*
+ * The meaning of "compress = on" selected by the compression features enabled
+ * on a given pool.
+ */
+#define ZIO_COMPRESS_LEGACY_ON_VALUE ZIO_COMPRESS_LZJB
+#define ZIO_COMPRESS_LZ4_ON_VALUE ZIO_COMPRESS_LZ4
+
+#define ZIO_COMPRESS_DEFAULT ZIO_COMPRESS_OFF
+
#define BOOTFS_COMPRESS_VALID(compress) \
((compress) == ZIO_COMPRESS_LZJB || \
(compress) == ZIO_COMPRESS_LZ4 || \
- ((compress) == ZIO_COMPRESS_ON && \
- ZIO_COMPRESS_ON_VALUE == ZIO_COMPRESS_LZJB) || \
+ (compress) == ZIO_COMPRESS_ON || \
(compress) == ZIO_COMPRESS_OFF)
#define ZIO_FAILURE_MODE_WAIT 0
@@ -126,24 +151,6 @@
#define ZIO_FAILURE_MODE_CONTINUE 1
#define ZIO_FAILURE_MODE_PANIC 2
-#define ZIO_PRIORITY_NOW (zio_priority_table[0])
-#define ZIO_PRIORITY_SYNC_READ (zio_priority_table[1])
-#define ZIO_PRIORITY_SYNC_WRITE (zio_priority_table[2])
-#define ZIO_PRIORITY_LOG_WRITE (zio_priority_table[3])
-#define ZIO_PRIORITY_CACHE_FILL (zio_priority_table[4])
-#define ZIO_PRIORITY_AGG (zio_priority_table[5])
-#define ZIO_PRIORITY_FREE (zio_priority_table[6])
-#define ZIO_PRIORITY_ASYNC_WRITE (zio_priority_table[7])
-#define ZIO_PRIORITY_ASYNC_READ (zio_priority_table[8])
-#define ZIO_PRIORITY_RESILVER (zio_priority_table[9])
-#define ZIO_PRIORITY_SCRUB (zio_priority_table[10])
-#define ZIO_PRIORITY_DDT_PREFETCH (zio_priority_table[11])
-#define ZIO_PRIORITY_TRIM (zio_priority_table[12])
-#define ZIO_PRIORITY_TABLE_SIZE 13
-
-#define ZIO_PIPELINE_CONTINUE 0x100
-#define ZIO_PIPELINE_STOP 0x101
-
enum zio_flag {
/*
* Flags inherited by gang, ddt, and vdev children,
@@ -155,6 +162,7 @@
ZIO_FLAG_RESILVER = 1 << 3,
ZIO_FLAG_SCRUB = 1 << 4,
ZIO_FLAG_SCAN_THREAD = 1 << 5,
+ ZIO_FLAG_PHYSICAL = 1 << 6,
#define ZIO_FLAG_AGG_INHERIT (ZIO_FLAG_CANFAIL - 1)
@@ -161,13 +169,14 @@
/*
* Flags inherited by ddt, gang, and vdev children.
*/
- ZIO_FLAG_CANFAIL = 1 << 6, /* must be first for INHERIT */
- ZIO_FLAG_SPECULATIVE = 1 << 7,
- ZIO_FLAG_CONFIG_WRITER = 1 << 8,
- ZIO_FLAG_DONT_RETRY = 1 << 9,
- ZIO_FLAG_DONT_CACHE = 1 << 10,
- ZIO_FLAG_NODATA = 1 << 11,
- ZIO_FLAG_INDUCE_DAMAGE = 1 << 12,
+ ZIO_FLAG_CANFAIL = 1 << 7, /* must be first for INHERIT */
+ ZIO_FLAG_SPECULATIVE = 1 << 8,
+ ZIO_FLAG_CONFIG_WRITER = 1 << 9,
+ ZIO_FLAG_DONT_RETRY = 1 << 10,
+ ZIO_FLAG_DONT_CACHE = 1 << 11,
+ ZIO_FLAG_NODATA = 1 << 12,
+ ZIO_FLAG_INDUCE_DAMAGE = 1 << 13,
+ ZIO_FLAG_IO_ALLOCATING = 1 << 14,
#define ZIO_FLAG_DDT_INHERIT (ZIO_FLAG_IO_RETRY - 1)
#define ZIO_FLAG_GANG_INHERIT (ZIO_FLAG_IO_RETRY - 1)
@@ -175,10 +184,10 @@
/*
* Flags inherited by vdev children.
*/
- ZIO_FLAG_IO_RETRY = 1 << 13, /* must be first for INHERIT */
- ZIO_FLAG_PROBE = 1 << 14,
- ZIO_FLAG_TRYHARD = 1 << 15,
- ZIO_FLAG_OPTIONAL = 1 << 16,
+ ZIO_FLAG_IO_RETRY = 1 << 15, /* must be first for INHERIT */
+ ZIO_FLAG_PROBE = 1 << 16,
+ ZIO_FLAG_TRYHARD = 1 << 17,
+ ZIO_FLAG_OPTIONAL = 1 << 18,
#define ZIO_FLAG_VDEV_INHERIT (ZIO_FLAG_DONT_QUEUE - 1)
@@ -185,16 +194,17 @@
/*
* Flags not inherited by any children.
*/
- ZIO_FLAG_DONT_QUEUE = 1 << 17, /* must be first for INHERIT */
- ZIO_FLAG_DONT_PROPAGATE = 1 << 18,
- ZIO_FLAG_IO_BYPASS = 1 << 19,
- ZIO_FLAG_IO_REWRITE = 1 << 20,
- ZIO_FLAG_RAW = 1 << 21,
- ZIO_FLAG_GANG_CHILD = 1 << 22,
- ZIO_FLAG_DDT_CHILD = 1 << 23,
- ZIO_FLAG_GODFATHER = 1 << 24,
- ZIO_FLAG_NOPWRITE = 1 << 25,
- ZIO_FLAG_REEXECUTED = 1 << 26,
+ ZIO_FLAG_DONT_QUEUE = 1 << 19, /* must be first for INHERIT */
+ ZIO_FLAG_DONT_PROPAGATE = 1 << 20,
+ ZIO_FLAG_IO_BYPASS = 1 << 21,
+ ZIO_FLAG_IO_REWRITE = 1 << 22,
+ ZIO_FLAG_RAW = 1 << 23,
+ ZIO_FLAG_GANG_CHILD = 1 << 24,
+ ZIO_FLAG_DDT_CHILD = 1 << 25,
+ ZIO_FLAG_GODFATHER = 1 << 26,
+ ZIO_FLAG_NOPWRITE = 1 << 27,
+ ZIO_FLAG_REEXECUTED = 1 << 28,
+ ZIO_FLAG_DELEGATED = 1 << 29,
};
#define ZIO_FLAG_MUSTSUCCEED 0
@@ -211,6 +221,9 @@
(((zio)->io_flags & ZIO_FLAG_VDEV_INHERIT) | \
ZIO_FLAG_CANFAIL)
+#define ZIO_CHILD_BIT(x) (1 << (x))
+#define ZIO_CHILD_BIT_IS_SET(val, x) ((val) & (1 << (x)))
+
enum zio_child {
ZIO_CHILD_VDEV = 0,
ZIO_CHILD_GANG,
@@ -219,6 +232,14 @@
ZIO_CHILD_TYPES
};
+#define ZIO_CHILD_VDEV_BIT ZIO_CHILD_BIT(ZIO_CHILD_VDEV)
+#define ZIO_CHILD_GANG_BIT ZIO_CHILD_BIT(ZIO_CHILD_GANG)
+#define ZIO_CHILD_DDT_BIT ZIO_CHILD_BIT(ZIO_CHILD_DDT)
+#define ZIO_CHILD_LOGICAL_BIT ZIO_CHILD_BIT(ZIO_CHILD_LOGICAL)
+#define ZIO_CHILD_ALL_BITS \
+ (ZIO_CHILD_VDEV_BIT | ZIO_CHILD_GANG_BIT | \
+ ZIO_CHILD_DDT_BIT | ZIO_CHILD_LOGICAL_BIT)
+
enum zio_wait_type {
ZIO_WAIT_READY = 0,
ZIO_WAIT_DONE,
@@ -235,8 +256,8 @@
typedef void zio_done_func_t(zio_t *zio);
-extern uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE];
-extern char *zio_type_name[ZIO_TYPES];
+extern boolean_t zio_dva_throttle_enabled;
+extern const char *zio_type_name[ZIO_TYPES];
/*
* A bookmark is a four-tuple <objset, object, level, blkid> that uniquely
@@ -247,20 +268,21 @@
* Root blocks (objset_phys_t) are object 0, level -1: <objset, 0, -1, 0>.
* ZIL blocks are bookmarked <objset, 0, -2, blkid == ZIL sequence number>.
* dmu_sync()ed ZIL data blocks are bookmarked <objset, object, -2, blkid>.
+ * dnode visit bookmarks are <objset, object id of dnode, -3, 0>.
*
* Note: this structure is called a bookmark because its original purpose
* was to remember where to resume a pool-wide traverse.
*
- * Note: this structure is passed between userland and the kernel.
- * Therefore it must not change size or alignment between 32/64 bit
- * compilation options.
+ * Note: this structure is passed between userland and the kernel, and is
+ * stored on disk (by virtue of being incorporated into other on-disk
+ * structures, e.g. dsl_scan_phys_t).
*/
-typedef struct zbookmark {
+typedef struct zbookmark_phys {
uint64_t zb_objset;
uint64_t zb_object;
int64_t zb_level;
uint64_t zb_blkid;
-} zbookmark_t;
+} zbookmark_phys_t;
#define SET_BOOKMARK(zb, objset, object, level, blkid) \
{ \
@@ -279,6 +301,9 @@
#define ZB_ZIL_OBJECT (0ULL)
#define ZB_ZIL_LEVEL (-2LL)
+#define ZB_DNODE_LEVEL (-3LL)
+#define ZB_DNODE_BLKID (0ULL)
+
#define ZB_IS_ZERO(zb) \
((zb)->zb_objset == 0 && (zb)->zb_object == 0 && \
(zb)->zb_level == 0 && (zb)->zb_blkid == 0)
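Given the conventions above, filling a bookmark is just a matter of picking the right tuple: a ZIL block is keyed by its sequence number at level -2, and the new dnode-visit bookmark uses level -3 with blkid 0. A short illustrative use of SET_BOOKMARK() (objset 42, object 100, and sequence number 7 are arbitrary example values):

zbookmark_phys_t zil_zb, dnode_zb;

/* ZIL block: <objset, 0, -2, ZIL sequence number>. */
SET_BOOKMARK(&zil_zb, 42, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, 7);

/* Dnode visit: <objset, object id of the dnode, -3, 0>. */
SET_BOOKMARK(&dnode_zb, 42, 100, ZB_DNODE_LEVEL, ZB_DNODE_BLKID);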
@@ -404,12 +429,12 @@
struct zio {
/* Core information about this I/O */
- zbookmark_t io_bookmark;
+ zbookmark_phys_t io_bookmark;
zio_prop_t io_prop;
zio_type_t io_type;
enum zio_child io_child_type;
int io_cmd;
- uint8_t io_priority;
+ zio_priority_t io_priority;
uint8_t io_reexecute;
uint8_t io_state[ZIO_WAIT_TYPES];
uint64_t io_txg;
@@ -419,12 +444,13 @@
blkptr_t io_bp_copy;
list_t io_parent_list;
list_t io_child_list;
- zio_link_t *io_walk_link;
zio_t *io_logical;
zio_transform_t *io_transform_stack;
/* Callback info */
zio_done_func_t *io_ready;
+ zio_done_func_t *io_children_ready;
+ zio_done_func_t *io_physdone;
zio_done_func_t *io_done;
void *io_private;
int64_t io_prev_space_delta; /* DMU private */
@@ -442,11 +468,12 @@
const zio_vsd_ops_t *io_vsd_ops;
uint64_t io_offset;
- uint64_t io_deadline;
hrtime_t io_timestamp;
+ hrtime_t io_queued_timestamp;
+ hrtime_t io_target_timestamp;
+ avl_node_t io_queue_node;
avl_node_t io_offset_node;
- avl_node_t io_deadline_node;
- avl_tree_t *io_vdev_tree;
+ avl_node_t io_alloc_node;
/* Internal pipeline state */
enum zio_flag io_flags;
@@ -455,10 +482,12 @@
enum zio_flag io_orig_flags;
enum zio_stage io_orig_stage;
enum zio_stage io_orig_pipeline;
+ enum zio_stage io_pipeline_trace;
int io_error;
int io_child_error[ZIO_CHILD_TYPES];
uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES];
uint64_t io_child_count;
+ uint64_t io_phys_children;
uint64_t io_parent_count;
uint64_t *io_stall;
zio_t *io_gang_leader;
@@ -472,14 +501,15 @@
zio_cksum_report_t *io_cksum_report;
uint64_t io_ena;
-#ifdef _KERNEL
- /* FreeBSD only. */
- struct ostask io_task;
-#endif
+ /* Taskq dispatching state */
+ taskq_ent_t io_tqent;
+
avl_node_t io_trim_node;
list_node_t io_trim_link;
};
+extern int zio_timestamp_compare(const void *, const void *);
+
extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd,
zio_done_func_t *done, void *priv, enum zio_flag flags);
@@ -488,16 +518,18 @@
extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data,
uint64_t size, zio_done_func_t *done, void *priv,
- int priority, enum zio_flag flags, const zbookmark_t *zb);
+ zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb);
extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
void *data, uint64_t size, const zio_prop_t *zp,
- zio_done_func_t *ready, zio_done_func_t *done, void *priv,
- int priority, enum zio_flag flags, const zbookmark_t *zb);
+ zio_done_func_t *ready, zio_done_func_t *children_ready,
+ zio_done_func_t *physdone, zio_done_func_t *done,
+ void *priv, zio_priority_t priority, enum zio_flag flags,
+ const zbookmark_phys_t *zb);
extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
void *data, uint64_t size, zio_done_func_t *done, void *priv,
- int priority, enum zio_flag flags, zbookmark_t *zb);
+ zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb);
extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies,
boolean_t nopwrite);
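
zio_write() grows two optional callbacks (children_ready and physdone) between ready and done, and the I/O entry points now take a zio_priority_t instead of a bare int. A hedged caller sketch -- the callback names, cb_arg and the chosen priority/flag are illustrative and assume pio, spa, bp, data and friends are already in scope:

	/* most callers simply pass NULL for the two new callbacks */
	zio_t *zio = zio_write(pio, spa, txg, bp, data, size, &zp,
	    ready_cb, NULL, NULL, done_cb, cb_arg,
	    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, &zb);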
@@ -510,23 +542,23 @@
extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
uint64_t offset, uint64_t size, zio_done_func_t *done, void *priv,
- int priority, enum zio_flag flags);
+ zio_priority_t priority, enum zio_flag flags);
extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
uint64_t size, void *data, int checksum,
- zio_done_func_t *done, void *priv, int priority, enum zio_flag flags,
- boolean_t labels);
+ zio_done_func_t *done, void *priv, zio_priority_t priority,
+ enum zio_flag flags, boolean_t labels);
extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
uint64_t size, void *data, int checksum,
- zio_done_func_t *done, void *priv, int priority, enum zio_flag flags,
- boolean_t labels);
+ zio_done_func_t *done, void *priv, zio_priority_t priority,
+ enum zio_flag flags, boolean_t labels);
extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg,
const blkptr_t *bp, uint64_t size, enum zio_flag flags);
extern int zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp,
- blkptr_t *old_bp, uint64_t size, boolean_t use_slog);
+ blkptr_t *old_bp, uint64_t size, boolean_t *slog);
extern void zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp);
extern void zio_flush(zio_t *zio, vdev_t *vd);
extern zio_t *zio_trim(zio_t *zio, spa_t *spa, vdev_t *vd, uint64_t offset,
@@ -537,25 +569,33 @@
extern void zio_nowait(zio_t *zio);
extern void zio_execute(zio_t *zio);
extern void zio_interrupt(zio_t *zio);
+extern void zio_delay_init(zio_t *zio);
+extern void zio_delay_interrupt(zio_t *zio);
-extern zio_t *zio_walk_parents(zio_t *cio);
-extern zio_t *zio_walk_children(zio_t *pio);
+extern zio_t *zio_walk_parents(zio_t *cio, zio_link_t **);
+extern zio_t *zio_walk_children(zio_t *pio, zio_link_t **);
extern zio_t *zio_unique_parent(zio_t *cio);
extern void zio_add_child(zio_t *pio, zio_t *cio);
extern void *zio_buf_alloc(size_t size);
+extern void *zio_buf_alloc_nowait(size_t size);
extern void zio_buf_free(void *buf, size_t size);
extern void *zio_data_buf_alloc(size_t size);
extern void zio_data_buf_free(void *buf, size_t size);
+extern void zio_push_transform(zio_t *zio, void *data, uint64_t size,
+ uint64_t bufsize, zio_transform_func_t *transform);
+extern void zio_pop_transforms(zio_t *zio);
+
extern void zio_resubmit_stage_async(void *);
extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd,
- uint64_t offset, void *data, uint64_t size, int type, int priority,
- enum zio_flag flags, zio_done_func_t *done, void *priv);
+ uint64_t offset, void *data, uint64_t size, int type,
+ zio_priority_t priority, enum zio_flag flags,
+ zio_done_func_t *done, void *priv);
extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset,
- void *data, uint64_t size, int type, int priority,
+ void *data, uint64_t size, int type, zio_priority_t priority,
enum zio_flag flags, zio_done_func_t *done, void *priv);
extern void zio_vdev_io_bypass(zio_t *zio);
@@ -569,8 +609,8 @@
enum zio_checksum parent);
extern enum zio_checksum zio_checksum_dedup_select(spa_t *spa,
enum zio_checksum child, enum zio_checksum parent);
-extern enum zio_compress zio_compress_select(enum zio_compress child,
- enum zio_compress parent);
+extern enum zio_compress zio_compress_select(spa_t *spa,
+ enum zio_compress child, enum zio_compress parent);
extern void zio_suspend(spa_t *spa, zio_t *zio);
extern int zio_resume(spa_t *spa);
@@ -597,7 +637,7 @@
extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error);
extern int zio_handle_label_injection(zio_t *zio, int error);
extern void zio_handle_ignored_writes(zio_t *zio);
-extern uint64_t zio_handle_io_delay(zio_t *zio);
+extern hrtime_t zio_handle_io_delay(zio_t *zio);
/*
* Checksum ereport functions
@@ -618,9 +658,11 @@
/* Called from spa_sync(), but primarily an injection handler */
extern void spa_handle_ignored_writes(spa_t *spa);
-/* zbookmark functions */
-boolean_t zbookmark_is_before(const struct dnode_phys *dnp,
- const zbookmark_t *zb1, const zbookmark_t *zb2);
+/* zbookmark_phys functions */
+boolean_t zbookmark_subtree_completed(const struct dnode_phys *dnp,
+ const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block);
+int zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2,
+ uint8_t ibs2, const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2);
#ifdef __cplusplus
}
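
zio_walk_parents()/zio_walk_children() now take an explicit zio_link_t cursor instead of the removed io_walk_link field, so independent walks no longer trample each other's state. A minimal sketch of the new idiom (the same pattern the vdev_probe_done() hunk in vdev.c below adopts), with cio standing in for some child zio already in scope:

	zio_link_t *zl = NULL;
	zio_t *pio;

	while ((pio = zio_walk_parents(cio, &zl)) != NULL) {
		/* zl is the walk cursor; each parent is visited once */
	}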
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_checksum.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -20,6 +21,8 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, 2015 by Delphix. All rights reserved.
+ * Copyright Saso Kiselkov 2013, All rights reserved.
*/
#ifndef _SYS_ZIO_CHECKSUM_H
@@ -26,6 +29,7 @@
#define _SYS_ZIO_CHECKSUM_H
#include <sys/zio.h>
+#include <zfeature_common.h>
#ifdef __cplusplus
extern "C" {
@@ -34,17 +38,34 @@
/*
* Signature for checksum functions.
*/
-typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp);
+typedef void zio_checksum_t(const void *data, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp);
+typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt);
+typedef void zio_checksum_tmpl_free_t(void *ctx_template);
+typedef enum zio_checksum_flags {
+ /* Strong enough for metadata? */
+ ZCHECKSUM_FLAG_METADATA = (1 << 1),
+ /* ZIO embedded checksum */
+ ZCHECKSUM_FLAG_EMBEDDED = (1 << 2),
+ /* Strong enough for dedup (without verification)? */
+ ZCHECKSUM_FLAG_DEDUP = (1 << 3),
+ /* Uses salt value */
+ ZCHECKSUM_FLAG_SALTED = (1 << 4),
+ /* Strong enough for nopwrite? */
+ ZCHECKSUM_FLAG_NOPWRITE = (1 << 5)
+} zio_checksum_flags_t;
+
/*
* Information about each checksum function.
*/
typedef struct zio_checksum_info {
- zio_checksum_t *ci_func[2]; /* checksum function for each byteorder */
- int ci_correctable; /* number of correctable bits */
- int ci_eck; /* uses zio embedded checksum? */
- int ci_dedup; /* strong enough for dedup? */
- char *ci_name; /* descriptive name */
+ /* checksum function for each byteorder */
+ zio_checksum_t *ci_func[2];
+ zio_checksum_tmpl_init_t *ci_tmpl_init;
+ zio_checksum_tmpl_free_t *ci_tmpl_free;
+ zio_checksum_flags_t ci_flags;
+ char *ci_name; /* descriptive name */
} zio_checksum_info_t;
typedef struct zio_bad_cksum {
@@ -62,11 +83,33 @@
* Checksum routines.
*/
extern zio_checksum_t zio_checksum_SHA256;
+#ifdef illumos
+extern zio_checksum_t zio_checksum_SHA512_native;
+extern zio_checksum_t zio_checksum_SHA512_byteswap;
+/* Skein */
+extern zio_checksum_t zio_checksum_skein_native;
+extern zio_checksum_t zio_checksum_skein_byteswap;
+extern zio_checksum_tmpl_init_t zio_checksum_skein_tmpl_init;
+extern zio_checksum_tmpl_free_t zio_checksum_skein_tmpl_free;
+
+/* Edon-R */
+extern zio_checksum_t zio_checksum_edonr_native;
+extern zio_checksum_t zio_checksum_edonr_byteswap;
+extern zio_checksum_tmpl_init_t zio_checksum_edonr_tmpl_init;
+extern zio_checksum_tmpl_free_t zio_checksum_edonr_tmpl_free;
+#endif
+
+extern int zio_checksum_equal(spa_t *, blkptr_t *, enum zio_checksum,
+ void *, uint64_t, uint64_t, zio_bad_cksum_t *);
extern void zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
void *data, uint64_t size);
+extern int zio_checksum_error_impl(spa_t *, blkptr_t *, enum zio_checksum,
+ void *, uint64_t, uint64_t, zio_bad_cksum_t *);
extern int zio_checksum_error(zio_t *zio, zio_bad_cksum_t *out);
extern enum zio_checksum spa_dedup_checksum(spa_t *spa);
+extern void zio_checksum_templates_free(spa_t *spa);
+extern spa_feature_t zio_checksum_to_feature(enum zio_checksum cksum);
#ifdef __cplusplus
}
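
The checksum table entry drops the old ci_correctable/ci_eck/ci_dedup integers in favour of a single ci_flags bitmask plus optional context-template hooks for salted algorithms. An illustrative entry under the new layout -- a sketch only, not the actual zio_checksum_table[] initializer:

	/* an unsalted algorithm needs no template hooks */
	zio_checksum_info_t example_sha256 = {
		{ zio_checksum_SHA256, zio_checksum_SHA256 },	/* ci_func[] */
		NULL, NULL,				/* ci_tmpl_init/_free */
		ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
		    ZCHECKSUM_FLAG_NOPWRITE,		/* ci_flags */
		"sha256"				/* ci_name */
	};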
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -70,6 +71,8 @@
int level);
extern int zle_decompress(void *src, void *dst, size_t s_len, size_t d_len,
int level);
+extern void lz4_init(void);
+extern void lz4_fini(void);
extern size_t lz4_compress(void *src, void *dst, size_t s_len, size_t d_len,
int level);
extern int lz4_decompress(void *src, void *dst, size_t s_len, size_t d_len,
@@ -83,6 +86,12 @@
extern int zio_decompress_data(enum zio_compress c, void *src, void *dst,
size_t s_len, size_t d_len);
+/*
+ * Module lifetime management.
+ */
+extern void zio_compress_init(void);
+extern void zio_compress_fini(void);
+
#ifdef __cplusplus
}
#endif
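
zio_compress.h gains explicit module lifetime hooks alongside lz4_init()/lz4_fini(); presumably zio_compress_init() is where per-algorithm setup such as the LZ4 working-state cache happens, but only the prototypes are visible here, so treat the pairing below as an assumption:

	zio_compress_init();	/* once, at module load */
	/* ... zio_compress_data()/zio_decompress_data() as before ... */
	zio_compress_fini();	/* once, at module unload */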
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_impl.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -24,7 +25,7 @@
*/
/*
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
*/
#ifndef _ZIO_IMPL_H
@@ -38,7 +39,7 @@
#endif
/*
- * XXX -- Describe ZFS I/O pipleine here. Fill in as needed.
+ * XXX -- Describe ZFS I/O pipeline here. Fill in as needed.
*
* The ZFS I/O pipeline is comprised of various stages which are defined
* in the zio_stage enum below. The individual stages are used to construct
@@ -108,35 +109,37 @@
ZIO_STAGE_OPEN = 1 << 0, /* RWFCI */
ZIO_STAGE_READ_BP_INIT = 1 << 1, /* R---- */
- ZIO_STAGE_FREE_BP_INIT = 1 << 2, /* --F-- */
- ZIO_STAGE_ISSUE_ASYNC = 1 << 3, /* RWF-- */
- ZIO_STAGE_WRITE_BP_INIT = 1 << 4, /* -W--- */
+ ZIO_STAGE_WRITE_BP_INIT = 1 << 2, /* -W--- */
+ ZIO_STAGE_FREE_BP_INIT = 1 << 3, /* --F-- */
+ ZIO_STAGE_ISSUE_ASYNC = 1 << 4, /* RWF-- */
+ ZIO_STAGE_WRITE_COMPRESS = 1 << 5, /* -W--- */
- ZIO_STAGE_CHECKSUM_GENERATE = 1 << 5, /* -W--- */
+ ZIO_STAGE_CHECKSUM_GENERATE = 1 << 6, /* -W--- */
- ZIO_STAGE_NOP_WRITE = 1 << 6, /* -W--- */
+ ZIO_STAGE_NOP_WRITE = 1 << 7, /* -W--- */
- ZIO_STAGE_DDT_READ_START = 1 << 7, /* R---- */
- ZIO_STAGE_DDT_READ_DONE = 1 << 8, /* R---- */
- ZIO_STAGE_DDT_WRITE = 1 << 9, /* -W--- */
- ZIO_STAGE_DDT_FREE = 1 << 10, /* --F-- */
+ ZIO_STAGE_DDT_READ_START = 1 << 8, /* R---- */
+ ZIO_STAGE_DDT_READ_DONE = 1 << 9, /* R---- */
+ ZIO_STAGE_DDT_WRITE = 1 << 10, /* -W--- */
+ ZIO_STAGE_DDT_FREE = 1 << 11, /* --F-- */
- ZIO_STAGE_GANG_ASSEMBLE = 1 << 11, /* RWFC- */
- ZIO_STAGE_GANG_ISSUE = 1 << 12, /* RWFC- */
+ ZIO_STAGE_GANG_ASSEMBLE = 1 << 12, /* RWFC- */
+ ZIO_STAGE_GANG_ISSUE = 1 << 13, /* RWFC- */
- ZIO_STAGE_DVA_ALLOCATE = 1 << 13, /* -W--- */
- ZIO_STAGE_DVA_FREE = 1 << 14, /* --F-- */
- ZIO_STAGE_DVA_CLAIM = 1 << 15, /* ---C- */
+ ZIO_STAGE_DVA_THROTTLE = 1 << 14, /* -W--- */
+ ZIO_STAGE_DVA_ALLOCATE = 1 << 15, /* -W--- */
+ ZIO_STAGE_DVA_FREE = 1 << 16, /* --F-- */
+ ZIO_STAGE_DVA_CLAIM = 1 << 17, /* ---C- */
- ZIO_STAGE_READY = 1 << 16, /* RWFCI */
+ ZIO_STAGE_READY = 1 << 18, /* RWFCI */
- ZIO_STAGE_VDEV_IO_START = 1 << 17, /* RWF-I */
- ZIO_STAGE_VDEV_IO_DONE = 1 << 18, /* RWF-- */
- ZIO_STAGE_VDEV_IO_ASSESS = 1 << 19, /* RWF-I */
+ ZIO_STAGE_VDEV_IO_START = 1 << 19, /* RWF-I */
+ ZIO_STAGE_VDEV_IO_DONE = 1 << 20, /* RWF-I */
+ ZIO_STAGE_VDEV_IO_ASSESS = 1 << 21, /* RWF-I */
- ZIO_STAGE_CHECKSUM_VERIFY = 1 << 20, /* R---- */
+ ZIO_STAGE_CHECKSUM_VERIFY = 1 << 22, /* R---- */
- ZIO_STAGE_DONE = 1 << 21 /* RWFCI */
+ ZIO_STAGE_DONE = 1 << 23 /* RWFCI */
};
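
Every stage bit moves because the pipeline executes stages in increasing bit order, so the two new stages (WRITE_COMPRESS, DVA_THROTTLE) have to be inserted at their pipeline positions rather than appended at the end. A rough sketch of the advance step, assuming the usual zio_execute() shape (not the real function; zio is illustrative):

	/* take the next bit at or above io_stage that this pipeline wants */
	enum zio_stage stage = zio->io_stage << 1;

	while ((stage & zio->io_pipeline) == 0)
		stage <<= 1;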
#define ZIO_INTERLOCK_STAGES \
@@ -187,22 +190,27 @@
#define ZIO_REWRITE_PIPELINE \
(ZIO_WRITE_COMMON_STAGES | \
+ ZIO_STAGE_WRITE_COMPRESS | \
ZIO_STAGE_WRITE_BP_INIT)
#define ZIO_WRITE_PIPELINE \
(ZIO_WRITE_COMMON_STAGES | \
ZIO_STAGE_WRITE_BP_INIT | \
+ ZIO_STAGE_WRITE_COMPRESS | \
+ ZIO_STAGE_DVA_THROTTLE | \
ZIO_STAGE_DVA_ALLOCATE)
#define ZIO_DDT_CHILD_WRITE_PIPELINE \
(ZIO_INTERLOCK_STAGES | \
ZIO_VDEV_IO_STAGES | \
+ ZIO_STAGE_DVA_THROTTLE | \
ZIO_STAGE_DVA_ALLOCATE)
#define ZIO_DDT_WRITE_PIPELINE \
(ZIO_INTERLOCK_STAGES | \
+ ZIO_STAGE_WRITE_BP_INIT | \
ZIO_STAGE_ISSUE_ASYNC | \
- ZIO_STAGE_WRITE_BP_INIT | \
+ ZIO_STAGE_WRITE_COMPRESS | \
ZIO_STAGE_CHECKSUM_GENERATE | \
ZIO_STAGE_DDT_WRITE)
@@ -213,11 +221,12 @@
#define ZIO_FREE_PIPELINE \
(ZIO_INTERLOCK_STAGES | \
ZIO_STAGE_FREE_BP_INIT | \
- ZIO_STAGE_ISSUE_ASYNC | \
- ZIO_STAGE_DVA_FREE | \
- ZIO_STAGE_VDEV_IO_START | \
- ZIO_STAGE_VDEV_IO_ASSESS)
+ ZIO_STAGE_DVA_FREE)
+#define ZIO_FREE_PHYS_PIPELINE \
+ (ZIO_INTERLOCK_STAGES | \
+ ZIO_VDEV_IO_STAGES)
+
#define ZIO_DDT_FREE_PIPELINE \
(ZIO_INTERLOCK_STAGES | \
ZIO_STAGE_FREE_BP_INIT | \
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zrlock.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zrlock.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zrlock.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -20,6 +21,7 @@
*/
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015 by Delphix. All rights reserved.
*/
#ifndef _SYS_ZRLOCK_H
@@ -44,12 +46,8 @@
extern void zrl_init(zrlock_t *);
extern void zrl_destroy(zrlock_t *);
-#ifdef ZFS_DEBUG
-#define zrl_add(_z) zrl_add_debug((_z), __func__)
-extern void zrl_add_debug(zrlock_t *, const char *);
-#else
-extern void zrl_add(zrlock_t *);
-#endif
+#define zrl_add(_z) zrl_add_impl((_z), __func__)
+extern void zrl_add_impl(zrlock_t *, const char *);
extern void zrl_remove(zrlock_t *);
extern int zrl_tryenter(zrlock_t *);
extern void zrl_exit(zrlock_t *);
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zvol.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -43,9 +44,9 @@
extern int zvol_create_minor(const char *);
extern int zvol_remove_minor(const char *);
extern void zvol_remove_minors(const char *);
-extern int zvol_set_volsize(const char *, major_t, uint64_t);
+extern int zvol_set_volsize(const char *, uint64_t);
-#ifdef sun
+#ifdef illumos
extern int zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr);
extern int zvol_dump(dev_t dev, caddr_t addr, daddr_t offset, int nblocks);
extern int zvol_close(dev_t dev, int flag, int otyp, cred_t *cr);
@@ -54,7 +55,7 @@
extern int zvol_write(dev_t dev, uio_t *uiop, cred_t *cr);
extern int zvol_aread(dev_t dev, struct aio_req *aio, cred_t *cr);
extern int zvol_awrite(dev_t dev, struct aio_req *aio, cred_t *cr);
-#endif /* sun */
+#endif /* illumos */
extern int zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr,
int *rvalp);
extern int zvol_busy(void);
@@ -61,7 +62,7 @@
extern void zvol_init(void);
extern void zvol_fini(void);
-#ifdef sun
+#ifdef illumos
extern int zvol_get_volume_params(minor_t minor, uint64_t *blksize,
uint64_t *max_xfer_len, void **minor_hdl, void **objset_hdl, void **zil_hdl,
void **rl_hdl, void **bonus_hdl);
@@ -69,9 +70,9 @@
extern int zvol_get_volume_wce(void *minor_hdl);
extern void zvol_log_write_minor(void *minor_hdl, dmu_tx_t *tx, offset_t off,
ssize_t resid, boolean_t sync);
-#endif /* sun */
+#endif /* illumos */
-#ifdef __FreeBSD__
+#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
extern int zvol_create_minors(const char *name);
extern void zvol_rename_minors(const char *oldname, const char *newname);
#endif
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/trim_map.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -41,17 +41,20 @@
#define TRIM_ZIO_END(vd, offset, size) (offset + \
P2ROUNDUP(size, 1ULL << vd->vdev_top->vdev_ashift))
-#define TRIM_MAP_SINC(tm, size) \
- atomic_add_64(&(tm)->tm_bytes, (size))
+/* Maximal segment size for ATA TRIM. */
+#define TRIM_MAP_SIZE_FACTOR (512 << 16)
-#define TRIM_MAP_SDEC(tm, size) \
- atomic_add_64(&(tm)->tm_bytes, -(size))
+#define TRIM_MAP_SEGS(size) (1 + (size) / TRIM_MAP_SIZE_FACTOR)
-#define TRIM_MAP_QINC(tm) \
- atomic_inc_64(&(tm)->tm_pending); \
+#define TRIM_MAP_ADD(tm, ts) do { \
+ list_insert_tail(&(tm)->tm_head, (ts)); \
+ (tm)->tm_pending += TRIM_MAP_SEGS((ts)->ts_end - (ts)->ts_start); \
+} while (0)
-#define TRIM_MAP_QDEC(tm) \
- atomic_dec_64(&(tm)->tm_pending);
+#define TRIM_MAP_REM(tm, ts) do { \
+ list_remove(&(tm)->tm_head, (ts)); \
+ (tm)->tm_pending -= TRIM_MAP_SEGS((ts)->ts_end - (ts)->ts_start); \
+} while (0)
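
TRIM accounting now counts ATA-sized segments rather than raw bytes: TRIM_MAP_SIZE_FACTOR is 512 << 16 = 32 MiB, and TRIM_MAP_SEGS() charges 1 + size/32MiB pending slots per queued segment. A quick worked example (the 96 MiB length is illustrative):

	uint64_t len = 96ULL << 20;		/* 96 MiB queued free */
	uint64_t segs = TRIM_MAP_SEGS(len);	/* 1 + 96/32 = 4 slots */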
typedef struct trim_map {
list_t tm_head; /* List of segments sorted by txg. */
@@ -61,7 +64,6 @@
list_t tm_pending_writes; /* Writes blocked on in-flight frees. */
kmutex_t tm_lock;
uint64_t tm_pending; /* Count of pending TRIMs. */
- uint64_t tm_bytes; /* Total size in bytes of queued TRIMs. */
} trim_map_t;
typedef struct trim_seg {
@@ -75,13 +77,10 @@
extern boolean_t zfs_trim_enabled;
-static u_int trim_txg_delay = 32;
-static u_int trim_timeout = 30;
-static u_int trim_max_interval = 1;
-/* Limit outstanding TRIMs to 2G (max size for a single TRIM request) */
-static uint64_t trim_vdev_max_bytes = 2147483648;
-/* Limit outstanding TRIMs to 64 (max ranges for a single TRIM request) */
-static u_int trim_vdev_max_pending = 64;
+static u_int trim_txg_delay = 32; /* Keep deleted data up to 32 TXG */
+static u_int trim_timeout = 30; /* Keep deleted data up to 30s */
+static u_int trim_max_interval = 1; /* 1s delays between TRIMs */
+static u_int trim_vdev_max_pending = 10000; /* Keep up to 10K segments */
SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, trim, CTLFLAG_RD, 0, "ZFS TRIM");
@@ -100,11 +99,6 @@
"Maximum interval between TRIM queue processing (seconds)");
SYSCTL_DECL(_vfs_zfs_vdev);
-TUNABLE_QUAD("vfs.zfs.vdev.trim_max_bytes", &trim_vdev_max_bytes);
-SYSCTL_QUAD(_vfs_zfs_vdev, OID_AUTO, trim_max_bytes, CTLFLAG_RWTUN,
- &trim_vdev_max_bytes, 0,
- "Maximum pending TRIM bytes for a vdev");
-
TUNABLE_INT("vfs.zfs.vdev.trim_max_pending", &trim_vdev_max_pending);
SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, trim_max_pending, CTLFLAG_RWTUN,
&trim_vdev_max_pending, 0,
@@ -156,11 +150,9 @@
{
trim_map_t *tm;
- ASSERT(vd->vdev_ops->vdev_op_leaf);
+ ASSERT(zfs_trim_enabled && !vd->vdev_notrim &&
+ vd->vdev_ops->vdev_op_leaf);
- if (!zfs_trim_enabled)
- return;
-
tm = kmem_zalloc(sizeof (*tm), KM_SLEEP);
mutex_init(&tm->tm_lock, NULL, MUTEX_DEFAULT, NULL);
list_create(&tm->tm_head, sizeof (trim_seg_t),
@@ -201,10 +193,8 @@
mutex_enter(&tm->tm_lock);
while ((ts = list_head(&tm->tm_head)) != NULL) {
avl_remove(&tm->tm_queued_frees, ts);
- list_remove(&tm->tm_head, ts);
+ TRIM_MAP_REM(tm, ts);
kmem_free(ts, sizeof (*ts));
- TRIM_MAP_SDEC(tm, ts->ts_end - ts->ts_start);
- TRIM_MAP_QDEC(tm);
}
mutex_exit(&tm->tm_lock);
@@ -249,27 +239,27 @@
merge_after = (ts_after != NULL && ts_after->ts_start == end);
if (merge_before && merge_after) {
- TRIM_MAP_SINC(tm, ts_after->ts_start - ts_before->ts_end);
- TRIM_MAP_QDEC(tm);
avl_remove(&tm->tm_queued_frees, ts_before);
- list_remove(&tm->tm_head, ts_before);
+ TRIM_MAP_REM(tm, ts_before);
+ TRIM_MAP_REM(tm, ts_after);
ts_after->ts_start = ts_before->ts_start;
ts_after->ts_txg = txg;
ts_after->ts_time = time;
+ TRIM_MAP_ADD(tm, ts_after);
kmem_free(ts_before, sizeof (*ts_before));
} else if (merge_before) {
- TRIM_MAP_SINC(tm, end - ts_before->ts_end);
+ TRIM_MAP_REM(tm, ts_before);
ts_before->ts_end = end;
ts_before->ts_txg = txg;
ts_before->ts_time = time;
+ TRIM_MAP_ADD(tm, ts_before);
} else if (merge_after) {
- TRIM_MAP_SINC(tm, ts_after->ts_start - start);
+ TRIM_MAP_REM(tm, ts_after);
ts_after->ts_start = start;
ts_after->ts_txg = txg;
ts_after->ts_time = time;
+ TRIM_MAP_ADD(tm, ts_after);
} else {
- TRIM_MAP_SINC(tm, end - start);
- TRIM_MAP_QINC(tm);
ts = kmem_alloc(sizeof (*ts), KM_SLEEP);
ts->ts_start = start;
ts->ts_end = end;
@@ -276,7 +266,7 @@
ts->ts_txg = txg;
ts->ts_time = time;
avl_insert(&tm->tm_queued_frees, ts, where);
- list_insert_tail(&tm->tm_head, ts);
+ TRIM_MAP_ADD(tm, ts);
}
}
@@ -292,7 +282,7 @@
left_over = (ts->ts_start < start);
right_over = (ts->ts_end > end);
- TRIM_MAP_SDEC(tm, end - start);
+ TRIM_MAP_REM(tm, ts);
if (left_over && right_over) {
nts = kmem_alloc(sizeof (*nts), KM_SLEEP);
nts->ts_start = end;
@@ -301,16 +291,16 @@
nts->ts_time = ts->ts_time;
ts->ts_end = start;
avl_insert_here(&tm->tm_queued_frees, nts, ts, AVL_AFTER);
- list_insert_after(&tm->tm_head, ts, nts);
- TRIM_MAP_QINC(tm);
+ TRIM_MAP_ADD(tm, ts);
+ TRIM_MAP_ADD(tm, nts);
} else if (left_over) {
ts->ts_end = start;
+ TRIM_MAP_ADD(tm, ts);
} else if (right_over) {
ts->ts_start = end;
+ TRIM_MAP_ADD(tm, ts);
} else {
avl_remove(&tm->tm_queued_frees, ts);
- list_remove(&tm->tm_head, ts);
- TRIM_MAP_QDEC(tm);
kmem_free(ts, sizeof (*ts));
}
}
@@ -429,7 +419,8 @@
* the first element's time is not greater than time argument
*/
static trim_seg_t *
-trim_map_first(trim_map_t *tm, uint64_t txg, uint64_t txgsafe, hrtime_t time)
+trim_map_first(trim_map_t *tm, uint64_t txg, uint64_t txgsafe, hrtime_t time,
+ boolean_t force)
{
trim_seg_t *ts;
@@ -438,9 +429,7 @@
ts = list_head(&tm->tm_head);
if (ts != NULL && ts->ts_txg <= txgsafe &&
- (ts->ts_txg <= txg || ts->ts_time <= time ||
- tm->tm_bytes > trim_vdev_max_bytes ||
- tm->tm_pending > trim_vdev_max_pending))
+ (ts->ts_txg <= txg || ts->ts_time <= time || force))
return (ts);
return (NULL);
}
@@ -450,7 +439,8 @@
{
trim_map_t *tm = vd->vdev_trimmap;
trim_seg_t *ts;
- uint64_t size, txgtarget, txgsafe;
+ uint64_t size, offset, txgtarget, txgsafe;
+ int64_t hard, soft;
hrtime_t timelimit;
ASSERT(vd->vdev_ops->vdev_op_leaf);
@@ -458,7 +448,7 @@
if (tm == NULL)
return;
- timelimit = gethrtime() - trim_timeout * NANOSEC;
+ timelimit = gethrtime() - (hrtime_t)trim_timeout * NANOSEC;
if (vd->vdev_isl2cache) {
txgsafe = UINT64_MAX;
txgtarget = UINT64_MAX;
@@ -471,16 +461,31 @@
}
mutex_enter(&tm->tm_lock);
+ hard = 0;
+ if (tm->tm_pending > trim_vdev_max_pending)
+ hard = (tm->tm_pending - trim_vdev_max_pending) / 4;
+ soft = P2ROUNDUP(hard + tm->tm_pending / trim_timeout + 1, 64);
/* Loop until we have sent all outstanding free's */
- while ((ts = trim_map_first(tm, txgtarget, txgsafe, timelimit))
+ while (soft > 0 &&
+ (ts = trim_map_first(tm, txgtarget, txgsafe, timelimit, hard > 0))
!= NULL) {
- list_remove(&tm->tm_head, ts);
+ TRIM_MAP_REM(tm, ts);
avl_remove(&tm->tm_queued_frees, ts);
avl_add(&tm->tm_inflight_frees, ts);
size = ts->ts_end - ts->ts_start;
- zio_nowait(zio_trim(zio, spa, vd, ts->ts_start, size));
- TRIM_MAP_SDEC(tm, size);
- TRIM_MAP_QDEC(tm);
+ offset = ts->ts_start;
+ /*
+ * We drop the lock while we call zio_nowait as the IO
+ * scheduler can result in a different IO being run e.g.
+ * a write which would result in a recursive lock.
+ */
+ mutex_exit(&tm->tm_lock);
+
+ zio_nowait(zio_trim(zio, spa, vd, offset, size));
+
+ soft -= TRIM_MAP_SEGS(size);
+ hard -= TRIM_MAP_SEGS(size);
+ mutex_enter(&tm->tm_lock);
}
mutex_exit(&tm->tm_lock);
}
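
The hard/soft budget replaces the old byte-based cutoff. Worked numbers, assuming the defaults above (trim_vdev_max_pending = 10000, trim_timeout = 30) and a backlog of 12000 pending segments:

	/*
	 * hard = (12000 - 10000) / 4              = 500
	 * soft = P2ROUNDUP(500 + 12000/30 + 1, 64)
	 *      = P2ROUNDUP(901, 64)               = 960
	 *
	 * so up to 960 segments' worth of TRIMs go out this pass, and
	 * while hard stays positive trim_map_first() relaxes the usual
	 * txg/time limits (the new force argument) until the backlog
	 * drops back under the cap.
	 */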
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -22,7 +22,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Portions Copyright 2011 Martin Matuska <mm at FreeBSD.org>
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -46,7 +46,7 @@
* either be processing, or blocked waiting to enter the next state. There may
* be up to three active txgs, and there is always a txg in the open state
* (though it may be blocked waiting to enter the quiescing state). In broad
- * strokes, transactions — operations that change in-memory structures — are
+ * strokes, transactions -- operations that change in-memory structures -- are
* accepted into the txg in the open state, and are completed while the txg is
* in the open or quiescing states. The accumulated changes are written to
* disk in the syncing state.
@@ -54,7 +54,7 @@
* Open
*
* When a new txg becomes active, it first enters the open state. New
- * transactions — updates to in-memory structures — are assigned to the
+ * transactions -- updates to in-memory structures -- are assigned to the
* currently open txg. There is always a txg in the open state so that ZFS can
* accept new changes (though the txg may refuse new changes if it has hit
* some limit). ZFS advances the open txg to the next state for a variety of
@@ -242,7 +242,7 @@
}
static void
-txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, uint64_t time)
+txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, clock_t time)
{
CALLB_CPR_SAFE_BEGIN(cpr);
@@ -355,7 +355,7 @@
* On return, the transaction group has reached a stable state in which it can
* then be passed off to the syncing context.
*/
-static void
+static __noinline void
txg_quiesce(dsl_pool_t *dp, uint64_t txg)
{
tx_state_t *tx = &dp->dp_tx;
@@ -370,7 +370,11 @@
ASSERT(txg == tx->tx_open_txg);
tx->tx_open_txg++;
+ tx->tx_open_time = gethrtime();
+ DTRACE_PROBE2(txg__quiescing, dsl_pool_t *, dp, uint64_t, txg);
+ DTRACE_PROBE2(txg__opened, dsl_pool_t *, dp, uint64_t, tx->tx_open_txg);
+
/*
* Now that we've incremented tx_open_txg, we can let threads
* enter the next transaction group.
@@ -460,7 +464,8 @@
start = delta = 0;
for (;;) {
- uint64_t timer, timeout = zfs_txg_timeout * hz;
+ uint64_t timeout = zfs_txg_timeout * hz;
+ uint64_t timer;
uint64_t txg;
/*
@@ -472,7 +477,8 @@
while (!dsl_scan_active(dp->dp_scan) &&
!tx->tx_exiting && timer > 0 &&
tx->tx_synced_txg >= tx->tx_sync_txg_waiting &&
- tx->tx_quiesced_txg == 0) {
+ tx->tx_quiesced_txg == 0 &&
+ dp->dp_dirty_total < zfs_dirty_data_sync) {
dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n",
tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer);
@@ -502,6 +508,7 @@
txg = tx->tx_quiesced_txg;
tx->tx_quiesced_txg = 0;
tx->tx_syncing_txg = txg;
+ DTRACE_PROBE2(txg__syncing, dsl_pool_t *, dp, uint64_t, txg);
cv_broadcast(&tx->tx_quiesce_more_cv);
dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
@@ -515,6 +522,7 @@
mutex_enter(&tx->tx_sync_lock);
tx->tx_synced_txg = txg;
tx->tx_syncing_txg = 0;
+ DTRACE_PROBE2(txg__synced, dsl_pool_t *, dp, uint64_t, txg);
cv_broadcast(&tx->tx_sync_done_cv);
/*
@@ -564,6 +572,7 @@
*/
dprintf("quiesce done, handing off txg %llu\n", txg);
tx->tx_quiesced_txg = txg;
+ DTRACE_PROBE2(txg__quiesced, dsl_pool_t *, dp, uint64_t, txg);
cv_broadcast(&tx->tx_sync_more_cv);
cv_broadcast(&tx->tx_quiesce_done_cv);
}
@@ -570,15 +579,15 @@
}
/*
- * Delay this thread by 'ticks' if we are still in the open transaction
- * group and there is already a waiting txg quiescing or quiesced.
- * Abort the delay if this txg stalls or enters the quiescing state.
+ * Delay this thread by delay nanoseconds if we are still in the open
+ * transaction group and there is already a waiting txg quiescing or quiesced.
+ * Abort the delay if this txg stalls or enters the quiescing state.
*/
void
-txg_delay(dsl_pool_t *dp, uint64_t txg, int ticks)
+txg_delay(dsl_pool_t *dp, uint64_t txg, hrtime_t delay, hrtime_t resolution)
{
tx_state_t *tx = &dp->dp_tx;
- clock_t timeout = ddi_get_lbolt() + ticks;
+ hrtime_t start = gethrtime();
/* don't delay if this txg could transition to quiescing immediately */
if (tx->tx_open_txg > txg ||
@@ -591,10 +600,11 @@
return;
}
- while (ddi_get_lbolt() < timeout &&
- tx->tx_syncing_txg < txg-1 && !txg_stalled(dp))
- (void) cv_timedwait(&tx->tx_quiesce_more_cv, &tx->tx_sync_lock,
- timeout - ddi_get_lbolt());
+ while (gethrtime() - start < delay &&
+ tx->tx_syncing_txg < txg-1 && !txg_stalled(dp)) {
+ (void) cv_timedwait_hires(&tx->tx_quiesce_more_cv,
+ &tx->tx_sync_lock, delay, resolution, 0);
+ }
mutex_exit(&tx->tx_sync_lock);
}
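
txg_delay() now takes the delay and the wakeup resolution in nanoseconds (hrtime_t) rather than clock ticks. A hypothetical caller sketch -- the 10 ms / 1 ms values are illustrative only:

	/* throttle the open txg by ~10 ms, waking at 1 ms resolution */
	txg_delay(dp, tx->tx_txg, MSEC2NSEC(10), MSEC2NSEC(1));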
@@ -646,6 +656,28 @@
mutex_exit(&tx->tx_sync_lock);
}
+/*
+ * If there isn't a txg syncing or in the pipeline, push another txg through
+ * the pipeline by quiescing the open txg.
+ */
+void
+txg_kick(dsl_pool_t *dp)
+{
+ tx_state_t *tx = &dp->dp_tx;
+
+ ASSERT(!dsl_pool_config_held(dp));
+
+ mutex_enter(&tx->tx_sync_lock);
+ if (tx->tx_syncing_txg == 0 &&
+ tx->tx_quiesce_txg_waiting <= tx->tx_open_txg &&
+ tx->tx_sync_txg_waiting <= tx->tx_synced_txg &&
+ tx->tx_quiesced_txg <= tx->tx_synced_txg) {
+ tx->tx_quiesce_txg_waiting = tx->tx_open_txg + 1;
+ cv_broadcast(&tx->tx_quiesce_more_cv);
+ }
+ mutex_exit(&tx->tx_sync_lock);
+}
+
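
txg_kick() lets writers start a sync without waiting out zfs_txg_timeout. A hypothetical caller sketch, reusing the zfs_dirty_data_sync threshold that the sync-thread hunk above now checks (whether the pool code calls it exactly like this is not shown here):

	/* nudge the pipeline once enough dirty data has piled up */
	if (dp->dp_dirty_total >= zfs_dirty_data_sync)
		txg_kick(dp);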
boolean_t
txg_stalled(dsl_pool_t *dp)
{
@@ -696,6 +728,24 @@
}
/*
+ * Returns true if all txg lists are empty.
+ *
+ * Warning: this is inherently racy (an item could be added immediately after this
+ * function returns). We don't bother with the lock because it wouldn't change the
+ * semantics.
+ */
+boolean_t
+txg_all_lists_empty(txg_list_t *tl)
+{
+ for (int i = 0; i < TXG_SIZE; i++) {
+ if (!txg_list_empty(tl, i)) {
+ return (B_FALSE);
+ }
+ }
+ return (B_TRUE);
+}
+
+/*
* Add an entry to the list (unless it's already on the list).
* Returns B_TRUE if it was actually added.
*/
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/uberblock.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/uberblock.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/uberblock.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,7 +21,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -41,10 +41,10 @@
}
/*
- * Update the uberblock and return a boolean value indicating whether
- * anything changed in this transaction group.
+ * Update the uberblock and return TRUE if anything changed in this
+ * transaction group.
*/
-int
+boolean_t
uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg)
{
ASSERT(ub->ub_txg < txg);
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -22,9 +22,10 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
* Copyright 2013 Martin Matuska <mm at FreeBSD.org>. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
*/
#include <sys/zfs_context.h>
@@ -38,6 +39,7 @@
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/space_map.h>
+#include <sys/space_reftree.h>
#include <sys/zio.h>
#include <sys/zap.h>
#include <sys/fs/zfs.h>
@@ -53,6 +55,90 @@
* Virtual device management.
*/
+/*
+ * The limit for ZFS to automatically increase a top-level vdev's ashift
+ * from logical ashift to physical ashift.
+ *
+ * Example: one or more 512B emulation child vdevs
+ * child->vdev_ashift = 9 (512 bytes)
+ * child->vdev_physical_ashift = 12 (4096 bytes)
+ * zfs_max_auto_ashift = 11 (2048 bytes)
+ * zfs_min_auto_ashift = 9 (512 bytes)
+ *
+ * On pool creation or the addition of a new top-level vdev, ZFS will
+ * increase the ashift of the top-level vdev to 2048 as limited by
+ * zfs_max_auto_ashift.
+ *
+ * Example: one or more 512B emulation child vdevs
+ * child->vdev_ashift = 9 (512 bytes)
+ * child->vdev_physical_ashift = 12 (4096 bytes)
+ * zfs_max_auto_ashift = 13 (8192 bytes)
+ * zfs_min_auto_ashift = 9 (512 bytes)
+ *
+ * On pool creation or the addition of a new top-level vdev, ZFS will
+ * increase the ashift of the top-level vdev to 4096 to match the
+ * max vdev_physical_ashift.
+ *
+ * Example: one or more 512B emulation child vdevs
+ * child->vdev_ashift = 9 (512 bytes)
+ * child->vdev_physical_ashift = 9 (512 bytes)
+ * zfs_max_auto_ashift = 13 (8192 bytes)
+ * zfs_min_auto_ashift = 12 (4096 bytes)
+ *
+ * On pool creation or the addition of a new top-level vdev, ZFS will
+ * increase the ashift of the top-level vdev to 4096 to match the
+ * zfs_min_auto_ashift.
+ */
+static uint64_t zfs_max_auto_ashift = SPA_MAXASHIFT;
+static uint64_t zfs_min_auto_ashift = SPA_MINASHIFT;
+
+static int
+sysctl_vfs_zfs_max_auto_ashift(SYSCTL_HANDLER_ARGS)
+{
+ uint64_t val;
+ int err;
+
+ val = zfs_max_auto_ashift;
+ err = sysctl_handle_64(oidp, &val, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+
+ if (val > SPA_MAXASHIFT || val < zfs_min_auto_ashift)
+ return (EINVAL);
+
+ zfs_max_auto_ashift = val;
+
+ return (0);
+}
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift,
+ CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
+ sysctl_vfs_zfs_max_auto_ashift, "QU",
+ "Max ashift used when optimising for logical -> physical sectors size on "
+ "new top-level vdevs.");
+
+static int
+sysctl_vfs_zfs_min_auto_ashift(SYSCTL_HANDLER_ARGS)
+{
+ uint64_t val;
+ int err;
+
+ val = zfs_min_auto_ashift;
+ err = sysctl_handle_64(oidp, &val, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+
+ if (val < SPA_MINASHIFT || val > zfs_max_auto_ashift)
+ return (EINVAL);
+
+ zfs_min_auto_ashift = val;
+
+ return (0);
+}
+SYSCTL_PROC(_vfs_zfs, OID_AUTO, min_auto_ashift,
+ CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t),
+ sysctl_vfs_zfs_min_auto_ashift, "QU",
+ "Min ashift used when creating new top-level vdevs.");
+
static vdev_ops_t *vdev_ops_table[] = {
&vdev_root_ops,
&vdev_raidz_ops,
@@ -72,6 +158,15 @@
/*
+ * When a vdev is added, it will be divided into approximately (but no
+ * more than) this number of metaslabs.
+ */
+int metaslabs_per_vdev = 200;
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, metaslabs_per_vdev, CTLFLAG_RDTUN,
+ &metaslabs_per_vdev, 0,
+ "When a vdev is added, how many metaslabs the vdev should be divided into");
+
+/*
* Given a vdev type, return the appropriate ops vector.
*/
static vdev_ops_t *
@@ -179,6 +274,26 @@
return (NULL);
}
+static int
+vdev_count_leaves_impl(vdev_t *vd)
+{
+ int n = 0;
+
+ if (vd->vdev_ops->vdev_op_leaf)
+ return (1);
+
+ for (int c = 0; c < vd->vdev_children; c++)
+ n += vdev_count_leaves_impl(vd->vdev_child[c]);
+
+ return (n);
+}
+
+int
+vdev_count_leaves(spa_t *spa)
+{
+ return (vdev_count_leaves_impl(spa->spa_root_vdev));
+}
+
void
vdev_add_child(vdev_t *pvd, vdev_t *cvd)
{
@@ -185,8 +300,9 @@
size_t oldsize, newsize;
uint64_t id = cvd->vdev_id;
vdev_t **newchild;
+ spa_t *spa = cvd->vdev_spa;
- ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
ASSERT(cvd->vdev_parent == NULL);
cvd->vdev_parent = pvd;
@@ -326,8 +442,9 @@
mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL);
for (int t = 0; t < DTL_TYPES; t++) {
- space_map_create(&vd->vdev_dtl[t], 0, -1ULL, 0,
+ vd->vdev_dtl[t] = range_tree_create(NULL, NULL,
&vd->vdev_dtl_lock);
}
txg_list_create(&vd->vdev_ms_list,
@@ -513,7 +630,7 @@
alloctype == VDEV_ALLOC_ROOTPOOL)) {
if (alloctype == VDEV_ALLOC_LOAD) {
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL,
- &vd->vdev_dtl_smo.smo_object);
+ &vd->vdev_dtl_object);
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE,
&vd->vdev_unspare);
}
@@ -529,8 +646,8 @@
(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE,
&vd->vdev_offline);
- (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVERING,
- &vd->vdev_resilvering);
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
+ &vd->vdev_resilver_txg);
/*
* When importing a pool, we want to ignore the persistent fault
@@ -635,12 +752,14 @@
txg_list_destroy(&vd->vdev_dtl_list);
mutex_enter(&vd->vdev_dtl_lock);
+ space_map_close(vd->vdev_dtl_sm);
for (int t = 0; t < DTL_TYPES; t++) {
- space_map_unload(&vd->vdev_dtl[t]);
- space_map_destroy(&vd->vdev_dtl[t]);
+ range_tree_vacate(vd->vdev_dtl[t], NULL, NULL);
+ range_tree_destroy(vd->vdev_dtl[t]);
}
mutex_exit(&vd->vdev_dtl_lock);
+ mutex_destroy(&vd->vdev_queue_lock);
mutex_destroy(&vd->vdev_dtl_lock);
mutex_destroy(&vd->vdev_stat_lock);
mutex_destroy(&vd->vdev_probe_lock);
@@ -747,6 +866,8 @@
mvd->vdev_min_asize = cvd->vdev_min_asize;
mvd->vdev_max_asize = cvd->vdev_max_asize;
mvd->vdev_ashift = cvd->vdev_ashift;
+ mvd->vdev_logical_ashift = cvd->vdev_logical_ashift;
+ mvd->vdev_physical_ashift = cvd->vdev_physical_ashift;
mvd->vdev_state = cvd->vdev_state;
mvd->vdev_crtxg = cvd->vdev_crtxg;
@@ -778,6 +899,8 @@
mvd->vdev_ops == &vdev_replacing_ops ||
mvd->vdev_ops == &vdev_spare_ops);
cvd->vdev_ashift = mvd->vdev_ashift;
+ cvd->vdev_logical_ashift = mvd->vdev_logical_ashift;
+ cvd->vdev_physical_ashift = mvd->vdev_physical_ashift;
vdev_remove_child(mvd, cvd);
vdev_remove_child(pvd, mvd);
@@ -828,9 +951,9 @@
/*
* Compute the raidz-deflation ratio. Note, we hard-code
- * in 128k (1 << 17) because it is the current "typical" blocksize.
- * Even if SPA_MAXBLOCKSIZE changes, this algorithm must never change,
- * or we will inconsistently account for existing bp's.
+ * in 128k (1 << 17) because it is the "typical" blocksize.
+ * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change,
+ * otherwise it would inconsistently account for existing bp's.
*/
vd->vdev_deflate_ratio = (1 << 17) /
(vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT);
@@ -848,27 +971,20 @@
vd->vdev_ms_count = newc;
for (m = oldc; m < newc; m++) {
- space_map_obj_t smo = { 0, 0, 0 };
+ uint64_t object = 0;
+
if (txg == 0) {
- uint64_t object = 0;
error = dmu_read(mos, vd->vdev_ms_array,
m * sizeof (uint64_t), sizeof (uint64_t), &object,
DMU_READ_PREFETCH);
if (error)
return (error);
- if (object != 0) {
- dmu_buf_t *db;
- error = dmu_bonus_hold(mos, object, FTAG, &db);
- if (error)
- return (error);
- ASSERT3U(db->db_size, >=, sizeof (smo));
- bcopy(db->db_data, &smo, sizeof (smo));
- ASSERT3U(smo.smo_object, ==, object);
- dmu_buf_rele(db, FTAG);
- }
}
- vd->vdev_ms[m] = metaslab_init(vd->vdev_mg, &smo,
- m << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg);
+
+ error = metaslab_init(vd->vdev_mg, m, object, txg,
+ &(vd->vdev_ms[m]));
+ if (error)
+ return (error);
}
if (txg == 0)
@@ -896,9 +1012,12 @@
if (vd->vdev_ms != NULL) {
metaslab_group_passivate(vd->vdev_mg);
- for (m = 0; m < count; m++)
- if (vd->vdev_ms[m] != NULL)
- metaslab_fini(vd->vdev_ms[m]);
+ for (m = 0; m < count; m++) {
+ metaslab_t *msp = vd->vdev_ms[m];
+
+ if (msp != NULL)
+ metaslab_fini(msp);
+ }
kmem_free(vd->vdev_ms, count * sizeof (metaslab_t *));
vd->vdev_ms = NULL;
}
@@ -955,7 +1074,8 @@
vd->vdev_probe_zio = NULL;
mutex_exit(&vd->vdev_probe_lock);
- while ((pio = zio_walk_parents(zio)) != NULL)
+ zio_link_t *zl = NULL;
+ while ((pio = zio_walk_parents(zio, &zl)) != NULL)
if (!vdev_accessible(vd, pio))
pio->io_error = SET_ERROR(ENXIO);
@@ -1121,7 +1241,8 @@
uint64_t osize = 0;
uint64_t max_osize = 0;
uint64_t asize, max_asize, psize;
- uint64_t ashift = 0;
+ uint64_t logical_ashift = 0;
+ uint64_t physical_ashift = 0;
ASSERT(vd->vdev_open_thread == curthread ||
spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
@@ -1132,6 +1253,7 @@
vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
vd->vdev_cant_read = B_FALSE;
vd->vdev_cant_write = B_FALSE;
+ vd->vdev_notrim = B_FALSE;
vd->vdev_min_asize = vdev_get_min_asize(vd);
/*
@@ -1151,7 +1273,8 @@
return (SET_ERROR(ENXIO));
}
- error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, &ashift);
+ error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize,
+ &logical_ashift, &physical_ashift);
/*
* Reset the vdev_reopening flag so that we actually close
@@ -1200,10 +1323,8 @@
if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops)
return (0);
- if (vd->vdev_ops->vdev_op_leaf) {
- vd->vdev_notrim = B_FALSE;
+ if (zfs_trim_enabled && !vd->vdev_notrim && vd->vdev_ops->vdev_op_leaf)
trim_map_create(vd);
- }
for (int c = 0; c < vd->vdev_children; c++) {
if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
@@ -1249,6 +1370,17 @@
return (SET_ERROR(EINVAL));
}
+ vd->vdev_physical_ashift =
+ MAX(physical_ashift, vd->vdev_physical_ashift);
+ vd->vdev_logical_ashift = MAX(logical_ashift, vd->vdev_logical_ashift);
+ vd->vdev_ashift = MAX(vd->vdev_logical_ashift, vd->vdev_ashift);
+
+ if (vd->vdev_logical_ashift > SPA_MAXASHIFT) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_ASHIFT_TOO_BIG);
+ return (EINVAL);
+ }
+
if (vd->vdev_asize == 0) {
/*
* This is the first-ever open, so use the computed values.
@@ -1256,12 +1388,12 @@
*/
vd->vdev_asize = asize;
vd->vdev_max_asize = max_asize;
- vd->vdev_ashift = MAX(ashift, vd->vdev_ashift);
} else {
/*
* Make sure the alignment requirement hasn't increased.
*/
- if (ashift > vd->vdev_top->vdev_ashift) {
+ if (vd->vdev_ashift > vd->vdev_top->vdev_ashift &&
+ vd->vdev_ops->vdev_op_leaf) {
vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_BAD_LABEL);
return (EINVAL);
@@ -1292,6 +1424,17 @@
}
/*
+ * Track the min and max ashift values for normal data devices.
+ */
+ if (vd->vdev_top == vd && vd->vdev_ashift != 0 &&
+ !vd->vdev_islog && vd->vdev_aux == NULL) {
+ if (vd->vdev_ashift > spa->spa_max_ashift)
+ spa->spa_max_ashift = vd->vdev_ashift;
+ if (vd->vdev_ashift < spa->spa_min_ashift)
+ spa->spa_min_ashift = vd->vdev_ashift;
+ }
+
+ /*
* If a leaf vdev has a DTL, and seems healthy, then kick off a
* resilver. But don't do this if we are doing a reopen for a scrub,
* since this would just restart the scrub we are already doing.
@@ -1553,9 +1696,10 @@
}
/*
- * Recursively initialize all labels.
+ * Recursively load DTLs and initialize all labels.
*/
- if ((error = vdev_label_init(vd, txg, isreplacing ?
+ if ((error = vdev_dtl_load(vd)) != 0 ||
+ (error = vdev_label_init(vd, txg, isreplacing ?
VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) {
vdev_close(vd);
return (error);
@@ -1568,13 +1712,41 @@
vdev_metaslab_set_size(vdev_t *vd)
{
/*
- * Aim for roughly 200 metaslabs per vdev.
+ * Aim for roughly metaslabs_per_vdev (default 200) metaslabs per vdev.
*/
- vd->vdev_ms_shift = highbit(vd->vdev_asize / 200);
+ vd->vdev_ms_shift = highbit64(vd->vdev_asize / metaslabs_per_vdev);
vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT);
}
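
With the default metaslabs_per_vdev of 200 the shift works out as follows (a worked example; the 2 TiB vdev size is illustrative):

	/*
	 * 2 TiB vdev: 2^41 / 200 ~= 1.1e10, which lies between 2^33 and
	 * 2^34, so highbit64() returns 34 -> 16 GiB metaslabs -> about
	 * 128 metaslabs for the whole vdev.
	 */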
+/*
+ * Maximize performance by inflating the configured ashift for top level
+ * vdevs to be as close to the physical ashift as possible while maintaining
+ * administrator defined limits and ensuring it doesn't go below the
+ * logical ashift.
+ */
void
+vdev_ashift_optimize(vdev_t *vd)
+{
+ if (vd == vd->vdev_top) {
+ if (vd->vdev_ashift < vd->vdev_physical_ashift) {
+ vd->vdev_ashift = MIN(
+ MAX(zfs_max_auto_ashift, vd->vdev_ashift),
+ MAX(zfs_min_auto_ashift, vd->vdev_physical_ashift));
+ } else {
+ /*
+ * Unusual case where logical ashift > physical ashift
+ * so we can't cap the calculated ashift based on max
+ * ashift as that would cause failures.
+ * We still check if we need to increase it to match
+ * the min ashift.
+ */
+ vd->vdev_ashift = MAX(zfs_min_auto_ashift,
+ vd->vdev_ashift);
+ }
+ }
+}
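
Plugging the second example from the comment block above into vdev_ashift_optimize() (a worked example, not new behaviour):

	/*
	 * vdev_ashift = 9, vdev_physical_ashift = 12,
	 * zfs_max_auto_ashift = 13, zfs_min_auto_ashift = 9:
	 *   MIN(MAX(13, 9), MAX(9, 12)) = MIN(13, 12) = 12
	 * so the top-level vdev is promoted to 4 KiB allocations.
	 */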
+
+void
vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
{
ASSERT(vd == vd->vdev_top);
@@ -1591,6 +1763,16 @@
(void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
}
+void
+vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg)
+{
+ for (int c = 0; c < vd->vdev_children; c++)
+ vdev_dirty_leaves(vd->vdev_child[c], flags, txg);
+
+ if (vd->vdev_ops->vdev_op_leaf)
+ vdev_dirty(vd->vdev_top, flags, vd, txg);
+}
+
/*
* DTLs.
*
@@ -1632,31 +1814,31 @@
void
vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
{
- space_map_t *sm = &vd->vdev_dtl[t];
+ range_tree_t *rt = vd->vdev_dtl[t];
ASSERT(t < DTL_TYPES);
ASSERT(vd != vd->vdev_spa->spa_root_vdev);
ASSERT(spa_writeable(vd->vdev_spa));
- mutex_enter(sm->sm_lock);
- if (!space_map_contains(sm, txg, size))
- space_map_add(sm, txg, size);
- mutex_exit(sm->sm_lock);
+ mutex_enter(rt->rt_lock);
+ if (!range_tree_contains(rt, txg, size))
+ range_tree_add(rt, txg, size);
+ mutex_exit(rt->rt_lock);
}
boolean_t
vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
{
- space_map_t *sm = &vd->vdev_dtl[t];
+ range_tree_t *rt = vd->vdev_dtl[t];
boolean_t dirty = B_FALSE;
ASSERT(t < DTL_TYPES);
ASSERT(vd != vd->vdev_spa->spa_root_vdev);
- mutex_enter(sm->sm_lock);
- if (sm->sm_space != 0)
- dirty = space_map_contains(sm, txg, size);
- mutex_exit(sm->sm_lock);
+ mutex_enter(rt->rt_lock);
+ if (range_tree_space(rt) != 0)
+ dirty = range_tree_contains(rt, txg, size);
+ mutex_exit(rt->rt_lock);
return (dirty);
}
@@ -1664,17 +1846,89 @@
boolean_t
vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
{
- space_map_t *sm = &vd->vdev_dtl[t];
+ range_tree_t *rt = vd->vdev_dtl[t];
boolean_t empty;
- mutex_enter(sm->sm_lock);
- empty = (sm->sm_space == 0);
- mutex_exit(sm->sm_lock);
+ mutex_enter(rt->rt_lock);
+ empty = (range_tree_space(rt) == 0);
+ mutex_exit(rt->rt_lock);
return (empty);
}
/*
+ * Returns the lowest txg in the DTL range.
+ */
+static uint64_t
+vdev_dtl_min(vdev_t *vd)
+{
+ range_seg_t *rs;
+
+ ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
+ ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
+ ASSERT0(vd->vdev_children);
+
+ rs = avl_first(&vd->vdev_dtl[DTL_MISSING]->rt_root);
+ return (rs->rs_start - 1);
+}
+
+/*
+ * Returns the highest txg in the DTL.
+ */
+static uint64_t
+vdev_dtl_max(vdev_t *vd)
+{
+ range_seg_t *rs;
+
+ ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
+ ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
+ ASSERT0(vd->vdev_children);
+
+ rs = avl_last(&vd->vdev_dtl[DTL_MISSING]->rt_root);
+ return (rs->rs_end);
+}
+
+/*
+ * Determine if a resilvering vdev should remove any DTL entries from
+ * its range. If the vdev was resilvering for the entire duration of the
+ * scan then it should excise that range from its DTLs. Otherwise, this
+ * vdev is considered partially resilvered and should leave its DTL
+ * entries intact. The comment in vdev_dtl_reassess() describes how we
+ * excise the DTLs.
+ */
+static boolean_t
+vdev_dtl_should_excise(vdev_t *vd)
+{
+ spa_t *spa = vd->vdev_spa;
+ dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
+
+ ASSERT0(scn->scn_phys.scn_errors);
+ ASSERT0(vd->vdev_children);
+
+ if (vd->vdev_state < VDEV_STATE_DEGRADED)
+ return (B_FALSE);
+
+ if (vd->vdev_resilver_txg == 0 ||
+ range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0)
+ return (B_TRUE);
+
+ /*
+ * When a resilver is initiated the scan will assign the scn_max_txg
+ * value to the highest txg value that exists in all DTLs. If this
+ * device's max DTL is not part of this scan (i.e. it is not in
+ * the range (scn_min_txg, scn_max_txg] then it is not eligible
+ * for excision.
+ */
+ if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) {
+ ASSERT3U(scn->scn_phys.scn_min_txg, <=, vdev_dtl_min(vd));
+ ASSERT3U(scn->scn_phys.scn_min_txg, <, vd->vdev_resilver_txg);
+ ASSERT3U(vd->vdev_resilver_txg, <=, scn->scn_phys.scn_max_txg);
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
+
+/*
* Reassess DTLs after a config change or scrub completion.
*/
void
@@ -1697,9 +1951,17 @@
dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
mutex_enter(&vd->vdev_dtl_lock);
+
+ /*
+ * If we've completed a scan cleanly then determine
+ * if this vdev should remove any DTLs. We only want to
+ * excise regions on vdevs that were available during
+ * the entire duration of this scan.
+ */
if (scrub_txg != 0 &&
(spa->spa_scrub_started ||
- (scn && scn->scn_phys.scn_errors == 0))) {
+ (scn != NULL && scn->scn_phys.scn_errors == 0)) &&
+ vdev_dtl_should_excise(vd)) {
/*
* We completed a scrub up to scrub_txg. If we
* did it without rebooting, then the scrub dtl
@@ -1717,27 +1979,40 @@
* positive refcnt -- either 1 or 2. We then convert
* the reference tree into the new DTL_MISSING map.
*/
- space_map_ref_create(&reftree);
- space_map_ref_add_map(&reftree,
- &vd->vdev_dtl[DTL_MISSING], 1);
- space_map_ref_add_seg(&reftree, 0, scrub_txg, -1);
- space_map_ref_add_map(&reftree,
- &vd->vdev_dtl[DTL_SCRUB], 2);
- space_map_ref_generate_map(&reftree,
- &vd->vdev_dtl[DTL_MISSING], 1);
- space_map_ref_destroy(&reftree);
+ space_reftree_create(&reftree);
+ space_reftree_add_map(&reftree,
+ vd->vdev_dtl[DTL_MISSING], 1);
+ space_reftree_add_seg(&reftree, 0, scrub_txg, -1);
+ space_reftree_add_map(&reftree,
+ vd->vdev_dtl[DTL_SCRUB], 2);
+ space_reftree_generate_map(&reftree,
+ vd->vdev_dtl[DTL_MISSING], 1);
+ space_reftree_destroy(&reftree);
}
- space_map_vacate(&vd->vdev_dtl[DTL_PARTIAL], NULL, NULL);
- space_map_walk(&vd->vdev_dtl[DTL_MISSING],
- space_map_add, &vd->vdev_dtl[DTL_PARTIAL]);
+ range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL);
+ range_tree_walk(vd->vdev_dtl[DTL_MISSING],
+ range_tree_add, vd->vdev_dtl[DTL_PARTIAL]);
if (scrub_done)
- space_map_vacate(&vd->vdev_dtl[DTL_SCRUB], NULL, NULL);
- space_map_vacate(&vd->vdev_dtl[DTL_OUTAGE], NULL, NULL);
+ range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL);
+ range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL);
if (!vdev_readable(vd))
- space_map_add(&vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL);
+ range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL);
else
- space_map_walk(&vd->vdev_dtl[DTL_MISSING],
- space_map_add, &vd->vdev_dtl[DTL_OUTAGE]);
+ range_tree_walk(vd->vdev_dtl[DTL_MISSING],
+ range_tree_add, vd->vdev_dtl[DTL_OUTAGE]);
+
+ /*
+ * If the vdev was resilvering and no longer has any
+ * DTLs then reset its resilvering flag and dirty
+ * the top level so that we persist the change.
+ */
+ if (vd->vdev_resilver_txg != 0 &&
+ range_tree_space(vd->vdev_dtl[DTL_MISSING]) == 0 &&
+ range_tree_space(vd->vdev_dtl[DTL_OUTAGE]) == 0) {
+ vd->vdev_resilver_txg = 0;
+ vdev_config_dirty(vd->vdev_top);
+ }
+
mutex_exit(&vd->vdev_dtl_lock);
if (txg != 0)
@@ -1757,47 +2032,56 @@
minref = vd->vdev_nparity + 1; /* RAID-Z */
else
minref = vd->vdev_children; /* any kind of mirror */
- space_map_ref_create(&reftree);
+ space_reftree_create(&reftree);
for (int c = 0; c < vd->vdev_children; c++) {
vdev_t *cvd = vd->vdev_child[c];
mutex_enter(&cvd->vdev_dtl_lock);
- space_map_ref_add_map(&reftree, &cvd->vdev_dtl[s], 1);
+ space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1);
mutex_exit(&cvd->vdev_dtl_lock);
}
- space_map_ref_generate_map(&reftree, &vd->vdev_dtl[t], minref);
- space_map_ref_destroy(&reftree);
+ space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref);
+ space_reftree_destroy(&reftree);
}
mutex_exit(&vd->vdev_dtl_lock);
}
-static int
+int
vdev_dtl_load(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
- space_map_obj_t *smo = &vd->vdev_dtl_smo;
objset_t *mos = spa->spa_meta_objset;
- dmu_buf_t *db;
- int error;
+ int error = 0;
- ASSERT(vd->vdev_children == 0);
+ if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) {
+ ASSERT(!vd->vdev_ishole);
- if (smo->smo_object == 0)
- return (0);
+ error = space_map_open(&vd->vdev_dtl_sm, mos,
+ vd->vdev_dtl_object, 0, -1ULL, 0, &vd->vdev_dtl_lock);
+ if (error)
+ return (error);
+ ASSERT(vd->vdev_dtl_sm != NULL);
- ASSERT(!vd->vdev_ishole);
+ mutex_enter(&vd->vdev_dtl_lock);
- if ((error = dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)) != 0)
+ /*
+ * Now that we've opened the space_map we need to update
+ * the in-core DTL.
+ */
+ space_map_update(vd->vdev_dtl_sm);
+
+ error = space_map_load(vd->vdev_dtl_sm,
+ vd->vdev_dtl[DTL_MISSING], SM_ALLOC);
+ mutex_exit(&vd->vdev_dtl_lock);
+
return (error);
+ }
- ASSERT3U(db->db_size, >=, sizeof (*smo));
- bcopy(db->db_data, smo, sizeof (*smo));
- dmu_buf_rele(db, FTAG);
+ for (int c = 0; c < vd->vdev_children; c++) {
+ error = vdev_dtl_load(vd->vdev_child[c]);
+ if (error != 0)
+ break;
+ }
- mutex_enter(&vd->vdev_dtl_lock);
- error = space_map_load(&vd->vdev_dtl[DTL_MISSING],
- NULL, SM_ALLOC, smo, mos);
- mutex_exit(&vd->vdev_dtl_lock);
-
return (error);
}
@@ -1805,66 +2089,75 @@
vdev_dtl_sync(vdev_t *vd, uint64_t txg)
{
spa_t *spa = vd->vdev_spa;
- space_map_obj_t *smo = &vd->vdev_dtl_smo;
- space_map_t *sm = &vd->vdev_dtl[DTL_MISSING];
+ range_tree_t *rt = vd->vdev_dtl[DTL_MISSING];
objset_t *mos = spa->spa_meta_objset;
- space_map_t smsync;
- kmutex_t smlock;
- dmu_buf_t *db;
+ range_tree_t *rtsync;
+ kmutex_t rtlock;
dmu_tx_t *tx;
+ uint64_t object = space_map_object(vd->vdev_dtl_sm);
ASSERT(!vd->vdev_ishole);
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
- if (vd->vdev_detached) {
- if (smo->smo_object != 0) {
- int err = dmu_object_free(mos, smo->smo_object, tx);
- ASSERT0(err);
- smo->smo_object = 0;
- }
+ if (vd->vdev_detached || vd->vdev_top->vdev_removing) {
+ mutex_enter(&vd->vdev_dtl_lock);
+ space_map_free(vd->vdev_dtl_sm, tx);
+ space_map_close(vd->vdev_dtl_sm);
+ vd->vdev_dtl_sm = NULL;
+ mutex_exit(&vd->vdev_dtl_lock);
dmu_tx_commit(tx);
return;
}
- if (smo->smo_object == 0) {
- ASSERT(smo->smo_objsize == 0);
- ASSERT(smo->smo_alloc == 0);
- smo->smo_object = dmu_object_alloc(mos,
- DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT,
- DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx);
- ASSERT(smo->smo_object != 0);
- vdev_config_dirty(vd->vdev_top);
+ if (vd->vdev_dtl_sm == NULL) {
+ uint64_t new_object;
+
+ new_object = space_map_alloc(mos, tx);
+ VERIFY3U(new_object, !=, 0);
+
+ VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object,
+ 0, -1ULL, 0, &vd->vdev_dtl_lock));
+ ASSERT(vd->vdev_dtl_sm != NULL);
}
- bzero(&smlock, sizeof (smlock));
- mutex_init(&smlock, NULL, MUTEX_DEFAULT, NULL);
+ bzero(&rtlock, sizeof(rtlock));
+ mutex_init(&rtlock, NULL, MUTEX_DEFAULT, NULL);
- space_map_create(&smsync, sm->sm_start, sm->sm_size, sm->sm_shift,
- &smlock);
+ rtsync = range_tree_create(NULL, NULL, &rtlock);
- mutex_enter(&smlock);
+ mutex_enter(&rtlock);
mutex_enter(&vd->vdev_dtl_lock);
- space_map_walk(sm, space_map_add, &smsync);
+ range_tree_walk(rt, range_tree_add, rtsync);
mutex_exit(&vd->vdev_dtl_lock);
- space_map_truncate(smo, mos, tx);
- space_map_sync(&smsync, SM_ALLOC, smo, mos, tx);
- space_map_vacate(&smsync, NULL, NULL);
+ space_map_truncate(vd->vdev_dtl_sm, tx);
+ space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, tx);
+ range_tree_vacate(rtsync, NULL, NULL);
- space_map_destroy(&smsync);
+ range_tree_destroy(rtsync);
- mutex_exit(&smlock);
- mutex_destroy(&smlock);
+ mutex_exit(&rtlock);
+ mutex_destroy(&rtlock);
- VERIFY(0 == dmu_bonus_hold(mos, smo->smo_object, FTAG, &db));
- dmu_buf_will_dirty(db, tx);
- ASSERT3U(db->db_size, >=, sizeof (*smo));
- bcopy(smo, db->db_data, sizeof (*smo));
- dmu_buf_rele(db, FTAG);
+ /*
+ * If the object for the space map has changed then dirty
+ * the top level so that we update the config.
+ */
+ if (object != space_map_object(vd->vdev_dtl_sm)) {
+ zfs_dbgmsg("txg %llu, spa %s, DTL old object %llu, "
+ "new object %llu", txg, spa_name(spa), object,
+ space_map_object(vd->vdev_dtl_sm));
+ vdev_config_dirty(vd->vdev_top);
+ }
dmu_tx_commit(tx);
+
+ mutex_enter(&vd->vdev_dtl_lock);
+ space_map_update(vd->vdev_dtl_sm);
+ mutex_exit(&vd->vdev_dtl_lock);
}
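
For readers following the DTL rework above: vdev_dtl_sync() now copies the in-core DTL into a private scratch range tree while holding vdev_dtl_lock, and only then writes that scratch copy to the on-disk space map, so the slow I/O never runs under the DTL lock. A minimal userland sketch of that snapshot-under-lock pattern, using pthreads in place of kmutex_t and a plain array in place of a range tree (all names here are illustrative, not the ZFS APIs):

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define	MAX_SEGS	64

/* Toy stand-ins: a "range tree" is just a locked array of segments. */
struct seg { uint64_t start, size; };
struct rtree {
	pthread_mutex_t lock;
	struct seg segs[MAX_SEGS];
	int nsegs;
};

/* Snapshot the shared tree under its lock, then persist without the lock. */
static void
dtl_sync(struct rtree *shared, FILE *space_map)
{
	struct seg snap[MAX_SEGS];
	int n;

	pthread_mutex_lock(&shared->lock);
	n = shared->nsegs;
	memcpy(snap, shared->segs, n * sizeof (struct seg));
	pthread_mutex_unlock(&shared->lock);

	/* The slow "space map write" happens on the private copy only. */
	for (int i = 0; i < n; i++)
		fprintf(space_map, "ALLOC %llu %llu\n",
		    (unsigned long long)snap[i].start,
		    (unsigned long long)snap[i].size);
}

int
main(void)
{
	struct rtree dtl = { .lock = PTHREAD_MUTEX_INITIALIZER,
	    .segs = { { 4096, 8192 } }, .nsegs = 1 };

	dtl_sync(&dtl, stdout);
	return (0);
}
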
/*
@@ -1913,14 +2206,11 @@
if (vd->vdev_children == 0) {
mutex_enter(&vd->vdev_dtl_lock);
- if (vd->vdev_dtl[DTL_MISSING].sm_space != 0 &&
+ if (range_tree_space(vd->vdev_dtl[DTL_MISSING]) != 0 &&
vdev_writeable(vd)) {
- space_seg_t *ss;
- ss = avl_first(&vd->vdev_dtl[DTL_MISSING].sm_root);
- thismin = ss->ss_start - 1;
- ss = avl_last(&vd->vdev_dtl[DTL_MISSING].sm_root);
- thismax = ss->ss_end;
+ thismin = vdev_dtl_min(vd);
+ thismax = vdev_dtl_max(vd);
needed = B_TRUE;
}
mutex_exit(&vd->vdev_dtl_lock);
@@ -2021,29 +2311,45 @@
tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
- if (vd->vdev_dtl_smo.smo_object) {
- ASSERT0(vd->vdev_dtl_smo.smo_alloc);
- (void) dmu_object_free(mos, vd->vdev_dtl_smo.smo_object, tx);
- vd->vdev_dtl_smo.smo_object = 0;
- }
+ if (vd->vdev_ms != NULL) {
+ metaslab_group_t *mg = vd->vdev_mg;
- if (vd->vdev_ms != NULL) {
+ metaslab_group_histogram_verify(mg);
+ metaslab_class_histogram_verify(mg->mg_class);
+
for (int m = 0; m < vd->vdev_ms_count; m++) {
metaslab_t *msp = vd->vdev_ms[m];
- if (msp == NULL || msp->ms_smo.smo_object == 0)
+ if (msp == NULL || msp->ms_sm == NULL)
continue;
- ASSERT0(msp->ms_smo.smo_alloc);
- (void) dmu_object_free(mos, msp->ms_smo.smo_object, tx);
- msp->ms_smo.smo_object = 0;
+ mutex_enter(&msp->ms_lock);
+ /*
+			 * If the metaslab was not loaded when the vdev
+			 * was removed, then the histogram accounting may
+			 * not be accurate. Update the histogram information
+			 * here to ensure that the metaslab group and
+			 * metaslab class are up-to-date.
+ */
+ metaslab_group_histogram_remove(mg, msp);
+
+ VERIFY0(space_map_allocated(msp->ms_sm));
+ space_map_free(msp->ms_sm, tx);
+ space_map_close(msp->ms_sm);
+ msp->ms_sm = NULL;
+ mutex_exit(&msp->ms_lock);
}
+
+ metaslab_group_histogram_verify(mg);
+ metaslab_class_histogram_verify(mg->mg_class);
+ for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
+ ASSERT0(mg->mg_histogram[i]);
+
}
if (vd->vdev_ms_array) {
(void) dmu_object_free(mos, vd->vdev_ms_array, tx);
vd->vdev_ms_array = 0;
- vd->vdev_ms_shift = 0;
}
dmu_tx_commit(tx);
}
@@ -2205,6 +2511,7 @@
vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
{
vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev;
+ boolean_t postevent = B_FALSE;
spa_vdev_state_enter(spa, SCL_NONE);
@@ -2214,6 +2521,10 @@
if (!vd->vdev_ops->vdev_op_leaf)
return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
+ postevent =
+ (vd->vdev_offline == B_TRUE || vd->vdev_tmpoffline == B_TRUE) ?
+ B_TRUE : B_FALSE;
+
tvd = vd->vdev_top;
vd->vdev_offline = B_FALSE;
vd->vdev_tmpoffline = B_FALSE;
@@ -2249,6 +2560,10 @@
return (spa_vdev_state_exit(spa, vd, ENOTSUP));
spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
}
+
+ if (postevent)
+ spa_event_notify(spa, vd, ESC_ZFS_VDEV_ONLINE);
+
return (spa_vdev_state_exit(spa, vd, 0));
}
@@ -2380,6 +2695,14 @@
for (int c = 0; c < vd->vdev_children; c++)
vdev_clear(spa, vd->vdev_child[c]);
+ if (vd == rvd) {
+ for (int c = 0; c < spa->spa_l2cache.sav_count; c++)
+ vdev_clear(spa, spa->spa_l2cache.sav_vdevs[c]);
+
+ for (int c = 0; c < spa->spa_spares.sav_count; c++)
+ vdev_clear(spa, spa->spa_spares.sav_vdevs[c]);
+ }
+
/*
* If we're in the FAULTED state or have experienced failed I/O, then
* clear the persistent state and attempt to reopen the device. We
@@ -2464,7 +2787,8 @@
* we're asking two separate questions about it.
*/
return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) &&
- !vd->vdev_cant_write && !vd->vdev_ishole);
+ !vd->vdev_cant_write && !vd->vdev_ishole &&
+ vd->vdev_mg->mg_initialized);
}
boolean_t
@@ -2490,8 +2814,12 @@
void
vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
{
- vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
+ spa_t *spa = vd->vdev_spa;
+ vdev_t *rvd = spa->spa_root_vdev;
+ vdev_t *tvd = vd->vdev_top;
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
+
mutex_enter(&vd->vdev_stat_lock);
bcopy(&vd->vdev_stat, vs, sizeof (*vs));
vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
@@ -2499,8 +2827,22 @@
vs->vs_rsize = vdev_get_min_asize(vd);
if (vd->vdev_ops->vdev_op_leaf)
vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE;
- vs->vs_esize = vd->vdev_max_asize - vd->vdev_asize;
- mutex_exit(&vd->vdev_stat_lock);
+ /*
+	 * Report expandable space on top-level, non-auxiliary devices only.
+ * The expandable space is reported in terms of metaslab sized units
+ * since that determines how much space the pool can expand.
+ */
+ if (vd->vdev_aux == NULL && tvd != NULL && vd->vdev_max_asize != 0) {
+ vs->vs_esize = P2ALIGN(vd->vdev_max_asize - vd->vdev_asize,
+ 1ULL << tvd->vdev_ms_shift);
+ }
+ vs->vs_configured_ashift = vd->vdev_top != NULL
+ ? vd->vdev_top->vdev_ashift : vd->vdev_ashift;
+ vs->vs_logical_ashift = vd->vdev_logical_ashift;
+ vs->vs_physical_ashift = vd->vdev_physical_ashift;
+ if (vd->vdev_aux == NULL && vd == vd->vdev_top && !vd->vdev_ishole) {
+ vs->vs_fragmentation = vd->vdev_mg->mg_fragmentation;
+ }
/*
* If we're getting stats on the root vdev, aggregate the I/O counts
@@ -2511,15 +2853,14 @@
vdev_t *cvd = rvd->vdev_child[c];
vdev_stat_t *cvs = &cvd->vdev_stat;
- mutex_enter(&vd->vdev_stat_lock);
for (int t = 0; t < ZIO_TYPES; t++) {
vs->vs_ops[t] += cvs->vs_ops[t];
vs->vs_bytes[t] += cvs->vs_bytes[t];
}
cvs->vs_scan_removing = cvd->vdev_removing;
- mutex_exit(&vd->vdev_stat_lock);
}
}
+ mutex_exit(&vd->vdev_stat_lock);
}
void
@@ -3077,7 +3418,7 @@
boolean_t
vdev_is_bootable(vdev_t *vd)
{
-#ifdef sun
+#ifdef illumos
if (!vd->vdev_ops->vdev_op_leaf) {
char *vdev_type = vd->vdev_ops->vdev_op_type;
@@ -3088,8 +3429,6 @@
strcmp(vdev_type, VDEV_TYPE_MISSING) == 0) {
return (B_FALSE);
}
- } else if (vd->vdev_wholedisk == 1) {
- return (B_FALSE);
}
for (int c = 0; c < vd->vdev_children; c++) {
@@ -3096,7 +3435,7 @@
if (!vdev_is_bootable(vd->vdev_child[c]))
return (B_FALSE);
}
-#endif /* sun */
+#endif /* illumos */
return (B_TRUE);
}
@@ -3195,7 +3534,7 @@
vdev_queue_t *vq = &vd->vdev_queue;
mutex_enter(&vq->vq_lock);
- if (avl_numnodes(&vq->vq_pending_tree) > 0) {
+ if (avl_numnodes(&vq->vq_active_tree) > 0) {
spa_t *spa = vd->vdev_spa;
zio_t *fio;
uint64_t delta;
@@ -3205,7 +3544,7 @@
* if any I/O has been outstanding for longer than
* the spa_deadman_synctime we panic the system.
*/
- fio = avl_first(&vq->vq_pending_tree);
+ fio = avl_first(&vq->vq_active_tree);
delta = gethrtime() - fio->io_timestamp;
if (delta > spa_deadman_synctime(spa)) {
zfs_dbgmsg("SLOW IO: zio timestamp %lluns, "
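
The deadman hunk above only examines the oldest entry of vq_active_tree: if the first outstanding I/O has been in flight longer than spa_deadman_synctime(), the pool is declared hung. A simplified sketch of that age check, assuming a monotonic clock in place of gethrtime() and made-up field names:

#include <stdint.h>
#include <stdio.h>
#include <time.h>

struct io { uint64_t timestamp_ns; };

static uint64_t
now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ((uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec);
}

/* Returns 1 if the oldest outstanding I/O exceeds the deadman limit. */
static int
deadman_fired(const struct io *oldest, uint64_t limit_ns)
{
	uint64_t delta = now_ns() - oldest->timestamp_ns;

	if (delta > limit_ns) {
		fprintf(stderr, "SLOW IO: outstanding for %llu ns\n",
		    (unsigned long long)delta);
		return (1);
	}
	return (0);
}

int
main(void)
{
	struct io oldest = { .timestamp_ns = now_ns() - 2000000000ULL };

	return (deadman_fired(&oldest, 1000000000ULL) ? 0 : 1);
}
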
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -24,7 +24,7 @@
* Use is subject to license terms.
*/
/*
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -115,7 +115,7 @@
{ "misses", KSTAT_DATA_UINT64 }
};
-#define VDCSTAT_BUMP(stat) atomic_add_64(&vdc_stats.stat.value.ui64, 1);
+#define VDCSTAT_BUMP(stat) atomic_inc_64(&vdc_stats.stat.value.ui64);
static int
vdev_cache_offset_compare(const void *a1, const void *a2)
@@ -251,7 +251,8 @@
* any reads that were queued up before the missed update are still
* valid, so we can satisfy them from this line before we evict it.
*/
- while ((pio = zio_walk_parents(fio)) != NULL)
+ zio_link_t *zl = NULL;
+ while ((pio = zio_walk_parents(fio, &zl)) != NULL)
vdev_cache_hit(vc, ve, pio);
if (fio->io_error || ve->ve_missed_update)
@@ -261,9 +262,9 @@
}
/*
- * Read data from the cache. Returns 0 on cache hit, errno on a miss.
+ * Read data from the cache. Returns B_TRUE on a cache hit, B_FALSE on a miss.
*/
-int
+boolean_t
vdev_cache_read(zio_t *zio)
{
vdev_cache_t *vc = &zio->io_vd->vdev_cache;
@@ -275,16 +276,16 @@
ASSERT(zio->io_type == ZIO_TYPE_READ);
if (zio->io_flags & ZIO_FLAG_DONT_CACHE)
- return (SET_ERROR(EINVAL));
+ return (B_FALSE);
if (zio->io_size > zfs_vdev_cache_max)
- return (SET_ERROR(EOVERFLOW));
+ return (B_FALSE);
/*
* If the I/O straddles two or more cache blocks, don't cache it.
*/
if (P2BOUNDARY(zio->io_offset, zio->io_size, VCBS))
- return (SET_ERROR(EXDEV));
+ return (B_FALSE);
ASSERT(cache_phase + zio->io_size <= VCBS);
@@ -296,7 +297,7 @@
if (ve != NULL) {
if (ve->ve_missed_update) {
mutex_exit(&vc->vc_lock);
- return (SET_ERROR(ESTALE));
+ return (B_FALSE);
}
if ((fio = ve->ve_fill_io) != NULL) {
@@ -304,7 +305,7 @@
zio_add_child(zio, fio);
mutex_exit(&vc->vc_lock);
VDCSTAT_BUMP(vdc_stat_delegations);
- return (0);
+ return (B_TRUE);
}
vdev_cache_hit(vc, ve, zio);
@@ -312,7 +313,7 @@
mutex_exit(&vc->vc_lock);
VDCSTAT_BUMP(vdc_stat_hits);
- return (0);
+ return (B_TRUE);
}
ve = vdev_cache_allocate(zio);
@@ -319,11 +320,11 @@
if (ve == NULL) {
mutex_exit(&vc->vc_lock);
- return (SET_ERROR(ENOMEM));
+ return (B_FALSE);
}
fio = zio_vdev_delegated_io(zio->io_vd, cache_offset,
- ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_CACHE_FILL,
+ ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_NOW,
ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve);
ve->ve_fill_io = fio;
@@ -334,7 +335,7 @@
zio_nowait(fio);
VDCSTAT_BUMP(vdc_stat_misses);
- return (0);
+ return (B_TRUE);
}
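
With vdev_cache_read() now returning boolean_t, B_TRUE means the cache took ownership of the request (a hit, a delegation to an in-flight fill, or a new fill), while B_FALSE means the caller must issue the device read itself. A hedged sketch of that caller pattern, with try_cache() and issue_device_read() as hypothetical stand-ins rather than the real zio pipeline:

#include <stdbool.h>
#include <stdio.h>

/* Toy cache: pretend even-numbered blocks are handled by the cache. */
static bool
try_cache(int blk)
{
	return (blk % 2 == 0);
}

static void
issue_device_read(int blk)
{
	printf("device read for block %d\n", blk);
}

static void
read_block(int blk)
{
	if (try_cache(blk))
		return;			/* cache took ownership of the I/O */
	issue_device_read(blk);		/* cache declined: read the vdev */
}

int
main(void)
{
	for (int blk = 0; blk < 4; blk++)
		read_block(blk);
	return (0);
}
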
/*
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,7 +21,9 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2013 Joyent, Inc. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -41,12 +43,147 @@
extern ldi_ident_t zfs_li;
-typedef struct vdev_disk_buf {
- buf_t vdb_buf;
- zio_t *vdb_io;
-} vdev_disk_buf_t;
+static void vdev_disk_close(vdev_t *);
+typedef struct vdev_disk_ldi_cb {
+ list_node_t lcb_next;
+ ldi_callback_id_t lcb_id;
+} vdev_disk_ldi_cb_t;
+
static void
+vdev_disk_alloc(vdev_t *vd)
+{
+ vdev_disk_t *dvd;
+
+ dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
+ /*
+ * Create the LDI event callback list.
+ */
+ list_create(&dvd->vd_ldi_cbs, sizeof (vdev_disk_ldi_cb_t),
+ offsetof(vdev_disk_ldi_cb_t, lcb_next));
+}
+
+static void
+vdev_disk_free(vdev_t *vd)
+{
+ vdev_disk_t *dvd = vd->vdev_tsd;
+ vdev_disk_ldi_cb_t *lcb;
+
+ if (dvd == NULL)
+ return;
+
+ /*
+ * We have already closed the LDI handle. Clean up the LDI event
+ * callbacks and free vd->vdev_tsd.
+ */
+ while ((lcb = list_head(&dvd->vd_ldi_cbs)) != NULL) {
+ list_remove(&dvd->vd_ldi_cbs, lcb);
+ (void) ldi_ev_remove_callbacks(lcb->lcb_id);
+ kmem_free(lcb, sizeof (vdev_disk_ldi_cb_t));
+ }
+ list_destroy(&dvd->vd_ldi_cbs);
+ kmem_free(dvd, sizeof (vdev_disk_t));
+ vd->vdev_tsd = NULL;
+}
+
+/* ARGSUSED */
+static int
+vdev_disk_off_notify(ldi_handle_t lh, ldi_ev_cookie_t ecookie, void *arg,
+ void *ev_data)
+{
+ vdev_t *vd = (vdev_t *)arg;
+ vdev_disk_t *dvd = vd->vdev_tsd;
+
+ /*
+ * Ignore events other than offline.
+ */
+ if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_OFFLINE) != 0)
+ return (LDI_EV_SUCCESS);
+
+ /*
+ * All LDI handles must be closed for the state change to succeed, so
+ * call on vdev_disk_close() to do this.
+ *
+ * We inform vdev_disk_close that it is being called from offline
+ * notify context so it will defer cleanup of LDI event callbacks and
+ * freeing of vd->vdev_tsd to the offline finalize or a reopen.
+ */
+ dvd->vd_ldi_offline = B_TRUE;
+ vdev_disk_close(vd);
+
+ /*
+ * Now that the device is closed, request that the spa_async_thread
+ * mark the device as REMOVED and notify FMA of the removal.
+ */
+ zfs_post_remove(vd->vdev_spa, vd);
+ vd->vdev_remove_wanted = B_TRUE;
+ spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);
+
+ return (LDI_EV_SUCCESS);
+}
+
+/* ARGSUSED */
+static void
+vdev_disk_off_finalize(ldi_handle_t lh, ldi_ev_cookie_t ecookie,
+ int ldi_result, void *arg, void *ev_data)
+{
+ vdev_t *vd = (vdev_t *)arg;
+
+ /*
+ * Ignore events other than offline.
+ */
+ if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_OFFLINE) != 0)
+ return;
+
+ /*
+ * We have already closed the LDI handle in notify.
+ * Clean up the LDI event callbacks and free vd->vdev_tsd.
+ */
+ vdev_disk_free(vd);
+
+ /*
+ * Request that the vdev be reopened if the offline state change was
+ * unsuccessful.
+ */
+ if (ldi_result != LDI_EV_SUCCESS) {
+ vd->vdev_probe_wanted = B_TRUE;
+ spa_async_request(vd->vdev_spa, SPA_ASYNC_PROBE);
+ }
+}
+
+static ldi_ev_callback_t vdev_disk_off_callb = {
+ .cb_vers = LDI_EV_CB_VERS,
+ .cb_notify = vdev_disk_off_notify,
+ .cb_finalize = vdev_disk_off_finalize
+};
+
+/* ARGSUSED */
+static void
+vdev_disk_dgrd_finalize(ldi_handle_t lh, ldi_ev_cookie_t ecookie,
+ int ldi_result, void *arg, void *ev_data)
+{
+ vdev_t *vd = (vdev_t *)arg;
+
+ /*
+ * Ignore events other than degrade.
+ */
+ if (strcmp(ldi_ev_get_type(ecookie), LDI_EV_DEGRADE) != 0)
+ return;
+
+ /*
+ * Degrade events always succeed. Mark the vdev as degraded.
+ * This status is purely informative for the user.
+ */
+ (void) vdev_degrade(vd->vdev_spa, vd->vdev_guid, 0);
+}
+
+static ldi_ev_callback_t vdev_disk_dgrd_callb = {
+ .cb_vers = LDI_EV_CB_VERS,
+ .cb_notify = NULL,
+ .cb_finalize = vdev_disk_dgrd_finalize
+};
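
The two callback structures above follow the LDI event model: an optional notify hook runs before the state change and must release resources for it to succeed, and a finalize hook runs afterwards with the result, either completing the teardown or requesting recovery. A minimal sketch of that two-phase pattern with plain function pointers (the struct and names are illustrative, not the ldi_ev_callback_t API):

#include <stdio.h>

struct two_phase_cb {
	int	(*notify)(void *arg);		/* may be NULL */
	void	(*finalize)(int result, void *arg);
};

static int
offline_notify(void *arg)
{
	printf("closing handles for %s\n", (char *)arg);
	return (0);				/* 0: allow the state change */
}

static void
offline_finalize(int result, void *arg)
{
	if (result == 0)
		printf("offline of %s succeeded, freeing state\n",
		    (char *)arg);
	else
		printf("offline of %s failed, requesting reopen\n",
		    (char *)arg);
}

static void
run_event(const struct two_phase_cb *cb, void *arg)
{
	int result = 0;

	if (cb->notify != NULL)
		result = cb->notify(arg);
	cb->finalize(result, arg);
}

int
main(void)
{
	struct two_phase_cb cb = { offline_notify, offline_finalize };

	run_event(&cb, "vdev0");
	return (0);
}
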
+
+static void
vdev_disk_hold(vdev_t *vd)
{
ddi_devid_t devid;
@@ -105,48 +242,36 @@
}
}
-static uint64_t
-vdev_disk_get_space(vdev_t *vd, uint64_t capacity, uint_t blksz)
-{
- ASSERT(vd->vdev_wholedisk);
+/*
+ * We want to be loud in DEBUG kernels when DKIOCGMEDIAINFOEXT fails, or when
+ * even a fallback to DKIOCGMEDIAINFO fails.
+ */
+#ifdef DEBUG
+#define VDEV_DEBUG(...) cmn_err(CE_NOTE, __VA_ARGS__)
+#else
+#define VDEV_DEBUG(...) /* Nothing... */
+#endif
- vdev_disk_t *dvd = vd->vdev_tsd;
- dk_efi_t dk_ioc;
- efi_gpt_t *efi;
- uint64_t avail_space = 0;
- int efisize = EFI_LABEL_SIZE * 2;
-
- dk_ioc.dki_data = kmem_alloc(efisize, KM_SLEEP);
- dk_ioc.dki_lba = 1;
- dk_ioc.dki_length = efisize;
- dk_ioc.dki_data_64 = (uint64_t)(uintptr_t)dk_ioc.dki_data;
- efi = dk_ioc.dki_data;
-
- if (ldi_ioctl(dvd->vd_lh, DKIOCGETEFI, (intptr_t)&dk_ioc,
- FKIOCTL, kcred, NULL) == 0) {
- uint64_t efi_altern_lba = LE_64(efi->efi_gpt_AlternateLBA);
-
- zfs_dbgmsg("vdev %s, capacity %llu, altern lba %llu",
- vd->vdev_path, capacity, efi_altern_lba);
- if (capacity > efi_altern_lba)
- avail_space = (capacity - efi_altern_lba) * blksz;
- }
- kmem_free(dk_ioc.dki_data, efisize);
- return (avail_space);
-}
-
static int
vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
uint64_t *ashift)
{
spa_t *spa = vd->vdev_spa;
- vdev_disk_t *dvd;
- struct dk_minfo_ext dkmext;
+ vdev_disk_t *dvd = vd->vdev_tsd;
+ ldi_ev_cookie_t ecookie;
+ vdev_disk_ldi_cb_t *lcb;
+ union {
+ struct dk_minfo_ext ude;
+ struct dk_minfo ud;
+ } dks;
+ struct dk_minfo_ext *dkmext = &dks.ude;
+ struct dk_minfo *dkm = &dks.ud;
int error;
dev_t dev;
int otyp;
boolean_t validate_devid = B_FALSE;
ddi_devid_t devid;
+ uint64_t capacity = 0, blksz = 0, pbsize;
/*
* We must have a pathname, and it must be absolute.
@@ -160,13 +285,25 @@
* Reopen the device if it's not currently open. Otherwise,
* just update the physical size of the device.
*/
- if (vd->vdev_tsd != NULL) {
- ASSERT(vd->vdev_reopening);
- dvd = vd->vdev_tsd;
- goto skip_open;
+ if (dvd != NULL) {
+ if (dvd->vd_ldi_offline && dvd->vd_lh == NULL) {
+ /*
+ * If we are opening a device in its offline notify
+ * context, the LDI handle was just closed. Clean
+ * up the LDI event callbacks and free vd->vdev_tsd.
+ */
+ vdev_disk_free(vd);
+ } else {
+ ASSERT(vd->vdev_reopening);
+ goto skip_open;
+ }
}
- dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
+ /*
+ * Create vd->vdev_tsd.
+ */
+ vdev_disk_alloc(vd);
+ dvd = vd->vdev_tsd;
/*
* When opening a disk device, we want to preserve the user's original
@@ -199,23 +336,28 @@
if (vd->vdev_wholedisk == -1ULL) {
size_t len = strlen(vd->vdev_path) + 3;
char *buf = kmem_alloc(len, KM_SLEEP);
- ldi_handle_t lh;
(void) snprintf(buf, len, "%ss0", vd->vdev_path);
- if (ldi_open_by_name(buf, spa_mode(spa), kcred,
- &lh, zfs_li) == 0) {
+ error = ldi_open_by_name(buf, spa_mode(spa), kcred,
+ &dvd->vd_lh, zfs_li);
+ if (error == 0) {
spa_strfree(vd->vdev_path);
vd->vdev_path = buf;
vd->vdev_wholedisk = 1ULL;
- (void) ldi_close(lh, spa_mode(spa), kcred);
} else {
kmem_free(buf, len);
}
}
- error = ldi_open_by_name(vd->vdev_path, spa_mode(spa), kcred,
- &dvd->vd_lh, zfs_li);
+ /*
+ * If we have not yet opened the device, try to open it by the
+ * specified path.
+ */
+ if (error != 0) {
+ error = ldi_open_by_name(vd->vdev_path, spa_mode(spa),
+ kcred, &dvd->vd_lh, zfs_li);
+ }
/*
* Compare the devid to the stored value.
@@ -322,6 +464,27 @@
kmem_free(physpath, MAXPATHLEN);
}
+ /*
+ * Register callbacks for the LDI offline event.
+ */
+ if (ldi_ev_get_cookie(dvd->vd_lh, LDI_EV_OFFLINE, &ecookie) ==
+ LDI_EV_SUCCESS) {
+ lcb = kmem_zalloc(sizeof (vdev_disk_ldi_cb_t), KM_SLEEP);
+ list_insert_tail(&dvd->vd_ldi_cbs, lcb);
+ (void) ldi_ev_register_callbacks(dvd->vd_lh, ecookie,
+ &vdev_disk_off_callb, (void *) vd, &lcb->lcb_id);
+ }
+
+ /*
+ * Register callbacks for the LDI degrade event.
+ */
+ if (ldi_ev_get_cookie(dvd->vd_lh, LDI_EV_DEGRADE, &ecookie) ==
+ LDI_EV_SUCCESS) {
+ lcb = kmem_zalloc(sizeof (vdev_disk_ldi_cb_t), KM_SLEEP);
+ list_insert_tail(&dvd->vd_ldi_cbs, lcb);
+ (void) ldi_ev_register_callbacks(dvd->vd_lh, ecookie,
+ &vdev_disk_dgrd_callb, (void *) vd, &lcb->lcb_id);
+ }
skip_open:
/*
* Determine the actual size of the device.
@@ -331,33 +494,53 @@
return (SET_ERROR(EINVAL));
}
+ *max_psize = *psize;
+
/*
* Determine the device's minimum transfer size.
* If the ioctl isn't supported, assume DEV_BSIZE.
*/
- if (ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFOEXT, (intptr_t)&dkmext,
- FKIOCTL, kcred, NULL) != 0)
- dkmext.dki_pbsize = DEV_BSIZE;
+ if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFOEXT,
+ (intptr_t)dkmext, FKIOCTL, kcred, NULL)) == 0) {
+ capacity = dkmext->dki_capacity - 1;
+ blksz = dkmext->dki_lbsize;
+ pbsize = dkmext->dki_pbsize;
+ } else if ((error = ldi_ioctl(dvd->vd_lh, DKIOCGMEDIAINFO,
+ (intptr_t)dkm, FKIOCTL, kcred, NULL)) == 0) {
+ VDEV_DEBUG(
+ "vdev_disk_open(\"%s\"): fallback to DKIOCGMEDIAINFO\n",
+ vd->vdev_path);
+ capacity = dkm->dki_capacity - 1;
+ blksz = dkm->dki_lbsize;
+ pbsize = blksz;
+ } else {
+ VDEV_DEBUG("vdev_disk_open(\"%s\"): "
+ "both DKIOCGMEDIAINFO{,EXT} calls failed, %d\n",
+ vd->vdev_path, error);
+ pbsize = DEV_BSIZE;
+ }
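
The block above probes the disk with DKIOCGMEDIAINFOEXT first, falls back to DKIOCGMEDIAINFO, and only then settles for DEV_BSIZE with no capacity information. A small userland sketch of that preferred-probe/fallback/default chain, where probe_ext() and probe_basic() are hypothetical stand-ins for the two ioctls:

#include <stdint.h>
#include <stdio.h>

#define	DEV_BSIZE	512

/* Hypothetical probes: return 0 on success and fill in the out-params. */
static int
probe_ext(uint64_t *cap, uint64_t *lbs, uint64_t *pbs)
{
	(void)cap; (void)lbs; (void)pbs;
	return (-1);		/* pretend the EXT ioctl is unsupported */
}

static int
probe_basic(uint64_t *cap, uint64_t *lbs)
{
	*cap = 1000000;
	*lbs = 512;
	return (0);
}

int
main(void)
{
	uint64_t capacity = 0, blksz = 0, pbsize = 0;

	if (probe_ext(&capacity, &blksz, &pbsize) == 0) {
		/* best case: physical block size reported directly */
	} else if (probe_basic(&capacity, &blksz) == 0) {
		pbsize = blksz;		/* assume physical == logical */
	} else {
		pbsize = DEV_BSIZE;	/* no capacity information at all */
	}
	printf("capacity=%llu blksz=%llu pbsize=%llu\n",
	    (unsigned long long)capacity, (unsigned long long)blksz,
	    (unsigned long long)pbsize);
	return (0);
}
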
- *ashift = highbit(MAX(dkmext.dki_pbsize, SPA_MINBLOCKSIZE)) - 1;
+ *ashift = highbit64(MAX(pbsize, SPA_MINBLOCKSIZE)) - 1;
if (vd->vdev_wholedisk == 1) {
- uint64_t capacity = dkmext.dki_capacity - 1;
- uint64_t blksz = dkmext.dki_lbsize;
int wce = 1;
+ if (error == 0) {
+ /*
+ * If we have the capability to expand, we'd have
+ * found out via success from DKIOCGMEDIAINFO{,EXT}.
+ * Adjust max_psize upward accordingly since we know
+ * we own the whole disk now.
+ */
+ *max_psize = capacity * blksz;
+ }
+
/*
- * If we own the whole disk, try to enable disk write caching.
- * We ignore errors because it's OK if we can't do it.
+ * Since we own the whole disk, try to enable disk write
+ * caching. We ignore errors because it's OK if we can't do it.
*/
(void) ldi_ioctl(dvd->vd_lh, DKIOCSETWCE, (intptr_t)&wce,
FKIOCTL, kcred, NULL);
-
- *max_psize = *psize + vdev_disk_get_space(vd, capacity, blksz);
- zfs_dbgmsg("capacity change: vdev %s, psize %llu, "
- "max_psize %llu", vd->vdev_path, *psize, *max_psize);
- } else {
- *max_psize = *psize;
}
/*
@@ -377,24 +560,65 @@
if (vd->vdev_reopening || dvd == NULL)
return;
- if (dvd->vd_minor != NULL)
+ if (dvd->vd_minor != NULL) {
ddi_devid_str_free(dvd->vd_minor);
+ dvd->vd_minor = NULL;
+ }
- if (dvd->vd_devid != NULL)
+ if (dvd->vd_devid != NULL) {
ddi_devid_free(dvd->vd_devid);
+ dvd->vd_devid = NULL;
+ }
- if (dvd->vd_lh != NULL)
+ if (dvd->vd_lh != NULL) {
(void) ldi_close(dvd->vd_lh, spa_mode(vd->vdev_spa), kcred);
+ dvd->vd_lh = NULL;
+ }
vd->vdev_delayed_close = B_FALSE;
- kmem_free(dvd, sizeof (vdev_disk_t));
- vd->vdev_tsd = NULL;
+ /*
+ * If we closed the LDI handle due to an offline notify from LDI,
+ * don't free vd->vdev_tsd or unregister the callbacks here;
+ * the offline finalize callback or a reopen will take care of it.
+ */
+ if (dvd->vd_ldi_offline)
+ return;
+
+ vdev_disk_free(vd);
}
int
-vdev_disk_physio(ldi_handle_t vd_lh, caddr_t data, size_t size,
- uint64_t offset, int flags)
+vdev_disk_physio(vdev_t *vd, caddr_t data,
+ size_t size, uint64_t offset, int flags, boolean_t isdump)
{
+ vdev_disk_t *dvd = vd->vdev_tsd;
+
+ /*
+ * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
+ * Nothing to be done here but return failure.
+ */
+ if (dvd == NULL || (dvd->vd_ldi_offline && dvd->vd_lh == NULL))
+ return (EIO);
+
+ ASSERT(vd->vdev_ops == &vdev_disk_ops);
+
+ /*
+ * If in the context of an active crash dump, use the ldi_dump(9F)
+ * call instead of ldi_strategy(9F) as usual.
+ */
+ if (isdump) {
+ ASSERT3P(dvd, !=, NULL);
+ return (ldi_dump(dvd->vd_lh, data, lbtodb(offset),
+ lbtodb(size)));
+ }
+
+ return (vdev_disk_ldi_physio(dvd->vd_lh, data, size, offset, flags));
+}
+
+int
+vdev_disk_ldi_physio(ldi_handle_t vd_lh, caddr_t data,
+ size_t size, uint64_t offset, int flags)
+{
buf_t *bp;
int error = 0;
@@ -422,8 +646,8 @@
static void
vdev_disk_io_intr(buf_t *bp)
{
- vdev_disk_buf_t *vdb = (vdev_disk_buf_t *)bp;
- zio_t *zio = vdb->vdb_io;
+ vdev_buf_t *vb = (vdev_buf_t *)bp;
+ zio_t *zio = vb->vb_io;
/*
* The rest of the zio stack only deals with EIO, ECKSUM, and ENXIO.
@@ -430,14 +654,14 @@
* Rather than teach the rest of the stack about other error
* possibilities (EFAULT, etc), we normalize the error value here.
*/
- zio->io_error = (geterror(bp) != 0 ? EIO : 0);
+ zio->io_error = (geterror(bp) != 0 ? SET_ERROR(EIO) : 0);
if (zio->io_error == 0 && bp->b_resid != 0)
zio->io_error = SET_ERROR(EIO);
- kmem_free(vdb, sizeof (vdev_disk_buf_t));
+ kmem_free(vb, sizeof (vdev_buf_t));
- zio_interrupt(zio);
+ zio_delay_interrupt(zio);
}
static void
@@ -461,21 +685,32 @@
zio_interrupt(zio);
}
-static int
+static void
vdev_disk_io_start(zio_t *zio)
{
vdev_t *vd = zio->io_vd;
vdev_disk_t *dvd = vd->vdev_tsd;
- vdev_disk_buf_t *vdb;
+ vdev_buf_t *vb;
struct dk_callback *dkc;
buf_t *bp;
int error;
+ /*
+ * If the vdev is closed, it's likely in the REMOVED or FAULTED state.
+ * Nothing to be done here but return failure.
+ */
+ if (dvd == NULL || (dvd->vd_ldi_offline && dvd->vd_lh == NULL)) {
+ zio->io_error = SET_ERROR(ENXIO);
+ zio_interrupt(zio);
+ return;
+ }
+
if (zio->io_type == ZIO_TYPE_IOCTL) {
/* XXPOLICY */
if (!vdev_readable(vd)) {
zio->io_error = SET_ERROR(ENXIO);
- return (ZIO_PIPELINE_CONTINUE);
+ zio_interrupt(zio);
+ return;
}
switch (zio->io_cmd) {
@@ -506,7 +741,7 @@
* and will call vdev_disk_ioctl_done()
* upon completion.
*/
- return (ZIO_PIPELINE_STOP);
+ return;
}
if (error == ENOTSUP || error == ENOTTY) {
@@ -527,14 +762,18 @@
zio->io_error = SET_ERROR(ENOTSUP);
}
- return (ZIO_PIPELINE_CONTINUE);
+ zio_execute(zio);
+ return;
}
- vdb = kmem_alloc(sizeof (vdev_disk_buf_t), KM_SLEEP);
+ ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
+ zio->io_target_timestamp = zio_handle_io_delay(zio);
- vdb->vdb_io = zio;
- bp = &vdb->vdb_buf;
+ vb = kmem_alloc(sizeof (vdev_buf_t), KM_SLEEP);
+ vb->vb_io = zio;
+ bp = &vb->vb_buf;
+
bioinit(bp);
bp->b_flags = B_BUSY | B_NOCACHE |
(zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE);
@@ -548,8 +787,6 @@
/* ldi_strategy() will return non-zero only on programming errors */
VERIFY(ldi_strategy(dvd->vd_lh, bp) == 0);
-
- return (ZIO_PIPELINE_STOP);
}
static void
@@ -641,7 +878,7 @@
/* read vdev label */
offset = vdev_label_offset(size, l, 0);
- if (vdev_disk_physio(vd_lh, (caddr_t)label,
+ if (vdev_disk_ldi_physio(vd_lh, (caddr_t)label,
VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, B_READ) != 0)
continue;
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,7 +21,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -36,6 +36,21 @@
* Virtual device vector for files.
*/
+static taskq_t *vdev_file_taskq;
+
+void
+vdev_file_init(void)
+{
+ vdev_file_taskq = taskq_create("z_vdev_file", MAX(max_ncpus, 16),
+ minclsyspri, max_ncpus, INT_MAX, 0);
+}
+
+void
+vdev_file_fini(void)
+{
+ taskq_destroy(vdev_file_taskq);
+}
+
static void
vdev_file_hold(vdev_t *vd)
{
@@ -50,12 +65,12 @@
static int
vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
- uint64_t *ashift)
+ uint64_t *logical_ashift, uint64_t *physical_ashift)
{
vdev_file_t *vf;
vnode_t *vp;
vattr_t vattr;
- int error, vfslocked;
+ int error;
/*
* We must have a pathname, and it must be absolute.
@@ -119,11 +134,9 @@
* Determine the physical size of the file.
*/
vattr.va_mask = AT_SIZE;
- vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vn_lock(vp, LK_SHARED | LK_RETRY);
error = VOP_GETATTR(vp, &vattr, kcred);
VOP_UNLOCK(vp, 0);
- VFS_UNLOCK_GIANT(vfslocked);
if (error) {
(void) VOP_CLOSE(vp, spa_mode(vd->vdev_spa), 1, 0, kcred, NULL);
vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
@@ -132,8 +145,11 @@
return (error);
}
+ vd->vdev_notrim = B_TRUE;
+
*max_psize = *psize = vattr.va_size;
- *ashift = SPA_MINBLOCKSHIFT;
+ *logical_ashift = SPA_MINBLOCKSHIFT;
+ *physical_ashift = SPA_MINBLOCKSHIFT;
return (0);
}
@@ -156,26 +172,58 @@
vd->vdev_tsd = NULL;
}
-static int
-vdev_file_io_start(zio_t *zio)
+/*
+ * Implements the interrupt side for file vdev types. This routine will be
+ * called when the I/O completes, allowing us to transfer the I/O to the
+ * interrupt taskqs. For consistency, the code structure mimics disk vdev
+ * types.
+ */
+static void
+vdev_file_io_intr(zio_t *zio)
{
+ zio_delay_interrupt(zio);
+}
+
+static void
+vdev_file_io_strategy(void *arg)
+{
+ zio_t *zio = arg;
vdev_t *vd = zio->io_vd;
vdev_file_t *vf;
vnode_t *vp;
ssize_t resid;
- if (!vdev_readable(vd)) {
- zio->io_error = SET_ERROR(ENXIO);
- return (ZIO_PIPELINE_CONTINUE);
- }
-
vf = vd->vdev_tsd;
vp = vf->vf_vnode;
+ ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
+ zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ?
+ UIO_READ : UIO_WRITE, vp, zio->io_data, zio->io_size,
+ zio->io_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
+
+ if (resid != 0 && zio->io_error == 0)
+ zio->io_error = ENOSPC;
+
+ vdev_file_io_intr(zio);
+}
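
vdev_file_io_start() (just below) now hands the actual read/write to a taskq, so vdev_file_io_strategy() runs in a worker thread, performs vn_rdwr(), and funnels completion through vdev_file_io_intr(). A rough userland analogue of dispatching the strategy to a worker, with a pthread standing in for the taskq and a toy do_strategy() instead of vn_rdwr():

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct io_req {
	const char *desc;
	int error;
};

/* Completion path, analogous to the interrupt side. */
static void
io_done(struct io_req *req)
{
	printf("%s completed, error=%d\n", req->desc, req->error);
	free(req);
}

/* Runs in the worker thread, like the dispatched strategy routine. */
static void *
do_strategy(void *arg)
{
	struct io_req *req = arg;

	req->error = 0;		/* a real version would pread()/pwrite() here */
	io_done(req);
	return (NULL);
}

int
main(void)
{
	pthread_t worker;
	struct io_req *req = malloc(sizeof (*req));

	if (req == NULL)
		return (1);
	req->desc = "read of block 0";
	if (pthread_create(&worker, NULL, do_strategy, req) != 0)
		return (1);	/* the "dispatch" */
	pthread_join(worker, NULL);
	return (0);
}
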
+
+static void
+vdev_file_io_start(zio_t *zio)
+{
+ vdev_t *vd = zio->io_vd;
+ vdev_file_t *vf = vd->vdev_tsd;
+
if (zio->io_type == ZIO_TYPE_IOCTL) {
+ /* XXPOLICY */
+ if (!vdev_readable(vd)) {
+ zio->io_error = SET_ERROR(ENXIO);
+ zio_interrupt(zio);
+ return;
+ }
+
switch (zio->io_cmd) {
case DKIOCFLUSHWRITECACHE:
- zio->io_error = VOP_FSYNC(vp, FSYNC | FDSYNC,
+ zio->io_error = VOP_FSYNC(vf->vf_vnode, FSYNC | FDSYNC,
kcred, NULL);
break;
default:
@@ -182,19 +230,15 @@
zio->io_error = SET_ERROR(ENOTSUP);
}
- return (ZIO_PIPELINE_CONTINUE);
+ zio_execute(zio);
+ return;
}
- zio->io_error = vn_rdwr(zio->io_type == ZIO_TYPE_READ ?
- UIO_READ : UIO_WRITE, vp, zio->io_data, zio->io_size,
- zio->io_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
+ ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
+ zio->io_target_timestamp = zio_handle_io_delay(zio);
- if (resid != 0 && zio->io_error == 0)
- zio->io_error = ENOSPC;
-
- zio_interrupt(zio);
-
- return (ZIO_PIPELINE_STOP);
+ VERIFY3U(taskq_dispatch(vdev_file_taskq, vdev_file_io_strategy, zio,
+ TQ_SLEEP), !=, 0);
}
/* ARGSUSED */
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -43,11 +43,23 @@
* Virtual device vector for GEOM.
*/
+static g_attrchanged_t vdev_geom_attrchanged;
struct g_class zfs_vdev_class = {
.name = "ZFS::VDEV",
.version = G_VERSION,
+ .attrchanged = vdev_geom_attrchanged,
};
+struct consumer_vdev_elem {
+ SLIST_ENTRY(consumer_vdev_elem) elems;
+ vdev_t *vd;
+};
+
+SLIST_HEAD(consumer_priv_t, consumer_vdev_elem);
+_Static_assert(sizeof(((struct g_consumer*)NULL)->private)
+ == sizeof(struct consumer_priv_t*),
+ "consumer_priv_t* can't be stored in g_consumer.private");
+
DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev);
SYSCTL_DECL(_vfs_zfs_vdev);
@@ -62,15 +74,103 @@
SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_delete_disable, CTLFLAG_RW,
&vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE");
+/* Declare local functions */
+static void vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read);
+
+/*
+ * Thread-local storage used to indicate when a thread is probing geoms
+ * for their guids. If NULL, this thread is not tasting geoms. If non-NULL,
+ * it is looking for a replacement for the vdev_t* that is its value.
+ */
+uint_t zfs_geom_probe_vdev_key;
+
static void
+vdev_geom_set_rotation_rate(vdev_t *vd, struct g_consumer *cp)
+{
+ int error;
+ uint16_t rate;
+
+ error = g_getattr("GEOM::rotation_rate", cp, &rate);
+ if (error == 0)
+ vd->vdev_rotation_rate = rate;
+ else
+ vd->vdev_rotation_rate = VDEV_RATE_UNKNOWN;
+}
+
+static void
+vdev_geom_set_physpath(vdev_t *vd, struct g_consumer *cp,
+ boolean_t do_null_update)
+{
+ boolean_t needs_update = B_FALSE;
+ char *physpath;
+ int error, physpath_len;
+
+ physpath_len = MAXPATHLEN;
+ physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO);
+ error = g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath);
+ if (error == 0) {
+ char *old_physpath;
+
+ /* g_topology lock ensures that vdev has not been closed */
+ g_topology_assert();
+ old_physpath = vd->vdev_physpath;
+ vd->vdev_physpath = spa_strdup(physpath);
+
+ if (old_physpath != NULL) {
+ needs_update = (strcmp(old_physpath,
+ vd->vdev_physpath) != 0);
+ spa_strfree(old_physpath);
+ } else
+ needs_update = do_null_update;
+ }
+ g_free(physpath);
+
+ /*
+ * If the physical path changed, update the config.
+ * Only request an update for previously unset physpaths if
+ * requested by the caller.
+ */
+ if (needs_update)
+ spa_async_request(vd->vdev_spa, SPA_ASYNC_CONFIG_UPDATE);
+
+}
+
+static void
+vdev_geom_attrchanged(struct g_consumer *cp, const char *attr)
+{
+ char *old_physpath;
+ struct consumer_priv_t *priv;
+ struct consumer_vdev_elem *elem;
+ int error;
+
+ priv = (struct consumer_priv_t*)&cp->private;
+ if (SLIST_EMPTY(priv))
+ return;
+
+ SLIST_FOREACH(elem, priv, elems) {
+ vdev_t *vd = elem->vd;
+ if (strcmp(attr, "GEOM::rotation_rate") == 0) {
+ vdev_geom_set_rotation_rate(vd, cp);
+ return;
+ }
+ if (strcmp(attr, "GEOM::physpath") == 0) {
+ vdev_geom_set_physpath(vd, cp, /*null_update*/B_TRUE);
+ return;
+ }
+ }
+}
+
+static void
vdev_geom_orphan(struct g_consumer *cp)
{
- vdev_t *vd;
+ struct consumer_priv_t *priv;
+ struct consumer_vdev_elem *elem;
g_topology_assert();
- vd = cp->private;
- if (vd == NULL)
+ priv = (struct consumer_priv_t*)&cp->private;
+ if (SLIST_EMPTY(priv))
+ /* Vdev close in progress. Ignore the event. */
return;
/*
@@ -87,20 +187,39 @@
* async removal support to invoke a close on this
* vdev once it is safe to do so.
*/
- zfs_post_remove(vd->vdev_spa, vd);
- vd->vdev_remove_wanted = B_TRUE;
- spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);
+ SLIST_FOREACH(elem, priv, elems) {
+ vdev_t *vd = elem->vd;
+
+ vd->vdev_remove_wanted = B_TRUE;
+ spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE);
+ }
}
static struct g_consumer *
-vdev_geom_attach(struct g_provider *pp)
+vdev_geom_attach(struct g_provider *pp, vdev_t *vd, boolean_t sanity)
{
struct g_geom *gp;
struct g_consumer *cp;
+ int error;
g_topology_assert();
ZFS_LOG(1, "Attaching to %s.", pp->name);
+
+ if (sanity) {
+ if (pp->sectorsize > VDEV_PAD_SIZE || !ISP2(pp->sectorsize)) {
+ ZFS_LOG(1, "Failing attach of %s. "
+ "Incompatible sectorsize %d\n",
+ pp->name, pp->sectorsize);
+ return (NULL);
+ } else if (pp->mediasize < SPA_MINDEVSIZE) {
+ ZFS_LOG(1, "Failing attach of %s. "
+ "Incompatible mediasize %ju\n",
+ pp->name, pp->mediasize);
+ return (NULL);
+ }
+ }
+
/* Do we have geom already? No? Create one. */
LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) {
if (gp->flags & G_GEOM_WITHER)
@@ -112,13 +231,20 @@
if (gp == NULL) {
gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev");
gp->orphan = vdev_geom_orphan;
+ gp->attrchanged = vdev_geom_attrchanged;
cp = g_new_consumer(gp);
- if (g_attach(cp, pp) != 0) {
- g_wither_geom(gp, ENXIO);
+ error = g_attach(cp, pp);
+ if (error != 0) {
+ ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", __func__,
+ __LINE__, error);
+ vdev_geom_detach(cp, B_FALSE);
return (NULL);
}
- if (g_access(cp, 1, 0, 1) != 0) {
- g_wither_geom(gp, ENXIO);
+ error = g_access(cp, 1, 0, 1);
+ if (error != 0) {
+ ZFS_LOG(1, "%s(%d): g_access failed: %d", __func__,
+ __LINE__, error);
+ vdev_geom_detach(cp, B_FALSE);
return (NULL);
}
ZFS_LOG(1, "Created geom and consumer for %s.", pp->name);
@@ -132,43 +258,61 @@
}
if (cp == NULL) {
cp = g_new_consumer(gp);
- if (g_attach(cp, pp) != 0) {
- g_destroy_consumer(cp);
+ error = g_attach(cp, pp);
+ if (error != 0) {
+ ZFS_LOG(1, "%s(%d): g_attach failed: %d\n",
+ __func__, __LINE__, error);
+ vdev_geom_detach(cp, B_FALSE);
return (NULL);
}
- if (g_access(cp, 1, 0, 1) != 0) {
- g_detach(cp);
- g_destroy_consumer(cp);
+ error = g_access(cp, 1, 0, 1);
+ if (error != 0) {
+ ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
+ __func__, __LINE__, error);
+ vdev_geom_detach(cp, B_FALSE);
return (NULL);
}
ZFS_LOG(1, "Created consumer for %s.", pp->name);
} else {
- if (g_access(cp, 1, 0, 1) != 0)
+ error = g_access(cp, 1, 0, 1);
+ if (error != 0) {
+ ZFS_LOG(1, "%s(%d): g_access failed: %d\n",
+ __func__, __LINE__, error);
return (NULL);
+ }
ZFS_LOG(1, "Used existing consumer for %s.", pp->name);
}
}
+
+ if (vd != NULL)
+ vd->vdev_tsd = cp;
+
+ cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE;
return (cp);
}
static void
-vdev_geom_detach(void *arg, int flag __unused)
+vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read)
{
struct g_geom *gp;
- struct g_consumer *cp;
g_topology_assert();
- cp = arg;
+
+ ZFS_LOG(1, "Detaching from %s.",
+ cp->provider && cp->provider->name ? cp->provider->name : "NULL");
+
gp = cp->geom;
-
- ZFS_LOG(1, "Closing access to %s.", cp->provider->name);
- g_access(cp, -1, 0, -1);
+ if (open_for_read)
+ g_access(cp, -1, 0, -1);
/* Destroy consumer on last close. */
if (cp->acr == 0 && cp->ace == 0) {
- ZFS_LOG(1, "Destroyed consumer to %s.", cp->provider->name);
if (cp->acw > 0)
g_access(cp, 0, -cp->acw, 0);
- g_detach(cp);
+ if (cp->provider != NULL) {
+ ZFS_LOG(1, "Destroying consumer for %s.",
+ cp->provider->name ? cp->provider->name : "NULL");
+ g_detach(cp);
+ }
g_destroy_consumer(cp);
}
/* Destroy geom if there are no consumers left. */
@@ -178,70 +322,115 @@
}
}
-static uint64_t
-nvlist_get_guid(nvlist_t *list)
+static void
+vdev_geom_close_locked(vdev_t *vd)
{
- uint64_t value;
+ struct g_consumer *cp;
+ struct consumer_priv_t *priv;
+ struct consumer_vdev_elem *elem, *elem_temp;
- value = 0;
- nvlist_lookup_uint64(list, ZPOOL_CONFIG_GUID, &value);
- return (value);
+ g_topology_assert();
+
+ cp = vd->vdev_tsd;
+ vd->vdev_delayed_close = B_FALSE;
+ if (cp == NULL)
+ return;
+
+ ZFS_LOG(1, "Closing access to %s.", cp->provider->name);
+ KASSERT(cp->private != NULL, ("%s: cp->private is NULL", __func__));
+ priv = (struct consumer_priv_t*)&cp->private;
+ vd->vdev_tsd = NULL;
+ SLIST_FOREACH_SAFE(elem, priv, elems, elem_temp) {
+ if (elem->vd == vd) {
+ SLIST_REMOVE(priv, elem, consumer_vdev_elem, elems);
+ g_free(elem);
+ }
+ }
+
+ vdev_geom_detach(cp, B_TRUE);
}
-static int
-vdev_geom_io(struct g_consumer *cp, int cmd, void *data, off_t offset, off_t size)
+/*
+ * Issue one or more bios to the vdev in parallel.
+ * cmds, datas, offsets, errors, and sizes are arrays of length ncmds. Each I/O
+ * operation is described by parallel entries from each array. There may be
+ * more bios actually issued than entries in the array.
+ */
+static void
+vdev_geom_io(struct g_consumer *cp, int *cmds, void **datas, off_t *offsets,
+ off_t *sizes, int *errors, int ncmds)
{
- struct bio *bp;
+ struct bio **bios;
u_char *p;
- off_t off, maxio;
- int error;
+ off_t off, maxio, s, end;
+ int i, n_bios, j;
+ size_t bios_size;
- ASSERT((offset % cp->provider->sectorsize) == 0);
- ASSERT((size % cp->provider->sectorsize) == 0);
-
- bp = g_alloc_bio();
- off = offset;
- offset += size;
- p = data;
maxio = MAXPHYS - (MAXPHYS % cp->provider->sectorsize);
- error = 0;
+ n_bios = 0;
- for (; off < offset; off += maxio, p += maxio, size -= maxio) {
- bzero(bp, sizeof(*bp));
- bp->bio_cmd = cmd;
- bp->bio_done = NULL;
- bp->bio_offset = off;
- bp->bio_length = MIN(size, maxio);
- bp->bio_data = p;
- g_io_request(bp, cp);
- error = biowait(bp, "vdev_geom_io");
- if (error != 0)
- break;
+ /* How many bios are required for all commands ? */
+ for (i = 0; i < ncmds; i++)
+ n_bios += (sizes[i] + maxio - 1) / maxio;
+
+ /* Allocate memory for the bios */
+ bios_size = n_bios * sizeof(struct bio*);
+ bios = kmem_zalloc(bios_size, KM_SLEEP);
+
+ /* Prepare and issue all of the bios */
+ for (i = j = 0; i < ncmds; i++) {
+ off = offsets[i];
+ p = datas[i];
+ s = sizes[i];
+ end = off + s;
+ ASSERT((off % cp->provider->sectorsize) == 0);
+ ASSERT((s % cp->provider->sectorsize) == 0);
+
+ for (; off < end; off += maxio, p += maxio, s -= maxio, j++) {
+ bios[j] = g_alloc_bio();
+ bios[j]->bio_cmd = cmds[i];
+ bios[j]->bio_done = NULL;
+ bios[j]->bio_offset = off;
+ bios[j]->bio_length = MIN(s, maxio);
+ bios[j]->bio_data = p;
+ g_io_request(bios[j], cp);
+ }
}
+ ASSERT(j == n_bios);
- g_destroy_bio(bp);
- return (error);
-}
+ /* Wait for all of the bios to complete, and clean them up */
+ for (i = j = 0; i < ncmds; i++) {
+ off = offsets[i];
+ s = sizes[i];
+ end = off + s;
-static void
-vdev_geom_taste_orphan(struct g_consumer *cp)
-{
-
- KASSERT(1 == 0, ("%s called while tasting %s.", __func__,
- cp->provider->name));
+ for (; off < end; off += maxio, s -= maxio, j++) {
+ errors[i] = biowait(bios[j], "vdev_geom_io") || errors[i];
+ g_destroy_bio(bios[j]);
+ }
+ }
+ kmem_free(bios, bios_size);
}
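
The rewritten vdev_geom_io() above first counts how many MAXPHYS-limited bios the whole batch needs, allocates and issues them all, and only then waits, so the requests proceed in parallel instead of one biowait() at a time. The chunk-count arithmetic is worth seeing on its own; a tiny sketch with illustrative sizes:

#include <stdio.h>

/* Chunks needed to cover `size` bytes in pieces of at most `maxio` bytes. */
static long long
nchunks(long long size, long long maxio)
{
	return ((size + maxio - 1) / maxio);
}

int
main(void)
{
	long long maxio = 128 * 1024;	/* e.g. MAXPHYS rounded to sectors */
	long long sizes[] = { 4096, 128 * 1024, 300 * 1024 };
	long long total = 0;

	for (int i = 0; i < 3; i++)
		total += nchunks(sizes[i], maxio);
	printf("total bios to allocate: %lld\n", total);	/* 1 + 1 + 3 = 5 */
	return (0);
}
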
+/*
+ * Read the vdev config from a device. Return the number of valid labels that
+ * were found. The vdev config will be returned in config if and only if at
+ * least one valid label was found.
+ */
static int
vdev_geom_read_config(struct g_consumer *cp, nvlist_t **config)
{
struct g_provider *pp;
- vdev_label_t *label;
- char *p, *buf;
+ vdev_phys_t *vdev_lists[VDEV_LABELS];
+ char *buf;
size_t buflen;
- uint64_t psize;
- off_t offset, size;
- uint64_t guid, state, txg;
- int error, l, len;
+ uint64_t psize, state, txg;
+ off_t offsets[VDEV_LABELS];
+ off_t size;
+ off_t sizes[VDEV_LABELS];
+ int cmds[VDEV_LABELS];
+ int errors[VDEV_LABELS];
+ int l, nlabels;
g_topology_assert_not();
@@ -251,24 +440,34 @@
psize = pp->mediasize;
psize = P2ALIGN(psize, (uint64_t)sizeof(vdev_label_t));
- size = sizeof(*label) + pp->sectorsize -
- ((sizeof(*label) - 1) % pp->sectorsize) - 1;
+ size = sizeof(*vdev_lists[0]) + pp->sectorsize -
+ ((sizeof(*vdev_lists[0]) - 1) % pp->sectorsize) - 1;
- guid = 0;
- label = kmem_alloc(size, KM_SLEEP);
- buflen = sizeof(label->vl_vdev_phys.vp_nvlist);
+ buflen = sizeof(vdev_lists[0]->vp_nvlist);
*config = NULL;
+ /* Create all of the IO requests */
for (l = 0; l < VDEV_LABELS; l++) {
+ cmds[l] = BIO_READ;
+ vdev_lists[l] = kmem_alloc(size, KM_SLEEP);
+ offsets[l] = vdev_label_offset(psize, l, 0) + VDEV_SKIP_SIZE;
+ sizes[l] = size;
+ errors[l] = 0;
+ ASSERT(offsets[l] % pp->sectorsize == 0);
+ }
- offset = vdev_label_offset(psize, l, 0);
- if ((offset % pp->sectorsize) != 0)
- continue;
+ /* Issue the IO requests */
+ vdev_geom_io(cp, cmds, (void**)vdev_lists, offsets, sizes, errors,
+ VDEV_LABELS);
- if (vdev_geom_io(cp, BIO_READ, label, offset, size) != 0)
+ /* Parse the labels */
+ nlabels = 0;
+ for (l = 0; l < VDEV_LABELS; l++) {
+ if (errors[l] != 0)
continue;
- buf = label->vl_vdev_phys.vp_nvlist;
+ buf = vdev_lists[l]->vp_nvlist;
+
if (nvlist_unpack(buf, buflen, config, 0) != 0)
continue;
@@ -279,7 +478,8 @@
continue;
}
- if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
+ if (state != POOL_STATE_SPARE &&
+ state != POOL_STATE_L2CACHE &&
(nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
&txg) != 0 || txg == 0)) {
nvlist_free(*config);
@@ -287,11 +487,14 @@
continue;
}
- break;
+ nlabels++;
}
- kmem_free(label, size);
- return (*config == NULL ? ENOENT : 0);
+ /* Free the label storage */
+ for (l = 0; l < VDEV_LABELS; l++)
+ kmem_free(vdev_lists[l], size);
+
+ return (nlabels);
}
static void
@@ -364,49 +567,21 @@
nvlist_free(cfg);
}
-static int
-vdev_geom_attach_taster(struct g_consumer *cp, struct g_provider *pp)
-{
- int error;
-
- if (pp->flags & G_PF_WITHER)
- return (EINVAL);
- if (pp->sectorsize > VDEV_PAD_SIZE || !ISP2(pp->sectorsize))
- return (EINVAL);
- g_attach(cp, pp);
- error = g_access(cp, 1, 0, 0);
- if (error != 0)
- g_detach(cp);
- return (error);
-}
-
-static void
-vdev_geom_detach_taster(struct g_consumer *cp)
-{
- g_access(cp, -1, 0, 0);
- g_detach(cp);
-}
-
int
vdev_geom_read_pool_label(const char *name,
nvlist_t ***configs, uint64_t *count)
{
struct g_class *mp;
- struct g_geom *gp, *zgp;
+ struct g_geom *gp;
struct g_provider *pp;
struct g_consumer *zcp;
nvlist_t *vdev_cfg;
uint64_t pool_guid;
- int error;
+ int error, nlabels;
DROP_GIANT();
g_topology_lock();
- zgp = g_new_geomf(&zfs_vdev_class, "zfs::vdev::taste");
- /* This orphan function should be never called. */
- zgp->orphan = vdev_geom_taste_orphan;
- zcp = g_new_consumer(zgp);
-
*configs = NULL;
*count = 0;
pool_guid = 0;
@@ -419,13 +594,14 @@
LIST_FOREACH(pp, &gp->provider, provider) {
if (pp->flags & G_PF_WITHER)
continue;
- if (vdev_geom_attach_taster(zcp, pp) != 0)
+ zcp = vdev_geom_attach(pp, NULL, B_TRUE);
+ if (zcp == NULL)
continue;
g_topology_unlock();
- error = vdev_geom_read_config(zcp, &vdev_cfg);
+ nlabels = vdev_geom_read_config(zcp, &vdev_cfg);
g_topology_lock();
- vdev_geom_detach_taster(zcp);
- if (error)
+ vdev_geom_detach(zcp, B_TRUE);
+ if (nlabels == 0)
continue;
ZFS_LOG(1, "successfully read vdev config");
@@ -434,9 +610,6 @@
}
}
}
-
- g_destroy_consumer(zcp);
- g_destroy_geom(zgp);
g_topology_unlock();
PICKUP_GIANT();
@@ -443,39 +616,89 @@
return (*count > 0 ? 0 : ENOENT);
}
-static uint64_t
-vdev_geom_read_guid(struct g_consumer *cp)
+enum match {
+ NO_MATCH = 0, /* No matching labels found */
+	TOPGUID_MATCH = 1,	/* Labels match top guid, not vdev guid */
+	ZERO_MATCH = 1,		/* Should never be returned */
+	ONE_MATCH = 2,		/* 1 label matching the vdev_guid */
+	TWO_MATCH = 3,		/* 2 labels matching the vdev_guid */
+	THREE_MATCH = 4,	/* 3 labels matching the vdev_guid */
+ FULL_MATCH = 5 /* all labels match the vdev_guid */
+};
+
+static enum match
+vdev_attach_ok(vdev_t *vd, struct g_provider *pp)
{
nvlist_t *config;
- uint64_t guid;
+ uint64_t pool_guid, top_guid, vdev_guid;
+ struct g_consumer *cp;
+ int nlabels;
- g_topology_assert_not();
+ cp = vdev_geom_attach(pp, NULL, B_TRUE);
+ if (cp == NULL) {
+ ZFS_LOG(1, "Unable to attach tasting instance to %s.",
+ pp->name);
+ return (NO_MATCH);
+ }
+ g_topology_unlock();
+ nlabels = vdev_geom_read_config(cp, &config);
+ g_topology_lock();
+ vdev_geom_detach(cp, B_TRUE);
+ if (nlabels == 0) {
+ ZFS_LOG(1, "Unable to read config from %s.", pp->name);
+ return (NO_MATCH);
+ }
- guid = 0;
- if (vdev_geom_read_config(cp, &config) == 0) {
- guid = nvlist_get_guid(config);
- nvlist_free(config);
+ pool_guid = 0;
+ (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid);
+ top_guid = 0;
+ (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID, &top_guid);
+ vdev_guid = 0;
+ (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid);
+ nvlist_free(config);
+
+ /*
+ * Check that the label's pool guid matches the desired guid.
+ * Inactive spares and L2ARCs do not have any pool guid in the label.
+ */
+ if (pool_guid != 0 && pool_guid != spa_guid(vd->vdev_spa)) {
+ ZFS_LOG(1, "pool guid mismatch for provider %s: %ju != %ju.",
+ pp->name,
+ (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)pool_guid);
+ return (NO_MATCH);
}
- return (guid);
+
+ /*
+ * Check that the label's vdev guid matches the desired guid.
+	 * The second condition handles a possible race on vdev detach, when the
+	 * remaining vdev receives the GUID of a destroyed top-level mirror vdev.
+ */
+ if (vdev_guid == vd->vdev_guid) {
+ ZFS_LOG(1, "guids match for provider %s.", pp->name);
+ return (ZERO_MATCH + nlabels);
+ } else if (top_guid == vd->vdev_guid && vd == vd->vdev_top) {
+ ZFS_LOG(1, "top vdev guid match for provider %s.", pp->name);
+ return (TOPGUID_MATCH);
+ }
+ ZFS_LOG(1, "vdev guid mismatch for provider %s: %ju != %ju.",
+ pp->name, (uintmax_t)vd->vdev_guid, (uintmax_t)vdev_guid);
+ return (NO_MATCH);
}
static struct g_consumer *
-vdev_geom_attach_by_guid(uint64_t guid)
+vdev_geom_attach_by_guids(vdev_t *vd)
{
struct g_class *mp;
- struct g_geom *gp, *zgp;
- struct g_provider *pp;
- struct g_consumer *cp, *zcp;
- uint64_t pguid;
+ struct g_geom *gp;
+ struct g_provider *pp, *best_pp;
+ struct g_consumer *cp;
+ enum match match, best_match;
g_topology_assert();
- zgp = g_new_geomf(&zfs_vdev_class, "zfs::vdev::taste");
- /* This orphan function should be never called. */
- zgp->orphan = vdev_geom_taste_orphan;
- zcp = g_new_consumer(zgp);
-
cp = NULL;
+ best_pp = NULL;
+ best_match = NO_MATCH;
LIST_FOREACH(mp, &g_classes, class) {
if (mp == &zfs_vdev_class)
continue;
@@ -483,36 +706,30 @@
if (gp->flags & G_GEOM_WITHER)
continue;
LIST_FOREACH(pp, &gp->provider, provider) {
- if (vdev_geom_attach_taster(zcp, pp) != 0)
- continue;
- g_topology_unlock();
- pguid = vdev_geom_read_guid(zcp);
- g_topology_lock();
- vdev_geom_detach_taster(zcp);
- if (pguid != guid)
- continue;
- cp = vdev_geom_attach(pp);
- if (cp == NULL) {
- printf("ZFS WARNING: Unable to attach to %s.\n",
- pp->name);
- continue;
+ match = vdev_attach_ok(vd, pp);
+ if (match > best_match) {
+ best_match = match;
+ best_pp = pp;
}
- break;
+ if (match == FULL_MATCH)
+ goto out;
}
- if (cp != NULL)
- break;
}
- if (cp != NULL)
- break;
}
-end:
- g_destroy_consumer(zcp);
- g_destroy_geom(zgp);
+
+out:
+ if (best_pp) {
+ cp = vdev_geom_attach(best_pp, vd, B_TRUE);
+ if (cp == NULL) {
+ printf("ZFS WARNING: Unable to attach to %s.\n",
+ best_pp->name);
+ }
+ }
return (cp);
}
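
The match enum above is ordered so that a larger value is always a better candidate: any vdev-guid match beats a top-guid-only match, and more intact labels beat fewer. vdev_geom_attach_by_guids() then simply keeps the best provider seen so far and stops early on FULL_MATCH. A compact sketch of that selection loop (the enum mirrors the one above; the candidate scores are made up):

#include <stdio.h>

enum match {
	NO_MATCH = 0,
	TOPGUID_MATCH = 1,
	ZERO_MATCH = 1,
	ONE_MATCH = 2,
	TWO_MATCH = 3,
	THREE_MATCH = 4,
	FULL_MATCH = 5
};

int
main(void)
{
	/* Pretend these are the match scores of the probed providers. */
	enum match candidates[] = { NO_MATCH, TOPGUID_MATCH, TWO_MATCH,
	    FULL_MATCH, ONE_MATCH };
	int n = sizeof (candidates) / sizeof (candidates[0]);
	enum match best_match = NO_MATCH;
	int best_idx = -1;

	for (int i = 0; i < n; i++) {
		if (candidates[i] > best_match) {
			best_match = candidates[i];
			best_idx = i;
		}
		if (candidates[i] == FULL_MATCH)
			break;			/* cannot do better; stop */
	}
	printf("best provider: index %d (score %d)\n", best_idx,
	    (int)best_match);
	return (0);
}
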
static struct g_consumer *
-vdev_geom_open_by_guid(vdev_t *vd)
+vdev_geom_open_by_guids(vdev_t *vd)
{
struct g_consumer *cp;
char *buf;
@@ -520,8 +737,9 @@
g_topology_assert();
- ZFS_LOG(1, "Searching by guid [%ju].", (uintmax_t)vd->vdev_guid);
- cp = vdev_geom_attach_by_guid(vd->vdev_guid);
+ ZFS_LOG(1, "Searching by guids [%ju:%ju].",
+ (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid);
+ cp = vdev_geom_attach_by_guids(vd);
if (cp != NULL) {
len = strlen(cp->provider->name) + strlen("/dev/") + 1;
buf = kmem_alloc(len, KM_SLEEP);
@@ -530,10 +748,12 @@
spa_strfree(vd->vdev_path);
vd->vdev_path = buf;
- ZFS_LOG(1, "Attach by guid [%ju] succeeded, provider %s.",
- (uintmax_t)vd->vdev_guid, vd->vdev_path);
+ ZFS_LOG(1, "Attach by guid [%ju:%ju] succeeded, provider %s.",
+ (uintmax_t)spa_guid(vd->vdev_spa),
+ (uintmax_t)vd->vdev_guid, cp->provider->name);
} else {
- ZFS_LOG(1, "Search by guid [%ju] failed.",
+ ZFS_LOG(1, "Search by guid [%ju:%ju] failed.",
+ (uintmax_t)spa_guid(vd->vdev_spa),
(uintmax_t)vd->vdev_guid);
}
@@ -545,7 +765,6 @@
{
struct g_provider *pp;
struct g_consumer *cp;
- uint64_t guid;
g_topology_assert();
@@ -553,23 +772,8 @@
pp = g_provider_by_name(vd->vdev_path + sizeof("/dev/") - 1);
if (pp != NULL) {
ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path);
- cp = vdev_geom_attach(pp);
- if (cp != NULL && check_guid && ISP2(pp->sectorsize) &&
- pp->sectorsize <= VDEV_PAD_SIZE) {
- g_topology_unlock();
- guid = vdev_geom_read_guid(cp);
- g_topology_lock();
- if (guid != vd->vdev_guid) {
- vdev_geom_detach(cp, 0);
- cp = NULL;
- ZFS_LOG(1, "guid mismatch for provider %s: "
- "%ju != %ju.", vd->vdev_path,
- (uintmax_t)vd->vdev_guid, (uintmax_t)guid);
- } else {
- ZFS_LOG(1, "guid match for provider %s.",
- vd->vdev_path);
- }
- }
+ if (!check_guid || vdev_attach_ok(vd, pp) == FULL_MATCH)
+ cp = vdev_geom_attach(pp, vd, B_FALSE);
}
return (cp);
@@ -577,7 +781,7 @@
static int
vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
- uint64_t *ashift)
+ uint64_t *logical_ashift, uint64_t *physical_ashift)
{
struct g_provider *pp;
struct g_consumer *cp;
@@ -584,6 +788,9 @@
size_t bufsize;
int error;
+	/* Set the TLS to indicate downstack that we should not access zvols */
+ VERIFY(tsd_set(zfs_geom_probe_vdev_key, vd) == 0);
+
/*
* We must have a pathname, and it must be absolute.
*/
@@ -592,60 +799,111 @@
return (EINVAL);
}
- vd->vdev_tsd = NULL;
+ /*
+ * Reopen the device if it's not currently open. Otherwise,
+ * just update the physical size of the device.
+ */
+ if ((cp = vd->vdev_tsd) != NULL) {
+ ASSERT(vd->vdev_reopening);
+ goto skip_open;
+ }
DROP_GIANT();
g_topology_lock();
error = 0;
- /*
- * If we're creating or splitting a pool, just find the GEOM provider
- * by its name and ignore GUID mismatches.
- */
- if (vd->vdev_spa->spa_load_state == SPA_LOAD_NONE ||
- vd->vdev_spa->spa_splitting_newspa == B_TRUE)
+ if (vd->vdev_spa->spa_splitting_newspa ||
+ (vd->vdev_prevstate == VDEV_STATE_UNKNOWN &&
+ vd->vdev_spa->spa_load_state == SPA_LOAD_NONE ||
+ vd->vdev_spa->spa_load_state == SPA_LOAD_CREATE)) {
+ /*
+ * We are dealing with a vdev that hasn't been previously
+ * opened (since boot), and we are not loading an
+ * existing pool configuration. This looks like a
+ * vdev add operation to a new or existing pool.
+ * Assume the user knows what he/she is doing and find
+ * GEOM provider by its name, ignoring GUID mismatches.
+ *
+ * XXPOLICY: It would be safer to only allow a device
+ * that is unlabeled or labeled but missing
+ * GUID information to be opened in this fashion,
+ * unless we are doing a split, in which case we
+ * should allow any guid.
+ */
cp = vdev_geom_open_by_path(vd, 0);
- else {
+ } else {
+ /*
+ * Try using the recorded path for this device, but only
+ * accept it if its label data contains the expected GUIDs.
+ */
cp = vdev_geom_open_by_path(vd, 1);
if (cp == NULL) {
/*
* The device at vd->vdev_path doesn't have the
- * expected guid. The disks might have merely
+ * expected GUIDs. The disks might have merely
* moved around so try all other GEOM providers
- * to find one with the right guid.
+ * to find one with the right GUIDs.
*/
- cp = vdev_geom_open_by_guid(vd);
+ cp = vdev_geom_open_by_guids(vd);
}
}
+ /* Clear the TLS now that tasting is done */
+ VERIFY(tsd_set(zfs_geom_probe_vdev_key, NULL) == 0);
+
if (cp == NULL) {
- ZFS_LOG(1, "Provider %s not found.", vd->vdev_path);
+ ZFS_LOG(1, "Vdev %s not found.", vd->vdev_path);
error = ENOENT;
- } else if (cp->provider->sectorsize > VDEV_PAD_SIZE ||
- !ISP2(cp->provider->sectorsize)) {
- ZFS_LOG(1, "Provider %s has unsupported sectorsize.",
- vd->vdev_path);
- vdev_geom_detach(cp, 0);
- error = EINVAL;
- cp = NULL;
- } else if (cp->acw == 0 && (spa_mode(vd->vdev_spa) & FWRITE) != 0) {
- int i;
+ } else {
+ struct consumer_priv_t *priv;
+ struct consumer_vdev_elem *elem;
+ int spamode;
- for (i = 0; i < 5; i++) {
- error = g_access(cp, 0, 1, 0);
- if (error == 0)
- break;
- g_topology_unlock();
- tsleep(vd, 0, "vdev", hz / 2);
- g_topology_lock();
- }
- if (error != 0) {
- printf("ZFS WARNING: Unable to open %s for writing (error=%d).\n",
- vd->vdev_path, error);
- vdev_geom_detach(cp, 0);
+ priv = (struct consumer_priv_t*)&cp->private;
+ if (cp->private == NULL)
+ SLIST_INIT(priv);
+ elem = g_malloc(sizeof(*elem), M_WAITOK|M_ZERO);
+ elem->vd = vd;
+ SLIST_INSERT_HEAD(priv, elem, elems);
+
+ spamode = spa_mode(vd->vdev_spa);
+ if (cp->provider->sectorsize > VDEV_PAD_SIZE ||
+ !ISP2(cp->provider->sectorsize)) {
+ ZFS_LOG(1, "Provider %s has unsupported sectorsize.",
+ cp->provider->name);
+
+ vdev_geom_close_locked(vd);
+ error = EINVAL;
cp = NULL;
+ } else if (cp->acw == 0 && (spamode & FWRITE) != 0) {
+ int i;
+
+ for (i = 0; i < 5; i++) {
+ error = g_access(cp, 0, 1, 0);
+ if (error == 0)
+ break;
+ g_topology_unlock();
+ tsleep(vd, 0, "vdev", hz / 2);
+ g_topology_lock();
+ }
+ if (error != 0) {
+ printf("ZFS WARNING: Unable to open %s for writing (error=%d).\n",
+ cp->provider->name, error);
+ vdev_geom_close_locked(vd);
+ cp = NULL;
+ }
}
}
+
+ /* Fetch initial physical path information for this device. */
+ if (cp != NULL) {
+ vdev_geom_attrchanged(cp, "GEOM::physpath");
+
+ /* Set other GEOM characteristics */
+ vdev_geom_set_physpath(vd, cp, /*do_null_update*/B_FALSE);
+ vdev_geom_set_rotation_rate(vd, cp);
+ }
+
g_topology_unlock();
PICKUP_GIANT();
if (cp == NULL) {
@@ -652,9 +910,7 @@
vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED;
return (error);
}
-
- cp->private = vd;
- vd->vdev_tsd = cp;
+skip_open:
pp = cp->provider;
/*
@@ -663,9 +919,14 @@
*max_psize = *psize = pp->mediasize;
/*
- * Determine the device's minimum transfer size.
+ * Determine the device's minimum transfer size and preferred
+ * transfer size.
*/
- *ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1;
+ *logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1;
+ *physical_ashift = 0;
+ if (pp->stripesize > (1 << *logical_ashift) && ISP2(pp->stripesize) &&
+ pp->stripesize <= (1 << SPA_MAXASHIFT) && pp->stripeoffset == 0)
+ *physical_ashift = highbit(pp->stripesize) - 1;
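
The open path above derives the logical ashift from the sector size and, when the stripe size is a larger power of two (and passes the ISP2/SPA_MAXASHIFT checks, which this sketch omits), a separate physical ashift from the stripe size; ashift is just log2 of the block size, i.e. highbit() - 1. A tiny illustration with assumed sizes:

#include <stdint.h>
#include <stdio.h>

/* Integer log2 of a power-of-two value, i.e. highbit(x) - 1. */
static int
ashift_of(uint64_t size)
{
	int shift = 0;

	while ((1ULL << (shift + 1)) <= size)
		shift++;
	return (shift);
}

int
main(void)
{
	uint64_t sectorsize = 512, stripesize = 4096;	/* assumed values */
	int logical = ashift_of(sectorsize);
	int physical = 0;

	if (stripesize > (1ULL << logical))	/* stripe larger than sector */
		physical = ashift_of(stripesize);
	printf("logical ashift %d, physical ashift %d\n", logical, physical);
	return (0);
}
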
/*
* Clear the nowritecache settings, so that on a vdev_reopen()
@@ -673,12 +934,6 @@
*/
vd->vdev_nowritecache = B_FALSE;
- if (vd->vdev_physpath != NULL)
- spa_strfree(vd->vdev_physpath);
- bufsize = sizeof("/dev/") + strlen(pp->name);
- vd->vdev_physpath = kmem_alloc(bufsize, KM_SLEEP);
- snprintf(vd->vdev_physpath, bufsize, "/dev/%s", pp->name);
-
return (0);
}
@@ -688,12 +943,17 @@
struct g_consumer *cp;
cp = vd->vdev_tsd;
- if (cp == NULL)
- return;
- vd->vdev_tsd = NULL;
- vd->vdev_delayed_close = B_FALSE;
- cp->private = NULL; /* XXX locking */
- g_post_event(vdev_geom_detach, cp, M_WAITOK, NULL);
+
+ DROP_GIANT();
+ g_topology_lock();
+
+ if (!vd->vdev_reopening ||
+ (cp != NULL && ((cp->flags & G_CF_ORPHAN) != 0 ||
+ (cp->provider != NULL && cp->provider->error != 0))))
+ vdev_geom_close_locked(vd);
+
+ g_topology_unlock();
+ PICKUP_GIANT();
}
static void
@@ -706,50 +966,46 @@
vd = zio->io_vd;
zio->io_error = bp->bio_error;
if (zio->io_error == 0 && bp->bio_resid != 0)
- zio->io_error = EIO;
- if (bp->bio_cmd == BIO_FLUSH && bp->bio_error == ENOTSUP) {
+ zio->io_error = SET_ERROR(EIO);
+
+ switch(zio->io_error) {
+ case ENOTSUP:
/*
- * If we get ENOTSUP, we know that no future
- * attempts will ever succeed. In this case we
- * set a persistent bit so that we don't bother
- * with the ioctl in the future.
+ * If we get ENOTSUP for BIO_FLUSH or BIO_DELETE we know
+ * that future attempts will never succeed. In this case
+ * we set a persistent flag so that we don't bother with
+ * requests in the future.
*/
- vd->vdev_nowritecache = B_TRUE;
- }
- if (bp->bio_cmd == BIO_DELETE && bp->bio_error == ENOTSUP) {
- /*
- * If we get ENOTSUP, we know that no future
- * attempts will ever succeed. In this case we
- * set a persistent bit so that we don't bother
- * with the ioctl in the future.
- */
- vd->vdev_notrim = B_TRUE;
- }
- if (zio->io_error == EIO && !vd->vdev_remove_wanted) {
- /*
- * If provider's error is set we assume it is being
- * removed.
- */
- if (bp->bio_to->error != 0) {
+ switch (bp->bio_cmd) {
+ case BIO_FLUSH:
+ vd->vdev_nowritecache = B_TRUE;
+ break;
+ case BIO_DELETE:
+ vd->vdev_notrim = B_TRUE;
+ break;
+ }
+ break;
+ case ENXIO:
+ if (!vd->vdev_remove_wanted) {
/*
- * We post the resource as soon as possible, instead of
- * when the async removal actually happens, because the
- * DE is using this information to discard previous I/O
- * errors.
+ * If the provider's error is set we assume it is being
+ * removed.
*/
- /* XXX: zfs_post_remove() can sleep. */
- zfs_post_remove(zio->io_spa, vd);
- vd->vdev_remove_wanted = B_TRUE;
- spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE);
- } else if (!vd->vdev_delayed_close) {
- vd->vdev_delayed_close = B_TRUE;
+ if (bp->bio_to->error != 0) {
+ vd->vdev_remove_wanted = B_TRUE;
+ spa_async_request(zio->io_spa,
+ SPA_ASYNC_REMOVE);
+ } else if (!vd->vdev_delayed_close) {
+ vd->vdev_delayed_close = B_TRUE;
+ }
}
+ break;
}
g_destroy_bio(bp);
- zio_interrupt(zio);
+ zio_delay_interrupt(zio);
}
-static int
+static void
vdev_geom_io_start(zio_t *zio)
{
vdev_t *vd;
@@ -759,41 +1015,50 @@
vd = zio->io_vd;
- if (zio->io_type == ZIO_TYPE_IOCTL) {
+ switch (zio->io_type) {
+ case ZIO_TYPE_IOCTL:
/* XXPOLICY */
if (!vdev_readable(vd)) {
- zio->io_error = ENXIO;
- return (ZIO_PIPELINE_CONTINUE);
+ zio->io_error = SET_ERROR(ENXIO);
+ zio_interrupt(zio);
+ return;
+ } else {
+ switch (zio->io_cmd) {
+ case DKIOCFLUSHWRITECACHE:
+ if (zfs_nocacheflush || vdev_geom_bio_flush_disable)
+ break;
+ if (vd->vdev_nowritecache) {
+ zio->io_error = SET_ERROR(ENOTSUP);
+ break;
+ }
+ goto sendreq;
+ default:
+ zio->io_error = SET_ERROR(ENOTSUP);
+ }
}
- switch (zio->io_cmd) {
- case DKIOCFLUSHWRITECACHE:
- if (zfs_nocacheflush || vdev_geom_bio_flush_disable)
- break;
- if (vd->vdev_nowritecache) {
- zio->io_error = ENOTSUP;
- break;
- }
+ zio_execute(zio);
+ return;
+ case ZIO_TYPE_FREE:
+ if (vd->vdev_notrim) {
+ zio->io_error = SET_ERROR(ENOTSUP);
+ } else if (!vdev_geom_bio_delete_disable) {
goto sendreq;
- case DKIOCTRIM:
- if (vdev_geom_bio_delete_disable)
- break;
- if (vd->vdev_notrim) {
- zio->io_error = ENOTSUP;
- break;
- }
- goto sendreq;
- default:
- zio->io_error = ENOTSUP;
}
-
- return (ZIO_PIPELINE_CONTINUE);
+ zio_execute(zio);
+ return;
}
sendreq:
+ ASSERT(zio->io_type == ZIO_TYPE_READ ||
+ zio->io_type == ZIO_TYPE_WRITE ||
+ zio->io_type == ZIO_TYPE_FREE ||
+ zio->io_type == ZIO_TYPE_IOCTL);
+
cp = vd->vdev_tsd;
if (cp == NULL) {
- zio->io_error = ENXIO;
- return (ZIO_PIPELINE_CONTINUE);
+ zio->io_error = SET_ERROR(ENXIO);
+ zio_interrupt(zio);
+ return;
}
bp = g_alloc_bio();
bp->bio_caller1 = zio;
@@ -800,34 +1065,29 @@
switch (zio->io_type) {
case ZIO_TYPE_READ:
case ZIO_TYPE_WRITE:
+ zio->io_target_timestamp = zio_handle_io_delay(zio);
bp->bio_cmd = zio->io_type == ZIO_TYPE_READ ? BIO_READ : BIO_WRITE;
bp->bio_data = zio->io_data;
bp->bio_offset = zio->io_offset;
bp->bio_length = zio->io_size;
break;
+ case ZIO_TYPE_FREE:
+ bp->bio_cmd = BIO_DELETE;
+ bp->bio_data = NULL;
+ bp->bio_offset = zio->io_offset;
+ bp->bio_length = zio->io_size;
+ break;
case ZIO_TYPE_IOCTL:
- switch (zio->io_cmd) {
- case DKIOCFLUSHWRITECACHE:
- bp->bio_cmd = BIO_FLUSH;
- bp->bio_flags |= BIO_ORDERED;
- bp->bio_data = NULL;
- bp->bio_offset = cp->provider->mediasize;
- bp->bio_length = 0;
- break;
- case DKIOCTRIM:
- bp->bio_cmd = BIO_DELETE;
- bp->bio_data = NULL;
- bp->bio_offset = zio->io_offset;
- bp->bio_length = zio->io_size;
- break;
- }
+ bp->bio_cmd = BIO_FLUSH;
+ bp->bio_flags |= BIO_ORDERED;
+ bp->bio_data = NULL;
+ bp->bio_offset = cp->provider->mediasize;
+ bp->bio_length = 0;
break;
}
bp->bio_done = vdev_geom_io_intr;
g_io_request(bp, cp);
-
- return (ZIO_PIPELINE_STOP);
}
static void
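The reworked completion handler above folds the old per-command ENOTSUP checks and the EIO-based removal heuristic into a single switch on the completion error, with device removal now keyed on ENXIO. A minimal user-space sketch of that dispatch follows; the struct and helper names are illustrative stand-ins, not the driver's actual types.

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-ins for the vdev flags touched by the completion path. */
struct fake_vdev {
        bool nowritecache;      /* set once BIO_FLUSH returns ENOTSUP */
        bool notrim;            /* set once BIO_DELETE returns ENOTSUP */
        bool remove_wanted;     /* provider reported an error: schedule removal */
        bool delayed_close;     /* transient ENXIO: close lazily instead */
};

enum bio_cmd { CMD_READ, CMD_WRITE, CMD_FLUSH, CMD_DELETE };

/* Model of the dispatch: ENOTSUP disables the facility permanently,
 * ENXIO either schedules an async removal or arms a delayed close. */
static void
handle_completion(struct fake_vdev *vd, enum bio_cmd cmd, int error,
    int provider_error)
{
        switch (error) {
        case ENOTSUP:
                if (cmd == CMD_FLUSH)
                        vd->nowritecache = true;
                else if (cmd == CMD_DELETE)
                        vd->notrim = true;
                break;
        case ENXIO:
                if (!vd->remove_wanted) {
                        if (provider_error != 0)
                                vd->remove_wanted = true;
                        else
                                vd->delayed_close = true;
                }
                break;
        }
}

int
main(void)
{
        struct fake_vdev vd = { 0 };

        handle_completion(&vd, CMD_FLUSH, ENOTSUP, 0);
        handle_completion(&vd, CMD_READ, ENXIO, 0);
        printf("nowritecache=%d delayed_close=%d remove_wanted=%d\n",
            vd.nowritecache, vd.delayed_close, vd.remove_wanted);
        return (0);
}

The persistent flags matter because a single ENOTSUP means the device will never honor that request type, so later I/O skips the doomed flush or TRIM entirely.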
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -22,7 +22,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
*/
/*
@@ -186,7 +186,7 @@
static void
vdev_label_read(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
- uint64_t size, zio_done_func_t *done, void *private, int flags)
+ uint64_t size, zio_done_func_t *done, void *private, int flags)
{
ASSERT(spa_config_held(zio->io_spa, SCL_STATE_ALL, RW_WRITER) ==
SCL_STATE_ALL);
@@ -200,7 +200,7 @@
static void
vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
- uint64_t size, zio_done_func_t *done, void *private, int flags)
+ uint64_t size, zio_done_func_t *done, void *private, int flags)
{
ASSERT(spa_config_held(zio->io_spa, SCL_ALL, RW_WRITER) == SCL_ALL ||
(spa_config_held(zio->io_spa, SCL_CONFIG | SCL_STATE, RW_READER) ==
@@ -223,30 +223,25 @@
{
nvlist_t *nv = NULL;
- VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ nv = fnvlist_alloc();
- VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE,
- vd->vdev_ops->vdev_op_type) == 0);
+ fnvlist_add_string(nv, ZPOOL_CONFIG_TYPE, vd->vdev_ops->vdev_op_type);
if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)))
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id)
- == 0);
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid);
if (vd->vdev_path != NULL)
- VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_PATH,
- vd->vdev_path) == 0);
+ fnvlist_add_string(nv, ZPOOL_CONFIG_PATH, vd->vdev_path);
if (vd->vdev_devid != NULL)
- VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_DEVID,
- vd->vdev_devid) == 0);
+ fnvlist_add_string(nv, ZPOOL_CONFIG_DEVID, vd->vdev_devid);
if (vd->vdev_physpath != NULL)
- VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH,
- vd->vdev_physpath) == 0);
+ fnvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH,
+ vd->vdev_physpath);
if (vd->vdev_fru != NULL)
- VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_FRU,
- vd->vdev_fru) == 0);
+ fnvlist_add_string(nv, ZPOOL_CONFIG_FRU, vd->vdev_fru);
if (vd->vdev_nparity != 0) {
ASSERT(strcmp(vd->vdev_ops->vdev_op_type,
@@ -267,44 +262,41 @@
* that only support a single parity device -- older software
* will just ignore it.
*/
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY,
- vd->vdev_nparity) == 0);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vd->vdev_nparity);
}
if (vd->vdev_wholedisk != -1ULL)
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
- vd->vdev_wholedisk) == 0);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
+ vd->vdev_wholedisk);
if (vd->vdev_not_present)
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1) == 0);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1);
if (vd->vdev_isspare)
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1) == 0);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1);
if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)) &&
vd == vd->vdev_top) {
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
- vd->vdev_ms_array) == 0);
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
- vd->vdev_ms_shift) == 0);
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT,
- vd->vdev_ashift) == 0);
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE,
- vd->vdev_asize) == 0);
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG,
- vd->vdev_islog) == 0);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
+ vd->vdev_ms_array);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
+ vd->vdev_ms_shift);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE,
+ vd->vdev_asize);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, vd->vdev_islog);
if (vd->vdev_removing)
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING,
- vd->vdev_removing) == 0);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING,
+ vd->vdev_removing);
}
- if (vd->vdev_dtl_smo.smo_object != 0)
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_DTL,
- vd->vdev_dtl_smo.smo_object) == 0);
+ if (vd->vdev_dtl_sm != NULL) {
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_DTL,
+ space_map_object(vd->vdev_dtl_sm));
+ }
if (vd->vdev_crtxg)
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG,
- vd->vdev_crtxg) == 0);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, vd->vdev_crtxg);
if (getstats) {
vdev_stat_t vs;
@@ -311,15 +303,14 @@
pool_scan_stat_t ps;
vdev_get_stats(vd, &vs);
- VERIFY(nvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
- (uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t)) == 0);
+ fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
+ (uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t));
/* provide either current or previous scan information */
if (spa_scan_get_stats(spa, &ps) == 0) {
- VERIFY(nvlist_add_uint64_array(nv,
+ fnvlist_add_uint64_array(nv,
ZPOOL_CONFIG_SCAN_STATS, (uint64_t *)&ps,
- sizeof (pool_scan_stat_t) / sizeof (uint64_t))
- == 0);
+ sizeof (pool_scan_stat_t) / sizeof (uint64_t));
}
}
@@ -349,8 +340,8 @@
}
if (idx) {
- VERIFY(nvlist_add_nvlist_array(nv,
- ZPOOL_CONFIG_CHILDREN, child, idx) == 0);
+ fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+ child, idx);
}
for (c = 0; c < idx; c++)
@@ -362,26 +353,20 @@
const char *aux = NULL;
if (vd->vdev_offline && !vd->vdev_tmpoffline)
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE,
- B_TRUE) == 0);
- if (vd->vdev_resilvering)
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_RESILVERING,
- B_TRUE) == 0);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE, B_TRUE);
+ if (vd->vdev_resilver_txg != 0)
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
+ vd->vdev_resilver_txg);
if (vd->vdev_faulted)
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED,
- B_TRUE) == 0);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED, B_TRUE);
if (vd->vdev_degraded)
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_DEGRADED,
- B_TRUE) == 0);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_DEGRADED, B_TRUE);
if (vd->vdev_removed)
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVED,
- B_TRUE) == 0);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVED, B_TRUE);
if (vd->vdev_unspare)
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_UNSPARE,
- B_TRUE) == 0);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_UNSPARE, B_TRUE);
if (vd->vdev_ishole)
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_HOLE,
- B_TRUE) == 0);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_HOLE, B_TRUE);
switch (vd->vdev_stat.vs_aux) {
case VDEV_AUX_ERR_EXCEEDED:
@@ -394,12 +379,11 @@
}
if (aux != NULL)
- VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_AUX_STATE,
- aux) == 0);
+ fnvlist_add_string(nv, ZPOOL_CONFIG_AUX_STATE, aux);
if (vd->vdev_splitting && vd->vdev_orig_guid != 0LL) {
- VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ORIG_GUID,
- vd->vdev_orig_guid) == 0);
+ fnvlist_add_uint64(nv, ZPOOL_CONFIG_ORIG_GUID,
+ vd->vdev_orig_guid);
}
}
@@ -619,7 +603,8 @@
* read-only. Instead we look to see if the pool is marked
* read-only in the namespace and set the state to active.
*/
- if ((spa = spa_by_guid(pool_guid, device_guid)) != NULL &&
+ if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
+ (spa = spa_by_guid(pool_guid, device_guid)) != NULL &&
spa_mode(spa) == FREAD)
state = POOL_STATE_ACTIVE;
@@ -663,7 +648,7 @@
/* Track the creation time for this vdev */
vd->vdev_crtxg = crtxg;
- if (!vd->vdev_ops->vdev_op_leaf)
+ if (!vd->vdev_ops->vdev_op_leaf || !spa_writeable(spa))
return (0);
/*
@@ -730,8 +715,9 @@
* Don't TRIM if removing so that we don't interfere with zpool
* disaster recovery.
*/
- if (zfs_trim_enabled && vdev_trim_on_init && (reason == VDEV_LABEL_CREATE ||
- reason == VDEV_LABEL_SPARE || reason == VDEV_LABEL_L2CACHE))
+ if (zfs_trim_enabled && vdev_trim_on_init && !vd->vdev_notrim &&
+ (reason == VDEV_LABEL_CREATE || reason == VDEV_LABEL_SPARE ||
+ reason == VDEV_LABEL_L2CACHE))
zio_wait(zio_trim(NULL, spa, vd, 0, vd->vdev_psize));
/*
@@ -872,6 +858,44 @@
return (error);
}
+int
+vdev_label_write_pad2(vdev_t *vd, const char *buf, size_t size)
+{
+ spa_t *spa = vd->vdev_spa;
+ zio_t *zio;
+ char *pad2;
+ int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
+ int error;
+
+ if (size > VDEV_PAD_SIZE)
+ return (EINVAL);
+
+ if (!vd->vdev_ops->vdev_op_leaf)
+ return (ENODEV);
+ if (vdev_is_dead(vd))
+ return (ENXIO);
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ pad2 = zio_buf_alloc(VDEV_PAD_SIZE);
+ bzero(pad2, VDEV_PAD_SIZE);
+ memcpy(pad2, buf, size);
+
+retry:
+ zio = zio_root(spa, NULL, NULL, flags);
+ vdev_label_write(zio, vd, 0, pad2,
+ offsetof(vdev_label_t, vl_pad2),
+ VDEV_PAD_SIZE, NULL, NULL, flags);
+ error = zio_wait(zio);
+ if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) {
+ flags |= ZIO_FLAG_TRYHARD;
+ goto retry;
+ }
+
+ zio_buf_free(pad2, VDEV_PAD_SIZE);
+ return (error);
+}
+
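vdev_label_write_pad2() follows the retry idiom used throughout the label code: issue the I/O with ZIO_FLAG_CANFAIL and, only if that fails, retry once with ZIO_FLAG_TRYHARD. A small stand-alone sketch of the pattern is shown below; attempt_write() and its failure injection are hypothetical stand-ins for the zio_root()/zio_wait() round trip.

#include <stdio.h>

#define FLAG_CANFAIL    0x1
#define FLAG_TRYHARD    0x2     /* escalate: don't give up on transient errors */

/* Hypothetical stand-in for issuing the label write and waiting for it. */
static int
attempt_write(int flags, int fail_first)
{
        if (fail_first && !(flags & FLAG_TRYHARD))
                return (5);     /* pretend the relaxed attempt failed (EIO) */
        return (0);
}

static int
write_pad2(int fail_first)
{
        int flags = FLAG_CANFAIL;
        int error;

retry:
        error = attempt_write(flags, fail_first);
        if (error != 0 && !(flags & FLAG_TRYHARD)) {
                flags |= FLAG_TRYHARD;  /* one escalation, then give up */
                goto retry;
        }
        return (error);
}

int
main(void)
{
        printf("clean run: %d, flaky run: %d\n", write_pad2(0), write_pad2(1));
        return (0);
}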
/*
* ==========================================================================
* uberblock load/sync
@@ -1009,7 +1033,7 @@
uint64_t *good_writes = zio->io_private;
if (zio->io_error == 0 && zio->io_vd->vdev_top->vdev_ms_array != 0)
- atomic_add_64(good_writes, 1);
+ atomic_inc_64(good_writes);
}
/*
@@ -1084,7 +1108,7 @@
uint64_t *good_writes = zio->io_private;
if (zio->io_error == 0)
- atomic_add_64(good_writes, 1);
+ atomic_inc_64(good_writes);
}
/*
@@ -1209,15 +1233,16 @@
* at any time, you can just call it again, and it will resume its work.
*/
int
-vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg, boolean_t tryhard)
+vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg)
{
spa_t *spa = svd[0]->vdev_spa;
uberblock_t *ub = &spa->spa_uberblock;
vdev_t *vd;
zio_t *zio;
- int error;
+ int error = 0;
int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
+retry:
/*
* Normally, we don't want to try too hard to write every label and
* uberblock. If there is a flaky disk, we don't want the rest of the
@@ -1225,8 +1250,11 @@
* single label out, we should retry with ZIO_FLAG_TRYHARD before
* bailing out and declaring the pool faulted.
*/
- if (tryhard)
+ if (error != 0) {
+ if ((flags & ZIO_FLAG_TRYHARD) != 0)
+ return (error);
flags |= ZIO_FLAG_TRYHARD;
+ }
ASSERT(ub->ub_txg <= txg);
@@ -1270,7 +1298,7 @@
* are committed to stable storage before the uberblock update.
*/
if ((error = vdev_label_sync_list(spa, 0, txg, flags)) != 0)
- return (error);
+ goto retry;
/*
* Sync the uberblocks to all vdevs in svd[].
@@ -1288,7 +1316,7 @@
* to the new uberblocks.
*/
if ((error = vdev_uberblock_sync_list(svd, svdcount, ub, flags)) != 0)
- return (error);
+ goto retry;
/*
* Sync out odd labels for every dirty vdev. If the system dies
@@ -1301,7 +1329,7 @@
* stable storage before the next transaction group begins.
*/
if ((error = vdev_label_sync_list(spa, 1, txg, flags)) != 0)
- return (error);
+ goto retry;
trim_thread_wakeup(spa);
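With the tryhard argument gone, vdev_config_sync() now retries the whole sequence itself, escalating to ZIO_FLAG_TRYHARD exactly once. The ordering matters for power-fail safety: even labels must reach stable storage before the uberblock, and the uberblock before the odd labels. A compact sketch of that control flow follows, with the three stages reduced to hypothetical stubs and one of them made flaky so the retry path is exercised.

#include <stdio.h>

#define FLAG_TRYHARD    0x1

/* Hypothetical stubs for the three ordered stages of a config sync. */
static int sync_even_labels(int flags) { (void)flags; return (0); }
static int sync_odd_labels(int flags)  { (void)flags; return (0); }

static int
sync_uberblocks(int flags)
{
        /* Pretend a flaky disk only cooperates on the tryhard pass. */
        return ((flags & FLAG_TRYHARD) ? 0 : 5);
}

static int
config_sync(void)
{
        int flags = 0;
        int error = 0;

retry:
        /* On any failure, escalate once; a second failure is fatal. */
        if (error != 0) {
                if (flags & FLAG_TRYHARD)
                        return (error);
                flags |= FLAG_TRYHARD;
        }

        if ((error = sync_even_labels(flags)) != 0)
                goto retry;
        if ((error = sync_uberblocks(flags)) != 0)
                goto retry;
        if ((error = sync_odd_labels(flags)) != 0)
                goto retry;

        return (0);
}

int
main(void)
{
        printf("config_sync() -> %d\n", config_sync());
        return (0);
}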
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -25,11 +25,14 @@
*/
/*
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_scan.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/fs/zfs.h>
@@ -42,6 +45,7 @@
vdev_t *mc_vd;
uint64_t mc_offset;
int mc_error;
+ int mc_load;
uint8_t mc_tried;
uint8_t mc_skipped;
uint8_t mc_speculative;
@@ -48,21 +52,102 @@
} mirror_child_t;
typedef struct mirror_map {
+ int *mm_preferred;
+ int mm_preferred_cnt;
int mm_children;
- int mm_replacing;
- int mm_preferred;
- int mm_root;
- mirror_child_t mm_child[1];
+ boolean_t mm_resilvering;
+ boolean_t mm_root;
+ mirror_child_t mm_child[];
} mirror_map_t;
-int vdev_mirror_shift = 21;
+static int vdev_mirror_shift = 21;
+#ifdef _KERNEL
+SYSCTL_DECL(_vfs_zfs_vdev);
+static SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, mirror, CTLFLAG_RD, 0,
+ "ZFS VDEV Mirror");
+#endif
+
+/*
+ * The load configuration settings below are tuned by default for
+ * the case where all devices are of the same rotational type.
+ *
+ * If there is a mixture of rotating and non-rotating media, setting
+ * non_rotating_seek_inc to 0 may well provide better results as it
+ * will direct more reads to the non-rotating vdevs, which are more
+ * likely to offer higher performance.
+ */
+
+/* Rotating media load calculation configuration. */
+static int rotating_inc = 0;
+#ifdef _KERNEL
+TUNABLE_INT("vfs.zfs.vdev.mirror.rotating_inc", &rotating_inc);
+SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, rotating_inc, CTLFLAG_RW,
+ &rotating_inc, 0, "Rotating media load increment for non-seeking I/O's");
+#endif
+
+static int rotating_seek_inc = 5;
+#ifdef _KERNEL
+TUNABLE_INT("vfs.zfs.vdev.mirror.rotating_seek_inc", &rotating_seek_inc);
+SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, rotating_seek_inc, CTLFLAG_RW,
+ &rotating_seek_inc, 0, "Rotating media load increment for seeking I/O's");
+#endif
+
+static int rotating_seek_offset = 1 * 1024 * 1024;
+#ifdef _KERNEL
+TUNABLE_INT("vfs.zfs.vdev.mirror.rotating_seek_offset", &rotating_seek_offset);
+SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, rotating_seek_offset, CTLFLAG_RW,
+ &rotating_seek_offset, 0, "Offset in bytes from the last I/O which "
+ "triggers a reduced rotating media seek increment");
+#endif
+
+/* Non-rotating media load calculation configuration. */
+static int non_rotating_inc = 0;
+#ifdef _KERNEL
+TUNABLE_INT("vfs.zfs.vdev.mirror.non_rotating_inc", &non_rotating_inc);
+SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, non_rotating_inc, CTLFLAG_RW,
+ &non_rotating_inc, 0,
+ "Non-rotating media load increment for non-seeking I/O's");
+#endif
+
+static int non_rotating_seek_inc = 1;
+#ifdef _KERNEL
+TUNABLE_INT("vfs.zfs.vdev.mirror.non_rotating_seek_inc",
+ &non_rotating_seek_inc);
+SYSCTL_INT(_vfs_zfs_vdev_mirror, OID_AUTO, non_rotating_seek_inc, CTLFLAG_RW,
+ &non_rotating_seek_inc, 0,
+ "Non-rotating media load increment for seeking I/O's");
+#endif
+
+
+static inline size_t
+vdev_mirror_map_size(int children)
+{
+ return (offsetof(mirror_map_t, mm_child[children]) +
+ sizeof(int) * children);
+}
+
+static inline mirror_map_t *
+vdev_mirror_map_alloc(int children, boolean_t resilvering, boolean_t root)
+{
+ mirror_map_t *mm;
+
+ mm = kmem_zalloc(vdev_mirror_map_size(children), KM_SLEEP);
+ mm->mm_children = children;
+ mm->mm_resilvering = resilvering;
+ mm->mm_root = root;
+ mm->mm_preferred = (int *)((uintptr_t)mm +
+ offsetof(mirror_map_t, mm_child[children]));
+
+ return (mm);
+}
+
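The new mirror_map_t ends in a flexible array member, and the mm_preferred index array is carved out of the same allocation immediately after the last child. A minimal sketch of that layout is below; the struct names are simplified stand-ins, not the kernel types.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Simplified shapes mirroring the new mirror_map_t layout. */
struct child {
        int load;
        int tried;
};

struct map {
        int *preferred;         /* points into the same allocation */
        int preferred_cnt;
        int children;
        struct child child[];   /* flexible array member */
};

static size_t
map_size(int children)
{
        return (offsetof(struct map, child[children]) +
            sizeof (int) * children);
}

static struct map *
map_alloc(int children)
{
        struct map *mm;

        if ((mm = calloc(1, map_size(children))) == NULL)
                return (NULL);
        mm->children = children;
        mm->preferred = (int *)((uintptr_t)mm +
            offsetof(struct map, child[children]));
        return (mm);
}

int
main(void)
{
        struct map *mm = map_alloc(3);

        if (mm == NULL)
                return (1);
        printf("size for 3 children: %zu bytes\n", map_size(3));
        mm->preferred[0] = 2;   /* lives inside the single allocation */
        printf("preferred[0] = %d\n", mm->preferred[0]);
        free(mm);
        return (0);
}

Packing both arrays into one allocation keeps vdev_mirror_map_free() to a single kmem_free() of vdev_mirror_map_size(mm_children).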
static void
vdev_mirror_map_free(zio_t *zio)
{
mirror_map_t *mm = zio->io_vsd;
- kmem_free(mm, offsetof(mirror_map_t, mm_child[mm->mm_children]));
+ kmem_free(mm, vdev_mirror_map_size(mm->mm_children));
}
static const zio_vsd_ops_t vdev_mirror_vsd_ops = {
@@ -70,55 +155,110 @@
zio_vsd_default_cksum_report
};
+static int
+vdev_mirror_load(mirror_map_t *mm, vdev_t *vd, uint64_t zio_offset)
+{
+ uint64_t lastoffset;
+ int load;
+
+ /* All DVAs have equal weight at the root. */
+ if (mm->mm_root)
+ return (INT_MAX);
+
+ /*
+ * We don't return INT_MAX if the device is resilvering (i.e.
+ * vdev_resilver_txg != 0), because in testing the overall performance
+ * during a resilver was slightly worse with that exclusion than
+ * without it.
+ */
+
+ /* Standard load based on pending queue length. */
+ load = vdev_queue_length(vd);
+ lastoffset = vdev_queue_lastoffset(vd);
+
+ if (vd->vdev_rotation_rate == VDEV_RATE_NON_ROTATING) {
+ /* Non-rotating media. */
+ if (lastoffset == zio_offset)
+ return (load + non_rotating_inc);
+
+ /*
+ * Apply a seek penalty even for non-rotating devices as
+ * sequential I/Os can be aggregated into fewer operations
+ * on the device, thus avoiding unnecessary per-command
+ * overhead and boosting performance.
+ */
+ return (load + non_rotating_seek_inc);
+ }
+
+ /* Rotating media I/O's which directly follow the last I/O. */
+ if (lastoffset == zio_offset)
+ return (load + rotating_inc);
+
+ /*
+ * Apply half the seek increment to I/O's within the seek offset
+ * of the last I/O queued to this vdev, as they should incur less
+ * of a seek penalty.
+ */
+ if (ABS(lastoffset - zio_offset) < rotating_seek_offset)
+ return (load + (rotating_seek_inc / 2));
+
+ /* Apply the full seek increment to all other I/O's. */
+ return (load + rotating_seek_inc);
+}
+
+
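The load metric above is simply the pending queue length plus a media-dependent increment chosen from the tunables. The stand-alone model below hard-codes the default tunable values from this file to make the relative weighting concrete; the function name and sample offsets are illustrative only.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Default tunables quoted above (vfs.zfs.vdev.mirror.*). */
static const int rotating_inc = 0;
static const int rotating_seek_inc = 5;
static const int rotating_seek_offset = 1024 * 1024;
static const int non_rotating_inc = 0;
static const int non_rotating_seek_inc = 1;

/* Model: pending queue length plus a seek-dependent increment. */
static int
mirror_load(int queue_len, int rotating, uint64_t lastoffset, uint64_t offset)
{
        if (!rotating) {
                return (queue_len + (lastoffset == offset ?
                    non_rotating_inc : non_rotating_seek_inc));
        }
        if (lastoffset == offset)
                return (queue_len + rotating_inc);
        if (llabs((long long)(lastoffset - offset)) < rotating_seek_offset)
                return (queue_len + rotating_seek_inc / 2);
        return (queue_len + rotating_seek_inc);
}

int
main(void)
{
        /* Same queue depth: sequential vs. short vs. long seek, HDD vs. SSD. */
        printf("HDD sequential: %d\n", mirror_load(4, 1, 4096, 4096));
        printf("HDD short seek: %d\n", mirror_load(4, 1, 4096, 65536));
        printf("HDD long seek:  %d\n", mirror_load(4, 1, 4096, 1u << 30));
        printf("SSD any seek:   %d\n", mirror_load(4, 0, 4096, 1u << 30));
        return (0);
}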
static mirror_map_t *
-vdev_mirror_map_alloc(zio_t *zio)
+vdev_mirror_map_init(zio_t *zio)
{
mirror_map_t *mm = NULL;
mirror_child_t *mc;
vdev_t *vd = zio->io_vd;
- int c, d;
+ int c;
if (vd == NULL) {
dva_t *dva = zio->io_bp->blk_dva;
spa_t *spa = zio->io_spa;
- c = BP_GET_NDVAS(zio->io_bp);
-
- mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP);
- mm->mm_children = c;
- mm->mm_replacing = B_FALSE;
- mm->mm_preferred = spa_get_random(c);
- mm->mm_root = B_TRUE;
-
- /*
- * Check the other, lower-index DVAs to see if they're on
- * the same vdev as the child we picked. If they are, use
- * them since they are likely to have been allocated from
- * the primary metaslab in use at the time, and hence are
- * more likely to have locality with single-copy data.
- */
- for (c = mm->mm_preferred, d = c - 1; d >= 0; d--) {
- if (DVA_GET_VDEV(&dva[d]) == DVA_GET_VDEV(&dva[c]))
- mm->mm_preferred = d;
- }
-
+ mm = vdev_mirror_map_alloc(BP_GET_NDVAS(zio->io_bp), B_FALSE,
+ B_TRUE);
for (c = 0; c < mm->mm_children; c++) {
mc = &mm->mm_child[c];
-
mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c]));
mc->mc_offset = DVA_GET_OFFSET(&dva[c]);
}
} else {
- c = vd->vdev_children;
-
- mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_SLEEP);
- mm->mm_children = c;
- mm->mm_replacing = (vd->vdev_ops == &vdev_replacing_ops ||
- vd->vdev_ops == &vdev_spare_ops);
- mm->mm_preferred = mm->mm_replacing ? 0 :
- (zio->io_offset >> vdev_mirror_shift) % c;
- mm->mm_root = B_FALSE;
-
+ /*
+ * If we are resilvering, then we should handle scrub reads
+ * differently; we shouldn't issue them to the resilvering
+ * device because it might not have those blocks.
+ *
+ * We are resilvering iff:
+ * 1) We are a replacing vdev (i.e. our name is "replacing-1" or
+ * "spare-1" or something like that), and
+ * 2) The pool is currently being resilvered.
+ *
+ * We cannot simply check vd->vdev_resilver_txg, because it's
+ * not set in this path.
+ *
+ * Nor can we just check our vdev_ops; there are cases (such as
+ * when a user types "zpool replace pool odev spare_dev" and
+ * spare_dev is in the spare list, or when a spare device is
+ * automatically used to replace a DEGRADED device) when
+ * resilvering is complete but both the original vdev and the
+ * spare vdev remain in the pool. That behavior is intentional.
+ * It helps implement the policy that a spare should be
+ * automatically removed from the pool after the user replaces
+ * the device that originally failed.
+ *
+ * If a spa load is in progress, then spa_dsl_pool may be
+ * uninitialized. But we shouldn't be resilvering during a spa
+ * load anyway.
+ */
+ boolean_t replacing = (vd->vdev_ops == &vdev_replacing_ops ||
+ vd->vdev_ops == &vdev_spare_ops) &&
+ spa_load_state(vd->vdev_spa) == SPA_LOAD_NONE &&
+ dsl_scan_resilvering(vd->vdev_spa->spa_dsl_pool);
+ mm = vdev_mirror_map_alloc(vd->vdev_children, replacing,
+ B_FALSE);
for (c = 0; c < mm->mm_children; c++) {
mc = &mm->mm_child[c];
mc->mc_vd = vd->vdev_child[c];
@@ -133,7 +273,7 @@
static int
vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
- uint64_t *ashift)
+ uint64_t *logical_ashift, uint64_t *physical_ashift)
{
int numerrors = 0;
int lasterror = 0;
@@ -156,7 +296,9 @@
*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
- *ashift = MAX(*ashift, cvd->vdev_ashift);
+ *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
+ *physical_ashift = MAX(*physical_ashift,
+ cvd->vdev_physical_ashift);
}
if (numerrors == vd->vdev_children) {
@@ -191,9 +333,10 @@
if (zio->io_error == 0) {
zio_t *pio;
+ zio_link_t *zl = NULL;
mutex_enter(&zio->io_lock);
- while ((pio = zio_walk_parents(zio)) != NULL) {
+ while ((pio = zio_walk_parents(zio, &zl)) != NULL) {
mutex_enter(&pio->io_lock);
ASSERT3U(zio->io_size, >=, pio->io_size);
bcopy(zio->io_data, pio->io_data, pio->io_size);
@@ -210,7 +353,54 @@
}
/*
- * Try to find a child whose DTL doesn't contain the block we want to read.
+ * Check the other, lower-index DVAs to see if they're on the same
+ * vdev as the child we picked. If they are, use them since they
+ * are likely to have been allocated from the primary metaslab in
+ * use at the time, and hence are more likely to have locality with
+ * single-copy data.
+ */
+static int
+vdev_mirror_dva_select(zio_t *zio, int p)
+{
+ dva_t *dva = zio->io_bp->blk_dva;
+ mirror_map_t *mm = zio->io_vsd;
+ int preferred;
+ int c;
+
+ preferred = mm->mm_preferred[p];
+ for (p-- ; p >= 0; p--) {
+ c = mm->mm_preferred[p];
+ if (DVA_GET_VDEV(&dva[c]) == DVA_GET_VDEV(&dva[preferred]))
+ preferred = c;
+ }
+ return (preferred);
+}
+
+static int
+vdev_mirror_preferred_child_randomize(zio_t *zio)
+{
+ mirror_map_t *mm = zio->io_vsd;
+ int p;
+
+ if (mm->mm_root) {
+ p = spa_get_random(mm->mm_preferred_cnt);
+ return (vdev_mirror_dva_select(zio, p));
+ }
+
+ /*
+ * To ensure we don't always favour the first matching vdev,
+ * which could lead to wear leveling issues on SSD's, we
+ * use the I/O offset as a pseudo random seed into the vdevs
+ * which have the lowest load.
+ */
+ p = (zio->io_offset >> vdev_mirror_shift) % mm->mm_preferred_cnt;
+ return (mm->mm_preferred[p]);
+}
+
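When several children tie for the lowest load, the offset-based tie-break above spreads reads across them deterministically instead of always picking the first, which matters for SSD wear leveling. The selection reduces to a modulus over the tied set, as the toy example below shows; the shift of 21 matches vdev_mirror_shift (2 MB buckets).

#include <stdint.h>
#include <stdio.h>

#define MIRROR_SHIFT    21      /* 2 MB buckets, as in vdev_mirror_shift */

/* Pick one of `cnt` equally-loaded children using the I/O offset. */
static int
pick_tied_child(uint64_t io_offset, int cnt)
{
        return ((int)((io_offset >> MIRROR_SHIFT) % (uint64_t)cnt));
}

int
main(void)
{
        uint64_t off;

        /* Adjacent 2 MB regions rotate through the tied children. */
        for (off = 0; off < (8ULL << 20); off += (2ULL << 20))
                printf("offset %lluMB -> child %d\n",
                    (unsigned long long)(off >> 20), pick_tied_child(off, 3));
        return (0);
}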
+/*
+ * Try to find a vdev whose DTL doesn't contain the block we want to read,
+ * preferring vdevs based on the determined load.
+ *
* If we can't, try the read on any vdev we haven't already tried.
*/
static int
@@ -217,23 +407,20 @@
vdev_mirror_child_select(zio_t *zio)
{
mirror_map_t *mm = zio->io_vsd;
- mirror_child_t *mc;
uint64_t txg = zio->io_txg;
- int i, c;
+ int c, lowest_load;
ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg);
- /*
- * Try to find a child whose DTL doesn't contain the block to read.
- * If a child is known to be completely inaccessible (indicated by
- * vdev_readable() returning B_FALSE), don't even try.
- */
- for (i = 0, c = mm->mm_preferred; i < mm->mm_children; i++, c++) {
- if (c >= mm->mm_children)
- c = 0;
+ lowest_load = INT_MAX;
+ mm->mm_preferred_cnt = 0;
+ for (c = 0; c < mm->mm_children; c++) {
+ mirror_child_t *mc;
+
mc = &mm->mm_child[c];
if (mc->mc_tried || mc->mc_skipped)
continue;
+
if (!vdev_readable(mc->mc_vd)) {
mc->mc_error = SET_ERROR(ENXIO);
mc->mc_tried = 1; /* don't even try */
@@ -240,20 +427,50 @@
mc->mc_skipped = 1;
continue;
}
- if (!vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1))
- return (c);
- mc->mc_error = SET_ERROR(ESTALE);
- mc->mc_skipped = 1;
- mc->mc_speculative = 1;
+
+ if (vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1)) {
+ mc->mc_error = SET_ERROR(ESTALE);
+ mc->mc_skipped = 1;
+ mc->mc_speculative = 1;
+ continue;
+ }
+
+ mc->mc_load = vdev_mirror_load(mm, mc->mc_vd, mc->mc_offset);
+ if (mc->mc_load > lowest_load)
+ continue;
+
+ if (mc->mc_load < lowest_load) {
+ lowest_load = mc->mc_load;
+ mm->mm_preferred_cnt = 0;
+ }
+ mm->mm_preferred[mm->mm_preferred_cnt] = c;
+ mm->mm_preferred_cnt++;
}
+ if (mm->mm_preferred_cnt == 1) {
+ vdev_queue_register_lastoffset(
+ mm->mm_child[mm->mm_preferred[0]].mc_vd, zio);
+ return (mm->mm_preferred[0]);
+ }
+
+ if (mm->mm_preferred_cnt > 1) {
+ int c = vdev_mirror_preferred_child_randomize(zio);
+
+ vdev_queue_register_lastoffset(mm->mm_child[c].mc_vd, zio);
+ return (c);
+ }
+
/*
* Every device is either missing or has this txg in its DTL.
* Look for any child we haven't already tried before giving up.
*/
- for (c = 0; c < mm->mm_children; c++)
- if (!mm->mm_child[c].mc_tried)
+ for (c = 0; c < mm->mm_children; c++) {
+ if (!mm->mm_child[c].mc_tried) {
+ vdev_queue_register_lastoffset(mm->mm_child[c].mc_vd,
+ zio);
return (c);
+ }
+ }
/*
* Every child failed. There's no place left to look.
@@ -261,7 +478,7 @@
return (-1);
}
-static int
+static void
vdev_mirror_io_start(zio_t *zio)
{
mirror_map_t *mm;
@@ -268,10 +485,11 @@
mirror_child_t *mc;
int c, children;
- mm = vdev_mirror_map_alloc(zio);
+ mm = vdev_mirror_map_init(zio);
if (zio->io_type == ZIO_TYPE_READ) {
- if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_replacing) {
+ if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering &&
+ mm->mm_children > 1) {
/*
* For scrubbing reads we need to allocate a read
* buffer for each child and issue reads to all
@@ -286,7 +504,8 @@
zio->io_type, zio->io_priority, 0,
vdev_mirror_scrub_done, mc));
}
- return (ZIO_PIPELINE_CONTINUE);
+ zio_execute(zio);
+ return;
}
/*
* For normal reads just pick one child.
@@ -313,7 +532,7 @@
c++;
}
- return (ZIO_PIPELINE_CONTINUE);
+ zio_execute(zio);
}
static int
@@ -409,7 +628,7 @@
if (good_copies && spa_writeable(zio->io_spa) &&
(unexpected_errors ||
(zio->io_flags & ZIO_FLAG_RESILVER) ||
- ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_replacing))) {
+ ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_resilvering))) {
/*
* Use the good data we have in hand to repair damaged children.
*/
@@ -436,7 +655,7 @@
zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
mc->mc_vd, mc->mc_offset,
zio->io_data, zio->io_size,
- ZIO_TYPE_WRITE, zio->io_priority,
+ ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
}
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -25,7 +25,7 @@
*/
/*
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
*/
/*
@@ -46,7 +46,7 @@
/* ARGSUSED */
static int
vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize,
- uint64_t *ashift)
+ uint64_t *logical_ashift, uint64_t *physical_ashift)
{
/*
* Really this should just fail. But then the root vdev will be in the
@@ -56,7 +56,8 @@
*/
*psize = 0;
*max_psize = 0;
- *ashift = 0;
+ *logical_ashift = 0;
+ *physical_ashift = 0;
return (0);
}
@@ -67,11 +68,11 @@
}
/* ARGSUSED */
-static int
+static void
vdev_missing_io_start(zio_t *zio)
{
zio->io_error = SET_ERROR(ENOTSUP);
- return (ZIO_PIPELINE_CONTINUE);
+ zio_execute(zio);
}
/* ARGSUSED */
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_queue.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -25,87 +25,295 @@
*/
/*
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
*/
#include <sys/zfs_context.h>
#include <sys/vdev_impl.h>
+#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/avl.h>
+#include <sys/dsl_pool.h>
+#include <sys/metaslab_impl.h>
/*
- * These tunables are for performance analysis.
+ * ZFS I/O Scheduler
+ * -----------------
+ *
+ * ZFS issues I/O operations to leaf vdevs to satisfy and complete zios. The
+ * I/O scheduler determines when and in what order those operations are
+ * issued. The I/O scheduler divides operations into six I/O classes
+ * prioritized in the following order: sync read, sync write, async read,
+ * async write, scrub/resilver and trim. Each queue defines the minimum and
+ * maximum number of concurrent operations that may be issued to the device.
+ * In addition, the device has an aggregate maximum. Note that the sum of the
+ * per-queue minimums must not exceed the aggregate maximum, and if the
+ * aggregate maximum is equal to or greater than the sum of the per-queue
+ * maximums, the per-queue minimum has no effect.
+ *
+ * For many physical devices, throughput increases with the number of
+ * concurrent operations, but latency typically suffers. Further, physical
+ * devices typically have a limit at which more concurrent operations have no
+ * effect on throughput or can actually cause it to decrease.
+ *
+ * The scheduler selects the next operation to issue by first looking for an
+ * I/O class whose minimum has not been satisfied. Once all are satisfied and
+ * the aggregate maximum has not been hit, the scheduler looks for classes
+ * whose maximum has not been satisfied. Iteration through the I/O classes is
+ * done in the order specified above. No further operations are issued if the
+ * aggregate maximum number of concurrent operations has been hit or if there
+ * are no operations queued for an I/O class that has not hit its maximum.
+ * Every time an I/O is queued or an operation completes, the I/O scheduler
+ * looks for new operations to issue.
+ *
+ * All I/O classes have a fixed maximum number of outstanding operations
+ * except for the async write class. Asynchronous writes represent the data
+ * that is committed to stable storage during the syncing stage for
+ * transaction groups (see txg.c). Transaction groups enter the syncing state
+ * periodically so the number of queued async writes will quickly burst up and
+ * then bleed down to zero. Rather than servicing them as quickly as possible,
+ * the I/O scheduler changes the maximum number of active async write I/Os
+ * according to the amount of dirty data in the pool (see dsl_pool.c). Since
+ * both throughput and latency typically increase with the number of
+ * concurrent operations issued to physical devices, reducing the burstiness
+ * in the number of concurrent operations also stabilizes the response time of
+ * operations from other -- and in particular synchronous -- queues. In broad
+ * strokes, the I/O scheduler will issue more concurrent operations from the
+ * async write queue as there's more dirty data in the pool.
+ *
+ * Async Writes
+ *
+ * The number of concurrent operations issued for the async write I/O class
+ * follows a piece-wise linear function defined by a few adjustable points.
+ *
+ * | o---------| <-- zfs_vdev_async_write_max_active
+ * ^ | /^ |
+ * | | / | |
+ * active | / | |
+ * I/O | / | |
+ * count | / | |
+ * | / | |
+ * |------------o | | <-- zfs_vdev_async_write_min_active
+ * 0|____________^______|_________|
+ * 0% | | 100% of zfs_dirty_data_max
+ * | |
+ * | `-- zfs_vdev_async_write_active_max_dirty_percent
+ * `--------- zfs_vdev_async_write_active_min_dirty_percent
+ *
+ * Until the amount of dirty data exceeds a minimum percentage of the dirty
+ * data allowed in the pool, the I/O scheduler will limit the number of
+ * concurrent operations to the minimum. As that threshold is crossed, the
+ * number of concurrent operations issued increases linearly to the maximum at
+ * the specified maximum percentage of the dirty data allowed in the pool.
+ *
+ * Ideally, the amount of dirty data on a busy pool will stay in the sloped
+ * part of the function between zfs_vdev_async_write_active_min_dirty_percent
+ * and zfs_vdev_async_write_active_max_dirty_percent. If it exceeds the
+ * maximum percentage, this indicates that the rate of incoming data is
+ * greater than the rate that the backend storage can handle. In this case, we
+ * must further throttle incoming writes (see dmu_tx_delay() for details).
*/
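The async-write curve in the diagram is a clamped linear interpolation between the minimum and maximum active counts, driven by how much dirty data the pool holds relative to the two percentage thresholds. The stand-alone sketch below uses the default values declared further down in this diff (1 and 10 writers, 30% and 60% dirty) to make the shape explicit; the function name is illustrative only.

#include <stdint.h>
#include <stdio.h>

/* Defaults declared later in this file. */
static const uint32_t wr_min_active = 1;
static const uint32_t wr_max_active = 10;
static const int min_dirty_pct = 30;
static const int max_dirty_pct = 60;

/* Clamped linear interpolation from dirty percentage to active writers. */
static uint32_t
async_write_active(int dirty_pct)
{
        if (dirty_pct <= min_dirty_pct)
                return (wr_min_active);
        if (dirty_pct >= max_dirty_pct)
                return (wr_max_active);
        return (wr_min_active + (uint32_t)((dirty_pct - min_dirty_pct) *
            (int)(wr_max_active - wr_min_active) /
            (max_dirty_pct - min_dirty_pct)));
}

int
main(void)
{
        for (int pct = 0; pct <= 100; pct += 10)
                printf("%3d%% dirty -> %u async writers\n", pct,
                    async_write_active(pct));
        return (0);
}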
-/* The maximum number of I/Os concurrently pending to each device. */
-int zfs_vdev_max_pending = 10;
-
/*
- * The initial number of I/Os pending to each device, before it starts ramping
- * up to zfs_vdev_max_pending.
+ * The maximum number of I/Os active to each device. Ideally, this will be >=
+ * the sum of each queue's max_active. It must be at least the sum of each
+ * queue's min_active.
*/
-int zfs_vdev_min_pending = 4;
+uint32_t zfs_vdev_max_active = 1000;
/*
- * The deadlines are grouped into buckets based on zfs_vdev_time_shift:
- * deadline = pri + gethrtime() >> time_shift)
+ * Per-queue limits on the number of I/Os active to each device. If the
+ * sum of the queue's max_active is < zfs_vdev_max_active, then the
+ * min_active comes into play. We will send min_active from each queue,
+ * and then select from queues in the order defined by zio_priority_t.
+ *
+ * In general, smaller max_active's will lead to lower latency of synchronous
+ * operations. Larger max_active's may lead to higher overall throughput,
+ * depending on underlying storage.
+ *
+ * The ratio of the queues' max_actives determines the balance of performance
+ * between reads, writes, and scrubs. E.g., increasing
+ * zfs_vdev_scrub_max_active will cause the scrub or resilver to complete
+ * more quickly, but reads and writes to have higher latency and lower
+ * throughput.
*/
-int zfs_vdev_time_shift = 29; /* each bucket is 0.537 seconds */
+uint32_t zfs_vdev_sync_read_min_active = 10;
+uint32_t zfs_vdev_sync_read_max_active = 10;
+uint32_t zfs_vdev_sync_write_min_active = 10;
+uint32_t zfs_vdev_sync_write_max_active = 10;
+uint32_t zfs_vdev_async_read_min_active = 1;
+uint32_t zfs_vdev_async_read_max_active = 3;
+uint32_t zfs_vdev_async_write_min_active = 1;
+uint32_t zfs_vdev_async_write_max_active = 10;
+uint32_t zfs_vdev_scrub_min_active = 1;
+uint32_t zfs_vdev_scrub_max_active = 2;
+uint32_t zfs_vdev_trim_min_active = 1;
+/*
+ * TRIM max active is large in comparison to the other values because
+ * TRIM I/Os are coalesced at the device layer. This value is set such
+ * that a typical SSD can process the queued I/Os in a single request.
+ */
+uint32_t zfs_vdev_trim_max_active = 64;
-/* exponential I/O issue ramp-up rate */
-int zfs_vdev_ramp_rate = 2;
/*
+ * When the pool has less than zfs_vdev_async_write_active_min_dirty_percent
+ * dirty data, use zfs_vdev_async_write_min_active. When it has more than
+ * zfs_vdev_async_write_active_max_dirty_percent, use
+ * zfs_vdev_async_write_max_active. The value is linearly interpolated
+ * between min and max.
+ */
+int zfs_vdev_async_write_active_min_dirty_percent = 30;
+int zfs_vdev_async_write_active_max_dirty_percent = 60;
+
+/*
* To reduce IOPs, we aggregate small adjacent I/Os into one large I/O.
* For read I/Os, we also aggregate across small adjacency gaps; for writes
* we include spans of optional I/Os to aid aggregation at the disk even when
* they aren't able to help us aggregate at this level.
*/
-int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE;
+int zfs_vdev_aggregation_limit = SPA_OLD_MAXBLOCKSIZE;
int zfs_vdev_read_gap_limit = 32 << 10;
int zfs_vdev_write_gap_limit = 4 << 10;
+/*
+ * Define the queue depth percentage for each top-level vdev. This
+ * percentage is used in conjunction with zfs_vdev_async_write_max_active
+ * to determine how many allocations a specific top-level vdev should
+ * handle. Once the queue depth reaches
+ * zfs_vdev_queue_depth_pct * zfs_vdev_async_write_max_active / 100, the
+ * allocator will stop allocating blocks on that top-level device.
+ * The default kernel setting is 1000%, which yields 100 allocations per
+ * device. For userland testing, the default setting is 300%, which equates
+ * to 30 allocations per device.
+ */
+#ifdef _KERNEL
+int zfs_vdev_queue_depth_pct = 1000;
+#else
+int zfs_vdev_queue_depth_pct = 300;
+#endif
+
+
+#ifdef __FreeBSD__
+#ifdef _KERNEL
SYSCTL_DECL(_vfs_zfs_vdev);
-TUNABLE_INT("vfs.zfs.vdev.max_pending", &zfs_vdev_max_pending);
-SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_pending, CTLFLAG_RW,
- &zfs_vdev_max_pending, 0, "Maximum I/O requests pending on each device");
-TUNABLE_INT("vfs.zfs.vdev.min_pending", &zfs_vdev_min_pending);
-SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, min_pending, CTLFLAG_RW,
- &zfs_vdev_min_pending, 0,
- "Initial number of I/O requests pending to each device");
-TUNABLE_INT("vfs.zfs.vdev.time_shift", &zfs_vdev_time_shift);
-SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, time_shift, CTLFLAG_RW,
- &zfs_vdev_time_shift, 0, "Used for calculating I/O request deadline");
-TUNABLE_INT("vfs.zfs.vdev.ramp_rate", &zfs_vdev_ramp_rate);
-SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, ramp_rate, CTLFLAG_RW,
- &zfs_vdev_ramp_rate, 0, "Exponential I/O issue ramp-up rate");
+
+TUNABLE_INT("vfs.zfs.vdev.async_write_active_min_dirty_percent",
+ &zfs_vdev_async_write_active_min_dirty_percent);
+static int sysctl_zfs_async_write_active_min_dirty_percent(SYSCTL_HANDLER_ARGS);
+SYSCTL_PROC(_vfs_zfs_vdev, OID_AUTO, async_write_active_min_dirty_percent,
+ CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int),
+ sysctl_zfs_async_write_active_min_dirty_percent, "I",
+ "Percentage of async write dirty data below which "
+ "async_write_min_active is used.");
+
+TUNABLE_INT("vfs.zfs.vdev.async_write_active_max_dirty_percent",
+ &zfs_vdev_async_write_active_max_dirty_percent);
+static int sysctl_zfs_async_write_active_max_dirty_percent(SYSCTL_HANDLER_ARGS);
+SYSCTL_PROC(_vfs_zfs_vdev, OID_AUTO, async_write_active_max_dirty_percent,
+ CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int),
+ sysctl_zfs_async_write_active_max_dirty_percent, "I",
+ "Percentage of async write dirty data above which "
+ "async_write_max_active is used.");
+
+TUNABLE_INT("vfs.zfs.vdev.max_active", &zfs_vdev_max_active);
+SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, max_active, CTLFLAG_RWTUN,
+ &zfs_vdev_max_active, 0,
+ "The maximum number of I/Os of all types active for each device.");
+
+#define ZFS_VDEV_QUEUE_KNOB_MIN(name) \
+TUNABLE_INT("vfs.zfs.vdev." #name "_min_active", \
+ &zfs_vdev_ ## name ## _min_active); \
+SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, name ## _min_active, \
+ CTLFLAG_RWTUN, &zfs_vdev_ ## name ## _min_active, 0, \
+ "Initial number of I/O requests of type " #name \
+ " active for each device");
+
+#define ZFS_VDEV_QUEUE_KNOB_MAX(name) \
+TUNABLE_INT("vfs.zfs.vdev." #name "_max_active", \
+ &zfs_vdev_ ## name ## _max_active); \
+SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, name ## _max_active, \
+ CTLFLAG_RWTUN, &zfs_vdev_ ## name ## _max_active, 0, \
+ "Maximum number of I/O requests of type " #name \
+ " active for each device");
+
+ZFS_VDEV_QUEUE_KNOB_MIN(sync_read);
+ZFS_VDEV_QUEUE_KNOB_MAX(sync_read);
+ZFS_VDEV_QUEUE_KNOB_MIN(sync_write);
+ZFS_VDEV_QUEUE_KNOB_MAX(sync_write);
+ZFS_VDEV_QUEUE_KNOB_MIN(async_read);
+ZFS_VDEV_QUEUE_KNOB_MAX(async_read);
+ZFS_VDEV_QUEUE_KNOB_MIN(async_write);
+ZFS_VDEV_QUEUE_KNOB_MAX(async_write);
+ZFS_VDEV_QUEUE_KNOB_MIN(scrub);
+ZFS_VDEV_QUEUE_KNOB_MAX(scrub);
+ZFS_VDEV_QUEUE_KNOB_MIN(trim);
+ZFS_VDEV_QUEUE_KNOB_MAX(trim);
+
+#undef ZFS_VDEV_QUEUE_KNOB
+
TUNABLE_INT("vfs.zfs.vdev.aggregation_limit", &zfs_vdev_aggregation_limit);
-SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, aggregation_limit, CTLFLAG_RW,
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, aggregation_limit, CTLFLAG_RWTUN,
&zfs_vdev_aggregation_limit, 0,
"I/O requests are aggregated up to this size");
TUNABLE_INT("vfs.zfs.vdev.read_gap_limit", &zfs_vdev_read_gap_limit);
-SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, read_gap_limit, CTLFLAG_RW,
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, read_gap_limit, CTLFLAG_RWTUN,
&zfs_vdev_read_gap_limit, 0,
"Acceptable gap between two reads being aggregated");
TUNABLE_INT("vfs.zfs.vdev.write_gap_limit", &zfs_vdev_write_gap_limit);
-SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, write_gap_limit, CTLFLAG_RW,
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, write_gap_limit, CTLFLAG_RWTUN,
&zfs_vdev_write_gap_limit, 0,
"Acceptable gap between two writes being aggregated");
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, queue_depth_pct, CTLFLAG_RWTUN,
+ &zfs_vdev_queue_depth_pct, 0,
+ "Queue depth percentage for each top-level");
-/*
- * Virtual device vector for disk I/O scheduling.
- */
+static int
+sysctl_zfs_async_write_active_min_dirty_percent(SYSCTL_HANDLER_ARGS)
+{
+ int val, err;
+
+ val = zfs_vdev_async_write_active_min_dirty_percent;
+ err = sysctl_handle_int(oidp, &val, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+
+ if (val < 0 || val > 100 ||
+ val >= zfs_vdev_async_write_active_max_dirty_percent)
+ return (EINVAL);
+
+ zfs_vdev_async_write_active_min_dirty_percent = val;
+
+ return (0);
+}
+
+static int
+sysctl_zfs_async_write_active_max_dirty_percent(SYSCTL_HANDLER_ARGS)
+{
+ int val, err;
+
+ val = zfs_vdev_async_write_active_max_dirty_percent;
+ err = sysctl_handle_int(oidp, &val, 0, req);
+ if (err != 0 || req->newptr == NULL)
+ return (err);
+
+ if (val < 0 || val > 100 ||
+ val <= zfs_vdev_async_write_active_min_dirty_percent)
+ return (EINVAL);
+
+ zfs_vdev_async_write_active_max_dirty_percent = val;
+
+ return (0);
+}
+#endif
+#endif
+
int
-vdev_queue_deadline_compare(const void *x1, const void *x2)
+vdev_queue_offset_compare(const void *x1, const void *x2)
{
const zio_t *z1 = x1;
const zio_t *z2 = x2;
- if (z1->io_deadline < z2->io_deadline)
- return (-1);
- if (z1->io_deadline > z2->io_deadline)
- return (1);
-
if (z1->io_offset < z2->io_offset)
return (-1);
if (z1->io_offset > z2->io_offset)
@@ -119,12 +327,34 @@
return (0);
}
+static inline avl_tree_t *
+vdev_queue_class_tree(vdev_queue_t *vq, zio_priority_t p)
+{
+ return (&vq->vq_class[p].vqc_queued_tree);
+}
+
+static inline avl_tree_t *
+vdev_queue_type_tree(vdev_queue_t *vq, zio_type_t t)
+{
+ if (t == ZIO_TYPE_READ)
+ return (&vq->vq_read_offset_tree);
+ else if (t == ZIO_TYPE_WRITE)
+ return (&vq->vq_write_offset_tree);
+ else
+ return (NULL);
+}
+
int
-vdev_queue_offset_compare(const void *x1, const void *x2)
+vdev_queue_timestamp_compare(const void *x1, const void *x2)
{
const zio_t *z1 = x1;
const zio_t *z2 = x2;
+ if (z1->io_timestamp < z2->io_timestamp)
+ return (-1);
+ if (z1->io_timestamp > z2->io_timestamp)
+ return (1);
+
if (z1->io_offset < z2->io_offset)
return (-1);
if (z1->io_offset > z2->io_offset)
@@ -144,18 +374,35 @@
vdev_queue_t *vq = &vd->vdev_queue;
mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
+ vq->vq_vdev = vd;
- avl_create(&vq->vq_deadline_tree, vdev_queue_deadline_compare,
- sizeof (zio_t), offsetof(struct zio, io_deadline_node));
+ avl_create(&vq->vq_active_tree, vdev_queue_offset_compare,
+ sizeof (zio_t), offsetof(struct zio, io_queue_node));
+ avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_READ),
+ vdev_queue_offset_compare, sizeof (zio_t),
+ offsetof(struct zio, io_offset_node));
+ avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE),
+ vdev_queue_offset_compare, sizeof (zio_t),
+ offsetof(struct zio, io_offset_node));
- avl_create(&vq->vq_read_tree, vdev_queue_offset_compare,
- sizeof (zio_t), offsetof(struct zio, io_offset_node));
+ for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
+ int (*compfn) (const void *, const void *);
- avl_create(&vq->vq_write_tree, vdev_queue_offset_compare,
- sizeof (zio_t), offsetof(struct zio, io_offset_node));
+ /*
+ * The synchronous i/o queues are dispatched in FIFO rather
+ * than LBA order. This provides more consistent latency for
+ * these i/os.
+ */
+ if (p == ZIO_PRIORITY_SYNC_READ || p == ZIO_PRIORITY_SYNC_WRITE)
+ compfn = vdev_queue_timestamp_compare;
+ else
+ compfn = vdev_queue_offset_compare;
- avl_create(&vq->vq_pending_tree, vdev_queue_offset_compare,
- sizeof (zio_t), offsetof(struct zio, io_offset_node));
+ avl_create(vdev_queue_class_tree(vq, p), compfn,
+ sizeof (zio_t), offsetof(struct zio, io_queue_node));
+ }
+
+ vq->vq_lastoffset = 0;
}
void
@@ -163,10 +410,11 @@
{
vdev_queue_t *vq = &vd->vdev_queue;
- avl_destroy(&vq->vq_deadline_tree);
- avl_destroy(&vq->vq_read_tree);
- avl_destroy(&vq->vq_write_tree);
- avl_destroy(&vq->vq_pending_tree);
+ for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++)
+ avl_destroy(vdev_queue_class_tree(vq, p));
+ avl_destroy(&vq->vq_active_tree);
+ avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_READ));
+ avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE));
mutex_destroy(&vq->vq_lock);
}
@@ -174,31 +422,232 @@
static void
vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
{
- avl_add(&vq->vq_deadline_tree, zio);
- avl_add(zio->io_vdev_tree, zio);
+ spa_t *spa = zio->io_spa;
+ avl_tree_t *qtt;
+
+ ASSERT(MUTEX_HELD(&vq->vq_lock));
+ ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+ avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
+ qtt = vdev_queue_type_tree(vq, zio->io_type);
+ if (qtt)
+ avl_add(qtt, zio);
+
+#ifdef illumos
+ mutex_enter(&spa->spa_iokstat_lock);
+ spa->spa_queue_stats[zio->io_priority].spa_queued++;
+ if (spa->spa_iokstat != NULL)
+ kstat_waitq_enter(spa->spa_iokstat->ks_data);
+ mutex_exit(&spa->spa_iokstat_lock);
+#endif
}
static void
vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
{
- avl_remove(&vq->vq_deadline_tree, zio);
- avl_remove(zio->io_vdev_tree, zio);
+ spa_t *spa = zio->io_spa;
+ avl_tree_t *qtt;
+
+ ASSERT(MUTEX_HELD(&vq->vq_lock));
+ ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+ avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
+ qtt = vdev_queue_type_tree(vq, zio->io_type);
+ if (qtt)
+ avl_remove(qtt, zio);
+
+#ifdef illumos
+ mutex_enter(&spa->spa_iokstat_lock);
+ ASSERT3U(spa->spa_queue_stats[zio->io_priority].spa_queued, >, 0);
+ spa->spa_queue_stats[zio->io_priority].spa_queued--;
+ if (spa->spa_iokstat != NULL)
+ kstat_waitq_exit(spa->spa_iokstat->ks_data);
+ mutex_exit(&spa->spa_iokstat_lock);
+#endif
}
static void
+vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+ ASSERT(MUTEX_HELD(&vq->vq_lock));
+ ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+ vq->vq_class[zio->io_priority].vqc_active++;
+ avl_add(&vq->vq_active_tree, zio);
+
+#ifdef illumos
+ mutex_enter(&spa->spa_iokstat_lock);
+ spa->spa_queue_stats[zio->io_priority].spa_active++;
+ if (spa->spa_iokstat != NULL)
+ kstat_runq_enter(spa->spa_iokstat->ks_data);
+ mutex_exit(&spa->spa_iokstat_lock);
+#endif
+}
+
+static void
+vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+ ASSERT(MUTEX_HELD(&vq->vq_lock));
+ ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
+ vq->vq_class[zio->io_priority].vqc_active--;
+ avl_remove(&vq->vq_active_tree, zio);
+
+#ifdef illumos
+ mutex_enter(&spa->spa_iokstat_lock);
+ ASSERT3U(spa->spa_queue_stats[zio->io_priority].spa_active, >, 0);
+ spa->spa_queue_stats[zio->io_priority].spa_active--;
+ if (spa->spa_iokstat != NULL) {
+ kstat_io_t *ksio = spa->spa_iokstat->ks_data;
+
+ kstat_runq_exit(spa->spa_iokstat->ks_data);
+ if (zio->io_type == ZIO_TYPE_READ) {
+ ksio->reads++;
+ ksio->nread += zio->io_size;
+ } else if (zio->io_type == ZIO_TYPE_WRITE) {
+ ksio->writes++;
+ ksio->nwritten += zio->io_size;
+ }
+ }
+ mutex_exit(&spa->spa_iokstat_lock);
+#endif
+}
+
+static void
vdev_queue_agg_io_done(zio_t *aio)
{
- zio_t *pio;
-
- while ((pio = zio_walk_parents(aio)) != NULL)
- if (aio->io_type == ZIO_TYPE_READ)
+ if (aio->io_type == ZIO_TYPE_READ) {
+ zio_t *pio;
+ zio_link_t *zl = NULL;
+ while ((pio = zio_walk_parents(aio, &zl)) != NULL) {
bcopy((char *)aio->io_data + (pio->io_offset -
aio->io_offset), pio->io_data, pio->io_size);
+ }
+ }
zio_buf_free(aio->io_data, aio->io_size);
}
+static int
+vdev_queue_class_min_active(zio_priority_t p)
+{
+ switch (p) {
+ case ZIO_PRIORITY_SYNC_READ:
+ return (zfs_vdev_sync_read_min_active);
+ case ZIO_PRIORITY_SYNC_WRITE:
+ return (zfs_vdev_sync_write_min_active);
+ case ZIO_PRIORITY_ASYNC_READ:
+ return (zfs_vdev_async_read_min_active);
+ case ZIO_PRIORITY_ASYNC_WRITE:
+ return (zfs_vdev_async_write_min_active);
+ case ZIO_PRIORITY_SCRUB:
+ return (zfs_vdev_scrub_min_active);
+ case ZIO_PRIORITY_TRIM:
+ return (zfs_vdev_trim_min_active);
+ default:
+ panic("invalid priority %u", p);
+ return (0);
+ }
+}
+
+static __noinline int
+vdev_queue_max_async_writes(spa_t *spa)
+{
+ int writes;
+ uint64_t dirty = spa->spa_dsl_pool->dp_dirty_total;
+ uint64_t min_bytes = zfs_dirty_data_max *
+ zfs_vdev_async_write_active_min_dirty_percent / 100;
+ uint64_t max_bytes = zfs_dirty_data_max *
+ zfs_vdev_async_write_active_max_dirty_percent / 100;
+
+ /*
+ * Sync tasks correspond to interactive user actions. To reduce the
+ * execution time of those actions we push data out as fast as possible.
+ */
+ if (spa_has_pending_synctask(spa)) {
+ return (zfs_vdev_async_write_max_active);
+ }
+
+ if (dirty < min_bytes)
+ return (zfs_vdev_async_write_min_active);
+ if (dirty > max_bytes)
+ return (zfs_vdev_async_write_max_active);
+
+ /*
+ * linear interpolation:
+ * slope = (max_writes - min_writes) / (max_bytes - min_bytes)
+ * move right by min_bytes
+ * move up by min_writes
+ */
+ writes = (dirty - min_bytes) *
+ (zfs_vdev_async_write_max_active -
+ zfs_vdev_async_write_min_active) /
+ (max_bytes - min_bytes) +
+ zfs_vdev_async_write_min_active;
+ ASSERT3U(writes, >=, zfs_vdev_async_write_min_active);
+ ASSERT3U(writes, <=, zfs_vdev_async_write_max_active);
+ return (writes);
+}
+
+static int
+vdev_queue_class_max_active(spa_t *spa, zio_priority_t p)
+{
+ switch (p) {
+ case ZIO_PRIORITY_SYNC_READ:
+ return (zfs_vdev_sync_read_max_active);
+ case ZIO_PRIORITY_SYNC_WRITE:
+ return (zfs_vdev_sync_write_max_active);
+ case ZIO_PRIORITY_ASYNC_READ:
+ return (zfs_vdev_async_read_max_active);
+ case ZIO_PRIORITY_ASYNC_WRITE:
+ return (vdev_queue_max_async_writes(spa));
+ case ZIO_PRIORITY_SCRUB:
+ return (zfs_vdev_scrub_max_active);
+ case ZIO_PRIORITY_TRIM:
+ return (zfs_vdev_trim_max_active);
+ default:
+ panic("invalid priority %u", p);
+ return (0);
+ }
+}
+
/*
+ * Return the i/o class to issue from, or ZIO_PRIORITY_MAX_QUEUEABLE if
+ * there is no eligible class.
+ */
+static zio_priority_t
+vdev_queue_class_to_issue(vdev_queue_t *vq)
+{
+ spa_t *spa = vq->vq_vdev->vdev_spa;
+ zio_priority_t p;
+
+ ASSERT(MUTEX_HELD(&vq->vq_lock));
+
+ if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active)
+ return (ZIO_PRIORITY_NUM_QUEUEABLE);
+
+ /* find a queue that has not reached its minimum # outstanding i/os */
+ for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
+ if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
+ vq->vq_class[p].vqc_active <
+ vdev_queue_class_min_active(p))
+ return (p);
+ }
+
+ /*
+ * If we haven't found a queue, look for one that hasn't reached its
+ * maximum # outstanding i/os.
+ */
+ for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
+ if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
+ vq->vq_class[p].vqc_active <
+ vdev_queue_class_max_active(spa, p))
+ return (p);
+ }
+
+ /* No eligible queued i/os */
+ return (ZIO_PRIORITY_NUM_QUEUEABLE);
+}
+
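The class-selection policy above makes two passes over the priorities: first fill every class up to its minimum, then, budget permitting, up to its maximum. A reduced model of that two-pass scan over a handful of classes is sketched below; the arrays stand in for the per-class AVL trees and active counters, and the aggregate check plays the role of the zfs_vdev_max_active test against vq_active_tree.

#include <stdio.h>

#define NCLASSES        4       /* sync read, sync write, async read, async write */

static const int min_active[NCLASSES] = { 10, 10, 1, 1 };
static const int max_active[NCLASSES] = { 10, 10, 3, 10 };
static const int aggregate_max = 1000;

/* Return the class to issue from, or -1 if nothing is eligible. */
static int
class_to_issue(const int queued[], const int active[], int total_active)
{
        int p;

        if (total_active >= aggregate_max)
                return (-1);

        /* Pass 1: a class that has not yet reached its minimum. */
        for (p = 0; p < NCLASSES; p++)
                if (queued[p] > 0 && active[p] < min_active[p])
                        return (p);

        /* Pass 2: a class that has not yet reached its maximum. */
        for (p = 0; p < NCLASSES; p++)
                if (queued[p] > 0 && active[p] < max_active[p])
                        return (p);

        return (-1);
}

int
main(void)
{
        int queued[NCLASSES] = { 0, 0, 5, 50 }; /* only async work pending */
        int active[NCLASSES] = { 0, 0, 3, 2 };  /* async reads at their max */

        printf("next class: %d\n", class_to_issue(queued, active, 5));
        return (0);
}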
+/*
* Compute the range spanned by two i/os, which is the endpoint of the last
* (lio->io_offset + lio->io_size) minus start of the first (fio->io_offset).
* Conveniently, the gap between fio and lio is given by -IO_SPAN(lio, fio);
@@ -208,155 +657,193 @@
#define IO_GAP(fio, lio) (-IO_SPAN(lio, fio))
static zio_t *
-vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
+vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
{
- zio_t *fio, *lio, *aio, *dio, *nio, *mio;
+ zio_t *first, *last, *aio, *dio, *mandatory, *nio;
+ void *abuf;
+ uint64_t maxgap = 0;
+ uint64_t size;
+ boolean_t stretch;
avl_tree_t *t;
- int flags;
- uint64_t maxspan = zfs_vdev_aggregation_limit;
- uint64_t maxgap;
- int stretch;
+ enum zio_flag flags;
-again:
ASSERT(MUTEX_HELD(&vq->vq_lock));
- if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit ||
- avl_numnodes(&vq->vq_deadline_tree) == 0)
+ if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE)
return (NULL);
- fio = lio = avl_first(&vq->vq_deadline_tree);
+ first = last = zio;
- t = fio->io_vdev_tree;
- flags = fio->io_flags & ZIO_FLAG_AGG_INHERIT;
- maxgap = (t == &vq->vq_read_tree) ? zfs_vdev_read_gap_limit : 0;
+ if (zio->io_type == ZIO_TYPE_READ)
+ maxgap = zfs_vdev_read_gap_limit;
- if (!(flags & ZIO_FLAG_DONT_AGGREGATE)) {
- /*
- * We can aggregate I/Os that are sufficiently adjacent and of
- * the same flavor, as expressed by the AGG_INHERIT flags.
- * The latter requirement is necessary so that certain
- * attributes of the I/O, such as whether it's a normal I/O
- * or a scrub/resilver, can be preserved in the aggregate.
- * We can include optional I/Os, but don't allow them
- * to begin a range as they add no benefit in that situation.
- */
+ /*
+ * We can aggregate I/Os that are sufficiently adjacent and of
+ * the same flavor, as expressed by the AGG_INHERIT flags.
+ * The latter requirement is necessary so that certain
+ * attributes of the I/O, such as whether it's a normal I/O
+ * or a scrub/resilver, can be preserved in the aggregate.
+ * We can include optional I/Os, but don't allow them
+ * to begin a range as they add no benefit in that situation.
+ */
- /*
- * We keep track of the last non-optional I/O.
- */
- mio = (fio->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : fio;
+ /*
+ * We keep track of the last non-optional I/O.
+ */
+ mandatory = (first->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : first;
- /*
- * Walk backwards through sufficiently contiguous I/Os
- * recording the last non-option I/O.
- */
- while ((dio = AVL_PREV(t, fio)) != NULL &&
- (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
- IO_SPAN(dio, lio) <= maxspan &&
- IO_GAP(dio, fio) <= maxgap) {
- fio = dio;
- if (mio == NULL && !(fio->io_flags & ZIO_FLAG_OPTIONAL))
- mio = fio;
- }
+ /*
+ * Walk backwards through sufficiently contiguous I/Os
+ * recording the last non-optional I/O.
+ */
+ flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;
+ t = vdev_queue_type_tree(vq, zio->io_type);
+ while (t != NULL && (dio = AVL_PREV(t, first)) != NULL &&
+ (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
+ IO_SPAN(dio, last) <= zfs_vdev_aggregation_limit &&
+ IO_GAP(dio, first) <= maxgap) {
+ first = dio;
+ if (mandatory == NULL && !(first->io_flags & ZIO_FLAG_OPTIONAL))
+ mandatory = first;
+ }
- /*
- * Skip any initial optional I/Os.
- */
- while ((fio->io_flags & ZIO_FLAG_OPTIONAL) && fio != lio) {
- fio = AVL_NEXT(t, fio);
- ASSERT(fio != NULL);
- }
+ /*
+ * Skip any initial optional I/Os.
+ */
+ while ((first->io_flags & ZIO_FLAG_OPTIONAL) && first != last) {
+ first = AVL_NEXT(t, first);
+ ASSERT(first != NULL);
+ }
- /*
- * Walk forward through sufficiently contiguous I/Os.
- */
- while ((dio = AVL_NEXT(t, lio)) != NULL &&
- (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
- IO_SPAN(fio, dio) <= maxspan &&
- IO_GAP(lio, dio) <= maxgap) {
- lio = dio;
- if (!(lio->io_flags & ZIO_FLAG_OPTIONAL))
- mio = lio;
- }
+ /*
+ * Walk forward through sufficiently contiguous I/Os.
+ */
+ while ((dio = AVL_NEXT(t, last)) != NULL &&
+ (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
+ IO_SPAN(first, dio) <= zfs_vdev_aggregation_limit &&
+ IO_GAP(last, dio) <= maxgap) {
+ last = dio;
+ if (!(last->io_flags & ZIO_FLAG_OPTIONAL))
+ mandatory = last;
+ }
- /*
- * Now that we've established the range of the I/O aggregation
- * we must decide what to do with trailing optional I/Os.
- * For reads, there's nothing to do. While we are unable to
- * aggregate further, it's possible that a trailing optional
- * I/O would allow the underlying device to aggregate with
- * subsequent I/Os. We must therefore determine if the next
- * non-optional I/O is close enough to make aggregation
- * worthwhile.
- */
- stretch = B_FALSE;
- if (t != &vq->vq_read_tree && mio != NULL) {
- nio = lio;
- while ((dio = AVL_NEXT(t, nio)) != NULL &&
- IO_GAP(nio, dio) == 0 &&
- IO_GAP(mio, dio) <= zfs_vdev_write_gap_limit) {
- nio = dio;
- if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) {
- stretch = B_TRUE;
- break;
- }
+ /*
+ * Now that we've established the range of the I/O aggregation
+ * we must decide what to do with trailing optional I/Os.
+ * For reads, there's nothing to do. While we are unable to
+ * aggregate further, it's possible that a trailing optional
+ * I/O would allow the underlying device to aggregate with
+ * subsequent I/Os. We must therefore determine if the next
+ * non-optional I/O is close enough to make aggregation
+ * worthwhile.
+ */
+ stretch = B_FALSE;
+ if (zio->io_type == ZIO_TYPE_WRITE && mandatory != NULL) {
+ zio_t *nio = last;
+ while ((dio = AVL_NEXT(t, nio)) != NULL &&
+ IO_GAP(nio, dio) == 0 &&
+ IO_GAP(mandatory, dio) <= zfs_vdev_write_gap_limit) {
+ nio = dio;
+ if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) {
+ stretch = B_TRUE;
+ break;
}
}
+ }
- if (stretch) {
- /* This may be a no-op. */
- VERIFY((dio = AVL_NEXT(t, lio)) != NULL);
- dio->io_flags &= ~ZIO_FLAG_OPTIONAL;
- } else {
- while (lio != mio && lio != fio) {
- ASSERT(lio->io_flags & ZIO_FLAG_OPTIONAL);
- lio = AVL_PREV(t, lio);
- ASSERT(lio != NULL);
- }
+ if (stretch) {
+ /* This may be a no-op. */
+ dio = AVL_NEXT(t, last);
+ dio->io_flags &= ~ZIO_FLAG_OPTIONAL;
+ } else {
+ while (last != mandatory && last != first) {
+ ASSERT(last->io_flags & ZIO_FLAG_OPTIONAL);
+ last = AVL_PREV(t, last);
+ ASSERT(last != NULL);
}
}
- if (fio != lio) {
- uint64_t size = IO_SPAN(fio, lio);
- ASSERT(size <= zfs_vdev_aggregation_limit);
+ if (first == last)
+ return (NULL);
- aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset,
- zio_buf_alloc(size), size, fio->io_type, ZIO_PRIORITY_AGG,
- flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
- vdev_queue_agg_io_done, NULL);
- aio->io_timestamp = fio->io_timestamp;
+ size = IO_SPAN(first, last);
+ ASSERT3U(size, <=, zfs_vdev_aggregation_limit);
- nio = fio;
- do {
- dio = nio;
- nio = AVL_NEXT(t, dio);
- ASSERT(dio->io_type == aio->io_type);
- ASSERT(dio->io_vdev_tree == t);
+ abuf = zio_buf_alloc_nowait(size);
+ if (abuf == NULL)
+ return (NULL);
- if (dio->io_flags & ZIO_FLAG_NODATA) {
- ASSERT(dio->io_type == ZIO_TYPE_WRITE);
- bzero((char *)aio->io_data + (dio->io_offset -
- aio->io_offset), dio->io_size);
- } else if (dio->io_type == ZIO_TYPE_WRITE) {
- bcopy(dio->io_data, (char *)aio->io_data +
- (dio->io_offset - aio->io_offset),
- dio->io_size);
- }
+ aio = zio_vdev_delegated_io(first->io_vd, first->io_offset,
+ abuf, size, first->io_type, zio->io_priority,
+ flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
+ vdev_queue_agg_io_done, NULL);
+ aio->io_timestamp = first->io_timestamp;
- zio_add_child(dio, aio);
- vdev_queue_io_remove(vq, dio);
- zio_vdev_io_bypass(dio);
- zio_execute(dio);
- } while (dio != lio);
+ nio = first;
+ do {
+ dio = nio;
+ nio = AVL_NEXT(t, dio);
+ ASSERT3U(dio->io_type, ==, aio->io_type);
- avl_add(&vq->vq_pending_tree, aio);
+ if (dio->io_flags & ZIO_FLAG_NODATA) {
+ ASSERT3U(dio->io_type, ==, ZIO_TYPE_WRITE);
+ bzero((char *)aio->io_data + (dio->io_offset -
+ aio->io_offset), dio->io_size);
+ } else if (dio->io_type == ZIO_TYPE_WRITE) {
+ bcopy(dio->io_data, (char *)aio->io_data +
+ (dio->io_offset - aio->io_offset),
+ dio->io_size);
+ }
- return (aio);
+ zio_add_child(dio, aio);
+ vdev_queue_io_remove(vq, dio);
+ zio_vdev_io_bypass(dio);
+ zio_execute(dio);
+ } while (dio != last);
+
+ return (aio);
+}
+
+static zio_t *
+vdev_queue_io_to_issue(vdev_queue_t *vq)
+{
+ zio_t *zio, *aio;
+ zio_priority_t p;
+ avl_index_t idx;
+ avl_tree_t *tree;
+ zio_t search;
+
+again:
+ ASSERT(MUTEX_HELD(&vq->vq_lock));
+
+ p = vdev_queue_class_to_issue(vq);
+
+ if (p == ZIO_PRIORITY_NUM_QUEUEABLE) {
+ /* No eligible queued i/os */
+ return (NULL);
}
- ASSERT(fio->io_vdev_tree == t);
- vdev_queue_io_remove(vq, fio);
+ /*
+ * For LBA-ordered queues (async / scrub), issue the i/o which follows
+ * the most recently issued i/o in LBA (offset) order.
+ *
+ * For FIFO queues (sync), issue the i/o with the lowest timestamp.
+ */
+ tree = vdev_queue_class_tree(vq, p);
+ search.io_timestamp = 0;
+ search.io_offset = vq->vq_last_offset + 1;
+ VERIFY3P(avl_find(tree, &search, &idx), ==, NULL);
+ zio = avl_nearest(tree, idx, AVL_AFTER);
+ if (zio == NULL)
+ zio = avl_first(tree);
+ ASSERT3U(zio->io_priority, ==, p);
+ aio = vdev_queue_aggregate(vq, zio);
+ if (aio != NULL)
+ zio = aio;
+ else
+ vdev_queue_io_remove(vq, zio);
+
/*
* If the I/O is or was optional and therefore has no data, we need to
* simply discard it. We need to drop the vdev queue's lock to avoid a
@@ -363,17 +850,18 @@
* deadlock that we could encounter since this I/O will complete
* immediately.
*/
- if (fio->io_flags & ZIO_FLAG_NODATA) {
+ if (zio->io_flags & ZIO_FLAG_NODATA) {
mutex_exit(&vq->vq_lock);
- zio_vdev_io_bypass(fio);
- zio_execute(fio);
+ zio_vdev_io_bypass(zio);
+ zio_execute(zio);
mutex_enter(&vq->vq_lock);
goto again;
}
- avl_add(&vq->vq_pending_tree, fio);
+ vdev_queue_pending_add(vq, zio);
+ vq->vq_last_offset = zio->io_offset;
- return (fio);
+ return (zio);
}
zio_t *
@@ -382,28 +870,33 @@
vdev_queue_t *vq = &zio->io_vd->vdev_queue;
zio_t *nio;
- ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
-
if (zio->io_flags & ZIO_FLAG_DONT_QUEUE)
return (zio);
+ /*
+ * Children i/os inherit their parent's priority, which might
+ * not match the child's i/o type. Fix it up here.
+ */
+ if (zio->io_type == ZIO_TYPE_READ) {
+ if (zio->io_priority != ZIO_PRIORITY_SYNC_READ &&
+ zio->io_priority != ZIO_PRIORITY_ASYNC_READ &&
+ zio->io_priority != ZIO_PRIORITY_SCRUB)
+ zio->io_priority = ZIO_PRIORITY_ASYNC_READ;
+ } else if (zio->io_type == ZIO_TYPE_WRITE) {
+ if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE &&
+ zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE)
+ zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE;
+ } else {
+ ASSERT(zio->io_type == ZIO_TYPE_FREE);
+ zio->io_priority = ZIO_PRIORITY_TRIM;
+ }
+
zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;
- if (zio->io_type == ZIO_TYPE_READ)
- zio->io_vdev_tree = &vq->vq_read_tree;
- else
- zio->io_vdev_tree = &vq->vq_write_tree;
-
mutex_enter(&vq->vq_lock);
-
zio->io_timestamp = gethrtime();
- zio->io_deadline = (zio->io_timestamp >> zfs_vdev_time_shift) +
- zio->io_priority;
-
vdev_queue_io_add(vq, zio);
-
- nio = vdev_queue_io_to_issue(vq, zfs_vdev_min_pending);
-
+ nio = vdev_queue_io_to_issue(vq);
mutex_exit(&vq->vq_lock);
if (nio == NULL)
@@ -421,20 +914,15 @@
vdev_queue_io_done(zio_t *zio)
{
vdev_queue_t *vq = &zio->io_vd->vdev_queue;
+ zio_t *nio;
- if (zio_injection_enabled)
- delay(SEC_TO_TICK(zio_handle_io_delay(zio)));
-
mutex_enter(&vq->vq_lock);
- avl_remove(&vq->vq_pending_tree, zio);
+ vdev_queue_pending_remove(vq, zio);
vq->vq_io_complete_ts = gethrtime();
- for (int i = 0; i < zfs_vdev_ramp_rate; i++) {
- zio_t *nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending);
- if (nio == NULL)
- break;
+ while ((nio = vdev_queue_io_to_issue(vq)) != NULL) {
mutex_exit(&vq->vq_lock);
if (nio->io_done == vdev_queue_agg_io_done) {
zio_nowait(nio);
@@ -447,3 +935,26 @@
mutex_exit(&vq->vq_lock);
}
+
+/*
+ * As these three methods are only used for load calculations, we're not
+ * concerned if we get an incorrect value on 32-bit platforms due to the lack
+ * of vq_lock mutex use here; instead we prefer to keep them lock free for
+ * performance.
+ */
+int
+vdev_queue_length(vdev_t *vd)
+{
+ return (avl_numnodes(&vd->vdev_queue.vq_active_tree));
+}
+
+uint64_t
+vdev_queue_lastoffset(vdev_t *vd)
+{
+ return (vd->vdev_queue.vq_lastoffset);
+}
+
+void
+vdev_queue_register_lastoffset(vdev_t *vd, zio_t *zio)
+{
+ vd->vdev_queue.vq_lastoffset = zio->io_offset + zio->io_size;
+}
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -22,16 +22,24 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
*/
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
+#ifdef illumos
+#include <sys/vdev_disk.h>
+#endif
+#include <sys/vdev_file.h>
+#include <sys/vdev_raidz.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>
+#include <sys/bio.h>
/*
* Virtual device vector for RAID-Z.
@@ -155,6 +163,8 @@
VDEV_RAIDZ_64MUL_2((x), mask); \
}
+#define VDEV_LABEL_OFFSET(x) (x + VDEV_LABEL_START_SIZE)
+
/*
* Force reconstruction to use the general purpose method.
*/
@@ -438,14 +448,14 @@
* the number of children in the target vdev.
*/
static raidz_map_t *
-vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
- uint64_t nparity)
+vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset, boolean_t dofree,
+ uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
{
raidz_map_t *rm;
/* The starting RAIDZ (parent) vdev sector of the block. */
- uint64_t b = zio->io_offset >> unit_shift;
+ uint64_t b = offset >> unit_shift;
/* The zio's size in units of the vdev's minimum sector size. */
- uint64_t s = zio->io_size >> unit_shift;
+ uint64_t s = size >> unit_shift;
/* The first column for this stripe. */
uint64_t f = b % dcols;
/* The starting byte offset on each child vdev. */
@@ -533,13 +543,13 @@
ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
ASSERT3U(rm->rm_nskip, <=, nparity);
- if (zio->io_type != ZIO_TYPE_FREE) {
+ if (!dofree) {
for (c = 0; c < rm->rm_firstdatacol; c++) {
rm->rm_col[c].rc_data =
zio_buf_alloc(rm->rm_col[c].rc_size);
}
- rm->rm_col[c].rc_data = zio->io_data;
+ rm->rm_col[c].rc_data = data;
for (c = c + 1; c < acols; c++) {
rm->rm_col[c].rc_data =
@@ -571,7 +581,7 @@
ASSERT(rm->rm_cols >= 2);
ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
- if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
+ if (rm->rm_firstdatacol == 1 && (offset & (1ULL << 20))) {
devidx = rm->rm_col[0].rc_devidx;
o = rm->rm_col[0].rc_offset;
rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
@@ -583,8 +593,6 @@
rm->rm_skipstart = 1;
}
- zio->io_vsd = rm;
- zio->io_vsd_ops = &vdev_raidz_vsd_ops;
return (rm);
}
@@ -994,12 +1002,9 @@
* ~~ ~~
* __ __
* | 1 1 1 1 1 1 1 1 |
- * | 128 64 32 16 8 4 2 1 |
* | 19 205 116 29 64 16 4 1 |
* | 1 0 0 0 0 0 0 0 |
- * | 0 1 0 0 0 0 0 0 |
- * (V|I)' = | 0 0 1 0 0 0 0 0 |
- * | 0 0 0 1 0 0 0 0 |
+ * (V|I)' = | 0 0 0 1 0 0 0 0 |
* | 0 0 0 0 1 0 0 0 |
* | 0 0 0 0 0 1 0 0 |
* | 0 0 0 0 0 0 1 0 |
@@ -1479,7 +1484,7 @@
static int
vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
- uint64_t *ashift)
+ uint64_t *logical_ashift, uint64_t *physical_ashift)
{
vdev_t *cvd;
uint64_t nparity = vd->vdev_nparity;
@@ -1508,7 +1513,9 @@
*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
- *ashift = MAX(*ashift, cvd->vdev_ashift);
+ *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
+ *physical_ashift = MAX(*physical_ashift,
+ cvd->vdev_physical_ashift);
}
*asize *= vd->vdev_children;
@@ -1531,6 +1538,154 @@
vdev_close(vd->vdev_child[c]);
}
+#ifdef illumos
+/*
+ * Handle a read or write I/O to a RAID-Z dump device.
+ *
+ * The dump device is in a unique situation compared to other ZFS datasets:
+ * writing to this device should be as simple and fast as possible. In
+ * addition, durability matters much less since the dump will be extracted
+ * once the machine reboots. For that reason, this function eschews parity for
+ * performance and simplicity. The dump device uses the checksum setting
+ * ZIO_CHECKSUM_NOPARITY to indicate that parity is not maintained for this
+ * dataset.
+ *
+ * Blocks of size 128 KB have been preallocated for this volume. I/Os less than
+ * 128 KB will not fill an entire block; in addition, they may not be properly
+ * aligned. In that case, this function uses the preallocated 128 KB block and
+ * omits reading or writing any "empty" portions of that block, as opposed to
+ * allocating a fresh appropriately-sized block.
+ *
+ * Looking at an example of a 32 KB I/O to a RAID-Z vdev with 5 child vdevs:
+ *
+ * vdev_raidz_io_start(data, size: 32 KB, offset: 64 KB)
+ *
+ * If this were a standard RAID-Z dataset, a block of at least 40 KB would be
+ * allocated which spans all five child vdevs. 8 KB of data would be written to
+ * each of four vdevs, with the fifth containing the parity bits.
+ *
+ * parity data data data data
+ * | PP | XX | XX | XX | XX |
+ * ^ ^ ^ ^ ^
+ * | | | | |
+ * 8 KB parity ------8 KB data blocks------
+ *
+ * However, when writing to the dump device, the behavior is different:
+ *
+ * vdev_raidz_physio(data, size: 32 KB, offset: 64 KB)
+ *
+ * Unlike the normal RAID-Z case in which the block is allocated based on the
+ * I/O size, reads and writes here always use a 128 KB logical I/O size. If the
+ * I/O size is less than 128 KB, only the actual portions of data are written.
+ * In this example the data is written to the third data vdev since that vdev
+ * contains the offset [64 KB, 96 KB).
+ *
+ * parity data data data data
+ * | | | | XX | |
+ * ^
+ * |
+ * 32 KB data block
+ *
+ * As a result, an individual I/O may not span all child vdevs; moreover, a
+ * small I/O may only operate on a single child vdev.
+ *
+ * Note that since there are no parity bits calculated or written, this format
+ * remains the same no matter how many parity bits are used in a normal RAID-Z
+ * stripe. On a RAID-Z3 configuration with seven child vdevs, the example above
+ * would look like:
+ *
+ * parity parity parity data data data data
+ * | | | | | | XX | |
+ * ^
+ * |
+ * 32 KB data block
+ */
+int
+vdev_raidz_physio(vdev_t *vd, caddr_t data, size_t size,
+ uint64_t offset, uint64_t origoffset, boolean_t doread, boolean_t isdump)
+{
+ vdev_t *tvd = vd->vdev_top;
+ vdev_t *cvd;
+ raidz_map_t *rm;
+ raidz_col_t *rc;
+ int c, err = 0;
+
+ uint64_t start, end, colstart, colend;
+ uint64_t coloffset, colsize, colskip;
+
+ int flags = doread ? BIO_READ : BIO_WRITE;
+
+#ifdef _KERNEL
+
+ /*
+ * Don't write past the end of the block
+ */
+ VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);
+
+ start = offset;
+ end = start + size;
+
+ /*
+ * Allocate a RAID-Z map for this block. Note that this block starts
+ * from the "original" offset, this is, the offset of the extent which
+ * contains the requisite offset of the data being read or written.
+ *
+ * Even if this I/O operation doesn't span the full block size, let's
+ * treat the on-disk format as if the only blocks are the complete 128
+ * KB size.
+ */
+ rm = vdev_raidz_map_alloc(data - (offset - origoffset),
+ SPA_OLD_MAXBLOCKSIZE, origoffset, B_FALSE, tvd->vdev_ashift,
+ vd->vdev_children, vd->vdev_nparity);
+
+ coloffset = origoffset;
+
+ for (c = rm->rm_firstdatacol; c < rm->rm_cols;
+ c++, coloffset += rc->rc_size) {
+ rc = &rm->rm_col[c];
+ cvd = vd->vdev_child[rc->rc_devidx];
+
+ /*
+ * Find the start and end of this column in the RAID-Z map,
+ * keeping in mind that the stated size and offset of the
+ * operation may not fill the entire column for this vdev.
+ *
+ * If any portion of the data spans this column, issue the
+ * appropriate operation to the vdev.
+ */
+ if (coloffset + rc->rc_size <= start)
+ continue;
+ if (coloffset >= end)
+ continue;
+
+ colstart = MAX(coloffset, start);
+ colend = MIN(end, coloffset + rc->rc_size);
+ colsize = colend - colstart;
+ colskip = colstart - coloffset;
+
+ VERIFY3U(colsize, <=, rc->rc_size);
+ VERIFY3U(colskip, <=, rc->rc_size);
+
+ /*
+ * Note that the child vdev will have a vdev label at the start
+ * of its range of offsets, hence the need for
+ * VDEV_LABEL_OFFSET(). See zio_vdev_child_io() for another
+ * example of why this calculation is needed.
+ */
+ if ((err = vdev_disk_physio(cvd,
+ ((char *)rc->rc_data) + colskip, colsize,
+ VDEV_LABEL_OFFSET(rc->rc_offset) + colskip,
+ flags, isdump)) != 0)
+ break;
+ }
+
+ vdev_raidz_map_free(rm);
+#endif /* _KERNEL */
+
+ return (err);
+}
+#endif
+
static uint64_t
vdev_raidz_asize(vdev_t *vd, uint64_t psize)
{
@@ -1573,7 +1728,7 @@
* vdevs have had errors, then create zio read operations to the parity
* columns' VDevs as well.
*/
-static int
+static void
vdev_raidz_io_start(zio_t *zio)
{
vdev_t *vd = zio->io_vd;
@@ -1583,9 +1738,14 @@
raidz_col_t *rc;
int c, i;
- rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
+ rm = vdev_raidz_map_alloc(zio->io_data, zio->io_size, zio->io_offset,
+ zio->io_type == ZIO_TYPE_FREE,
+ tvd->vdev_ashift, vd->vdev_children,
vd->vdev_nparity);
+ zio->io_vsd = rm;
+ zio->io_vsd_ops = &vdev_raidz_vsd_ops;
+
ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
if (zio->io_type == ZIO_TYPE_FREE) {
@@ -1597,7 +1757,9 @@
zio->io_type, zio->io_priority, 0,
vdev_raidz_child_done, rc));
}
- return (ZIO_PIPELINE_CONTINUE);
+
+ zio_execute(zio);
+ return;
}
if (zio->io_type == ZIO_TYPE_WRITE) {
@@ -1629,7 +1791,8 @@
ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
}
- return (ZIO_PIPELINE_CONTINUE);
+ zio_execute(zio);
+ return;
}
ASSERT(zio->io_type == ZIO_TYPE_READ);
@@ -1669,7 +1832,7 @@
}
}
- return (ZIO_PIPELINE_CONTINUE);
+ zio_execute(zio);
}
@@ -1728,6 +1891,13 @@
int c, ret = 0;
raidz_col_t *rc;
+ blkptr_t *bp = zio->io_bp;
+ enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
+ (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
+
+ if (checksum == ZIO_CHECKSUM_NOPARITY)
+ return (ret);
+
for (c = 0; c < rm->rm_firstdatacol; c++) {
rc = &rm->rm_col[c];
if (!rc->rc_tried || rc->rc_error != 0)
@@ -2205,7 +2375,7 @@
zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
rc->rc_offset, rc->rc_data, rc->rc_size,
- ZIO_TYPE_WRITE, zio->io_priority,
+ ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
}
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -56,7 +56,7 @@
static int
vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
- uint64_t *ashift)
+ uint64_t *logical_ashift, uint64_t *physical_ashift)
{
int lasterror = 0;
int numerrors = 0;
@@ -84,7 +84,8 @@
*asize = 0;
*max_asize = 0;
- *ashift = 0;
+ *logical_ashift = 0;
+ *physical_ashift = 0;
return (0);
}
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,7 +21,8 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/
/*
@@ -51,10 +52,10 @@
int fzap_default_block_shift = 14; /* 16k blocksize */
-static void zap_leaf_pageout(dmu_buf_t *db, void *vl);
+extern inline zap_phys_t *zap_f_phys(zap_t *zap);
+
static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks);
-
void
fzap_byteswap(void *vbuf, size_t size)
{
@@ -81,13 +82,12 @@
ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
zap->zap_ismicro = FALSE;
- (void) dmu_buf_update_user(zap->zap_dbuf, zap, zap,
- &zap->zap_f.zap_phys, zap_evict);
+ zap->zap_dbu.dbu_evict_func = zap_evict;
mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0);
- zap->zap_f.zap_block_shift = highbit(zap->zap_dbuf->db_size) - 1;
+ zap->zap_f.zap_block_shift = highbit64(zap->zap_dbuf->db_size) - 1;
- zp = zap->zap_f.zap_phys;
+ zp = zap_f_phys(zap);
/*
* explicitly zero it since it might be coming from an
* initialized microzap
@@ -118,7 +118,6 @@
l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
l->l_dbuf = db;
- l->l_phys = db->db_data;
zap_leaf_init(l, zp->zap_normflags != 0);
@@ -164,8 +163,9 @@
newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2);
tbl->zt_nextblk = newblk;
ASSERT0(tbl->zt_blks_copied);
- dmu_prefetch(zap->zap_objset, zap->zap_object,
- tbl->zt_blk << bs, tbl->zt_numblks << bs);
+ dmu_prefetch(zap->zap_objset, zap->zap_object, 0,
+ tbl->zt_blk << bs, tbl->zt_numblks << bs,
+ ZIO_PRIORITY_SYNC_READ);
}
/*
@@ -271,6 +271,7 @@
uint64_t blk, off;
int err;
dmu_buf_t *db;
+ dnode_t *dn;
int bs = FZAP_BLOCK_SHIFT(zap);
ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
@@ -278,8 +279,15 @@
blk = idx >> (bs-3);
off = idx & ((1<<(bs-3))-1);
- err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ /*
+ * Note: this is equivalent to dmu_buf_hold(), but we use
+ * _dnode_enter / _by_dnode here because it's faster: we don't
+ * have to hold the dnode.
+ */
+ dn = dmu_buf_dnode_enter(zap->zap_dbuf);
+ err = dmu_buf_hold_by_dnode(dn,
(tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH);
+ dmu_buf_dnode_exit(zap->zap_dbuf);
if (err)
return (err);
*valp = ((uint64_t *)db->db_data)[off];
@@ -293,9 +301,11 @@
*/
blk = (idx*2) >> (bs-3);
- err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ dn = dmu_buf_dnode_enter(zap->zap_dbuf);
+ err = dmu_buf_hold_by_dnode(dn,
(tbl->zt_nextblk + blk) << bs, FTAG, &db,
DMU_READ_NO_PREFETCH);
+ dmu_buf_dnode_exit(zap->zap_dbuf);
if (err == 0)
dmu_buf_rele(db, FTAG);
}
@@ -326,10 +336,10 @@
* If we are within 2 bits of running out, stop growing, since
* this is already an aberrant condition.
*/
- if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift >= zap_hashbits(zap) - 2)
+ if (zap_f_phys(zap)->zap_ptrtbl.zt_shift >= zap_hashbits(zap) - 2)
return (SET_ERROR(ENOSPC));
- if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
+ if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) {
/*
* We are outgrowing the "embedded" ptrtbl (the one
* stored in the header block). Give it its own entire
@@ -339,9 +349,9 @@
dmu_buf_t *db_new;
int err;
- ASSERT3U(zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==,
+ ASSERT3U(zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==,
ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
- ASSERT0(zap->zap_f.zap_phys->zap_ptrtbl.zt_blk);
+ ASSERT0(zap_f_phys(zap)->zap_ptrtbl.zt_blk);
newblk = zap_allocate_blocks(zap, 1);
err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
@@ -354,17 +364,17 @@
db_new->db_data, 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
dmu_buf_rele(db_new, FTAG);
- zap->zap_f.zap_phys->zap_ptrtbl.zt_blk = newblk;
- zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks = 1;
- zap->zap_f.zap_phys->zap_ptrtbl.zt_shift++;
+ zap_f_phys(zap)->zap_ptrtbl.zt_blk = newblk;
+ zap_f_phys(zap)->zap_ptrtbl.zt_numblks = 1;
+ zap_f_phys(zap)->zap_ptrtbl.zt_shift++;
- ASSERT3U(1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==,
- zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks <<
+ ASSERT3U(1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==,
+ zap_f_phys(zap)->zap_ptrtbl.zt_numblks <<
(FZAP_BLOCK_SHIFT(zap)-3));
return (0);
} else {
- return (zap_table_grow(zap, &zap->zap_f.zap_phys->zap_ptrtbl,
+ return (zap_table_grow(zap, &zap_f_phys(zap)->zap_ptrtbl,
zap_ptrtbl_transfer, tx));
}
}
@@ -374,8 +384,8 @@
{
dmu_buf_will_dirty(zap->zap_dbuf, tx);
mutex_enter(&zap->zap_f.zap_num_entries_mtx);
- ASSERT(delta > 0 || zap->zap_f.zap_phys->zap_num_entries >= -delta);
- zap->zap_f.zap_phys->zap_num_entries += delta;
+ ASSERT(delta > 0 || zap_f_phys(zap)->zap_num_entries >= -delta);
+ zap_f_phys(zap)->zap_num_entries += delta;
mutex_exit(&zap->zap_f.zap_num_entries_mtx);
}
@@ -384,16 +394,25 @@
{
uint64_t newblk;
ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
- newblk = zap->zap_f.zap_phys->zap_freeblk;
- zap->zap_f.zap_phys->zap_freeblk += nblocks;
+ newblk = zap_f_phys(zap)->zap_freeblk;
+ zap_f_phys(zap)->zap_freeblk += nblocks;
return (newblk);
}
+static void
+zap_leaf_pageout(void *dbu)
+{
+ zap_leaf_t *l = dbu;
+
+ rw_destroy(&l->l_rwlock);
+ kmem_free(l, sizeof (zap_leaf_t));
+}
+
static zap_leaf_t *
zap_create_leaf(zap_t *zap, dmu_tx_t *tx)
{
void *winner;
- zap_leaf_t *l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP);
+ zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
@@ -401,18 +420,18 @@
rw_enter(&l->l_rwlock, RW_WRITER);
l->l_blkid = zap_allocate_blocks(zap, 1);
l->l_dbuf = NULL;
- l->l_phys = NULL;
VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf,
DMU_READ_NO_PREFETCH));
- winner = dmu_buf_set_user(l->l_dbuf, l, &l->l_phys, zap_leaf_pageout);
+ dmu_buf_init_user(&l->l_dbu, zap_leaf_pageout, &l->l_dbuf);
+ winner = dmu_buf_set_user(l->l_dbuf, &l->l_dbu);
ASSERT(winner == NULL);
dmu_buf_will_dirty(l->l_dbuf, tx);
zap_leaf_init(l, zap->zap_normflags != 0);
- zap->zap_f.zap_phys->zap_num_leafs++;
+ zap_f_phys(zap)->zap_num_leafs++;
return (l);
}
@@ -422,7 +441,7 @@
{
ASSERT(!zap->zap_ismicro);
mutex_enter(&zap->zap_f.zap_num_entries_mtx); /* unnecessary */
- *count = zap->zap_f.zap_phys->zap_num_entries;
+ *count = zap_f_phys(zap)->zap_num_entries;
mutex_exit(&zap->zap_f.zap_num_entries_mtx);
return (0);
}
@@ -438,16 +457,6 @@
dmu_buf_rele(l->l_dbuf, NULL);
}
-_NOTE(ARGSUSED(0))
-static void
-zap_leaf_pageout(dmu_buf_t *db, void *vl)
-{
- zap_leaf_t *l = vl;
-
- rw_destroy(&l->l_rwlock);
- kmem_free(l, sizeof (zap_leaf_t));
-}
-
static zap_leaf_t *
zap_open_leaf(uint64_t blkid, dmu_buf_t *db)
{
@@ -455,20 +464,20 @@
ASSERT(blkid != 0);
- l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP);
+ l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
rw_init(&l->l_rwlock, 0, 0, 0);
rw_enter(&l->l_rwlock, RW_WRITER);
l->l_blkid = blkid;
- l->l_bs = highbit(db->db_size)-1;
+ l->l_bs = highbit64(db->db_size) - 1;
l->l_dbuf = db;
- l->l_phys = NULL;
- winner = dmu_buf_set_user(db, l, &l->l_phys, zap_leaf_pageout);
+ dmu_buf_init_user(&l->l_dbu, zap_leaf_pageout, &l->l_dbuf);
+ winner = dmu_buf_set_user(db, &l->l_dbu);
rw_exit(&l->l_rwlock);
if (winner != NULL) {
/* someone else set it first */
- zap_leaf_pageout(NULL, l);
+ zap_leaf_pageout(&l->l_dbu);
l = winner;
}
@@ -477,7 +486,7 @@
* chain. There should be no chained leafs (as we have removed
* support for them).
*/
- ASSERT0(l->l_phys->l_hdr.lh_pad1);
+ ASSERT0(zap_leaf_phys(l)->l_hdr.lh_pad1);
/*
* There should be more hash entries than there can be
@@ -487,11 +496,11 @@
/* The chunks should begin at the end of the hash table */
ASSERT3P(&ZAP_LEAF_CHUNK(l, 0), ==,
- &l->l_phys->l_hash[ZAP_LEAF_HASH_NUMENTRIES(l)]);
+ &zap_leaf_phys(l)->l_hash[ZAP_LEAF_HASH_NUMENTRIES(l)]);
/* The chunks should end at the end of the block */
ASSERT3U((uintptr_t)&ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)) -
- (uintptr_t)l->l_phys, ==, l->l_dbuf->db_size);
+ (uintptr_t)zap_leaf_phys(l), ==, l->l_dbuf->db_size);
return (l);
}
@@ -507,8 +516,10 @@
ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
- err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf);
+ err = dmu_buf_hold_by_dnode(dn,
blkid << bs, NULL, &db, DMU_READ_NO_PREFETCH);
+ dmu_buf_dnode_exit(zap->zap_dbuf);
if (err)
return (err);
@@ -524,7 +535,7 @@
rw_enter(&l->l_rwlock, lt);
/*
- * Must lock before dirtying, otherwise l->l_phys could change,
+ * Must lock before dirtying, otherwise zap_leaf_phys(l) could change,
* causing ASSERT below to fail.
*/
if (lt == RW_WRITER)
@@ -531,9 +542,8 @@
dmu_buf_will_dirty(db, tx);
ASSERT3U(l->l_blkid, ==, blkid);
ASSERT3P(l->l_dbuf, ==, db);
- ASSERT3P(l->l_phys, ==, l->l_dbuf->db_data);
- ASSERT3U(l->l_phys->l_hdr.lh_block_type, ==, ZBT_LEAF);
- ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
+ ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_block_type, ==, ZBT_LEAF);
+ ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
*lp = l;
return (0);
@@ -544,13 +554,13 @@
{
ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
- if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
+ if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) {
ASSERT3U(idx, <,
- (1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift));
+ (1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift));
*valp = ZAP_EMBEDDED_PTRTBL_ENT(zap, idx);
return (0);
} else {
- return (zap_table_load(zap, &zap->zap_f.zap_phys->zap_ptrtbl,
+ return (zap_table_load(zap, &zap_f_phys(zap)->zap_ptrtbl,
idx, valp));
}
}
@@ -561,11 +571,11 @@
ASSERT(tx != NULL);
ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
- if (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk == 0) {
+ if (zap_f_phys(zap)->zap_ptrtbl.zt_blk == 0) {
ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) = blk;
return (0);
} else {
- return (zap_table_store(zap, &zap->zap_f.zap_phys->zap_ptrtbl,
+ return (zap_table_store(zap, &zap_f_phys(zap)->zap_ptrtbl,
idx, blk, tx));
}
}
@@ -577,21 +587,23 @@
int err;
ASSERT(zap->zap_dbuf == NULL ||
- zap->zap_f.zap_phys == zap->zap_dbuf->db_data);
- ASSERT3U(zap->zap_f.zap_phys->zap_magic, ==, ZAP_MAGIC);
- idx = ZAP_HASH_IDX(h, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
+ zap_f_phys(zap) == zap->zap_dbuf->db_data);
+ ASSERT3U(zap_f_phys(zap)->zap_magic, ==, ZAP_MAGIC);
+ idx = ZAP_HASH_IDX(h, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
err = zap_idx_to_blk(zap, idx, &blk);
if (err != 0)
return (err);
err = zap_get_leaf_byblk(zap, blk, tx, lt, lp);
- ASSERT(err || ZAP_HASH_IDX(h, (*lp)->l_phys->l_hdr.lh_prefix_len) ==
- (*lp)->l_phys->l_hdr.lh_prefix);
+ ASSERT(err ||
+ ZAP_HASH_IDX(h, zap_leaf_phys(*lp)->l_hdr.lh_prefix_len) ==
+ zap_leaf_phys(*lp)->l_hdr.lh_prefix);
return (err);
}
static int
-zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx, zap_leaf_t **lp)
+zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l,
+ void *tag, dmu_tx_t *tx, zap_leaf_t **lp)
{
zap_t *zap = zn->zn_zap;
uint64_t hash = zn->zn_hash;
@@ -598,24 +610,24 @@
zap_leaf_t *nl;
int prefix_diff, i, err;
uint64_t sibling;
- int old_prefix_len = l->l_phys->l_hdr.lh_prefix_len;
+ int old_prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len;
- ASSERT3U(old_prefix_len, <=, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
+ ASSERT3U(old_prefix_len, <=, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==,
- l->l_phys->l_hdr.lh_prefix);
+ zap_leaf_phys(l)->l_hdr.lh_prefix);
if (zap_tryupgradedir(zap, tx) == 0 ||
- old_prefix_len == zap->zap_f.zap_phys->zap_ptrtbl.zt_shift) {
+ old_prefix_len == zap_f_phys(zap)->zap_ptrtbl.zt_shift) {
/* We failed to upgrade, or need to grow the pointer table */
objset_t *os = zap->zap_objset;
uint64_t object = zap->zap_object;
zap_put_leaf(l);
- zap_unlockdir(zap);
+ zap_unlockdir(zap, tag);
err = zap_lockdir(os, object, tx, RW_WRITER,
- FALSE, FALSE, &zn->zn_zap);
+ FALSE, FALSE, tag, &zn->zn_zap);
zap = zn->zn_zap;
if (err)
return (err);
@@ -622,7 +634,7 @@
ASSERT(!zap->zap_ismicro);
while (old_prefix_len ==
- zap->zap_f.zap_phys->zap_ptrtbl.zt_shift) {
+ zap_f_phys(zap)->zap_ptrtbl.zt_shift) {
err = zap_grow_ptrtbl(zap, tx);
if (err)
return (err);
@@ -632,7 +644,7 @@
if (err)
return (err);
- if (l->l_phys->l_hdr.lh_prefix_len != old_prefix_len) {
+ if (zap_leaf_phys(l)->l_hdr.lh_prefix_len != old_prefix_len) {
/* it split while our locks were down */
*lp = l;
return (0);
@@ -639,11 +651,11 @@
}
}
ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
- ASSERT3U(old_prefix_len, <, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
+ ASSERT3U(old_prefix_len, <, zap_f_phys(zap)->zap_ptrtbl.zt_shift);
ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==,
- l->l_phys->l_hdr.lh_prefix);
+ zap_leaf_phys(l)->l_hdr.lh_prefix);
- prefix_diff = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift -
+ prefix_diff = zap_f_phys(zap)->zap_ptrtbl.zt_shift -
(old_prefix_len + 1);
sibling = (ZAP_HASH_IDX(hash, old_prefix_len + 1) | 1) << prefix_diff;
@@ -665,7 +677,7 @@
ASSERT0(err); /* we checked for i/o errors above */
}
- if (hash & (1ULL << (64 - l->l_phys->l_hdr.lh_prefix_len))) {
+ if (hash & (1ULL << (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len))) {
/* we want the sibling */
zap_put_leaf(l);
*lp = nl;
@@ -678,16 +690,17 @@
}
static void
-zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx)
+zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l,
+ void *tag, dmu_tx_t *tx)
{
zap_t *zap = zn->zn_zap;
- int shift = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift;
- int leaffull = (l->l_phys->l_hdr.lh_prefix_len == shift &&
- l->l_phys->l_hdr.lh_nfree < ZAP_LEAF_LOW_WATER);
+ int shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
+ int leaffull = (zap_leaf_phys(l)->l_hdr.lh_prefix_len == shift &&
+ zap_leaf_phys(l)->l_hdr.lh_nfree < ZAP_LEAF_LOW_WATER);
zap_put_leaf(l);
- if (leaffull || zap->zap_f.zap_phys->zap_ptrtbl.zt_nextblk) {
+ if (leaffull || zap_f_phys(zap)->zap_ptrtbl.zt_nextblk) {
int err;
/*
@@ -698,9 +711,9 @@
objset_t *os = zap->zap_objset;
uint64_t zapobj = zap->zap_object;
- zap_unlockdir(zap);
+ zap_unlockdir(zap, tag);
err = zap_lockdir(os, zapobj, tx,
- RW_WRITER, FALSE, FALSE, &zn->zn_zap);
+ RW_WRITER, FALSE, FALSE, tag, &zn->zn_zap);
zap = zn->zn_zap;
if (err)
return;
@@ -707,7 +720,7 @@
}
/* could have finished growing while our locks were down */
- if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift == shift)
+ if (zap_f_phys(zap)->zap_ptrtbl.zt_shift == shift)
(void) zap_grow_ptrtbl(zap, tx);
}
}
@@ -790,7 +803,7 @@
int
fzap_add_cd(zap_name_t *zn,
uint64_t integer_size, uint64_t num_integers,
- const void *val, uint32_t cd, dmu_tx_t *tx)
+ const void *val, uint32_t cd, void *tag, dmu_tx_t *tx)
{
zap_leaf_t *l;
int err;
@@ -819,7 +832,7 @@
if (err == 0) {
zap_increment_num_entries(zap, 1, tx);
} else if (err == EAGAIN) {
- err = zap_expand_leaf(zn, l, tx, &l);
+ err = zap_expand_leaf(zn, l, tag, tx, &l);
zap = zn->zn_zap; /* zap_expand_leaf() may change zap */
if (err == 0)
goto retry;
@@ -827,7 +840,7 @@
out:
if (zap != NULL)
- zap_put_leaf_maybe_grow_ptrtbl(zn, l, tx);
+ zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx);
return (err);
}
@@ -834,7 +847,7 @@
int
fzap_add(zap_name_t *zn,
uint64_t integer_size, uint64_t num_integers,
- const void *val, dmu_tx_t *tx)
+ const void *val, void *tag, dmu_tx_t *tx)
{
int err = fzap_check(zn, integer_size, num_integers);
if (err != 0)
@@ -841,12 +854,13 @@
return (err);
return (fzap_add_cd(zn, integer_size, num_integers,
- val, ZAP_NEED_CD, tx));
+ val, ZAP_NEED_CD, tag, tx));
}
int
fzap_update(zap_name_t *zn,
- int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
+ int integer_size, uint64_t num_integers, const void *val,
+ void *tag, dmu_tx_t *tx)
{
zap_leaf_t *l;
int err, create;
@@ -876,7 +890,7 @@
}
if (err == EAGAIN) {
- err = zap_expand_leaf(zn, l, tx, &l);
+ err = zap_expand_leaf(zn, l, tag, tx, &l);
zap = zn->zn_zap; /* zap_expand_leaf() may change zap */
if (err == 0)
goto retry;
@@ -883,7 +897,7 @@
}
if (zap != NULL)
- zap_put_leaf_maybe_grow_ptrtbl(zn, l, tx);
+ zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx);
return (err);
}
@@ -938,11 +952,12 @@
int bs;
idx = ZAP_HASH_IDX(zn->zn_hash,
- zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
+ zap_f_phys(zap)->zap_ptrtbl.zt_shift);
if (zap_idx_to_blk(zap, idx, &blk) != 0)
return;
bs = FZAP_BLOCK_SHIFT(zap);
- dmu_prefetch(zap->zap_objset, zap->zap_object, blk << bs, 1 << bs);
+ dmu_prefetch(zap->zap_objset, zap->zap_object, 0, blk << bs, 1 << bs,
+ ZIO_PRIORITY_SYNC_READ);
}
/*
@@ -1170,8 +1185,8 @@
if (zc->zc_leaf &&
(ZAP_HASH_IDX(zc->zc_hash,
- zc->zc_leaf->l_phys->l_hdr.lh_prefix_len) !=
- zc->zc_leaf->l_phys->l_hdr.lh_prefix)) {
+ zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) !=
+ zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix)) {
rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
zap_put_leaf(zc->zc_leaf);
zc->zc_leaf = NULL;
@@ -1192,10 +1207,11 @@
if (err == ENOENT) {
uint64_t nocare =
- (1ULL << (64 - l->l_phys->l_hdr.lh_prefix_len)) - 1;
+ (1ULL << (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len)) - 1;
zc->zc_hash = (zc->zc_hash & ~nocare) + nocare + 1;
zc->zc_cd = 0;
- if (l->l_phys->l_hdr.lh_prefix_len == 0 || zc->zc_hash == 0) {
+ if (zap_leaf_phys(l)->l_hdr.lh_prefix_len == 0 ||
+ zc->zc_hash == 0) {
zc->zc_hash = -1ULL;
} else {
zap_put_leaf(zc->zc_leaf);
@@ -1287,25 +1303,25 @@
/*
* Set zap_phys_t fields
*/
- zs->zs_num_leafs = zap->zap_f.zap_phys->zap_num_leafs;
- zs->zs_num_entries = zap->zap_f.zap_phys->zap_num_entries;
- zs->zs_num_blocks = zap->zap_f.zap_phys->zap_freeblk;
- zs->zs_block_type = zap->zap_f.zap_phys->zap_block_type;
- zs->zs_magic = zap->zap_f.zap_phys->zap_magic;
- zs->zs_salt = zap->zap_f.zap_phys->zap_salt;
+ zs->zs_num_leafs = zap_f_phys(zap)->zap_num_leafs;
+ zs->zs_num_entries = zap_f_phys(zap)->zap_num_entries;
+ zs->zs_num_blocks = zap_f_phys(zap)->zap_freeblk;
+ zs->zs_block_type = zap_f_phys(zap)->zap_block_type;
+ zs->zs_magic = zap_f_phys(zap)->zap_magic;
+ zs->zs_salt = zap_f_phys(zap)->zap_salt;
/*
* Set zap_ptrtbl fields
*/
- zs->zs_ptrtbl_len = 1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift;
- zs->zs_ptrtbl_nextblk = zap->zap_f.zap_phys->zap_ptrtbl.zt_nextblk;
+ zs->zs_ptrtbl_len = 1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift;
+ zs->zs_ptrtbl_nextblk = zap_f_phys(zap)->zap_ptrtbl.zt_nextblk;
zs->zs_ptrtbl_blks_copied =
- zap->zap_f.zap_phys->zap_ptrtbl.zt_blks_copied;
- zs->zs_ptrtbl_zt_blk = zap->zap_f.zap_phys->zap_ptrtbl.zt_blk;
- zs->zs_ptrtbl_zt_numblks = zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks;
- zs->zs_ptrtbl_zt_shift = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift;
+ zap_f_phys(zap)->zap_ptrtbl.zt_blks_copied;
+ zs->zs_ptrtbl_zt_blk = zap_f_phys(zap)->zap_ptrtbl.zt_blk;
+ zs->zs_ptrtbl_zt_numblks = zap_f_phys(zap)->zap_ptrtbl.zt_numblks;
+ zs->zs_ptrtbl_zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift;
- if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
+ if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) {
/* the ptrtbl is entirely in the header block. */
zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0),
1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs);
@@ -1312,17 +1328,18 @@
} else {
int b;
- dmu_prefetch(zap->zap_objset, zap->zap_object,
- zap->zap_f.zap_phys->zap_ptrtbl.zt_blk << bs,
- zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks << bs);
+ dmu_prefetch(zap->zap_objset, zap->zap_object, 0,
+ zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs,
+ zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs,
+ ZIO_PRIORITY_SYNC_READ);
- for (b = 0; b < zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks;
+ for (b = 0; b < zap_f_phys(zap)->zap_ptrtbl.zt_numblks;
b++) {
dmu_buf_t *db;
int err;
err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
- (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk + b) << bs,
+ (zap_f_phys(zap)->zap_ptrtbl.zt_blk + b) << bs,
FTAG, &db, DMU_READ_NO_PREFETCH);
if (err == 0) {
zap_stats_ptrtbl(zap, db->db_data,
@@ -1334,8 +1351,8 @@
}
int
-fzap_count_write(zap_name_t *zn, int add, uint64_t *towrite,
- uint64_t *tooverwrite)
+fzap_count_write(zap_name_t *zn, int add, refcount_t *towrite,
+ refcount_t *tooverwrite)
{
zap_t *zap = zn->zn_zap;
zap_leaf_t *l;
@@ -1345,9 +1362,11 @@
* Account for the header block of the fatzap.
*/
if (!add && dmu_buf_freeable(zap->zap_dbuf)) {
- *tooverwrite += zap->zap_dbuf->db_size;
+ (void) refcount_add_many(tooverwrite,
+ zap->zap_dbuf->db_size, FTAG);
} else {
- *towrite += zap->zap_dbuf->db_size;
+ (void) refcount_add_many(towrite,
+ zap->zap_dbuf->db_size, FTAG);
}
/*
@@ -1359,10 +1378,13 @@
* could extend the table.
*/
if (add) {
- if (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk == 0)
- *towrite += zap->zap_dbuf->db_size;
- else
- *towrite += (zap->zap_dbuf->db_size * 3);
+ if (zap_f_phys(zap)->zap_ptrtbl.zt_blk == 0) {
+ (void) refcount_add_many(towrite,
+ zap->zap_dbuf->db_size, FTAG);
+ } else {
+ (void) refcount_add_many(towrite,
+ zap->zap_dbuf->db_size * 3, FTAG);
+ }
}
/*
@@ -1375,13 +1397,14 @@
}
if (!add && dmu_buf_freeable(l->l_dbuf)) {
- *tooverwrite += l->l_dbuf->db_size;
+ (void) refcount_add_many(tooverwrite, l->l_dbuf->db_size, FTAG);
} else {
/*
* If this is an add operation, the leaf block could split.
* Hence, we need to account for an additional leaf block.
*/
- *towrite += (add ? 2 : 1) * l->l_dbuf->db_size;
+ (void) refcount_add_many(towrite,
+ (add ? 2 : 1) * l->l_dbuf->db_size, FTAG);
}
zap_put_leaf(l);
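[Editor's note] Several of the zap.c hunks above swap highbit() for highbit64() when deriving zap_block_shift from the dbuf size. The sketch below shows what that expression evaluates to for the default 16 KB fat ZAP block (fzap_default_block_shift = 14); highbit64() here is a local stand-in for the kernel routine, written only to keep the example self-contained.

#include <stdio.h>
#include <stdint.h>

/* 1-based position of the most significant set bit, 0 for x == 0. */
static int
highbit64(uint64_t x)
{
    int h = 0;

    while (x != 0) {
        h++;
        x >>= 1;
    }
    return (h);
}

int
main(void)
{
    uint64_t db_size = 1ULL << 14;    /* 16 KB fat ZAP block */

    /* prints "block shift = 14", matching fzap_default_block_shift */
    printf("block shift = %d\n", highbit64(db_size) - 1);
    return (0);
}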
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_leaf.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,7 +21,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
*/
/*
@@ -49,10 +49,12 @@
#define LEAF_HASH(l, h) \
((ZAP_LEAF_HASH_NUMENTRIES(l)-1) & \
- ((h) >> (64 - ZAP_LEAF_HASH_SHIFT(l)-(l)->l_phys->l_hdr.lh_prefix_len)))
+ ((h) >> \
+ (64 - ZAP_LEAF_HASH_SHIFT(l) - zap_leaf_phys(l)->l_hdr.lh_prefix_len)))
-#define LEAF_HASH_ENTPTR(l, h) (&(l)->l_phys->l_hash[LEAF_HASH(l, h)])
+#define LEAF_HASH_ENTPTR(l, h) (&zap_leaf_phys(l)->l_hash[LEAF_HASH(l, h)])
+extern inline zap_leaf_phys_t *zap_leaf_phys(zap_leaf_t *l);
static void
zap_memset(void *a, int c, size_t n)
@@ -106,17 +108,20 @@
{
int i;
zap_leaf_t l;
- l.l_bs = highbit(size)-1;
- l.l_phys = buf;
+ dmu_buf_t l_dbuf;
- buf->l_hdr.lh_block_type = BSWAP_64(buf->l_hdr.lh_block_type);
- buf->l_hdr.lh_prefix = BSWAP_64(buf->l_hdr.lh_prefix);
- buf->l_hdr.lh_magic = BSWAP_32(buf->l_hdr.lh_magic);
- buf->l_hdr.lh_nfree = BSWAP_16(buf->l_hdr.lh_nfree);
- buf->l_hdr.lh_nentries = BSWAP_16(buf->l_hdr.lh_nentries);
- buf->l_hdr.lh_prefix_len = BSWAP_16(buf->l_hdr.lh_prefix_len);
- buf->l_hdr.lh_freelist = BSWAP_16(buf->l_hdr.lh_freelist);
+ l_dbuf.db_data = buf;
+ l.l_bs = highbit64(size) - 1;
+ l.l_dbuf = &l_dbuf;
+ buf->l_hdr.lh_block_type = BSWAP_64(buf->l_hdr.lh_block_type);
+ buf->l_hdr.lh_prefix = BSWAP_64(buf->l_hdr.lh_prefix);
+ buf->l_hdr.lh_magic = BSWAP_32(buf->l_hdr.lh_magic);
+ buf->l_hdr.lh_nfree = BSWAP_16(buf->l_hdr.lh_nfree);
+ buf->l_hdr.lh_nentries = BSWAP_16(buf->l_hdr.lh_nentries);
+ buf->l_hdr.lh_prefix_len = BSWAP_16(buf->l_hdr.lh_prefix_len);
+ buf->l_hdr.lh_freelist = BSWAP_16(buf->l_hdr.lh_freelist);
+
for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++)
buf->l_hash[i] = BSWAP_16(buf->l_hash[i]);
@@ -158,19 +163,21 @@
{
int i;
- l->l_bs = highbit(l->l_dbuf->db_size)-1;
- zap_memset(&l->l_phys->l_hdr, 0, sizeof (struct zap_leaf_header));
- zap_memset(l->l_phys->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l));
+ l->l_bs = highbit64(l->l_dbuf->db_size) - 1;
+ zap_memset(&zap_leaf_phys(l)->l_hdr, 0,
+ sizeof (struct zap_leaf_header));
+ zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END,
+ 2*ZAP_LEAF_HASH_NUMENTRIES(l));
for (i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
ZAP_LEAF_CHUNK(l, i).l_free.lf_type = ZAP_CHUNK_FREE;
ZAP_LEAF_CHUNK(l, i).l_free.lf_next = i+1;
}
ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)-1).l_free.lf_next = CHAIN_END;
- l->l_phys->l_hdr.lh_block_type = ZBT_LEAF;
- l->l_phys->l_hdr.lh_magic = ZAP_LEAF_MAGIC;
- l->l_phys->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l);
+ zap_leaf_phys(l)->l_hdr.lh_block_type = ZBT_LEAF;
+ zap_leaf_phys(l)->l_hdr.lh_magic = ZAP_LEAF_MAGIC;
+ zap_leaf_phys(l)->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l);
if (sort)
- l->l_phys->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED;
+ zap_leaf_phys(l)->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED;
}
/*
@@ -182,15 +189,16 @@
{
int chunk;
- ASSERT(l->l_phys->l_hdr.lh_nfree > 0);
+ ASSERT(zap_leaf_phys(l)->l_hdr.lh_nfree > 0);
- chunk = l->l_phys->l_hdr.lh_freelist;
+ chunk = zap_leaf_phys(l)->l_hdr.lh_freelist;
ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_free.lf_type, ==, ZAP_CHUNK_FREE);
- l->l_phys->l_hdr.lh_freelist = ZAP_LEAF_CHUNK(l, chunk).l_free.lf_next;
+ zap_leaf_phys(l)->l_hdr.lh_freelist =
+ ZAP_LEAF_CHUNK(l, chunk).l_free.lf_next;
- l->l_phys->l_hdr.lh_nfree--;
+ zap_leaf_phys(l)->l_hdr.lh_nfree--;
return (chunk);
}
@@ -199,16 +207,16 @@
zap_leaf_chunk_free(zap_leaf_t *l, uint16_t chunk)
{
struct zap_leaf_free *zlf = &ZAP_LEAF_CHUNK(l, chunk).l_free;
- ASSERT3U(l->l_phys->l_hdr.lh_nfree, <, ZAP_LEAF_NUMCHUNKS(l));
+ ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_nfree, <, ZAP_LEAF_NUMCHUNKS(l));
ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l));
ASSERT(zlf->lf_type != ZAP_CHUNK_FREE);
zlf->lf_type = ZAP_CHUNK_FREE;
- zlf->lf_next = l->l_phys->l_hdr.lh_freelist;
+ zlf->lf_next = zap_leaf_phys(l)->l_hdr.lh_freelist;
bzero(zlf->lf_pad, sizeof (zlf->lf_pad)); /* help it to compress */
- l->l_phys->l_hdr.lh_freelist = chunk;
+ zap_leaf_phys(l)->l_hdr.lh_freelist = chunk;
- l->l_phys->l_hdr.lh_nfree++;
+ zap_leaf_phys(l)->l_hdr.lh_nfree++;
}
/*
@@ -394,7 +402,7 @@
uint16_t *chunkp;
struct zap_leaf_entry *le;
- ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
+ ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
again:
for (chunkp = LEAF_HASH_ENTPTR(l, zn->zn_hash);
@@ -414,7 +422,7 @@
* lowest-cd match for MT_FIRST.
*/
ASSERT(zn->zn_matchtype == MT_EXACT ||
- (l->l_phys->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED));
+ (zap_leaf_phys(l)->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED));
if (zap_leaf_array_match(l, zn, le->le_name_chunk,
le->le_name_numints)) {
zeh->zeh_num_integers = le->le_value_numints;
@@ -454,10 +462,10 @@
uint16_t lh;
struct zap_leaf_entry *le;
- ASSERT3U(l->l_phys->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
+ ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC);
for (lh = LEAF_HASH(l, h); lh <= bestlh; lh++) {
- for (chunk = l->l_phys->l_hash[lh];
+ for (chunk = zap_leaf_phys(l)->l_hash[lh];
chunk != CHAIN_END; chunk = le->le_next) {
le = ZAP_LEAF_ENTRY(l, chunk);
@@ -528,7 +536,7 @@
int
zap_entry_update(zap_entry_handle_t *zeh,
- uint8_t integer_size, uint64_t num_integers, const void *buf)
+ uint8_t integer_size, uint64_t num_integers, const void *buf)
{
int delta_chunks;
zap_leaf_t *l = zeh->zeh_leaf;
@@ -537,7 +545,7 @@
delta_chunks = ZAP_LEAF_ARRAY_NCHUNKS(num_integers * integer_size) -
ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints * le->le_value_intlen);
- if ((int)l->l_phys->l_hdr.lh_nfree < delta_chunks)
+ if ((int)zap_leaf_phys(l)->l_hdr.lh_nfree < delta_chunks)
return (SET_ERROR(EAGAIN));
zap_leaf_array_free(l, &le->le_value_chunk);
@@ -567,7 +575,7 @@
*zeh->zeh_chunkp = le->le_next;
zap_leaf_chunk_free(l, entry_chunk);
- l->l_phys->l_hdr.lh_nentries--;
+ zap_leaf_phys(l)->l_hdr.lh_nentries--;
}
int
@@ -591,7 +599,7 @@
if (cd == ZAP_NEED_CD) {
/* find the lowest unused cd */
- if (l->l_phys->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED) {
+ if (zap_leaf_phys(l)->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED) {
cd = 0;
for (chunk = *LEAF_HASH_ENTPTR(l, h);
@@ -627,7 +635,7 @@
ASSERT3U(cd, <, zap_maxcd(zn->zn_zap));
}
- if (l->l_phys->l_hdr.lh_nfree < numchunks)
+ if (zap_leaf_phys(l)->l_hdr.lh_nfree < numchunks)
return (SET_ERROR(EAGAIN));
/* make the entry */
@@ -648,7 +656,7 @@
/* XXX if we did the search above, we could just use that */
chunkp = zap_leaf_rehash_entry(l, chunk);
- l->l_phys->l_hdr.lh_nentries++;
+ zap_leaf_phys(l)->l_hdr.lh_nentries++;
zeh->zeh_leaf = l;
zeh->zeh_num_integers = num_integers;
@@ -782,8 +790,8 @@
zap_leaf_chunk_free(l, entry);
- l->l_phys->l_hdr.lh_nentries--;
- nl->l_phys->l_hdr.lh_nentries++;
+ zap_leaf_phys(l)->l_hdr.lh_nentries--;
+ zap_leaf_phys(nl)->l_hdr.lh_nentries++;
}
/*
@@ -793,19 +801,22 @@
zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort)
{
int i;
- int bit = 64 - 1 - l->l_phys->l_hdr.lh_prefix_len;
+ int bit = 64 - 1 - zap_leaf_phys(l)->l_hdr.lh_prefix_len;
/* set new prefix and prefix_len */
- l->l_phys->l_hdr.lh_prefix <<= 1;
- l->l_phys->l_hdr.lh_prefix_len++;
- nl->l_phys->l_hdr.lh_prefix = l->l_phys->l_hdr.lh_prefix | 1;
- nl->l_phys->l_hdr.lh_prefix_len = l->l_phys->l_hdr.lh_prefix_len;
+ zap_leaf_phys(l)->l_hdr.lh_prefix <<= 1;
+ zap_leaf_phys(l)->l_hdr.lh_prefix_len++;
+ zap_leaf_phys(nl)->l_hdr.lh_prefix =
+ zap_leaf_phys(l)->l_hdr.lh_prefix | 1;
+ zap_leaf_phys(nl)->l_hdr.lh_prefix_len =
+ zap_leaf_phys(l)->l_hdr.lh_prefix_len;
/* break existing hash chains */
- zap_memset(l->l_phys->l_hash, CHAIN_END, 2*ZAP_LEAF_HASH_NUMENTRIES(l));
+ zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END,
+ 2*ZAP_LEAF_HASH_NUMENTRIES(l));
if (sort)
- l->l_phys->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED;
+ zap_leaf_phys(l)->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED;
/*
* Transfer entries whose hash bit 'bit' is set to nl; rehash
@@ -833,18 +844,18 @@
{
int i, n;
- n = zap->zap_f.zap_phys->zap_ptrtbl.zt_shift -
- l->l_phys->l_hdr.lh_prefix_len;
+ n = zap_f_phys(zap)->zap_ptrtbl.zt_shift -
+ zap_leaf_phys(l)->l_hdr.lh_prefix_len;
n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
zs->zs_leafs_with_2n_pointers[n]++;
- n = l->l_phys->l_hdr.lh_nentries/5;
+ n = zap_leaf_phys(l)->l_hdr.lh_nentries/5;
n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
zs->zs_blocks_with_n5_entries[n]++;
n = ((1<<FZAP_BLOCK_SHIFT(zap)) -
- l->l_phys->l_hdr.lh_nfree * (ZAP_LEAF_ARRAY_BYTES+1))*10 /
+ zap_leaf_phys(l)->l_hdr.lh_nfree * (ZAP_LEAF_ARRAY_BYTES+1))*10 /
(1<<FZAP_BLOCK_SHIFT(zap));
n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
zs->zs_blocks_n_tenths_full[n]++;
@@ -851,7 +862,7 @@
for (i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) {
int nentries = 0;
- int chunk = l->l_phys->l_hash[i];
+ int chunk = zap_leaf_phys(l)->l_hash[i];
while (chunk != CHAIN_END) {
struct zap_leaf_entry *le =
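[Editor's note] The reworked LEAF_HASH() macro in zap_leaf.c indexes the leaf hash table with the hash bits immediately below the prefix bits already consumed by the pointer table. A small standalone sketch of that bucket calculation follows; the hash-table size, prefix length and hash value are arbitrary example inputs, not taken from a real leaf.

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
    int hash_shift = 9;      /* leaf hash table of 512 entries */
    int prefix_len = 3;      /* bits already consumed by the pointer table */
    uint64_t numentries = 1ULL << hash_shift;
    uint64_t h = 0xdeadbeefcafef00dULL;   /* example 64-bit ZAP hash */

    /*
     * Shift the top (prefix_len + hash_shift) bits down, then mask off the
     * prefix bits so only the hash_shift bits just below them remain as
     * the bucket index within the leaf.
     */
    uint64_t bucket = (numentries - 1) &
        (h >> (64 - hash_shift - prefix_len));

    printf("bucket = %llu\n", (unsigned long long)bucket);
    return (0);
}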
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,7 +21,9 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
*/
#include <sys/zio.h>
@@ -34,19 +36,23 @@
#include <sys/zap_leaf.h>
#include <sys/avl.h>
#include <sys/arc.h>
+#include <sys/dmu_objset.h>
#ifdef _KERNEL
#include <sys/sunddi.h>
#endif
-static int mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags);
+extern inline mzap_phys_t *zap_m_phys(zap_t *zap);
+static int mzap_upgrade(zap_t **zapp,
+ void *tag, dmu_tx_t *tx, zap_flags_t flags);
+
uint64_t
zap_getflags(zap_t *zap)
{
if (zap->zap_ismicro)
return (0);
- return (zap->zap_u.zap_fat.zap_phys->zap_flags);
+ return (zap_f_phys(zap)->zap_flags);
}
int
@@ -381,7 +387,7 @@
if (*(uint64_t *)db->db_data != ZBT_MICRO) {
mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0);
- zap->zap_f.zap_block_shift = highbit(db->db_size) - 1;
+ zap->zap_f.zap_block_shift = highbit64(db->db_size) - 1;
} else {
zap->zap_ismicro = TRUE;
}
@@ -391,7 +397,8 @@
* it, because zap_lockdir() checks zap_ismicro without the lock
* held.
*/
- winner = dmu_buf_set_user(db, zap, &zap->zap_m.zap_phys, zap_evict);
+ dmu_buf_init_user(&zap->zap_dbu, zap_evict, &zap->zap_dbuf);
+ winner = dmu_buf_set_user(db, &zap->zap_dbu);
if (winner != NULL) {
rw_exit(&zap->zap_rwlock);
@@ -403,8 +410,8 @@
}
if (zap->zap_ismicro) {
- zap->zap_salt = zap->zap_m.zap_phys->mz_salt;
- zap->zap_normflags = zap->zap_m.zap_phys->mz_normflags;
+ zap->zap_salt = zap_m_phys(zap)->mz_salt;
+ zap->zap_normflags = zap_m_phys(zap)->mz_normflags;
zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1;
avl_create(&zap->zap_m.zap_avl, mze_compare,
sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node));
@@ -411,7 +418,7 @@
for (i = 0; i < zap->zap_m.zap_num_chunks; i++) {
mzap_ent_phys_t *mze =
- &zap->zap_m.zap_phys->mz_chunk[i];
+ &zap_m_phys(zap)->mz_chunk[i];
if (mze->mze_name[0]) {
zap_name_t *zn;
@@ -428,8 +435,8 @@
}
}
} else {
- zap->zap_salt = zap->zap_f.zap_phys->zap_salt;
- zap->zap_normflags = zap->zap_f.zap_phys->zap_normflags;
+ zap->zap_salt = zap_f_phys(zap)->zap_salt;
+ zap->zap_normflags = zap_f_phys(zap)->zap_normflags;
ASSERT3U(sizeof (struct zap_leaf_header), ==,
2*ZAP_LEAF_CHUNKSIZE);
@@ -439,7 +446,7 @@
* other members.
*/
ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >,
- &zap->zap_f.zap_phys->zap_salt);
+ &zap_f_phys(zap)->zap_salt);
/*
* The embedded pointer table should end at the end of
@@ -447,7 +454,7 @@
*/
ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap,
1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)) -
- (uintptr_t)zap->zap_f.zap_phys, ==,
+ (uintptr_t)zap_f_phys(zap), ==,
zap->zap_dbuf->db_size);
}
rw_exit(&zap->zap_rwlock);
@@ -454,21 +461,19 @@
return (zap);
}
-int
-zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
+static int
+zap_lockdir_impl(dmu_buf_t *db, void *tag, dmu_tx_t *tx,
krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp)
{
zap_t *zap;
- dmu_buf_t *db;
krw_t lt;
- int err;
+ ASSERT0(db->db_offset);
+ objset_t *os = dmu_buf_get_objset(db);
+ uint64_t obj = db->db_object;
+
*zapp = NULL;
- err = dmu_buf_hold(os, obj, 0, NULL, &db, DMU_READ_NO_PREFETCH);
- if (err)
- return (err);
-
#ifdef ZFS_DEBUG
{
dmu_object_info_t doi;
@@ -515,10 +520,12 @@
dprintf("upgrading obj %llu: num_entries=%u\n",
obj, zap->zap_m.zap_num_entries);
*zapp = zap;
- return (mzap_upgrade(zapp, tx, 0));
+ int err = mzap_upgrade(zapp, tag, tx, 0);
+ if (err != 0)
+ rw_exit(&zap->zap_rwlock);
+ return (err);
}
- err = dmu_object_set_blocksize(os, obj, newsz, 0, tx);
- ASSERT0(err);
+ VERIFY0(dmu_object_set_blocksize(os, obj, newsz, 0, tx));
zap->zap_m.zap_num_chunks =
db->db_size / MZAP_ENT_LEN - 1;
}
@@ -527,15 +534,49 @@
return (0);
}
+static int
+zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx,
+ krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp)
+{
+ dmu_buf_t *db;
+ int err;
+
+ err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH);
+ if (err != 0) {
+ return (err);
+ }
+ err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp);
+ if (err != 0) {
+ dmu_buf_rele(db, tag);
+ }
+ return (err);
+}
+
+int
+zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
+ krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp)
+{
+ dmu_buf_t *db;
+ int err;
+
+ err = dmu_buf_hold(os, obj, 0, tag, &db, DMU_READ_NO_PREFETCH);
+ if (err != 0)
+ return (err);
+ err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp);
+ if (err != 0)
+ dmu_buf_rele(db, tag);
+ return (err);
+}
+
void
-zap_unlockdir(zap_t *zap)
+zap_unlockdir(zap_t *zap, void *tag)
{
rw_exit(&zap->zap_rwlock);
- dmu_buf_rele(zap->zap_dbuf, NULL);
+ dmu_buf_rele(zap->zap_dbuf, tag);
}
static int
-mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags)
+mzap_upgrade(zap_t **zapp, void *tag, dmu_tx_t *tx, zap_flags_t flags)
{
mzap_phys_t *mzp;
int i, sz, nchunks;
@@ -545,7 +586,7 @@
ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
sz = zap->zap_dbuf->db_size;
- mzp = kmem_alloc(sz, KM_SLEEP);
+ mzp = zio_buf_alloc(sz);
bcopy(zap->zap_dbuf->db_data, mzp, sz);
nchunks = zap->zap_m.zap_num_chunks;
@@ -553,7 +594,7 @@
err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object,
1ULL << fzap_default_block_shift, 0, tx);
if (err) {
- kmem_free(mzp, sz);
+ zio_buf_free(mzp, sz);
return (err);
}
}
@@ -573,18 +614,19 @@
dprintf("adding %s=%llu\n",
mze->mze_name, mze->mze_value);
zn = zap_name_alloc(zap, mze->mze_name, MT_EXACT);
- err = fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd, tx);
+ err = fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd,
+ tag, tx);
zap = zn->zn_zap; /* fzap_add_cd() may change zap */
zap_name_free(zn);
if (err)
break;
}
- kmem_free(mzp, sz);
+ zio_buf_free(mzp, sz);
*zapp = zap;
return (err);
}
-static void
+void
mzap_create_impl(objset_t *os, uint64_t obj, int normflags, zap_flags_t flags,
dmu_tx_t *tx)
{
@@ -612,9 +654,9 @@
zap_t *zap;
/* Only fat zap supports flags; upgrade immediately. */
VERIFY(0 == zap_lockdir(os, obj, tx, RW_WRITER,
- B_FALSE, B_FALSE, &zap));
- VERIFY3U(0, ==, mzap_upgrade(&zap, tx, flags));
- zap_unlockdir(zap);
+ B_FALSE, B_FALSE, FTAG, &zap));
+ VERIFY3U(0, ==, mzap_upgrade(&zap, FTAG, tx, flags));
+ zap_unlockdir(zap, FTAG);
}
}
@@ -665,9 +707,9 @@
uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx);
ASSERT(leaf_blockshift >= SPA_MINBLOCKSHIFT &&
- leaf_blockshift <= SPA_MAXBLOCKSHIFT &&
+ leaf_blockshift <= SPA_OLD_MAXBLOCKSHIFT &&
indirect_blockshift >= SPA_MINBLOCKSHIFT &&
- indirect_blockshift <= SPA_MAXBLOCKSHIFT);
+ indirect_blockshift <= SPA_OLD_MAXBLOCKSHIFT);
VERIFY(dmu_object_set_blocksize(os, obj,
1ULL << leaf_blockshift, indirect_blockshift, tx) == 0);
@@ -688,11 +730,10 @@
return (dmu_object_free(os, zapobj, tx));
}
-_NOTE(ARGSUSED(0))
void
-zap_evict(dmu_buf_t *db, void *vzap)
+zap_evict(void *dbu)
{
- zap_t *zap = vzap;
+ zap_t *zap = dbu;
rw_destroy(&zap->zap_rwlock);
@@ -710,7 +751,7 @@
zap_t *zap;
int err;
- err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
+ err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
if (err)
return (err);
if (!zap->zap_ismicro) {
@@ -718,7 +759,7 @@
} else {
*count = zap->zap_m.zap_num_entries;
}
- zap_unlockdir(zap);
+ zap_unlockdir(zap, FTAG);
return (err);
}
@@ -775,25 +816,19 @@
num_integers, buf, MT_EXACT, NULL, 0, NULL));
}
-int
-zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name,
+static int
+zap_lookup_impl(zap_t *zap, const char *name,
uint64_t integer_size, uint64_t num_integers, void *buf,
matchtype_t mt, char *realname, int rn_len,
boolean_t *ncp)
{
- zap_t *zap;
- int err;
+ int err = 0;
mzap_ent_t *mze;
zap_name_t *zn;
- err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
- if (err)
- return (err);
zn = zap_name_alloc(zap, name, mt);
- if (zn == NULL) {
- zap_unlockdir(zap);
+ if (zn == NULL)
return (SET_ERROR(ENOTSUP));
- }
if (!zap->zap_ismicro) {
err = fzap_lookup(zn, integer_size, num_integers, buf,
@@ -820,11 +855,55 @@
}
}
zap_name_free(zn);
- zap_unlockdir(zap);
return (err);
}
int
+zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name,
+ uint64_t integer_size, uint64_t num_integers, void *buf,
+ matchtype_t mt, char *realname, int rn_len,
+ boolean_t *ncp)
+{
+ zap_t *zap;
+ int err;
+
+ err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
+ if (err != 0)
+ return (err);
+ err = zap_lookup_impl(zap, name, integer_size,
+ num_integers, buf, mt, realname, rn_len, ncp);
+ zap_unlockdir(zap, FTAG);
+ return (err);
+}
+
+int
+zap_lookup_by_dnode(dnode_t *dn, const char *name,
+ uint64_t integer_size, uint64_t num_integers, void *buf)
+{
+ return (zap_lookup_norm_by_dnode(dn, name, integer_size,
+ num_integers, buf, MT_EXACT, NULL, 0, NULL));
+}
+
+int
+zap_lookup_norm_by_dnode(dnode_t *dn, const char *name,
+ uint64_t integer_size, uint64_t num_integers, void *buf,
+ matchtype_t mt, char *realname, int rn_len,
+ boolean_t *ncp)
+{
+ zap_t *zap;
+ int err;
+
+ err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
+ FTAG, &zap);
+ if (err != 0)
+ return (err);
+ err = zap_lookup_impl(zap, name, integer_size,
+ num_integers, buf, mt, realname, rn_len, ncp);
+ zap_unlockdir(zap, FTAG);
+ return (err);
+}
+
+int
zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
int key_numints)
{
@@ -832,18 +911,18 @@
int err;
zap_name_t *zn;
- err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
+ err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
if (err)
return (err);
zn = zap_name_alloc_uint64(zap, key, key_numints);
if (zn == NULL) {
- zap_unlockdir(zap);
+ zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP));
}
fzap_prefetch(zn);
zap_name_free(zn);
- zap_unlockdir(zap);
+ zap_unlockdir(zap, FTAG);
return (err);
}
@@ -855,12 +934,12 @@
int err;
zap_name_t *zn;
- err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
+ err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
if (err)
return (err);
zn = zap_name_alloc_uint64(zap, key, key_numints);
if (zn == NULL) {
- zap_unlockdir(zap);
+ zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP));
}
@@ -867,7 +946,7 @@
err = fzap_lookup(zn, integer_size, num_integers, buf,
NULL, 0, NULL);
zap_name_free(zn);
- zap_unlockdir(zap);
+ zap_unlockdir(zap, FTAG);
return (err);
}
@@ -874,8 +953,8 @@
int
zap_contains(objset_t *os, uint64_t zapobj, const char *name)
{
- int err = (zap_lookup_norm(os, zapobj, name, 0,
- 0, NULL, MT_EXACT, NULL, 0, NULL));
+ int err = zap_lookup_norm(os, zapobj, name, 0,
+ 0, NULL, MT_EXACT, NULL, 0, NULL);
if (err == EOVERFLOW || err == EINVAL)
err = 0; /* found, but skipped reading the value */
return (err);
@@ -890,12 +969,12 @@
mzap_ent_t *mze;
zap_name_t *zn;
- err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
+ err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
if (err)
return (err);
zn = zap_name_alloc(zap, name, MT_EXACT);
if (zn == NULL) {
- zap_unlockdir(zap);
+ zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP));
}
if (!zap->zap_ismicro) {
@@ -912,7 +991,7 @@
}
}
zap_name_free(zn);
- zap_unlockdir(zap);
+ zap_unlockdir(zap, FTAG);
return (err);
}
@@ -924,17 +1003,17 @@
int err;
zap_name_t *zn;
- err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
+ err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
if (err)
return (err);
zn = zap_name_alloc_uint64(zap, key, key_numints);
if (zn == NULL) {
- zap_unlockdir(zap);
+ zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP));
}
err = fzap_length(zn, integer_size, num_integers);
zap_name_free(zn);
- zap_unlockdir(zap);
+ zap_unlockdir(zap, FTAG);
return (err);
}
@@ -950,7 +1029,7 @@
#ifdef ZFS_DEBUG
for (i = 0; i < zap->zap_m.zap_num_chunks; i++) {
- mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i];
+ mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0);
}
#endif
@@ -961,7 +1040,7 @@
again:
for (i = start; i < zap->zap_m.zap_num_chunks; i++) {
- mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i];
+ mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
if (mze->mze_name[0] == 0) {
mze->mze_value = value;
mze->mze_cd = cd;
@@ -993,22 +1072,24 @@
const uint64_t *intval = val;
zap_name_t *zn;
- err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
+ err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
if (err)
return (err);
zn = zap_name_alloc(zap, key, MT_EXACT);
if (zn == NULL) {
- zap_unlockdir(zap);
+ zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP));
}
if (!zap->zap_ismicro) {
- err = fzap_add(zn, integer_size, num_integers, val, tx);
+ err = fzap_add(zn, integer_size, num_integers, val, FTAG, tx);
zap = zn->zn_zap; /* fzap_add() may change zap */
} else if (integer_size != 8 || num_integers != 1 ||
strlen(key) >= MZAP_NAME_LEN) {
- err = mzap_upgrade(&zn->zn_zap, tx, 0);
- if (err == 0)
- err = fzap_add(zn, integer_size, num_integers, val, tx);
+ err = mzap_upgrade(&zn->zn_zap, FTAG, tx, 0);
+ if (err == 0) {
+ err = fzap_add(zn, integer_size, num_integers, val,
+ FTAG, tx);
+ }
zap = zn->zn_zap; /* fzap_add() may change zap */
} else {
mze = mze_find(zn);
@@ -1021,7 +1102,7 @@
ASSERT(zap == zn->zn_zap);
zap_name_free(zn);
if (zap != NULL) /* may be NULL if fzap_add() failed */
- zap_unlockdir(zap);
+ zap_unlockdir(zap, FTAG);
return (err);
}
@@ -1034,19 +1115,19 @@
int err;
zap_name_t *zn;
- err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
+ err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
if (err)
return (err);
zn = zap_name_alloc_uint64(zap, key, key_numints);
if (zn == NULL) {
- zap_unlockdir(zap);
+ zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP));
}
- err = fzap_add(zn, integer_size, num_integers, val, tx);
+ err = fzap_add(zn, integer_size, num_integers, val, FTAG, tx);
zap = zn->zn_zap; /* fzap_add() may change zap */
zap_name_free(zn);
if (zap != NULL) /* may be NULL if fzap_add() failed */
- zap_unlockdir(zap);
+ zap_unlockdir(zap, FTAG);
return (err);
}
@@ -1070,25 +1151,27 @@
(void) zap_lookup(os, zapobj, name, 8, 1, &oldval);
#endif
- err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
+ err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
if (err)
return (err);
zn = zap_name_alloc(zap, name, MT_EXACT);
if (zn == NULL) {
- zap_unlockdir(zap);
+ zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP));
}
if (!zap->zap_ismicro) {
- err = fzap_update(zn, integer_size, num_integers, val, tx);
+ err = fzap_update(zn, integer_size, num_integers, val,
+ FTAG, tx);
zap = zn->zn_zap; /* fzap_update() may change zap */
} else if (integer_size != 8 || num_integers != 1 ||
strlen(name) >= MZAP_NAME_LEN) {
dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
zapobj, integer_size, num_integers, name);
- err = mzap_upgrade(&zn->zn_zap, tx, 0);
- if (err == 0)
+ err = mzap_upgrade(&zn->zn_zap, FTAG, tx, 0);
+ if (err == 0) {
err = fzap_update(zn, integer_size, num_integers,
- val, tx);
+ val, FTAG, tx);
+ }
zap = zn->zn_zap; /* fzap_update() may change zap */
} else {
mze = mze_find(zn);
@@ -1102,7 +1185,7 @@
ASSERT(zap == zn->zn_zap);
zap_name_free(zn);
if (zap != NULL) /* may be NULL if fzap_upgrade() failed */
- zap_unlockdir(zap);
+ zap_unlockdir(zap, FTAG);
return (err);
}
@@ -1115,19 +1198,19 @@
zap_name_t *zn;
int err;
- err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap);
+ err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
if (err)
return (err);
zn = zap_name_alloc_uint64(zap, key, key_numints);
if (zn == NULL) {
- zap_unlockdir(zap);
+ zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP));
}
- err = fzap_update(zn, integer_size, num_integers, val, tx);
+ err = fzap_update(zn, integer_size, num_integers, val, FTAG, tx);
zap = zn->zn_zap; /* fzap_update() may change zap */
zap_name_free(zn);
if (zap != NULL) /* may be NULL if fzap_upgrade() failed */
- zap_unlockdir(zap);
+ zap_unlockdir(zap, FTAG);
return (err);
}
@@ -1146,12 +1229,12 @@
mzap_ent_t *mze;
zap_name_t *zn;
- err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, &zap);
+ err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
if (err)
return (err);
zn = zap_name_alloc(zap, name, mt);
if (zn == NULL) {
- zap_unlockdir(zap);
+ zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP));
}
if (!zap->zap_ismicro) {
@@ -1162,13 +1245,13 @@
err = SET_ERROR(ENOENT);
} else {
zap->zap_m.zap_num_entries--;
- bzero(&zap->zap_m.zap_phys->mz_chunk[mze->mze_chunkid],
+ bzero(&zap_m_phys(zap)->mz_chunk[mze->mze_chunkid],
sizeof (mzap_ent_phys_t));
mze_remove(zap, mze);
}
}
zap_name_free(zn);
- zap_unlockdir(zap);
+ zap_unlockdir(zap, FTAG);
return (err);
}
@@ -1180,17 +1263,17 @@
int err;
zap_name_t *zn;
- err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, &zap);
+ err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
if (err)
return (err);
zn = zap_name_alloc_uint64(zap, key, key_numints);
if (zn == NULL) {
- zap_unlockdir(zap);
+ zap_unlockdir(zap, FTAG);
return (SET_ERROR(ENOTSUP));
}
err = fzap_remove(zn, tx);
zap_name_free(zn);
- zap_unlockdir(zap);
+ zap_unlockdir(zap, FTAG);
return (err);
}
@@ -1222,7 +1305,7 @@
{
if (zc->zc_zap) {
rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
- zap_unlockdir(zc->zc_zap);
+ zap_unlockdir(zc->zc_zap, NULL);
zc->zc_zap = NULL;
}
if (zc->zc_leaf) {
@@ -1269,7 +1352,7 @@
if (zc->zc_zap == NULL) {
int hb;
err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
- RW_READER, TRUE, FALSE, &zc->zc_zap);
+ RW_READER, TRUE, FALSE, NULL, &zc->zc_zap);
if (err)
return (err);
@@ -1336,7 +1419,7 @@
if (zc->zc_zap == NULL) {
err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
- RW_READER, TRUE, FALSE, &zc->zc_zap);
+ RW_READER, TRUE, FALSE, FTAG, &zc->zc_zap);
if (err)
return (err);
} else {
@@ -1373,7 +1456,7 @@
int err;
zap_t *zap;
- err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
+ err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
if (err)
return (err);
@@ -1386,31 +1469,31 @@
} else {
fzap_get_stats(zap, zs);
}
- zap_unlockdir(zap);
+ zap_unlockdir(zap, FTAG);
return (0);
}
int
-zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add,
- uint64_t *towrite, uint64_t *tooverwrite)
+zap_count_write_by_dnode(dnode_t *dn, const char *name, int add,
+ refcount_t *towrite, refcount_t *tooverwrite)
{
zap_t *zap;
int err = 0;
-
/*
* Since, we don't have a name, we cannot figure out which blocks will
* be affected in this operation. So, account for the worst case :
* - 3 blocks overwritten: target leaf, ptrtbl block, header block
* - 4 new blocks written if adding:
- * - 2 blocks for possibly split leaves,
- * - 2 grown ptrtbl blocks
+ * - 2 blocks for possibly split leaves,
+ * - 2 grown ptrtbl blocks
*
- * This also accomodates the case where an add operation to a fairly
+ * This also accommodates the case where an add operation to a fairly
* large microzap results in a promotion to fatzap.
*/
if (name == NULL) {
- *towrite += (3 + (add ? 4 : 0)) * SPA_MAXBLOCKSIZE;
+ (void) refcount_add_many(towrite,
+ (3 + (add ? 4 : 0)) * SPA_OLD_MAXBLOCKSIZE, FTAG);
return (err);
}
@@ -1418,10 +1501,11 @@
* We lock the zap with adding == FALSE. Because, if we pass
* the actual value of add, it could trigger a mzap_upgrade().
* At present we are just evaluating the possibility of this operation
- * and hence we donot want to trigger an upgrade.
+ * and hence we do not want to trigger an upgrade.
*/
- err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap);
- if (err)
+ err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE,
+ FTAG, &zap);
+ if (err != 0)
return (err);
if (!zap->zap_ismicro) {
@@ -1434,7 +1518,8 @@
/*
* We treat this case as similar to (name == NULL)
*/
- *towrite += (3 + (add ? 4 : 0)) * SPA_MAXBLOCKSIZE;
+ (void) refcount_add_many(towrite,
+ (3 + (add ? 4 : 0)) * SPA_OLD_MAXBLOCKSIZE, FTAG);
}
} else {
/*
@@ -1452,16 +1537,20 @@
* 4 new blocks written : 2 new split leaf, 2 grown
* ptrtbl blocks
*/
- if (dmu_buf_freeable(zap->zap_dbuf))
- *tooverwrite += SPA_MAXBLOCKSIZE;
- else
- *towrite += SPA_MAXBLOCKSIZE;
+ if (dmu_buf_freeable(zap->zap_dbuf)) {
+ (void) refcount_add_many(tooverwrite,
+ MZAP_MAX_BLKSZ, FTAG);
+ } else {
+ (void) refcount_add_many(towrite,
+ MZAP_MAX_BLKSZ, FTAG);
+ }
if (add) {
- *towrite += 4 * SPA_MAXBLOCKSIZE;
+ (void) refcount_add_many(towrite,
+ 4 * MZAP_MAX_BLKSZ, FTAG);
}
}
- zap_unlockdir(zap);
+ zap_unlockdir(zap, FTAG);
return (err);
}
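
A minimal sketch of the tagged zap_lockdir()/zap_unlockdir() calling convention introduced in the zap_micro.c hunks above, using only the signatures visible in this diff. Illustrative only, not part of the commit; example_zap_hold() is a hypothetical name.

static int
example_zap_hold(objset_t *os, uint64_t zapobj)
{
	zap_t *zap;
	int err;

	/* The tag (FTAG here) names the owner of the dbuf hold taken inside. */
	err = zap_lockdir(os, zapobj, NULL, RW_READER,
	    B_TRUE, B_FALSE, FTAG, &zap);
	if (err != 0)
		return (err);

	/* ... inspect or modify the zap while the tagged hold is live ... */

	/* Release with the same tag that took the hold. */
	zap_unlockdir(zap, FTAG);
	return (0);
}

Callers that already hold a dnode can instead go through zap_lockdir_by_dnode() or zap_lookup_by_dnode() from the same hunk and skip the objset/object lookup.
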
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfeature.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,7 +21,7 @@
*/
/*
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -162,23 +162,25 @@
*/
typedef enum {
- FEATURE_ACTION_ENABLE,
FEATURE_ACTION_INCR,
FEATURE_ACTION_DECR,
} feature_action_t;
/*
- * Checks that the features active in the specified object are supported by
+ * Checks that the active features in the pool are supported by
* this software. Adds each unsupported feature (name -> description) to
* the supplied nvlist.
*/
boolean_t
-feature_is_supported(objset_t *os, uint64_t obj, uint64_t desc_obj,
+spa_features_check(spa_t *spa, boolean_t for_write,
nvlist_t *unsup_feat, nvlist_t *enabled_feat)
{
+ objset_t *os = spa->spa_meta_objset;
boolean_t supported;
zap_cursor_t zc;
zap_attribute_t za;
+ uint64_t obj = for_write ?
+ spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj;
supported = B_TRUE;
for (zap_cursor_init(&zc, os, obj);
@@ -200,8 +202,8 @@
char *desc = "";
char buf[MAXPATHLEN];
- if (zap_lookup(os, desc_obj, za.za_name,
- 1, sizeof (buf), buf) == 0)
+ if (zap_lookup(os, spa->spa_feat_desc_obj,
+ za.za_name, 1, sizeof (buf), buf) == 0)
desc = buf;
VERIFY(nvlist_add_string(unsup_feat, za.za_name,
@@ -214,13 +216,38 @@
return (supported);
}
-static int
-feature_get_refcount(objset_t *os, uint64_t read_obj, uint64_t write_obj,
- zfeature_info_t *feature, uint64_t *res)
+/*
+ * Use an in-memory cache of feature refcounts for quick retrieval.
+ *
+ * Note: well-designed features will not need to use this; they should
+ * use spa_feature_is_enabled() and spa_feature_is_active() instead.
+ * However, this is non-static for zdb and zhack.
+ */
+int
+feature_get_refcount(spa_t *spa, zfeature_info_t *feature, uint64_t *res)
{
+ ASSERT(VALID_FEATURE_FID(feature->fi_feature));
+ if (spa->spa_feat_refcount_cache[feature->fi_feature] ==
+ SPA_FEATURE_DISABLED) {
+ return (SET_ERROR(ENOTSUP));
+ }
+ *res = spa->spa_feat_refcount_cache[feature->fi_feature];
+ return (0);
+}
+
+/*
+ * Note: well-designed features will not need to use this; they should
+ * use spa_feature_is_enabled() and spa_feature_is_active() instead.
+ * However, this is non-static for zdb and zhack.
+ */
+int
+feature_get_refcount_from_disk(spa_t *spa, zfeature_info_t *feature,
+ uint64_t *res)
+{
int err;
uint64_t refcount;
- uint64_t zapobj = feature->fi_can_readonly ? write_obj : read_obj;
+ uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ?
+ spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj;
/*
* If the pool is currently being created, the feature objects may not
@@ -229,8 +256,8 @@
if (zapobj == 0)
return (SET_ERROR(ENOTSUP));
- err = zap_lookup(os, zapobj, feature->fi_guid, sizeof (uint64_t), 1,
- &refcount);
+ err = zap_lookup(spa->spa_meta_objset, zapobj,
+ feature->fi_guid, sizeof (uint64_t), 1, &refcount);
if (err != 0) {
if (err == ENOENT)
return (SET_ERROR(ENOTSUP));
@@ -241,49 +268,139 @@
return (0);
}
+
static int
-feature_do_action(objset_t *os, uint64_t read_obj, uint64_t write_obj,
- uint64_t desc_obj, zfeature_info_t *feature, feature_action_t action,
+feature_get_enabled_txg(spa_t *spa, zfeature_info_t *feature, uint64_t *res)
+{
+ uint64_t enabled_txg_obj = spa->spa_feat_enabled_txg_obj;
+
+ ASSERT(zfeature_depends_on(feature->fi_feature,
+ SPA_FEATURE_ENABLED_TXG));
+
+ if (!spa_feature_is_enabled(spa, feature->fi_feature)) {
+ return (SET_ERROR(ENOTSUP));
+ }
+
+ ASSERT(enabled_txg_obj != 0);
+
+ VERIFY0(zap_lookup(spa->spa_meta_objset, spa->spa_feat_enabled_txg_obj,
+ feature->fi_guid, sizeof (uint64_t), 1, res));
+
+ return (0);
+}
+
+/*
+ * This function is non-static for zhack; it should otherwise not be used
+ * outside this file.
+ */
+void
+feature_sync(spa_t *spa, zfeature_info_t *feature, uint64_t refcount,
dmu_tx_t *tx)
{
- int error;
- uint64_t refcount;
- uint64_t zapobj = feature->fi_can_readonly ? write_obj : read_obj;
+ ASSERT(VALID_FEATURE_OR_NONE(feature->fi_feature));
+ uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ?
+ spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj;
+ VERIFY0(zap_update(spa->spa_meta_objset, zapobj, feature->fi_guid,
+ sizeof (uint64_t), 1, &refcount, tx));
+
+ /*
+ * feature_sync is called directly from zhack, allowing the
+ * creation of arbitrary features whose fi_feature field may
+ * be greater than SPA_FEATURES. When called from zhack, the
+ * zfeature_info_t object's fi_feature field will be set to
+ * SPA_FEATURE_NONE.
+ */
+ if (feature->fi_feature != SPA_FEATURE_NONE) {
+ uint64_t *refcount_cache =
+ &spa->spa_feat_refcount_cache[feature->fi_feature];
+#ifdef atomic_swap_64
+ VERIFY3U(*refcount_cache, ==,
+ atomic_swap_64(refcount_cache, refcount));
+#else
+ *refcount_cache = refcount;
+#endif
+ }
+
+ if (refcount == 0)
+ spa_deactivate_mos_feature(spa, feature->fi_guid);
+ else if (feature->fi_flags & ZFEATURE_FLAG_MOS)
+ spa_activate_mos_feature(spa, feature->fi_guid, tx);
+}
+
+/*
+ * This function is non-static for zhack; it should otherwise not be used
+ * outside this file.
+ */
+void
+feature_enable_sync(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx)
+{
+ uint64_t initial_refcount =
+ (feature->fi_flags & ZFEATURE_FLAG_ACTIVATE_ON_ENABLE) ? 1 : 0;
+ uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ?
+ spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj;
+
ASSERT(0 != zapobj);
ASSERT(zfeature_is_valid_guid(feature->fi_guid));
+ ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES);
- error = zap_lookup(os, zapobj, feature->fi_guid,
- sizeof (uint64_t), 1, &refcount);
-
/*
- * If we can't ascertain the status of the specified feature, an I/O
- * error occurred.
+ * If the feature is already enabled, ignore the request.
*/
- if (error != 0 && error != ENOENT)
- return (error);
+ if (zap_contains(spa->spa_meta_objset, zapobj, feature->fi_guid) == 0)
+ return;
+ for (int i = 0; feature->fi_depends[i] != SPA_FEATURE_NONE; i++)
+ spa_feature_enable(spa, feature->fi_depends[i], tx);
+
+ VERIFY0(zap_update(spa->spa_meta_objset, spa->spa_feat_desc_obj,
+ feature->fi_guid, 1, strlen(feature->fi_desc) + 1,
+ feature->fi_desc, tx));
+
+ feature_sync(spa, feature, initial_refcount, tx);
+
+ if (spa_feature_is_enabled(spa, SPA_FEATURE_ENABLED_TXG)) {
+ uint64_t enabling_txg = dmu_tx_get_txg(tx);
+
+ if (spa->spa_feat_enabled_txg_obj == 0ULL) {
+ spa->spa_feat_enabled_txg_obj =
+ zap_create_link(spa->spa_meta_objset,
+ DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_FEATURE_ENABLED_TXG, tx);
+ }
+ spa_feature_incr(spa, SPA_FEATURE_ENABLED_TXG, tx);
+
+ VERIFY0(zap_add(spa->spa_meta_objset,
+ spa->spa_feat_enabled_txg_obj, feature->fi_guid,
+ sizeof (uint64_t), 1, &enabling_txg, tx));
+ }
+}
+
+static void
+feature_do_action(spa_t *spa, spa_feature_t fid, feature_action_t action,
+ dmu_tx_t *tx)
+{
+ uint64_t refcount;
+ zfeature_info_t *feature = &spa_feature_table[fid];
+ uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ?
+ spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj;
+
+ ASSERT(VALID_FEATURE_FID(fid));
+ ASSERT(0 != zapobj);
+ ASSERT(zfeature_is_valid_guid(feature->fi_guid));
+
+ ASSERT(dmu_tx_is_syncing(tx));
+ ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES);
+
+ VERIFY3U(feature_get_refcount(spa, feature, &refcount), !=, ENOTSUP);
+
switch (action) {
- case FEATURE_ACTION_ENABLE:
- /*
- * If the feature is already enabled, ignore the request.
- */
- if (error == 0)
- return (0);
- refcount = 0;
- break;
case FEATURE_ACTION_INCR:
- if (error == ENOENT)
- return (SET_ERROR(ENOTSUP));
- if (refcount == UINT64_MAX)
- return (SET_ERROR(EOVERFLOW));
+ VERIFY3U(refcount, !=, UINT64_MAX);
refcount++;
break;
case FEATURE_ACTION_DECR:
- if (error == ENOENT)
- return (SET_ERROR(ENOTSUP));
- if (refcount == 0)
- return (SET_ERROR(EOVERFLOW));
+ VERIFY3U(refcount, !=, 0);
refcount--;
break;
default:
@@ -291,42 +408,7 @@
break;
}
- if (action == FEATURE_ACTION_ENABLE) {
- int i;
-
- for (i = 0; feature->fi_depends[i] != NULL; i++) {
- zfeature_info_t *dep = feature->fi_depends[i];
-
- error = feature_do_action(os, read_obj, write_obj,
- desc_obj, dep, FEATURE_ACTION_ENABLE, tx);
- if (error != 0)
- return (error);
- }
- }
-
- error = zap_update(os, zapobj, feature->fi_guid,
- sizeof (uint64_t), 1, &refcount, tx);
- if (error != 0)
- return (error);
-
- if (action == FEATURE_ACTION_ENABLE) {
- error = zap_update(os, desc_obj,
- feature->fi_guid, 1, strlen(feature->fi_desc) + 1,
- feature->fi_desc, tx);
- if (error != 0)
- return (error);
- }
-
- if (action == FEATURE_ACTION_INCR && refcount == 1 && feature->fi_mos) {
- spa_activate_mos_feature(dmu_objset_spa(os), feature->fi_guid);
- }
-
- if (action == FEATURE_ACTION_DECR && refcount == 0) {
- spa_deactivate_mos_feature(dmu_objset_spa(os),
- feature->fi_guid);
- }
-
- return (0);
+ feature_sync(spa, feature, refcount, tx);
}
void
@@ -354,72 +436,75 @@
* Enable any required dependencies, then enable the requested feature.
*/
void
-spa_feature_enable(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx)
+spa_feature_enable(spa_t *spa, spa_feature_t fid, dmu_tx_t *tx)
{
ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES);
- VERIFY3U(0, ==, feature_do_action(spa->spa_meta_objset,
- spa->spa_feat_for_read_obj, spa->spa_feat_for_write_obj,
- spa->spa_feat_desc_obj, feature, FEATURE_ACTION_ENABLE, tx));
+ ASSERT(VALID_FEATURE_FID(fid));
+ feature_enable_sync(spa, &spa_feature_table[fid], tx);
}
-/*
- * If the specified feature has not yet been enabled, this function returns
- * ENOTSUP; otherwise, this function increments the feature's refcount (or
- * returns EOVERFLOW if the refcount cannot be incremented). This function must
- * be called from syncing context.
- */
void
-spa_feature_incr(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx)
+spa_feature_incr(spa_t *spa, spa_feature_t fid, dmu_tx_t *tx)
{
- ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES);
- VERIFY3U(0, ==, feature_do_action(spa->spa_meta_objset,
- spa->spa_feat_for_read_obj, spa->spa_feat_for_write_obj,
- spa->spa_feat_desc_obj, feature, FEATURE_ACTION_INCR, tx));
+ feature_do_action(spa, fid, FEATURE_ACTION_INCR, tx);
}
-/*
- * If the specified feature has not yet been enabled, this function returns
- * ENOTSUP; otherwise, this function decrements the feature's refcount (or
- * returns EOVERFLOW if the refcount is already 0). This function must
- * be called from syncing context.
- */
void
-spa_feature_decr(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx)
+spa_feature_decr(spa_t *spa, spa_feature_t fid, dmu_tx_t *tx)
{
- ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES);
- VERIFY3U(0, ==, feature_do_action(spa->spa_meta_objset,
- spa->spa_feat_for_read_obj, spa->spa_feat_for_write_obj,
- spa->spa_feat_desc_obj, feature, FEATURE_ACTION_DECR, tx));
+ feature_do_action(spa, fid, FEATURE_ACTION_DECR, tx);
}
boolean_t
-spa_feature_is_enabled(spa_t *spa, zfeature_info_t *feature)
+spa_feature_is_enabled(spa_t *spa, spa_feature_t fid)
{
int err;
uint64_t refcount;
+ ASSERT(VALID_FEATURE_FID(fid));
if (spa_version(spa) < SPA_VERSION_FEATURES)
return (B_FALSE);
- err = feature_get_refcount(spa->spa_meta_objset,
- spa->spa_feat_for_read_obj, spa->spa_feat_for_write_obj,
- feature, &refcount);
+ err = feature_get_refcount(spa, &spa_feature_table[fid], &refcount);
ASSERT(err == 0 || err == ENOTSUP);
return (err == 0);
}
boolean_t
-spa_feature_is_active(spa_t *spa, zfeature_info_t *feature)
+spa_feature_is_active(spa_t *spa, spa_feature_t fid)
{
int err;
uint64_t refcount;
+ ASSERT(VALID_FEATURE_FID(fid));
if (spa_version(spa) < SPA_VERSION_FEATURES)
return (B_FALSE);
- err = feature_get_refcount(spa->spa_meta_objset,
- spa->spa_feat_for_read_obj, spa->spa_feat_for_write_obj,
- feature, &refcount);
+ err = feature_get_refcount(spa, &spa_feature_table[fid], &refcount);
ASSERT(err == 0 || err == ENOTSUP);
return (err == 0 && refcount > 0);
}
+
+/*
+ * For the feature specified by fid (which must depend on
+ * SPA_FEATURE_ENABLED_TXG), return the TXG at which it was enabled in the
+ * OUT txg argument.
+ *
+ * Returns B_TRUE if the feature is enabled, in which case txg will be filled
+ * with the transaction group in which the specified feature was enabled.
+ * Returns B_FALSE otherwise (i.e. if the feature is not enabled).
+ */
+boolean_t
+spa_feature_enabled_txg(spa_t *spa, spa_feature_t fid, uint64_t *txg)
+{
+ int err;
+
+ ASSERT(VALID_FEATURE_FID(fid));
+ if (spa_version(spa) < SPA_VERSION_FEATURES)
+ return (B_FALSE);
+
+ err = feature_get_enabled_txg(spa, &spa_feature_table[fid], txg);
+ ASSERT(err == 0 || err == ENOTSUP);
+
+ return (err == 0);
+}
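
A hedged sketch of the spa_feature_t-indexed API that the zfeature.c hunks above settle on: enabled/active checks are answered from the in-memory spa_feat_refcount_cache[] rather than a ZAP lookup, and the incr/decr paths run in syncing context. Illustrative only, not part of the commit; example_feature_consumer() is a hypothetical name.

static void
example_feature_consumer(spa_t *spa, spa_feature_t fid, dmu_tx_t *tx)
{
	ASSERT(dmu_tx_is_syncing(tx));

	/* Answered from spa_feat_refcount_cache[], no on-disk lookup. */
	if (!spa_feature_is_enabled(spa, fid))
		return;

	/* First use activates the feature by bumping its refcount. */
	if (!spa_feature_is_active(spa, fid))
		spa_feature_incr(spa, fid, tx);
}
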
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_acl.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1059,8 +1059,7 @@
* create a new acl and leave any cached acl in place.
*/
static int
-zfs_acl_node_read(znode_t *zp, boolean_t have_lock, zfs_acl_t **aclpp,
- boolean_t will_modify)
+zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify)
{
zfs_acl_t *aclp;
int aclsize;
@@ -1069,9 +1068,9 @@
zfs_acl_phys_t znode_acl;
int version;
int error;
- boolean_t drop_lock = B_FALSE;
ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+ ASSERT_VOP_LOCKED(ZTOV(zp), __func__);
if (zp->z_acl_cached && !will_modify) {
*aclpp = zp->z_acl_cached;
@@ -1078,17 +1077,6 @@
return (0);
}
- /*
- * close race where znode could be upgrade while trying to
- * read the znode attributes.
- *
- * But this could only happen if the file isn't already an SA
- * znode
- */
- if (!zp->z_is_sa && !have_lock) {
- mutex_enter(&zp->z_lock);
- drop_lock = B_TRUE;
- }
version = zfs_znode_acl_version(zp);
if ((error = zfs_acl_znode_info(zp, &aclsize,
@@ -1134,8 +1122,6 @@
if (!will_modify)
zp->z_acl_cached = aclp;
done:
- if (drop_lock)
- mutex_exit(&zp->z_lock);
return (error);
}
@@ -1162,10 +1148,10 @@
int error;
zfs_acl_t *aclp;
- ASSERT(MUTEX_HELD(&zp->z_lock));
+ ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
ASSERT(MUTEX_HELD(&zp->z_acl_lock));
- if ((error = zfs_acl_node_read(zp, B_TRUE, &aclp, B_FALSE)) == 0)
+ if ((error = zfs_acl_node_read(zp, &aclp, B_FALSE)) == 0)
zp->z_mode = zfs_mode_compute(zp->z_mode, aclp,
&zp->z_pflags, zp->z_uid, zp->z_gid);
return (error);
@@ -1189,6 +1175,7 @@
sa_bulk_attr_t bulk[5];
uint64_t ctime[2];
int count = 0;
+ zfs_acl_phys_t acl_phys;
mode = zp->z_mode;
@@ -1235,7 +1222,6 @@
} else { /* Painful legacy way */
zfs_acl_node_t *aclnode;
uint64_t off = 0;
- zfs_acl_phys_t acl_phys;
uint64_t aoid;
if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs),
@@ -1446,11 +1432,11 @@
int error = 0;
mutex_enter(&zp->z_acl_lock);
- mutex_enter(&zp->z_lock);
+ ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_DISCARD)
*aclp = zfs_acl_alloc(zfs_acl_version_zp(zp));
else
- error = zfs_acl_node_read(zp, B_TRUE, aclp, B_TRUE);
+ error = zfs_acl_node_read(zp, aclp, B_TRUE);
if (error == 0) {
(*aclp)->z_hints = zp->z_pflags & V4_ACL_WIDE_FLAGS;
@@ -1457,7 +1443,6 @@
zfs_acl_chmod(ZTOV(zp)->v_type, mode,
(zp->z_zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK), *aclp);
}
- mutex_exit(&zp->z_lock);
mutex_exit(&zp->z_acl_lock);
return (error);
@@ -1628,6 +1613,10 @@
boolean_t need_chmod = B_TRUE;
boolean_t inherited = B_FALSE;
+ if ((flag & IS_ROOT_NODE) == 0)
+ ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__);
+ else
+ ASSERT(dzp->z_vnode == NULL);
bzero(acl_ids, sizeof (zfs_acl_ids_t));
acl_ids->z_mode = MAKEIMODE(vap->va_type, vap->va_mode);
@@ -1684,7 +1673,7 @@
} else {
acl_ids->z_fgid = zfs_fuid_create_cred(zfsvfs,
ZFS_GROUP, cr, &acl_ids->z_fuidp);
-#ifdef __FreeBSD__
+#ifdef __FreeBSD_kernel__
gid = acl_ids->z_fgid = dzp->z_gid;
#else
gid = crgetgid(cr);
@@ -1711,12 +1700,10 @@
if (acl_ids->z_aclp == NULL) {
mutex_enter(&dzp->z_acl_lock);
- mutex_enter(&dzp->z_lock);
if (!(flag & IS_ROOT_NODE) &&
(dzp->z_pflags & ZFS_INHERIT_ACE) &&
!(dzp->z_pflags & ZFS_XATTR)) {
- VERIFY(0 == zfs_acl_node_read(dzp, B_TRUE,
- &paclp, B_FALSE));
+ VERIFY(0 == zfs_acl_node_read(dzp, &paclp, B_FALSE));
acl_ids->z_aclp = zfs_acl_inherit(zfsvfs,
vap->va_type, paclp, acl_ids->z_mode, &need_chmod);
inherited = B_TRUE;
@@ -1725,7 +1712,6 @@
zfs_acl_alloc(zfs_acl_version_zp(dzp));
acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL;
}
- mutex_exit(&dzp->z_lock);
mutex_exit(&dzp->z_acl_lock);
if (need_chmod) {
acl_ids->z_aclp->z_hints |= (vap->va_type == VDIR) ?
@@ -1791,7 +1777,8 @@
mutex_enter(&zp->z_acl_lock);
- error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE);
+ ASSERT_VOP_LOCKED(ZTOV(zp), __func__);
+ error = zfs_acl_node_read(zp, &aclp, B_FALSE);
if (error != 0) {
mutex_exit(&zp->z_acl_lock);
return (error);
@@ -1939,6 +1926,7 @@
boolean_t fuid_dirtied;
uint64_t acl_obj;
+ ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
if (mask == 0)
return (SET_ERROR(ENOSYS));
@@ -1963,7 +1951,6 @@
}
top:
mutex_enter(&zp->z_acl_lock);
- mutex_enter(&zp->z_lock);
tx = dmu_tx_create(zfsvfs->z_os);
@@ -1996,7 +1983,6 @@
error = dmu_tx_assign(tx, TXG_NOWAIT);
if (error) {
mutex_exit(&zp->z_acl_lock);
- mutex_exit(&zp->z_lock);
if (error == ERESTART) {
dmu_tx_wait(tx);
@@ -2021,8 +2007,6 @@
if (fuidp)
zfs_fuid_info_free(fuidp);
dmu_tx_commit(tx);
-done:
- mutex_exit(&zp->z_lock);
mutex_exit(&zp->z_acl_lock);
return (error);
@@ -2054,7 +2038,7 @@
return (SET_ERROR(EPERM));
}
-#ifdef sun
+#ifdef illumos
if ((v4_mode & (ACE_DELETE | ACE_DELETE_CHILD)) &&
(zp->z_pflags & ZFS_NOUNLINK)) {
return (SET_ERROR(EPERM));
@@ -2126,7 +2110,8 @@
mutex_enter(&zp->z_acl_lock);
- error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE);
+ ASSERT_VOP_LOCKED(ZTOV(zp), __func__);
+ error = zfs_acl_node_read(zp, &aclp, B_FALSE);
if (error != 0) {
mutex_exit(&zp->z_acl_lock);
return (error);
@@ -2375,7 +2360,7 @@
is_attr = ((zp->z_pflags & ZFS_XATTR) && (ZTOV(zp)->v_type == VDIR));
-#ifdef __FreeBSD__
+#ifdef __FreeBSD_kernel__
/*
* In FreeBSD, we don't care about permissions of individual ADS.
* Note that not checking them is not just an optimization - without
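
The zfs_acl.c hunks above replace the z_lock/have_lock juggling around ACL reads with an assertion that the znode's vnode lock is held, so zfs_acl_node_read() now takes only (zp, aclpp, will_modify). A sketch of the resulting calling pattern, illustrative only and not part of the commit (example_read_acl_hints() is a hypothetical name); the returned aclp must be consumed before z_acl_lock is dropped.

static int
example_read_acl_hints(znode_t *zp, uint64_t *hintsp)
{
	zfs_acl_t *aclp;
	int error;

	/* The vnode lock replaces the old z_lock protection. */
	ASSERT_VOP_LOCKED(ZTOV(zp), __func__);

	mutex_enter(&zp->z_acl_lock);
	error = zfs_acl_node_read(zp, &aclp, B_FALSE);
	if (error == 0)
		*hintsp = aclp->z_hints;
	mutex_exit(&zp->z_acl_lock);
	return (error);
}
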
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,7 +21,8 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved.
*/
/*
@@ -70,136 +71,253 @@
#include <sys/zfs_ioctl.h>
#include <sys/zfs_vfsops.h>
#include <sys/namei.h>
-#include <sys/gfs.h>
#include <sys/stat.h>
#include <sys/dmu.h>
+#include <sys/dsl_dataset.h>
#include <sys/dsl_destroy.h>
#include <sys/dsl_deleg.h>
#include <sys/mount.h>
-#include <sys/sunddi.h>
+#include <sys/zap.h>
#include "zfs_namecheck.h"
-typedef struct zfsctl_node {
- gfs_dir_t zc_gfs_private;
- uint64_t zc_id;
- timestruc_t zc_cmtime; /* ctime and mtime, always the same */
-} zfsctl_node_t;
+/* Common access mode for all virtual directories under the ctldir */
+const u_short zfsctl_ctldir_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP |
+ S_IROTH | S_IXOTH;
-typedef struct zfsctl_snapdir {
- zfsctl_node_t sd_node;
- kmutex_t sd_lock;
- avl_tree_t sd_snaps;
-} zfsctl_snapdir_t;
+/*
+ * "Synthetic" filesystem implementation.
+ */
-typedef struct {
- char *se_name;
- vnode_t *se_root;
- avl_node_t se_node;
-} zfs_snapentry_t;
+/*
+ * Assert that A implies B.
+ */
+#define KASSERT_IMPLY(A, B, msg) KASSERT(!(A) || (B), (msg));
+static MALLOC_DEFINE(M_SFSNODES, "sfs_nodes", "synthetic-fs nodes");
+
+typedef struct sfs_node {
+ char sn_name[ZFS_MAX_DATASET_NAME_LEN];
+ uint64_t sn_parent_id;
+ uint64_t sn_id;
+} sfs_node_t;
+
+/*
+ * Check the parent's ID as well as the node's to account for a chance
+ * that IDs originating from different domains (snapshot IDs, artifical
+ * IDs, znode IDs) may clash.
+ */
static int
-snapentry_compare(const void *a, const void *b)
+sfs_compare_ids(struct vnode *vp, void *arg)
{
- const zfs_snapentry_t *sa = a;
- const zfs_snapentry_t *sb = b;
- int ret = strcmp(sa->se_name, sb->se_name);
+ sfs_node_t *n1 = vp->v_data;
+ sfs_node_t *n2 = arg;
+ bool equal;
- if (ret < 0)
- return (-1);
- else if (ret > 0)
- return (1);
- else
- return (0);
+ equal = n1->sn_id == n2->sn_id &&
+ n1->sn_parent_id == n2->sn_parent_id;
+
+ /* Zero means equality. */
+ return (!equal);
}
-#ifdef sun
-vnodeops_t *zfsctl_ops_root;
-vnodeops_t *zfsctl_ops_snapdir;
-vnodeops_t *zfsctl_ops_snapshot;
-vnodeops_t *zfsctl_ops_shares;
-vnodeops_t *zfsctl_ops_shares_dir;
+static int
+sfs_vnode_get(const struct mount *mp, int flags, uint64_t parent_id,
+ uint64_t id, struct vnode **vpp)
+{
+ sfs_node_t search;
+ int err;
-static const fs_operation_def_t zfsctl_tops_root[];
-static const fs_operation_def_t zfsctl_tops_snapdir[];
-static const fs_operation_def_t zfsctl_tops_snapshot[];
-static const fs_operation_def_t zfsctl_tops_shares[];
-#else /* !sun */
-static struct vop_vector zfsctl_ops_root;
-static struct vop_vector zfsctl_ops_snapdir;
-static struct vop_vector zfsctl_ops_snapshot;
-static struct vop_vector zfsctl_ops_shares;
-static struct vop_vector zfsctl_ops_shares_dir;
-#endif /* !sun */
+ search.sn_id = id;
+ search.sn_parent_id = parent_id;
+ err = vfs_hash_get(mp, (u_int)id, flags, curthread, vpp,
+ sfs_compare_ids, &search);
+ return (err);
+}
-static vnode_t *zfsctl_mknode_snapdir(vnode_t *);
-static vnode_t *zfsctl_mknode_shares(vnode_t *);
-static vnode_t *zfsctl_snapshot_mknode(vnode_t *, uint64_t objset);
-static int zfsctl_unmount_snap(zfs_snapentry_t *, int, cred_t *);
+static int
+sfs_vnode_insert(struct vnode *vp, int flags, uint64_t parent_id,
+ uint64_t id, struct vnode **vpp)
+{
+ int err;
-#ifdef sun
-static gfs_opsvec_t zfsctl_opsvec[] = {
- { ".zfs", zfsctl_tops_root, &zfsctl_ops_root },
- { ".zfs/snapshot", zfsctl_tops_snapdir, &zfsctl_ops_snapdir },
- { ".zfs/snapshot/vnode", zfsctl_tops_snapshot, &zfsctl_ops_snapshot },
- { ".zfs/shares", zfsctl_tops_shares, &zfsctl_ops_shares_dir },
- { ".zfs/shares/vnode", zfsctl_tops_shares, &zfsctl_ops_shares },
- { NULL }
-};
-#endif /* sun */
+ KASSERT(vp->v_data != NULL, ("sfs_vnode_insert with NULL v_data"));
+ err = vfs_hash_insert(vp, (u_int)id, flags, curthread, vpp,
+ sfs_compare_ids, vp->v_data);
+ return (err);
+}
-/*
- * Root directory elements. We only have two entries
- * snapshot and shares.
- */
-static gfs_dirent_t zfsctl_root_entries[] = {
- { "snapshot", zfsctl_mknode_snapdir, GFS_CACHE_VNODE },
- { "shares", zfsctl_mknode_shares, GFS_CACHE_VNODE },
- { NULL }
-};
+static void
+sfs_vnode_remove(struct vnode *vp)
+{
+ vfs_hash_remove(vp);
+}
-/* include . and .. in the calculation */
-#define NROOT_ENTRIES ((sizeof (zfsctl_root_entries) / \
- sizeof (gfs_dirent_t)) + 1)
+typedef void sfs_vnode_setup_fn(vnode_t *vp, void *arg);
+static int
+sfs_vgetx(struct mount *mp, int flags, uint64_t parent_id, uint64_t id,
+ const char *tag, struct vop_vector *vops,
+ sfs_vnode_setup_fn setup, void *arg,
+ struct vnode **vpp)
+{
+ struct vnode *vp;
+ int error;
+ error = sfs_vnode_get(mp, flags, parent_id, id, vpp);
+ if (error != 0 || *vpp != NULL) {
+ KASSERT_IMPLY(error == 0, (*vpp)->v_data != NULL,
+ "sfs vnode with no data");
+ return (error);
+ }
+
+ /* Allocate a new vnode/inode. */
+ error = getnewvnode(tag, mp, vops, &vp);
+ if (error != 0) {
+ *vpp = NULL;
+ return (error);
+ }
+
+ /*
+ * Exclusively lock the vnode vnode while it's being constructed.
+ */
+ lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL);
+ error = insmntque(vp, mp);
+ if (error != 0) {
+ *vpp = NULL;
+ return (error);
+ }
+
+ setup(vp, arg);
+
+ error = sfs_vnode_insert(vp, flags, parent_id, id, vpp);
+ if (error != 0 || *vpp != NULL) {
+ KASSERT_IMPLY(error == 0, (*vpp)->v_data != NULL,
+ "sfs vnode with no data");
+ return (error);
+ }
+
+ *vpp = vp;
+ return (0);
+}
+
+static void
+sfs_print_node(sfs_node_t *node)
+{
+ printf("\tname = %s\n", node->sn_name);
+ printf("\tparent_id = %ju\n", (uintmax_t)node->sn_parent_id);
+ printf("\tid = %ju\n", (uintmax_t)node->sn_id);
+}
+
+static sfs_node_t *
+sfs_alloc_node(size_t size, const char *name, uint64_t parent_id, uint64_t id)
+{
+ struct sfs_node *node;
+
+ KASSERT(strlen(name) < sizeof(node->sn_name),
+ ("sfs node name is too long"));
+ KASSERT(size >= sizeof(*node), ("sfs node size is too small"));
+ node = malloc(size, M_SFSNODES, M_WAITOK | M_ZERO);
+ strlcpy(node->sn_name, name, sizeof(node->sn_name));
+ node->sn_parent_id = parent_id;
+ node->sn_id = id;
+
+ return (node);
+}
+
+static void
+sfs_destroy_node(sfs_node_t *node)
+{
+ free(node, M_SFSNODES);
+}
+
+static void *
+sfs_reclaim_vnode(vnode_t *vp)
+{
+ sfs_node_t *node;
+ void *data;
+
+ sfs_vnode_remove(vp);
+ data = vp->v_data;
+ vp->v_data = NULL;
+ return (data);
+}
+
+static int
+sfs_readdir_common(uint64_t parent_id, uint64_t id, struct vop_readdir_args *ap,
+ uio_t *uio, off_t *offp)
+{
+ struct dirent entry;
+ int error;
+
+ /* Reset ncookies for subsequent use of vfs_read_dirent. */
+ if (ap->a_ncookies != NULL)
+ *ap->a_ncookies = 0;
+
+ if (uio->uio_resid < sizeof(entry))
+ return (SET_ERROR(EINVAL));
+
+ if (uio->uio_offset < 0)
+ return (SET_ERROR(EINVAL));
+ if (uio->uio_offset == 0) {
+ entry.d_fileno = id;
+ entry.d_type = DT_DIR;
+ entry.d_name[0] = '.';
+ entry.d_name[1] = '\0';
+ entry.d_namlen = 1;
+ entry.d_reclen = sizeof(entry);
+ error = vfs_read_dirent(ap, &entry, uio->uio_offset);
+ if (error != 0)
+ return (SET_ERROR(error));
+ }
+
+ if (uio->uio_offset < sizeof(entry))
+ return (SET_ERROR(EINVAL));
+ if (uio->uio_offset == sizeof(entry)) {
+ entry.d_fileno = parent_id;
+ entry.d_type = DT_DIR;
+ entry.d_name[0] = '.';
+ entry.d_name[1] = '.';
+ entry.d_name[2] = '\0';
+ entry.d_namlen = 2;
+ entry.d_reclen = sizeof(entry);
+ error = vfs_read_dirent(ap, &entry, uio->uio_offset);
+ if (error != 0)
+ return (SET_ERROR(error));
+ }
+
+ if (offp != NULL)
+ *offp = 2 * sizeof(entry);
+ return (0);
+}
+
+
/*
- * Initialize the various GFS pieces we'll need to create and manipulate .zfs
- * directories. This is called from the ZFS init routine, and initializes the
- * vnode ops vectors that we'll be using.
+ * .zfs inode namespace
+ *
+ * We need to generate unique inode numbers for all files and directories
+ * within the .zfs pseudo-filesystem. We use the following scheme:
+ *
+ * ENTRY ZFSCTL_INODE
+ * .zfs 1
+ * .zfs/snapshot 2
+ * .zfs/snapshot/<snap> objectid(snap)
*/
+#define ZFSCTL_INO_SNAP(id) (id)
+
+static struct vop_vector zfsctl_ops_root;
+static struct vop_vector zfsctl_ops_snapdir;
+static struct vop_vector zfsctl_ops_snapshot;
+static struct vop_vector zfsctl_ops_shares_dir;
+
void
zfsctl_init(void)
{
-#ifdef sun
- VERIFY(gfs_make_opsvec(zfsctl_opsvec) == 0);
-#endif
}
void
zfsctl_fini(void)
{
-#ifdef sun
- /*
- * Remove vfsctl vnode ops
- */
- if (zfsctl_ops_root)
- vn_freevnodeops(zfsctl_ops_root);
- if (zfsctl_ops_snapdir)
- vn_freevnodeops(zfsctl_ops_snapdir);
- if (zfsctl_ops_snapshot)
- vn_freevnodeops(zfsctl_ops_snapshot);
- if (zfsctl_ops_shares)
- vn_freevnodeops(zfsctl_ops_shares);
- if (zfsctl_ops_shares_dir)
- vn_freevnodeops(zfsctl_ops_shares_dir);
-
- zfsctl_ops_root = NULL;
- zfsctl_ops_snapdir = NULL;
- zfsctl_ops_snapshot = NULL;
- zfsctl_ops_shares = NULL;
- zfsctl_ops_shares_dir = NULL;
-#endif /* sun */
}
boolean_t
@@ -208,96 +326,119 @@
return (vn_matchops(vp, zfsctl_ops_root) ||
vn_matchops(vp, zfsctl_ops_snapdir) ||
vn_matchops(vp, zfsctl_ops_snapshot) ||
- vn_matchops(vp, zfsctl_ops_shares) ||
vn_matchops(vp, zfsctl_ops_shares_dir));
}
-/*
- * Return the inode number associated with the 'snapshot' or
- * 'shares' directory.
- */
-/* ARGSUSED */
-static ino64_t
-zfsctl_root_inode_cb(vnode_t *vp, int index)
-{
- zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+typedef struct zfsctl_root {
+ sfs_node_t node;
+ sfs_node_t *snapdir;
+ timestruc_t cmtime;
+} zfsctl_root_t;
- ASSERT(index <= 2);
- if (index == 0)
- return (ZFSCTL_INO_SNAPDIR);
-
- return (zfsvfs->z_shares_dir);
-}
-
/*
- * Create the '.zfs' directory. This directory is cached as part of the VFS
- * structure. This results in a hold on the vfs_t. The code in zfs_umount()
- * therefore checks against a vfs_count of 2 instead of 1. This reference
- * is removed when the ctldir is destroyed in the unmount.
+ * Create the '.zfs' directory.
*/
void
zfsctl_create(zfsvfs_t *zfsvfs)
{
- vnode_t *vp, *rvp;
- zfsctl_node_t *zcp;
+ zfsctl_root_t *dot_zfs;
+ sfs_node_t *snapdir;
+ vnode_t *rvp;
uint64_t crtime[2];
ASSERT(zfsvfs->z_ctldir == NULL);
- vp = gfs_root_create(sizeof (zfsctl_node_t), zfsvfs->z_vfs,
- &zfsctl_ops_root, ZFSCTL_INO_ROOT, zfsctl_root_entries,
- zfsctl_root_inode_cb, MAXNAMELEN, NULL, NULL);
- zcp = vp->v_data;
- zcp->zc_id = ZFSCTL_INO_ROOT;
+ snapdir = sfs_alloc_node(sizeof(*snapdir), "snapshot", ZFSCTL_INO_ROOT,
+ ZFSCTL_INO_SNAPDIR);
+ dot_zfs = (zfsctl_root_t *)sfs_alloc_node(sizeof(*dot_zfs), ".zfs", 0,
+ ZFSCTL_INO_ROOT);
+ dot_zfs->snapdir = snapdir;
VERIFY(VFS_ROOT(zfsvfs->z_vfs, LK_EXCLUSIVE, &rvp) == 0);
VERIFY(0 == sa_lookup(VTOZ(rvp)->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
- &crtime, sizeof (crtime)));
- ZFS_TIME_DECODE(&zcp->zc_cmtime, crtime);
- VN_URELE(rvp);
+ &crtime, sizeof(crtime)));
+ ZFS_TIME_DECODE(&dot_zfs->cmtime, crtime);
+ vput(rvp);
- /*
- * We're only faking the fact that we have a root of a filesystem for
- * the sake of the GFS interfaces. Undo the flag manipulation it did
- * for us.
- */
- vp->v_vflag &= ~VV_ROOT;
-
- zfsvfs->z_ctldir = vp;
-
- VOP_UNLOCK(vp, 0);
+ zfsvfs->z_ctldir = dot_zfs;
}
/*
* Destroy the '.zfs' directory. Only called when the filesystem is unmounted.
- * There might still be more references if we were force unmounted, but only
- * new zfs_inactive() calls can occur and they don't reference .zfs
+ * The nodes must not have any associated vnodes by now as they should be
+ * vflush-ed.
*/
void
zfsctl_destroy(zfsvfs_t *zfsvfs)
{
- VN_RELE(zfsvfs->z_ctldir);
+ sfs_destroy_node(zfsvfs->z_ctldir->snapdir);
+ sfs_destroy_node((sfs_node_t *)zfsvfs->z_ctldir);
zfsvfs->z_ctldir = NULL;
}
+static int
+zfsctl_fs_root_vnode(struct mount *mp, void *arg __unused, int flags,
+ struct vnode **vpp)
+{
+ return (VFS_ROOT(mp, flags, vpp));
+}
+
+static void
+zfsctl_common_vnode_setup(vnode_t *vp, void *arg)
+{
+ ASSERT_VOP_ELOCKED(vp, __func__);
+
+ /* We support shared locking. */
+ VN_LOCK_ASHARE(vp);
+ vp->v_type = VDIR;
+ vp->v_data = arg;
+}
+
+static int
+zfsctl_root_vnode(struct mount *mp, void *arg __unused, int flags,
+ struct vnode **vpp)
+{
+ void *node;
+ int err;
+
+ node = ((zfsvfs_t*)mp->mnt_data)->z_ctldir;
+ err = sfs_vgetx(mp, flags, 0, ZFSCTL_INO_ROOT, "zfs", &zfsctl_ops_root,
+ zfsctl_common_vnode_setup, node, vpp);
+ return (err);
+}
+
+static int
+zfsctl_snapdir_vnode(struct mount *mp, void *arg __unused, int flags,
+ struct vnode **vpp)
+{
+ void *node;
+ int err;
+
+ node = ((zfsvfs_t*)mp->mnt_data)->z_ctldir->snapdir;
+ err = sfs_vgetx(mp, flags, ZFSCTL_INO_ROOT, ZFSCTL_INO_SNAPDIR, "zfs",
+ &zfsctl_ops_snapdir, zfsctl_common_vnode_setup, node, vpp);
+ return (err);
+}
+
/*
* Given a root znode, retrieve the associated .zfs directory.
* Add a hold to the vnode and return it.
*/
-vnode_t *
-zfsctl_root(znode_t *zp)
+int
+zfsctl_root(zfsvfs_t *zfsvfs, int flags, vnode_t **vpp)
{
- ASSERT(zfs_has_ctldir(zp));
- VN_HOLD(zp->z_zfsvfs->z_ctldir);
- return (zp->z_zfsvfs->z_ctldir);
+ vnode_t *vp;
+ int error;
+
+ error = zfsctl_root_vnode(zfsvfs->z_vfs, NULL, flags, vpp);
+ return (error);
}
/*
* Common open routine. Disallow any write access.
*/
-/* ARGSUSED */
static int
zfsctl_common_open(struct vop_open_args *ap)
{
@@ -322,7 +463,6 @@
/*
* Common access routine. Disallow writes.
*/
-/* ARGSUSED */
static int
zfsctl_common_access(ap)
struct vop_access_args /* {
@@ -334,18 +474,8 @@
{
accmode_t accmode = ap->a_accmode;
-#ifdef TODO
- if (flags & V_ACE_MASK) {
- if (accmode & ACE_ALL_WRITE_PERMS)
- return (SET_ERROR(EACCES));
- } else {
-#endif
- if (accmode & VWRITE)
- return (SET_ERROR(EACCES));
-#ifdef TODO
- }
-#endif
-
+ if (accmode & VWRITE)
+ return (SET_ERROR(EACCES));
return (0);
}
@@ -356,7 +486,10 @@
zfsctl_common_getattr(vnode_t *vp, vattr_t *vap)
{
timestruc_t now;
+ sfs_node_t *node;
+ node = vp->v_data;
+
vap->va_uid = 0;
vap->va_gid = 0;
vap->va_rdev = 0;
@@ -368,8 +501,7 @@
vap->va_nblocks = 0;
vap->va_seq = 0;
vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
- vap->va_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP |
- S_IROTH | S_IXOTH;
+ vap->va_mode = zfsctl_ctldir_mode;
vap->va_type = VDIR;
/*
* We live in the now (for atime).
@@ -378,9 +510,13 @@
vap->va_atime = now;
/* FreeBSD: Reset chflags(2) flags. */
vap->va_flags = 0;
+
+ vap->va_nodeid = node->sn_id;
+
+ /* At least '.' and '..'. */
+ vap->va_nlink = 2;
}
-/*ARGSUSED*/
static int
zfsctl_common_fid(ap)
struct vop_fid_args /* {
@@ -390,107 +526,50 @@
{
vnode_t *vp = ap->a_vp;
fid_t *fidp = (void *)ap->a_fid;
- zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
- zfsctl_node_t *zcp = vp->v_data;
- uint64_t object = zcp->zc_id;
+ sfs_node_t *node = vp->v_data;
+ uint64_t object = node->sn_id;
zfid_short_t *zfid;
int i;
- ZFS_ENTER(zfsvfs);
-
-#ifdef illumos
- if (fidp->fid_len < SHORT_FID_LEN) {
- fidp->fid_len = SHORT_FID_LEN;
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(ENOSPC));
- }
-#else
- fidp->fid_len = SHORT_FID_LEN;
-#endif
-
zfid = (zfid_short_t *)fidp;
-
zfid->zf_len = SHORT_FID_LEN;
- for (i = 0; i < sizeof (zfid->zf_object); i++)
+ for (i = 0; i < sizeof(zfid->zf_object); i++)
zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
- /* .zfs znodes always have a generation number of 0 */
- for (i = 0; i < sizeof (zfid->zf_gen); i++)
+ /* .zfs nodes always have a generation number of 0 */
+ for (i = 0; i < sizeof(zfid->zf_gen); i++)
zfid->zf_gen[i] = 0;
- ZFS_EXIT(zfsvfs);
return (0);
}
-
-/*ARGSUSED*/
static int
-zfsctl_shares_fid(ap)
- struct vop_fid_args /* {
+zfsctl_common_reclaim(ap)
+ struct vop_reclaim_args /* {
struct vnode *a_vp;
- struct fid *a_fid;
+ struct thread *a_td;
} */ *ap;
{
- vnode_t *vp = ap->a_vp;
- fid_t *fidp = (void *)ap->a_fid;
- zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
- znode_t *dzp;
- int error;
+ vnode_t *vp = ap->a_vp;
- ZFS_ENTER(zfsvfs);
-
- if (zfsvfs->z_shares_dir == 0) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(ENOTSUP));
- }
-
- if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) {
- error = VOP_FID(ZTOV(dzp), fidp);
- VN_RELE(ZTOV(dzp));
- }
-
- ZFS_EXIT(zfsvfs);
- return (error);
+ (void) sfs_reclaim_vnode(vp);
+ return (0);
}
static int
-zfsctl_common_reclaim(ap)
- struct vop_reclaim_args /* {
+zfsctl_common_print(ap)
+ struct vop_print_args /* {
struct vnode *a_vp;
- struct thread *a_td;
} */ *ap;
{
- vnode_t *vp = ap->a_vp;
-
- /*
- * Destroy the vm object and flush associated pages.
- */
- vnode_destroy_vobject(vp);
- VI_LOCK(vp);
- vp->v_data = NULL;
- VI_UNLOCK(vp);
+ sfs_print_node(ap->a_vp->v_data);
return (0);
}
/*
- * .zfs inode namespace
- *
- * We need to generate unique inode numbers for all files and directories
- * within the .zfs pseudo-filesystem. We use the following scheme:
- *
- * ENTRY ZFSCTL_INODE
- * .zfs 1
- * .zfs/snapshot 2
- * .zfs/snapshot/<snap> objectid(snap)
- */
-
-#define ZFSCTL_INO_SNAP(id) (id)
-
-/*
* Get root directory attributes.
*/
-/* ARGSUSED */
static int
zfsctl_root_getattr(ap)
struct vop_getattr_args /* {
@@ -501,96 +580,45 @@
{
struct vnode *vp = ap->a_vp;
struct vattr *vap = ap->a_vap;
- zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
- zfsctl_node_t *zcp = vp->v_data;
+ zfsctl_root_t *node = vp->v_data;
- ZFS_ENTER(zfsvfs);
- vap->va_nodeid = ZFSCTL_INO_ROOT;
- vap->va_nlink = vap->va_size = NROOT_ENTRIES;
- vap->va_mtime = vap->va_ctime = zcp->zc_cmtime;
+ zfsctl_common_getattr(vp, vap);
+ vap->va_ctime = node->cmtime;
+ vap->va_mtime = vap->va_ctime;
vap->va_birthtime = vap->va_ctime;
-
- zfsctl_common_getattr(vp, vap);
- ZFS_EXIT(zfsvfs);
-
+ vap->va_nlink += 1; /* snapdir */
+ vap->va_size = vap->va_nlink;
return (0);
}
/*
- * Special case the handling of "..".
+ * When we lookup "." we still can be asked to lock it
+ * differently, can't we?
*/
-/* ARGSUSED */
int
-zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp,
- int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
- int *direntflags, pathname_t *realpnp)
+zfsctl_relock_dot(vnode_t *dvp, int ltype)
{
- zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
- int err;
+ vref(dvp);
+ if (ltype != VOP_ISLOCKED(dvp)) {
+ if (ltype == LK_EXCLUSIVE)
+ vn_lock(dvp, LK_UPGRADE | LK_RETRY);
+ else /* if (ltype == LK_SHARED) */
+ vn_lock(dvp, LK_DOWNGRADE | LK_RETRY);
- /*
- * No extended attributes allowed under .zfs
- */
- if (flags & LOOKUP_XATTR)
- return (SET_ERROR(EINVAL));
-
- ZFS_ENTER(zfsvfs);
-
- if (strcmp(nm, "..") == 0) {
- err = VFS_ROOT(dvp->v_vfsp, LK_EXCLUSIVE, vpp);
- if (err == 0)
- VOP_UNLOCK(*vpp, 0);
- } else {
- err = gfs_vop_lookup(dvp, nm, vpp, pnp, flags, rdir,
- cr, ct, direntflags, realpnp);
+ /* Relock for the "." case may left us with reclaimed vnode. */
+ if ((dvp->v_iflag & VI_DOOMED) != 0) {
+ vrele(dvp);
+ return (SET_ERROR(ENOENT));
+ }
}
-
- ZFS_EXIT(zfsvfs);
-
- return (err);
+ return (0);
}
-#ifdef sun
-static int
-zfsctl_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
- caller_context_t *ct)
-{
- /*
- * We only care about ACL_ENABLED so that libsec can
- * display ACL correctly and not default to POSIX draft.
- */
- if (cmd == _PC_ACL_ENABLED) {
- *valp = _ACL_ACE_ENABLED;
- return (0);
- }
-
- return (fs_pathconf(vp, cmd, valp, cr, ct));
-}
-#endif /* sun */
-
-#ifdef sun
-static const fs_operation_def_t zfsctl_tops_root[] = {
- { VOPNAME_OPEN, { .vop_open = zfsctl_common_open } },
- { VOPNAME_CLOSE, { .vop_close = zfsctl_common_close } },
- { VOPNAME_IOCTL, { .error = fs_inval } },
- { VOPNAME_GETATTR, { .vop_getattr = zfsctl_root_getattr } },
- { VOPNAME_ACCESS, { .vop_access = zfsctl_common_access } },
- { VOPNAME_READDIR, { .vop_readdir = gfs_vop_readdir } },
- { VOPNAME_LOOKUP, { .vop_lookup = zfsctl_root_lookup } },
- { VOPNAME_SEEK, { .vop_seek = fs_seek } },
- { VOPNAME_INACTIVE, { .vop_inactive = gfs_vop_inactive } },
- { VOPNAME_PATHCONF, { .vop_pathconf = zfsctl_pathconf } },
- { VOPNAME_FID, { .vop_fid = zfsctl_common_fid } },
- { NULL }
-};
-#endif /* sun */
-
/*
* Special case the handling of "..".
*/
-/* ARGSUSED */
int
-zfsctl_freebsd_root_lookup(ap)
+zfsctl_root_lookup(ap)
struct vop_lookup_args /* {
struct vnode *a_dvp;
struct vnode **a_vpp;
@@ -597,329 +625,282 @@
struct componentname *a_cnp;
} */ *ap;
{
+ struct componentname *cnp = ap->a_cnp;
vnode_t *dvp = ap->a_dvp;
vnode_t **vpp = ap->a_vpp;
cred_t *cr = ap->a_cnp->cn_cred;
int flags = ap->a_cnp->cn_flags;
+ int lkflags = ap->a_cnp->cn_lkflags;
int nameiop = ap->a_cnp->cn_nameiop;
- char nm[NAME_MAX + 1];
int err;
+ int ltype;
- if ((flags & ISLASTCN) && (nameiop == RENAME || nameiop == CREATE))
- return (EOPNOTSUPP);
+ ASSERT(dvp->v_type == VDIR);
- ASSERT(ap->a_cnp->cn_namelen < sizeof(nm));
- strlcpy(nm, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1);
+ if ((flags & ISLASTCN) != 0 && nameiop != LOOKUP)
+ return (SET_ERROR(ENOTSUP));
- err = zfsctl_root_lookup(dvp, nm, vpp, NULL, 0, NULL, cr, NULL, NULL, NULL);
- if (err == 0 && (nm[0] != '.' || nm[1] != '\0'))
- vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
+ if (cnp->cn_namelen == 1 && *cnp->cn_nameptr == '.') {
+ err = zfsctl_relock_dot(dvp, lkflags & LK_TYPE_MASK);
+ if (err == 0)
+ *vpp = dvp;
+ } else if ((flags & ISDOTDOT) != 0) {
+ err = vn_vget_ino_gen(dvp, zfsctl_fs_root_vnode, NULL,
+ lkflags, vpp);
+ } else if (strncmp(cnp->cn_nameptr, "snapshot", cnp->cn_namelen) == 0) {
+ err = zfsctl_snapdir_vnode(dvp->v_mount, NULL, lkflags, vpp);
+ } else {
+ err = SET_ERROR(ENOENT);
+ }
+ if (err != 0)
+ *vpp = NULL;
return (err);
}
-static struct vop_vector zfsctl_ops_root = {
- .vop_default = &default_vnodeops,
- .vop_open = zfsctl_common_open,
- .vop_close = zfsctl_common_close,
- .vop_ioctl = VOP_EINVAL,
- .vop_getattr = zfsctl_root_getattr,
- .vop_access = zfsctl_common_access,
- .vop_readdir = gfs_vop_readdir,
- .vop_lookup = zfsctl_freebsd_root_lookup,
- .vop_inactive = gfs_vop_inactive,
- .vop_reclaim = zfsctl_common_reclaim,
-#ifdef TODO
- .vop_pathconf = zfsctl_pathconf,
-#endif
- .vop_fid = zfsctl_common_fid,
-};
-
-/*
- * Gets the full dataset name that corresponds to the given snapshot name
- * Example:
- * zfsctl_snapshot_zname("snap1") -> "mypool/myfs at snap1"
- */
static int
-zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname)
+zfsctl_root_readdir(ap)
+ struct vop_readdir_args /* {
+ struct vnode *a_vp;
+ struct uio *a_uio;
+ struct ucred *a_cred;
+ int *a_eofflag;
+ int *ncookies;
+ u_long **a_cookies;
+ } */ *ap;
{
- objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os;
+ struct dirent entry;
+ vnode_t *vp = ap->a_vp;
+ zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+ zfsctl_root_t *node = vp->v_data;
+ uio_t *uio = ap->a_uio;
+ int *eofp = ap->a_eofflag;
+ off_t dots_offset;
+ int error;
- if (snapshot_namecheck(name, NULL, NULL) != 0)
- return (SET_ERROR(EILSEQ));
- dmu_objset_name(os, zname);
- if (strlen(zname) + 1 + strlen(name) >= len)
- return (SET_ERROR(ENAMETOOLONG));
- (void) strcat(zname, "@");
- (void) strcat(zname, name);
+ ASSERT(vp->v_type == VDIR);
+
+ error = sfs_readdir_common(zfsvfs->z_root, ZFSCTL_INO_ROOT, ap, uio,
+ &dots_offset);
+ if (error != 0) {
+ if (error == ENAMETOOLONG) /* ran out of destination space */
+ error = 0;
+ return (error);
+ }
+ if (uio->uio_offset != dots_offset)
+ return (SET_ERROR(EINVAL));
+
+ CTASSERT(sizeof(node->snapdir->sn_name) <= sizeof(entry.d_name));
+ entry.d_fileno = node->snapdir->sn_id;
+ entry.d_type = DT_DIR;
+ strcpy(entry.d_name, node->snapdir->sn_name);
+ entry.d_namlen = strlen(entry.d_name);
+ entry.d_reclen = sizeof(entry);
+ error = vfs_read_dirent(ap, &entry, uio->uio_offset);
+ if (error != 0) {
+ if (error == ENAMETOOLONG)
+ error = 0;
+ return (SET_ERROR(error));
+ }
+ if (eofp != NULL)
+ *eofp = 1;
return (0);
}
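
The effect of this readdir is easy to observe from userland. A minimal
sketch, assuming purely as an example a pool mounted at /tank; with the
code above it should print only ".", ".." and "snapshot":

#include <dirent.h>
#include <stdio.h>

int
main(void)
{
	DIR *dirp;
	struct dirent *dp;

	/* /tank is a placeholder for any mounted ZFS dataset. */
	dirp = opendir("/tank/.zfs");
	if (dirp == NULL) {
		perror("opendir");
		return (1);
	}
	while ((dp = readdir(dirp)) != NULL)
		printf("%s\n", dp->d_name);
	closedir(dirp);
	return (0);
}
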
static int
-zfsctl_unmount_snap(zfs_snapentry_t *sep, int fflags, cred_t *cr)
+zfsctl_root_vptocnp(struct vop_vptocnp_args *ap)
{
- vnode_t *svp = sep->se_root;
+ static const char dotzfs_name[4] = ".zfs";
+ vnode_t *dvp;
int error;
- ASSERT(vn_ismntpt(svp));
+ if (*ap->a_buflen < sizeof (dotzfs_name))
+ return (SET_ERROR(ENOMEM));
- /* this will be dropped by dounmount() */
- if ((error = vn_vfswlock(svp)) != 0)
- return (error);
+ error = vn_vget_ino_gen(ap->a_vp, zfsctl_fs_root_vnode, NULL,
+ LK_SHARED, &dvp);
+ if (error != 0)
+ return (SET_ERROR(error));
-#ifdef sun
- VN_HOLD(svp);
- error = dounmount(vn_mountedvfs(svp), fflags, cr);
- if (error) {
- VN_RELE(svp);
- return (error);
- }
-
- /*
- * We can't use VN_RELE(), as that will try to invoke
- * zfsctl_snapdir_inactive(), which would cause us to destroy
- * the sd_lock mutex held by our caller.
- */
- ASSERT(svp->v_count == 1);
- gfs_vop_inactive(svp, cr, NULL);
-
- kmem_free(sep->se_name, strlen(sep->se_name) + 1);
- kmem_free(sep, sizeof (zfs_snapentry_t));
-
+ VOP_UNLOCK(dvp, 0);
+ *ap->a_vpp = dvp;
+ *ap->a_buflen -= sizeof (dotzfs_name);
+ bcopy(dotzfs_name, ap->a_buf + *ap->a_buflen, sizeof (dotzfs_name));
return (0);
-#else /* !sun */
- return (dounmount(vn_mountedvfs(svp), fflags, curthread));
-#endif /* !sun */
}
-#ifdef sun
-static void
-zfsctl_rename_snap(zfsctl_snapdir_t *sdp, zfs_snapentry_t *sep, const char *nm)
+static int
+zfsctl_common_pathconf(ap)
+ struct vop_pathconf_args /* {
+ struct vnode *a_vp;
+ int a_name;
+ int *a_retval;
+ } */ *ap;
{
- avl_index_t where;
- vfs_t *vfsp;
- refstr_t *pathref;
- char newpath[MAXNAMELEN];
- char *tail;
+ /*
+ * We care about ACL variables so that userland utilities like ls
+ * can display them correctly. Since the ctldir's st_dev is set to be
+ * the same as the parent dataset, we must support all variables that
+ * it supports.
+ */
+ switch (ap->a_name) {
+ case _PC_LINK_MAX:
+ *ap->a_retval = INT_MAX;
+ return (0);
- ASSERT(MUTEX_HELD(&sdp->sd_lock));
- ASSERT(sep != NULL);
+ case _PC_FILESIZEBITS:
+ *ap->a_retval = 64;
+ return (0);
- vfsp = vn_mountedvfs(sep->se_root);
- ASSERT(vfsp != NULL);
+ case _PC_MIN_HOLE_SIZE:
+ *ap->a_retval = (int)SPA_MINBLOCKSIZE;
+ return (0);
- vfs_lock_wait(vfsp);
+ case _PC_ACL_EXTENDED:
+ *ap->a_retval = 0;
+ return (0);
- /*
- * Change the name in the AVL tree.
- */
- avl_remove(&sdp->sd_snaps, sep);
- kmem_free(sep->se_name, strlen(sep->se_name) + 1);
- sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP);
- (void) strcpy(sep->se_name, nm);
- VERIFY(avl_find(&sdp->sd_snaps, sep, &where) == NULL);
- avl_insert(&sdp->sd_snaps, sep, where);
+ case _PC_ACL_NFS4:
+ *ap->a_retval = 1;
+ return (0);
- /*
- * Change the current mountpoint info:
- * - update the tail of the mntpoint path
- * - update the tail of the resource path
- */
- pathref = vfs_getmntpoint(vfsp);
- (void) strncpy(newpath, refstr_value(pathref), sizeof (newpath));
- VERIFY((tail = strrchr(newpath, '/')) != NULL);
- *(tail+1) = '\0';
- ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath));
- (void) strcat(newpath, nm);
- refstr_rele(pathref);
- vfs_setmntpoint(vfsp, newpath, 0);
+ case _PC_ACL_PATH_MAX:
+ *ap->a_retval = ACL_MAX_ENTRIES;
+ return (0);
- pathref = vfs_getresource(vfsp);
- (void) strncpy(newpath, refstr_value(pathref), sizeof (newpath));
- VERIFY((tail = strrchr(newpath, '@')) != NULL);
- *(tail+1) = '\0';
- ASSERT3U(strlen(newpath) + strlen(nm), <, sizeof (newpath));
- (void) strcat(newpath, nm);
- refstr_rele(pathref);
- vfs_setresource(vfsp, newpath, 0);
+ case _PC_NAME_MAX:
+ *ap->a_retval = NAME_MAX;
+ return (0);
- vfs_unlock(vfsp);
+ default:
+ return (vop_stdpathconf(ap));
+ }
}
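
These values surface directly through pathconf(2). A minimal sketch,
where the path is only an example and _PC_ACL_NFS4 is a FreeBSD-specific
pathconf name:

#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	const char *path = "/tank/.zfs";	/* example path only */

	printf("_PC_NAME_MAX = %ld\n", pathconf(path, _PC_NAME_MAX));
	printf("_PC_LINK_MAX = %ld\n", pathconf(path, _PC_LINK_MAX));
	printf("_PC_ACL_NFS4 = %ld\n", pathconf(path, _PC_ACL_NFS4));
	return (0);
}
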
-#endif /* sun */
-#ifdef sun
-/*ARGSUSED*/
-static int
-zfsctl_snapdir_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm,
- cred_t *cr, caller_context_t *ct, int flags)
+/*
+ * Returns a trivial ACL.
+ */
+int
+zfsctl_common_getacl(ap)
+ struct vop_getacl_args /* {
+ struct vnode *vp;
+ acl_type_t a_type;
+ struct acl *a_aclp;
+ struct ucred *cred;
+ struct thread *td;
+ } */ *ap;
{
- zfsctl_snapdir_t *sdp = sdvp->v_data;
- zfs_snapentry_t search, *sep;
- zfsvfs_t *zfsvfs;
- avl_index_t where;
- char from[MAXNAMELEN], to[MAXNAMELEN];
- char real[MAXNAMELEN], fsname[MAXNAMELEN];
- int err;
+ int i;
- zfsvfs = sdvp->v_vfsp->vfs_data;
- ZFS_ENTER(zfsvfs);
+ if (ap->a_type != ACL_TYPE_NFS4)
+ return (EINVAL);
- if ((flags & FIGNORECASE) || zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
- err = dmu_snapshot_realname(zfsvfs->z_os, snm, real,
- MAXNAMELEN, NULL);
- if (err == 0) {
- snm = real;
- } else if (err != ENOTSUP) {
- ZFS_EXIT(zfsvfs);
- return (err);
- }
- }
-
- ZFS_EXIT(zfsvfs);
-
- dmu_objset_name(zfsvfs->z_os, fsname);
-
- err = zfsctl_snapshot_zname(sdvp, snm, MAXNAMELEN, from);
- if (err == 0)
- err = zfsctl_snapshot_zname(tdvp, tnm, MAXNAMELEN, to);
- if (err == 0)
- err = zfs_secpolicy_rename_perms(from, to, cr);
- if (err != 0)
- return (err);
-
+ acl_nfs4_sync_acl_from_mode(ap->a_aclp, zfsctl_ctldir_mode, 0);
/*
- * Cannot move snapshots out of the snapdir.
+ * acl_nfs4_sync_acl_from_mode assumes that the owner can always modify
+ * attributes. That is not the case for the ctldir, so we must clear
+ * those bits. We also must clear ACL_READ_NAMED_ATTRS, because xattrs
+ * aren't supported by the ctldir.
*/
- if (sdvp != tdvp)
- return (SET_ERROR(EINVAL));
+ for (i = 0; i < ap->a_aclp->acl_cnt; i++) {
+ struct acl_entry *entry;
+ entry = &(ap->a_aclp->acl_entry[i]);
+ entry->ae_perm &= ~(ACL_WRITE_ACL | ACL_WRITE_OWNER |
+ ACL_WRITE_ATTRIBUTES | ACL_WRITE_NAMED_ATTRS |
+ ACL_READ_NAMED_ATTRS);
+ }
- if (strcmp(snm, tnm) == 0)
- return (0);
+ return (0);
+}
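
The synthesized trivial ACL can be inspected with the NFSv4 ACL library
calls. A minimal sketch, with an example path; the entries printed should
not carry the write_acl/write_owner/write_attributes permissions cleared
by the loop above:

#include <sys/types.h>
#include <sys/acl.h>
#include <stdio.h>

int
main(void)
{
	acl_t acl;
	char *text;

	/* /tank/.zfs is a placeholder for any pool's control directory. */
	acl = acl_get_file("/tank/.zfs", ACL_TYPE_NFS4);
	if (acl == NULL) {
		perror("acl_get_file");
		return (1);
	}
	text = acl_to_text(acl, NULL);
	if (text != NULL) {
		printf("%s", text);
		acl_free(text);
	}
	acl_free(acl);
	return (0);
}
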
- mutex_enter(&sdp->sd_lock);
+static struct vop_vector zfsctl_ops_root = {
+ .vop_default = &default_vnodeops,
+ .vop_open = zfsctl_common_open,
+ .vop_close = zfsctl_common_close,
+ .vop_ioctl = VOP_EINVAL,
+ .vop_getattr = zfsctl_root_getattr,
+ .vop_access = zfsctl_common_access,
+ .vop_readdir = zfsctl_root_readdir,
+ .vop_lookup = zfsctl_root_lookup,
+ .vop_inactive = VOP_NULL,
+ .vop_reclaim = zfsctl_common_reclaim,
+ .vop_fid = zfsctl_common_fid,
+ .vop_print = zfsctl_common_print,
+ .vop_vptocnp = zfsctl_root_vptocnp,
+ .vop_pathconf = zfsctl_common_pathconf,
+ .vop_getacl = zfsctl_common_getacl,
+};
- search.se_name = (char *)snm;
- if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) == NULL) {
- mutex_exit(&sdp->sd_lock);
- return (SET_ERROR(ENOENT));
- }
+static int
+zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname)
+{
+ objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os;
- err = dsl_dataset_rename_snapshot(fsname, snm, tnm, 0);
- if (err == 0)
- zfsctl_rename_snap(sdp, sep, tnm);
-
- mutex_exit(&sdp->sd_lock);
-
- return (err);
+ dmu_objset_name(os, zname);
+ if (strlen(zname) + 1 + strlen(name) >= len)
+ return (SET_ERROR(ENAMETOOLONG));
+ (void) strcat(zname, "@");
+ (void) strcat(zname, name);
+ return (0);
}
-#endif /* sun */
-#ifdef sun
-/* ARGSUSED */
static int
-zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
- caller_context_t *ct, int flags)
+zfsctl_snapshot_lookup(vnode_t *vp, const char *name, uint64_t *id)
{
- zfsctl_snapdir_t *sdp = dvp->v_data;
- zfs_snapentry_t *sep;
- zfs_snapentry_t search;
- zfsvfs_t *zfsvfs;
- char snapname[MAXNAMELEN];
- char real[MAXNAMELEN];
+ objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os;
int err;
- zfsvfs = dvp->v_vfsp->vfs_data;
- ZFS_ENTER(zfsvfs);
-
- if ((flags & FIGNORECASE) || zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
-
- err = dmu_snapshot_realname(zfsvfs->z_os, name, real,
- MAXNAMELEN, NULL);
- if (err == 0) {
- name = real;
- } else if (err != ENOTSUP) {
- ZFS_EXIT(zfsvfs);
- return (err);
- }
- }
-
- ZFS_EXIT(zfsvfs);
-
- err = zfsctl_snapshot_zname(dvp, name, MAXNAMELEN, snapname);
- if (err == 0)
- err = zfs_secpolicy_destroy_perms(snapname, cr);
- if (err != 0)
- return (err);
-
- mutex_enter(&sdp->sd_lock);
-
- search.se_name = name;
- sep = avl_find(&sdp->sd_snaps, &search, NULL);
- if (sep) {
- avl_remove(&sdp->sd_snaps, sep);
- err = zfsctl_unmount_snap(sep, MS_FORCE, cr);
- if (err != 0)
- avl_add(&sdp->sd_snaps, sep);
- else
- err = dsl_destroy_snapshot(snapname, B_FALSE);
- } else {
- err = SET_ERROR(ENOENT);
- }
-
- mutex_exit(&sdp->sd_lock);
-
+ err = dsl_dataset_snap_lookup(dmu_objset_ds(os), name, id);
return (err);
}
-#endif /* sun */
/*
- * This creates a snapshot under '.zfs/snapshot'.
+ * Given a vnode, get the root vnode of the filesystem mounted on top of
+ * it, if any.  The root vnode is returned referenced and locked.
+ * If no filesystem is mounted, then the original vnode remains referenced
+ * and locked.  On any error the original vnode is unlocked and released.
*/
-/* ARGSUSED */
static int
-zfsctl_snapdir_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp,
- cred_t *cr, caller_context_t *cc, int flags, vsecattr_t *vsecp)
+zfsctl_mounted_here(vnode_t **vpp, int flags)
{
- zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
- char name[MAXNAMELEN];
+ struct mount *mp;
int err;
- static enum symfollow follow = NO_FOLLOW;
- static enum uio_seg seg = UIO_SYSSPACE;
- if (snapshot_namecheck(dirname, NULL, NULL) != 0)
- return (SET_ERROR(EILSEQ));
+ ASSERT_VOP_LOCKED(*vpp, __func__);
+ ASSERT3S((*vpp)->v_type, ==, VDIR);
- dmu_objset_name(zfsvfs->z_os, name);
-
- *vpp = NULL;
-
- err = zfs_secpolicy_snapshot_perms(name, cr);
- if (err != 0)
+ if ((mp = (*vpp)->v_mountedhere) != NULL) {
+ err = vfs_busy(mp, 0);
+ KASSERT(err == 0, ("vfs_busy(mp, 0) failed with %d", err));
+ KASSERT(vrefcnt(*vpp) > 1, ("unreferenced mountpoint"));
+ vput(*vpp);
+ err = VFS_ROOT(mp, flags, vpp);
+ vfs_unbusy(mp);
return (err);
-
- if (err == 0) {
- err = dmu_objset_snapshot_one(name, dirname);
- if (err != 0)
- return (err);
- err = lookupnameat(dirname, seg, follow, NULL, vpp, dvp);
}
-
- return (err);
+ return (EJUSTRETURN);
}
-static int
-zfsctl_freebsd_snapdir_mkdir(ap)
- struct vop_mkdir_args /* {
- struct vnode *a_dvp;
- struct vnode **a_vpp;
- struct componentname *a_cnp;
- struct vattr *a_vap;
- } */ *ap;
+typedef struct {
+ const char *snap_name;
+ uint64_t snap_id;
+} snapshot_setup_arg_t;
+
+static void
+zfsctl_snapshot_vnode_setup(vnode_t *vp, void *arg)
{
+ snapshot_setup_arg_t *ssa = arg;
+ sfs_node_t *node;
- ASSERT(ap->a_cnp->cn_flags & SAVENAME);
+ ASSERT_VOP_ELOCKED(vp, __func__);
- return (zfsctl_snapdir_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, NULL,
- ap->a_vpp, ap->a_cnp->cn_cred, NULL, 0, NULL));
+ node = sfs_alloc_node(sizeof(sfs_node_t),
+ ssa->snap_name, ZFSCTL_INO_SNAPDIR, ssa->snap_id);
+ zfsctl_common_vnode_setup(vp, node);
+
+ /* We have to support recursive locking. */
+ VN_LOCK_AREC(vp);
}
/*
@@ -926,8 +907,13 @@
* Lookup entry point for the 'snapshot' directory. Try to open the
 * snapshot if it exists, creating the pseudo filesystem vnode as necessary.
* Perform a mount of the associated dataset on top of the vnode.
+ * There are four possibilities:
+ * - the snapshot node and vnode do not exist
+ * - the snapshot vnode is covered by the mounted snapshot
+ * - the snapshot vnode is not covered yet, the mount operation is in progress
+ * - the snapshot vnode is not covered, because the snapshot has been unmounted
+ * The last two states are transient and should be relatively short-lived.
*/
-/* ARGSUSED */
int
zfsctl_snapdir_lookup(ap)
struct vop_lookup_args /* {
@@ -939,144 +925,102 @@
vnode_t *dvp = ap->a_dvp;
vnode_t **vpp = ap->a_vpp;
struct componentname *cnp = ap->a_cnp;
- char nm[NAME_MAX + 1];
- zfsctl_snapdir_t *sdp = dvp->v_data;
- objset_t *snap;
- char snapname[MAXNAMELEN];
- char real[MAXNAMELEN];
+ char name[NAME_MAX + 1];
+ char fullname[ZFS_MAX_DATASET_NAME_LEN];
char *mountpoint;
- zfs_snapentry_t *sep, search;
size_t mountpoint_len;
- avl_index_t where;
zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
+ uint64_t snap_id;
+ int nameiop = cnp->cn_nameiop;
+ int lkflags = cnp->cn_lkflags;
+ int flags = cnp->cn_flags;
int err;
- int flags = 0;
- /*
- * No extended attributes allowed under .zfs
- */
- if (flags & LOOKUP_XATTR)
- return (SET_ERROR(EINVAL));
- ASSERT(ap->a_cnp->cn_namelen < sizeof(nm));
- strlcpy(nm, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1);
-
ASSERT(dvp->v_type == VDIR);
- *vpp = NULL;
+ if ((flags & ISLASTCN) != 0 && nameiop != LOOKUP)
+ return (SET_ERROR(ENOTSUP));
- /*
- * If we get a recursive call, that means we got called
- * from the domount() code while it was trying to look up the
- * spec (which looks like a local path for zfs). We need to
- * add some flag to domount() to tell it not to do this lookup.
- */
- if (MUTEX_HELD(&sdp->sd_lock))
+ if (cnp->cn_namelen == 1 && *cnp->cn_nameptr == '.') {
+ err = zfsctl_relock_dot(dvp, lkflags & LK_TYPE_MASK);
+ if (err == 0)
+ *vpp = dvp;
+ return (err);
+ }
+ if (flags & ISDOTDOT) {
+ err = vn_vget_ino_gen(dvp, zfsctl_root_vnode, NULL, lkflags,
+ vpp);
+ return (err);
+ }
+
+ if (cnp->cn_namelen >= sizeof(name))
+ return (SET_ERROR(ENAMETOOLONG));
+
+ strlcpy(name, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1);
+ err = zfsctl_snapshot_lookup(dvp, name, &snap_id);
+ if (err != 0)
return (SET_ERROR(ENOENT));
- ZFS_ENTER(zfsvfs);
+ for (;;) {
+ snapshot_setup_arg_t ssa;
- if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0) {
- ZFS_EXIT(zfsvfs);
- return (0);
- }
+ ssa.snap_name = name;
+ ssa.snap_id = snap_id;
+ err = sfs_vgetx(dvp->v_mount, LK_SHARED, ZFSCTL_INO_SNAPDIR,
+ snap_id, "zfs", &zfsctl_ops_snapshot,
+ zfsctl_snapshot_vnode_setup, &ssa, vpp);
+ if (err != 0)
+ return (err);
- if (flags & FIGNORECASE) {
- boolean_t conflict = B_FALSE;
+ /* Check if a new vnode has just been created. */
+ if (VOP_ISLOCKED(*vpp) == LK_EXCLUSIVE)
+ break;
- err = dmu_snapshot_realname(zfsvfs->z_os, nm, real,
- MAXNAMELEN, &conflict);
- if (err == 0) {
- strlcpy(nm, real, sizeof(nm));
- } else if (err != ENOTSUP) {
- ZFS_EXIT(zfsvfs);
+ /*
+ * Check if a snapshot is already mounted on top of the vnode.
+ */
+ err = zfsctl_mounted_here(vpp, lkflags);
+ if (err != EJUSTRETURN)
return (err);
- }
-#if 0
- if (realpnp)
- (void) strlcpy(realpnp->pn_buf, nm,
- realpnp->pn_bufsize);
- if (conflict && direntflags)
- *direntflags = ED_CASE_CONFLICT;
-#endif
- }
- mutex_enter(&sdp->sd_lock);
- search.se_name = (char *)nm;
- if ((sep = avl_find(&sdp->sd_snaps, &search, &where)) != NULL) {
- *vpp = sep->se_root;
- VN_HOLD(*vpp);
- err = traverse(vpp, LK_EXCLUSIVE | LK_RETRY);
- if (err != 0) {
- VN_RELE(*vpp);
- *vpp = NULL;
- } else if (*vpp == sep->se_root) {
- /*
- * The snapshot was unmounted behind our backs,
- * try to remount it.
+ /*
+ * If the vnode is not covered, then either the mount operation
+ * is in progress or the snapshot has already been unmounted
+ * but the vnode hasn't been inactivated and reclaimed yet.
+ * We can try to re-use the vnode in the latter case.
+ */
+ VI_LOCK(*vpp);
+ if (((*vpp)->v_iflag & VI_MOUNT) == 0) {
+ /* Upgrade to exclusive lock in order to:
+ * - avoid race conditions
+ * - satisfy the contract of mount_snapshot()
*/
- VERIFY(zfsctl_snapshot_zname(dvp, nm, MAXNAMELEN, snapname) == 0);
- goto domount;
+ err = VOP_LOCK(*vpp, LK_TRYUPGRADE | LK_INTERLOCK);
+ if (err == 0)
+ break;
} else {
- /*
- * VROOT was set during the traverse call. We need
- * to clear it since we're pretending to be part
- * of our parent's vfs.
- */
- (*vpp)->v_flag &= ~VROOT;
+ VI_UNLOCK(*vpp);
}
- mutex_exit(&sdp->sd_lock);
- ZFS_EXIT(zfsvfs);
- return (err);
- }
- /*
- * The requested snapshot is not currently mounted, look it up.
- */
- err = zfsctl_snapshot_zname(dvp, nm, MAXNAMELEN, snapname);
- if (err != 0) {
- mutex_exit(&sdp->sd_lock);
- ZFS_EXIT(zfsvfs);
/*
- * handle "ls *" or "?" in a graceful manner,
- * forcing EILSEQ to ENOENT.
- * Since shell ultimately passes "*" or "?" as name to lookup
+ * In this state we can loop on uncontested locks and starve
+ * the thread doing the lengthy, non-trivial mount operation.
+ * So, yield to prevent that from happening.
*/
- return (err == EILSEQ ? ENOENT : err);
+ vput(*vpp);
+ kern_yield(PRI_USER);
}
- if (dmu_objset_hold(snapname, FTAG, &snap) != 0) {
- mutex_exit(&sdp->sd_lock);
-#ifdef illumos
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(ENOENT));
-#else /* !illumos */
- /* Translate errors and add SAVENAME when needed. */
- if ((cnp->cn_flags & ISLASTCN) && cnp->cn_nameiop == CREATE) {
- err = EJUSTRETURN;
- cnp->cn_flags |= SAVENAME;
- } else {
- err = SET_ERROR(ENOENT);
- }
- ZFS_EXIT(zfsvfs);
- return (err);
-#endif /* !illumos */
- }
- sep = kmem_alloc(sizeof (zfs_snapentry_t), KM_SLEEP);
- sep->se_name = kmem_alloc(strlen(nm) + 1, KM_SLEEP);
- (void) strcpy(sep->se_name, nm);
- *vpp = sep->se_root = zfsctl_snapshot_mknode(dvp, dmu_objset_id(snap));
- VN_HOLD(*vpp);
- avl_insert(&sdp->sd_snaps, sep, where);
+ VERIFY0(zfsctl_snapshot_zname(dvp, name, sizeof(fullname), fullname));
- dmu_objset_rele(snap, FTAG);
-domount:
mountpoint_len = strlen(dvp->v_vfsp->mnt_stat.f_mntonname) +
- strlen("/" ZFS_CTLDIR_NAME "/snapshot/") + strlen(nm) + 1;
+ strlen("/" ZFS_CTLDIR_NAME "/snapshot/") + strlen(name) + 1;
mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP);
(void) snprintf(mountpoint, mountpoint_len,
"%s/" ZFS_CTLDIR_NAME "/snapshot/%s",
- dvp->v_vfsp->mnt_stat.f_mntonname, nm);
- err = mount_snapshot(curthread, vpp, "zfs", mountpoint, snapname, 0);
+ dvp->v_vfsp->mnt_stat.f_mntonname, name);
+
+ err = mount_snapshot(curthread, vpp, "zfs", mountpoint, fullname, 0);
kmem_free(mountpoint, mountpoint_len);
if (err == 0) {
/*
@@ -1088,228 +1032,84 @@
*/
ASSERT(VTOZ(*vpp)->z_zfsvfs != zfsvfs);
VTOZ(*vpp)->z_zfsvfs->z_parent = zfsvfs;
+
+ /* Clear the root flag (set via VFS_ROOT) as well. */
+ (*vpp)->v_vflag &= ~VV_ROOT;
}
- mutex_exit(&sdp->sd_lock);
- ZFS_EXIT(zfsvfs);
-#ifdef illumos
- /*
- * If we had an error, drop our hold on the vnode and
- * zfsctl_snapshot_inactive() will clean up.
- */
- if (err != 0) {
- VN_RELE(*vpp);
- *vpp = NULL;
- }
-#else
if (err != 0)
*vpp = NULL;
-#endif
return (err);
}
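
The whole sequence is driven by an ordinary name lookup, so from userland
simply accessing a snapshot directory mounts it. A minimal sketch, where
the pool and snapshot names are examples only:

#include <sys/param.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <stdio.h>

int
main(void)
{
	/* Example names; any existing pool/snapshot pair will do. */
	const char *path = "/tank/.zfs/snapshot/mysnap";
	struct stat sb;
	struct statfs sfs;

	if (stat(path, &sb) != 0) {	/* drives zfsctl_snapdir_lookup() */
		perror("stat");
		return (1);
	}
	if (statfs(path, &sfs) != 0) {
		perror("statfs");
		return (1);
	}
	printf("mounted from %s on %s\n", sfs.f_mntfromname, sfs.f_mntonname);
	return (0);
}
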
-/* ARGSUSED */
-int
-zfsctl_shares_lookup(ap)
- struct vop_lookup_args /* {
- struct vnode *a_dvp;
- struct vnode **a_vpp;
- struct componentname *a_cnp;
- } */ *ap;
-{
- vnode_t *dvp = ap->a_dvp;
- vnode_t **vpp = ap->a_vpp;
- struct componentname *cnp = ap->a_cnp;
- zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
- char nm[NAME_MAX + 1];
- znode_t *dzp;
- int error;
-
- ZFS_ENTER(zfsvfs);
-
- ASSERT(cnp->cn_namelen < sizeof(nm));
- strlcpy(nm, cnp->cn_nameptr, cnp->cn_namelen + 1);
-
- if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0) {
- ZFS_EXIT(zfsvfs);
- return (0);
- }
-
- if (zfsvfs->z_shares_dir == 0) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(ENOTSUP));
- }
- if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0)
- error = VOP_LOOKUP(ZTOV(dzp), vpp, cnp);
-
- VN_RELE(ZTOV(dzp));
- ZFS_EXIT(zfsvfs);
-
- return (error);
-}
-
-/* ARGSUSED */
static int
-zfsctl_snapdir_readdir_cb(vnode_t *vp, void *dp, int *eofp,
- offset_t *offp, offset_t *nextp, void *data, int flags)
-{
- zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
- char snapname[MAXNAMELEN];
- uint64_t id, cookie;
- boolean_t case_conflict;
- int error;
-
- ZFS_ENTER(zfsvfs);
-
- cookie = *offp;
- dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG);
- error = dmu_snapshot_list_next(zfsvfs->z_os, MAXNAMELEN, snapname, &id,
- &cookie, &case_conflict);
- dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG);
- if (error) {
- ZFS_EXIT(zfsvfs);
- if (error == ENOENT) {
- *eofp = 1;
- return (0);
- }
- return (error);
- }
-
- if (flags & V_RDDIR_ENTFLAGS) {
- edirent_t *eodp = dp;
-
- (void) strcpy(eodp->ed_name, snapname);
- eodp->ed_ino = ZFSCTL_INO_SNAP(id);
- eodp->ed_eflags = case_conflict ? ED_CASE_CONFLICT : 0;
- } else {
- struct dirent64 *odp = dp;
-
- (void) strcpy(odp->d_name, snapname);
- odp->d_ino = ZFSCTL_INO_SNAP(id);
- }
- *nextp = cookie;
-
- ZFS_EXIT(zfsvfs);
-
- return (0);
-}
-
-/* ARGSUSED */
-static int
-zfsctl_shares_readdir(ap)
+zfsctl_snapdir_readdir(ap)
struct vop_readdir_args /* {
struct vnode *a_vp;
struct uio *a_uio;
struct ucred *a_cred;
int *a_eofflag;
- int *a_ncookies;
+ int *ncookies;
u_long **a_cookies;
} */ *ap;
{
+ char snapname[ZFS_MAX_DATASET_NAME_LEN];
+ struct dirent entry;
vnode_t *vp = ap->a_vp;
- uio_t *uiop = ap->a_uio;
- cred_t *cr = ap->a_cred;
+ zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+ uio_t *uio = ap->a_uio;
int *eofp = ap->a_eofflag;
- zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
- znode_t *dzp;
+ off_t dots_offset;
int error;
- ZFS_ENTER(zfsvfs);
+ ASSERT(vp->v_type == VDIR);
- if (zfsvfs->z_shares_dir == 0) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(ENOTSUP));
+ error = sfs_readdir_common(ZFSCTL_INO_ROOT, ZFSCTL_INO_SNAPDIR, ap, uio,
+ &dots_offset);
+ if (error != 0) {
+ if (error == ENAMETOOLONG) /* ran out of destination space */
+ error = 0;
+ return (error);
}
- if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) {
- vn_lock(ZTOV(dzp), LK_SHARED | LK_RETRY);
- error = VOP_READDIR(ZTOV(dzp), uiop, cr, eofp, ap->a_ncookies, ap->a_cookies);
- VN_URELE(ZTOV(dzp));
- } else {
- *eofp = 1;
- error = SET_ERROR(ENOENT);
- }
- ZFS_EXIT(zfsvfs);
- return (error);
-}
+ ZFS_ENTER(zfsvfs);
+ for (;;) {
+ uint64_t cookie;
+ uint64_t id;
-/*
- * pvp is the '.zfs' directory (zfsctl_node_t).
- *
- * Creates vp, which is '.zfs/snapshot' (zfsctl_snapdir_t).
- *
- * This function is the callback to create a GFS vnode for '.zfs/snapshot'
- * when a lookup is performed on .zfs for "snapshot".
- */
-vnode_t *
-zfsctl_mknode_snapdir(vnode_t *pvp)
-{
- vnode_t *vp;
- zfsctl_snapdir_t *sdp;
+ cookie = uio->uio_offset - dots_offset;
- vp = gfs_dir_create(sizeof (zfsctl_snapdir_t), pvp, pvp->v_vfsp,
- &zfsctl_ops_snapdir, NULL, NULL, MAXNAMELEN,
- zfsctl_snapdir_readdir_cb, NULL);
- sdp = vp->v_data;
- sdp->sd_node.zc_id = ZFSCTL_INO_SNAPDIR;
- sdp->sd_node.zc_cmtime = ((zfsctl_node_t *)pvp->v_data)->zc_cmtime;
- mutex_init(&sdp->sd_lock, NULL, MUTEX_DEFAULT, NULL);
- avl_create(&sdp->sd_snaps, snapentry_compare,
- sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, se_node));
- VOP_UNLOCK(vp, 0);
- return (vp);
-}
+ dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG);
+ error = dmu_snapshot_list_next(zfsvfs->z_os, sizeof(snapname),
+ snapname, &id, &cookie, NULL);
+ dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG);
+ if (error != 0) {
+ if (error == ENOENT) {
+ if (eofp != NULL)
+ *eofp = 1;
+ error = 0;
+ }
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
-vnode_t *
-zfsctl_mknode_shares(vnode_t *pvp)
-{
- vnode_t *vp;
- zfsctl_node_t *sdp;
-
- vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp, pvp->v_vfsp,
- &zfsctl_ops_shares, NULL, NULL, MAXNAMELEN,
- NULL, NULL);
- sdp = vp->v_data;
- sdp->zc_cmtime = ((zfsctl_node_t *)pvp->v_data)->zc_cmtime;
- VOP_UNLOCK(vp, 0);
- return (vp);
-
-}
-
-/* ARGSUSED */
-static int
-zfsctl_shares_getattr(ap)
- struct vop_getattr_args /* {
- struct vnode *a_vp;
- struct vattr *a_vap;
- struct ucred *a_cred;
- struct thread *a_td;
- } */ *ap;
-{
- vnode_t *vp = ap->a_vp;
- vattr_t *vap = ap->a_vap;
- cred_t *cr = ap->a_cred;
- zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
- znode_t *dzp;
- int error;
-
- ZFS_ENTER(zfsvfs);
- if (zfsvfs->z_shares_dir == 0) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(ENOTSUP));
+ entry.d_fileno = id;
+ entry.d_type = DT_DIR;
+ strcpy(entry.d_name, snapname);
+ entry.d_namlen = strlen(entry.d_name);
+ entry.d_reclen = sizeof(entry);
+ error = vfs_read_dirent(ap, &entry, uio->uio_offset);
+ if (error != 0) {
+ if (error == ENAMETOOLONG)
+ error = 0;
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(error));
+ }
+ uio->uio_offset = cookie + dots_offset;
}
- if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) {
- vn_lock(ZTOV(dzp), LK_SHARED | LK_RETRY);
- error = VOP_GETATTR(ZTOV(dzp), vap, cr);
- VN_URELE(ZTOV(dzp));
- }
- ZFS_EXIT(zfsvfs);
- return (error);
-
-
+ /* NOTREACHED */
}
-/* ARGSUSED */
static int
zfsctl_snapdir_getattr(ap)
struct vop_getattr_args /* {
@@ -1321,135 +1121,46 @@
vnode_t *vp = ap->a_vp;
vattr_t *vap = ap->a_vap;
zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
- zfsctl_snapdir_t *sdp = vp->v_data;
+ dsl_dataset_t *ds = dmu_objset_ds(zfsvfs->z_os);
+ sfs_node_t *node = vp->v_data;
+ uint64_t snap_count;
+ int err;
ZFS_ENTER(zfsvfs);
zfsctl_common_getattr(vp, vap);
- vap->va_nodeid = gfs_file_inode(vp);
- vap->va_nlink = vap->va_size = avl_numnodes(&sdp->sd_snaps) + 2;
- vap->va_ctime = vap->va_mtime = dmu_objset_snap_cmtime(zfsvfs->z_os);
+ vap->va_ctime = dmu_objset_snap_cmtime(zfsvfs->z_os);
+ vap->va_mtime = vap->va_ctime;
vap->va_birthtime = vap->va_ctime;
- ZFS_EXIT(zfsvfs);
-
- return (0);
-}
-
-/* ARGSUSED */
-static int
-zfsctl_snapdir_inactive(ap)
- struct vop_inactive_args /* {
- struct vnode *a_vp;
- struct thread *a_td;
- } */ *ap;
-{
- vnode_t *vp = ap->a_vp;
- zfsctl_snapdir_t *sdp = vp->v_data;
- zfs_snapentry_t *sep;
-
- /*
- * On forced unmount we have to free snapshots from here.
- */
- mutex_enter(&sdp->sd_lock);
- while ((sep = avl_first(&sdp->sd_snaps)) != NULL) {
- avl_remove(&sdp->sd_snaps, sep);
- kmem_free(sep->se_name, strlen(sep->se_name) + 1);
- kmem_free(sep, sizeof (zfs_snapentry_t));
+ if (dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0) {
+ err = zap_count(dmu_objset_pool(ds->ds_objset)->dp_meta_objset,
+ dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count);
+ if (err != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (err);
+ }
+ vap->va_nlink += snap_count;
}
- mutex_exit(&sdp->sd_lock);
- gfs_dir_inactive(vp);
- ASSERT(avl_numnodes(&sdp->sd_snaps) == 0);
- mutex_destroy(&sdp->sd_lock);
- avl_destroy(&sdp->sd_snaps);
- kmem_free(sdp, sizeof (zfsctl_snapdir_t));
+ vap->va_size = vap->va_nlink;
+ ZFS_EXIT(zfsvfs);
return (0);
}
-#ifdef sun
-static const fs_operation_def_t zfsctl_tops_snapdir[] = {
- { VOPNAME_OPEN, { .vop_open = zfsctl_common_open } },
- { VOPNAME_CLOSE, { .vop_close = zfsctl_common_close } },
- { VOPNAME_IOCTL, { .error = fs_inval } },
- { VOPNAME_GETATTR, { .vop_getattr = zfsctl_snapdir_getattr } },
- { VOPNAME_ACCESS, { .vop_access = zfsctl_common_access } },
- { VOPNAME_RENAME, { .vop_rename = zfsctl_snapdir_rename } },
- { VOPNAME_RMDIR, { .vop_rmdir = zfsctl_snapdir_remove } },
- { VOPNAME_MKDIR, { .vop_mkdir = zfsctl_snapdir_mkdir } },
- { VOPNAME_READDIR, { .vop_readdir = gfs_vop_readdir } },
- { VOPNAME_LOOKUP, { .vop_lookup = zfsctl_snapdir_lookup } },
- { VOPNAME_SEEK, { .vop_seek = fs_seek } },
- { VOPNAME_INACTIVE, { .vop_inactive = zfsctl_snapdir_inactive } },
- { VOPNAME_FID, { .vop_fid = zfsctl_common_fid } },
- { NULL }
-};
-
-static const fs_operation_def_t zfsctl_tops_shares[] = {
- { VOPNAME_OPEN, { .vop_open = zfsctl_common_open } },
- { VOPNAME_CLOSE, { .vop_close = zfsctl_common_close } },
- { VOPNAME_IOCTL, { .error = fs_inval } },
- { VOPNAME_GETATTR, { .vop_getattr = zfsctl_shares_getattr } },
- { VOPNAME_ACCESS, { .vop_access = zfsctl_common_access } },
- { VOPNAME_READDIR, { .vop_readdir = zfsctl_shares_readdir } },
- { VOPNAME_LOOKUP, { .vop_lookup = zfsctl_shares_lookup } },
- { VOPNAME_SEEK, { .vop_seek = fs_seek } },
- { VOPNAME_INACTIVE, { .vop_inactive = gfs_vop_inactive } },
- { VOPNAME_FID, { .vop_fid = zfsctl_shares_fid } },
- { NULL }
-};
-#else /* !sun */
static struct vop_vector zfsctl_ops_snapdir = {
.vop_default = &default_vnodeops,
.vop_open = zfsctl_common_open,
.vop_close = zfsctl_common_close,
- .vop_ioctl = VOP_EINVAL,
.vop_getattr = zfsctl_snapdir_getattr,
.vop_access = zfsctl_common_access,
- .vop_mkdir = zfsctl_freebsd_snapdir_mkdir,
- .vop_readdir = gfs_vop_readdir,
+ .vop_readdir = zfsctl_snapdir_readdir,
.vop_lookup = zfsctl_snapdir_lookup,
- .vop_inactive = zfsctl_snapdir_inactive,
.vop_reclaim = zfsctl_common_reclaim,
.vop_fid = zfsctl_common_fid,
+ .vop_print = zfsctl_common_print,
+ .vop_pathconf = zfsctl_common_pathconf,
+ .vop_getacl = zfsctl_common_getacl,
};
-static struct vop_vector zfsctl_ops_shares = {
- .vop_default = &default_vnodeops,
- .vop_open = zfsctl_common_open,
- .vop_close = zfsctl_common_close,
- .vop_ioctl = VOP_EINVAL,
- .vop_getattr = zfsctl_shares_getattr,
- .vop_access = zfsctl_common_access,
- .vop_readdir = zfsctl_shares_readdir,
- .vop_lookup = zfsctl_shares_lookup,
- .vop_inactive = gfs_vop_inactive,
- .vop_reclaim = zfsctl_common_reclaim,
- .vop_fid = zfsctl_shares_fid,
-};
-#endif /* !sun */
-
-/*
- * pvp is the GFS vnode '.zfs/snapshot'.
- *
- * This creates a GFS node under '.zfs/snapshot' representing each
- * snapshot. This newly created GFS node is what we mount snapshot
- * vfs_t's ontop of.
- */
-static vnode_t *
-zfsctl_snapshot_mknode(vnode_t *pvp, uint64_t objset)
-{
- vnode_t *vp;
- zfsctl_node_t *zcp;
-
- vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp, pvp->v_vfsp,
- &zfsctl_ops_snapshot, NULL, NULL, MAXNAMELEN, NULL, NULL);
- VN_HOLD(vp);
- zcp = vp->v_data;
- zcp->zc_id = objset;
- VOP_UNLOCK(vp, 0);
-
- return (vp);
-}
-
static int
zfsctl_snapshot_inactive(ap)
struct vop_inactive_args /* {
@@ -1458,180 +1169,76 @@
} */ *ap;
{
vnode_t *vp = ap->a_vp;
- cred_t *cr = ap->a_td->td_ucred;
- struct vop_inactive_args iap;
- zfsctl_snapdir_t *sdp;
- zfs_snapentry_t *sep, *next;
- int locked;
- vnode_t *dvp;
- if (vp->v_count > 0)
- goto end;
-
- VERIFY(gfs_dir_lookup(vp, "..", &dvp, cr, 0, NULL, NULL) == 0);
- sdp = dvp->v_data;
- VOP_UNLOCK(dvp, 0);
-
- if (!(locked = MUTEX_HELD(&sdp->sd_lock)))
- mutex_enter(&sdp->sd_lock);
-
- ASSERT(!vn_ismntpt(vp));
-
- sep = avl_first(&sdp->sd_snaps);
- while (sep != NULL) {
- next = AVL_NEXT(&sdp->sd_snaps, sep);
-
- if (sep->se_root == vp) {
- avl_remove(&sdp->sd_snaps, sep);
- kmem_free(sep->se_name, strlen(sep->se_name) + 1);
- kmem_free(sep, sizeof (zfs_snapentry_t));
- break;
- }
- sep = next;
- }
- ASSERT(sep != NULL);
-
- if (!locked)
- mutex_exit(&sdp->sd_lock);
- VN_RELE(dvp);
-
-end:
- /*
- * Dispose of the vnode for the snapshot mount point.
- * This is safe to do because once this entry has been removed
- * from the AVL tree, it can't be found again, so cannot become
- * "active". If we lookup the same name again we will end up
- * creating a new vnode.
- */
- iap.a_vp = vp;
- return (gfs_vop_inactive(&iap));
+ VERIFY(vrecycle(vp) == 1);
+ return (0);
}
static int
-zfsctl_traverse_begin(vnode_t **vpp, int lktype)
-{
-
- VN_HOLD(*vpp);
- /* Snapshot should be already mounted, but just in case. */
- if (vn_mountedvfs(*vpp) == NULL)
- return (ENOENT);
- return (traverse(vpp, lktype));
-}
-
-static void
-zfsctl_traverse_end(vnode_t *vp, int err)
-{
-
- if (err == 0)
- vput(vp);
- else
- VN_RELE(vp);
-}
-
-static int
-zfsctl_snapshot_getattr(ap)
- struct vop_getattr_args /* {
+zfsctl_snapshot_reclaim(ap)
+ struct vop_reclaim_args /* {
struct vnode *a_vp;
- struct vattr *a_vap;
- struct ucred *a_cred;
+ struct thread *a_td;
} */ *ap;
{
vnode_t *vp = ap->a_vp;
- int err;
+ void *data = vp->v_data;
- err = zfsctl_traverse_begin(&vp, LK_SHARED | LK_RETRY);
- if (err == 0)
- err = VOP_GETATTR(vp, ap->a_vap, ap->a_cred);
- zfsctl_traverse_end(vp, err);
- return (err);
+ sfs_reclaim_vnode(vp);
+ sfs_destroy_node(data);
+ return (0);
}
static int
-zfsctl_snapshot_fid(ap)
- struct vop_fid_args /* {
- struct vnode *a_vp;
- struct fid *a_fid;
- } */ *ap;
+zfsctl_snapshot_vptocnp(struct vop_vptocnp_args *ap)
{
- vnode_t *vp = ap->a_vp;
- int err;
-
- err = zfsctl_traverse_begin(&vp, LK_SHARED | LK_RETRY);
- if (err == 0)
- err = VOP_VPTOFH(vp, (void *)ap->a_fid);
- zfsctl_traverse_end(vp, err);
- return (err);
-}
-
-static int
-zfsctl_snapshot_lookup(ap)
- struct vop_lookup_args /* {
- struct vnode *a_dvp;
- struct vnode **a_vpp;
- struct componentname *a_cnp;
- } */ *ap;
-{
- vnode_t *dvp = ap->a_dvp;
- vnode_t **vpp = ap->a_vpp;
- struct componentname *cnp = ap->a_cnp;
- cred_t *cr = ap->a_cnp->cn_cred;
- zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data;
+ struct mount *mp;
+ vnode_t *dvp;
+ vnode_t *vp;
+ sfs_node_t *node;
+ size_t len;
+ int locked;
int error;
- if (cnp->cn_namelen != 2 || cnp->cn_nameptr[0] != '.' ||
- cnp->cn_nameptr[1] != '.') {
- return (ENOENT);
- }
+ vp = ap->a_vp;
+ node = vp->v_data;
+ len = strlen(node->sn_name);
+ if (*ap->a_buflen < len)
+ return (SET_ERROR(ENOMEM));
- ASSERT(dvp->v_type == VDIR);
- ASSERT(zfsvfs->z_ctldir != NULL);
+ /*
+ * Prevent unmounting of the snapshot while the vnode lock
+ * is not held. That is not strictly required, but allows
+ * us to assert that an uncovered snapshot vnode is never
+ * "leaked".
+ */
+ mp = vp->v_mountedhere;
+ if (mp == NULL)
+ return (SET_ERROR(ENOENT));
+ error = vfs_busy(mp, 0);
+ KASSERT(error == 0, ("vfs_busy(mp, 0) failed with %d", error));
- error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", vpp,
- NULL, 0, NULL, cr, NULL, NULL, NULL);
- if (error == 0)
- vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY);
- return (error);
-}
+ /*
+ * We can vput the vnode as we can now depend on the reference owned
+ * by the busied mp. But we also need to hold the vnode, because
+ * the reference may go after vfs_unbusy() which has to be called
+ * before we can lock the vnode again.
+ */
+ locked = VOP_ISLOCKED(vp);
+ vhold(vp);
+ vput(vp);
-static int
-zfsctl_snapshot_vptocnp(struct vop_vptocnp_args *ap)
-{
- zfsvfs_t *zfsvfs = ap->a_vp->v_vfsp->vfs_data;
- vnode_t *dvp, *vp;
- zfsctl_snapdir_t *sdp;
- zfs_snapentry_t *sep;
- int error;
-
- ASSERT(zfsvfs->z_ctldir != NULL);
- error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
- NULL, 0, NULL, kcred, NULL, NULL, NULL);
- if (error != 0)
- return (error);
- sdp = dvp->v_data;
-
- mutex_enter(&sdp->sd_lock);
- sep = avl_first(&sdp->sd_snaps);
- while (sep != NULL) {
- vp = sep->se_root;
- if (vp == ap->a_vp)
- break;
- sep = AVL_NEXT(&sdp->sd_snaps, sep);
- }
- if (sep == NULL) {
- mutex_exit(&sdp->sd_lock);
- error = ENOENT;
- } else {
- size_t len;
-
- len = strlen(sep->se_name);
+ /* Look up .zfs/snapshot, our parent. */
+ error = zfsctl_snapdir_vnode(vp->v_mount, NULL, LK_SHARED, &dvp);
+ if (error == 0) {
+ VOP_UNLOCK(dvp, 0);
+ *ap->a_vpp = dvp;
*ap->a_buflen -= len;
- bcopy(sep->se_name, ap->a_buf + *ap->a_buflen, len);
- mutex_exit(&sdp->sd_lock);
- vref(dvp);
- *ap->a_vpp = dvp;
+ bcopy(node->sn_name, ap->a_buf + *ap->a_buflen, len);
}
- VN_RELE(dvp);
-
+ vfs_unbusy(mp);
+ vget(vp, locked | LK_RETRY, curthread);
+ vdrop(vp);
return (error);
}
@@ -1640,71 +1247,40 @@
* be covered.
*/
static struct vop_vector zfsctl_ops_snapshot = {
- .vop_default = &default_vnodeops,
- .vop_inactive = zfsctl_snapshot_inactive,
- .vop_lookup = zfsctl_snapshot_lookup,
- .vop_reclaim = zfsctl_common_reclaim,
- .vop_getattr = zfsctl_snapshot_getattr,
- .vop_fid = zfsctl_snapshot_fid,
- .vop_vptocnp = zfsctl_snapshot_vptocnp,
+ .vop_default = NULL, /* ensure very restricted access */
+ .vop_inactive = zfsctl_snapshot_inactive,
+ .vop_reclaim = zfsctl_snapshot_reclaim,
+ .vop_vptocnp = zfsctl_snapshot_vptocnp,
+ .vop_lock1 = vop_stdlock,
+ .vop_unlock = vop_stdunlock,
+ .vop_islocked = vop_stdislocked,
+ .vop_advlockpurge = vop_stdadvlockpurge, /* called by vgone */
+ .vop_print = zfsctl_common_print,
};
int
zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp)
{
+ struct mount *mp;
zfsvfs_t *zfsvfs = vfsp->vfs_data;
- vnode_t *dvp, *vp;
- zfsctl_snapdir_t *sdp;
- zfsctl_node_t *zcp;
- zfs_snapentry_t *sep;
+ vnode_t *vp;
int error;
ASSERT(zfsvfs->z_ctldir != NULL);
- error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
- NULL, 0, NULL, kcred, NULL, NULL, NULL);
- if (error != 0)
- return (error);
- sdp = dvp->v_data;
-
- mutex_enter(&sdp->sd_lock);
- sep = avl_first(&sdp->sd_snaps);
- while (sep != NULL) {
- vp = sep->se_root;
- zcp = vp->v_data;
- if (zcp->zc_id == objsetid)
- break;
-
- sep = AVL_NEXT(&sdp->sd_snaps, sep);
- }
-
- if (sep != NULL) {
- VN_HOLD(vp);
+ *zfsvfsp = NULL;
+ error = sfs_vnode_get(vfsp, LK_EXCLUSIVE,
+ ZFSCTL_INO_SNAPDIR, objsetid, &vp);
+ if (error == 0 && vp != NULL) {
/*
- * Return the mounted root rather than the covered mount point.
- * Takes the GFS vnode at .zfs/snapshot/<snapshot objsetid>
- * and returns the ZFS vnode mounted on top of the GFS node.
- * This ZFS vnode is the root of the vfs for objset 'objsetid'.
+ * XXX Probably need to at least reference, if not busy, the mp.
*/
- error = traverse(&vp, LK_SHARED | LK_RETRY);
- if (error == 0) {
- if (vp == sep->se_root)
- error = SET_ERROR(EINVAL);
- else
- *zfsvfsp = VTOZ(vp)->z_zfsvfs;
- }
- mutex_exit(&sdp->sd_lock);
- if (error == 0)
- VN_URELE(vp);
- else
- VN_RELE(vp);
- } else {
- error = SET_ERROR(EINVAL);
- mutex_exit(&sdp->sd_lock);
+ if (vp->v_mountedhere != NULL)
+ *zfsvfsp = vp->v_mountedhere->mnt_data;
+ vput(vp);
}
-
- VN_RELE(dvp);
-
- return (error);
+ if (*zfsvfsp == NULL)
+ return (SET_ERROR(EINVAL));
+ return (0);
}
/*
@@ -1715,52 +1291,70 @@
int
zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr)
{
+ char snapname[ZFS_MAX_DATASET_NAME_LEN];
zfsvfs_t *zfsvfs = vfsp->vfs_data;
+ struct mount *mp;
vnode_t *dvp;
- zfsctl_snapdir_t *sdp;
- zfs_snapentry_t *sep, *next;
+ vnode_t *vp;
+ sfs_node_t *node;
+ sfs_node_t *snap;
+ uint64_t cookie;
int error;
ASSERT(zfsvfs->z_ctldir != NULL);
- error = zfsctl_root_lookup(zfsvfs->z_ctldir, "snapshot", &dvp,
- NULL, 0, NULL, cr, NULL, NULL, NULL);
- if (error != 0)
- return (error);
- sdp = dvp->v_data;
- mutex_enter(&sdp->sd_lock);
+ cookie = 0;
+ for (;;) {
+ uint64_t id;
- sep = avl_first(&sdp->sd_snaps);
- while (sep != NULL) {
- next = AVL_NEXT(&sdp->sd_snaps, sep);
+ dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG);
+ error = dmu_snapshot_list_next(zfsvfs->z_os, sizeof(snapname),
+ snapname, &id, &cookie, NULL);
+ dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG);
+ if (error != 0) {
+ if (error == ENOENT)
+ error = 0;
+ break;
+ }
- /*
- * If this snapshot is not mounted, then it must
- * have just been unmounted by somebody else, and
- * will be cleaned up by zfsctl_snapdir_inactive().
- */
- if (vn_ismntpt(sep->se_root)) {
- error = zfsctl_unmount_snap(sep, fflags, cr);
- if (error) {
- avl_index_t where;
+ for (;;) {
+ error = sfs_vnode_get(vfsp, LK_EXCLUSIVE,
+ ZFSCTL_INO_SNAPDIR, id, &vp);
+ if (error != 0 || vp == NULL)
+ break;
- /*
- * Before reinserting snapshot to the tree,
- * check if it was actually removed. For example
- * when snapshot mount point is busy, we will
- * have an error here, but there will be no need
- * to reinsert snapshot.
- */
- if (avl_find(&sdp->sd_snaps, sep, &where) == NULL)
- avl_insert(&sdp->sd_snaps, sep, where);
+ mp = vp->v_mountedhere;
+
+ /*
+ * v_mountedhere being NULL means that the
+ * (uncovered) vnode is in a transient state
+ * (mounting or unmounting), so loop until it
+ * settles down.
+ */
+ if (mp != NULL)
break;
- }
+ vput(vp);
}
- sep = next;
+ if (error != 0)
+ break;
+ if (vp == NULL)
+ continue; /* no mountpoint, nothing to do */
+
+ /*
+ * The mount-point vnode is kept locked to avoid spurious EBUSY
+ * from a concurrent umount.
+ * The vnode lock must have recursive locking enabled.
+ */
+ vfs_ref(mp);
+ error = dounmount(mp, fflags, curthread);
+ KASSERT_IMPLY(error == 0, vrefcnt(vp) == 1,
+ ("extra references after unmount"));
+ vput(vp);
+ if (error != 0)
+ break;
}
-
- mutex_exit(&sdp->sd_lock);
- VN_RELE(dvp);
-
+ KASSERT_IMPLY((fflags & MS_FORCE) != 0, error == 0,
+ ("force unmounting failed"));
return (error);
}
+
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_debug.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_debug.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_debug.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,7 +21,7 @@
*/
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -29,7 +29,7 @@
list_t zfs_dbgmsgs;
int zfs_dbgmsg_size;
kmutex_t zfs_dbgmsgs_lock;
-int zfs_dbgmsg_maxsize = 1<<20; /* 1MB */
+int zfs_dbgmsg_maxsize = 4<<20; /* 4MB */
void
zfs_dbgmsg_init(void)
@@ -58,7 +58,10 @@
* echo ::zfs_dbgmsg | mdb -k
*
* Monitor these messages by running:
- * dtrace -q -n 'zfs-dbgmsg{printf("%s\n", stringof(arg0))}'
+ * dtrace -qn 'zfs-dbgmsg{printf("%s\n", stringof(arg0))}'
+ *
+ * When used with libzpool, monitor with:
+ * dtrace -qn 'zfs$pid::zfs_dbgmsg:probe1{printf("%s\n", copyinstr(arg1))}'
*/
void
zfs_dbgmsg(const char *fmt, ...)
@@ -95,3 +98,16 @@
}
mutex_exit(&zfs_dbgmsgs_lock);
}
+
+void
+zfs_dbgmsg_print(const char *tag)
+{
+ zfs_dbgmsg_t *zdm;
+
+ (void) printf("ZFS_DBGMSG(%s):\n", tag);
+ mutex_enter(&zfs_dbgmsgs_lock);
+ for (zdm = list_head(&zfs_dbgmsgs); zdm;
+ zdm = list_next(&zfs_dbgmsgs, zdm))
+ (void) printf("%s\n", zdm->zdm_msg);
+ mutex_exit(&zfs_dbgmsgs_lock);
+}
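
A minimal sketch of how a libzpool/ztest-style consumer might use the new
helper; the function name and tag string here are hypothetical:

#include <sys/zfs_context.h>

static void
dump_debug_log(void)
{
	/* Record a message and then print the whole in-memory log. */
	zfs_dbgmsg("example message: txg %llu", (u_longlong_t)1234);
	zfs_dbgmsg_print("example");
}
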
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_dir.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,7 +21,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
*/
#include <sys/types.h>
@@ -59,51 +59,37 @@
#include <sys/extdirent.h>
/*
- * zfs_match_find() is used by zfs_dirent_lock() to peform zap lookups
+ * zfs_match_find() is used by zfs_dirent_lookup() to perform zap lookups
* of names after deciding which is the appropriate lookup interface.
*/
static int
-zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, char *name, boolean_t exact,
- boolean_t update, int *deflags, pathname_t *rpnp, uint64_t *zoid)
+zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, const char *name,
+ boolean_t exact, uint64_t *zoid)
{
int error;
if (zfsvfs->z_norm) {
- matchtype_t mt = MT_FIRST;
- boolean_t conflict = B_FALSE;
- size_t bufsz = 0;
- char *buf = NULL;
+ matchtype_t mt = exact ? MT_EXACT : MT_FIRST;
- if (rpnp) {
- buf = rpnp->pn_buf;
- bufsz = rpnp->pn_bufsize;
- }
- if (exact)
- mt = MT_EXACT;
/*
* In the non-mixed case we only expect there would ever
* be one match, but we need to use the normalizing lookup.
*/
error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1,
- zoid, mt, buf, bufsz, &conflict);
- if (!error && deflags)
- *deflags = conflict ? ED_CASE_CONFLICT : 0;
+ zoid, mt, NULL, 0, NULL);
} else {
error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid);
}
*zoid = ZFS_DIRENT_OBJ(*zoid);
- if (error == ENOENT && update)
- dnlc_update(ZTOV(dzp), name, DNLC_NO_VNODE);
-
return (error);
}
/*
- * Lock a directory entry. A dirlock on <dzp, name> protects that name
- * in dzp's directory zap object. As long as you hold a dirlock, you can
- * assume two things: (1) dzp cannot be reaped, and (2) no other thread
- * can change the zap entry for (i.e. link or unlink) this name.
+ * Look up a directory entry under a locked vnode.
+ * A locked dvp guarantees that there are no concurrent modifications
+ * of the directory and, thus, if a node can be found in
+ * the directory, then it must not be unlinked.
*
* Input arguments:
* dzp - znode for directory
@@ -110,45 +96,27 @@
* name - name of entry to lock
* flag - ZNEW: if the entry already exists, fail with EEXIST.
* ZEXISTS: if the entry does not exist, fail with ENOENT.
- * ZSHARED: allow concurrent access with other ZSHARED callers.
* ZXATTR: we want dzp's xattr directory
- * ZCILOOK: On a mixed sensitivity file system,
- * this lookup should be case-insensitive.
- * ZCIEXACT: On a purely case-insensitive file system,
- * this lookup should be case-sensitive.
- * ZRENAMING: we are locking for renaming, force narrow locks
- * ZHAVELOCK: Don't grab the z_name_lock for this call. The
- * current thread already holds it.
*
* Output arguments:
* zpp - pointer to the znode for the entry (NULL if there isn't one)
- * dlpp - pointer to the dirlock for this entry (NULL on error)
- * direntflags - (case-insensitive lookup only)
- * flags if multiple case-sensitive matches exist in directory
- * realpnp - (case-insensitive lookup only)
- * actual name matched within the directory
*
* Return value: 0 on success or errno on failure.
*
* NOTE: Always checks for, and rejects, '.' and '..'.
- * NOTE: For case-insensitive file systems we take wide locks (see below),
- * but return znode pointers to a single match.
*/
int
-zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp,
- int flag, int *direntflags, pathname_t *realpnp)
+zfs_dirent_lookup(znode_t *dzp, const char *name, znode_t **zpp, int flag)
{
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
- zfs_dirlock_t *dl;
- boolean_t update;
boolean_t exact;
uint64_t zoid;
vnode_t *vp = NULL;
int error = 0;
- int cmpflags;
+ ASSERT_VOP_LOCKED(ZTOV(dzp), __func__);
+
*zpp = NULL;
- *dlpp = NULL;
/*
* Verify that we are not trying to lock '.', '..', or '.zfs'
@@ -162,125 +130,20 @@
* Case sensitivity and normalization preferences are set when
* the file system is created. These are stored in the
* zfsvfs->z_case and zfsvfs->z_norm fields. These choices
- * affect what vnodes can be cached in the DNLC, how we
- * perform zap lookups, and the "width" of our dirlocks.
+ * affect how we perform zap lookups.
*
- * A normal dirlock locks a single name. Note that with
- * normalization a name can be composed multiple ways, but
- * when normalized, these names all compare equal. A wide
- * dirlock locks multiple names. We need these when the file
- * system is supporting mixed-mode access. It is sometimes
- * necessary to lock all case permutations of file name at
- * once so that simultaneous case-insensitive/case-sensitive
- * behaves as rationally as possible.
- */
-
- /*
* Decide if exact matches should be requested when performing
* a zap lookup on file systems supporting case-insensitive
* access.
- */
- exact =
- ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE) && (flag & ZCIEXACT)) ||
- ((zfsvfs->z_case == ZFS_CASE_MIXED) && !(flag & ZCILOOK));
-
- /*
- * Only look in or update the DNLC if we are looking for the
- * name on a file system that does not require normalization
- * or case folding. We can also look there if we happen to be
- * on a non-normalizing, mixed sensitivity file system IF we
- * are looking for the exact name.
*
- * Maybe can add TO-UPPERed version of name to dnlc in ci-only
- * case for performance improvement?
+ * NB: we do not need to worry about this flag for ZFS_CASE_SENSITIVE
+ * because in that case MT_EXACT and MT_FIRST should produce exactly
+ * the same result.
*/
- update = !zfsvfs->z_norm ||
- ((zfsvfs->z_case == ZFS_CASE_MIXED) &&
- !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER) && !(flag & ZCILOOK));
+ exact = zfsvfs->z_case == ZFS_CASE_MIXED;
- /*
- * ZRENAMING indicates we are in a situation where we should
- * take narrow locks regardless of the file system's
- * preferences for normalizing and case folding. This will
- * prevent us deadlocking trying to grab the same wide lock
- * twice if the two names happen to be case-insensitive
- * matches.
- */
- if (flag & ZRENAMING)
- cmpflags = 0;
- else
- cmpflags = zfsvfs->z_norm;
-
- /*
- * Wait until there are no locks on this name.
- *
- * Don't grab the the lock if it is already held. However, cannot
- * have both ZSHARED and ZHAVELOCK together.
- */
- ASSERT(!(flag & ZSHARED) || !(flag & ZHAVELOCK));
- if (!(flag & ZHAVELOCK))
- rw_enter(&dzp->z_name_lock, RW_READER);
-
- mutex_enter(&dzp->z_lock);
- for (;;) {
- if (dzp->z_unlinked) {
- mutex_exit(&dzp->z_lock);
- if (!(flag & ZHAVELOCK))
- rw_exit(&dzp->z_name_lock);
- return (SET_ERROR(ENOENT));
- }
- for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next) {
- if ((u8_strcmp(name, dl->dl_name, 0, cmpflags,
- U8_UNICODE_LATEST, &error) == 0) || error != 0)
- break;
- }
- if (error != 0) {
- mutex_exit(&dzp->z_lock);
- if (!(flag & ZHAVELOCK))
- rw_exit(&dzp->z_name_lock);
- return (SET_ERROR(ENOENT));
- }
- if (dl == NULL) {
- size_t namesize;
-
- /*
- * Allocate a new dirlock and add it to the list.
- */
- namesize = strlen(name) + 1;
- dl = kmem_alloc(sizeof (zfs_dirlock_t) + namesize,
- KM_SLEEP);
- cv_init(&dl->dl_cv, NULL, CV_DEFAULT, NULL);
- dl->dl_name = (char *)(dl + 1);
- bcopy(name, dl->dl_name, namesize);
- dl->dl_sharecnt = 0;
- dl->dl_namelock = 0;
- dl->dl_namesize = namesize;
- dl->dl_dzp = dzp;
- dl->dl_next = dzp->z_dirlocks;
- dzp->z_dirlocks = dl;
- break;
- }
- if ((flag & ZSHARED) && dl->dl_sharecnt != 0)
- break;
- cv_wait(&dl->dl_cv, &dzp->z_lock);
- }
-
- /*
- * If the z_name_lock was NOT held for this dirlock record it.
- */
- if (flag & ZHAVELOCK)
- dl->dl_namelock = 1;
-
- if (flag & ZSHARED)
- dl->dl_sharecnt++;
-
- mutex_exit(&dzp->z_lock);
-
- /*
- * We have a dirlock on the name. (Note that it is the dirlock,
- * not the dzp's z_lock, that protects the name in the zap object.)
- * See if there's an object by this name; if so, put a hold on it.
- */
+ if (dzp->z_unlinked && !(flag & ZXATTR))
+ return (ENOENT);
if (flag & ZXATTR) {
error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &zoid,
sizeof (zoid));
@@ -287,155 +150,73 @@
if (error == 0)
error = (zoid == 0 ? ENOENT : 0);
} else {
- if (update)
- vp = dnlc_lookup(ZTOV(dzp), name);
- if (vp == DNLC_NO_VNODE) {
- VN_RELE(vp);
- error = SET_ERROR(ENOENT);
- } else if (vp) {
- if (flag & ZNEW) {
- zfs_dirent_unlock(dl);
- VN_RELE(vp);
- return (SET_ERROR(EEXIST));
- }
- *dlpp = dl;
- *zpp = VTOZ(vp);
- return (0);
- } else {
- error = zfs_match_find(zfsvfs, dzp, name, exact,
- update, direntflags, realpnp, &zoid);
- }
+ error = zfs_match_find(zfsvfs, dzp, name, exact, &zoid);
}
if (error) {
if (error != ENOENT || (flag & ZEXISTS)) {
- zfs_dirent_unlock(dl);
return (error);
}
} else {
if (flag & ZNEW) {
- zfs_dirent_unlock(dl);
return (SET_ERROR(EEXIST));
}
error = zfs_zget(zfsvfs, zoid, zpp);
- if (error) {
- zfs_dirent_unlock(dl);
+ if (error)
return (error);
- }
- if (!(flag & ZXATTR) && update)
- dnlc_update(ZTOV(dzp), name, ZTOV(*zpp));
+ ASSERT(!(*zpp)->z_unlinked);
}
- *dlpp = dl;
-
return (0);
}
-/*
- * Unlock this directory entry and wake anyone who was waiting for it.
- */
-void
-zfs_dirent_unlock(zfs_dirlock_t *dl)
+static int
+zfs_dd_lookup(znode_t *dzp, znode_t **zpp)
{
- znode_t *dzp = dl->dl_dzp;
- zfs_dirlock_t **prev_dl, *cur_dl;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
+ znode_t *zp;
+ uint64_t parent;
+ int error;
- mutex_enter(&dzp->z_lock);
+ ASSERT_VOP_LOCKED(ZTOV(dzp), __func__);
+ ASSERT(RRM_READ_HELD(&zfsvfs->z_teardown_lock));
- if (!dl->dl_namelock)
- rw_exit(&dzp->z_name_lock);
+ if (dzp->z_unlinked)
+ return (ENOENT);
- if (dl->dl_sharecnt > 1) {
- dl->dl_sharecnt--;
- mutex_exit(&dzp->z_lock);
- return;
- }
- prev_dl = &dzp->z_dirlocks;
- while ((cur_dl = *prev_dl) != dl)
- prev_dl = &cur_dl->dl_next;
- *prev_dl = dl->dl_next;
- cv_broadcast(&dl->dl_cv);
- mutex_exit(&dzp->z_lock);
+ if ((error = sa_lookup(dzp->z_sa_hdl,
+ SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
+ return (error);
- cv_destroy(&dl->dl_cv);
- kmem_free(dl, sizeof (*dl) + dl->dl_namesize);
+ error = zfs_zget(zfsvfs, parent, &zp);
+ if (error == 0)
+ *zpp = zp;
+ return (error);
}
-/*
- * Look up an entry in a directory.
- *
- * NOTE: '.' and '..' are handled as special cases because
- * no directory entries are actually stored for them. If this is
- * the root of a filesystem, then '.zfs' is also treated as a
- * special pseudo-directory.
- */
int
-zfs_dirlook(znode_t *dzp, char *name, vnode_t **vpp, int flags,
- int *deflg, pathname_t *rpnp)
+zfs_dirlook(znode_t *dzp, const char *name, znode_t **zpp)
{
- zfs_dirlock_t *dl;
+ zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
znode_t *zp;
int error = 0;
- uint64_t parent;
- int unlinked;
+ ASSERT_VOP_LOCKED(ZTOV(dzp), __func__);
+ ASSERT(RRM_READ_HELD(&zfsvfs->z_teardown_lock));
+
+ if (dzp->z_unlinked)
+ return (SET_ERROR(ENOENT));
+
if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
- mutex_enter(&dzp->z_lock);
- unlinked = dzp->z_unlinked;
- mutex_exit(&dzp->z_lock);
- if (unlinked)
- return (ENOENT);
-
- *vpp = ZTOV(dzp);
- VN_HOLD(*vpp);
+ *zpp = dzp;
} else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
- zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
-
- /*
- * If we are a snapshot mounted under .zfs, return
- * the vp for the snapshot directory.
- */
- if ((error = sa_lookup(dzp->z_sa_hdl,
- SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
- return (error);
- if (parent == dzp->z_id && zfsvfs->z_parent != zfsvfs) {
- error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir,
- "snapshot", vpp, NULL, 0, NULL, kcred,
- NULL, NULL, NULL);
- return (error);
- }
-
- mutex_enter(&dzp->z_lock);
- unlinked = dzp->z_unlinked;
- mutex_exit(&dzp->z_lock);
- if (unlinked)
- return (ENOENT);
-
- rw_enter(&dzp->z_parent_lock, RW_READER);
- error = zfs_zget(zfsvfs, parent, &zp);
- if (error == 0)
- *vpp = ZTOV(zp);
- rw_exit(&dzp->z_parent_lock);
- } else if (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) {
- *vpp = zfsctl_root(dzp);
+ error = zfs_dd_lookup(dzp, zpp);
} else {
- int zf;
-
- zf = ZEXISTS | ZSHARED;
- if (flags & FIGNORECASE)
- zf |= ZCILOOK;
-
- error = zfs_dirent_lock(&dl, dzp, name, &zp, zf, deflg, rpnp);
+ error = zfs_dirent_lookup(dzp, name, &zp, ZEXISTS);
if (error == 0) {
- *vpp = ZTOV(zp);
- zfs_dirent_unlock(dl);
dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */
+ *zpp = zp;
}
- rpnp = NULL;
}
-
- if ((flags & FIGNORECASE) && rpnp && !error)
- (void) strlcpy(rpnp->pn_buf, name, rpnp->pn_bufsize);
-
return (error);
}
@@ -511,8 +292,9 @@
if (error != 0)
continue;
+ vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY);
zp->z_unlinked = B_TRUE;
- VN_RELE(ZTOV(zp));
+ vput(ZTOV(zp));
}
zap_cursor_fini(&zc);
}
@@ -536,7 +318,6 @@
znode_t *xzp;
dmu_tx_t *tx;
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
- zfs_dirlock_t dl;
int skipped = 0;
int error;
@@ -550,6 +331,7 @@
continue;
}
+ vn_lock(ZTOV(xzp), LK_EXCLUSIVE | LK_RETRY);
ASSERT((ZTOV(xzp)->v_type == VREG) ||
(ZTOV(xzp)->v_type == VLNK));
@@ -560,23 +342,21 @@
dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
/* Is this really needed ? */
zfs_sa_upgrade_txholds(tx, xzp);
+ dmu_tx_mark_netfree(tx);
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
dmu_tx_abort(tx);
- VN_RELE(ZTOV(xzp));
+ vput(ZTOV(xzp));
skipped += 1;
continue;
}
- bzero(&dl, sizeof (dl));
- dl.dl_dzp = dzp;
- dl.dl_name = zap.za_name;
- error = zfs_link_destroy(&dl, xzp, tx, 0, NULL);
+ error = zfs_link_destroy(dzp, zap.za_name, xzp, tx, 0, NULL);
if (error)
skipped += 1;
dmu_tx_commit(tx);
- VN_RELE(ZTOV(xzp));
+ vput(ZTOV(xzp));
}
zap_cursor_fini(&zc);
if (error != ENOENT)
@@ -596,6 +376,7 @@
int error;
ASSERT(zp->z_links == 0);
+ ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
/*
* If this is an attribute directory, purge its contents.
@@ -634,7 +415,8 @@
&xattr_obj, sizeof (xattr_obj));
if (error == 0 && xattr_obj) {
error = zfs_zget(zfsvfs, xattr_obj, &xzp);
- ASSERT(error == 0);
+ ASSERT3S(error, ==, 0);
+ vn_lock(ZTOV(xzp), LK_EXCLUSIVE | LK_RETRY);
}
acl_obj = zfs_external_acl(zp);
@@ -668,12 +450,10 @@
if (xzp) {
ASSERT(error == 0);
- mutex_enter(&xzp->z_lock);
xzp->z_unlinked = B_TRUE; /* mark xzp for deletion */
xzp->z_links = 0; /* no more links to it */
VERIFY(0 == sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
&xzp->z_links, sizeof (xzp->z_links), tx));
- mutex_exit(&xzp->z_lock);
zfs_unlinked_add(xzp, tx);
}
@@ -686,7 +466,7 @@
dmu_tx_commit(tx);
out:
if (xzp)
- VN_RELE(ZTOV(xzp));
+ vput(ZTOV(xzp));
}
static uint64_t
@@ -700,12 +480,12 @@
}
/*
- * Link zp into dl. Can only fail if zp has been unlinked.
+ * Link zp into dzp. Can only fail if zp has been unlinked.
*/
int
-zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
+zfs_link_create(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx,
+ int flag)
{
- znode_t *dzp = dl->dl_dzp;
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
vnode_t *vp = ZTOV(zp);
uint64_t value;
@@ -715,18 +495,32 @@
int count = 0;
int error;
- mutex_enter(&zp->z_lock);
-
+ ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__);
+ ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
+#if 0
+ if (zp_is_dir) {
+ error = 0;
+ if (dzp->z_links >= LINK_MAX)
+ error = SET_ERROR(EMLINK);
+ return (error);
+ }
+#endif
if (!(flag & ZRENAMING)) {
if (zp->z_unlinked) { /* no new links to unlinked zp */
ASSERT(!(flag & (ZNEW | ZEXISTS)));
- mutex_exit(&zp->z_lock);
return (SET_ERROR(ENOENT));
}
+#if 0
+ if (zp->z_links >= LINK_MAX) {
+ return (SET_ERROR(EMLINK));
+ }
+#endif
zp->z_links++;
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
&zp->z_links, sizeof (zp->z_links));
+ } else {
+ ASSERT(zp->z_unlinked == 0);
}
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
&dzp->z_id, sizeof (dzp->z_id));
@@ -740,11 +534,8 @@
ctime, B_TRUE);
}
error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
- ASSERT(error == 0);
+ ASSERT0(error);
- mutex_exit(&zp->z_lock);
-
- mutex_enter(&dzp->z_lock);
dzp->z_size++;
dzp->z_links += zp_is_dir;
count = 0;
@@ -760,38 +551,32 @@
&dzp->z_pflags, sizeof (dzp->z_pflags));
zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime, B_TRUE);
error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
- ASSERT(error == 0);
- mutex_exit(&dzp->z_lock);
+ ASSERT0(error);
value = zfs_dirent(zp, zp->z_mode);
- error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, dl->dl_name,
+ error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, name,
8, 1, &value, tx);
- ASSERT(error == 0);
+ VERIFY0(error);
- dnlc_update(ZTOV(dzp), dl->dl_name, vp);
-
return (0);
}
static int
-zfs_dropname(zfs_dirlock_t *dl, znode_t *zp, znode_t *dzp, dmu_tx_t *tx,
+zfs_dropname(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx,
int flag)
{
int error;
if (zp->z_zfsvfs->z_norm) {
- if (((zp->z_zfsvfs->z_case == ZFS_CASE_INSENSITIVE) &&
- (flag & ZCIEXACT)) ||
- ((zp->z_zfsvfs->z_case == ZFS_CASE_MIXED) &&
- !(flag & ZCILOOK)))
+ if (zp->z_zfsvfs->z_case == ZFS_CASE_MIXED)
error = zap_remove_norm(zp->z_zfsvfs->z_os,
- dzp->z_id, dl->dl_name, MT_EXACT, tx);
+ dzp->z_id, name, MT_EXACT, tx);
else
error = zap_remove_norm(zp->z_zfsvfs->z_os,
- dzp->z_id, dl->dl_name, MT_FIRST, tx);
+ dzp->z_id, name, MT_FIRST, tx);
} else {
error = zap_remove(zp->z_zfsvfs->z_os,
- dzp->z_id, dl->dl_name, tx);
+ dzp->z_id, name, tx);
}
return (error);
@@ -798,7 +583,7 @@
}
/*
- * Unlink zp from dl, and mark zp for deletion if this was the last link.
+ * Unlink zp from dzp, and mark zp for deletion if this was the last link.
* Can fail if zp is a mount point (EBUSY) or a non-empty directory (EEXIST).
* If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list.
* If it's non-NULL, we use it to indicate whether the znode needs deletion,
@@ -805,10 +590,9 @@
* and it's the caller's job to do it.
*/
int
-zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
- boolean_t *unlinkedp)
+zfs_link_destroy(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx,
+ int flag, boolean_t *unlinkedp)
{
- znode_t *dzp = dl->dl_dzp;
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
vnode_t *vp = ZTOV(zp);
int zp_is_dir = (vp->v_type == VDIR);
@@ -818,22 +602,12 @@
int count = 0;
int error;
- dnlc_remove(ZTOV(dzp), dl->dl_name);
+ ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__);
+ ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
if (!(flag & ZRENAMING)) {
- if (vn_vfswlock(vp)) /* prevent new mounts on zp */
- return (SET_ERROR(EBUSY));
- if (vn_ismntpt(vp)) { /* don't remove mount point */
- vn_vfsunlock(vp);
- return (SET_ERROR(EBUSY));
- }
-
- mutex_enter(&zp->z_lock);
-
if (zp_is_dir && !zfs_dirempty(zp)) {
- mutex_exit(&zp->z_lock);
- vn_vfsunlock(vp);
#ifdef illumos
return (SET_ERROR(EEXIST));
#else
@@ -846,10 +620,8 @@
* First try removing the name from the directory; if that
* fails, return the error.
*/
- error = zfs_dropname(dl, zp, dzp, tx, flag);
+ error = zfs_dropname(dzp, name, zp, tx, flag);
if (error != 0) {
- mutex_exit(&zp->z_lock);
- vn_vfsunlock(vp);
return (error);
}
@@ -876,16 +648,14 @@
NULL, &zp->z_links, sizeof (zp->z_links));
error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
count = 0;
- ASSERT(error == 0);
- mutex_exit(&zp->z_lock);
- vn_vfsunlock(vp);
+ ASSERT0(error);
} else {
- error = zfs_dropname(dl, zp, dzp, tx, flag);
+ ASSERT(zp->z_unlinked == 0);
+ error = zfs_dropname(dzp, name, zp, tx, flag);
if (error != 0)
return (error);
}
- mutex_enter(&dzp->z_lock);
dzp->z_size--; /* one dirent removed */
dzp->z_links -= zp_is_dir; /* ".." link from zp */
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
@@ -900,8 +670,7 @@
NULL, &dzp->z_pflags, sizeof (dzp->z_pflags));
zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime, B_TRUE);
error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
- ASSERT(error == 0);
- mutex_exit(&dzp->z_lock);
+ ASSERT0(error);
if (unlinkedp != NULL)
*unlinkedp = unlinked;
@@ -912,14 +681,12 @@
}
/*
- * Indicate whether the directory is empty. Works with or without z_lock
- * held, but can only be consider a hint in the latter case. Returns true
- * if only "." and ".." remain and there's no work in progress.
+ * Indicate whether the directory is empty.
*/
boolean_t
zfs_dirempty(znode_t *dzp)
{
- return (dzp->z_size == 2 && dzp->z_dirlocks == 0);
+ return (dzp->z_size == 2);
}
int
@@ -939,7 +706,7 @@
* In FreeBSD, access checking for creating an EA is being done
* in zfs_setextattr(),
*/
-#ifndef __FreeBSD__
+#ifndef __FreeBSD_kernel__
if (error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, 0, B_FALSE, cr))
return (error);
#endif
@@ -952,7 +719,8 @@
return (SET_ERROR(EDQUOT));
}
-top:
+ getnewvnode_reserve(1);
+
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
ZFS_SA_BASE_ATTR_SIZE);
@@ -961,13 +729,8 @@
fuid_dirtied = zfsvfs->z_fuid_dirty;
if (fuid_dirtied)
zfs_fuid_txhold(zfsvfs, tx);
- error = dmu_tx_assign(tx, TXG_NOWAIT);
+ error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
- if (error == ERESTART) {
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- goto top;
- }
zfs_acl_ids_free(&acl_ids);
dmu_tx_abort(tx);
return (error);
@@ -992,6 +755,8 @@
zfs_acl_ids_free(&acl_ids);
dmu_tx_commit(tx);
+ getnewvnode_drop_reserve();
+
*xvpp = ZTOV(xzp);
return (0);
@@ -1015,23 +780,20 @@
{
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
znode_t *xzp;
- zfs_dirlock_t *dl;
vattr_t va;
int error;
top:
- error = zfs_dirent_lock(&dl, zp, "", &xzp, ZXATTR, NULL, NULL);
+ error = zfs_dirent_lookup(zp, "", &xzp, ZXATTR);
if (error)
return (error);
if (xzp != NULL) {
*xvpp = ZTOV(xzp);
- zfs_dirent_unlock(dl);
return (0);
}
if (!(flags & CREATE_XATTR_DIR)) {
- zfs_dirent_unlock(dl);
#ifdef illumos
return (SET_ERROR(ENOENT));
#else
@@ -1040,7 +802,6 @@
}
if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
- zfs_dirent_unlock(dl);
return (SET_ERROR(EROFS));
}
@@ -1060,7 +821,6 @@
zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid);
error = zfs_make_xattrdir(zp, &va, xvpp, cr);
- zfs_dirent_unlock(dl);
if (error == ERESTART) {
/* NB: we already did dmu_tx_wait() if necessary */
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fm.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -361,8 +361,8 @@
typedef struct zfs_ecksum_info {
/* histograms of set and cleared bits by bit number in a 64-bit word */
- uint16_t zei_histogram_set[sizeof (uint64_t) * NBBY];
- uint16_t zei_histogram_cleared[sizeof (uint64_t) * NBBY];
+ uint32_t zei_histogram_set[sizeof (uint64_t) * NBBY];
+ uint32_t zei_histogram_cleared[sizeof (uint64_t) * NBBY];
/* inline arrays of bits set and cleared. */
uint64_t zei_bits_set[ZFM_MAX_INLINE];
@@ -387,7 +387,7 @@
} zfs_ecksum_info_t;
static void
-update_histogram(uint64_t value_arg, uint16_t *hist, uint32_t *count)
+update_histogram(uint64_t value_arg, uint32_t *hist, uint32_t *count)
{
size_t i;
size_t bits = 0;
@@ -553,7 +553,7 @@
if (badbuf == NULL || goodbuf == NULL)
return (eip);
- ASSERT3U(nui64s, <=, UINT16_MAX);
+ ASSERT3U(nui64s, <=, UINT32_MAX);
ASSERT3U(size, ==, nui64s * sizeof (uint64_t));
ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
ASSERT3U(size, <=, UINT32_MAX);
@@ -655,10 +655,10 @@
} else {
fm_payload_set(ereport,
FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM,
- DATA_TYPE_UINT16_ARRAY,
+ DATA_TYPE_UINT32_ARRAY,
NBBY * sizeof (uint64_t), eip->zei_histogram_set,
FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM,
- DATA_TYPE_UINT16_ARRAY,
+ DATA_TYPE_UINT32_ARRAY,
NBBY * sizeof (uint64_t), eip->zei_histogram_cleared,
NULL);
}
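
The uint16_t to uint32_t widening above matters because these histograms count, per bit position, how many 64-bit words in a block had that bit set or cleared; with large-block support a single block can contain far more than 65535 such words (a 16 MB block holds 2,097,152 of them), so a 16-bit counter could wrap, which is also why the assertion now bounds nui64s by UINT32_MAX. A stand-alone sketch of the same per-bit-position counting idea (illustrative user-space C, not the kernel routine):

#include <inttypes.h>
#include <stdio.h>

#define BITS_PER_WORD 64

/* Count, per bit position, how often that bit is set across the buffer. */
static void
bit_histogram(const uint64_t *words, size_t nwords,
    uint32_t hist[BITS_PER_WORD])
{
	for (size_t i = 0; i < nwords; i++) {
		uint64_t v = words[i];
		for (int b = 0; b < BITS_PER_WORD; b++) {
			if (v & (1ULL << b))
				hist[b]++;	/* 32-bit counter: safe for millions of words */
		}
	}
}

int
main(void)
{
	uint64_t sample[] = { 0xff00ff00ff00ff00ULL, 0x1ULL, 0x8000000000000000ULL };
	uint32_t hist[BITS_PER_WORD] = { 0 };

	bit_histogram(sample, sizeof (sample) / sizeof (sample[0]), hist);
	printf("bit 0 set %" PRIu32 " times, bit 63 set %" PRIu32 " times\n",
	    hist[0], hist[63]);
	return (0);
}
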
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_fuid.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -407,7 +407,7 @@
domain = zfs_fuid_find_by_idx(zfsvfs, index);
ASSERT(domain != NULL);
-#ifdef sun
+#ifdef illumos
if (type == ZFS_OWNER || type == ZFS_ACE_USER) {
(void) kidmap_getuidbysid(crgetzone(cr), domain,
FUID_RID(fuid), &id);
@@ -415,9 +415,9 @@
(void) kidmap_getgidbysid(crgetzone(cr), domain,
FUID_RID(fuid), &id);
}
-#else /* !sun */
+#else
id = UID_NOBODY;
-#endif /* !sun */
+#endif
return (id);
}
@@ -704,13 +704,13 @@
boolean_t
zfs_groupmember(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr)
{
-#ifdef sun
+#ifdef illumos
ksid_t *ksid = crgetsid(cr, KSID_GROUP);
ksidlist_t *ksidlist = crgetsidlist(cr);
-#endif /* !sun */
+#endif
uid_t gid;
-#ifdef sun
+#ifdef illumos
if (ksid && ksidlist) {
int i;
ksid_t *ksid_groups;
@@ -742,7 +742,7 @@
}
}
}
-#endif /* !sun */
+#endif /* illumos */
/*
* Not found in ksidlist, check posix groups
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -22,14 +22,16 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011-2012 Pawel Jakub Dawidek <pawel at dawidek.net>.
- * All rights reserved.
+ * Copyright (c) 2011-2012 Pawel Jakub Dawidek. All rights reserved.
* Copyright 2013 Martin Matuska <mm at FreeBSD.org>. All rights reserved.
- * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2012, Joyent, Inc. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright 2014 Xin Li <delphij at FreeBSD.org>. All rights reserved.
+ * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved.
+ * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2014, 2016 Joyent, Inc. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2013 Steven Hartland. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
*/
/*
@@ -132,6 +134,9 @@
* distinguish between the operation failing, and
* deserialization failing.
*/
+#ifdef __FreeBSD__
+#include "opt_kstack_pages.h"
+#endif
#include <sys/types.h>
#include <sys/param.h>
@@ -182,8 +187,10 @@
#include <sys/dmu_objset.h>
#include <sys/dmu_send.h>
#include <sys/dsl_destroy.h>
+#include <sys/dsl_bookmark.h>
#include <sys/dsl_userhold.h>
#include <sys/zfeature.h>
+#include <sys/zio_checksum.h>
#include "zfs_namecheck.h"
#include "zfs_prop.h"
@@ -191,14 +198,7 @@
#include "zfs_comutil.h"
#include "zfs_ioctl_compat.h"
-CTASSERT(sizeof(zfs_cmd_t) < IOCPARM_MAX);
-static int snapshot_list_prefetch;
-SYSCTL_DECL(_vfs_zfs);
-TUNABLE_INT("vfs.zfs.snapshot_list_prefetch", &snapshot_list_prefetch);
-SYSCTL_INT(_vfs_zfs, OID_AUTO, snapshot_list_prefetch, CTLFLAG_RW,
- &snapshot_list_prefetch, 0, "Prefetch data when listing snapshots");
-
static struct cdev *zfsdev;
extern void zfs_init(void);
@@ -207,6 +207,7 @@
uint_t zfs_fsyncer_key;
extern uint_t rrw_tsd_key;
static uint_t zfs_allow_log_key;
+extern uint_t zfs_geom_probe_vdev_key;
typedef int zfs_ioc_legacy_func_t(zfs_cmd_t *);
typedef int zfs_ioc_func_t(const char *, nvlist_t *, nvlist_t *);
@@ -255,7 +256,7 @@
static void zfsdev_close(void *data);
-static int zfs_prop_activate_feature(spa_t *spa, zfeature_info_t *feature);
+static int zfs_prop_activate_feature(spa_t *spa, spa_feature_t feature);
/* _NOTE(PRINTFLIKE(4)) - this is printf-like, but lint is too whiney */
void
@@ -499,6 +500,14 @@
dsl_dataset_t *ds;
dsl_pool_t *dp;
+ /*
+ * First do a quick check for root in the global zone, which
+ * is allowed to do all write_perms. This ensures that zfs_ioc_*
+ * will get to handle nonexistent datasets.
+ */
+ if (INGLOBALZONE(curthread) && secpolicy_zfs(cr) == 0)
+ return (0);
+
error = dsl_pool_hold(name, FTAG, &dp);
if (error != 0)
return (error);
@@ -634,12 +643,14 @@
break;
case ZFS_PROP_QUOTA:
+ case ZFS_PROP_FILESYSTEM_LIMIT:
+ case ZFS_PROP_SNAPSHOT_LIMIT:
if (!INGLOBALZONE(curthread)) {
uint64_t zoned;
- char setpoint[MAXNAMELEN];
+ char setpoint[ZFS_MAX_DATASET_NAME_LEN];
/*
* Unprivileged users are allowed to modify the
- * quota on things *under* (ie. contained by)
+ * limit on things *under* (ie. contained by)
* the thing they own.
*/
if (dsl_prop_get_integer(dsname, "jailed", &zoned,
@@ -848,22 +859,9 @@
return (SET_ERROR(EINVAL));
for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
pair = nextpair) {
- dsl_pool_t *dp;
- dsl_dataset_t *ds;
-
- error = dsl_pool_hold(nvpair_name(pair), FTAG, &dp);
- if (error != 0)
- break;
nextpair = nvlist_next_nvpair(snaps, pair);
- error = dsl_dataset_hold(dp, nvpair_name(pair), FTAG, &ds);
- if (error == 0)
- dsl_dataset_rele(ds, FTAG);
- dsl_pool_rele(dp, FTAG);
-
- if (error == 0) {
- error = zfs_secpolicy_destroy_perms(nvpair_name(pair),
- cr);
- } else if (error == ENOENT) {
+ error = zfs_secpolicy_destroy_perms(nvpair_name(pair), cr);
+ if (error == ENOENT) {
/*
* Ignore any snapshots that don't exist (we consider
* them "already destroyed"). Remove the name from the
@@ -885,7 +883,7 @@
int
zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr)
{
- char parentname[MAXNAMELEN];
+ char parentname[ZFS_MAX_DATASET_NAME_LEN];
int error;
if ((error = zfs_secpolicy_write_perms(from,
@@ -957,13 +955,13 @@
error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &clone);
if (error == 0) {
- char parentname[MAXNAMELEN];
+ char parentname[ZFS_MAX_DATASET_NAME_LEN];
dsl_dataset_t *origin = NULL;
dsl_dir_t *dd;
dd = clone->ds_dir;
error = dsl_dataset_hold_obj(dd->dd_pool,
- dd->dd_phys->dd_origin_obj, FTAG, &origin);
+ dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin);
if (error != 0) {
dsl_dataset_rele(clone, FTAG);
dsl_pool_rele(dp, FTAG);
@@ -1041,8 +1039,77 @@
return (error);
}
+/*
+ * Check for permission to create each bookmark in the nvlist.
+ */
/* ARGSUSED */
static int
+zfs_secpolicy_bookmark(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ int error = 0;
+
+ for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) {
+ char *name = nvpair_name(pair);
+ char *hashp = strchr(name, '#');
+
+ if (hashp == NULL) {
+ error = SET_ERROR(EINVAL);
+ break;
+ }
+ *hashp = '\0';
+ error = zfs_secpolicy_write_perms(name,
+ ZFS_DELEG_PERM_BOOKMARK, cr);
+ *hashp = '#';
+ if (error != 0)
+ break;
+ }
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+zfs_secpolicy_destroy_bookmarks(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+ nvpair_t *pair, *nextpair;
+ int error = 0;
+
+ for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL;
+ pair = nextpair) {
+ char *name = nvpair_name(pair);
+ char *hashp = strchr(name, '#');
+ nextpair = nvlist_next_nvpair(innvl, pair);
+
+ if (hashp == NULL) {
+ error = SET_ERROR(EINVAL);
+ break;
+ }
+
+ *hashp = '\0';
+ error = zfs_secpolicy_write_perms(name,
+ ZFS_DELEG_PERM_DESTROY, cr);
+ *hashp = '#';
+ if (error == ENOENT) {
+ /*
+ * Ignore any filesystems that don't exist (we consider
+ * their bookmarks "already destroyed"). Remove
+ * the name from the nvl here in case the filesystem
+ * is created between now and when we try to destroy
+ * the bookmark (in which case we don't want to
+ * destroy it since we haven't checked for permission).
+ */
+ fnvlist_remove_nvpair(innvl, pair);
+ error = 0;
+ }
+ if (error != 0)
+ break;
+ }
+
+ return (error);
+}
+
+/* ARGSUSED */
+static int
zfs_secpolicy_log_history(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
/*
@@ -1057,7 +1124,7 @@
static int
zfs_secpolicy_create_clone(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
{
- char parentname[MAXNAMELEN];
+ char parentname[ZFS_MAX_DATASET_NAME_LEN];
int error;
char *origin;
@@ -1200,7 +1267,7 @@
for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
pair = nvlist_next_nvpair(holds, pair)) {
- char fsname[MAXNAMELEN];
+ char fsname[ZFS_MAX_DATASET_NAME_LEN];
error = dmu_fsname(nvpair_name(pair), fsname);
if (error != 0)
return (error);
@@ -1221,7 +1288,7 @@
for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL;
pair = nvlist_next_nvpair(innvl, pair)) {
- char fsname[MAXNAMELEN];
+ char fsname[ZFS_MAX_DATASET_NAME_LEN];
error = dmu_fsname(nvpair_name(pair), fsname);
if (error != 0)
return (error);
@@ -1281,7 +1348,7 @@
if ((error = ddi_copyin((void *)(uintptr_t)nvl, packed, size,
iflag)) != 0) {
kmem_free(packed, size);
- return (error);
+ return (SET_ERROR(EFAULT));
}
if ((error = nvlist_unpack(packed, size, &list, 0)) != 0) {
@@ -1370,6 +1437,7 @@
getzfsvfs(const char *dsname, zfsvfs_t **zfvp)
{
objset_t *os;
+ vfs_t *vfsp;
int error;
error = dmu_objset_hold(dsname, FTAG, &os);
@@ -1383,12 +1451,21 @@
mutex_enter(&os->os_user_ptr_lock);
*zfvp = dmu_objset_get_user(os);
if (*zfvp) {
- VFS_HOLD((*zfvp)->z_vfs);
+ vfsp = (*zfvp)->z_vfs;
+ vfs_ref(vfsp);
} else {
error = SET_ERROR(ESRCH);
}
mutex_exit(&os->os_user_ptr_lock);
dmu_objset_rele(os, FTAG);
+ if (error == 0) {
+ error = vfs_busy(vfsp, 0);
+ vfs_rel(vfsp);
+ if (error != 0) {
+ *zfvp = NULL;
+ error = SET_ERROR(ESRCH);
+ }
+ }
return (error);
}
@@ -1406,8 +1483,9 @@
if (getzfsvfs(name, zfvp) != 0)
error = zfsvfs_create(name, zfvp);
if (error == 0) {
- rrw_enter(&(*zfvp)->z_teardown_lock, (writer) ? RW_WRITER :
+ rrm_enter(&(*zfvp)->z_teardown_lock, (writer) ? RW_WRITER :
RW_READER, tag);
+#ifdef illumos
if ((*zfvp)->z_unmounted) {
/*
* XXX we could probably try again, since the unmounting
@@ -1414,9 +1492,16 @@
* thread should be just about to disassociate the
* objset from the zfsvfs.
*/
- rrw_exit(&(*zfvp)->z_teardown_lock, tag);
+ rrm_exit(&(*zfvp)->z_teardown_lock, tag);
return (SET_ERROR(EBUSY));
}
+#else
+ /*
+ * vfs_busy() ensures that the filesystem is not and
+ * can not be unmounted.
+ */
+ ASSERT(!(*zfvp)->z_unmounted);
+#endif
}
return (error);
}
@@ -1424,10 +1509,14 @@
static void
zfsvfs_rele(zfsvfs_t *zfsvfs, void *tag)
{
- rrw_exit(&zfsvfs->z_teardown_lock, tag);
+ rrm_exit(&zfsvfs->z_teardown_lock, tag);
if (zfsvfs->z_vfs) {
+#ifdef illumos
VFS_RELE(zfsvfs->z_vfs);
+#else
+ vfs_unbusy(zfsvfs->z_vfs);
+#endif
} else {
dmu_objset_disown(zfsvfs->z_os, zfsvfs);
zfsvfs_free(zfsvfs);
@@ -1542,8 +1631,7 @@
nvlist_free(config);
- if (props)
- nvlist_free(props);
+ nvlist_free(props);
return (error);
}
@@ -1829,6 +1917,7 @@
(void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_SPARES,
&spares, &nspares);
+#ifdef illumos
/*
* A root pool with concatenated devices is not supported.
* Thus, can not add a device to a root pool.
@@ -1844,6 +1933,7 @@
spa_close(spa, FTAG);
return (SET_ERROR(EDOM));
}
+#endif /* illumos */
if (error == 0) {
error = spa_vdev_add(spa, config);
@@ -2278,7 +2368,8 @@
* A dataset name of maximum length cannot have any snapshots,
* so exit immediately.
*/
- if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >= MAXNAMELEN) {
+ if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >=
+ ZFS_MAX_DATASET_NAME_LEN) {
dmu_objset_rele(os, FTAG);
return (SET_ERROR(ESRCH));
}
@@ -2372,7 +2463,7 @@
const char *propname = nvpair_name(pair);
zfs_prop_t prop = zfs_name_to_prop(propname);
uint64_t intval;
- int err;
+ int err = -1;
if (prop == ZPROP_INVAL) {
if (zfs_prop_userquota(propname))
@@ -2399,6 +2490,21 @@
case ZFS_PROP_REFQUOTA:
err = dsl_dataset_set_refquota(dsname, source, intval);
break;
+ case ZFS_PROP_FILESYSTEM_LIMIT:
+ case ZFS_PROP_SNAPSHOT_LIMIT:
+ if (intval == UINT64_MAX) {
+ /* clearing the limit, just do it */
+ err = 0;
+ } else {
+ err = dsl_dir_activate_fs_ss_limit(dsname);
+ }
+ /*
+ * Set err to -1 to force the zfs_set_prop_nvlist code down the
+ * default path to set the value in the nvlist.
+ */
+ if (err == 0)
+ err = -1;
+ break;
case ZFS_PROP_RESERVATION:
err = dsl_dir_set_reservation(dsname, source, intval);
break;
@@ -2406,8 +2512,7 @@
err = dsl_dataset_set_refreservation(dsname, source, intval);
break;
case ZFS_PROP_VOLSIZE:
- err = zvol_set_volsize(dsname, ddi_driver_major(zfs_dip),
- intval);
+ err = zvol_set_volsize(dsname, intval);
break;
case ZFS_PROP_VERSION:
{
@@ -2429,38 +2534,6 @@
}
break;
}
- case ZFS_PROP_COMPRESSION:
- {
- if (intval == ZIO_COMPRESS_LZ4) {
- zfeature_info_t *feature =
- &spa_feature_table[SPA_FEATURE_LZ4_COMPRESS];
- spa_t *spa;
-
- if ((err = spa_open(dsname, &spa, FTAG)) != 0)
- return (err);
-
- /*
- * Setting the LZ4 compression algorithm activates
- * the feature.
- */
- if (!spa_feature_is_active(spa, feature)) {
- if ((err = zfs_prop_activate_feature(spa,
- feature)) != 0) {
- spa_close(spa, FTAG);
- return (err);
- }
- }
-
- spa_close(spa, FTAG);
- }
- /*
- * We still want the default set action to be performed in the
- * caller, we only performed zfeature settings here.
- */
- err = -1;
- break;
- }
-
default:
err = -1;
}
@@ -2637,7 +2710,6 @@
while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) {
const char *propname = nvpair_name(pair);
- char *valstr;
if (!zfs_prop_user(propname) ||
nvpair_type(pair) != DATA_TYPE_STRING)
@@ -2650,8 +2722,7 @@
if (strlen(propname) >= ZAP_MAXNAMELEN)
return (SET_ERROR(ENAMETOOLONG));
- VERIFY(nvpair_value_string(pair, &valstr) == 0);
- if (strlen(valstr) >= ZAP_MAXVALUELEN)
+ if (strlen(fnvpair_value_string(pair)) >= ZAP_MAXVALUELEN)
return (E2BIG);
}
return (0);
@@ -2977,7 +3048,7 @@
mtx_lock(&mountlist_mtx);
TAILQ_FOREACH(vfsp, &mountlist, mnt_list) {
if (strcmp(refstr_value(vfsp->vfs_resource), resource) == 0) {
- VFS_HOLD(vfsp);
+ vfs_ref(vfsp);
break;
}
}
@@ -3100,7 +3171,7 @@
boolean_t fuids_ok, sa_ok;
uint64_t zplver = ZPL_VERSION;
objset_t *os = NULL;
- char parentname[MAXNAMELEN];
+ char parentname[ZFS_MAX_DATASET_NAME_LEN];
char *cp;
spa_t *spa;
uint64_t spa_vers;
@@ -3297,6 +3368,10 @@
if (error != 0)
(void) dsl_destroy_head(fsname);
}
+#ifdef __FreeBSD__
+ if (error == 0)
+ zvol_create_minors(fsname);
+#endif
return (error);
}
@@ -3336,7 +3411,8 @@
* The snap name must contain an @, and the part after it must
* contain only valid characters.
*/
- if (cp == NULL || snapshot_namecheck(cp + 1, NULL, NULL) != 0)
+ if (cp == NULL ||
+ zfs_component_namecheck(cp + 1, NULL, NULL) != 0)
return (SET_ERROR(EINVAL));
/*
@@ -3401,6 +3477,53 @@
return (error);
}
+#ifdef __FreeBSD__
+static int
+zfs_ioc_nextboot(const char *unused, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ char name[MAXNAMELEN];
+ spa_t *spa;
+ vdev_t *vd;
+ char *command;
+ uint64_t pool_guid;
+ uint64_t vdev_guid;
+ int error;
+
+ if (nvlist_lookup_uint64(innvl,
+ ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0)
+ return (EINVAL);
+ if (nvlist_lookup_uint64(innvl,
+ ZPOOL_CONFIG_GUID, &vdev_guid) != 0)
+ return (EINVAL);
+ if (nvlist_lookup_string(innvl,
+ "command", &command) != 0)
+ return (EINVAL);
+
+ mutex_enter(&spa_namespace_lock);
+ spa = spa_by_guid(pool_guid, vdev_guid);
+ if (spa != NULL)
+ strcpy(name, spa_name(spa));
+ mutex_exit(&spa_namespace_lock);
+ if (spa == NULL)
+ return (ENOENT);
+
+ if ((error = spa_open(name, &spa, FTAG)) != 0)
+ return (error);
+ spa_vdev_state_enter(spa, SCL_ALL);
+ vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE);
+ if (vd == NULL) {
+ (void) spa_vdev_state_exit(spa, NULL, ENXIO);
+ spa_close(spa, FTAG);
+ return (ENODEV);
+ }
+ error = vdev_label_write_pad2(vd, command, strlen(command));
+ (void) spa_vdev_state_exit(spa, NULL, 0);
+ txg_wait_synced(spa->spa_dsl_pool, 0);
+ spa_close(spa, FTAG);
+ return (error);
+}
+#endif
+
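
zfs_ioc_nextboot() above expects an input nvlist identifying the pool and leaf vdev by GUID plus a command string that is written into the vdev's pad2 label area for the boot loader to pick up. Here is a hedged sketch of how a userland caller might assemble that nvlist; the literal keys "pool_guid" and "guid" are assumed to correspond to the ZPOOL_CONFIG_POOL_GUID and ZPOOL_CONFIG_GUID lookups in the handler, and error handling is omitted:

#include <stdint.h>
#include <libnvpair.h>	/* nvlist_alloc, nvlist_add_* */

/* Sketch: build the input nvlist for the fbsd_nextboot ioctl. */
static nvlist_t *
build_nextboot_args(uint64_t pool_guid, uint64_t vdev_guid, const char *cmd)
{
	nvlist_t *nvl = NULL;

	if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
		return (NULL);
	/* ZPOOL_CONFIG_POOL_GUID: which pool to boot from */
	(void) nvlist_add_uint64(nvl, "pool_guid", pool_guid);
	/* ZPOOL_CONFIG_GUID: the vdev whose pad2 area is written */
	(void) nvlist_add_uint64(nvl, "guid", vdev_guid);
	/* one-shot boot command consumed by the loader */
	(void) nvlist_add_string(nvl, "command", cmd);
	return (nvl);
}
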
/*
* The dp_config_rwlock must not be held when calling this, because the
* unmount may need to write out data.
@@ -3416,7 +3539,9 @@
{
vfs_t *vfsp;
zfsvfs_t *zfsvfs;
+#ifdef illumos
int err;
+#endif
if (strchr(snapname, '@') == NULL)
return (0);
@@ -3428,21 +3553,20 @@
zfsvfs = vfsp->vfs_data;
ASSERT(!dsl_pool_config_held(dmu_objset_pool(zfsvfs->z_os)));
+#ifdef illumos
err = vn_vfswlock(vfsp->vfs_vnodecovered);
VFS_RELE(vfsp);
if (err != 0)
return (SET_ERROR(err));
+#endif
/*
* Always force the unmount for snapshots.
*/
-
#ifdef illumos
(void) dounmount(vfsp, MS_FORCE, kcred);
#else
- mtx_lock(&Giant); /* dounmount() */
(void) dounmount(vfsp, MS_FORCE, curthread);
- mtx_unlock(&Giant); /* dounmount() */
#endif
return (0);
}
@@ -3471,7 +3595,7 @@
return;
ds = dmu_objset_ds(os);
if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev)) {
- char originname[MAXNAMELEN];
+ char originname[ZFS_MAX_DATASET_NAME_LEN];
dsl_dataset_name(ds->ds_prev, originname);
dmu_objset_rele(os, FTAG);
(void) zfs_unmount_snap(originname);
@@ -3489,6 +3613,7 @@
* outnvl: snapshot -> error code (int32)
*
*/
+/* ARGSUSED */
static int
zfs_ioc_destroy_snaps(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
{
@@ -3507,7 +3632,8 @@
const char *name = nvpair_name(pair);
/*
- * The snap must be in the specified pool.
+ * The snap must be in the specified pool to prevent the
+ * invalid removal of zvol minors below.
*/
if (strncmp(name, poolname, poollen) != 0 ||
(name[poollen] != '/' && name[poollen] != '@'))
@@ -3516,7 +3642,9 @@
error = zfs_unmount_snap(name);
if (error != 0)
return (error);
- (void) zvol_remove_minor(name);
+#if defined(__FreeBSD__)
+ zvol_remove_minors(name);
+#endif
}
return (dsl_destroy_snapshots_nvl(snaps, defer, outnvl));
@@ -3523,6 +3651,100 @@
}
/*
+ * Create bookmarks. Bookmark names are of the form <fs>#<bmark>.
+ * All bookmarks must be in the same pool.
+ *
+ * innvl: {
+ * bookmark1 -> snapshot1, bookmark2 -> snapshot2
+ * }
+ *
+ * outnvl: bookmark -> error code (int32)
+ *
+ */
+/* ARGSUSED */
+static int
+zfs_ioc_bookmark(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) {
+ char *snap_name;
+
+ /*
+ * Verify the snapshot argument.
+ */
+ if (nvpair_value_string(pair, &snap_name) != 0)
+ return (SET_ERROR(EINVAL));
+
+
+ /* Verify that the keys (bookmarks) are unique */
+ for (nvpair_t *pair2 = nvlist_next_nvpair(innvl, pair);
+ pair2 != NULL; pair2 = nvlist_next_nvpair(innvl, pair2)) {
+ if (strcmp(nvpair_name(pair), nvpair_name(pair2)) == 0)
+ return (SET_ERROR(EINVAL));
+ }
+ }
+
+ return (dsl_bookmark_create(innvl, outnvl));
+}
+
+/*
+ * innvl: {
+ * property 1, property 2, ...
+ * }
+ *
+ * outnvl: {
+ * bookmark name 1 -> { property 1, property 2, ... },
+ * bookmark name 2 -> { property 1, property 2, ... }
+ * }
+ *
+ */
+static int
+zfs_ioc_get_bookmarks(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ return (dsl_get_bookmarks(fsname, innvl, outnvl));
+}
+
+/*
+ * innvl: {
+ * bookmark name 1, bookmark name 2
+ * }
+ *
+ * outnvl: bookmark -> error code (int32)
+ *
+ */
+static int
+zfs_ioc_destroy_bookmarks(const char *poolname, nvlist_t *innvl,
+ nvlist_t *outnvl)
+{
+ int error, poollen;
+
+ poollen = strlen(poolname);
+ for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) {
+ const char *name = nvpair_name(pair);
+ const char *cp = strchr(name, '#');
+
+ /*
+ * The bookmark name must contain a '#', and the part after it
+ * must contain only valid characters.
+ */
+ if (cp == NULL ||
+ zfs_component_namecheck(cp + 1, NULL, NULL) != 0)
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * The bookmark must be in the specified pool.
+ */
+ if (strncmp(name, poolname, poollen) != 0 ||
+ (name[poollen] != '/' && name[poollen] != '#'))
+ return (SET_ERROR(EXDEV));
+ }
+
+ error = dsl_bookmark_destroy(innvl, outnvl);
+ return (error);
+}
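
Both new bookmark handlers validate names of the form <fs>#<bmark>, and the destroy path additionally requires the filesystem part to live in the pool the ioctl names. A small stand-alone sketch of that check (plain C; the real code runs zfs_component_namecheck() on the part after '#', which is reduced here to a non-empty test):

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/*
 * Sketch of the name checks in zfs_ioc_destroy_bookmarks(): the name
 * must contain '#', and must begin with "<pool>/" or "<pool>#" for the
 * given pool.
 */
static bool
bookmark_in_pool(const char *name, const char *poolname)
{
	size_t poollen = strlen(poolname);
	const char *hash = strchr(name, '#');

	if (hash == NULL || hash[1] == '\0')
		return (false);
	if (strncmp(name, poolname, poollen) != 0)
		return (false);
	return (name[poollen] == '/' || name[poollen] == '#');
}

int
main(void)
{
	printf("%d\n", bookmark_in_pool("tank/fs#before-upgrade", "tank")); /* 1 */
	printf("%d\n", bookmark_in_pool("tank/fs@snap", "tank"));           /* 0: no '#' */
	printf("%d\n", bookmark_in_pool("other/fs#bm", "tank"));            /* 0: wrong pool */
	return (0);
}
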
+
+/*
* inputs:
* zc_name name of dataset to destroy
* zc_objset_type type of objset
@@ -3546,34 +3768,48 @@
else
err = dsl_destroy_head(zc->zc_name);
if (zc->zc_objset_type == DMU_OST_ZVOL && err == 0)
+#ifdef __FreeBSD__
+ zvol_remove_minors(zc->zc_name);
+#else
(void) zvol_remove_minor(zc->zc_name);
+#endif
return (err);
}
/*
- * inputs:
- * zc_name name of dataset to rollback (to most recent snapshot)
+ * fsname is name of dataset to rollback (to most recent snapshot)
*
- * outputs: none
+ * innvl is not used.
+ *
+ * outnvl: "target" -> name of most recent snapshot
*/
+/* ARGSUSED */
static int
-zfs_ioc_rollback(zfs_cmd_t *zc)
+zfs_ioc_rollback(const char *fsname, nvlist_t *args, nvlist_t *outnvl)
{
zfsvfs_t *zfsvfs;
int error;
- if (getzfsvfs(zc->zc_name, &zfsvfs) == 0) {
+ if (getzfsvfs(fsname, &zfsvfs) == 0) {
+ dsl_dataset_t *ds;
+
+ ds = dmu_objset_ds(zfsvfs->z_os);
error = zfs_suspend_fs(zfsvfs);
if (error == 0) {
int resume_err;
- error = dsl_dataset_rollback(zc->zc_name);
- resume_err = zfs_resume_fs(zfsvfs, zc->zc_name);
+ error = dsl_dataset_rollback(fsname, zfsvfs, outnvl);
+ resume_err = zfs_resume_fs(zfsvfs, ds);
error = error ? error : resume_err;
}
+#ifdef illumos
VFS_RELE(zfsvfs->z_vfs);
+#else
+ vfs_unbusy(zfsvfs->z_vfs);
+#endif
} else {
- error = dsl_dataset_rollback(zc->zc_name);
+ error = dsl_dataset_rollback(fsname, NULL, outnvl);
}
return (error);
}
@@ -3582,7 +3818,7 @@
recursive_unmount(const char *fsname, void *arg)
{
const char *snapname = arg;
- char fullname[MAXNAMELEN];
+ char fullname[ZFS_MAX_DATASET_NAME_LEN];
(void) snprintf(fullname, sizeof (fullname), "%s@%s", fsname, snapname);
return (zfs_unmount_snap(fullname));
@@ -3600,10 +3836,12 @@
zfs_ioc_rename(zfs_cmd_t *zc)
{
boolean_t recursive = zc->zc_cookie & 1;
+ char *at;
+ boolean_t allow_mounted = B_TRUE;
+
#ifdef __FreeBSD__
- boolean_t allow_mounted = zc->zc_cookie & 2;
+ allow_mounted = (zc->zc_cookie & 2) != 0;
#endif
- char *at;
zc->zc_value[sizeof (zc->zc_value) - 1] = '\0';
if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 ||
@@ -3613,22 +3851,25 @@
at = strchr(zc->zc_name, '@');
if (at != NULL) {
/* snaps must be in same fs */
+ int error;
+
if (strncmp(zc->zc_name, zc->zc_value, at - zc->zc_name + 1))
return (SET_ERROR(EXDEV));
*at = '\0';
-#ifdef illumos
- if (zc->zc_objset_type == DMU_OST_ZFS) {
-#else
- if (zc->zc_objset_type == DMU_OST_ZFS && allow_mounted) {
-#endif
- int error = dmu_objset_find(zc->zc_name,
+ if (zc->zc_objset_type == DMU_OST_ZFS && !allow_mounted) {
+ error = dmu_objset_find(zc->zc_name,
recursive_unmount, at + 1,
recursive ? DS_FIND_CHILDREN : 0);
- if (error != 0)
+ if (error != 0) {
+ *at = '@';
return (error);
+ }
}
- return (dsl_dataset_rename_snapshot(zc->zc_name,
- at + 1, strchr(zc->zc_value, '@') + 1, recursive));
+ error = dsl_dataset_rename_snapshot(zc->zc_name,
+ at + 1, strchr(zc->zc_value, '@') + 1, recursive);
+ *at = '@';
+
+ return (error);
} else {
#ifdef illumos
if (zc->zc_objset_type == DMU_OST_ZVOL)
@@ -3705,8 +3946,7 @@
* the SPA supports it. We ignore any errors here since
* we'll catch them later.
*/
- if (nvpair_type(pair) == DATA_TYPE_UINT64 &&
- nvpair_value_uint64(pair, &intval) == 0) {
+ if (nvpair_value_uint64(pair, &intval) == 0) {
if (intval >= ZIO_COMPRESS_GZIP_1 &&
intval <= ZIO_COMPRESS_GZIP_9 &&
zfs_earlier_version(dsname,
@@ -3720,15 +3960,13 @@
return (SET_ERROR(ENOTSUP));
if (intval == ZIO_COMPRESS_LZ4) {
- zfeature_info_t *feature =
- &spa_feature_table[
- SPA_FEATURE_LZ4_COMPRESS];
spa_t *spa;
if ((err = spa_open(dsname, &spa, FTAG)) != 0)
return (err);
- if (!spa_feature_is_enabled(spa, feature)) {
+ if (!spa_feature_is_enabled(spa,
+ SPA_FEATURE_LZ4_COMPRESS)) {
spa_close(spa, FTAG);
return (SET_ERROR(ENOTSUP));
}
@@ -3754,9 +3992,40 @@
return (SET_ERROR(ENOTSUP));
break;
- case ZFS_PROP_DEDUP:
- if (zfs_earlier_version(dsname, SPA_VERSION_DEDUP))
- return (SET_ERROR(ENOTSUP));
+ case ZFS_PROP_RECORDSIZE:
+ /* Record sizes above 128k need the feature to be enabled */
+ if (nvpair_value_uint64(pair, &intval) == 0 &&
+ intval > SPA_OLD_MAXBLOCKSIZE) {
+ spa_t *spa;
+
+ /*
+ * If this is a bootable dataset then
+ * we don't allow large (>128K) blocks,
+ * because GRUB doesn't support them.
+ */
+ if (zfs_is_bootfs(dsname) &&
+ intval > SPA_OLD_MAXBLOCKSIZE) {
+ return (SET_ERROR(ERANGE));
+ }
+
+ /*
+ * We don't allow setting the property above 1MB,
+ * unless the tunable has been changed.
+ */
+ if (intval > zfs_max_recordsize ||
+ intval > SPA_MAXBLOCKSIZE)
+ return (SET_ERROR(ERANGE));
+
+ if ((err = spa_open(dsname, &spa, FTAG)) != 0)
+ return (err);
+
+ if (!spa_feature_is_enabled(spa,
+ SPA_FEATURE_LARGE_BLOCKS)) {
+ spa_close(spa, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+ spa_close(spa, FTAG);
+ }
break;
case ZFS_PROP_SHARESMB:
@@ -3773,7 +4042,46 @@
return (SET_ERROR(ENOTSUP));
}
break;
+
+ case ZFS_PROP_CHECKSUM:
+ case ZFS_PROP_DEDUP:
+ {
+ spa_feature_t feature;
+ spa_t *spa;
+
+ /* dedup feature version checks */
+ if (prop == ZFS_PROP_DEDUP &&
+ zfs_earlier_version(dsname, SPA_VERSION_DEDUP))
+ return (SET_ERROR(ENOTSUP));
+
+ if (nvpair_value_uint64(pair, &intval) != 0)
+ return (SET_ERROR(EINVAL));
+
+ /* check prop value is enabled in features */
+ feature = zio_checksum_to_feature(intval & ZIO_CHECKSUM_MASK);
+ if (feature == SPA_FEATURE_NONE)
+ break;
+
+ if ((err = spa_open(dsname, &spa, FTAG)) != 0)
+ return (err);
+ /*
+ * Salted checksums are not supported on root pools.
+ */
+ if (spa_bootfs(spa) != 0 &&
+ intval < ZIO_CHECKSUM_FUNCTIONS &&
+ (zio_checksum_table[intval].ci_flags &
+ ZCHECKSUM_FLAG_SALTED)) {
+ spa_close(spa, FTAG);
+ return (SET_ERROR(ERANGE));
+ }
+ if (!spa_feature_is_enabled(spa, feature)) {
+ spa_close(spa, FTAG);
+ return (SET_ERROR(ENOTSUP));
+ }
+ spa_close(spa, FTAG);
+ break;
}
+ }
return (zfs_secpolicy_setprop(dsname, prop, pair, CRED()));
}
@@ -3786,9 +4094,9 @@
zfs_prop_activate_feature_check(void *arg, dmu_tx_t *tx)
{
spa_t *spa = dmu_tx_pool(tx)->dp_spa;
- zfeature_info_t *feature = arg;
+ spa_feature_t *featurep = arg;
- if (!spa_feature_is_active(spa, feature))
+ if (!spa_feature_is_active(spa, *featurep))
return (0);
else
return (SET_ERROR(EBUSY));
@@ -3802,9 +4110,9 @@
zfs_prop_activate_feature_sync(void *arg, dmu_tx_t *tx)
{
spa_t *spa = dmu_tx_pool(tx)->dp_spa;
- zfeature_info_t *feature = arg;
+ spa_feature_t *featurep = arg;
- spa_feature_incr(spa, feature, tx);
+ spa_feature_incr(spa, *featurep, tx);
}
/*
@@ -3813,7 +4121,7 @@
* as being active.
*/
static int
-zfs_prop_activate_feature(spa_t *spa, zfeature_info_t *feature)
+zfs_prop_activate_feature(spa_t *spa, spa_feature_t feature)
{
int err;
@@ -3820,7 +4128,7 @@
/* EBUSY here indicates that the feature is already active */
err = dsl_sync_task(spa_name(spa),
zfs_prop_activate_feature_check, zfs_prop_activate_feature_sync,
- feature, 2);
+ &feature, 2, ZFS_SPACE_CHECK_RESERVED);
if (err != 0 && err != EBUSY)
return (err);
@@ -3956,6 +4264,56 @@
}
}
+/*
+ * Extract properties that cannot be set PRIOR to the receipt of a dataset.
+ * For example, refquota cannot be set until after the receipt of a dataset,
+ * because in replication streams, an older/earlier snapshot may exceed the
+ * refquota. We want to receive the older/earlier snapshot, but setting
+ * refquota pre-receipt will set the dsl's ACTUAL quota, which will prevent
+ * the older/earlier snapshot from being received (with EDQUOT).
+ *
+ * The ZFS test "zfs_receive_011_pos" demonstrates such a scenario.
+ *
+ * libzfs will need to be judicious in handling errors encountered by props
+ * extracted by this function.
+ */
+static nvlist_t *
+extract_delay_props(nvlist_t *props)
+{
+ nvlist_t *delayprops;
+ nvpair_t *nvp, *tmp;
+ static const zfs_prop_t delayable[] = { ZFS_PROP_REFQUOTA, 0 };
+ int i;
+
+ VERIFY(nvlist_alloc(&delayprops, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ for (nvp = nvlist_next_nvpair(props, NULL); nvp != NULL;
+ nvp = nvlist_next_nvpair(props, nvp)) {
+ /*
+ * strcmp() is safe because zfs_prop_to_name() always returns
+ * a bounded string.
+ */
+ for (i = 0; delayable[i] != 0; i++) {
+ if (strcmp(zfs_prop_to_name(delayable[i]),
+ nvpair_name(nvp)) == 0) {
+ break;
+ }
+ }
+ if (delayable[i] != 0) {
+ tmp = nvlist_prev_nvpair(props, nvp);
+ VERIFY(nvlist_add_nvpair(delayprops, nvp) == 0);
+ VERIFY(nvlist_remove_nvpair(props, nvp) == 0);
+ nvp = tmp;
+ }
+ }
+
+ if (nvlist_empty(delayprops)) {
+ nvlist_free(delayprops);
+ delayprops = NULL;
+ }
+ return (delayprops);
+}
+
#ifdef DEBUG
static boolean_t zfs_ioc_recv_inject_err;
#endif
@@ -3971,6 +4329,7 @@
* zc_guid force flag
* zc_cleanup_fd cleanup-on-exit file descriptor
* zc_action_handle handle for this guid/ds mapping (or zero on first call)
+ * zc_resumable if data is incomplete assume sender will resume
*
* outputs:
* zc_cookie number of bytes read
@@ -3991,9 +4350,11 @@
offset_t off;
nvlist_t *props = NULL; /* sent properties */
nvlist_t *origprops = NULL; /* existing properties */
+ nvlist_t *delayprops = NULL; /* sent properties applied post-receive */
char *origin = NULL;
char *tosnap;
- char tofs[ZFS_MAXNAMELEN];
+ char tofs[ZFS_MAX_DATASET_NAME_LEN];
+ cap_rights_t rights;
boolean_t first_recvd_props = B_FALSE;
if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 ||
@@ -4011,19 +4372,23 @@
return (error);
fd = zc->zc_cookie;
+#ifdef illumos
fp = getf(fd);
+#else
+ fget_read(curthread, fd, cap_rights_init(&rights, CAP_PREAD), &fp);
+#endif
if (fp == NULL) {
nvlist_free(props);
return (SET_ERROR(EBADF));
}
- VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ errors = fnvlist_alloc();
if (zc->zc_string[0])
origin = zc->zc_string;
error = dmu_recv_begin(tofs, tosnap,
- &zc->zc_begin_record, force, origin, &drc);
+ &zc->zc_begin_record, force, zc->zc_resumable, origin, &drc);
if (error != 0)
goto out;
@@ -4071,21 +4436,12 @@
props_error = dsl_prop_set_hasrecvd(tofs);
if (props_error == 0) {
+ delayprops = extract_delay_props(props);
(void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED,
props, errors);
}
}
- if (zc->zc_nvlist_dst_size != 0 &&
- (nvlist_smush(errors, zc->zc_nvlist_dst_size) != 0 ||
- put_nvlist(zc, errors) != 0)) {
- /*
- * Caller made zc->zc_nvlist_dst less than the minimum expected
- * size or supplied an invalid address.
- */
- props_error = SET_ERROR(EINVAL);
- }
-
off = fp->f_offset;
error = dmu_recv_stream(&drc, fp, &off, zc->zc_cleanup_fd,
&zc->zc_action_handle);
@@ -4095,23 +4451,63 @@
if (getzfsvfs(tofs, &zfsvfs) == 0) {
/* online recv */
+ dsl_dataset_t *ds;
int end_err;
+ ds = dmu_objset_ds(zfsvfs->z_os);
error = zfs_suspend_fs(zfsvfs);
/*
* If the suspend fails, then the recv_end will
* likely also fail, and clean up after itself.
*/
- end_err = dmu_recv_end(&drc);
+ end_err = dmu_recv_end(&drc, zfsvfs);
if (error == 0)
- error = zfs_resume_fs(zfsvfs, tofs);
+ error = zfs_resume_fs(zfsvfs, ds);
error = error ? error : end_err;
+#ifdef illumos
VFS_RELE(zfsvfs->z_vfs);
+#else
+ vfs_unbusy(zfsvfs->z_vfs);
+#endif
} else {
- error = dmu_recv_end(&drc);
+ error = dmu_recv_end(&drc, NULL);
}
+
+ /* Set delayed properties now, after we're done receiving. */
+ if (delayprops != NULL && error == 0) {
+ (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED,
+ delayprops, errors);
+ }
}
+ if (delayprops != NULL) {
+ /*
+ * Merge delayed props back in with initial props, in case
+ * we're DEBUG and zfs_ioc_recv_inject_err is set (which means
+ * we have to make sure clear_received_props() includes
+ * the delayed properties).
+ *
+ * Since zfs_ioc_recv_inject_err is only in DEBUG kernels,
+ * using ASSERT() will be just like a VERIFY.
+ */
+ ASSERT(nvlist_merge(props, delayprops, 0) == 0);
+ nvlist_free(delayprops);
+ }
+
+ /*
+ * Now that all props, initial and delayed, are set, report the prop
+ * errors to the caller.
+ */
+ if (zc->zc_nvlist_dst_size != 0 &&
+ (nvlist_smush(errors, zc->zc_nvlist_dst_size) != 0 ||
+ put_nvlist(zc, errors) != 0)) {
+ /*
+ * Caller made zc->zc_nvlist_dst less than the minimum expected
+ * size or supplied an invalid address.
+ */
+ props_error = SET_ERROR(EINVAL);
+ }
+
zc->zc_cookie = off - fp->f_offset;
if (off >= 0 && off <= MAXOFFSET_T)
fp->f_offset = off;
@@ -4186,8 +4582,10 @@
* zc_fromobj objsetid of incremental fromsnap (may be zero)
* zc_guid if set, estimate size of stream only. zc_cookie is ignored.
* output size in zc_objset_type.
+ * zc_flags lzc_send_flags
*
- * outputs: none
+ * outputs:
+ * zc_objset_type estimated size, if zc_guid is set
*/
static int
zfs_ioc_send(zfs_cmd_t *zc)
@@ -4195,6 +4593,8 @@
int error;
offset_t off;
boolean_t estimate = (zc->zc_guid != 0);
+ boolean_t embedok = (zc->zc_flags & 0x1);
+ boolean_t large_block_ok = (zc->zc_flags & 0x2);
if (zc->zc_obj != 0) {
dsl_pool_t *dp;
@@ -4211,7 +4611,8 @@
}
if (dsl_dir_is_clone(tosnap->ds_dir))
- zc->zc_fromobj = tosnap->ds_dir->dd_phys->dd_origin_obj;
+ zc->zc_fromobj =
+ dsl_dir_phys(tosnap->ds_dir)->dd_origin_obj;
dsl_dataset_rele(tosnap, FTAG);
dsl_pool_rele(dp, FTAG);
}
@@ -4249,16 +4650,25 @@
dsl_dataset_rele(tosnap, FTAG);
dsl_pool_rele(dp, FTAG);
} else {
- file_t *fp = getf(zc->zc_cookie);
+ file_t *fp;
+ cap_rights_t rights;
+
+#ifdef illumos
+ fp = getf(zc->zc_cookie);
+#else
+ fget_write(curthread, zc->zc_cookie,
+ cap_rights_init(&rights, CAP_WRITE), &fp);
+#endif
if (fp == NULL)
return (SET_ERROR(EBADF));
off = fp->f_offset;
error = dmu_send_obj(zc->zc_name, zc->zc_sendobj,
+ zc->zc_fromobj, embedok, large_block_ok,
#ifdef illumos
- zc->zc_fromobj, zc->zc_cookie, fp->f_vnode, &off);
+ zc->zc_cookie, fp->f_vnode, &off);
#else
- zc->zc_fromobj, zc->zc_cookie, fp, &off);
+ zc->zc_cookie, fp, &off);
#endif
if (off >= 0 && off <= MAXOFFSET_T)
@@ -4598,13 +5008,23 @@
* objset needs to be closed & reopened (to grow the
* objset_phys_t). Suspend/resume the fs will do that.
*/
+ dsl_dataset_t *ds, *newds;
+
+ ds = dmu_objset_ds(zfsvfs->z_os);
error = zfs_suspend_fs(zfsvfs);
- if (error == 0)
- error = zfs_resume_fs(zfsvfs, zc->zc_name);
+ if (error == 0) {
+ dmu_objset_refresh_ownership(ds, &newds,
+ zfsvfs);
+ error = zfs_resume_fs(zfsvfs, newds);
+ }
}
if (error == 0)
error = dmu_objset_userspace_upgrade(zfsvfs->z_os);
+#ifdef illumos
VFS_RELE(zfsvfs->z_vfs);
+#else
+ vfs_unbusy(zfsvfs->z_vfs);
+#endif
} else {
/* XXX kind of reading contents without owning */
error = dmu_objset_hold(zc->zc_name, FTAG, &os);
@@ -4618,7 +5038,7 @@
return (error);
}
-#ifdef sun
+#ifdef illumos
/*
* We don't want to have a hard dependency
* against some special symbols in sharefs
@@ -4636,10 +5056,10 @@
ddi_modhandle_t nfs_mod;
ddi_modhandle_t sharefs_mod;
ddi_modhandle_t smbsrv_mod;
-#endif /* sun */
+#endif /* illumos */
kmutex_t zfs_share_lock;
-#ifdef sun
+#ifdef illumos
static int
zfs_init_sharefs()
{
@@ -4659,12 +5079,12 @@
}
return (0);
}
-#endif /* sun */
+#endif /* illumos */
static int
zfs_ioc_share(zfs_cmd_t *zc)
{
-#ifdef sun
+#ifdef illumos
int error;
int opcode;
@@ -4755,9 +5175,9 @@
return (error);
-#else /* !sun */
+#else /* !illumos */
return (ENOSYS);
-#endif /* !sun */
+#endif /* illumos */
}
ace_t full_access[] = {
@@ -4783,7 +5203,7 @@
return (error);
error = dmu_object_next(os, &zc->zc_obj, B_FALSE,
- os->os_dsl_dataset->ds_phys->ds_prev_snap_txg);
+ dsl_dataset_phys(os->os_dsl_dataset)->ds_prev_snap_txg);
dmu_objset_rele(os, FTAG);
return (error);
@@ -4837,10 +5257,16 @@
zfs_ioc_diff(zfs_cmd_t *zc)
{
file_t *fp;
+ cap_rights_t rights;
offset_t off;
int error;
+#ifdef illumos
fp = getf(zc->zc_cookie);
+#else
+ fget_write(curthread, zc->zc_cookie,
+ cap_rights_init(&rights, CAP_WRITE), &fp);
+#endif
if (fp == NULL)
return (SET_ERROR(EBADF));
@@ -4859,7 +5285,7 @@
return (error);
}
-#ifdef sun
+#ifdef illumos
/*
* Remove all ACL files in shares dir
*/
@@ -4881,12 +5307,12 @@
zap_cursor_fini(&zc);
return (error);
}
-#endif /* sun */
+#endif /* illumos */
static int
zfs_ioc_smb_acl(zfs_cmd_t *zc)
{
-#ifdef sun
+#ifdef illumos
vnode_t *vp;
znode_t *dzp;
vnode_t *resourcevp = NULL;
@@ -4977,6 +5403,7 @@
if ((error = get_nvlist(zc->zc_nvlist_src,
zc->zc_nvlist_src_size, zc->zc_iflags, &nvlist)) != 0) {
VN_RELE(vp);
+ VN_RELE(ZTOV(sharedir));
ZFS_EXIT(zfsvfs);
return (error);
}
@@ -5009,9 +5436,9 @@
ZFS_EXIT(zfsvfs);
return (error);
-#else /* !sun */
+#else /* !illumos */
return (EOPNOTSUPP);
-#endif /* !sun */
+#endif /* illumos */
}
/*
@@ -5029,6 +5456,7 @@
static int
zfs_ioc_hold(const char *pool, nvlist_t *args, nvlist_t *errlist)
{
+ nvpair_t *pair;
nvlist_t *holds;
int cleanup_fd = -1;
int error;
@@ -5038,6 +5466,19 @@
if (error != 0)
return (SET_ERROR(EINVAL));
+ /* make sure the user didn't pass us any invalid (empty) tags */
+ for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
+ pair = nvlist_next_nvpair(holds, pair)) {
+ char *htag;
+
+ error = nvpair_value_string(pair, &htag);
+ if (error != 0)
+ return (SET_ERROR(error));
+
+ if (strlen(htag) == 0)
+ return (SET_ERROR(EINVAL));
+ }
+
if (nvlist_lookup_int32(args, "cleanup_fd", &cleanup_fd) == 0) {
error = zfs_onexit_fd_hold(cleanup_fd, &minor);
if (error != 0)
@@ -5151,11 +5592,19 @@
return (error);
error = dsl_dataset_hold(dp, lastsnap, FTAG, &new);
+ if (error == 0 && !new->ds_is_snapshot) {
+ dsl_dataset_rele(new, FTAG);
+ error = SET_ERROR(EINVAL);
+ }
if (error != 0) {
dsl_pool_rele(dp, FTAG);
return (error);
}
error = dsl_dataset_hold(dp, firstsnap, FTAG, &old);
+ if (error == 0 && !old->ds_is_snapshot) {
+ dsl_dataset_rele(old, FTAG);
+ error = SET_ERROR(EINVAL);
+ }
if (error != 0) {
dsl_dataset_rele(new, FTAG);
dsl_pool_rele(dp, FTAG);
@@ -5192,6 +5641,12 @@
* innvl: {
* "fd" -> file descriptor to write stream to (int32)
* (optional) "fromsnap" -> full snap name to send an incremental from
+ * (optional) "largeblockok" -> (value ignored)
+ * indicates that blocks > 128KB are permitted
+ * (optional) "embedok" -> (value ignored)
+ * presence indicates DRR_WRITE_EMBEDDED records are permitted
+ * (optional) "resume_object" and "resume_offset" -> (uint64)
+ * if present, resume send stream from specified object and offset.
* }
*
* outnvl is unused
@@ -5200,10 +5655,16 @@
static int
zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
{
+ cap_rights_t rights;
+ file_t *fp;
int error;
offset_t off;
char *fromname = NULL;
int fd;
+ boolean_t largeblockok;
+ boolean_t embedok;
+ uint64_t resumeobj = 0;
+ uint64_t resumeoff = 0;
error = nvlist_lookup_int32(innvl, "fd", &fd);
if (error != 0)
@@ -5211,15 +5672,26 @@
(void) nvlist_lookup_string(innvl, "fromsnap", &fromname);
+ largeblockok = nvlist_exists(innvl, "largeblockok");
+ embedok = nvlist_exists(innvl, "embedok");
+
+ (void) nvlist_lookup_uint64(innvl, "resume_object", &resumeobj);
+ (void) nvlist_lookup_uint64(innvl, "resume_offset", &resumeoff);
+
+#ifdef illumos
file_t *fp = getf(fd);
+#else
+ fget_write(curthread, fd, cap_rights_init(&rights, CAP_WRITE), &fp);
+#endif
if (fp == NULL)
return (SET_ERROR(EBADF));
off = fp->f_offset;
+ error = dmu_send(snapname, fromname, embedok, largeblockok, fd,
#ifdef illumos
- error = dmu_send(snapname, fromname, fd, fp->f_vnode, &off);
+ resumeobj, resumeoff, fp->f_vnode, &off);
#else
- error = dmu_send(snapname, fromname, fd, fp, &off);
+ resumeobj, resumeoff, fp, &off);
#endif
#ifdef illumos
@@ -5238,7 +5710,8 @@
* of bytes that will be written to the fd supplied to zfs_ioc_send_new().
*
* innvl: {
- * (optional) "fromsnap" -> full snap name to send an incremental from
+ * (optional) "from" -> full snap or bookmark name to send an incremental
+ * from
* }
*
* outnvl: {
@@ -5249,7 +5722,6 @@
zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
{
dsl_pool_t *dp;
- dsl_dataset_t *fromsnap = NULL;
dsl_dataset_t *tosnap;
int error;
char *fromname;
@@ -5265,27 +5737,55 @@
return (error);
}
- error = nvlist_lookup_string(innvl, "fromsnap", &fromname);
+ error = nvlist_lookup_string(innvl, "from", &fromname);
if (error == 0) {
- error = dsl_dataset_hold(dp, fromname, FTAG, &fromsnap);
- if (error != 0) {
- dsl_dataset_rele(tosnap, FTAG);
- dsl_pool_rele(dp, FTAG);
- return (error);
+ if (strchr(fromname, '@') != NULL) {
+ /*
+ * If from is a snapshot, hold it and use the more
+ * efficient dmu_send_estimate to estimate send space
+ * size using deadlists.
+ */
+ dsl_dataset_t *fromsnap;
+ error = dsl_dataset_hold(dp, fromname, FTAG, &fromsnap);
+ if (error != 0)
+ goto out;
+ error = dmu_send_estimate(tosnap, fromsnap, &space);
+ dsl_dataset_rele(fromsnap, FTAG);
+ } else if (strchr(fromname, '#') != NULL) {
+ /*
+ * If from is a bookmark, fetch the creation TXG of the
+ * snapshot it was created from and use that to find
+ * blocks that were born after it.
+ */
+ zfs_bookmark_phys_t frombm;
+
+ error = dsl_bookmark_lookup(dp, fromname, tosnap,
+ &frombm);
+ if (error != 0)
+ goto out;
+ error = dmu_send_estimate_from_txg(tosnap,
+ frombm.zbm_creation_txg, &space);
+ } else {
+ /*
+ * from is not properly formatted as a snapshot or
+ * bookmark
+ */
+ error = SET_ERROR(EINVAL);
+ goto out;
}
+ } else {
+ /* If estimating the size of a full send, use dmu_send_estimate. */
+ error = dmu_send_estimate(tosnap, NULL, &space);
}
- error = dmu_send_estimate(tosnap, fromsnap, &space);
fnvlist_add_uint64(outnvl, "space", space);
- if (fromsnap != NULL)
- dsl_dataset_rele(fromsnap, FTAG);
+out:
dsl_dataset_rele(tosnap, FTAG);
dsl_pool_rele(dp, FTAG);
return (error);
}
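
The reworked zfs_ioc_send_space() above picks its estimation strategy from the shape of the optional "from" name: '@' means a snapshot and a deadlist-based dmu_send_estimate(), '#' means a bookmark and an estimate from its creation TXG, any other non-empty name is EINVAL, and omitting "from" yields a full-send estimate. A tiny sketch of just that classification (illustrative only):

#include <stdio.h>
#include <string.h>

enum from_kind { FROM_NONE, FROM_SNAPSHOT, FROM_BOOKMARK, FROM_INVALID };

/* Mirrors the branch structure in zfs_ioc_send_space(). */
static enum from_kind
classify_from(const char *fromname)
{
	if (fromname == NULL)
		return (FROM_NONE);		/* full-send estimate */
	if (strchr(fromname, '@') != NULL)
		return (FROM_SNAPSHOT);		/* dmu_send_estimate() with a held snapshot */
	if (strchr(fromname, '#') != NULL)
		return (FROM_BOOKMARK);		/* dmu_send_estimate_from_txg() */
	return (FROM_INVALID);			/* EINVAL */
}

int
main(void)
{
	printf("%d %d %d %d\n",
	    classify_from(NULL),
	    classify_from("tank/fs@snap1"),
	    classify_from("tank/fs#bm1"),
	    classify_from("tank/fs"));
	return (0);
}
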
-
static zfs_ioc_vec_t zfs_ioc_vec[ZFS_IOC_LAST - ZFS_IOC_FIRST];
static void
@@ -5385,7 +5885,7 @@
static void
zfs_ioctl_register_dataset_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func,
- zfs_secpolicy_func_t *secpolicy)
+ zfs_secpolicy_func_t *secpolicy)
{
zfs_ioctl_register_legacy(ioc, func, secpolicy,
DATASET_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY);
@@ -5437,6 +5937,23 @@
zfs_ioc_get_holds, zfs_secpolicy_read, DATASET_NAME,
POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE);
+ zfs_ioctl_register("rollback", ZFS_IOC_ROLLBACK,
+ zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE);
+
+ zfs_ioctl_register("bookmark", ZFS_IOC_BOOKMARK,
+ zfs_ioc_bookmark, zfs_secpolicy_bookmark, POOL_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
+
+ zfs_ioctl_register("get_bookmarks", ZFS_IOC_GET_BOOKMARKS,
+ zfs_ioc_get_bookmarks, zfs_secpolicy_read, DATASET_NAME,
+ POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE);
+
+ zfs_ioctl_register("destroy_bookmarks", ZFS_IOC_DESTROY_BOOKMARKS,
+ zfs_ioc_destroy_bookmarks, zfs_secpolicy_destroy_bookmarks,
+ POOL_NAME,
+ POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE);
+
/* IOCTLS that use the legacy function signature */
zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze,
@@ -5496,10 +6013,10 @@
zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE);
zfs_ioctl_register_pool(ZFS_IOC_ERROR_LOG, zfs_ioc_error_log,
- zfs_secpolicy_inject, B_FALSE, POOL_CHECK_SUSPENDED);
+ zfs_secpolicy_inject, B_FALSE, POOL_CHECK_NONE);
zfs_ioctl_register_pool(ZFS_IOC_DSOBJ_TO_DSNAME,
zfs_ioc_dsobj_to_dsname,
- zfs_secpolicy_diff, B_FALSE, POOL_CHECK_SUSPENDED);
+ zfs_secpolicy_diff, B_FALSE, POOL_CHECK_NONE);
zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_HISTORY,
zfs_ioc_pool_get_history,
zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED);
@@ -5508,7 +6025,7 @@
zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE);
zfs_ioctl_register_pool(ZFS_IOC_CLEAR, zfs_ioc_clear,
- zfs_secpolicy_config, B_TRUE, POOL_CHECK_SUSPENDED);
+ zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE);
zfs_ioctl_register_pool(ZFS_IOC_POOL_REOPEN, zfs_ioc_pool_reopen,
zfs_secpolicy_config, B_TRUE, POOL_CHECK_SUSPENDED);
@@ -5548,8 +6065,6 @@
zfs_secpolicy_none);
zfs_ioctl_register_dataset_modify(ZFS_IOC_DESTROY, zfs_ioc_destroy,
zfs_secpolicy_destroy);
- zfs_ioctl_register_dataset_modify(ZFS_IOC_ROLLBACK, zfs_ioc_rollback,
- zfs_secpolicy_rollback);
zfs_ioctl_register_dataset_modify(ZFS_IOC_RENAME, zfs_ioc_rename,
zfs_secpolicy_rename);
zfs_ioctl_register_dataset_modify(ZFS_IOC_RECV, zfs_ioc_recv,
@@ -5577,6 +6092,9 @@
zfs_secpolicy_config, POOL_CHECK_NONE);
zfs_ioctl_register_dataset_nolog(ZFS_IOC_UNJAIL, zfs_ioc_unjail,
zfs_secpolicy_config, POOL_CHECK_NONE);
+ zfs_ioctl_register("fbsd_nextboot", ZFS_IOC_NEXTBOOT,
+ zfs_ioc_nextboot, zfs_secpolicy_config, NO_NAME,
+ POOL_CHECK_NONE, B_FALSE, B_FALSE);
#endif
}
@@ -5676,7 +6194,7 @@
{
int error = 0;
-#ifdef sun
+#ifdef illumos
if (getminor(*devp) != 0)
return (zvol_open(devp, flag, otyp, cr));
#endif
@@ -5723,6 +6241,7 @@
zfs_iocparm_t *zc_iocparm;
int cflag, cmd, oldvecnum;
boolean_t newioc, compat;
+ void *compat_zc = NULL;
cred_t *cr = td->td_ucred;
#endif
const zfs_ioc_vec_t *vec;
@@ -5731,10 +6250,10 @@
cflag = ZFS_CMD_COMPAT_NONE;
compat = B_FALSE;
- newioc = B_TRUE;
+ newioc = B_TRUE; /* "new" style (zfs_iocparm_t) ioctl */
len = IOCPARM_LEN(zcmd);
- cmd = zcmd & 0xff;
+ vecnum = cmd = zcmd & 0xff;
/*
* Check if we are talking to supported older binaries
@@ -5742,25 +6261,41 @@
*/
if (len != sizeof(zfs_iocparm_t)) {
newioc = B_FALSE;
- if (len == sizeof(zfs_cmd_t)) {
+ compat = B_TRUE;
+
+ vecnum = cmd;
+
+ switch (len) {
+ case sizeof(zfs_cmd_zcmd_t):
cflag = ZFS_CMD_COMPAT_LZC;
- vecnum = cmd;
- } else if (len == sizeof(zfs_cmd_deadman_t)) {
+ break;
+ case sizeof(zfs_cmd_deadman_t):
cflag = ZFS_CMD_COMPAT_DEADMAN;
- compat = B_TRUE;
- vecnum = cmd;
- } else if (len == sizeof(zfs_cmd_v28_t)) {
+ break;
+ case sizeof(zfs_cmd_v28_t):
cflag = ZFS_CMD_COMPAT_V28;
- compat = B_TRUE;
- vecnum = cmd;
- } else if (len == sizeof(zfs_cmd_v15_t)) {
+ break;
+ case sizeof(zfs_cmd_v15_t):
+ if (cmd >= sizeof(zfs_ioctl_v15_to_v28) /
+ sizeof(zfs_ioctl_v15_to_v28[0]))
+ return (EINVAL);
+
cflag = ZFS_CMD_COMPAT_V15;
- compat = B_TRUE;
vecnum = zfs_ioctl_v15_to_v28[cmd];
- } else
+
+ /*
+ * Return without further handling
+ * if the command is blacklisted.
+ */
+ if (vecnum == ZFS_IOC_COMPAT_PASS)
+ return (0);
+ else if (vecnum == ZFS_IOC_COMPAT_FAIL)
+ return (ENOTSUP);
+ break;
+ default:
return (EINVAL);
- } else
- vecnum = cmd;
+ }
+ }
#ifdef illumos
vecnum = cmd - ZFS_IOC_FIRST;
@@ -5767,26 +6302,13 @@
ASSERT3U(getmajor(dev), ==, ddi_driver_major(zfs_dip));
#endif
- if (compat) {
- if (vecnum == ZFS_IOC_COMPAT_PASS)
- return (0);
- else if (vecnum == ZFS_IOC_COMPAT_FAIL)
- return (ENOTSUP);
- }
-
- /*
- * Check if we have sufficient kernel memory allocated
- * for the zfs_cmd_t request. Bail out if not so we
- * will not access undefined memory region.
- */
if (vecnum >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0]))
return (SET_ERROR(EINVAL));
vec = &zfs_ioc_vec[vecnum];
-#ifdef illumos
zc = kmem_zalloc(sizeof(zfs_cmd_t), KM_SLEEP);
- bzero(zc, sizeof(zfs_cmd_t));
+#ifdef illumos
error = ddi_copyin((void *)arg, zc, sizeof (zfs_cmd_t), flag);
if (error != 0) {
error = SET_ERROR(EFAULT);
@@ -5793,33 +6315,86 @@
goto out;
}
#else /* !illumos */
- /*
- * We don't alloc/free zc only if talking to library ioctl version 2
- */
- if (cflag != ZFS_CMD_COMPAT_LZC) {
- zc = kmem_zalloc(sizeof(zfs_cmd_t), KM_SLEEP);
- bzero(zc, sizeof(zfs_cmd_t));
- } else {
- zc = (void *)arg;
- error = 0;
- }
+ bzero(zc, sizeof(zfs_cmd_t));
if (newioc) {
zc_iocparm = (void *)arg;
- if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_t)) {
- error = SET_ERROR(EFAULT);
+
+ switch (zc_iocparm->zfs_ioctl_version) {
+ case ZFS_IOCVER_CURRENT:
+ if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_t)) {
+ error = SET_ERROR(EINVAL);
+ goto out;
+ }
+ break;
+ case ZFS_IOCVER_INLANES:
+ if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_inlanes_t)) {
+ error = SET_ERROR(EFAULT);
+ goto out;
+ }
+ compat = B_TRUE;
+ cflag = ZFS_CMD_COMPAT_INLANES;
+ break;
+ case ZFS_IOCVER_RESUME:
+ if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_resume_t)) {
+ error = SET_ERROR(EFAULT);
+ goto out;
+ }
+ compat = B_TRUE;
+ cflag = ZFS_CMD_COMPAT_RESUME;
+ break;
+ case ZFS_IOCVER_EDBP:
+ if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_edbp_t)) {
+ error = SET_ERROR(EFAULT);
+ goto out;
+ }
+ compat = B_TRUE;
+ cflag = ZFS_CMD_COMPAT_EDBP;
+ break;
+ case ZFS_IOCVER_ZCMD:
+ if (zc_iocparm->zfs_cmd_size > sizeof(zfs_cmd_t) ||
+ zc_iocparm->zfs_cmd_size < sizeof(zfs_cmd_zcmd_t)) {
+ error = SET_ERROR(EFAULT);
+ goto out;
+ }
+ compat = B_TRUE;
+ cflag = ZFS_CMD_COMPAT_ZCMD;
+ break;
+ default:
+ error = SET_ERROR(EINVAL);
goto out;
+ /* NOTREACHED */
}
- error = ddi_copyin((void *)(uintptr_t)zc_iocparm->zfs_cmd, zc,
- sizeof(zfs_cmd_t), flag);
- if (error != 0) {
- error = SET_ERROR(EFAULT);
- goto out;
+
+ if (compat) {
+ ASSERT(sizeof(zfs_cmd_t) >= zc_iocparm->zfs_cmd_size);
+ compat_zc = kmem_zalloc(sizeof(zfs_cmd_t), KM_SLEEP);
+ bzero(compat_zc, sizeof(zfs_cmd_t));
+
+ error = ddi_copyin((void *)(uintptr_t)zc_iocparm->zfs_cmd,
+ compat_zc, zc_iocparm->zfs_cmd_size, flag);
+ if (error != 0) {
+ error = SET_ERROR(EFAULT);
+ goto out;
+ }
+ } else {
+ error = ddi_copyin((void *)(uintptr_t)zc_iocparm->zfs_cmd,
+ zc, zc_iocparm->zfs_cmd_size, flag);
+ if (error != 0) {
+ error = SET_ERROR(EFAULT);
+ goto out;
+ }
}
}
if (compat) {
- zfs_cmd_compat_get(zc, arg, cflag);
+ if (newioc) {
+ ASSERT(compat_zc != NULL);
+ zfs_cmd_compat_get(zc, compat_zc, cflag);
+ } else {
+ ASSERT(compat_zc == NULL);
+ zfs_cmd_compat_get(zc, arg, cflag);
+ }
oldvecnum = vecnum;
error = zfs_ioctl_compat_pre(zc, &vecnum, cflag);
if (error != 0)
@@ -5867,7 +6442,7 @@
break;
}
- if (error == 0 && !(flag & FKIOCTL))
+ if (error == 0)
error = vec->zvec_secpolicy(zc, innvl, cr);
if (error != 0)
@@ -5874,7 +6449,7 @@
goto out;
/* legacy ioctls can modify zc_name */
- len = strcspn(zc->zc_name, "/@") + 1;
+ len = strcspn(zc->zc_name, "/@#") + 1;
saved_poolname = kmem_alloc(len, KM_SLEEP);
(void) strlcpy(saved_poolname, zc->zc_name, len);
@@ -5915,7 +6490,7 @@
fnvlist_free(lognv);
/* rewrite outnvl for backwards compatibility */
- if (cflag != ZFS_CMD_COMPAT_NONE && cflag != ZFS_CMD_COMPAT_LZC)
+ if (compat)
outnvl = zfs_ioctl_compat_outnvl(zc, outnvl, vecnum,
cflag);
@@ -5940,17 +6515,30 @@
out:
nvlist_free(innvl);
- if (compat) {
- zfs_ioctl_compat_post(zc, cmd, cflag);
- zfs_cmd_compat_put(zc, arg, vecnum, cflag);
- }
-
#ifdef illumos
rc = ddi_copyout(zc, (void *)arg, sizeof (zfs_cmd_t), flag);
if (error == 0 && rc != 0)
error = SET_ERROR(EFAULT);
#else
- if (newioc) {
+ if (compat) {
+ zfs_ioctl_compat_post(zc, cmd, cflag);
+ if (newioc) {
+ ASSERT(compat_zc != NULL);
+ ASSERT(sizeof(zfs_cmd_t) >= zc_iocparm->zfs_cmd_size);
+
+ zfs_cmd_compat_put(zc, compat_zc, vecnum, cflag);
+ rc = ddi_copyout(compat_zc,
+ (void *)(uintptr_t)zc_iocparm->zfs_cmd,
+ zc_iocparm->zfs_cmd_size, flag);
+ if (error == 0 && rc != 0)
+ error = SET_ERROR(EFAULT);
+ kmem_free(compat_zc, sizeof (zfs_cmd_t));
+ } else {
+ zfs_cmd_compat_put(zc, arg, vecnum, cflag);
+ }
+ } else {
+ ASSERT(newioc);
+
rc = ddi_copyout(zc, (void *)(uintptr_t)zc_iocparm->zfs_cmd,
sizeof (zfs_cmd_t), flag);
if (error == 0 && rc != 0)
@@ -5967,19 +6555,11 @@
strfree(saved_poolname);
}
-#ifdef illumos
kmem_free(zc, sizeof (zfs_cmd_t));
-#else
- /*
- * We don't alloc/free zc only if talking to library ioctl version 2
- */
- if (cflag != ZFS_CMD_COMPAT_LZC)
- kmem_free(zc, sizeof (zfs_cmd_t));
-#endif
return (error);
}
-#ifdef sun
+#ifdef illumos
static int
zfs_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
@@ -6030,7 +6610,7 @@
return (DDI_FAILURE);
}
-#endif /* sun */
+#endif /* illumos */
/*
* OK, so this is a little weird.
@@ -6041,7 +6621,7 @@
* /dev/zfs has basically nothing to do except serve up ioctls,
* so most of the standard driver entry points are in zvol.c.
*/
-#ifdef sun
+#ifdef illumos
static struct cb_ops zfs_cb_ops = {
zfsdev_open, /* open */
zfsdev_close, /* close */
@@ -6090,7 +6670,7 @@
(void *)&zfs_modldrv,
NULL
};
-#endif /* sun */
+#endif /* illumos */
static struct cdevsw zfs_cdevsw = {
.d_version = D_VERSION,
@@ -6123,7 +6703,7 @@
static struct root_hold_token *zfs_root_token;
struct proc *zfsproc;
-#ifdef sun
+#ifdef illumos
int
_init(void)
{
@@ -6186,56 +6766,111 @@
{
return (mod_info(&modlinkage, modinfop));
}
-#endif /* sun */
+#endif /* illumos */
-static int
-zfs_modevent(module_t mod, int type, void *unused __unused)
+static int zfs__init(void);
+static int zfs__fini(void);
+static void zfs_shutdown(void *, int);
+
+static eventhandler_tag zfs_shutdown_event_tag;
+
+#ifdef __FreeBSD__
+#define ZFS_MIN_KSTACK_PAGES 4
+#endif
+
+int
+zfs__init(void)
{
- int error = 0;
- switch (type) {
- case MOD_LOAD:
- zfs_root_token = root_mount_hold("ZFS");
+#ifdef __FreeBSD__
+#if KSTACK_PAGES < ZFS_MIN_KSTACK_PAGES
+ printf("ZFS NOTICE: KSTACK_PAGES is %d which could result in stack "
+ "overflow panic!\nPlease consider adding "
+ "'options KSTACK_PAGES=%d' to your kernel config\n", KSTACK_PAGES,
+ ZFS_MIN_KSTACK_PAGES);
+#endif
+#endif
+ zfs_root_token = root_mount_hold("ZFS");
- mutex_init(&zfs_share_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&zfs_share_lock, NULL, MUTEX_DEFAULT, NULL);
- spa_init(FREAD | FWRITE);
- zfs_init();
- zvol_init();
- zfs_ioctl_init();
+ spa_init(FREAD | FWRITE);
+ zfs_init();
+ zvol_init();
+ zfs_ioctl_init();
- tsd_create(&zfs_fsyncer_key, NULL);
- tsd_create(&rrw_tsd_key, rrw_tsd_destroy);
- tsd_create(&zfs_allow_log_key, zfs_allow_log_destroy);
+ tsd_create(&zfs_fsyncer_key, NULL);
+ tsd_create(&rrw_tsd_key, rrw_tsd_destroy);
+ tsd_create(&zfs_allow_log_key, zfs_allow_log_destroy);
+ tsd_create(&zfs_geom_probe_vdev_key, NULL);
- printf("ZFS storage pool version: features support (" SPA_VERSION_STRING ")\n");
- root_mount_rel(zfs_root_token);
+ printf("ZFS storage pool version: features support (" SPA_VERSION_STRING ")\n");
+ root_mount_rel(zfs_root_token);
- zfsdev_init();
- break;
- case MOD_UNLOAD:
- if (spa_busy() || zfs_busy() || zvol_busy() ||
- zio_injection_enabled) {
- error = EBUSY;
- break;
- }
+ zfsdev_init();
- zfsdev_fini();
- zvol_fini();
- zfs_fini();
- spa_fini();
+ return (0);
+}
- tsd_destroy(&zfs_fsyncer_key);
- tsd_destroy(&rrw_tsd_key);
- tsd_destroy(&zfs_allow_log_key);
+int
+zfs__fini(void)
+{
+ if (spa_busy() || zfs_busy() || zvol_busy() ||
+ zio_injection_enabled) {
+ return (EBUSY);
+ }
- mutex_destroy(&zfs_share_lock);
- break;
+ zfsdev_fini();
+ zvol_fini();
+ zfs_fini();
+ spa_fini();
+
+ tsd_destroy(&zfs_fsyncer_key);
+ tsd_destroy(&rrw_tsd_key);
+ tsd_destroy(&zfs_allow_log_key);
+
+ mutex_destroy(&zfs_share_lock);
+
+ return (0);
+}
+
+static void
+zfs_shutdown(void *arg __unused, int howto __unused)
+{
+
+ /*
+ * ZFS fini routines cannot work properly in a panicked system.
+ */
+ if (panicstr == NULL)
+ (void)zfs__fini();
+}
+
+
+static int
+zfs_modevent(module_t mod, int type, void *unused __unused)
+{
+ int err;
+
+ switch (type) {
+ case MOD_LOAD:
+ err = zfs__init();
+ if (err == 0)
+ zfs_shutdown_event_tag = EVENTHANDLER_REGISTER(
+ shutdown_post_sync, zfs_shutdown, NULL,
+ SHUTDOWN_PRI_FIRST);
+ return (err);
+ case MOD_UNLOAD:
+ err = zfs__fini();
+ if (err == 0 && zfs_shutdown_event_tag != NULL)
+ EVENTHANDLER_DEREGISTER(shutdown_post_sync,
+ zfs_shutdown_event_tag);
+ return (err);
+ case MOD_SHUTDOWN:
+ return (0);
default:
- error = EOPNOTSUPP;
break;
}
- return (error);
+ return (EOPNOTSUPP);
}
static moduledata_t zfs_mod = {
@@ -6247,3 +6882,4 @@
MODULE_VERSION(zfsctrl, 1);
MODULE_DEPEND(zfsctrl, opensolaris, 1, 1, 1);
MODULE_DEPEND(zfsctrl, krpc, 1, 1, 1);
+MODULE_DEPEND(zfsctrl, acl_nfs4, 1, 1, 1);
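Earlier in this file's diff, zfsdev_ioctl() now distinguishes callers in two steps: any argument that is not sizeof(zfs_iocparm_t) is treated as a legacy binary and classified purely by its byte length, while new-style callers pass a zfs_iocparm_t whose zfs_ioctl_version selects the compatibility translation into a separately allocated compat_zc buffer. The stand-alone sketch below mirrors only the size-based part of that dispatch; all struct names, sizes, and compat constants here are illustrative stand-ins, not the kernel's definitions.

#include <stddef.h>
#include <stdio.h>
#include <errno.h>

enum compat { COMPAT_NONE, COMPAT_LZC, COMPAT_DEADMAN, COMPAT_V28, COMPAT_V15 };

struct ioc_parm    { char pad[24];  };	/* stand-in for zfs_iocparm_t     */
struct cmd_lzc     { char pad[896]; };	/* stand-in for zfs_cmd_zcmd_t    */
struct cmd_deadman { char pad[768]; };	/* stand-in for zfs_cmd_deadman_t */
struct cmd_v28     { char pad[640]; };	/* stand-in for zfs_cmd_v28_t     */
struct cmd_v15     { char pad[512]; };	/* stand-in for zfs_cmd_v15_t     */

/* Map the ioctl argument length to a compatibility mode, or fail with EINVAL. */
static int
classify_ioctl(size_t len, enum compat *how)
{
	if (len == sizeof(struct ioc_parm)) {
		*how = COMPAT_NONE;		/* "new" style argument */
		return (0);
	}
	switch (len) {
	case sizeof(struct cmd_lzc):		*how = COMPAT_LZC;	return (0);
	case sizeof(struct cmd_deadman):	*how = COMPAT_DEADMAN;	return (0);
	case sizeof(struct cmd_v28):		*how = COMPAT_V28;	return (0);
	case sizeof(struct cmd_v15):		*how = COMPAT_V15;	return (0);
	default:				return (EINVAL);
	}
}

int
main(void)
{
	enum compat how;

	if (classify_ioctl(sizeof(struct cmd_v15), &how) == 0)
		printf("v15-sized argument -> compat mode %d\n", how);
	return (0);
}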
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_log.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,6 +21,8 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
*/
#include <sys/types.h>
@@ -349,7 +351,7 @@
*/
void
zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
- znode_t *dzp, char *name, uint64_t foid)
+ znode_t *dzp, char *name, uint64_t foid)
{
itx_t *itx;
lr_remove_t *lr;
@@ -373,7 +375,7 @@
*/
void
zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
- znode_t *dzp, znode_t *zp, char *name)
+ znode_t *dzp, znode_t *zp, char *name)
{
itx_t *itx;
lr_link_t *lr;
@@ -428,7 +430,7 @@
*/
void
zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
- znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp)
+ znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp)
{
itx_t *itx;
lr_rename_t *lr;
@@ -456,23 +458,20 @@
void
zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
- znode_t *zp, offset_t off, ssize_t resid, int ioflag)
+ znode_t *zp, offset_t off, ssize_t resid, int ioflag)
{
+ uint32_t blocksize = zp->z_blksz;
itx_wr_state_t write_state;
- boolean_t slogging;
uintptr_t fsync_cnt;
- ssize_t immediate_write_sz;
if (zil_replaying(zilog, tx) || zp->z_unlinked)
return;
- immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
- ? 0 : zfs_immediate_write_sz;
-
- slogging = spa_has_slogs(zilog->zl_spa) &&
- (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
- if (resid > immediate_write_sz && !slogging && resid <= zp->z_blksz)
+ if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
write_state = WR_INDIRECT;
+ else if (!spa_has_slogs(zilog->zl_spa) &&
+ resid >= zfs_immediate_write_sz)
+ write_state = WR_INDIRECT;
else if (ioflag & (FSYNC | FDSYNC))
write_state = WR_COPIED;
else
@@ -485,30 +484,26 @@
while (resid) {
itx_t *itx;
lr_write_t *lr;
- ssize_t len;
+ itx_wr_state_t wr_state = write_state;
+ ssize_t len = resid;
- /*
- * If the write would overflow the largest block then split it.
- */
- if (write_state != WR_INDIRECT && resid > ZIL_MAX_LOG_DATA)
- len = SPA_MAXBLOCKSIZE >> 1;
- else
- len = resid;
+ if (wr_state == WR_COPIED && resid > ZIL_MAX_COPIED_DATA)
+ wr_state = WR_NEED_COPY;
+ else if (wr_state == WR_INDIRECT)
+ len = MIN(blocksize - P2PHASE(off, blocksize), resid);
itx = zil_itx_create(txtype, sizeof (*lr) +
- (write_state == WR_COPIED ? len : 0));
+ (wr_state == WR_COPIED ? len : 0));
lr = (lr_write_t *)&itx->itx_lr;
- if (write_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os,
+ if (wr_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os,
zp->z_id, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
zil_itx_destroy(itx);
itx = zil_itx_create(txtype, sizeof (*lr));
lr = (lr_write_t *)&itx->itx_lr;
- write_state = WR_NEED_COPY;
+ wr_state = WR_NEED_COPY;
}
- itx->itx_wr_state = write_state;
- if (write_state == WR_NEED_COPY)
- itx->itx_sod += len;
+ itx->itx_wr_state = wr_state;
lr->lr_foid = zp->z_id;
lr->lr_offset = off;
lr->lr_length = len;
@@ -533,7 +528,7 @@
*/
void
zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype,
- znode_t *zp, uint64_t off, uint64_t len)
+ znode_t *zp, uint64_t off, uint64_t len)
{
itx_t *itx;
lr_truncate_t *lr;
@@ -556,7 +551,7 @@
*/
void
zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
- znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp)
+ znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp)
{
itx_t *itx;
lr_setattr_t *lr;
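The zfs_log_write() rework above drops the old "split at half of SPA_MAXBLOCKSIZE" rule: WR_COPIED records fall back to WR_NEED_COPY past ZIL_MAX_COPIED_DATA, and WR_INDIRECT records never cross a file-block boundary, which is what MIN(blocksize - P2PHASE(off, blocksize), resid) computes. A small stand-alone sketch of that chunking arithmetic, with hypothetical offsets and a 128K block size:

#include <stdio.h>

#define	P2PHASE(x, align)	((x) & ((align) - 1))	/* offset within a power-of-2 block */
#define	MIN(a, b)		((a) < (b) ? (a) : (b))

int
main(void)
{
	unsigned long blocksize = 131072;	/* file block size (power of two) */
	unsigned long off = 100000;		/* starting file offset           */
	unsigned long resid = 400000;		/* bytes left to log              */

	while (resid > 0) {
		/* Each record covers at most the remainder of the current block. */
		unsigned long len = MIN(blocksize - P2PHASE(off, blocksize), resid);

		printf("log record: off=%lu len=%lu\n", off, len);
		off += len;
		resid -= len;
	}
	return (0);
}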
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_onexit.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_onexit.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_onexit.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -123,10 +123,11 @@
{
file_t *fp, *tmpfp;
zfs_onexit_t *zo;
+ cap_rights_t rights;
void *data;
int error;
- fp = getf(fd);
+ fp = getf(fd, cap_rights_init(&rights));
if (fp == NULL)
return (SET_ERROR(EBADF));
@@ -137,7 +138,7 @@
*minorp = (minor_t)(uintptr_t)data;
curthread->td_fpop = tmpfp;
if (error != 0)
- return (error);
+ return (SET_ERROR(EBADF));
return (zfs_onexit_minor_to_state(*minorp, &zo));
}
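The getf() change above threads a Capsicum cap_rights_t through the in-kernel file-descriptor lookup (initialized empty here). For reference, the same rights machinery is exposed to FreeBSD userland; the sketch below is a plain user-space illustration with a hypothetical path and an arbitrary rights set, not code from this commit.

#include <sys/capsicum.h>
#include <fcntl.h>
#include <unistd.h>

int
main(void)
{
	cap_rights_t rights;
	int fd = open("/tmp/example", O_RDONLY);	/* path is illustrative */

	if (fd >= 0) {
		/* Restrict the descriptor to read and fstat operations only. */
		cap_rights_init(&rights, CAP_READ, CAP_FSTAT);
		(void) cap_rights_limit(fd, &rights);
		close(fd);
	}
	return (0);
}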
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_replay.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,7 +21,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
*/
#include <sys/types.h>
@@ -55,7 +55,7 @@
static void
zfs_init_vattr(vattr_t *vap, uint64_t mask, uint64_t mode,
- uint64_t uid, uint64_t gid, uint64_t rdev, uint64_t nodeid)
+ uint64_t uid, uint64_t gid, uint64_t rdev, uint64_t nodeid)
{
VATTR_NULL(vap);
vap->va_mask = (uint_t)mask;
@@ -821,7 +821,7 @@
static int
zfs_replay_truncate(zfsvfs_t *zfsvfs, lr_truncate_t *lr, boolean_t byteswap)
{
-#ifdef sun
+#ifdef illumos
znode_t *zp;
flock64_t fl;
int error;
@@ -844,10 +844,10 @@
VN_RELE(ZTOV(zp));
return (error);
-#else /* !sun */
+#else
ZFS_LOG(0, "Unexpected code path, report to pjd at FreeBSD.org");
return (EOPNOTSUPP);
-#endif /* !sun */
+#endif
}
static int
@@ -906,11 +906,15 @@
return (error);
}
+extern int zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
+ caller_context_t *ct);
+
static int
zfs_replay_acl_v0(zfsvfs_t *zfsvfs, lr_acl_v0_t *lr, boolean_t byteswap)
{
ace_t *ace = (ace_t *)(lr + 1); /* ace array follows lr_acl_t */
vsecattr_t vsa;
+ vnode_t *vp;
znode_t *zp;
int error;
@@ -929,13 +933,12 @@
vsa.vsa_aclflags = 0;
vsa.vsa_aclentp = ace;
-#ifdef TODO
- error = VOP_SETSECATTR(ZTOV(zp), &vsa, 0, kcred, NULL);
-#else
- panic("%s:%u: unsupported condition", __func__, __LINE__);
-#endif
+ vp = ZTOV(zp);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ error = zfs_setsecattr(vp, &vsa, 0, kcred, NULL);
+ VOP_UNLOCK(vp, 0);
- VN_RELE(ZTOV(zp));
+ VN_RELE(vp);
return (error);
}
@@ -960,6 +963,7 @@
ace_t *ace = (ace_t *)(lr + 1);
vsecattr_t vsa;
znode_t *zp;
+ vnode_t *vp;
int error;
if (byteswap) {
@@ -975,7 +979,6 @@
if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
return (error);
-#ifdef TODO
bzero(&vsa, sizeof (vsa));
vsa.vsa_mask = VSA_ACE | VSA_ACECNT | VSA_ACE_ACLFLAGS;
vsa.vsa_aclcnt = lr->lr_aclcnt;
@@ -992,16 +995,16 @@
lr->lr_fuidcnt, lr->lr_domcnt, 0, 0);
}
- error = VOP_SETSECATTR(ZTOV(zp), &vsa, 0, kcred, NULL);
+ vp = ZTOV(zp);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ error = zfs_setsecattr(vp, &vsa, 0, kcred, NULL);
+ VOP_UNLOCK(vp, 0);
if (zfsvfs->z_fuid_replay)
zfs_fuid_info_free(zfsvfs->z_fuid_replay);
-#else
- error = EOPNOTSUPP;
-#endif
zfsvfs->z_fuid_replay = NULL;
- VN_RELE(ZTOV(zp));
+ VN_RELE(vp);
return (error);
}
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -23,8 +23,7 @@
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
*/
-#include <sys/types.h>
-#include <sys/param.h>
+#include <sys/zfs_context.h>
#include <sys/vnode.h>
#include <sys/sa.h>
#include <sys/zfs_acl.h>
@@ -126,7 +125,7 @@
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
xoptattr_t *xoap;
- ASSERT(MUTEX_HELD(&zp->z_lock));
+ ASSERT_VOP_LOCKED(ZTOV(zp), __func__);
VERIFY((xoap = xva_getxoptattr(xvap)) != NULL);
if (zp->z_is_sa) {
if (sa_lookup(zp->z_sa_hdl, SA_ZPL_SCANSTAMP(zfsvfs),
@@ -160,7 +159,7 @@
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
xoptattr_t *xoap;
- ASSERT(MUTEX_HELD(&zp->z_lock));
+ ASSERT_VOP_ELOCKED(ZTOV(zp), __func__);
VERIFY((xoap = xva_getxoptattr(xvap)) != NULL);
if (zp->z_is_sa)
VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SCANSTAMP(zfsvfs),
@@ -207,7 +206,6 @@
uint64_t crtime[2], mtime[2], ctime[2];
zfs_acl_phys_t znode_acl;
char scanstamp[AV_SCANSTAMP_SZ];
- boolean_t drop_lock = B_FALSE;
/*
* No upgrade if ACL isn't cached
@@ -219,8 +217,8 @@
return;
/*
- * If the z_lock is held and we aren't the owner
- * the just return since we don't want to deadlock
+ * If the vnode lock is held and we aren't the owner
+ * then just return since we don't want to deadlock
* trying to update the status of z_is_sa. This
* file can then be upgraded at a later time.
*
@@ -227,12 +225,8 @@
* Otherwise, we know we are doing the
* sa_update() that caused us to enter this function.
*/
- if (mutex_owner(&zp->z_lock) != curthread) {
- if (mutex_tryenter(&zp->z_lock) == 0)
+ if (vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_NOWAIT) != 0)
return;
- else
- drop_lock = B_TRUE;
- }
/* First do a bulk query of the attributes that aren't cached */
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
@@ -313,8 +307,7 @@
zp->z_is_sa = B_TRUE;
done:
- if (drop_lock)
- mutex_exit(&zp->z_lock);
+ VOP_UNLOCK(ZTOV(zp), 0);
}
void
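The zfs_sa_upgrade() change above replaces the z_lock owner check with a single non-blocking vn_lock(LK_EXCLUSIVE | LK_NOWAIT) attempt: if the vnode lock is contended, the SA upgrade is simply deferred, and, as the comment in the hunk notes, retried the next time the file is touched. A minimal pthread sketch of that try-or-defer pattern; the names are illustrative and not ZFS code:

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t obj_lock = PTHREAD_MUTEX_INITIALIZER;

/* Run the upgrade only if the lock is free; return false to mean "deferred". */
static bool
try_upgrade(void (*upgrade)(void))
{
	if (pthread_mutex_trylock(&obj_lock) != 0)
		return (false);		/* contended: retry on a later access */
	upgrade();
	pthread_mutex_unlock(&obj_lock);
	return (true);
}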
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -23,7 +23,8 @@
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011 Pawel Jakub Dawidek <pawel at dawidek.net>.
* All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
*/
/* Portions Copyright 2010 Robert Milkowski */
@@ -134,6 +135,13 @@
if (panicstr)
return (0);
+ /*
+ * Ignore the system syncher. ZFS already commits async data
+ * at zfs_txg_timeout intervals.
+ */
+ if (waitfor == MNT_LAZY)
+ return (0);
+
if (vfsp != NULL) {
/*
* Sync a specific filesystem.
@@ -174,7 +182,7 @@
return (0);
}
-#ifndef __FreeBSD__
+#ifndef __FreeBSD_kernel__
static int
zfs_create_unique_device(dev_t *dev)
{
@@ -226,7 +234,7 @@
return (0);
}
-#endif /* !__FreeBSD__ */
+#endif /* !__FreeBSD_kernel__ */
static void
atime_changed_cb(void *arg, uint64_t newval)
@@ -272,11 +280,10 @@
blksz_changed_cb(void *arg, uint64_t newval)
{
zfsvfs_t *zfsvfs = arg;
+ ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os)));
+ ASSERT3U(newval, >=, SPA_MINBLOCKSIZE);
+ ASSERT(ISP2(newval));
- if (newval < SPA_MINBLOCKSIZE ||
- newval > SPA_MAXBLOCKSIZE || !ISP2(newval))
- newval = SPA_MAXBLOCKSIZE;
-
zfsvfs->z_max_blksz = newval;
zfsvfs->z_vfs->mnt_stat.f_iosize = newval;
}
@@ -468,6 +475,19 @@
}
/*
+ * We need to enter pool configuration here, so that we can use
+ * dsl_prop_get_int_ds() to handle the special nbmand property below.
+ * dsl_prop_get_integer() cannot be used because it has to acquire
+ * spa_namespace_lock, and we cannot do that because we already hold
+ * z_teardown_lock. The problem is that spa_config_sync() is called
+ * with spa_namespace_lock held and the function calls ZFS vnode
+ * operations to write the cache file and thus z_teardown_lock is
+ * acquired after spa_namespace_lock.
+ */
+ ds = dmu_objset_ds(os);
+ dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
+
+ /*
* nbmand is a special property. It can only be changed at
* mount time.
*
@@ -478,14 +498,9 @@
nbmand = B_FALSE;
} else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
nbmand = B_TRUE;
- } else {
- char osname[MAXNAMELEN];
-
- dmu_objset_name(os, osname);
- if (error = dsl_prop_get_integer(osname, "nbmand", &nbmand,
- NULL)) {
- return (error);
- }
+ } else if (error = dsl_prop_get_int_ds(ds, "nbmand", &nbmand) != 0) {
+ dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
+ return (error);
}
/*
@@ -495,8 +510,6 @@
* the first prop_register(), but I guess I like to go
* overboard...
*/
- ds = dmu_objset_ds(os);
- dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
error = dsl_prop_register(ds,
zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs);
error = error ? error : dsl_prop_register(ds,
@@ -545,35 +558,7 @@
return (0);
unregister:
- /*
- * We may attempt to unregister some callbacks that are not
- * registered, but this is OK; it will simply return ENOMSG,
- * which we will ignore.
- */
- (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_ATIME),
- atime_changed_cb, zfsvfs);
- (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_XATTR),
- xattr_changed_cb, zfsvfs);
- (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
- blksz_changed_cb, zfsvfs);
- (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_READONLY),
- readonly_changed_cb, zfsvfs);
-#ifdef illumos
- (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_DEVICES),
- devices_changed_cb, zfsvfs);
-#endif
- (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_SETUID),
- setuid_changed_cb, zfsvfs);
- (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_EXEC),
- exec_changed_cb, zfsvfs);
- (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_SNAPDIR),
- snapdir_changed_cb, zfsvfs);
- (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_ACLMODE),
- acl_mode_changed_cb, zfsvfs);
- (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_ACLINHERIT),
- acl_inherit_changed_cb, zfsvfs);
- (void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_VSCAN),
- vscan_changed_cb, zfsvfs);
+ dsl_prop_unregister_all(ds, zfsvfs);
return (error);
}
@@ -863,61 +848,46 @@
return (zfs_fuid_overquota(zfsvfs, isgroup, fuid));
}
-int
-zfsvfs_create(const char *osname, zfsvfs_t **zfvp)
+/*
+ * Associate this zfsvfs with the given objset, which must be owned.
+ * This will cache a bunch of on-disk state from the objset in the
+ * zfsvfs.
+ */
+static int
+zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os)
{
- objset_t *os;
- zfsvfs_t *zfsvfs;
- uint64_t zval;
- int i, error;
- uint64_t sa_obj;
+ int error;
+ uint64_t val;
- zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
-
- /*
- * We claim to always be readonly so we can open snapshots;
- * other ZPL code will prevent us from writing to snapshots.
- */
- error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os);
- if (error) {
- kmem_free(zfsvfs, sizeof (zfsvfs_t));
- return (error);
- }
-
- /*
- * Initialize the zfs-specific filesystem structure.
- * Should probably make this a kmem cache, shuffle fields,
- * and just bzero up to z_hold_mtx[].
- */
- zfsvfs->z_vfs = NULL;
- zfsvfs->z_parent = zfsvfs;
- zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE;
+ zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE;
zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
zfsvfs->z_os = os;
error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
- if (error) {
- goto out;
- } else if (zfsvfs->z_version >
+ if (error != 0)
+ return (error);
+ if (zfsvfs->z_version >
zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) {
(void) printf("Can't mount a version %lld file system "
"on a version %lld pool\n. Pool must be upgraded to mount "
"this file system.", (u_longlong_t)zfsvfs->z_version,
(u_longlong_t)spa_version(dmu_objset_spa(os)));
- error = SET_ERROR(ENOTSUP);
- goto out;
+ return (SET_ERROR(ENOTSUP));
}
- if ((error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &zval)) != 0)
- goto out;
- zfsvfs->z_norm = (int)zval;
+ error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val);
+ if (error != 0)
+ return (error);
+ zfsvfs->z_norm = (int)val;
- if ((error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &zval)) != 0)
- goto out;
- zfsvfs->z_utf8 = (zval != 0);
+ error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val);
+ if (error != 0)
+ return (error);
+ zfsvfs->z_utf8 = (val != 0);
- if ((error = zfs_get_zplprop(os, ZFS_PROP_CASE, &zval)) != 0)
- goto out;
- zfsvfs->z_case = (uint_t)zval;
+ error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val);
+ if (error != 0)
+ return (error);
+ zfsvfs->z_case = (uint_t)val;
/*
* Fold case on file systems that are always or sometimes case
@@ -930,24 +900,19 @@
zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
+ uint64_t sa_obj = 0;
if (zfsvfs->z_use_sa) {
/* should either have both of these objects or none */
error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
&sa_obj);
- if (error)
+ if (error != 0)
return (error);
- } else {
- /*
- * Pre SA versions file systems should never touch
- * either the attribute registration or layout objects.
- */
- sa_obj = 0;
}
error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
&zfsvfs->z_attr_table);
- if (error)
- goto out;
+ if (error != 0)
+ return (error);
if (zfsvfs->z_version >= ZPL_VERSION_SA)
sa_register_update_callback(os, zfs_sa_upgrade);
@@ -954,55 +919,117 @@
error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
&zfsvfs->z_root);
- if (error)
- goto out;
+ if (error != 0)
+ return (error);
ASSERT(zfsvfs->z_root != 0);
error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
&zfsvfs->z_unlinkedobj);
- if (error)
- goto out;
+ if (error != 0)
+ return (error);
error = zap_lookup(os, MASTER_NODE_OBJ,
zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA],
8, 1, &zfsvfs->z_userquota_obj);
- if (error && error != ENOENT)
- goto out;
+ if (error == ENOENT)
+ zfsvfs->z_userquota_obj = 0;
+ else if (error != 0)
+ return (error);
error = zap_lookup(os, MASTER_NODE_OBJ,
zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA],
8, 1, &zfsvfs->z_groupquota_obj);
- if (error && error != ENOENT)
- goto out;
+ if (error == ENOENT)
+ zfsvfs->z_groupquota_obj = 0;
+ else if (error != 0)
+ return (error);
error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
&zfsvfs->z_fuid_obj);
- if (error && error != ENOENT)
- goto out;
+ if (error == ENOENT)
+ zfsvfs->z_fuid_obj = 0;
+ else if (error != 0)
+ return (error);
error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1,
&zfsvfs->z_shares_dir);
- if (error && error != ENOENT)
- goto out;
+ if (error == ENOENT)
+ zfsvfs->z_shares_dir = 0;
+ else if (error != 0)
+ return (error);
+ /*
+ * Only use the name cache if we are looking for a
+ * name on a file system that does not require normalization
+ * or case folding. We can also look there if we happen to be
+ * on a non-normalizing, mixed sensitivity file system IF we
+ * are looking for the exact name (which is always the case on
+ * FreeBSD).
+ */
+ zfsvfs->z_use_namecache = !zfsvfs->z_norm ||
+ ((zfsvfs->z_case == ZFS_CASE_MIXED) &&
+ !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER));
+
+ return (0);
+}
+
+int
+zfsvfs_create(const char *osname, zfsvfs_t **zfvp)
+{
+ objset_t *os;
+ zfsvfs_t *zfsvfs;
+ int error;
+
+ /*
+ * XXX: Fix struct statfs so this isn't necessary!
+ *
+ * The 'osname' is used as the filesystem's special node, which means
+ * it must fit in statfs.f_mntfromname, or else it can't be
+ * enumerated, so libzfs_mnttab_find() returns NULL, which causes
+ * 'zfs unmount' to think it's not mounted when it is.
+ */
+ if (strlen(osname) >= MNAMELEN)
+ return (SET_ERROR(ENAMETOOLONG));
+
+ zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
+
+ /*
+ * We claim to always be readonly so we can open snapshots;
+ * other ZPL code will prevent us from writing to snapshots.
+ */
+ error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os);
+ if (error) {
+ kmem_free(zfsvfs, sizeof (zfsvfs_t));
+ return (error);
+ }
+
+ zfsvfs->z_vfs = NULL;
+ zfsvfs->z_parent = zfsvfs;
+
mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
offsetof(znode_t, z_link_node));
- rrw_init(&zfsvfs->z_teardown_lock, B_FALSE);
+#ifdef DIAGNOSTIC
+ rrm_init(&zfsvfs->z_teardown_lock, B_TRUE);
+#else
+ rrm_init(&zfsvfs->z_teardown_lock, B_FALSE);
+#endif
rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
- for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
+ for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++)
mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
+ error = zfsvfs_init(zfsvfs, os);
+ if (error != 0) {
+ dmu_objset_disown(os, zfsvfs);
+ *zfvp = NULL;
+ kmem_free(zfsvfs, sizeof (zfsvfs_t));
+ return (error);
+ }
+
*zfvp = zfsvfs;
return (0);
-
-out:
- dmu_objset_disown(os, zfsvfs);
- *zfvp = NULL;
- kmem_free(zfsvfs, sizeof (zfsvfs_t));
- return (error);
}
static int
@@ -1014,13 +1041,6 @@
if (error)
return (error);
- /*
- * Set the objset user_ptr to track its zfsvfs.
- */
- mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
- dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
- mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
-
zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
/*
@@ -1081,6 +1101,13 @@
zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */
}
+ /*
+ * Set the objset user_ptr to track its zfsvfs.
+ */
+ mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
+ dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
+ mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
+
return (0);
}
@@ -1105,7 +1132,7 @@
mutex_destroy(&zfsvfs->z_znodes_lock);
mutex_destroy(&zfsvfs->z_lock);
list_destroy(&zfsvfs->z_all_znodes);
- rrw_destroy(&zfsvfs->z_teardown_lock);
+ rrm_destroy(&zfsvfs->z_teardown_lock);
rw_destroy(&zfsvfs->z_teardown_inactive_lock);
rw_destroy(&zfsvfs->z_fuid_lock);
for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
@@ -1173,10 +1200,10 @@
vfsp->vfs_data = zfsvfs;
vfsp->mnt_flag |= MNT_LOCAL;
- vfsp->mnt_kern_flag |= MNTK_MPSAFE;
vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED;
vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES;
vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED;
+ vfsp->mnt_kern_flag |= MNTK_NO_IOPF; /* vn_io_fault can be used */
/*
* The fsid is 64 bits, composed of an 8-bit fs type, which
@@ -1225,9 +1252,6 @@
}
vfs_mountedfrom(vfsp, osname);
- /* Grab extra reference. */
- VERIFY(VFS_ROOT(vfsp, LK_EXCLUSIVE, &vp) == 0);
- VOP_UNLOCK(vp, 0);
if (!zfsvfs->z_issnap)
zfsctl_create(zfsvfs);
@@ -1236,7 +1260,7 @@
dmu_objset_disown(zfsvfs->z_os, zfsvfs);
zfsvfs_free(zfsvfs);
} else {
- atomic_add_32(&zfs_active_fs_count, 1);
+ atomic_inc_32(&zfs_active_fs_count);
}
return (error);
@@ -1246,43 +1270,9 @@
zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
{
objset_t *os = zfsvfs->z_os;
- struct dsl_dataset *ds;
- /*
- * Unregister properties.
- */
- if (!dmu_objset_is_snapshot(os)) {
- ds = dmu_objset_ds(os);
- VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb,
- zfsvfs) == 0);
-
- VERIFY(dsl_prop_unregister(ds, "xattr", xattr_changed_cb,
- zfsvfs) == 0);
-
- VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb,
- zfsvfs) == 0);
-
- VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb,
- zfsvfs) == 0);
-
- VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb,
- zfsvfs) == 0);
-
- VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb,
- zfsvfs) == 0);
-
- VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb,
- zfsvfs) == 0);
-
- VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb,
- zfsvfs) == 0);
-
- VERIFY(dsl_prop_unregister(ds, "aclinherit",
- acl_inherit_changed_cb, zfsvfs) == 0);
-
- VERIFY(dsl_prop_unregister(ds, "vscan",
- vscan_changed_cb, zfsvfs) == 0);
- }
+ if (!dmu_objset_is_snapshot(os))
+ dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs);
}
#ifdef SECLABEL
@@ -1628,13 +1618,21 @@
* can be interrogated.
*/
if ((uap->flags & MS_DATA) && uap->datalen > 0)
-#else
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * Get the objset name (the "special" mount argument).
+ */
+ if (error = pn_get(uap->spec, fromspace, &spn))
+ return (error);
+
+ osname = spn.pn_path;
+#else /* !illumos */
if (!prison_allow(td->td_ucred, PR_ALLOW_MOUNT_ZFS))
return (SET_ERROR(EPERM));
if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL))
return (SET_ERROR(EINVAL));
-#endif /* ! illumos */
/*
* If full-owner-access is enabled and delegated administration is
@@ -1644,6 +1642,7 @@
dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) {
secpolicy_fs_mount_clearopts(cr, vfsp);
}
+#endif /* illumos */
/*
* Check for mount privilege?
@@ -1706,9 +1705,19 @@
* according to those options set in the current VFS options.
*/
if (vfsp->vfs_flag & MS_REMOUNT) {
- /* refresh mount options */
- zfs_unregister_callbacks(vfsp->vfs_data);
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
+
+ /*
+ * Refresh mount options with z_teardown_lock blocking I/O while
+ * the filesystem is in an inconsistent state.
+ * The lock also serializes this code with filesystem
+ * manipulations between entry to zfs_suspend_fs() and return
+ * from zfs_resume_fs().
+ */
+ rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
+ zfs_unregister_callbacks(zfsvfs);
error = zfs_register_callbacks(vfsp);
+ rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
goto out;
}
@@ -1727,7 +1736,7 @@
error = zfs_domount(vfsp, osname);
PICKUP_GIANT();
-#ifdef sun
+#ifdef illumos
/*
* Add an extra VFS_HOLD on our parent vfs so that it can't
* disappear due to a forced unmount.
@@ -1734,7 +1743,7 @@
*/
if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap)
VFS_HOLD(mvp->v_vfsp);
-#endif /* sun */
+#endif
out:
return (error);
@@ -1792,23 +1801,12 @@
strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname,
sizeof(statp->f_mntonname));
- statp->f_namemax = ZFS_MAXNAMELEN;
+ statp->f_namemax = MAXNAMELEN - 1;
ZFS_EXIT(zfsvfs);
return (0);
}
-int
-zfs_vnode_lock(vnode_t *vp, int flags)
-{
- int error;
-
- ASSERT(vp != NULL);
-
- error = vn_lock(vp, flags);
- return (error);
-}
-
static int
zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp)
{
@@ -1816,7 +1814,7 @@
znode_t *rootzp;
int error;
- ZFS_ENTER_NOERROR(zfsvfs);
+ ZFS_ENTER(zfsvfs);
error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
if (error == 0)
@@ -1825,13 +1823,12 @@
ZFS_EXIT(zfsvfs);
if (error == 0) {
- error = zfs_vnode_lock(*vpp, flags);
- if (error == 0)
- (*vpp)->v_vflag |= VV_ROOT;
+ error = vn_lock(*vpp, flags);
+ if (error != 0) {
+ VN_RELE(*vpp);
+ *vpp = NULL;
+ }
}
- if (error != 0)
- *vpp = NULL;
-
return (error);
}
@@ -1846,7 +1843,7 @@
{
znode_t *zp;
- rrw_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
+ rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
if (!unmounting) {
/*
@@ -1879,7 +1876,7 @@
*/
if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
rw_exit(&zfsvfs->z_teardown_inactive_lock);
- rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
+ rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
return (SET_ERROR(EIO));
}
@@ -1906,7 +1903,7 @@
*/
if (unmounting) {
zfsvfs->z_unmounted = B_TRUE;
- rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
+ rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
rw_exit(&zfsvfs->z_teardown_inactive_lock);
}
@@ -1966,15 +1963,6 @@
if (zfsvfs->z_ctldir != NULL) {
if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0)
return (ret);
- ret = vflush(vfsp, 0, 0, td);
- ASSERT(ret == EBUSY);
- if (!(fflag & MS_FORCE)) {
- if (zfsvfs->z_ctldir->v_count > 1)
- return (EBUSY);
- ASSERT(zfsvfs->z_ctldir->v_count == 1);
- }
- zfsctl_destroy(zfsvfs);
- ASSERT(zfsvfs->z_ctldir == NULL);
}
if (fflag & MS_FORCE) {
@@ -1983,23 +1971,19 @@
* vflush(FORCECLOSE). This way we ensure no future vnops
* will be called and risk operating on DOOMED vnodes.
*/
- rrw_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
+ rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
zfsvfs->z_unmounted = B_TRUE;
- rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
+ rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
}
/*
* Flush all the files.
*/
- ret = vflush(vfsp, 1, (fflag & MS_FORCE) ? FORCECLOSE : 0, td);
- if (ret != 0) {
- if (!zfsvfs->z_issnap) {
- zfsctl_create(zfsvfs);
- ASSERT(zfsvfs->z_ctldir != NULL);
- }
+ ret = vflush(vfsp, 0, (fflag & MS_FORCE) ? FORCECLOSE : 0, td);
+ if (ret != 0)
return (ret);
- }
+#ifdef illumos
if (!(fflag & MS_FORCE)) {
/*
* Check the number of active vnodes in the file system.
@@ -2020,6 +2004,7 @@
return (SET_ERROR(EBUSY));
}
}
+#endif
VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
os = zfsvfs->z_os;
@@ -2047,12 +2032,6 @@
*/
if (zfsvfs->z_ctldir != NULL)
zfsctl_destroy(zfsvfs);
- if (zfsvfs->z_issnap) {
- vnode_t *svp = vfsp->mnt_vnodecovered;
-
- if (svp->v_count >= 2)
- VN_RELE(svp);
- }
zfs_freevfs(vfsp);
return (0);
@@ -2077,7 +2056,7 @@
ZFS_ENTER(zfsvfs);
err = zfs_zget(zfsvfs, ino, &zp);
if (err == 0 && zp->z_unlinked) {
- VN_RELE(ZTOV(zp));
+ vrele(ZTOV(zp));
err = EINVAL;
}
if (err == 0)
@@ -2084,11 +2063,9 @@
*vpp = ZTOV(zp);
ZFS_EXIT(zfsvfs);
if (err == 0)
- err = zfs_vnode_lock(*vpp, flags);
+ err = vn_lock(*vpp, flags);
if (err != 0)
*vpp = NULL;
- else
- (*vpp)->v_hash = ino;
return (err);
}
@@ -2115,8 +2092,10 @@
static int
zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp)
{
+ struct componentname cn;
zfsvfs_t *zfsvfs = vfsp->vfs_data;
znode_t *zp;
+ vnode_t *dvp;
uint64_t object = 0;
uint64_t fid_gen = 0;
uint64_t gen_mask;
@@ -2171,21 +2150,32 @@
if ((fid_gen == 0 &&
(object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) ||
(zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) {
- *vpp = zfsvfs->z_ctldir;
- ASSERT(*vpp != NULL);
+ ZFS_EXIT(zfsvfs);
+ VERIFY0(zfsctl_root(zfsvfs, LK_SHARED, &dvp));
if (object == ZFSCTL_INO_SNAPDIR) {
- VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL,
- 0, NULL, NULL, NULL, NULL, NULL) == 0);
+ cn.cn_nameptr = "snapshot";
+ cn.cn_namelen = strlen(cn.cn_nameptr);
+ cn.cn_nameiop = LOOKUP;
+ cn.cn_flags = ISLASTCN | LOCKLEAF;
+ cn.cn_lkflags = flags;
+ VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
+ vput(dvp);
} else if (object == zfsvfs->z_shares_dir) {
- VERIFY(zfsctl_root_lookup(*vpp, "shares", vpp, NULL,
- 0, NULL, NULL, NULL, NULL, NULL) == 0);
+ /*
+ * XXX This branch must not be taken;
+ * if it is, then the lookup below will
+ * explode.
+ */
+ cn.cn_nameptr = "shares";
+ cn.cn_namelen = strlen(cn.cn_nameptr);
+ cn.cn_nameiop = LOOKUP;
+ cn.cn_flags = ISLASTCN;
+ cn.cn_lkflags = flags;
+ VERIFY0(VOP_LOOKUP(dvp, vpp, &cn));
+ vput(dvp);
} else {
- VN_HOLD(*vpp);
+ *vpp = dvp;
}
- ZFS_EXIT(zfsvfs);
- err = zfs_vnode_lock(*vpp, flags);
- if (err != 0)
- *vpp = NULL;
return (err);
}
@@ -2203,7 +2193,7 @@
zp_gen = 1;
if (zp->z_unlinked || zp_gen != fid_gen) {
dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen);
- VN_RELE(ZTOV(zp));
+ vrele(ZTOV(zp));
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EINVAL));
}
@@ -2210,7 +2200,7 @@
*vpp = ZTOV(zp);
ZFS_EXIT(zfsvfs);
- err = zfs_vnode_lock(*vpp, flags | LK_RETRY);
+ err = vn_lock(*vpp, flags);
if (err == 0)
vnode_create_vobject(*vpp, zp->z_size, curthread);
else
@@ -2222,7 +2212,9 @@
* Block out VOPs and close zfsvfs_t::z_os
*
* Note, if successful, then we return with the 'z_teardown_lock' and
- * 'z_teardown_inactive_lock' write held.
+ * 'z_teardown_inactive_lock' write held. We leave ownership of the underlying
+ * dataset and objset intact so that they can be atomically handed off during
+ * a subsequent rollback or recv operation and the resume thereafter.
*/
int
zfs_suspend_fs(zfsvfs_t *zfsvfs)
@@ -2231,84 +2223,70 @@
if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
return (error);
- dmu_objset_disown(zfsvfs->z_os, zfsvfs);
return (0);
}
/*
- * Reopen zfsvfs_t::z_os and release VOPs.
+ * Rebuild SA and release VOPs. Note that ownership of the underlying dataset
+ * is an invariant across any of the operations that can be performed while the
+ * filesystem was suspended. Whether it succeeded or failed, the preconditions
+ * are the same: the relevant objset and associated dataset are owned by
+ * zfsvfs, held, and long held on entry.
*/
int
-zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname)
+zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
{
int err;
+ znode_t *zp;
- ASSERT(RRW_WRITE_HELD(&zfsvfs->z_teardown_lock));
+ ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock));
ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));
- err = dmu_objset_own(osname, DMU_OST_ZFS, B_FALSE, zfsvfs,
- &zfsvfs->z_os);
- if (err) {
- zfsvfs->z_os = NULL;
- } else {
- znode_t *zp;
- uint64_t sa_obj = 0;
+ /*
+ * We already own this, so just update the objset_t, as the one we
+ * had before may have been evicted.
+ */
+ objset_t *os;
+ VERIFY3P(ds->ds_owner, ==, zfsvfs);
+ VERIFY(dsl_dataset_long_held(ds));
+ VERIFY0(dmu_objset_from_ds(ds, &os));
- /*
- * Make sure version hasn't changed
- */
+ err = zfsvfs_init(zfsvfs, os);
+ if (err != 0)
+ goto bail;
- err = zfs_get_zplprop(zfsvfs->z_os, ZFS_PROP_VERSION,
- &zfsvfs->z_version);
+ VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
- if (err)
- goto bail;
+ zfs_set_fuid_feature(zfsvfs);
- err = zap_lookup(zfsvfs->z_os, MASTER_NODE_OBJ,
- ZFS_SA_ATTRS, 8, 1, &sa_obj);
-
- if (err && zfsvfs->z_version >= ZPL_VERSION_SA)
- goto bail;
-
- if ((err = sa_setup(zfsvfs->z_os, sa_obj,
- zfs_attr_table, ZPL_END, &zfsvfs->z_attr_table)) != 0)
- goto bail;
-
- if (zfsvfs->z_version >= ZPL_VERSION_SA)
- sa_register_update_callback(zfsvfs->z_os,
- zfs_sa_upgrade);
-
- VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
-
- zfs_set_fuid_feature(zfsvfs);
-
- /*
- * Attempt to re-establish all the active znodes with
- * their dbufs. If a zfs_rezget() fails, then we'll let
- * any potential callers discover that via ZFS_ENTER_VERIFY_VP
- * when they try to use their znode.
- */
- mutex_enter(&zfsvfs->z_znodes_lock);
- for (zp = list_head(&zfsvfs->z_all_znodes); zp;
- zp = list_next(&zfsvfs->z_all_znodes, zp)) {
- (void) zfs_rezget(zp);
- }
- mutex_exit(&zfsvfs->z_znodes_lock);
+ /*
+ * Attempt to re-establish all the active znodes with
+ * their dbufs. If a zfs_rezget() fails, then we'll let
+ * any potential callers discover that via ZFS_ENTER_VERIFY_VP
+ * when they try to use their znode.
+ */
+ mutex_enter(&zfsvfs->z_znodes_lock);
+ for (zp = list_head(&zfsvfs->z_all_znodes); zp;
+ zp = list_next(&zfsvfs->z_all_znodes, zp)) {
+ (void) zfs_rezget(zp);
}
+ mutex_exit(&zfsvfs->z_znodes_lock);
bail:
/* release the VOPs */
rw_exit(&zfsvfs->z_teardown_inactive_lock);
- rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
+ rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
if (err) {
/*
- * Since we couldn't reopen zfsvfs::z_os, or
- * setup the sa framework force unmount this file system.
+ * Since we couldn't setup the sa framework, try to force
+ * unmount this file system.
*/
- if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0)
+ if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) {
+ vfs_ref(zfsvfs->z_vfs);
(void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread);
+ }
}
return (err);
}
@@ -2318,7 +2296,7 @@
{
zfsvfs_t *zfsvfs = vfsp->vfs_data;
-#ifdef sun
+#ifdef illumos
/*
* If this is a snapshot, we have an extra VFS_HOLD on our parent
* from zfs_mount(). Release it here. If we came through
@@ -2327,11 +2305,11 @@
*/
if (zfsvfs->z_issnap && (vfsp != rootvfs))
VFS_RELE(zfsvfs->z_parent->z_vfs);
-#endif /* sun */
+#endif
zfsvfs_free(zfsvfs);
- atomic_add_32(&zfs_active_fs_count, -1);
+ atomic_dec_32(&zfs_active_fs_count);
}
#ifdef __i386__
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,7 +21,9 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright 2014 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
*/
/* Portions Copyright 2007 Jeremy Teo */
@@ -34,6 +36,7 @@
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/vfs.h>
+#include <sys/vm.h>
#include <sys/vnode.h>
#include <sys/file.h>
#include <sys/stat.h>
@@ -64,16 +67,14 @@
#include <sys/zfs_ctldir.h>
#include <sys/zfs_fuid.h>
#include <sys/zfs_sa.h>
-#include <sys/dnlc.h>
#include <sys/zfs_rlock.h>
#include <sys/extdirent.h>
#include <sys/kidmap.h>
#include <sys/bio.h>
#include <sys/buf.h>
-#include <sys/sf_buf.h>
#include <sys/sched.h>
#include <sys/acl.h>
-#include <vm/vm_pageout.h>
+#include <vm/vm_param.h>
/*
* Programming rules.
@@ -105,12 +106,19 @@
* (3) All range locks must be grabbed before calling dmu_tx_assign(),
* as they can span dmu_tx_assign() calls.
*
- * (4) Always pass TXG_NOWAIT as the second argument to dmu_tx_assign().
- * This is critical because we don't want to block while holding locks.
- * Note, in particular, that if a lock is sometimes acquired before
- * the tx assigns, and sometimes after (e.g. z_lock), then failing to
- * use a non-blocking assign can deadlock the system. The scenario:
+ * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
+ * dmu_tx_assign(). This is critical because we don't want to block
+ * while holding locks.
*
+ * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This
+ * reduces lock contention and CPU usage when we must wait (note that if
+ * throughput is constrained by the storage, nearly every transaction
+ * must wait).
+ *
+ * Note, in particular, that if a lock is sometimes acquired before
+ * the tx assigns, and sometimes after (e.g. z_lock), then failing
+ * to use a non-blocking assign can deadlock the system. The scenario:
+ *
* Thread A has grabbed a lock before calling dmu_tx_assign().
* Thread B is in an already-assigned tx, and blocks for this lock.
* Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
@@ -117,7 +125,11 @@
* forever, because the previous txg can't quiesce until B's tx commits.
*
* If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
- * then drop all locks, call dmu_tx_wait(), and try again.
+ * then drop all locks, call dmu_tx_wait(), and try again. On subsequent
+ * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT,
+ * to indicate that this operation has already called dmu_tx_wait().
+ * This will ensure that we don't retry forever, waiting a short bit
+ * each time.
*
* (5) If the operation succeeded, generate the intent log entry for it
* before dropping locks. This ensures that the ordering of events
@@ -135,16 +147,17 @@
*
* ZFS_ENTER(zfsvfs); // exit if unmounted
* top:
- * zfs_dirent_lock(&dl, ...) // lock directory entry (may VN_HOLD())
+ * zfs_dirent_lookup(&dl, ...) // lock directory entry (may VN_HOLD())
* rw_enter(...); // grab any other locks you need
* tx = dmu_tx_create(...); // get DMU tx
* dmu_tx_hold_*(); // hold each object you might modify
- * error = dmu_tx_assign(tx, TXG_NOWAIT); // try to assign
+ * error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
* if (error) {
* rw_exit(...); // drop locks
* zfs_dirent_unlock(dl); // unlock directory entry
* VN_RELE(...); // release held vnodes
* if (error == ERESTART) {
+ * waited = B_TRUE;
* dmu_tx_wait(tx);
* dmu_tx_abort(tx);
* goto top;
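The updated programming-rules comment above spells out the dmu_tx_assign() retry protocol: the first assignment is non-blocking, and after the first ERESTART the caller waits once and then retries with TXG_NOTHROTTLE so it cannot be starved indefinitely. The skeleton below is only a control-flow sketch; the tx_* functions are stand-ins for the DMU calls, and the flag values are arbitrary.

#include <stdbool.h>
#include <errno.h>

#ifndef ERESTART
#define	ERESTART	EAGAIN		/* userland stand-in */
#endif

#define	TXG_NOWAIT	0x01		/* illustrative flag values */
#define	TXG_NOTHROTTLE	0x02

extern int  tx_assign(int flags);	/* stand-in for dmu_tx_assign() */
extern void tx_wait(void);		/* stand-in for dmu_tx_wait()   */
extern void tx_abort(void);		/* stand-in for dmu_tx_abort()  */

static int
assign_with_retry(void)
{
	bool waited = false;
	int error;

top:
	error = tx_assign((waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
	if (error == ERESTART) {
		/* Drop any ZPL locks here before waiting, per rule (4) above. */
		waited = true;
		tx_wait();
		tx_abort();
		/* The real code re-creates and re-holds the tx at 'top'. */
		goto top;
	}
	return (error);
}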
@@ -253,16 +266,19 @@
error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
- /* end of file? */
- if ((error == ESRCH) || (noff > file_sz)) {
- /*
- * Handle the virtual hole at the end of file.
- */
- if (hole) {
- *off = file_sz;
- return (0);
- }
+ if (error == ESRCH)
return (SET_ERROR(ENXIO));
+
+ /*
+ * We could find a hole that begins after the logical end-of-file,
+ * because dmu_offset_next() only works on whole blocks. If the
+ * EOF falls mid-block, then indicate that the "virtual hole"
+ * at the end of the file begins at the logical EOF, rather than
+ * at the end of the last block.
+ */
+ if (noff > file_sz) {
+ ASSERT(hole);
+ noff = file_sz;
}
if (noff < *off)
@@ -277,6 +293,8 @@
int *rvalp, caller_context_t *ct)
{
offset_t off;
+ offset_t ndata;
+ dmu_object_info_t doi;
int error;
zfsvfs_t *zfsvfs;
znode_t *zp;
@@ -283,6 +301,7 @@
switch (com) {
case _FIOFFS:
+ {
return (0);
/*
@@ -289,13 +308,17 @@
* The following two ioctls are used by bfu. Faking out,
* necessary to avoid bfu errors.
*/
+ }
case _FIOGDIO:
case _FIOSDIO:
+ {
return (0);
+ }
case _FIO_SEEK_DATA:
case _FIO_SEEK_HOLE:
-#ifdef sun
+ {
+#ifdef illumos
if (ddi_copyin((void *)data, &off, sizeof (off), flag))
return (SET_ERROR(EFAULT));
#else
@@ -311,7 +334,7 @@
ZFS_EXIT(zfsvfs);
if (error)
return (error);
-#ifdef sun
+#ifdef illumos
if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
return (SET_ERROR(EFAULT));
#else
@@ -319,6 +342,48 @@
#endif
return (0);
}
+#ifdef illumos
+ case _FIO_COUNT_FILLED:
+ {
+ /*
+ * _FIO_COUNT_FILLED adds a new ioctl command which
+ * exposes the number of filled blocks in a
+ * ZFS object.
+ */
+ zp = VTOZ(vp);
+ zfsvfs = zp->z_zfsvfs;
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ /*
+ * Wait for all dirty blocks for this object
+ * to get synced out to disk, and the DMU info
+ * updated.
+ */
+ error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id);
+ if (error) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * Retrieve fill count from DMU object.
+ */
+ error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi);
+ if (error) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ ndata = doi.doi_fill_count;
+
+ ZFS_EXIT(zfsvfs);
+ if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag))
+ return (SET_ERROR(EFAULT));
+ return (0);
+ }
+#endif
+ }
return (SET_ERROR(ENOTTY));
}
@@ -327,14 +392,28 @@
{
vm_object_t obj;
vm_page_t pp;
+ int64_t end;
+ /*
+ * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE
+ * aligned boundaries, if the range is not aligned. As a result a
+ * DEV_BSIZE subrange with partially dirty data may get marked as clean.
+ * It may happen that all DEV_BSIZE subranges are marked clean and thus
+ * the whole page would be considered clean despite having some dirty data.
+ * For this reason we should shrink the range to DEV_BSIZE aligned
+ * boundaries before calling vm_page_clear_dirty.
+ */
+ end = rounddown2(off + nbytes, DEV_BSIZE);
+ off = roundup2(off, DEV_BSIZE);
+ nbytes = end - off;
+
obj = vp->v_object;
- VM_OBJECT_LOCK_ASSERT(obj, MA_OWNED);
+ zfs_vmobject_assert_wlocked(obj);
for (;;) {
if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
pp->valid) {
- if ((pp->oflags & VPO_BUSY) != 0) {
+ if (vm_page_xbusied(pp)) {
/*
* Reference the page before unlocking and
* sleeping so that the page daemon is less
@@ -341,13 +420,17 @@
* likely to reclaim it.
*/
vm_page_reference(pp);
- vm_page_sleep(pp, "zfsmwb");
+ vm_page_lock(pp);
+ zfs_vmobject_wunlock(obj);
+ vm_page_busy_sleep(pp, "zfsmwb", true);
+ zfs_vmobject_wlock(obj);
continue;
}
+ vm_page_sbusy(pp);
} else if (pp == NULL) {
pp = vm_page_alloc(obj, OFF_TO_IDX(start),
VM_ALLOC_SYSTEM | VM_ALLOC_IFCACHED |
- VM_ALLOC_NOBUSY);
+ VM_ALLOC_SBUSY);
} else {
ASSERT(pp != NULL && !pp->valid);
pp = NULL;
@@ -356,9 +439,9 @@
if (pp != NULL) {
ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
vm_object_pip_add(obj, 1);
- vm_page_io_start(pp);
pmap_remove_write(pp);
- vm_page_clear_dirty(pp, off, nbytes);
+ if (nbytes != 0)
+ vm_page_clear_dirty(pp, off, nbytes);
}
break;
}
@@ -369,7 +452,7 @@
page_unbusy(vm_page_t pp)
{
- vm_page_io_finish(pp);
+ vm_page_sunbusy(pp);
vm_object_pip_subtract(pp->object, 1);
}
@@ -380,12 +463,12 @@
vm_page_t pp;
obj = vp->v_object;
- VM_OBJECT_LOCK_ASSERT(obj, MA_OWNED);
+ zfs_vmobject_assert_wlocked(obj);
for (;;) {
if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
pp->valid) {
- if ((pp->oflags & VPO_BUSY) != 0) {
+ if (vm_page_xbusied(pp)) {
/*
* Reference the page before unlocking and
* sleeping so that the page daemon is less
@@ -392,7 +475,10 @@
* likely to reclaim it.
*/
vm_page_reference(pp);
- vm_page_sleep(pp, "zfsmwb");
+ vm_page_lock(pp);
+ zfs_vmobject_wunlock(obj);
+ vm_page_busy_sleep(pp, "zfsmwb", true);
+ zfs_vmobject_wlock(obj);
continue;
}
@@ -417,21 +503,6 @@
vm_page_unlock(pp);
}
-static caddr_t
-zfs_map_page(vm_page_t pp, struct sf_buf **sfp)
-{
-
- *sfp = sf_buf_alloc(pp, 0);
- return ((caddr_t)sf_buf_kva(*sfp));
-}
-
-static void
-zfs_unmap_page(struct sf_buf *sf)
-{
-
- sf_buf_free(sf);
-}
-
/*
* When a file is memory mapped, we must keep the IO data synchronized
* between the DMU cache and the memory mapped pages. What this means:
@@ -448,53 +519,33 @@
caddr_t va;
int off;
+ ASSERT(segflg != UIO_NOCOPY);
ASSERT(vp->v_mount != NULL);
obj = vp->v_object;
ASSERT(obj != NULL);
off = start & PAGEOFFSET;
- VM_OBJECT_LOCK(obj);
+ zfs_vmobject_wlock(obj);
for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
vm_page_t pp;
int nbytes = imin(PAGESIZE - off, len);
- if (segflg == UIO_NOCOPY) {
- pp = vm_page_lookup(obj, OFF_TO_IDX(start));
- KASSERT(pp != NULL,
- ("zfs update_pages: NULL page in putpages case"));
- KASSERT(off == 0,
- ("zfs update_pages: unaligned data in putpages case"));
- KASSERT(pp->valid == VM_PAGE_BITS_ALL,
- ("zfs update_pages: invalid page in putpages case"));
- KASSERT(pp->busy > 0,
- ("zfs update_pages: unbusy page in putpages case"));
- KASSERT(!pmap_page_is_write_mapped(pp),
- ("zfs update_pages: writable page in putpages case"));
- VM_OBJECT_UNLOCK(obj);
+ if ((pp = page_busy(vp, start, off, nbytes)) != NULL) {
+ zfs_vmobject_wunlock(obj);
va = zfs_map_page(pp, &sf);
- (void) dmu_write(os, oid, start, nbytes, va, tx);
- zfs_unmap_page(sf);
-
- VM_OBJECT_LOCK(obj);
- vm_page_undirty(pp);
- } else if ((pp = page_busy(vp, start, off, nbytes)) != NULL) {
- VM_OBJECT_UNLOCK(obj);
-
- va = zfs_map_page(pp, &sf);
(void) dmu_read(os, oid, start+off, nbytes,
va+off, DMU_READ_PREFETCH);;
zfs_unmap_page(sf);
- VM_OBJECT_LOCK(obj);
+ zfs_vmobject_wlock(obj);
page_unbusy(pp);
}
len -= nbytes;
off = 0;
}
- if (segflg != UIO_NOCOPY)
- vm_object_pip_wakeupn(obj, 0);
- VM_OBJECT_UNLOCK(obj);
+ vm_object_pip_wakeupn(obj, 0);
+ zfs_vmobject_wunlock(obj);
}
/*
@@ -502,7 +553,7 @@
* ZFS to populate a range of page cache pages with data.
*
* NOTE: this function could be optimized to pre-allocate
- * all pages in advance, drain VPO_BUSY on all of them,
+ * all pages in advance, drain exclusive busy on all of them,
* map them into contiguous KVA region and populate them
* in one single dmu_read() call.
*/
@@ -526,15 +577,14 @@
ASSERT(obj != NULL);
ASSERT((uio->uio_loffset & PAGEOFFSET) == 0);
- VM_OBJECT_LOCK(obj);
+ zfs_vmobject_wlock(obj);
for (start = uio->uio_loffset; len > 0; start += PAGESIZE) {
int bytes = MIN(PAGESIZE, len);
- pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_NOBUSY |
- VM_ALLOC_NORMAL | VM_ALLOC_RETRY | VM_ALLOC_IGN_SBUSY);
+ pp = vm_page_grab(obj, OFF_TO_IDX(start), VM_ALLOC_SBUSY |
+ VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY);
if (pp->valid == 0) {
- vm_page_io_start(pp);
- VM_OBJECT_UNLOCK(obj);
+ zfs_vmobject_wunlock(obj);
va = zfs_map_page(pp, &sf);
error = dmu_read(os, zp->z_id, start, bytes, va,
DMU_READ_PREFETCH);
@@ -541,16 +591,21 @@
if (bytes != PAGESIZE && error == 0)
bzero(va + bytes, PAGESIZE - bytes);
zfs_unmap_page(sf);
- VM_OBJECT_LOCK(obj);
- vm_page_io_finish(pp);
+ zfs_vmobject_wlock(obj);
+ vm_page_sunbusy(pp);
vm_page_lock(pp);
if (error) {
- vm_page_free(pp);
+ if (pp->wire_count == 0 && pp->valid == 0 &&
+ !vm_page_busied(pp))
+ vm_page_free(pp);
} else {
pp->valid = VM_PAGE_BITS_ALL;
vm_page_activate(pp);
}
vm_page_unlock(pp);
+ } else {
+ ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL);
+ vm_page_sunbusy(pp);
}
if (error)
break;
@@ -558,7 +613,7 @@
uio->uio_offset += bytes;
len -= bytes;
}
- VM_OBJECT_UNLOCK(obj);
+ zfs_vmobject_wunlock(obj);
return (error);
}
@@ -576,7 +631,6 @@
mappedread(vnode_t *vp, int nbytes, uio_t *uio)
{
znode_t *zp = VTOZ(vp);
- objset_t *os = zp->z_zfsvfs->z_os;
vm_object_t obj;
int64_t start;
caddr_t va;
@@ -590,7 +644,7 @@
start = uio->uio_loffset;
off = start & PAGEOFFSET;
- VM_OBJECT_LOCK(obj);
+ zfs_vmobject_wlock(obj);
for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
vm_page_t pp;
uint64_t bytes = MIN(PAGESIZE - off, len);
@@ -599,16 +653,21 @@
struct sf_buf *sf;
caddr_t va;
- VM_OBJECT_UNLOCK(obj);
+ zfs_vmobject_wunlock(obj);
va = zfs_map_page(pp, &sf);
+#ifdef illumos
error = uiomove(va + off, bytes, UIO_READ, uio);
+#else
+ error = vn_io_fault_uiomove(va + off, bytes, uio);
+#endif
zfs_unmap_page(sf);
- VM_OBJECT_LOCK(obj);
+ zfs_vmobject_wlock(obj);
page_unhold(pp);
} else {
- VM_OBJECT_UNLOCK(obj);
- error = dmu_read_uio(os, zp->z_id, uio, bytes);
- VM_OBJECT_LOCK(obj);
+ zfs_vmobject_wunlock(obj);
+ error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
+ uio, bytes);
+ zfs_vmobject_wlock(obj);
}
len -= bytes;
off = 0;
@@ -615,7 +674,7 @@
if (error)
break;
}
- VM_OBJECT_UNLOCK(obj);
+ zfs_vmobject_wunlock(obj);
return (error);
}
@@ -644,7 +703,6 @@
{
znode_t *zp = VTOZ(vp);
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- objset_t *os;
ssize_t n, nbytes;
int error = 0;
rl_t *rl;
@@ -652,7 +710,6 @@
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
- os = zfsvfs->z_os;
if (zp->z_pflags & ZFS_AV_QUARANTINED) {
ZFS_EXIT(zfsvfs);
@@ -710,7 +767,7 @@
ASSERT(uio->uio_loffset < zp->z_size);
n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
-#ifdef sun
+#ifdef illumos
if ((uio->uio_extflg == UIO_XUIO) &&
(((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
int nblk;
@@ -739,7 +796,7 @@
}
}
}
-#endif /* sun */
+#endif /* illumos */
while (n > 0) {
nbytes = MIN(n, zfs_read_chunk_size -
@@ -750,10 +807,12 @@
error = mappedread_sf(vp, nbytes, uio);
else
#endif /* __FreeBSD__ */
- if (vn_has_cached_data(vp))
+ if (vn_has_cached_data(vp)) {
error = mappedread(vp, nbytes, uio);
- else
- error = dmu_read_uio(os, zp->z_id, uio, nbytes);
+ } else {
+ error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
+ uio, nbytes);
+ }
if (error) {
/* convert checksum errors into IO errors */
if (error == ECKSUM)
@@ -839,6 +898,16 @@
&zp->z_pflags, 8);
/*
+ * In the case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots), our
+ * callers might not be able to properly detect that we are read-only,
+ * so check it explicitly here.
+ */
+ if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EROFS));
+ }
+
+ /*
* If immutable or not appending then return EPERM
*/
if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
@@ -869,7 +938,7 @@
return (error);
}
-#ifdef sun
+#ifdef illumos
/*
* Pre-fault the pages to ensure slow (eg NFS) pages
* don't hold up txg.
@@ -880,7 +949,7 @@
xuio = (xuio_t *)uio;
else
uio_prefaultpages(MIN(n, max_blksz), uio);
-#endif /* sun */
+#endif
/*
* If in append mode, set the io offset pointer to eof.
@@ -938,7 +1007,6 @@
while (n > 0) {
abuf = NULL;
woff = uio->uio_loffset;
-again:
if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
if (abuf != NULL)
@@ -990,13 +1058,8 @@
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
zfs_sa_upgrade_txholds(tx, zp);
- error = dmu_tx_assign(tx, TXG_NOWAIT);
+ error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
- if (error == ERESTART) {
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- goto again;
- }
dmu_tx_abort(tx);
if (abuf != NULL)
dmu_return_arcbuf(abuf);
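The hunk above is the first of many in this sync that replace the
dmu_tx_assign(tx, TXG_NOWAIT) / ERESTART / dmu_tx_wait() / goto top retry
loop with a single dmu_tx_assign(tx, TXG_WAIT) call that blocks inside the
DMU until the transaction can be assigned. A minimal user-space sketch of
that shape of change follows; reserve_try() and reserve_wait() are made-up
stand-ins (not ZFS APIs), and EAGAIN stands in for the kernel's ERESTART.

/*
 * Sketch of the retry-loop removal enabled by TXG_WAIT.  reserve_try()
 * and reserve_wait() are invented stand-ins for dmu_tx_assign() with
 * TXG_NOWAIT and TXG_WAIT respectively; they are not ZFS APIs.
 */
#include <errno.h>
#include <stdio.h>

static int attempts_left = 2;

/* Pretend the transaction group is busy for the first two attempts. */
static int
reserve_try(void)
{
	return (attempts_left-- > 0 ? EAGAIN : 0);
}

/* Old caller shape: try, and on "busy" wait and loop back to the top. */
static int
assign_old_style(void)
{
top:
	if (reserve_try() == EAGAIN) {
		/* the real code did dmu_tx_wait(); dmu_tx_abort(); here */
		goto top;
	}
	return (0);
}

/* New caller shape: the lower layer blocks until assignment succeeds. */
static int
reserve_wait(void)
{
	while (reserve_try() == EAGAIN)
		;	/* the real TXG_WAIT path sleeps inside the DMU */
	return (0);
}

int
main(void)
{
	attempts_left = 2;
	printf("old style returned %d\n", assign_old_style());
	attempts_left = 2;
	printf("new style returned %d\n", reserve_wait());
	return (0);
}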
@@ -1013,8 +1076,14 @@
uint64_t new_blksz;
if (zp->z_blksz > max_blksz) {
+ /*
+ * File's blocksize is already larger than the
+ * "recordsize" property. Only let it grow to
+ * the next power of 2.
+ */
ASSERT(!ISP2(zp->z_blksz));
- new_blksz = MIN(end_size, SPA_MAXBLOCKSIZE);
+ new_blksz = MIN(end_size,
+ 1 << highbit64(zp->z_blksz));
} else {
new_blksz = MIN(end_size, max_blksz);
}
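With the change above, a file whose block size already exceeds the dataset's
recordsize is only allowed to grow to the next power of two, computed as
1 << highbit64(zp->z_blksz), where highbit64() returns the 1-based index of
the highest set bit. A small stand-alone sketch of that computation;
next_pow2() is a portable stand-in for the kernel helper, and the 144K
example value is made up:

/*
 * Sketch of the "grow only to the next power of 2" clamp.
 * next_pow2() plays the role of 1 << highbit64(x) for non-power-of-2 x.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t
next_pow2(uint64_t x)
{
	uint64_t p = 1;

	while (p <= x)		/* smallest power of two strictly above x */
		p <<= 1;
	return (p);
}

int
main(void)
{
	uint64_t blksz = 0x24000;	/* 144K, not a power of two */

	/* Prints 0x24000 -> 0x40000: the block may grow to 256K at most. */
	printf("0x%jx -> 0x%jx\n", (uintmax_t)blksz,
	    (uintmax_t)next_pow2(blksz));
	return (0);
}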
@@ -1112,7 +1181,11 @@
while ((end_size = zp->z_size) < uio->uio_loffset) {
(void) atomic_cas_64(&zp->z_size, end_size,
uio->uio_loffset);
+#ifdef illumos
ASSERT(error == 0);
+#else
+ ASSERT(error == 0 || error == EFAULT);
+#endif
}
/*
* If we are replaying and eof is non zero then force
@@ -1122,7 +1195,10 @@
if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
zp->z_size = zfsvfs->z_replay_eof;
- error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+ if (error == 0)
+ error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+ else
+ (void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
dmu_tx_commit(tx);
@@ -1132,10 +1208,10 @@
ASSERT(tx_bytes == nbytes);
n -= nbytes;
-#ifdef sun
+#ifdef illumos
if (!xuio && n > 0)
uio_prefaultpages(MIN(n, max_blksz), uio);
-#endif /* sun */
+#endif
}
zfs_range_unlock(rl);
@@ -1149,6 +1225,17 @@
return (error);
}
+#ifdef __FreeBSD__
+ /*
+ * EFAULT means that at least one page of the source buffer was not
+ * available. The VFS will retry the remaining I/O upon this error.
+ */
+ if (error == EFAULT) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+#endif
+
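The block above documents the contract with FreeBSD's vn_io_fault layer:
when zfs_write() bails out with EFAULT after a partial write, the uio is
left describing only the remaining I/O, so the VFS layer can simply
re-issue it. The same contract is what makes the familiar user-space
short-write loop correct; a minimal, ZFS-independent sketch:

/*
 * Minimal "retry the remaining I/O" loop: the offset and residual already
 * account for what was written, so the caller just continues with the rest.
 */
#include <errno.h>
#include <string.h>
#include <unistd.h>

static ssize_t
full_write(int fd, const char *buf, size_t len)
{
	size_t done = 0;

	while (done < len) {
		ssize_t n = write(fd, buf + done, len - done);
		if (n < 0) {
			if (errno == EINTR)
				continue;	/* transient, retry */
			return (-1);		/* hard error */
		}
		done += (size_t)n;		/* short write: retry rest */
	}
	return ((ssize_t)done);
}

int
main(void)
{
	const char msg[] = "hello from the retry loop\n";

	return (full_write(STDOUT_FILENO, msg, strlen(msg)) < 0 ? 1 : 0);
}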
if (ioflag & (FSYNC | FDSYNC) ||
zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
zil_commit(zilog, zp->z_id);
@@ -1162,7 +1249,6 @@
{
znode_t *zp = zgd->zgd_private;
objset_t *os = zp->z_zfsvfs->z_os;
- int vfslocked;
if (zgd->zgd_db)
dmu_buf_rele(zgd->zgd_db, zgd);
@@ -1169,7 +1255,6 @@
zfs_range_unlock(zgd->zgd_rl);
- vfslocked = VFS_LOCK_GIANT(zp->z_zfsvfs->z_vfs);
/*
* Release the vnode asynchronously as we currently have the
* txg stopped from syncing.
@@ -1180,7 +1265,6 @@
zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
kmem_free(zgd, sizeof (zgd_t));
- VFS_UNLOCK_GIANT(vfslocked);
}
#ifdef DEBUG
@@ -1246,7 +1330,7 @@
} else { /* indirect write */
/*
* Have to lock the whole block to ensure when it's
- * written out and it's checksum is being calculated
+ * written out and its checksum is being calculated
* that no one can change the data. We need to re-check
* blocksize after we get the lock in case it's changed!
*/
@@ -1334,27 +1418,83 @@
return (error);
}
-/*
- * If vnode is for a device return a specfs vnode instead.
- */
static int
-specvp_check(vnode_t **vpp, cred_t *cr)
+zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp)
{
- int error = 0;
+ int error;
- if (IS_DEVVP(*vpp)) {
- struct vnode *svp;
+ *vpp = arg;
+ error = vn_lock(*vpp, lkflags);
+ if (error != 0)
+ vrele(*vpp);
+ return (error);
+}
- svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
- VN_RELE(*vpp);
- if (svp == NULL)
- error = SET_ERROR(ENOSYS);
- *vpp = svp;
+static int
+zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags)
+{
+ znode_t *zdp = VTOZ(dvp);
+ zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
+ int error;
+ int ltype;
+
+ ASSERT_VOP_LOCKED(dvp, __func__);
+#ifdef DIAGNOSTIC
+ if ((zdp->z_pflags & ZFS_XATTR) == 0)
+ VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock));
+#endif
+
+ if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
+ ASSERT3P(dvp, ==, vp);
+ vref(dvp);
+ ltype = lkflags & LK_TYPE_MASK;
+ if (ltype != VOP_ISLOCKED(dvp)) {
+ if (ltype == LK_EXCLUSIVE)
+ vn_lock(dvp, LK_UPGRADE | LK_RETRY);
+ else /* if (ltype == LK_SHARED) */
+ vn_lock(dvp, LK_DOWNGRADE | LK_RETRY);
+
+ /*
+ * Relock for the "." case could leave us with
+ * reclaimed vnode.
+ */
+ if (dvp->v_iflag & VI_DOOMED) {
+ vrele(dvp);
+ return (SET_ERROR(ENOENT));
+ }
+ }
+ return (0);
+ } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
+ /*
+ * Note that in this case, dvp is the child vnode, and we
+ * are looking up the parent vnode - exactly reverse from
+ * normal operation. Unlocking dvp requires some rather
+ * tricky unlock/relock dance to prevent mp from being freed;
+ * use vn_vget_ino_gen() which takes care of all that.
+ *
+ * XXX Note that there is a time window when both vnodes are
+ * unlocked. It is possible, although highly unlikely, that
+ * during that window the parent-child relationship between
+ * the vnodes may change, for example, get reversed.
+ * In that case we would have a wrong lock order for the vnodes.
+ * All other filesystems seem to ignore this problem, so we
+ * do the same here.
+ * A potential solution could be implemented as follows:
+ * - using LK_NOWAIT when locking the second vnode and retrying
+ * if necessary
+ * - checking that the parent-child relationship still holds
+ * after locking both vnodes and retrying if it doesn't
+ */
+ error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp);
+ return (error);
+ } else {
+ error = vn_lock(vp, lkflags);
+ if (error != 0)
+ vrele(vp);
+ return (error);
}
- return (error);
}
-
/*
* Lookup an entry in a directory, or an extended attribute directory.
* If it exists, return a held vnode reference for it.
@@ -1366,8 +1506,6 @@
* rdir - root directory vnode [UNUSED].
* cr - credentials of caller.
* ct - caller context
- * direntflags - directory lookup flags
- * realpnp - returned pathname.
*
* OUT: vpp - vnode of located entry, NULL if not found.
*
@@ -1382,46 +1520,17 @@
int nameiop, cred_t *cr, kthread_t *td, int flags)
{
znode_t *zdp = VTOZ(dvp);
+ znode_t *zp;
zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
int error = 0;
- int *direntflags = NULL;
- void *realpnp = NULL;
- /* fast path */
- if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {
-
+ /* fast path (should be redundant with vfs namecache) */
+ if (!(flags & LOOKUP_XATTR)) {
if (dvp->v_type != VDIR) {
return (SET_ERROR(ENOTDIR));
} else if (zdp->z_sa_hdl == NULL) {
return (SET_ERROR(EIO));
}
-
- if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
- error = zfs_fastaccesschk_execute(zdp, cr);
- if (!error) {
- *vpp = dvp;
- VN_HOLD(*vpp);
- return (0);
- }
- return (error);
- } else {
- vnode_t *tvp = dnlc_lookup(dvp, nm);
-
- if (tvp) {
- error = zfs_fastaccesschk_execute(zdp, cr);
- if (error) {
- VN_RELE(tvp);
- return (error);
- }
- if (tvp == DNLC_NO_VNODE) {
- VN_RELE(tvp);
- return (SET_ERROR(ENOENT));
- } else {
- *vpp = tvp;
- return (specvp_check(vpp, cr));
- }
- }
- }
}
DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);
@@ -1459,10 +1568,9 @@
/*
* Do we have permission to get into attribute directory?
*/
-
if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
B_FALSE, cr)) {
- VN_RELE(*vpp);
+ vrele(*vpp);
*vpp = NULL;
}
@@ -1470,15 +1578,9 @@
return (error);
}
- if (dvp->v_type != VDIR) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(ENOTDIR));
- }
-
/*
* Check accessibility of directory.
*/
-
if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
ZFS_EXIT(zfsvfs);
return (error);
@@ -1490,10 +1592,98 @@
return (SET_ERROR(EILSEQ));
}
- error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp);
- if (error == 0)
- error = specvp_check(vpp, cr);
+ /*
+ * First handle the special cases.
+ */
+ if ((cnp->cn_flags & ISDOTDOT) != 0) {
+ /*
+ * If we are a snapshot mounted under .zfs, return
+ * the vp for the snapshot directory.
+ */
+ if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) {
+ struct componentname cn;
+ vnode_t *zfsctl_vp;
+ int ltype;
+
+ ZFS_EXIT(zfsvfs);
+ ltype = VOP_ISLOCKED(dvp);
+ VOP_UNLOCK(dvp, 0);
+ error = zfsctl_root(zfsvfs->z_parent, LK_SHARED,
+ &zfsctl_vp);
+ if (error == 0) {
+ cn.cn_nameptr = "snapshot";
+ cn.cn_namelen = strlen(cn.cn_nameptr);
+ cn.cn_nameiop = cnp->cn_nameiop;
+ cn.cn_flags = cnp->cn_flags & ~ISDOTDOT;
+ cn.cn_lkflags = cnp->cn_lkflags;
+ error = VOP_LOOKUP(zfsctl_vp, vpp, &cn);
+ vput(zfsctl_vp);
+ }
+ vn_lock(dvp, ltype | LK_RETRY);
+ return (error);
+ }
+ }
+ if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) {
+ ZFS_EXIT(zfsvfs);
+ if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP)
+ return (SET_ERROR(ENOTSUP));
+ error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp);
+ return (error);
+ }
+
+ /*
+ * The loop retries the lookup if the parent-child relationship
+ * changes during the dot-dot locking dance.
+ */
+ for (;;) {
+ uint64_t parent;
+
+ error = zfs_dirlook(zdp, nm, &zp);
+ if (error == 0)
+ *vpp = ZTOV(zp);
+
+ ZFS_EXIT(zfsvfs);
+ if (error != 0)
+ break;
+
+ error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags);
+ if (error != 0) {
+ /*
+ * If we've got a locking error, then the vnode
+ * got reclaimed because of a force unmount.
+ * We never enter doomed vnodes into the name cache.
+ */
+ *vpp = NULL;
+ return (error);
+ }
+
+ if ((cnp->cn_flags & ISDOTDOT) == 0)
+ break;
+
+ ZFS_ENTER(zfsvfs);
+ if (zdp->z_sa_hdl == NULL) {
+ error = SET_ERROR(EIO);
+ } else {
+ error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+ &parent, sizeof (parent));
+ }
+ if (error != 0) {
+ ZFS_EXIT(zfsvfs);
+ vput(ZTOV(zp));
+ break;
+ }
+ if (zp->z_id == parent) {
+ ZFS_EXIT(zfsvfs);
+ break;
+ }
+ vput(ZTOV(zp));
+ }
+
+out:
+ if (error != 0)
+ *vpp = NULL;
+
/* Translate errors and add SAVENAME when needed. */
if (cnp->cn_flags & ISLASTCN) {
switch (nameiop) {
@@ -1511,42 +1701,20 @@
break;
}
}
- if (error == 0 && (nm[0] != '.' || nm[1] != '\0')) {
- int ltype = 0;
- if (cnp->cn_flags & ISDOTDOT) {
- ltype = VOP_ISLOCKED(dvp);
- VOP_UNLOCK(dvp, 0);
- }
- ZFS_EXIT(zfsvfs);
- error = zfs_vnode_lock(*vpp, cnp->cn_lkflags);
- if (cnp->cn_flags & ISDOTDOT)
- vn_lock(dvp, ltype | LK_RETRY);
- if (error != 0) {
- VN_RELE(*vpp);
- *vpp = NULL;
- return (error);
- }
- } else {
- ZFS_EXIT(zfsvfs);
- }
+ /* Insert name into cache (as non-existent) if appropriate. */
+ if (zfsvfs->z_use_namecache &&
+ error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0)
+ cache_enter(dvp, NULL, cnp);
-#ifdef FREEBSD_NAMECACHE
- /*
- * Insert name into cache (as non-existent) if appropriate.
- */
- if (error == ENOENT && (cnp->cn_flags & MAKEENTRY) && nameiop != CREATE)
- cache_enter(dvp, *vpp, cnp);
- /*
- * Insert name into cache if appropriate.
- */
- if (error == 0 && (cnp->cn_flags & MAKEENTRY)) {
+ /* Insert name into cache if appropriate. */
+ if (zfsvfs->z_use_namecache &&
+ error == 0 && (cnp->cn_flags & MAKEENTRY)) {
if (!(cnp->cn_flags & ISLASTCN) ||
(nameiop != DELETE && nameiop != RENAME)) {
cache_enter(dvp, *vpp, cnp);
}
}
-#endif
return (error);
}
@@ -1564,7 +1732,7 @@
* cr - credentials of caller.
* flag - large file flag [UNUSED].
* ct - caller context
- * vsecp - ACL to be set
+ * vsecp - ACL to be set
*
* OUT: vpp - vnode of created or trunc'd entry.
*
@@ -1584,7 +1752,6 @@
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
zilog_t *zilog;
objset_t *os;
- zfs_dirlock_t *dl;
dmu_tx_t *tx;
int error;
ksid_t *ksid;
@@ -1592,9 +1759,9 @@
gid_t gid = crgetgid(cr);
zfs_acl_ids_t acl_ids;
boolean_t fuid_dirtied;
- boolean_t have_acl = B_FALSE;
void *vsecp = NULL;
int flag = 0;
+ uint64_t txtype;
/*
* If we have an ephemeral id, ACL, or XVATTR then
@@ -1630,176 +1797,90 @@
return (error);
}
}
-top:
+
*vpp = NULL;
if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr))
vap->va_mode &= ~S_ISVTX;
- if (*name == '\0') {
- /*
- * Null component name refers to the directory itself.
- */
- VN_HOLD(dvp);
- zp = dzp;
- dl = NULL;
- error = 0;
- } else {
- /* possible VN_HOLD(zp) */
- int zflg = 0;
+ error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
+ if (error) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ ASSERT3P(zp, ==, NULL);
- if (flag & FIGNORECASE)
- zflg |= ZCILOOK;
-
- error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
- NULL, NULL);
- if (error) {
- if (have_acl)
- zfs_acl_ids_free(&acl_ids);
- if (strcmp(name, "..") == 0)
- error = SET_ERROR(EISDIR);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
+ /*
+ * Create a new file object and update the directory
+ * to reference it.
+ */
+ if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
+ goto out;
}
- if (zp == NULL) {
- uint64_t txtype;
+ /*
+ * We only support the creation of regular files in
+ * extended attribute directories.
+ */
- /*
- * Create a new file object and update the directory
- * to reference it.
- */
- if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
- if (have_acl)
- zfs_acl_ids_free(&acl_ids);
- goto out;
- }
+ if ((dzp->z_pflags & ZFS_XATTR) &&
+ (vap->va_type != VREG)) {
+ error = SET_ERROR(EINVAL);
+ goto out;
+ }
- /*
- * We only support the creation of regular files in
- * extended attribute directories.
- */
+ if ((error = zfs_acl_ids_create(dzp, 0, vap,
+ cr, vsecp, &acl_ids)) != 0)
+ goto out;
- if ((dzp->z_pflags & ZFS_XATTR) &&
- (vap->va_type != VREG)) {
- if (have_acl)
- zfs_acl_ids_free(&acl_ids);
- error = SET_ERROR(EINVAL);
- goto out;
- }
+ if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
+ zfs_acl_ids_free(&acl_ids);
+ error = SET_ERROR(EDQUOT);
+ goto out;
+ }
- if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
- cr, vsecp, &acl_ids)) != 0)
- goto out;
- have_acl = B_TRUE;
+ getnewvnode_reserve(1);
- if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
- zfs_acl_ids_free(&acl_ids);
- error = SET_ERROR(EDQUOT);
- goto out;
- }
+ tx = dmu_tx_create(os);
- tx = dmu_tx_create(os);
+ dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+ ZFS_SA_BASE_ATTR_SIZE);
- dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
- ZFS_SA_BASE_ATTR_SIZE);
-
- fuid_dirtied = zfsvfs->z_fuid_dirty;
- if (fuid_dirtied)
- zfs_fuid_txhold(zfsvfs, tx);
- dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
- dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
- if (!zfsvfs->z_use_sa &&
- acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
- dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
- 0, acl_ids.z_aclp->z_acl_bytes);
- }
- error = dmu_tx_assign(tx, TXG_NOWAIT);
- if (error) {
- zfs_dirent_unlock(dl);
- if (error == ERESTART) {
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- goto top;
- }
- zfs_acl_ids_free(&acl_ids);
- dmu_tx_abort(tx);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
- zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
-
- if (fuid_dirtied)
- zfs_fuid_sync(zfsvfs, tx);
-
- (void) zfs_link_create(dl, zp, tx, ZNEW);
- txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
- if (flag & FIGNORECASE)
- txtype |= TX_CI;
- zfs_log_create(zilog, tx, txtype, dzp, zp, name,
- vsecp, acl_ids.z_fuidp, vap);
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+ dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
+ dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
+ if (!zfsvfs->z_use_sa &&
+ acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+ 0, acl_ids.z_aclp->z_acl_bytes);
+ }
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error) {
zfs_acl_ids_free(&acl_ids);
- dmu_tx_commit(tx);
- } else {
- int aflags = (flag & FAPPEND) ? V_APPEND : 0;
+ dmu_tx_abort(tx);
+ getnewvnode_drop_reserve();
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
- if (have_acl)
- zfs_acl_ids_free(&acl_ids);
- have_acl = B_FALSE;
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
- /*
- * A directory entry already exists for this name.
- */
- /*
- * Can't truncate an existing file if in exclusive mode.
- */
- if (excl == EXCL) {
- error = SET_ERROR(EEXIST);
- goto out;
- }
- /*
- * Can't open a directory for writing.
- */
- if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) {
- error = SET_ERROR(EISDIR);
- goto out;
- }
- /*
- * Verify requested access to file.
- */
- if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
- goto out;
- }
+ (void) zfs_link_create(dzp, name, zp, tx, ZNEW);
+ txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
+ zfs_log_create(zilog, tx, txtype, dzp, zp, name,
+ vsecp, acl_ids.z_fuidp, vap);
+ zfs_acl_ids_free(&acl_ids);
+ dmu_tx_commit(tx);
- mutex_enter(&dzp->z_lock);
- dzp->z_seq++;
- mutex_exit(&dzp->z_lock);
+ getnewvnode_drop_reserve();
- /*
- * Truncate regular files if requested.
- */
- if ((ZTOV(zp)->v_type == VREG) &&
- (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
- /* we can't hold any locks when calling zfs_freesp() */
- zfs_dirent_unlock(dl);
- dl = NULL;
- error = zfs_freesp(zp, 0, 0, mode, TRUE);
- if (error == 0) {
- vnevent_create(ZTOV(zp), ct);
- }
- }
- }
out:
- if (dl)
- zfs_dirent_unlock(dl);
-
- if (error) {
- if (zp)
- VN_RELE(ZTOV(zp));
- } else {
+ if (error == 0) {
*vpp = ZTOV(zp);
- error = specvp_check(vpp, cr);
}
if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
@@ -1825,57 +1906,31 @@
* vp - ctime (if nlink > 0)
*/
-uint64_t null_xattr = 0;
-
/*ARGSUSED*/
static int
-zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct,
- int flags)
+zfs_remove(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
{
- znode_t *zp, *dzp = VTOZ(dvp);
+ znode_t *dzp = VTOZ(dvp);
+ znode_t *zp = VTOZ(vp);
znode_t *xzp;
- vnode_t *vp;
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
zilog_t *zilog;
uint64_t acl_obj, xattr_obj;
- uint64_t xattr_obj_unlinked = 0;
uint64_t obj = 0;
- zfs_dirlock_t *dl;
dmu_tx_t *tx;
- boolean_t may_delete_now, delete_now = FALSE;
boolean_t unlinked, toobig = FALSE;
uint64_t txtype;
- pathname_t *realnmp = NULL;
- pathname_t realnm;
int error;
- int zflg = ZEXISTS;
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(dzp);
+ ZFS_VERIFY_ZP(zp);
zilog = zfsvfs->z_log;
+ zp = VTOZ(vp);
- if (flags & FIGNORECASE) {
- zflg |= ZCILOOK;
- pn_alloc(&realnm);
- realnmp = &realnm;
- }
-
-top:
xattr_obj = 0;
xzp = NULL;
- /*
- * Attempt to lock directory; fail if entry doesn't exist.
- */
- if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
- NULL, realnmp)) {
- if (realnmp)
- pn_free(realnmp);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
- vp = ZTOV(zp);
-
if (error = zfs_zaccess_delete(dzp, zp, cr)) {
goto out;
}
@@ -1890,14 +1945,15 @@
vnevent_remove(vp, dvp, name, ct);
- if (realnmp)
- dnlc_remove(dvp, realnmp->pn_buf);
- else
- dnlc_remove(dvp, name);
+ obj = zp->z_id;
- VI_LOCK(vp);
- may_delete_now = vp->v_count == 1 && !vn_has_cached_data(vp);
- VI_UNLOCK(vp);
+ /* are there any extended attributes? */
+ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+ &xattr_obj, sizeof (xattr_obj));
+ if (error == 0 && xattr_obj) {
+ error = zfs_zget(zfsvfs, xattr_obj, &xzp);
+ ASSERT0(error);
+ }
/*
* We may delete the znode now, or we may put it in the unlinked set;
@@ -1905,51 +1961,27 @@
* other holds on the vnode. So we dmu_tx_hold() the right things to
* allow for either case.
*/
- obj = zp->z_id;
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
zfs_sa_upgrade_txholds(tx, zp);
zfs_sa_upgrade_txholds(tx, dzp);
- if (may_delete_now) {
- toobig =
- zp->z_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT;
- /* if the file is too big, only hold_free a token amount */
- dmu_tx_hold_free(tx, zp->z_id, 0,
- (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
- }
- /* are there any extended attributes? */
- error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
- &xattr_obj, sizeof (xattr_obj));
- if (error == 0 && xattr_obj) {
- error = zfs_zget(zfsvfs, xattr_obj, &xzp);
- ASSERT0(error);
+ if (xzp) {
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
}
- mutex_enter(&zp->z_lock);
- if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
- dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
- mutex_exit(&zp->z_lock);
-
/* charge as an update -- would be nice not to charge at all */
dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
- error = dmu_tx_assign(tx, TXG_NOWAIT);
+ /*
+ * Mark this transaction as typically resulting in a net free of space
+ */
+ dmu_tx_mark_netfree(tx);
+
+ error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
- zfs_dirent_unlock(dl);
- VN_RELE(vp);
- if (xzp)
- VN_RELE(ZTOV(xzp));
- if (error == ERESTART) {
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- goto top;
- }
- if (realnmp)
- pn_free(realnmp);
dmu_tx_abort(tx);
ZFS_EXIT(zfsvfs);
return (error);
@@ -1958,7 +1990,7 @@
/*
* Remove the directory entry.
*/
- error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);
+ error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked);
if (error) {
dmu_tx_commit(tx);
@@ -1966,77 +1998,18 @@
}
if (unlinked) {
-
- /*
- * Hold z_lock so that we can make sure that the ACL obj
- * hasn't changed. Could have been deleted due to
- * zfs_sa_upgrade().
- */
- mutex_enter(&zp->z_lock);
- VI_LOCK(vp);
- (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
- &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
- delete_now = may_delete_now && !toobig &&
- vp->v_count == 1 && !vn_has_cached_data(vp) &&
- xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) ==
- acl_obj;
- VI_UNLOCK(vp);
- }
-
- if (delete_now) {
-#ifdef __FreeBSD__
- panic("zfs_remove: delete_now branch taken");
-#endif
- if (xattr_obj_unlinked) {
- ASSERT3U(xzp->z_links, ==, 2);
- mutex_enter(&xzp->z_lock);
- xzp->z_unlinked = 1;
- xzp->z_links = 0;
- error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
- &xzp->z_links, sizeof (xzp->z_links), tx);
- ASSERT3U(error, ==, 0);
- mutex_exit(&xzp->z_lock);
- zfs_unlinked_add(xzp, tx);
-
- if (zp->z_is_sa)
- error = sa_remove(zp->z_sa_hdl,
- SA_ZPL_XATTR(zfsvfs), tx);
- else
- error = sa_update(zp->z_sa_hdl,
- SA_ZPL_XATTR(zfsvfs), &null_xattr,
- sizeof (uint64_t), tx);
- ASSERT0(error);
- }
- VI_LOCK(vp);
- vp->v_count--;
- ASSERT0(vp->v_count);
- VI_UNLOCK(vp);
- mutex_exit(&zp->z_lock);
- zfs_znode_delete(zp, tx);
- } else if (unlinked) {
- mutex_exit(&zp->z_lock);
zfs_unlinked_add(zp, tx);
-#ifdef __FreeBSD__
vp->v_vflag |= VV_NOSYNC;
-#endif
}
txtype = TX_REMOVE;
- if (flags & FIGNORECASE)
- txtype |= TX_CI;
zfs_log_remove(zilog, tx, txtype, dzp, name, obj);
dmu_tx_commit(tx);
out:
- if (realnmp)
- pn_free(realnmp);
- zfs_dirent_unlock(dl);
-
- if (!delete_now)
- VN_RELE(vp);
if (xzp)
- VN_RELE(ZTOV(xzp));
+ vrele(ZTOV(xzp));
if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
zil_commit(zilog, 0);
@@ -2067,17 +2040,14 @@
*/
/*ARGSUSED*/
static int
-zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
- caller_context_t *ct, int flags, vsecattr_t *vsecp)
+zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr)
{
znode_t *zp, *dzp = VTOZ(dvp);
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
zilog_t *zilog;
- zfs_dirlock_t *dl;
uint64_t txtype;
dmu_tx_t *tx;
int error;
- int zf = ZNEW;
ksid_t *ksid;
uid_t uid;
gid_t gid = crgetgid(cr);
@@ -2097,7 +2067,7 @@
else
uid = crgetuid(cr);
if (zfsvfs->z_use_fuids == B_FALSE &&
- (vsecp || (vap->va_mask & AT_XVATTR) ||
+ ((vap->va_mask & AT_XVATTR) ||
IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
return (SET_ERROR(EINVAL));
@@ -2115,8 +2085,6 @@
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EILSEQ));
}
- if (flags & FIGNORECASE)
- zf |= ZCILOOK;
if (vap->va_mask & AT_XVATTR) {
if ((error = secpolicy_xvattr(dvp, (xvattr_t *)vap,
@@ -2127,10 +2095,11 @@
}
if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
- vsecp, &acl_ids)) != 0) {
+ NULL, &acl_ids)) != 0) {
ZFS_EXIT(zfsvfs);
return (error);
}
+
/*
* First make sure the new directory doesn't exist.
*
@@ -2138,19 +2107,17 @@
* EACCES instead of EEXIST which can cause some applications
* to fail.
*/
-top:
*vpp = NULL;
- if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
- NULL, NULL)) {
+ if (error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW)) {
zfs_acl_ids_free(&acl_ids);
ZFS_EXIT(zfsvfs);
return (error);
}
+ ASSERT3P(zp, ==, NULL);
if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
zfs_acl_ids_free(&acl_ids);
- zfs_dirent_unlock(dl);
ZFS_EXIT(zfsvfs);
return (error);
}
@@ -2157,7 +2124,6 @@
if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
zfs_acl_ids_free(&acl_ids);
- zfs_dirent_unlock(dl);
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EDQUOT));
}
@@ -2165,6 +2131,7 @@
/*
* Add a new entry to the directory.
*/
+ getnewvnode_reserve(1);
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
@@ -2179,16 +2146,11 @@
dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
ZFS_SA_BASE_ATTR_SIZE);
- error = dmu_tx_assign(tx, TXG_NOWAIT);
+ error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
- zfs_dirent_unlock(dl);
- if (error == ERESTART) {
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- goto top;
- }
zfs_acl_ids_free(&acl_ids);
dmu_tx_abort(tx);
+ getnewvnode_drop_reserve();
ZFS_EXIT(zfsvfs);
return (error);
}
@@ -2204,14 +2166,12 @@
/*
* Now put new name in parent dir.
*/
- (void) zfs_link_create(dl, zp, tx, ZNEW);
+ (void) zfs_link_create(dzp, dirname, zp, tx, ZNEW);
*vpp = ZTOV(zp);
- txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
- if (flags & FIGNORECASE)
- txtype |= TX_CI;
- zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
+ txtype = zfs_log_create_txtype(Z_DIR, NULL, vap);
+ zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL,
acl_ids.z_fuidp, vap);
zfs_acl_ids_free(&acl_ids);
@@ -2218,7 +2178,7 @@
dmu_tx_commit(tx);
- zfs_dirent_unlock(dl);
+ getnewvnode_drop_reserve();
if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
zil_commit(zilog, 0);
@@ -2246,39 +2206,21 @@
*/
/*ARGSUSED*/
static int
-zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
- caller_context_t *ct, int flags)
+zfs_rmdir(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr)
{
znode_t *dzp = VTOZ(dvp);
- znode_t *zp;
- vnode_t *vp;
+ znode_t *zp = VTOZ(vp);
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
zilog_t *zilog;
- zfs_dirlock_t *dl;
dmu_tx_t *tx;
int error;
- int zflg = ZEXISTS;
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(dzp);
+ ZFS_VERIFY_ZP(zp);
zilog = zfsvfs->z_log;
- if (flags & FIGNORECASE)
- zflg |= ZCILOOK;
-top:
- zp = NULL;
- /*
- * Attempt to lock directory; fail if entry doesn't exist.
- */
- if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
- NULL, NULL)) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- vp = ZTOV(zp);
-
if (error = zfs_zaccess_delete(dzp, zp, cr)) {
goto out;
}
@@ -2288,25 +2230,8 @@
goto out;
}
- if (vp == cwd) {
- error = SET_ERROR(EINVAL);
- goto out;
- }
-
vnevent_rmdir(vp, dvp, name, ct);
- /*
- * Grab a lock on the directory to make sure that noone is
- * trying to add (or lookup) entries while we are removing it.
- */
- rw_enter(&zp->z_name_lock, RW_WRITER);
-
- /*
- * Grab a lock on the parent pointer to make sure we play well
- * with the treewalk and directory rename code.
- */
- rw_enter(&zp->z_parent_lock, RW_WRITER);
-
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
@@ -2313,47 +2238,27 @@
dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
zfs_sa_upgrade_txholds(tx, zp);
zfs_sa_upgrade_txholds(tx, dzp);
- error = dmu_tx_assign(tx, TXG_NOWAIT);
+ dmu_tx_mark_netfree(tx);
+ error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
- rw_exit(&zp->z_parent_lock);
- rw_exit(&zp->z_name_lock);
- zfs_dirent_unlock(dl);
- VN_RELE(vp);
- if (error == ERESTART) {
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- goto top;
- }
dmu_tx_abort(tx);
ZFS_EXIT(zfsvfs);
return (error);
}
-#ifdef FREEBSD_NAMECACHE
cache_purge(dvp);
-#endif
- error = zfs_link_destroy(dl, zp, tx, zflg, NULL);
+ error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL);
if (error == 0) {
uint64_t txtype = TX_RMDIR;
- if (flags & FIGNORECASE)
- txtype |= TX_CI;
zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
}
dmu_tx_commit(tx);
- rw_exit(&zp->z_parent_lock);
- rw_exit(&zp->z_name_lock);
-#ifdef FREEBSD_NAMECACHE
cache_purge(vp);
-#endif
out:
- zfs_dirent_unlock(dl);
-
- VN_RELE(vp);
-
if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
zil_commit(zilog, 0);
@@ -2578,10 +2483,10 @@
if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
goto skip_entry;
if (!zfs_has_access(ezp, cr)) {
- VN_RELE(ZTOV(ezp));
+ vrele(ZTOV(ezp));
goto skip_entry;
}
- VN_RELE(ZTOV(ezp));
+ vrele(ZTOV(ezp));
}
if (flags & V_RDDIR_ENTFLAGS)
@@ -2632,7 +2537,8 @@
/* Prefetch znode */
if (prefetch)
- dmu_prefetch(os, objnum, 0, 0);
+ dmu_prefetch(os, objnum, 0, 0, 0,
+ ZIO_PRIORITY_SYNC_READ);
skip_entry:
/*
@@ -2777,10 +2683,9 @@
* than to determine whether we were asked the question.
*/
- mutex_enter(&zp->z_lock);
vap->va_type = IFTOVT(zp->z_mode);
vap->va_mode = zp->z_mode & ~S_IFMT;
-#ifdef sun
+#ifdef illumos
vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
#else
vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
@@ -2792,7 +2697,7 @@
links = zp->z_links;
vap->va_nlink = MIN(links, LINK_MAX); /* nlink_t limit! */
vap->va_size = zp->z_size;
-#ifdef sun
+#ifdef illumos
vap->va_rdev = vp->v_rdev;
#else
if (vp->v_type == VBLK || vp->v_type == VCHR)
@@ -2800,6 +2705,7 @@
#endif
vap->va_seq = zp->z_seq;
vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */
+ vap->va_filerev = zp->z_seq;
/*
* Add in any requested optional attributes and the create time.
@@ -2913,7 +2819,6 @@
ZFS_TIME_DECODE(&vap->va_ctime, ctime);
ZFS_TIME_DECODE(&vap->va_birthtime, crtime);
- mutex_exit(&zp->z_lock);
sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
vap->va_blksize = blksize;
@@ -3049,7 +2954,6 @@
}
}
-top:
attrzp = NULL;
aclp = NULL;
@@ -3138,7 +3042,6 @@
}
}
- mutex_enter(&zp->z_lock);
oldva.va_mode = zp->z_mode;
zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
if (mask & AT_XVATTR) {
@@ -3212,7 +3115,6 @@
}
if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
- mutex_exit(&zp->z_lock);
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EPERM));
}
@@ -3224,8 +3126,6 @@
}
}
- mutex_exit(&zp->z_lock);
-
if (mask & AT_MODE) {
if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
err = secpolicy_setid_setsticky_clear(vp, vap,
@@ -3291,6 +3191,11 @@
if (err == 0 && xattr_obj) {
err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
+ if (err == 0) {
+ err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE);
+ if (err != 0)
+ vrele(ZTOV(attrzp));
+ }
if (err)
goto out2;
}
@@ -3300,7 +3205,7 @@
if (new_uid != zp->z_uid &&
zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) {
if (attrzp)
- VN_RELE(ZTOV(attrzp));
+ vput(ZTOV(attrzp));
err = SET_ERROR(EDQUOT);
goto out2;
}
@@ -3312,7 +3217,7 @@
if (new_gid != zp->z_gid &&
zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) {
if (attrzp)
- VN_RELE(ZTOV(attrzp));
+ vput(ZTOV(attrzp));
err = SET_ERROR(EDQUOT);
goto out2;
}
@@ -3334,7 +3239,6 @@
if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
goto out;
- mutex_enter(&zp->z_lock);
if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
/*
* Are we upgrading ACL from old V0 format
@@ -3355,7 +3259,6 @@
dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
0, aclp->z_acl_bytes);
}
- mutex_exit(&zp->z_lock);
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
} else {
if ((mask & AT_XVATTR) &&
@@ -3375,12 +3278,9 @@
zfs_sa_upgrade_txholds(tx, zp);
- err = dmu_tx_assign(tx, TXG_NOWAIT);
- if (err) {
- if (err == ERESTART)
- dmu_tx_wait(tx);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err)
goto out;
- }
count = 0;
/*
@@ -3391,10 +3291,8 @@
* updated as a side-effect of calling this function.
*/
-
if (mask & (AT_UID|AT_GID|AT_MODE))
mutex_enter(&zp->z_acl_lock);
- mutex_enter(&zp->z_lock);
SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
&zp->z_pflags, sizeof (zp->z_pflags));
@@ -3402,7 +3300,6 @@
if (attrzp) {
if (mask & (AT_UID|AT_GID|AT_MODE))
mutex_enter(&attrzp->z_acl_lock);
- mutex_enter(&attrzp->z_lock);
SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
sizeof (attrzp->z_pflags));
@@ -3536,7 +3433,6 @@
if (mask != 0)
zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
- mutex_exit(&zp->z_lock);
if (mask & (AT_UID|AT_GID|AT_MODE))
mutex_exit(&zp->z_acl_lock);
@@ -3543,7 +3439,6 @@
if (attrzp) {
if (mask & (AT_UID|AT_GID|AT_MODE))
mutex_exit(&attrzp->z_acl_lock);
- mutex_exit(&attrzp->z_lock);
}
out:
if (err == 0 && attrzp) {
@@ -3553,7 +3448,7 @@
}
if (attrzp)
- VN_RELE(ZTOV(attrzp));
+ vput(ZTOV(attrzp));
if (aclp)
zfs_acl_free(aclp);
@@ -3565,8 +3460,6 @@
if (err) {
dmu_tx_abort(tx);
- if (err == ERESTART)
- goto top;
} else {
err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
dmu_tx_commit(tx);
@@ -3580,101 +3473,236 @@
return (err);
}
-typedef struct zfs_zlock {
- krwlock_t *zl_rwlock; /* lock we acquired */
- znode_t *zl_znode; /* znode we held */
- struct zfs_zlock *zl_next; /* next in list */
-} zfs_zlock_t;
-
/*
- * Drop locks and release vnodes that were held by zfs_rename_lock().
+ * We acquire all but fdvp locks using non-blocking acquisitions. If we
+ * fail to acquire any lock in the path we will drop all held locks,
+ * acquire the new lock in a blocking fashion, and then release it and
+ * restart the rename. This acquire/release step ensures that we do not
+ * spin on a lock waiting for release. On error release all vnode locks
+ * and decrement references the way tmpfs_rename() would do.
*/
-static void
-zfs_rename_unlock(zfs_zlock_t **zlpp)
+static int
+zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp,
+ struct vnode *tdvp, struct vnode **tvpp,
+ const struct componentname *scnp, const struct componentname *tcnp)
{
- zfs_zlock_t *zl;
+ zfsvfs_t *zfsvfs;
+ struct vnode *nvp, *svp, *tvp;
+ znode_t *sdzp, *tdzp, *szp, *tzp;
+ const char *snm = scnp->cn_nameptr;
+ const char *tnm = tcnp->cn_nameptr;
+ int error;
- while ((zl = *zlpp) != NULL) {
- if (zl->zl_znode != NULL)
- VN_RELE(ZTOV(zl->zl_znode));
- rw_exit(zl->zl_rwlock);
- *zlpp = zl->zl_next;
- kmem_free(zl, sizeof (*zl));
+ VOP_UNLOCK(tdvp, 0);
+ if (*tvpp != NULL && *tvpp != tdvp)
+ VOP_UNLOCK(*tvpp, 0);
+
+relock:
+ error = vn_lock(sdvp, LK_EXCLUSIVE);
+ if (error)
+ goto out;
+ sdzp = VTOZ(sdvp);
+
+ error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT);
+ if (error != 0) {
+ VOP_UNLOCK(sdvp, 0);
+ if (error != EBUSY)
+ goto out;
+ error = vn_lock(tdvp, LK_EXCLUSIVE);
+ if (error)
+ goto out;
+ VOP_UNLOCK(tdvp, 0);
+ goto relock;
}
-}
+ tdzp = VTOZ(tdvp);
-/*
- * Search back through the directory tree, using the ".." entries.
- * Lock each directory in the chain to prevent concurrent renames.
- * Fail any attempt to move a directory into one of its own descendants.
- * XXX - z_parent_lock can overlap with map or grow locks
- */
-static int
-zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
-{
- zfs_zlock_t *zl;
- znode_t *zp = tdzp;
- uint64_t rootid = zp->z_zfsvfs->z_root;
- uint64_t oidp = zp->z_id;
- krwlock_t *rwlp = &szp->z_parent_lock;
- krw_t rw = RW_WRITER;
+ /*
+ * Before using sdzp and tdzp we must ensure that they are live.
+ * As a porting legacy from illumos we have two things to worry
+ * about. One is typical for FreeBSD and it is that the vnode is
+ * not reclaimed (doomed). The other is that the znode is live.
+ * The current code can invalidate the znode without acquiring the
+ * corresponding vnode lock if the object represented by the znode
+ * and vnode is no longer valid after a rollback or receive operation.
+ * z_teardown_lock hidden behind ZFS_ENTER and ZFS_EXIT is the lock
+ * that protects the znodes from the invalidation.
+ */
+ zfsvfs = sdzp->z_zfsvfs;
+ ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs);
+ ZFS_ENTER(zfsvfs);
/*
- * First pass write-locks szp and compares to zp->z_id.
- * Later passes read-lock zp and compare to zp->z_parent.
+ * We can not use ZFS_VERIFY_ZP() here because it could directly return
+ * bypassing the cleanup code in the case of an error.
*/
- do {
- if (!rw_tryenter(rwlp, rw)) {
- /*
- * Another thread is renaming in this path.
- * Note that if we are a WRITER, we don't have any
- * parent_locks held yet.
- */
- if (rw == RW_READER && zp->z_id > szp->z_id) {
- /*
- * Drop our locks and restart
- */
- zfs_rename_unlock(&zl);
- *zlpp = NULL;
- zp = tdzp;
- oidp = zp->z_id;
- rwlp = &szp->z_parent_lock;
- rw = RW_WRITER;
- continue;
- } else {
- /*
- * Wait for other thread to drop its locks
- */
- rw_enter(rwlp, rw);
+ if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
+ ZFS_EXIT(zfsvfs);
+ VOP_UNLOCK(sdvp, 0);
+ VOP_UNLOCK(tdvp, 0);
+ error = SET_ERROR(EIO);
+ goto out;
+ }
+
+ /*
+ * Re-resolve svp to be certain it still exists and fetch the
+ * correct vnode.
+ */
+ error = zfs_dirent_lookup(sdzp, snm, &szp, ZEXISTS);
+ if (error != 0) {
+ /* Source entry invalid or not there. */
+ ZFS_EXIT(zfsvfs);
+ VOP_UNLOCK(sdvp, 0);
+ VOP_UNLOCK(tdvp, 0);
+ if ((scnp->cn_flags & ISDOTDOT) != 0 ||
+ (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.'))
+ error = SET_ERROR(EINVAL);
+ goto out;
+ }
+ svp = ZTOV(szp);
+
+ /*
+ * Re-resolve tvp, if it disappeared we just carry on.
+ */
+ error = zfs_dirent_lookup(tdzp, tnm, &tzp, 0);
+ if (error != 0) {
+ ZFS_EXIT(zfsvfs);
+ VOP_UNLOCK(sdvp, 0);
+ VOP_UNLOCK(tdvp, 0);
+ vrele(svp);
+ if ((tcnp->cn_flags & ISDOTDOT) != 0)
+ error = SET_ERROR(EINVAL);
+ goto out;
+ }
+ if (tzp != NULL)
+ tvp = ZTOV(tzp);
+ else
+ tvp = NULL;
+
+ /*
+ * At present the vnode locks must be acquired before z_teardown_lock,
+ * although it would be more logical to use the opposite order.
+ */
+ ZFS_EXIT(zfsvfs);
+
+ /*
+ * Now try acquire locks on svp and tvp.
+ */
+ nvp = svp;
+ error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
+ if (error != 0) {
+ VOP_UNLOCK(sdvp, 0);
+ VOP_UNLOCK(tdvp, 0);
+ if (tvp != NULL)
+ vrele(tvp);
+ if (error != EBUSY) {
+ vrele(nvp);
+ goto out;
+ }
+ error = vn_lock(nvp, LK_EXCLUSIVE);
+ if (error != 0) {
+ vrele(nvp);
+ goto out;
+ }
+ VOP_UNLOCK(nvp, 0);
+ /*
+ * Concurrent rename race.
+ * XXX ?
+ */
+ if (nvp == tdvp) {
+ vrele(nvp);
+ error = SET_ERROR(EINVAL);
+ goto out;
+ }
+ vrele(*svpp);
+ *svpp = nvp;
+ goto relock;
+ }
+ vrele(*svpp);
+ *svpp = nvp;
+
+ if (*tvpp != NULL)
+ vrele(*tvpp);
+ *tvpp = NULL;
+ if (tvp != NULL) {
+ nvp = tvp;
+ error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT);
+ if (error != 0) {
+ VOP_UNLOCK(sdvp, 0);
+ VOP_UNLOCK(tdvp, 0);
+ VOP_UNLOCK(*svpp, 0);
+ if (error != EBUSY) {
+ vrele(nvp);
+ goto out;
}
+ error = vn_lock(nvp, LK_EXCLUSIVE);
+ if (error != 0) {
+ vrele(nvp);
+ goto out;
+ }
+ vput(nvp);
+ goto relock;
}
+ *tvpp = nvp;
+ }
- zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
- zl->zl_rwlock = rwlp;
- zl->zl_znode = NULL;
- zl->zl_next = *zlpp;
- *zlpp = zl;
+ return (0);
- if (oidp == szp->z_id) /* We're a descendant of szp */
- return (SET_ERROR(EINVAL));
+out:
+ return (error);
+}
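zfs_rename_relock() above is the deadlock-avoidance dance described in the
comment at its top: every lock after the first is tried with LK_NOWAIT, and
on EBUSY all held locks are dropped, the contended lock is acquired in a
blocking fashion only to learn when it becomes free, released again, and the
whole sequence restarted. A self-contained user-space analog of that
pattern, with pthread mutexes standing in for vnode locks (an illustration
only, not the kernel API):

/*
 * User-space analog of the relock/backoff dance: take the first lock
 * blocking, try the second non-blocking, and on contention drop
 * everything, wait for the contended lock, release it, and restart so
 * that the locks are always acquired without risking deadlock.
 */
#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock_a = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t lock_b = PTHREAD_MUTEX_INITIALIZER;

static int
lock_both(void)
{
	for (;;) {
		pthread_mutex_lock(&lock_a);		/* blocking, like sdvp */
		int error = pthread_mutex_trylock(&lock_b); /* like LK_NOWAIT */
		if (error == 0)
			return (0);			/* both held */
		pthread_mutex_unlock(&lock_a);
		if (error != EBUSY)
			return (error);
		/* Wait for the contended lock, then release and restart. */
		pthread_mutex_lock(&lock_b);
		pthread_mutex_unlock(&lock_b);
	}
}

int
main(void)
{
	if (lock_both() == 0) {
		printf("both locks held\n");
		pthread_mutex_unlock(&lock_b);
		pthread_mutex_unlock(&lock_a);
	}
	return (0);
}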
- if (oidp == rootid) /* We've hit the top */
- return (0);
+/*
+ * Note that we must use VRELE_ASYNC in this function as it walks
+ * up the directory tree and vrele may need to acquire an exclusive
+ * lock if a last reference to a vnode is dropped.
+ */
+static int
+zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp)
+{
+ zfsvfs_t *zfsvfs;
+ znode_t *zp, *zp1;
+ uint64_t parent;
+ int error;
- if (rw == RW_READER) { /* i.e. not the first pass */
- int error = zfs_zget(zp->z_zfsvfs, oidp, &zp);
- if (error)
- return (error);
- zl->zl_znode = zp;
+ zfsvfs = tdzp->z_zfsvfs;
+ if (tdzp == szp)
+ return (SET_ERROR(EINVAL));
+ if (tdzp == sdzp)
+ return (0);
+ if (tdzp->z_id == zfsvfs->z_root)
+ return (0);
+ zp = tdzp;
+ for (;;) {
+ ASSERT(!zp->z_unlinked);
+ if ((error = sa_lookup(zp->z_sa_hdl,
+ SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
+ break;
+
+ if (parent == szp->z_id) {
+ error = SET_ERROR(EINVAL);
+ break;
}
- (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zp->z_zfsvfs),
- &oidp, sizeof (oidp));
- rwlp = &zp->z_parent_lock;
- rw = RW_READER;
+ if (parent == zfsvfs->z_root)
+ break;
+ if (parent == sdzp->z_id)
+ break;
- } while (zp->z_id != sdzp->z_id);
+ error = zfs_zget(zfsvfs, parent, &zp1);
+ if (error != 0)
+ break;
- return (0);
+ if (zp != tdzp)
+ VN_RELE_ASYNC(ZTOV(zp),
+ dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
+ zp = zp1;
+ }
+
+ if (error == ENOTDIR)
+ panic("checkpath: .. not a directory\n");
+ if (zp != tdzp)
+ VN_RELE_ASYNC(ZTOV(zp),
+ dsl_pool_vnrele_taskq(dmu_objset_pool(zfsvfs->z_os)));
+ return (error);
}
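zfs_rename_check() above replaces the old zfs_rename_lock() tree walk:
starting from the target directory it follows SA_ZPL_PARENT object ids
upward and fails with EINVAL if it ever reaches the source znode, that is,
if the rename would move a directory underneath one of its own descendants.
The toy sketch below runs the same walk over a hypothetical in-memory
parent[] table; the object ids and the table are invented for illustration:

/*
 * Toy version of the zfs_rename_check() ancestor walk.  parent[obj] is a
 * made-up stand-in for the SA_ZPL_PARENT lookups; ids: 1 is the root,
 * 2 is a child of 1, 3 a child of 2, 4 a child of 3.
 */
#include <errno.h>
#include <stdio.h>

#define	ROOT_ID	1

static const int parent[] = { 0, 1, 1, 2, 3 };

static int
rename_check(int src, int srcdir, int tgtdir)
{
	int obj = tgtdir;

	if (tgtdir == src)
		return (EINVAL);	/* renaming a directory into itself */
	while (obj != srcdir && obj != ROOT_ID) {
		obj = parent[obj];
		if (obj == src)
			return (EINVAL);	/* tgtdir is below src */
	}
	return (0);
}

int
main(void)
{
	/* Moving directory 2 underneath its own descendant 4: EINVAL (22). */
	printf("%d\n", rename_check(2, 1, 4));
	/* Moving directory 4 into directory 2 (not a descendant of 4): OK. */
	printf("%d\n", rename_check(4, 3, 2));
	return (0);
}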
/*
@@ -3696,181 +3724,95 @@
*/
/*ARGSUSED*/
static int
-zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr,
- caller_context_t *ct, int flags)
+zfs_rename(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp,
+ vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp,
+ cred_t *cr)
{
- znode_t *tdzp, *szp, *tzp;
- znode_t *sdzp = VTOZ(sdvp);
- zfsvfs_t *zfsvfs = sdzp->z_zfsvfs;
- zilog_t *zilog;
- vnode_t *realvp;
- zfs_dirlock_t *sdl, *tdl;
+ zfsvfs_t *zfsvfs;
+ znode_t *sdzp, *tdzp, *szp, *tzp;
+ zilog_t *zilog = NULL;
dmu_tx_t *tx;
- zfs_zlock_t *zl;
- int cmp, serr, terr;
+ char *snm = scnp->cn_nameptr;
+ char *tnm = tcnp->cn_nameptr;
int error = 0;
- int zflg = 0;
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(sdzp);
- zilog = zfsvfs->z_log;
+ /* Reject renames across filesystems. */
+ if ((*svpp)->v_mount != tdvp->v_mount ||
+ ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) {
+ error = SET_ERROR(EXDEV);
+ goto out;
+ }
+ if (zfsctl_is_node(tdvp)) {
+ error = SET_ERROR(EXDEV);
+ goto out;
+ }
+
/*
- * Make sure we have the real vp for the target directory.
+ * Lock all four vnodes to ensure safety and semantics of renaming.
*/
- if (VOP_REALVP(tdvp, &realvp, ct) == 0)
- tdvp = realvp;
-
- if (tdvp->v_vfsp != sdvp->v_vfsp || zfsctl_is_node(tdvp)) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EXDEV));
+ error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp);
+ if (error != 0) {
+ /* no vnodes are locked in the case of error here */
+ return (error);
}
tdzp = VTOZ(tdvp);
- ZFS_VERIFY_ZP(tdzp);
+ sdzp = VTOZ(sdvp);
+ zfsvfs = tdzp->z_zfsvfs;
+ zilog = zfsvfs->z_log;
+
+ /*
+ * After we re-enter ZFS_ENTER() we will have to revalidate all
+ * znodes involved.
+ */
+ ZFS_ENTER(zfsvfs);
+
if (zfsvfs->z_utf8 && u8_validate(tnm,
strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EILSEQ));
+ error = SET_ERROR(EILSEQ);
+ goto unlockout;
}
- if (flags & FIGNORECASE)
- zflg |= ZCILOOK;
+ /* If source and target are the same file, there is nothing to do. */
+ if ((*svpp) == (*tvpp)) {
+ error = 0;
+ goto unlockout;
+ }
-top:
- szp = NULL;
- tzp = NULL;
- zl = NULL;
+ if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) ||
+ ((*tvpp) != NULL && (*tvpp)->v_type == VDIR &&
+ (*tvpp)->v_mountedhere != NULL)) {
+ error = SET_ERROR(EXDEV);
+ goto unlockout;
+ }
/*
- * This is to prevent the creation of links into attribute space
- * by renaming a linked file into/outof an attribute directory.
- * See the comment in zfs_link() for why this is considered bad.
+ * We can not use ZFS_VERIFY_ZP() here because it could directly return
+ * bypassing the cleanup code in the case of an error.
*/
- if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EINVAL));
+ if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) {
+ error = SET_ERROR(EIO);
+ goto unlockout;
}
- /*
- * Lock source and target directory entries. To prevent deadlock,
- * a lock ordering must be defined. We lock the directory with
- * the smallest object id first, or if it's a tie, the one with
- * the lexically first name.
- */
- if (sdzp->z_id < tdzp->z_id) {
- cmp = -1;
- } else if (sdzp->z_id > tdzp->z_id) {
- cmp = 1;
- } else {
- /*
- * First compare the two name arguments without
- * considering any case folding.
- */
- int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);
-
- cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
- ASSERT(error == 0 || !zfsvfs->z_utf8);
- if (cmp == 0) {
- /*
- * POSIX: "If the old argument and the new argument
- * both refer to links to the same existing file,
- * the rename() function shall return successfully
- * and perform no other action."
- */
- ZFS_EXIT(zfsvfs);
- return (0);
- }
- /*
- * If the file system is case-folding, then we may
- * have some more checking to do. A case-folding file
- * system is either supporting mixed case sensitivity
- * access or is completely case-insensitive. Note
- * that the file system is always case preserving.
- *
- * In mixed sensitivity mode case sensitive behavior
- * is the default. FIGNORECASE must be used to
- * explicitly request case insensitive behavior.
- *
- * If the source and target names provided differ only
- * by case (e.g., a request to rename 'tim' to 'Tim'),
- * we will treat this as a special case in the
- * case-insensitive mode: as long as the source name
- * is an exact match, we will allow this to proceed as
- * a name-change request.
- */
- if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
- (zfsvfs->z_case == ZFS_CASE_MIXED &&
- flags & FIGNORECASE)) &&
- u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
- &error) == 0) {
- /*
- * case preserving rename request, require exact
- * name matches
- */
- zflg |= ZCIEXACT;
- zflg &= ~ZCILOOK;
- }
+ szp = VTOZ(*svpp);
+ tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp);
+ if (szp->z_sa_hdl == NULL || (tzp != NULL && tzp->z_sa_hdl == NULL)) {
+ error = SET_ERROR(EIO);
+ goto unlockout;
}
/*
- * If the source and destination directories are the same, we should
- * grab the z_name_lock of that directory only once.
+ * This is to prevent the creation of links into attribute space
+ * by renaming a linked file into/outof an attribute directory.
+ * See the comment in zfs_link() for why this is considered bad.
*/
- if (sdzp == tdzp) {
- zflg |= ZHAVELOCK;
- rw_enter(&sdzp->z_name_lock, RW_READER);
+ if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
+ error = SET_ERROR(EINVAL);
+ goto unlockout;
}
- if (cmp < 0) {
- serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
- ZEXISTS | zflg, NULL, NULL);
- terr = zfs_dirent_lock(&tdl,
- tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
- } else {
- terr = zfs_dirent_lock(&tdl,
- tdzp, tnm, &tzp, zflg, NULL, NULL);
- serr = zfs_dirent_lock(&sdl,
- sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
- NULL, NULL);
- }
-
- if (serr) {
- /*
- * Source entry invalid or not there.
- */
- if (!terr) {
- zfs_dirent_unlock(tdl);
- if (tzp)
- VN_RELE(ZTOV(tzp));
- }
-
- if (sdzp == tdzp)
- rw_exit(&sdzp->z_name_lock);
-
- /*
- * FreeBSD: In OpenSolaris they only check if rename source is
- * ".." here, because "." is handled in their lookup. This is
- * not the case for FreeBSD, so we check for "." explicitly.
- */
- if (strcmp(snm, ".") == 0 || strcmp(snm, "..") == 0)
- serr = SET_ERROR(EINVAL);
- ZFS_EXIT(zfsvfs);
- return (serr);
- }
- if (terr) {
- zfs_dirent_unlock(sdl);
- VN_RELE(ZTOV(szp));
-
- if (sdzp == tdzp)
- rw_exit(&sdzp->z_name_lock);
-
- if (strcmp(tnm, "..") == 0)
- terr = SET_ERROR(EINVAL);
- ZFS_EXIT(zfsvfs);
- return (terr);
- }
-
/*
* Must have write access at the source to remove the old entry
* and write access at the target to create the new entry.
@@ -3877,17 +3819,26 @@
* Note that if target and source are the same, this can be
* done in a single check.
*/
-
if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
- goto out;
+ goto unlockout;
- if (ZTOV(szp)->v_type == VDIR) {
+ if ((*svpp)->v_type == VDIR) {
/*
+ * Avoid ".", "..", and aliases of "." for obvious reasons.
+ */
+ if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') ||
+ sdzp == szp ||
+ (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) {
+ error = EINVAL;
+ goto unlockout;
+ }
+
+ /*
* Check to make sure rename is valid.
* Can't do a move like this: /usr/a/b to /usr/a/b/c/d
*/
- if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl))
- goto out;
+ if (error = zfs_rename_check(szp, sdzp, tdzp))
+ goto unlockout;
}
/*
@@ -3897,31 +3848,26 @@
/*
* Source and target must be the same type.
*/
- if (ZTOV(szp)->v_type == VDIR) {
- if (ZTOV(tzp)->v_type != VDIR) {
+ if ((*svpp)->v_type == VDIR) {
+ if ((*tvpp)->v_type != VDIR) {
error = SET_ERROR(ENOTDIR);
- goto out;
+ goto unlockout;
+ } else {
+ cache_purge(tdvp);
+ if (sdvp != tdvp)
+ cache_purge(sdvp);
}
} else {
- if (ZTOV(tzp)->v_type == VDIR) {
+ if ((*tvpp)->v_type == VDIR) {
error = SET_ERROR(EISDIR);
- goto out;
+ goto unlockout;
}
}
- /*
- * POSIX dictates that when the source and target
- * entries refer to the same file object, rename
- * must do nothing and exit without error.
- */
- if (szp->z_id == tzp->z_id) {
- error = 0;
- goto out;
- }
}
- vnevent_rename_src(ZTOV(szp), sdvp, snm, ct);
+ vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct);
if (tzp)
- vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct);
+ vnevent_rename_dest(*tvpp, tdvp, tnm, ct);
/*
* notify the target directory if it is not the same
@@ -3947,34 +3893,18 @@
zfs_sa_upgrade_txholds(tx, szp);
dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
- error = dmu_tx_assign(tx, TXG_NOWAIT);
+ error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
- if (zl != NULL)
- zfs_rename_unlock(&zl);
- zfs_dirent_unlock(sdl);
- zfs_dirent_unlock(tdl);
-
- if (sdzp == tdzp)
- rw_exit(&sdzp->z_name_lock);
-
- VN_RELE(ZTOV(szp));
- if (tzp)
- VN_RELE(ZTOV(tzp));
- if (error == ERESTART) {
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- goto top;
- }
dmu_tx_abort(tx);
- ZFS_EXIT(zfsvfs);
- return (error);
+ goto unlockout;
}
+
if (tzp) /* Attempt to remove the existing target */
- error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);
+ error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL);
if (error == 0) {
- error = zfs_link_create(tdl, szp, tx, ZRENAMING);
+ error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING);
if (error == 0) {
szp->z_pflags |= ZFS_AV_MODIFIED;
@@ -3982,17 +3912,16 @@
(void *)&szp->z_pflags, sizeof (uint64_t), tx);
ASSERT0(error);
- error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
+ error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING,
+ NULL);
if (error == 0) {
- zfs_log_rename(zilog, tx, TX_RENAME |
- (flags & FIGNORECASE ? TX_CI : 0), sdzp,
- sdl->dl_name, tdzp, tdl->dl_name, szp);
+ zfs_log_rename(zilog, tx, TX_RENAME, sdzp,
+ snm, tdzp, tnm, szp);
/*
* Update path information for the target vnode
*/
- vn_renamepath(tdvp, ZTOV(szp), tnm,
- strlen(tnm));
+ vn_renamepath(tdvp, *svpp, tnm, strlen(tnm));
} else {
/*
* At this point, we have successfully created
@@ -4006,39 +3935,33 @@
* succeed; fortunately, it is very unlikely to
* fail, since we just created it.
*/
- VERIFY3U(zfs_link_destroy(tdl, szp, tx,
+ VERIFY3U(zfs_link_destroy(tdzp, tnm, szp, tx,
ZRENAMING, NULL), ==, 0);
}
}
-#ifdef FREEBSD_NAMECACHE
if (error == 0) {
- cache_purge(sdvp);
- cache_purge(tdvp);
+ cache_purge(*svpp);
+ if (*tvpp != NULL)
+ cache_purge(*tvpp);
+ cache_purge_negative(tdvp);
}
-#endif
}
dmu_tx_commit(tx);
-out:
- if (zl != NULL)
- zfs_rename_unlock(&zl);
- zfs_dirent_unlock(sdl);
- zfs_dirent_unlock(tdl);
+unlockout: /* all 4 vnodes are locked, ZFS_ENTER called */
+ ZFS_EXIT(zfsvfs);
+ VOP_UNLOCK(*svpp, 0);
+ VOP_UNLOCK(sdvp, 0);
- if (sdzp == tdzp)
- rw_exit(&sdzp->z_name_lock);
-
-
- VN_RELE(ZTOV(szp));
- if (tzp)
- VN_RELE(ZTOV(tzp));
-
- if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+out: /* original two vnodes are locked */
+ if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
zil_commit(zilog, 0);
- ZFS_EXIT(zfsvfs);
-
+ if (*tvpp != NULL)
+ VOP_UNLOCK(*tvpp, 0);
+ if (tdvp != *tvpp)
+ VOP_UNLOCK(tdvp, 0);
return (error);
}
@@ -4063,13 +3986,11 @@
cred_t *cr, kthread_t *td)
{
znode_t *zp, *dzp = VTOZ(dvp);
- zfs_dirlock_t *dl;
dmu_tx_t *tx;
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
zilog_t *zilog;
uint64_t len = strlen(link);
int error;
- int zflg = ZNEW;
zfs_acl_ids_t acl_ids;
boolean_t fuid_dirtied;
uint64_t txtype = TX_SYMLINK;
@@ -4086,8 +4007,6 @@
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EILSEQ));
}
- if (flags & FIGNORECASE)
- zflg |= ZCILOOK;
if (len > MAXPATHLEN) {
ZFS_EXIT(zfsvfs);
@@ -4099,11 +4018,11 @@
ZFS_EXIT(zfsvfs);
return (error);
}
-top:
+
/*
* Attempt to lock directory; fail if entry already exists.
*/
- error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
+ error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
if (error) {
zfs_acl_ids_free(&acl_ids);
ZFS_EXIT(zfsvfs);
@@ -4112,7 +4031,6 @@
if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
zfs_acl_ids_free(&acl_ids);
- zfs_dirent_unlock(dl);
ZFS_EXIT(zfsvfs);
return (error);
}
@@ -4119,10 +4037,11 @@
if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
zfs_acl_ids_free(&acl_ids);
- zfs_dirent_unlock(dl);
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EDQUOT));
}
+
+ getnewvnode_reserve(1);
tx = dmu_tx_create(zfsvfs->z_os);
fuid_dirtied = zfsvfs->z_fuid_dirty;
dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
@@ -4136,16 +4055,11 @@
}
if (fuid_dirtied)
zfs_fuid_txhold(zfsvfs, tx);
- error = dmu_tx_assign(tx, TXG_NOWAIT);
+ error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
- zfs_dirent_unlock(dl);
- if (error == ERESTART) {
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- goto top;
- }
zfs_acl_ids_free(&acl_ids);
dmu_tx_abort(tx);
+ getnewvnode_drop_reserve();
ZFS_EXIT(zfsvfs);
return (error);
}
@@ -4159,13 +4073,11 @@
if (fuid_dirtied)
zfs_fuid_sync(zfsvfs, tx);
- mutex_enter(&zp->z_lock);
if (zp->z_is_sa)
error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
link, len, tx);
else
zfs_sa_symlink(zp, link, len, tx);
- mutex_exit(&zp->z_lock);
zp->z_size = len;
(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
@@ -4173,10 +4085,8 @@
/*
* Insert the new object into the directory.
*/
- (void) zfs_link_create(dl, zp, tx, ZNEW);
+ (void) zfs_link_create(dzp, name, zp, tx, ZNEW);
- if (flags & FIGNORECASE)
- txtype |= TX_CI;
zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
*vpp = ZTOV(zp);
@@ -4184,7 +4094,7 @@
dmu_tx_commit(tx);
- zfs_dirent_unlock(dl);
+ getnewvnode_drop_reserve();
if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
zil_commit(zilog, 0);
@@ -4220,13 +4130,11 @@
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
- mutex_enter(&zp->z_lock);
if (zp->z_is_sa)
error = sa_lookup_uio(zp->z_sa_hdl,
SA_ZPL_SYMLINK(zfsvfs), uio);
else
error = zfs_sa_readlink(zp, uio);
- mutex_exit(&zp->z_lock);
ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
@@ -4258,11 +4166,8 @@
znode_t *tzp, *szp;
zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
zilog_t *zilog;
- zfs_dirlock_t *dl;
dmu_tx_t *tx;
- vnode_t *realvp;
int error;
- int zf = ZNEW;
uint64_t parent;
uid_t owner;
@@ -4272,9 +4177,6 @@
ZFS_VERIFY_ZP(dzp);
zilog = zfsvfs->z_log;
- if (VOP_REALVP(svp, &realvp, ct) == 0)
- svp = realvp;
-
/*
* POSIX dictates that we return EPERM here.
* Better choices include ENOTSUP or EISDIR.
@@ -4284,14 +4186,14 @@
return (SET_ERROR(EPERM));
}
- if (svp->v_vfsp != tdvp->v_vfsp || zfsctl_is_node(svp)) {
+ szp = VTOZ(svp);
+ ZFS_VERIFY_ZP(szp);
+
+ if (szp->z_pflags & (ZFS_APPENDONLY | ZFS_IMMUTABLE | ZFS_READONLY)) {
ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EXDEV));
+ return (SET_ERROR(EPERM));
}
- szp = VTOZ(svp);
- ZFS_VERIFY_ZP(szp);
-
/* Prevent links to .zfs/shares files */
if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
@@ -4309,8 +4211,6 @@
ZFS_EXIT(zfsvfs);
return (SET_ERROR(EILSEQ));
}
- if (flags & FIGNORECASE)
- zf |= ZCILOOK;
/*
* We do not support links between attributes and non-attributes
@@ -4335,11 +4235,10 @@
return (error);
}
-top:
/*
* Attempt to lock directory; fail if entry already exists.
*/
- error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL);
+ error = zfs_dirent_lookup(dzp, name, &tzp, ZNEW);
if (error) {
ZFS_EXIT(zfsvfs);
return (error);
@@ -4350,32 +4249,22 @@
dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
zfs_sa_upgrade_txholds(tx, szp);
zfs_sa_upgrade_txholds(tx, dzp);
- error = dmu_tx_assign(tx, TXG_NOWAIT);
+ error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
- zfs_dirent_unlock(dl);
- if (error == ERESTART) {
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- goto top;
- }
dmu_tx_abort(tx);
ZFS_EXIT(zfsvfs);
return (error);
}
- error = zfs_link_create(dl, szp, tx, 0);
+ error = zfs_link_create(dzp, name, szp, tx, 0);
if (error == 0) {
uint64_t txtype = TX_LINK;
- if (flags & FIGNORECASE)
- txtype |= TX_CI;
zfs_log_link(zilog, tx, txtype, dzp, szp, name);
}
dmu_tx_commit(tx);
- zfs_dirent_unlock(dl);
-
if (error == 0) {
vnevent_link(svp, ct);
}
@@ -4387,243 +4276,8 @@
return (error);
}
-#ifdef sun
-/*
- * zfs_null_putapage() is used when the file system has been force
- * unmounted. It just drops the pages.
- */
-/* ARGSUSED */
-static int
-zfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
- size_t *lenp, int flags, cred_t *cr)
-{
- pvn_write_done(pp, B_INVAL|B_FORCE|B_ERROR);
- return (0);
-}
-/*
- * Push a page out to disk, klustering if possible.
- *
- * IN: vp - file to push page to.
- * pp - page to push.
- * flags - additional flags.
- * cr - credentials of caller.
- *
- * OUT: offp - start of range pushed.
- * lenp - len of range pushed.
- *
- * RETURN: 0 on success, error code on failure.
- *
- * NOTE: callers must have locked the page to be pushed. On
- * exit, the page (and all other pages in the kluster) must be
- * unlocked.
- */
-/* ARGSUSED */
-static int
-zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
- size_t *lenp, int flags, cred_t *cr)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- dmu_tx_t *tx;
- u_offset_t off, koff;
- size_t len, klen;
- int err;
-
- off = pp->p_offset;
- len = PAGESIZE;
- /*
- * If our blocksize is bigger than the page size, try to kluster
- * multiple pages so that we write a full block (thus avoiding
- * a read-modify-write).
- */
- if (off < zp->z_size && zp->z_blksz > PAGESIZE) {
- klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE);
- koff = ISP2(klen) ? P2ALIGN(off, (u_offset_t)klen) : 0;
- ASSERT(koff <= zp->z_size);
- if (koff + klen > zp->z_size)
- klen = P2ROUNDUP(zp->z_size - koff, (uint64_t)PAGESIZE);
- pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags);
- }
- ASSERT3U(btop(len), ==, btopr(len));
-
- /*
- * Can't push pages past end-of-file.
- */
- if (off >= zp->z_size) {
- /* ignore all pages */
- err = 0;
- goto out;
- } else if (off + len > zp->z_size) {
- int npages = btopr(zp->z_size - off);
- page_t *trunc;
-
- page_list_break(&pp, &trunc, npages);
- /* ignore pages past end of file */
- if (trunc)
- pvn_write_done(trunc, flags);
- len = zp->z_size - off;
- }
-
- if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
- zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
- err = SET_ERROR(EDQUOT);
- goto out;
- }
-top:
- tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_write(tx, zp->z_id, off, len);
-
- dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
- zfs_sa_upgrade_txholds(tx, zp);
- err = dmu_tx_assign(tx, TXG_NOWAIT);
- if (err != 0) {
- if (err == ERESTART) {
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- goto top;
- }
- dmu_tx_abort(tx);
- goto out;
- }
-
- if (zp->z_blksz <= PAGESIZE) {
- caddr_t va = zfs_map_page(pp, S_READ);
- ASSERT3U(len, <=, PAGESIZE);
- dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx);
- zfs_unmap_page(pp, va);
- } else {
- err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx);
- }
-
- if (err == 0) {
- uint64_t mtime[2], ctime[2];
- sa_bulk_attr_t bulk[3];
- int count = 0;
-
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
- &mtime, 16);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
- &ctime, 16);
- SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
- &zp->z_pflags, 8);
- zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
- B_TRUE);
- zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
- }
- dmu_tx_commit(tx);
-
-out:
- pvn_write_done(pp, (err ? B_ERROR : 0) | flags);
- if (offp)
- *offp = off;
- if (lenp)
- *lenp = len;
-
- return (err);
-}
-
-/*
- * Copy the portion of the file indicated from pages into the file.
- * The pages are stored in a page list attached to the files vnode.
- *
- * IN: vp - vnode of file to push page data to.
- * off - position in file to put data.
- * len - amount of data to write.
- * flags - flags to control the operation.
- * cr - credentials of caller.
- * ct - caller context.
- *
- * RETURN: 0 on success, error code on failure.
- *
- * Timestamps:
- * vp - ctime|mtime updated
- */
/*ARGSUSED*/
-static int
-zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
- caller_context_t *ct)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- page_t *pp;
- size_t io_len;
- u_offset_t io_off;
- uint_t blksz;
- rl_t *rl;
- int error = 0;
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- /*
- * Align this request to the file block size in case we kluster.
- * XXX - this can result in pretty aggresive locking, which can
- * impact simultanious read/write access. One option might be
- * to break up long requests (len == 0) into block-by-block
- * operations to get narrower locking.
- */
- blksz = zp->z_blksz;
- if (ISP2(blksz))
- io_off = P2ALIGN_TYPED(off, blksz, u_offset_t);
- else
- io_off = 0;
- if (len > 0 && ISP2(blksz))
- io_len = P2ROUNDUP_TYPED(len + (off - io_off), blksz, size_t);
- else
- io_len = 0;
-
- if (io_len == 0) {
- /*
- * Search the entire vp list for pages >= io_off.
- */
- rl = zfs_range_lock(zp, io_off, UINT64_MAX, RL_WRITER);
- error = pvn_vplist_dirty(vp, io_off, zfs_putapage, flags, cr);
- goto out;
- }
- rl = zfs_range_lock(zp, io_off, io_len, RL_WRITER);
-
- if (off > zp->z_size) {
- /* past end of file */
- zfs_range_unlock(rl);
- ZFS_EXIT(zfsvfs);
- return (0);
- }
-
- len = MIN(io_len, P2ROUNDUP(zp->z_size, PAGESIZE) - io_off);
-
- for (off = io_off; io_off < off + len; io_off += io_len) {
- if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
- pp = page_lookup(vp, io_off,
- (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED);
- } else {
- pp = page_lookup_nowait(vp, io_off,
- (flags & B_FREE) ? SE_EXCL : SE_SHARED);
- }
-
- if (pp != NULL && pvn_getdirty(pp, flags)) {
- int err;
-
- /*
- * Found a dirty page to push
- */
- err = zfs_putapage(vp, pp, &io_off, &io_len, flags, cr);
- if (err)
- error = err;
- } else {
- io_len = PAGESIZE;
- }
- }
-out:
- zfs_range_unlock(rl);
- if ((flags & B_ASYNC) == 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
- zil_commit(zfsvfs->z_log, zp->z_id);
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-#endif /* sun */
-
-/*ARGSUSED*/
void
zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
{
@@ -4638,21 +4292,18 @@
* suspend/resume and this file no longer exists.
*/
rw_exit(&zfsvfs->z_teardown_inactive_lock);
- vrecycle(vp, curthread);
+ vrecycle(vp);
return;
}
- mutex_enter(&zp->z_lock);
if (zp->z_unlinked) {
/*
* Fast path to recycle a vnode of a removed file.
*/
- mutex_exit(&zp->z_lock);
rw_exit(&zfsvfs->z_teardown_inactive_lock);
- vrecycle(vp, curthread);
+ vrecycle(vp);
return;
}
- mutex_exit(&zp->z_lock);
if (zp->z_atime_dirty && zp->z_unlinked == 0) {
dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
@@ -4663,11 +4314,9 @@
if (error) {
dmu_tx_abort(tx);
} else {
- mutex_enter(&zp->z_lock);
(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
(void *)&zp->z_atime, sizeof (zp->z_atime), tx);
zp->z_atime_dirty = 0;
- mutex_exit(&zp->z_lock);
dmu_tx_commit(tx);
}
}
@@ -4674,424 +4323,7 @@
rw_exit(&zfsvfs->z_teardown_inactive_lock);
}
-#ifdef sun
-/*
- * Bounds-check the seek operation.
- *
- * IN: vp - vnode seeking within
- * ooff - old file offset
- * noffp - pointer to new file offset
- * ct - caller context
- *
- * RETURN: 0 on success, EINVAL if new offset invalid.
- */
-/* ARGSUSED */
-static int
-zfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp,
- caller_context_t *ct)
-{
- if (vp->v_type == VDIR)
- return (0);
- return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
-}
-/*
- * Pre-filter the generic locking function to trap attempts to place
- * a mandatory lock on a memory mapped file.
- */
-static int
-zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
- flk_callback_t *flk_cbp, cred_t *cr, caller_context_t *ct)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- /*
- * We are following the UFS semantics with respect to mapcnt
- * here: If we see that the file is mapped already, then we will
- * return an error, but we don't worry about races between this
- * function and zfs_map().
- */
- if (zp->z_mapcnt > 0 && MANDMODE(zp->z_mode)) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EAGAIN));
- }
- ZFS_EXIT(zfsvfs);
- return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
-}
-
-/*
- * If we can't find a page in the cache, we will create a new page
- * and fill it with file data. For efficiency, we may try to fill
- * multiple pages at once (klustering) to fill up the supplied page
- * list. Note that the pages to be filled are held with an exclusive
- * lock to prevent access by other threads while they are being filled.
- */
-static int
-zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
- caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw)
-{
- znode_t *zp = VTOZ(vp);
- page_t *pp, *cur_pp;
- objset_t *os = zp->z_zfsvfs->z_os;
- u_offset_t io_off, total;
- size_t io_len;
- int err;
-
- if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) {
- /*
- * We only have a single page, don't bother klustering
- */
- io_off = off;
- io_len = PAGESIZE;
- pp = page_create_va(vp, io_off, io_len,
- PG_EXCL | PG_WAIT, seg, addr);
- } else {
- /*
- * Try to find enough pages to fill the page list
- */
- pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
- &io_len, off, plsz, 0);
- }
- if (pp == NULL) {
- /*
- * The page already exists, nothing to do here.
- */
- *pl = NULL;
- return (0);
- }
-
- /*
- * Fill the pages in the kluster.
- */
- cur_pp = pp;
- for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
- caddr_t va;
-
- ASSERT3U(io_off, ==, cur_pp->p_offset);
- va = zfs_map_page(cur_pp, S_WRITE);
- err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va,
- DMU_READ_PREFETCH);
- zfs_unmap_page(cur_pp, va);
- if (err) {
- /* On error, toss the entire kluster */
- pvn_read_done(pp, B_ERROR);
- /* convert checksum errors into IO errors */
- if (err == ECKSUM)
- err = SET_ERROR(EIO);
- return (err);
- }
- cur_pp = cur_pp->p_next;
- }
-
- /*
- * Fill in the page list array from the kluster starting
- * from the desired offset `off'.
- * NOTE: the page list will always be null terminated.
- */
- pvn_plist_init(pp, pl, plsz, off, io_len, rw);
- ASSERT(pl == NULL || (*pl)->p_offset == off);
-
- return (0);
-}
-
-/*
- * Return pointers to the pages for the file region [off, off + len]
- * in the pl array. If plsz is greater than len, this function may
- * also return page pointers from after the specified region
- * (i.e. the region [off, off + plsz]). These additional pages are
- * only returned if they are already in the cache, or were created as
- * part of a klustered read.
- *
- * IN: vp - vnode of file to get data from.
- * off - position in file to get data from.
- * len - amount of data to retrieve.
- * plsz - length of provided page list.
- * seg - segment to obtain pages for.
- * addr - virtual address of fault.
- * rw - mode of created pages.
- * cr - credentials of caller.
- * ct - caller context.
- *
- * OUT: protp - protection mode of created pages.
- * pl - list of pages created.
- *
- * RETURN: 0 on success, error code on failure.
- *
- * Timestamps:
- * vp - atime updated
- */
-/* ARGSUSED */
-static int
-zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
- page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
- enum seg_rw rw, cred_t *cr, caller_context_t *ct)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- page_t **pl0 = pl;
- int err = 0;
-
- /* we do our own caching, faultahead is unnecessary */
- if (pl == NULL)
- return (0);
- else if (len > plsz)
- len = plsz;
- else
- len = P2ROUNDUP(len, PAGESIZE);
- ASSERT(plsz >= len);
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- if (protp)
- *protp = PROT_ALL;
-
- /*
- * Loop through the requested range [off, off + len) looking
- * for pages. If we don't find a page, we will need to create
- * a new page and fill it with data from the file.
- */
- while (len > 0) {
- if (*pl = page_lookup(vp, off, SE_SHARED))
- *(pl+1) = NULL;
- else if (err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw))
- goto out;
- while (*pl) {
- ASSERT3U((*pl)->p_offset, ==, off);
- off += PAGESIZE;
- addr += PAGESIZE;
- if (len > 0) {
- ASSERT3U(len, >=, PAGESIZE);
- len -= PAGESIZE;
- }
- ASSERT3U(plsz, >=, PAGESIZE);
- plsz -= PAGESIZE;
- pl++;
- }
- }
-
- /*
- * Fill out the page array with any pages already in the cache.
- */
- while (plsz > 0 &&
- (*pl++ = page_lookup_nowait(vp, off, SE_SHARED))) {
- off += PAGESIZE;
- plsz -= PAGESIZE;
- }
-out:
- if (err) {
- /*
- * Release any pages we have previously locked.
- */
- while (pl > pl0)
- page_unlock(*--pl);
- } else {
- ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
- }
-
- *pl = NULL;
-
- ZFS_EXIT(zfsvfs);
- return (err);
-}
-
-/*
- * Request a memory map for a section of a file. This code interacts
- * with common code and the VM system as follows:
- *
- * - common code calls mmap(), which ends up in smmap_common()
- * - this calls VOP_MAP(), which takes you into (say) zfs
- * - zfs_map() calls as_map(), passing segvn_create() as the callback
- * - segvn_create() creates the new segment and calls VOP_ADDMAP()
- * - zfs_addmap() updates z_mapcnt
- */
-/*ARGSUSED*/
-static int
-zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
- size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
- caller_context_t *ct)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- segvn_crargs_t vn_a;
- int error;
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- if ((prot & PROT_WRITE) && (zp->z_pflags &
- (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EPERM));
- }
-
- if ((prot & (PROT_READ | PROT_EXEC)) &&
- (zp->z_pflags & ZFS_AV_QUARANTINED)) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EACCES));
- }
-
- if (vp->v_flag & VNOMAP) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(ENOSYS));
- }
-
- if (off < 0 || len > MAXOFFSET_T - off) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(ENXIO));
- }
-
- if (vp->v_type != VREG) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(ENODEV));
- }
-
- /*
- * If file is locked, disallow mapping.
- */
- if (MANDMODE(zp->z_mode) && vn_has_flocks(vp)) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EAGAIN));
- }
-
- as_rangelock(as);
- error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
- if (error != 0) {
- as_rangeunlock(as);
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- vn_a.vp = vp;
- vn_a.offset = (u_offset_t)off;
- vn_a.type = flags & MAP_TYPE;
- vn_a.prot = prot;
- vn_a.maxprot = maxprot;
- vn_a.cred = cr;
- vn_a.amp = NULL;
- vn_a.flags = flags & ~MAP_TYPE;
- vn_a.szc = 0;
- vn_a.lgrp_mem_policy_flags = 0;
-
- error = as_map(as, *addrp, len, segvn_create, &vn_a);
-
- as_rangeunlock(as);
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-
-/* ARGSUSED */
-static int
-zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
- size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
- caller_context_t *ct)
-{
- uint64_t pages = btopr(len);
-
- atomic_add_64(&VTOZ(vp)->z_mapcnt, pages);
- return (0);
-}
-
-/*
- * The reason we push dirty pages as part of zfs_delmap() is so that we get a
- * more accurate mtime for the associated file. Since we don't have a way of
- * detecting when the data was actually modified, we have to resort to
- * heuristics. If an explicit msync() is done, then we mark the mtime when the
- * last page is pushed. The problem occurs when the msync() call is omitted,
- * which by far the most common case:
- *
- * open()
- * mmap()
- * <modify memory>
- * munmap()
- * close()
- * <time lapse>
- * putpage() via fsflush
- *
- * If we wait until fsflush to come along, we can have a modification time that
- * is some arbitrary point in the future. In order to prevent this in the
- * common case, we flush pages whenever a (MAP_SHARED, PROT_WRITE) mapping is
- * torn down.
- */
-/* ARGSUSED */
-static int
-zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
- size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr,
- caller_context_t *ct)
-{
- uint64_t pages = btopr(len);
-
- ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages);
- atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages);
-
- if ((flags & MAP_SHARED) && (prot & PROT_WRITE) &&
- vn_has_cached_data(vp))
- (void) VOP_PUTPAGE(vp, off, len, B_ASYNC, cr, ct);
-
- return (0);
-}
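
Editor's aside: the removed comment above explains why dirty pages were pushed from zfs_delmap(): without that flush, a shared writable mapping modified and torn down without msync() would not update mtime until fsflush ran at some arbitrary later time. A small self-contained userland program (an illustration, not part of the commit; the path is hypothetical) showing the common sequence the comment describes:

#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int
main(void)
{
	int fd = open("/tmp/example", O_RDWR | O_CREAT, 0644);
	if (fd < 0)
		return (1);
	if (ftruncate(fd, 4096) != 0)
		return (1);
	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		return (1);
	memcpy(p, "modified through the mapping", 28);
	munmap(p, 4096);	/* no msync(); a delmap-time flush covers this */
	close(fd);
	return (0);
}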
-
-/*
- * Free or allocate space in a file. Currently, this function only
- * supports the `F_FREESP' command. However, this command is somewhat
- * misnamed, as its functionality includes the ability to allocate as
- * well as free space.
- *
- * IN: vp - vnode of file to free data in.
- * cmd - action to take (only F_FREESP supported).
- * bfp - section of file to free/alloc.
- * flag - current file open mode flags.
- * offset - current file offset.
- * cr - credentials of caller [UNUSED].
- * ct - caller context.
- *
- * RETURN: 0 on success, error code on failure.
- *
- * Timestamps:
- * vp - ctime|mtime updated
- */
-/* ARGSUSED */
-static int
-zfs_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag,
- offset_t offset, cred_t *cr, caller_context_t *ct)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- uint64_t off, len;
- int error;
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
-
- if (cmd != F_FREESP) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EINVAL));
- }
-
- if (error = convoff(vp, bfp, 0, offset)) {
- ZFS_EXIT(zfsvfs);
- return (error);
- }
-
- if (bfp->l_len < 0) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EINVAL));
- }
-
- off = bfp->l_start;
- len = bfp->l_len; /* 0 means from off to end of file */
-
- error = zfs_freesp(zp, off, len, flag, TRUE);
-
- ZFS_EXIT(zfsvfs);
- return (error);
-}
-#endif /* sun */
-
CTASSERT(sizeof(struct zfid_short) <= sizeof(struct fid));
CTASSERT(sizeof(struct zfid_long) <= sizeof(struct fid));
@@ -5167,7 +4399,6 @@
{
znode_t *zp, *xzp;
zfsvfs_t *zfsvfs;
- zfs_dirlock_t *dl;
int error;
switch (cmd) {
@@ -5178,7 +4409,7 @@
case _PC_FILESIZEBITS:
*valp = 64;
return (0);
-#ifdef sun
+#ifdef illumos
case _PC_XATTR_EXISTS:
zp = VTOZ(vp);
zfsvfs = zp->z_zfsvfs;
@@ -5185,13 +4416,12 @@
ZFS_ENTER(zfsvfs);
ZFS_VERIFY_ZP(zp);
*valp = 0;
- error = zfs_dirent_lock(&dl, zp, "", &xzp,
- ZXATTR | ZEXISTS | ZSHARED, NULL, NULL);
+ error = zfs_dirent_lookup(zp, "", &xzp,
+ ZXATTR | ZEXISTS | ZSHARED);
if (error == 0) {
- zfs_dirent_unlock(dl);
if (!zfs_dirempty(xzp))
*valp = 1;
- VN_RELE(ZTOV(xzp));
+ vrele(ZTOV(xzp));
} else if (error == ENOENT) {
/*
* If there aren't extended attributes, it's the
@@ -5216,16 +4446,16 @@
case _PC_ACL_ENABLED:
*valp = _ACL_ACE_ENABLED;
return (0);
-#endif /* sun */
+#endif /* illumos */
case _PC_MIN_HOLE_SIZE:
*valp = (int)SPA_MINBLOCKSIZE;
return (0);
-#ifdef sun
+#ifdef illumos
case _PC_TIMESTAMP_RESOLUTION:
/* nanosecond timestamp resolution */
*valp = 1L;
return (0);
-#endif /* sun */
+#endif
case _PC_ACL_EXTENDED:
*valp = 0;
return (0);
@@ -5262,7 +4492,7 @@
}
/*ARGSUSED*/
-static int
+int
zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
caller_context_t *ct)
{
@@ -5284,355 +4514,7 @@
return (error);
}
-#ifdef sun
-/*
- * The smallest read we may consider to loan out an arcbuf.
- * This must be a power of 2.
- */
-int zcr_blksz_min = (1 << 10); /* 1K */
-/*
- * If set to less than the file block size, allow loaning out of an
- * arcbuf for a partial block read. This must be a power of 2.
- */
-int zcr_blksz_max = (1 << 17); /* 128K */
-
-/*ARGSUSED*/
static int
-zfs_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr,
- caller_context_t *ct)
-{
- znode_t *zp = VTOZ(vp);
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- int max_blksz = zfsvfs->z_max_blksz;
- uio_t *uio = &xuio->xu_uio;
- ssize_t size = uio->uio_resid;
- offset_t offset = uio->uio_loffset;
- int blksz;
- int fullblk, i;
- arc_buf_t *abuf;
- ssize_t maxsize;
- int preamble, postamble;
-
- if (xuio->xu_type != UIOTYPE_ZEROCOPY)
- return (SET_ERROR(EINVAL));
-
- ZFS_ENTER(zfsvfs);
- ZFS_VERIFY_ZP(zp);
- switch (ioflag) {
- case UIO_WRITE:
- /*
- * Loan out an arc_buf for write if write size is bigger than
- * max_blksz, and the file's block size is also max_blksz.
- */
- blksz = max_blksz;
- if (size < blksz || zp->z_blksz != blksz) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EINVAL));
- }
- /*
- * Caller requests buffers for write before knowing where the
- * write offset might be (e.g. NFS TCP write).
- */
- if (offset == -1) {
- preamble = 0;
- } else {
- preamble = P2PHASE(offset, blksz);
- if (preamble) {
- preamble = blksz - preamble;
- size -= preamble;
- }
- }
-
- postamble = P2PHASE(size, blksz);
- size -= postamble;
-
- fullblk = size / blksz;
- (void) dmu_xuio_init(xuio,
- (preamble != 0) + fullblk + (postamble != 0));
- DTRACE_PROBE3(zfs_reqzcbuf_align, int, preamble,
- int, postamble, int,
- (preamble != 0) + fullblk + (postamble != 0));
-
- /*
- * Have to fix iov base/len for partial buffers. They
- * currently represent full arc_buf's.
- */
- if (preamble) {
- /* data begins in the middle of the arc_buf */
- abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
- blksz);
- ASSERT(abuf);
- (void) dmu_xuio_add(xuio, abuf,
- blksz - preamble, preamble);
- }
-
- for (i = 0; i < fullblk; i++) {
- abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
- blksz);
- ASSERT(abuf);
- (void) dmu_xuio_add(xuio, abuf, 0, blksz);
- }
-
- if (postamble) {
- /* data ends in the middle of the arc_buf */
- abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
- blksz);
- ASSERT(abuf);
- (void) dmu_xuio_add(xuio, abuf, 0, postamble);
- }
- break;
- case UIO_READ:
- /*
- * Loan out an arc_buf for read if the read size is larger than
- * the current file block size. Block alignment is not
- * considered. Partial arc_buf will be loaned out for read.
- */
- blksz = zp->z_blksz;
- if (blksz < zcr_blksz_min)
- blksz = zcr_blksz_min;
- if (blksz > zcr_blksz_max)
- blksz = zcr_blksz_max;
- /* avoid potential complexity of dealing with it */
- if (blksz > max_blksz) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EINVAL));
- }
-
- maxsize = zp->z_size - uio->uio_loffset;
- if (size > maxsize)
- size = maxsize;
-
- if (size < blksz || vn_has_cached_data(vp)) {
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EINVAL));
- }
- break;
- default:
- ZFS_EXIT(zfsvfs);
- return (SET_ERROR(EINVAL));
- }
-
- uio->uio_extflg = UIO_XUIO;
- XUIO_XUZC_RW(xuio) = ioflag;
- ZFS_EXIT(zfsvfs);
- return (0);
-}
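
Editor's aside: the removed zfs_reqzcbuf() slices a zero-copy write into a preamble (the partial block before the first aligned boundary), some number of full blocks, and a postamble (the trailing partial block), loaning one arc_buf per piece. The worked example below redoes that arithmetic; P2PHASE is reimplemented here under the assumption that it is a power-of-two modulus, since its definition is not part of this diff.

#include <stdio.h>
#include <stdint.h>

#define	P2PHASE(x, align)	((x) & ((align) - 1))	/* assumed definition */

int
main(void)
{
	uint64_t blksz = 128 * 1024;		/* file block size */
	uint64_t offset = 100 * 1024;		/* write starts mid-block */
	uint64_t size = 512 * 1024;		/* total write size */

	uint64_t preamble = P2PHASE(offset, blksz);
	if (preamble != 0) {
		preamble = blksz - preamble;	/* bytes up to next boundary */
		size -= preamble;
	}
	uint64_t postamble = P2PHASE(size, blksz);
	size -= postamble;
	uint64_t fullblk = size / blksz;

	/* For 512K at offset 100K with 128K blocks:
	 * preamble = 28K, fullblk = 3, postamble = 100K -> 5 arc_bufs. */
	printf("preamble=%juK full=%ju postamble=%juK\n",
	    (uintmax_t)(preamble / 1024), (uintmax_t)fullblk,
	    (uintmax_t)(postamble / 1024));
	return (0);
}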
-
-/*ARGSUSED*/
-static int
-zfs_retzcbuf(vnode_t *vp, xuio_t *xuio, cred_t *cr, caller_context_t *ct)
-{
- int i;
- arc_buf_t *abuf;
- int ioflag = XUIO_XUZC_RW(xuio);
-
- ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY);
-
- i = dmu_xuio_cnt(xuio);
- while (i-- > 0) {
- abuf = dmu_xuio_arcbuf(xuio, i);
- /*
- * if abuf == NULL, it must be a write buffer
- * that has been returned in zfs_write().
- */
- if (abuf)
- dmu_return_arcbuf(abuf);
- ASSERT(abuf || ioflag == UIO_WRITE);
- }
-
- dmu_xuio_fini(xuio);
- return (0);
-}
-
-/*
- * Predeclare these here so that the compiler assumes that
- * this is an "old style" function declaration that does
- * not include arguments => we won't get type mismatch errors
- * in the initializations that follow.
- */
-static int zfs_inval();
-static int zfs_isdir();
-
-static int
-zfs_inval()
-{
- return (SET_ERROR(EINVAL));
-}
-
-static int
-zfs_isdir()
-{
- return (SET_ERROR(EISDIR));
-}
-/*
- * Directory vnode operations template
- */
-vnodeops_t *zfs_dvnodeops;
-const fs_operation_def_t zfs_dvnodeops_template[] = {
- VOPNAME_OPEN, { .vop_open = zfs_open },
- VOPNAME_CLOSE, { .vop_close = zfs_close },
- VOPNAME_READ, { .error = zfs_isdir },
- VOPNAME_WRITE, { .error = zfs_isdir },
- VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl },
- VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
- VOPNAME_SETATTR, { .vop_setattr = zfs_setattr },
- VOPNAME_ACCESS, { .vop_access = zfs_access },
- VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup },
- VOPNAME_CREATE, { .vop_create = zfs_create },
- VOPNAME_REMOVE, { .vop_remove = zfs_remove },
- VOPNAME_LINK, { .vop_link = zfs_link },
- VOPNAME_RENAME, { .vop_rename = zfs_rename },
- VOPNAME_MKDIR, { .vop_mkdir = zfs_mkdir },
- VOPNAME_RMDIR, { .vop_rmdir = zfs_rmdir },
- VOPNAME_READDIR, { .vop_readdir = zfs_readdir },
- VOPNAME_SYMLINK, { .vop_symlink = zfs_symlink },
- VOPNAME_FSYNC, { .vop_fsync = zfs_fsync },
- VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
- VOPNAME_FID, { .vop_fid = zfs_fid },
- VOPNAME_SEEK, { .vop_seek = zfs_seek },
- VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
- VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr },
- VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr },
- VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
- NULL, NULL
-};
-
-/*
- * Regular file vnode operations template
- */
-vnodeops_t *zfs_fvnodeops;
-const fs_operation_def_t zfs_fvnodeops_template[] = {
- VOPNAME_OPEN, { .vop_open = zfs_open },
- VOPNAME_CLOSE, { .vop_close = zfs_close },
- VOPNAME_READ, { .vop_read = zfs_read },
- VOPNAME_WRITE, { .vop_write = zfs_write },
- VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl },
- VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
- VOPNAME_SETATTR, { .vop_setattr = zfs_setattr },
- VOPNAME_ACCESS, { .vop_access = zfs_access },
- VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup },
- VOPNAME_RENAME, { .vop_rename = zfs_rename },
- VOPNAME_FSYNC, { .vop_fsync = zfs_fsync },
- VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
- VOPNAME_FID, { .vop_fid = zfs_fid },
- VOPNAME_SEEK, { .vop_seek = zfs_seek },
- VOPNAME_FRLOCK, { .vop_frlock = zfs_frlock },
- VOPNAME_SPACE, { .vop_space = zfs_space },
- VOPNAME_GETPAGE, { .vop_getpage = zfs_getpage },
- VOPNAME_PUTPAGE, { .vop_putpage = zfs_putpage },
- VOPNAME_MAP, { .vop_map = zfs_map },
- VOPNAME_ADDMAP, { .vop_addmap = zfs_addmap },
- VOPNAME_DELMAP, { .vop_delmap = zfs_delmap },
- VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
- VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr },
- VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr },
- VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
- VOPNAME_REQZCBUF, { .vop_reqzcbuf = zfs_reqzcbuf },
- VOPNAME_RETZCBUF, { .vop_retzcbuf = zfs_retzcbuf },
- NULL, NULL
-};
-
-/*
- * Symbolic link vnode operations template
- */
-vnodeops_t *zfs_symvnodeops;
-const fs_operation_def_t zfs_symvnodeops_template[] = {
- VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
- VOPNAME_SETATTR, { .vop_setattr = zfs_setattr },
- VOPNAME_ACCESS, { .vop_access = zfs_access },
- VOPNAME_RENAME, { .vop_rename = zfs_rename },
- VOPNAME_READLINK, { .vop_readlink = zfs_readlink },
- VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
- VOPNAME_FID, { .vop_fid = zfs_fid },
- VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
- VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
- NULL, NULL
-};
-
-/*
- * special share hidden files vnode operations template
- */
-vnodeops_t *zfs_sharevnodeops;
-const fs_operation_def_t zfs_sharevnodeops_template[] = {
- VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
- VOPNAME_ACCESS, { .vop_access = zfs_access },
- VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
- VOPNAME_FID, { .vop_fid = zfs_fid },
- VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
- VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr },
- VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr },
- VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
- NULL, NULL
-};
-
-/*
- * Extended attribute directory vnode operations template
- *
- * This template is identical to the directory vnodes
- * operation template except for restricted operations:
- * VOP_MKDIR()
- * VOP_SYMLINK()
- *
- * Note that there are other restrictions embedded in:
- * zfs_create() - restrict type to VREG
- * zfs_link() - no links into/out of attribute space
- * zfs_rename() - no moves into/out of attribute space
- */
-vnodeops_t *zfs_xdvnodeops;
-const fs_operation_def_t zfs_xdvnodeops_template[] = {
- VOPNAME_OPEN, { .vop_open = zfs_open },
- VOPNAME_CLOSE, { .vop_close = zfs_close },
- VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl },
- VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
- VOPNAME_SETATTR, { .vop_setattr = zfs_setattr },
- VOPNAME_ACCESS, { .vop_access = zfs_access },
- VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup },
- VOPNAME_CREATE, { .vop_create = zfs_create },
- VOPNAME_REMOVE, { .vop_remove = zfs_remove },
- VOPNAME_LINK, { .vop_link = zfs_link },
- VOPNAME_RENAME, { .vop_rename = zfs_rename },
- VOPNAME_MKDIR, { .error = zfs_inval },
- VOPNAME_RMDIR, { .vop_rmdir = zfs_rmdir },
- VOPNAME_READDIR, { .vop_readdir = zfs_readdir },
- VOPNAME_SYMLINK, { .error = zfs_inval },
- VOPNAME_FSYNC, { .vop_fsync = zfs_fsync },
- VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
- VOPNAME_FID, { .vop_fid = zfs_fid },
- VOPNAME_SEEK, { .vop_seek = zfs_seek },
- VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
- VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr },
- VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr },
- VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
- NULL, NULL
-};
-
-/*
- * Error vnode operations template
- */
-vnodeops_t *zfs_evnodeops;
-const fs_operation_def_t zfs_evnodeops_template[] = {
- VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
- VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
- NULL, NULL
-};
-#endif /* sun */
-
-static int
-ioflags(int ioflags)
-{
- int flags = 0;
-
- if (ioflags & IO_APPEND)
- flags |= FAPPEND;
- if (ioflags & IO_NDELAY)
- flags |= FNONBLOCK;
- if (ioflags & IO_SYNC)
- flags |= (FSYNC | FDSYNC | FRSYNC);
-
- return (flags);
-}
-
-static int
zfs_getpages(struct vnode *vp, vm_page_t *m, int count, int reqpage)
{
znode_t *zp = VTOZ(vp);
@@ -5679,7 +4561,7 @@
mfirst = m[reqstart];
mlast = m[reqstart + reqsize - 1];
- VM_OBJECT_LOCK(object);
+ zfs_vmobject_wlock(object);
for (i = 0; i < reqstart; i++) {
vm_page_lock(m[i]);
@@ -5695,9 +4577,9 @@
if (mreq->valid && reqsize == 1) {
if (mreq->valid != VM_PAGE_BITS_ALL)
vm_page_zero_invalid(mreq, TRUE);
- VM_OBJECT_UNLOCK(object);
+ zfs_vmobject_wunlock(object);
ZFS_EXIT(zfsvfs);
- return (VM_PAGER_OK);
+ return (zfs_vm_pagerret_ok);
}
PCPU_INC(cnt.v_vnodein);
@@ -5711,9 +4593,9 @@
vm_page_unlock(m[i]);
}
}
- VM_OBJECT_UNLOCK(object);
+ zfs_vmobject_wunlock(object);
ZFS_EXIT(zfsvfs);
- return (VM_PAGER_BAD);
+ return (zfs_vm_pagerret_bad);
}
lsize = PAGE_SIZE;
@@ -5720,7 +4602,7 @@
if (IDX_TO_OFF(mlast->pindex) + lsize > object->un_pager.vnp.vnp_size)
lsize = object->un_pager.vnp.vnp_size - IDX_TO_OFF(mlast->pindex);
- VM_OBJECT_UNLOCK(object);
+ zfs_vmobject_wunlock(object);
for (i = reqstart; i < reqstart + reqsize; i++) {
size = PAGE_SIZE;
@@ -5736,7 +4618,7 @@
break;
}
- VM_OBJECT_LOCK(object);
+ zfs_vmobject_wlock(object);
for (i = reqstart; i < reqstart + reqsize; i++) {
if (!error)
@@ -5746,11 +4628,11 @@
vm_page_readahead_finish(m[i]);
}
- VM_OBJECT_UNLOCK(object);
+ zfs_vmobject_wunlock(object);
ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
ZFS_EXIT(zfsvfs);
- return (error ? VM_PAGER_ERROR : VM_PAGER_OK);
+ return (error ? zfs_vm_pagerret_error : zfs_vm_pagerret_ok);
}
static int
@@ -5768,6 +4650,162 @@
}
static int
+zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags,
+ int *rtvals)
+{
+ znode_t *zp = VTOZ(vp);
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ rl_t *rl;
+ dmu_tx_t *tx;
+ struct sf_buf *sf;
+ vm_object_t object;
+ vm_page_t m;
+ caddr_t va;
+ size_t tocopy;
+ size_t lo_len;
+ vm_ooffset_t lo_off;
+ vm_ooffset_t off;
+ uint_t blksz;
+ int ncount;
+ int pcount;
+ int err;
+ int i;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ object = vp->v_object;
+ pcount = btoc(len);
+ ncount = pcount;
+
+ KASSERT(ma[0]->object == object, ("mismatching object"));
+ KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length"));
+
+ for (i = 0; i < pcount; i++)
+ rtvals[i] = zfs_vm_pagerret_error;
+
+ off = IDX_TO_OFF(ma[0]->pindex);
+ blksz = zp->z_blksz;
+ lo_off = rounddown(off, blksz);
+ lo_len = roundup(len + (off - lo_off), blksz);
+ rl = zfs_range_lock(zp, lo_off, lo_len, RL_WRITER);
+
+ zfs_vmobject_wlock(object);
+ if (len + off > object->un_pager.vnp.vnp_size) {
+ if (object->un_pager.vnp.vnp_size > off) {
+ int pgoff;
+
+ len = object->un_pager.vnp.vnp_size - off;
+ ncount = btoc(len);
+ if ((pgoff = (int)len & PAGE_MASK) != 0) {
+ /*
+ * If the object is locked and the following
+ * conditions hold, then the page's dirty
+ * field cannot be concurrently changed by a
+ * pmap operation.
+ */
+ m = ma[ncount - 1];
+ vm_page_assert_sbusied(m);
+ KASSERT(!pmap_page_is_write_mapped(m),
+ ("zfs_putpages: page %p is not read-only", m));
+ vm_page_clear_dirty(m, pgoff, PAGE_SIZE -
+ pgoff);
+ }
+ } else {
+ len = 0;
+ ncount = 0;
+ }
+ if (ncount < pcount) {
+ for (i = ncount; i < pcount; i++) {
+ rtvals[i] = zfs_vm_pagerret_bad;
+ }
+ }
+ }
+ zfs_vmobject_wunlock(object);
+
+ if (ncount == 0)
+ goto out;
+
+ if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
+ zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
+ goto out;
+ }
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_write(tx, zp->z_id, off, len);
+
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err != 0) {
+ dmu_tx_abort(tx);
+ goto out;
+ }
+
+ if (zp->z_blksz < PAGE_SIZE) {
+ for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) {
+ tocopy = len > PAGE_SIZE ? PAGE_SIZE : len;
+ va = zfs_map_page(ma[i], &sf);
+ dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx);
+ zfs_unmap_page(sf);
+ }
+ } else {
+ err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx);
+ }
+
+ if (err == 0) {
+ uint64_t mtime[2], ctime[2];
+ sa_bulk_attr_t bulk[3];
+ int count = 0;
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+ &mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ &ctime, 16);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, 8);
+ zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
+ B_TRUE);
+ (void)sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+ zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
+
+ zfs_vmobject_wlock(object);
+ for (i = 0; i < ncount; i++) {
+ rtvals[i] = zfs_vm_pagerret_ok;
+ vm_page_undirty(ma[i]);
+ }
+ zfs_vmobject_wunlock(object);
+ PCPU_INC(cnt.v_vnodeout);
+ PCPU_ADD(cnt.v_vnodepgsout, ncount);
+ }
+ dmu_tx_commit(tx);
+
+out:
+ zfs_range_unlock(rl);
+ if ((flags & (zfs_vm_pagerput_sync | zfs_vm_pagerput_inval)) != 0 ||
+ zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zfsvfs->z_log, zp->z_id);
+ ZFS_EXIT(zfsvfs);
+ return (rtvals[0]);
+}
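
Editor's aside: the new zfs_putpages() above widens the range lock to whole file blocks via lo_off = rounddown(off, blksz) and lo_len = roundup(len + (off - lo_off), blksz) before pushing pages, so a pageout never straddles a block boundary without holding the lock for the whole block. The worked example below shows the arithmetic for a hypothetical 128K block size; rounddown()/roundup() are assumed to behave like the usual sys/param.h macros, which are not shown in this diff.

#include <stdio.h>
#include <stdint.h>

#define	rounddown(x, y)	(((x) / (y)) * (y))			/* assumed */
#define	roundup(x, y)	((((x) + ((y) - 1)) / (y)) * (y))	/* assumed */

int
main(void)
{
	uint64_t blksz = 128 * 1024;	/* zp->z_blksz */
	uint64_t off = 260 * 1024;	/* first page's file offset */
	uint64_t len = 3 * 4096;	/* three 4K pages being pushed */

	uint64_t lo_off = rounddown(off, blksz);
	uint64_t lo_len = roundup(len + (off - lo_off), blksz);

	/* 260K falls in the third 128K block, so the range lock covers
	 * [256K, 384K): lo_off = 256K, lo_len = 128K. */
	printf("lo_off=%juK lo_len=%juK\n",
	    (uintmax_t)(lo_off / 1024), (uintmax_t)(lo_len / 1024));
	return (0);
}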
+
+int
+zfs_freebsd_putpages(ap)
+ struct vop_putpages_args /* {
+ struct vnode *a_vp;
+ vm_page_t *a_m;
+ int a_count;
+ int a_sync;
+ int *a_rtvals;
+ vm_ooffset_t a_offset;
+ } */ *ap;
+{
+
+ return (zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync,
+ ap->a_rtvals));
+}
+
+static int
zfs_freebsd_bmap(ap)
struct vop_bmap_args /* {
struct vnode *a_vp;
@@ -5840,6 +4878,21 @@
}
static int
+ioflags(int ioflags)
+{
+ int flags = 0;
+
+ if (ioflags & IO_APPEND)
+ flags |= FAPPEND;
+ if (ioflags & IO_NDELAY)
+ flags |= FNONBLOCK;
+ if (ioflags & IO_SYNC)
+ flags |= (FSYNC | FDSYNC | FRSYNC);
+
+ return (flags);
+}
+
+static int
zfs_freebsd_read(ap)
struct vop_read_args /* {
struct vnode *a_vp;
@@ -5930,6 +4983,23 @@
}
static int
+zfs_cache_lookup(ap)
+ struct vop_lookup_args /* {
+ struct vnode *a_dvp;
+ struct vnode **a_vpp;
+ struct componentname *a_cnp;
+ } */ *ap;
+{
+ zfsvfs_t *zfsvfs;
+
+ zfsvfs = ap->a_dvp->v_mount->mnt_data;
+ if (zfsvfs->z_use_namecache)
+ return (vfs_cache_lookup(ap));
+ else
+ return (zfs_freebsd_lookup(ap));
+}
+
+static int
zfs_freebsd_create(ap)
struct vop_create_args /* {
struct vnode *a_dvp;
@@ -5938,17 +5008,23 @@
struct vattr *a_vap;
} */ *ap;
{
+ zfsvfs_t *zfsvfs;
struct componentname *cnp = ap->a_cnp;
vattr_t *vap = ap->a_vap;
- int mode;
+ int error, mode;
ASSERT(cnp->cn_flags & SAVENAME);
vattr_init_mask(vap);
mode = vap->va_mode & ALLPERMS;
+ zfsvfs = ap->a_dvp->v_mount->mnt_data;
- return (zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode,
- ap->a_vpp, cnp->cn_cred, cnp->cn_thread));
+ error = zfs_create(ap->a_dvp, cnp->cn_nameptr, vap, !EXCL, mode,
+ ap->a_vpp, cnp->cn_cred, cnp->cn_thread);
+ if (zfsvfs->z_use_namecache &&
+ error == 0 && (cnp->cn_flags & MAKEENTRY) != 0)
+ cache_enter(ap->a_dvp, *ap->a_vpp, cnp);
+ return (error);
}
static int
@@ -5962,8 +5038,8 @@
ASSERT(ap->a_cnp->cn_flags & SAVENAME);
- return (zfs_remove(ap->a_dvp, ap->a_cnp->cn_nameptr,
- ap->a_cnp->cn_cred, NULL, 0));
+ return (zfs_remove(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr,
+ ap->a_cnp->cn_cred));
}
static int
@@ -5982,7 +5058,7 @@
vattr_init_mask(vap);
return (zfs_mkdir(ap->a_dvp, ap->a_cnp->cn_nameptr, vap, ap->a_vpp,
- ap->a_cnp->cn_cred, NULL, 0, NULL));
+ ap->a_cnp->cn_cred));
}
static int
@@ -5997,7 +5073,7 @@
ASSERT(cnp->cn_flags & SAVENAME);
- return (zfs_rmdir(ap->a_dvp, cnp->cn_nameptr, NULL, cnp->cn_cred, NULL, 0));
+ return (zfs_rmdir(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred));
}
static int
@@ -6052,6 +5128,14 @@
XVA_SET_REQ(&xvap, XAT_APPENDONLY);
XVA_SET_REQ(&xvap, XAT_NOUNLINK);
XVA_SET_REQ(&xvap, XAT_NODUMP);
+ XVA_SET_REQ(&xvap, XAT_READONLY);
+ XVA_SET_REQ(&xvap, XAT_ARCHIVE);
+ XVA_SET_REQ(&xvap, XAT_SYSTEM);
+ XVA_SET_REQ(&xvap, XAT_HIDDEN);
+ XVA_SET_REQ(&xvap, XAT_REPARSE);
+ XVA_SET_REQ(&xvap, XAT_OFFLINE);
+ XVA_SET_REQ(&xvap, XAT_SPARSE);
+
error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred, NULL);
if (error != 0)
return (error);
@@ -6067,8 +5151,23 @@
xvap.xva_xoptattrs.xoa_appendonly);
FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK,
xvap.xva_xoptattrs.xoa_nounlink);
+ FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE,
+ xvap.xva_xoptattrs.xoa_archive);
FLAG_CHECK(UF_NODUMP, XAT_NODUMP,
xvap.xva_xoptattrs.xoa_nodump);
+ FLAG_CHECK(UF_READONLY, XAT_READONLY,
+ xvap.xva_xoptattrs.xoa_readonly);
+ FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM,
+ xvap.xva_xoptattrs.xoa_system);
+ FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN,
+ xvap.xva_xoptattrs.xoa_hidden);
+ FLAG_CHECK(UF_REPARSE, XAT_REPARSE,
+ xvap.xva_xoptattrs.xoa_reparse);
+ FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE,
+ xvap.xva_xoptattrs.xoa_offline);
+ FLAG_CHECK(UF_SPARSE, XAT_SPARSE,
+ xvap.xva_xoptattrs.xoa_sparse);
+
#undef FLAG_CHECK
*vap = xvap.xva_vattr;
vap->va_flags = fflags;
@@ -6106,7 +5205,16 @@
return (EOPNOTSUPP);
fflags = vap->va_flags;
- if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_NODUMP)) != 0)
+ /*
+ * XXX KDM
+ * We need to figure out whether it makes sense to allow
+ * UF_REPARSE through, since we don't really have other
+ * facilities to handle reparse points and zfs_setattr()
+ * doesn't currently allow setting that attribute anyway.
+ */
+ if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE|
+ UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE|
+ UF_OFFLINE|UF_SPARSE)) != 0)
return (EOPNOTSUPP);
/*
* Unprivileged processes are not permitted to unset system
@@ -6158,8 +5266,22 @@
xvap.xva_xoptattrs.xoa_appendonly);
FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK,
xvap.xva_xoptattrs.xoa_nounlink);
+ FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE,
+ xvap.xva_xoptattrs.xoa_archive);
FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP,
xvap.xva_xoptattrs.xoa_nodump);
+ FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY,
+ xvap.xva_xoptattrs.xoa_readonly);
+ FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM,
+ xvap.xva_xoptattrs.xoa_system);
+ FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN,
+ xvap.xva_xoptattrs.xoa_hidden);
+ FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE,
+ xvap.xva_xoptattrs.xoa_reparse);
+ FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE,
+ xvap.xva_xoptattrs.xoa_offline);
+ FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE,
+ xvap.xva_xoptattrs.xoa_sparse);
#undef FLAG_CHANGE
}
return (zfs_setattr(vp, (vattr_t *)&xvap, 0, cred, NULL));
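
Editor's aside: the FLAG_CHECK/FLAG_CHANGE blocks above translate between BSD file flags (UF_HIDDEN, UF_SYSTEM, UF_REPARSE, ...) and the corresponding ZFS pflags/xvattr bits, requesting an attribute update only for bits that actually differ. The macro bodies are defined elsewhere in this file and are not shown in this diff, so the sketch below uses hypothetical names to illustrate only the general compare-and-request shape.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define	EX_UF_HIDDEN	0x0001U		/* stand-in for UF_HIDDEN */
#define	EX_ZFS_HIDDEN	0x0100U		/* stand-in for ZFS_HIDDEN */

static void
flag_change(uint32_t fflags, uint32_t zflags, uint32_t fflag,
    uint32_t zflag, bool *request, bool *newval)
{
	bool want = (fflags & fflag) != 0;	/* caller-requested state */
	bool have = (zflags & zflag) != 0;	/* current znode state */

	if (want != have) {			/* only touch bits that changed */
		*request = true;
		*newval = want;
	}
}

int
main(void)
{
	bool req = false, val = false;

	/* Caller sets UF_HIDDEN; the znode does not have ZFS_HIDDEN yet. */
	flag_change(EX_UF_HIDDEN, 0, EX_UF_HIDDEN, EX_ZFS_HIDDEN, &req, &val);
	printf("request=%d newval=%d\n", req, val);
	return (0);
}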
@@ -6185,17 +5307,14 @@
ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART));
ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART));
- error = zfs_rename(fdvp, ap->a_fcnp->cn_nameptr, tdvp,
- ap->a_tcnp->cn_nameptr, ap->a_fcnp->cn_cred, NULL, 0);
+ error = zfs_rename(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp,
+ ap->a_tcnp, ap->a_fcnp->cn_cred);
- if (tdvp == tvp)
- VN_RELE(tdvp);
- else
- VN_URELE(tdvp);
- if (tvp)
- VN_URELE(tvp);
- VN_RELE(fdvp);
- VN_RELE(fvp);
+ vrele(fdvp);
+ vrele(fvp);
+ vrele(tdvp);
+ if (tvp != NULL)
+ vrele(tvp);
return (error);
}
@@ -6243,10 +5362,15 @@
} */ *ap;
{
struct componentname *cnp = ap->a_cnp;
+ vnode_t *vp = ap->a_vp;
+ vnode_t *tdvp = ap->a_tdvp;
+ if (tdvp->v_mount != vp->v_mount)
+ return (EXDEV);
+
ASSERT(cnp->cn_flags & SAVENAME);
- return (zfs_link(ap->a_tdvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0));
+ return (zfs_link(tdvp, vp, cnp->cn_nameptr, cnp->cn_cred, NULL, 0));
}
static int
@@ -6443,7 +5567,7 @@
}
flags = FREAD;
- NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_SYSSPACE, attrname,
+ NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
xvp, td);
error = vn_open_cred(&nd, &flags, 0, 0, ap->a_cred, NULL);
vp = nd.ni_vp;
@@ -6511,18 +5635,20 @@
return (error);
}
- NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF | MPSAFE,
+ NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF,
UIO_SYSSPACE, attrname, xvp, td);
error = namei(&nd);
vp = nd.ni_vp;
- NDFREE(&nd, NDF_ONLY_PNBUF);
if (error != 0) {
ZFS_EXIT(zfsvfs);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
if (error == ENOENT)
error = ENOATTR;
return (error);
}
+
error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
vput(nd.ni_dvp);
if (vp == nd.ni_dvp)
@@ -6578,7 +5704,7 @@
}
flags = FFLAGS(O_WRONLY | O_CREAT);
- NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_SYSSPACE, attrname,
+ NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname,
xvp, td);
error = vn_open_cred(&nd, &flags, 0600, 0, ap->a_cred, NULL);
vp = nd.ni_vp;
@@ -6592,7 +5718,7 @@
va.va_size = 0;
error = VOP_SETATTR(vp, &va, ap->a_cred);
if (error == 0)
- VOP_WRITE(vp, ap->a_uio, IO_UNIT | IO_SYNC, ap->a_cred);
+ VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred);
VOP_UNLOCK(vp, 0);
vn_close(vp, flags, ap->a_cred, td);
@@ -6659,7 +5785,7 @@
return (error);
}
- NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED | MPSAFE,
+ NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED,
UIO_SYSSPACE, ".", xvp, td);
error = namei(&nd);
vp = nd.ni_vp;
@@ -6769,6 +5895,9 @@
if (ap->a_type != ACL_TYPE_NFS4)
return (EINVAL);
+ if (ap->a_aclp == NULL)
+ return (EINVAL);
+
if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES)
return (EINVAL);
@@ -6812,6 +5941,87 @@
return (EOPNOTSUPP);
}
+static int
+zfs_vptocnp(struct vop_vptocnp_args *ap)
+{
+ vnode_t *covered_vp;
+ vnode_t *vp = ap->a_vp;
+ zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data;
+ znode_t *zp = VTOZ(vp);
+ int ltype;
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ /*
+ * If we are a snapshot mounted under .zfs, run the operation
+ * on the covered vnode.
+ */
+ if (zp->z_id != zfsvfs->z_root || zfsvfs->z_parent == zfsvfs) {
+ char name[MAXNAMLEN + 1];
+ znode_t *dzp;
+ size_t len;
+
+ error = zfs_znode_parent_and_name(zp, &dzp, name);
+ if (error == 0) {
+ len = strlen(name);
+ if (*ap->a_buflen < len)
+ error = SET_ERROR(ENOMEM);
+ }
+ if (error == 0) {
+ *ap->a_buflen -= len;
+ bcopy(name, ap->a_buf + *ap->a_buflen, len);
+ *ap->a_vpp = ZTOV(dzp);
+ }
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ ZFS_EXIT(zfsvfs);
+
+ covered_vp = vp->v_mount->mnt_vnodecovered;
+ vhold(covered_vp);
+ ltype = VOP_ISLOCKED(vp);
+ VOP_UNLOCK(vp, 0);
+ error = vget(covered_vp, LK_SHARED, curthread);
+ vdrop(covered_vp);
+ if (error == 0) {
+ error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_cred,
+ ap->a_buf, ap->a_buflen);
+ vput(covered_vp);
+ }
+ vn_lock(vp, ltype | LK_RETRY);
+ if ((vp->v_iflag & VI_DOOMED) != 0)
+ error = SET_ERROR(ENOENT);
+ return (error);
+}
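
Editor's aside: zfs_vptocnp() above writes the component name into the tail of the caller's buffer -- it shrinks *a_buflen first and then copies at a_buf + *a_buflen -- so repeated parent lookups build a path right to left. A small self-contained sketch of that convention (memcpy stands in for the kernel's bcopy; the helper name is hypothetical):

#include <stdio.h>
#include <string.h>

static int
prepend_name(char *buf, size_t *buflen, const char *name)
{
	size_t len = strlen(name);

	if (*buflen < len)
		return (-1);		/* ENOMEM in the kernel version */
	*buflen -= len;
	memcpy(buf + *buflen, name, len);
	return (0);
}

int
main(void)
{
	char buf[64];
	size_t buflen = sizeof(buf);

	/* Walk from leaf to root: "file", then "/", then "dir". */
	prepend_name(buf, &buflen, "file");
	prepend_name(buf, &buflen, "/");
	prepend_name(buf, &buflen, "dir");

	printf("%.*s\n", (int)(sizeof(buf) - buflen), buf + buflen);
	return (0);
}

Running this prints "dir/file", mirroring how VOP_VPTOCNP consumers assemble the full path from repeated calls.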
+
+#ifdef DIAGNOSTIC
+static int
+zfs_lock(ap)
+ struct vop_lock1_args /* {
+ struct vnode *a_vp;
+ int a_flags;
+ char *file;
+ int line;
+ } */ *ap;
+{
+ vnode_t *vp;
+ znode_t *zp;
+ int err;
+
+ err = vop_stdlock(ap);
+ if (err == 0 && (ap->a_flags & LK_NOWAIT) == 0) {
+ vp = ap->a_vp;
+ zp = vp->v_data;
+ if (vp->v_mount != NULL && (vp->v_iflag & VI_DOOMED) == 0 &&
+ zp != NULL && (zp->z_pflags & ZFS_XATTR) == 0)
+ VERIFY(!RRM_LOCK_HELD(&zp->z_zfsvfs->z_teardown_lock));
+ }
+ return (err);
+}
+#endif
+
struct vop_vector zfs_vnodeops;
struct vop_vector zfs_fifoops;
struct vop_vector zfs_shareops;
@@ -6821,12 +6031,8 @@
.vop_inactive = zfs_freebsd_inactive,
.vop_reclaim = zfs_freebsd_reclaim,
.vop_access = zfs_freebsd_access,
-#ifdef FREEBSD_NAMECACHE
- .vop_lookup = vfs_cache_lookup,
+ .vop_lookup = zfs_cache_lookup,
.vop_cachedlookup = zfs_freebsd_lookup,
-#else
- .vop_lookup = zfs_freebsd_lookup,
-#endif
.vop_getattr = zfs_freebsd_getattr,
.vop_setattr = zfs_freebsd_setattr,
.vop_create = zfs_freebsd_create,
@@ -6856,6 +6062,11 @@
.vop_setacl = zfs_freebsd_setacl,
.vop_aclcheck = zfs_freebsd_aclcheck,
.vop_getpages = zfs_freebsd_getpages,
+ .vop_putpages = zfs_freebsd_putpages,
+ .vop_vptocnp = zfs_vptocnp,
+#ifdef DIAGNOSTIC
+ .vop_lock1 = zfs_lock,
+#endif
};
struct vop_vector zfs_fifoops = {
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_znode.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,7 +21,8 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
*/
/* Portions Copyright 2007 Jeremy Teo */
@@ -55,6 +56,7 @@
#endif /* _KERNEL */
#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
#include <sys/refcount.h>
#include <sys/stat.h>
#include <sys/zap.h>
@@ -68,8 +70,8 @@
#include "zfs_comutil.h"
/* Used by fstat(1). */
-SYSCTL_INT(_debug_sizeof, OID_AUTO, znode, CTLFLAG_RD, 0, sizeof(znode_t),
- "sizeof(znode_t)");
+SYSCTL_INT(_debug_sizeof, OID_AUTO, znode, CTLFLAG_RD,
+ SYSCTL_NULL_INT_PTR, sizeof(znode_t), "sizeof(znode_t)");
/*
* Define ZNODE_STATS to turn on statistic gathering. By default, it is only
@@ -114,43 +116,15 @@
extern struct vop_vector zfs_fifoops;
extern struct vop_vector zfs_shareops;
-/*
- * XXX: We cannot use this function as a cache constructor, because
- * there is one global cache for all file systems and we need
- * to pass vfsp here, which is not possible, because argument
- * 'cdrarg' is defined at kmem_cache_create() time.
- */
-/*ARGSUSED*/
static int
zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
{
znode_t *zp = buf;
- vnode_t *vp;
- vfs_t *vfsp = arg;
- int error;
POINTER_INVALIDATE(&zp->z_zfsvfs);
- ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
- if (vfsp != NULL) {
- error = getnewvnode("zfs", vfsp, &zfs_vnodeops, &vp);
- if (error != 0 && (kmflags & KM_NOSLEEP))
- return (-1);
- ASSERT(error == 0);
- vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
- zp->z_vnode = vp;
- vp->v_data = (caddr_t)zp;
- VN_LOCK_AREC(vp);
- VN_LOCK_ASHARE(vp);
- } else {
- zp->z_vnode = NULL;
- }
-
list_link_init(&zp->z_link_node);
- mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
- rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
- rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL);
mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&zp->z_range_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -157,8 +131,8 @@
avl_create(&zp->z_range_avl, zfs_range_compare,
sizeof (rl_t), offsetof(rl_t, r_node));
- zp->z_dirlocks = NULL;
zp->z_acl_cached = NULL;
+ zp->z_vnode = NULL;
zp->z_moved = 0;
return (0);
}
@@ -170,17 +144,12 @@
znode_t *zp = buf;
ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
- ASSERT(ZTOV(zp) == NULL);
- vn_free(ZTOV(zp));
+ ASSERT3P(zp->z_vnode, ==, NULL);
ASSERT(!list_link_active(&zp->z_link_node));
- mutex_destroy(&zp->z_lock);
- rw_destroy(&zp->z_parent_lock);
- rw_destroy(&zp->z_name_lock);
mutex_destroy(&zp->z_acl_lock);
avl_destroy(&zp->z_range_avl);
mutex_destroy(&zp->z_range_lock);
- ASSERT(zp->z_dirlocks == NULL);
ASSERT(zp->z_acl_cached == NULL);
}
@@ -196,7 +165,7 @@
} znode_move_stats;
#endif /* ZNODE_STATS */
-#ifdef sun
+#ifdef illumos
static void
zfs_znode_move_impl(znode_t *ozp, znode_t *nzp)
{
@@ -301,7 +270,7 @@
* can safely ensure that the filesystem is not and will not be
* unmounted. The next statement is equivalent to ZFS_ENTER().
*/
- rrw_enter(&zfsvfs->z_teardown_lock, RW_READER, FTAG);
+ rrm_enter(&zfsvfs->z_teardown_lock, RW_READER, FTAG);
if (zfsvfs->z_unmounted) {
ZFS_EXIT(zfsvfs);
rw_exit(&zfsvfs_lock);
@@ -367,7 +336,7 @@
return (KMEM_CBRC_YES);
}
-#endif /* sun */
+#endif /* illumos */
void
zfs_znode_init(void)
@@ -378,7 +347,7 @@
rw_init(&zfsvfs_lock, NULL, RW_DEFAULT, NULL);
ASSERT(znode_cache == NULL);
znode_cache = kmem_cache_create("zfs_znode_cache",
- sizeof (znode_t), 0, /* zfs_znode_cache_constructor */ NULL,
+ sizeof (znode_t), 0, zfs_znode_cache_constructor,
zfs_znode_cache_destructor, NULL, NULL, NULL, 0);
kmem_cache_set_move(znode_cache, zfs_znode_move);
}
@@ -386,12 +355,12 @@
void
zfs_znode_fini(void)
{
-#ifdef sun
+#ifdef illumos
/*
* Cleanup vfs & vnode ops
*/
zfs_remove_op_tables();
-#endif /* sun */
+#endif
/*
* Cleanup zcache
@@ -402,7 +371,7 @@
rw_destroy(&zfsvfs_lock);
}
-#ifdef sun
+#ifdef illumos
struct vnodeops *zfs_dvnodeops;
struct vnodeops *zfs_fvnodeops;
struct vnodeops *zfs_symvnodeops;
@@ -494,7 +463,7 @@
return (error);
}
-#endif /* sun */
+#endif /* illumos */
int
zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
@@ -502,7 +471,6 @@
zfs_acl_ids_t acl_ids;
vattr_t vattr;
znode_t *sharezp;
- vnode_t *vp, vnode;
znode_t *zp;
int error;
@@ -513,7 +481,6 @@
vattr.va_gid = crgetgid(kcred);
sharezp = kmem_cache_alloc(znode_cache, KM_SLEEP);
- zfs_znode_cache_constructor(sharezp, zfsvfs->z_parent->z_vfs, 0);
ASSERT(!POINTER_IS_VALID(sharezp->z_zfsvfs));
sharezp->z_moved = 0;
sharezp->z_unlinked = 0;
@@ -521,12 +488,6 @@
sharezp->z_zfsvfs = zfsvfs;
sharezp->z_is_sa = zfsvfs->z_use_sa;
- sharezp->z_vnode = &vnode;
- vnode.v_data = sharezp;
-
- vp = ZTOV(sharezp);
- vp->v_type = VDIR;
-
VERIFY(0 == zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr,
kcred, NULL, &acl_ids));
zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE, &zp, &acl_ids);
@@ -537,12 +498,7 @@
zfsvfs->z_shares_dir = sharezp->z_id;
zfs_acl_ids_free(&acl_ids);
- ZTOV(sharezp)->v_data = NULL;
- ZTOV(sharezp)->v_count = 0;
- ZTOV(sharezp)->v_holdcnt = 0;
- zp->z_vnode = NULL;
sa_handle_destroy(sharezp->z_sa_hdl);
- sharezp->z_vnode = NULL;
kmem_cache_free(znode_cache, sharezp);
return (error);
@@ -595,8 +551,6 @@
ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs));
ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id)));
- mutex_enter(&zp->z_lock);
-
ASSERT(zp->z_sa_hdl == NULL);
ASSERT(zp->z_acl_cached == NULL);
if (sa_hdl == NULL) {
@@ -610,12 +564,12 @@
zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE;
/*
- * Slap on VROOT if we are the root znode
+ * Slap on VROOT if we are the root znode unless we are the root
+ * node of a snapshot mounted under .zfs.
*/
- if (zp->z_id == zfsvfs->z_root)
+ if (zp->z_id == zfsvfs->z_root && zfsvfs->z_parent == zfsvfs)
ZTOV(zp)->v_flag |= VROOT;
- mutex_exit(&zp->z_lock);
vn_exists(ZTOV(zp));
}
@@ -658,11 +612,20 @@
uint64_t parent;
sa_bulk_attr_t bulk[9];
int count = 0;
+ int error;
zp = kmem_cache_alloc(znode_cache, KM_SLEEP);
- zfs_znode_cache_constructor(zp, zfsvfs->z_parent->z_vfs, 0);
- ASSERT(zp->z_dirlocks == NULL);
+ KASSERT(curthread->td_vp_reserv > 0,
+ ("zfs_znode_alloc: getnewvnode without any vnodes reserved"));
+ error = getnewvnode("zfs", zfsvfs->z_parent->z_vfs, &zfs_vnodeops, &vp);
+ if (error != 0) {
+ kmem_cache_free(znode_cache, zp);
+ return (NULL);
+ }
+ zp->z_vnode = vp;
+ vp->v_data = zp;
+
ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
zp->z_moved = 0;
@@ -716,7 +679,7 @@
case VDIR:
zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */
break;
-#ifdef sun
+#ifdef illumos
case VBLK:
case VCHR:
{
@@ -727,12 +690,12 @@
vp->v_rdev = zfs_cmpldev(rdev);
}
break;
-#endif /* sun */
+#endif
case VFIFO:
-#ifdef sun
+#ifdef illumos
case VSOCK:
case VDOOR:
-#endif /* sun */
+#endif
vp->v_op = &zfs_fifoops;
break;
case VREG:
@@ -741,7 +704,7 @@
vp->v_op = &zfs_shareops;
}
break;
-#ifdef sun
+#ifdef illumos
case VLNK:
vn_setops(vp, zfs_symvnodeops);
break;
@@ -748,10 +711,8 @@
default:
vn_setops(vp, zfs_evnodeops);
break;
-#endif /* sun */
+#endif
}
- if (vp->v_type != VFIFO)
- VN_LOCK_ASHARE(vp);
mutex_enter(&zfsvfs->z_znodes_lock);
list_insert_tail(&zfsvfs->z_all_znodes, zp);
@@ -763,7 +724,17 @@
zp->z_zfsvfs = zfsvfs;
mutex_exit(&zfsvfs->z_znodes_lock);
+ /*
+ * Acquire vnode lock before making it available to the world.
+ */
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+ VN_LOCK_AREC(vp);
+ if (vp->v_type != VFIFO)
+ VN_LOCK_ASHARE(vp);
+
+#ifdef illumos
VFS_HOLD(zfsvfs->z_vfs);
+#endif
return (zp);
}
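
Editor's aside: several hunks in this commit (zfs_symlink earlier, and zfs_mknode/zfs_znode_alloc here) bracket vnode allocation with getnewvnode_reserve()/getnewvnode_drop_reserve(), and the new KASSERT insists a reservation is held when getnewvnode() runs; the apparent intent is that getnewvnode() never has to wait for vnode reclamation while ZFS transaction or object locks are held, and that the vnode is locked only after the znode is fully constructed. A schematic of that pairing using only calls visible in the diff; it is a fragment, not compilable on its own.

	getnewvnode_reserve(1);		/* before taking ZFS locks / the tx */

	tx = dmu_tx_create(zfsvfs->z_os);
	/* ... dmu_tx_hold_*(), dmu_tx_assign(tx, TXG_WAIT) ... */

	/* zfs_mknode() -> zfs_znode_alloc() consumes the reservation,
	 * so getnewvnode() does not block on vnode reclamation here. */
	error = getnewvnode("zfs", zfsvfs->z_parent->z_vfs, &zfs_vnodeops, &vp);

	/* Fill in the znode, then lock the vnode before publishing it. */
	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
	VN_LOCK_AREC(vp);

	/* ... commit or abort the transaction ... */

	getnewvnode_drop_reserve();	/* after the ZFS locks are released */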
@@ -815,7 +786,7 @@
gen = vap->va_nblocks; /* ditto */
} else {
obj = 0;
- gethrestime(&now);
+ vfs_timestamp(&now);
gen = dmu_tx_get_txg(tx);
}
@@ -834,10 +805,9 @@
*/
if (vap->va_type == VDIR) {
if (zfsvfs->z_replay) {
- err = zap_create_claim_norm(zfsvfs->z_os, obj,
+ VERIFY0(zap_create_claim_norm(zfsvfs->z_os, obj,
zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
- obj_type, bonuslen, tx);
- ASSERT0(err);
+ obj_type, bonuslen, tx));
} else {
obj = zap_create_norm(zfsvfs->z_os,
zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS,
@@ -845,10 +815,9 @@
}
} else {
if (zfsvfs->z_replay) {
- err = dmu_object_claim(zfsvfs->z_os, obj,
+ VERIFY0(dmu_object_claim(zfsvfs->z_os, obj,
DMU_OT_PLAIN_FILE_CONTENTS, 0,
- obj_type, bonuslen, tx);
- ASSERT0(err);
+ obj_type, bonuslen, tx));
} else {
obj = dmu_object_alloc(zfsvfs->z_os,
DMU_OT_PLAIN_FILE_CONTENTS, 0,
@@ -856,7 +825,6 @@
}
}
- getnewvnode_reserve(1);
ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
VERIFY(0 == sa_buf_hold(zfsvfs->z_os, obj, NULL, &db));
@@ -1030,8 +998,7 @@
if (obj_type == DMU_OT_ZNODE ||
acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) {
- err = zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx);
- ASSERT0(err);
+ VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
}
if (!(flag & IS_ROOT_NODE)) {
vnode_t *vp;
@@ -1043,7 +1010,6 @@
KASSERT(err == 0, ("insmntque() failed: error %d", err));
}
ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
- getnewvnode_drop_reserve();
}
/*
@@ -1183,54 +1149,55 @@
if (hdl != NULL) {
zp = sa_get_userdata(hdl);
-
/*
* Since "SA" does immediate eviction we
* should never find a sa handle that doesn't
* know about the znode.
*/
-
ASSERT3P(zp, !=, NULL);
-
- mutex_enter(&zp->z_lock);
ASSERT3U(zp->z_id, ==, obj_num);
- if (zp->z_unlinked) {
- err = SET_ERROR(ENOENT);
- } else {
- vp = ZTOV(zp);
- *zpp = zp;
- err = 0;
- }
- sa_buf_rele(db, NULL);
+ *zpp = zp;
+ vp = ZTOV(zp);
/* Don't let the vnode disappear after ZFS_OBJ_HOLD_EXIT. */
- if (err == 0)
- VN_HOLD(vp);
+ VN_HOLD(vp);
- mutex_exit(&zp->z_lock);
+ sa_buf_rele(db, NULL);
ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
- if (err == 0) {
- locked = VOP_ISLOCKED(vp);
- VI_LOCK(vp);
- if ((vp->v_iflag & VI_DOOMED) != 0 &&
- locked != LK_EXCLUSIVE) {
- /*
- * The vnode is doomed and this thread doesn't
- * hold the exclusive lock on it, so the vnode
- * must be being reclaimed by another thread.
- * Otherwise the doomed vnode is being reclaimed
- * by this thread and zfs_zget is called from
- * ZIL internals.
- */
- VI_UNLOCK(vp);
- VN_RELE(vp);
- goto again;
- }
+ locked = VOP_ISLOCKED(vp);
+ VI_LOCK(vp);
+ if ((vp->v_iflag & VI_DOOMED) != 0 &&
+ locked != LK_EXCLUSIVE) {
+ /*
+ * The vnode is doomed and this thread doesn't
+ * hold the exclusive lock on it, so the vnode
+ * must be being reclaimed by another thread.
+ * Otherwise the doomed vnode is being reclaimed
+ * by this thread and zfs_zget is called from
+ * ZIL internals.
+ */
VI_UNLOCK(vp);
+
+ /*
+ * XXX vrele() locks the vnode when the last reference
+ * is dropped. Although in this case the vnode is
+ * doomed / dead and so no inactivation is required,
+ * the vnode lock is still acquired. That could result
+ * in a LOR with z_teardown_lock if another thread holds
+ * the vnode's lock and tries to take z_teardown_lock.
+ * But that is only possible if the other thread performs
+ * a ZFS vnode operation on the vnode. That either
+ * should not happen if the vnode is dead or the thread
+ * should also have a reference to the vnode and thus
+ * our reference is not last.
+ */
+ VN_RELE(vp);
+ goto again;
}
+ VI_UNLOCK(vp);
getnewvnode_drop_reserve();
- return (err);
+ return (0);
}
/*
@@ -1255,9 +1222,10 @@
vnode_t *vp = ZTOV(zp);
err = insmntque(vp, zfsvfs->z_vfs);
- if (err == 0)
+ if (err == 0) {
+ vp->v_hash = obj_num;
VOP_UNLOCK(vp, 0);
- else {
+ } else {
zp->z_vnode = NULL;
zfs_znode_dmu_fini(zp);
zfs_znode_free(zp);
@@ -1345,15 +1313,25 @@
}
/*
- * XXXPJD: Not sure how is that possible, but under heavy
- * zfs recv -F load it happens that z_gen is the same, but
- * vnode type is different than znode type. This would mean
- * that for example regular file was replaced with directory
- * which has the same object number.
+ * It is highly improbable but still quite possible that two
+ * objects in different datasets are created with the same
+ * object numbers and in transaction groups with the same
+ * numbers. znodes corresponding to those objects would
+ * have the same z_id and z_gen, but their other attributes
+ * may be different.
+ * zfs recv -F may replace one such object with the other.
+ * As a result, file properties recorded in the replaced
+ * object's vnode may no longer match the received object's
+ * properties. At present the only cached property is the
+ * file's type recorded in v_type.
+ * So, handle this case by leaving the old vnode and znode
+ * disassociated from the actual object. A new vnode and a
+ * znode will be created if the object is accessed
+ * (e.g. via a look-up). The old vnode and znode will be
+ * recycled when the last vnode reference is dropped.
*/
vp = ZTOV(zp);
- if (vp != NULL &&
- vp->v_type != IFTOVT((mode_t)zp->z_mode)) {
+ if (vp->v_type != IFTOVT((mode_t)zp->z_mode)) {
zfs_znode_dmu_fini(zp);
ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
return (EIO);
@@ -1361,11 +1339,9 @@
zp->z_unlinked = (zp->z_links == 0);
zp->z_blksz = doi.doi_data_block_size;
- if (vp != NULL) {
- vn_pages_remove(vp, 0, 0);
- if (zp->z_size != size)
- vnode_pager_setsize(vp, zp->z_size);
- }
+ vn_pages_remove(vp, 0, 0);
+ if (zp->z_size != size)
+ vnode_pager_setsize(vp, zp->z_size);
ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
@@ -1404,20 +1380,16 @@
*/
ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id);
- mutex_enter(&zp->z_lock);
-
/*
* If this was the last reference to a file with no links,
* remove the file from the file system.
*/
if (zp->z_unlinked) {
- mutex_exit(&zp->z_lock);
ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
zfs_rmnode(zp);
return;
}
- mutex_exit(&zp->z_lock);
zfs_znode_dmu_fini(zp);
ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
zfs_znode_free(zp);
@@ -1442,7 +1414,9 @@
kmem_cache_free(znode_cache, zp);
+#ifdef illumos
VFS_RELE(zfsvfs->z_vfs);
+#endif
}
void
@@ -1451,7 +1425,7 @@
{
timestruc_t now;
- gethrestime(&now);
+ vfs_timestamp(&now);
if (have_tx) { /* will sa_bulk_update happen really soon? */
zp->z_atime_dirty = 0;
@@ -1515,7 +1489,7 @@
dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy);
}
-#ifdef sun
+#ifdef illumos
/*
* This is a dummy interface used when pvn_vplist_dirty() should *not*
* be calling back into the fs for a putpage(). E.g.: when truncating
@@ -1529,7 +1503,7 @@
ASSERT(0);
return (0);
}
-#endif /* sun */
+#endif
/*
* Increase the file length
@@ -1537,7 +1511,7 @@
* IN: zp - znode of file to free data in.
* end - new end-of-file
*
- * RETURN: 0 on success, error code on failure
+ * RETURN: 0 on success, error code on failure
*/
static int
zfs_extend(znode_t *zp, uint64_t end)
@@ -1560,7 +1534,6 @@
zfs_range_unlock(rl);
return (0);
}
-top:
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
zfs_sa_upgrade_txholds(tx, zp);
@@ -1570,8 +1543,13 @@
* We are growing the file past the current block size.
*/
if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) {
+ /*
+ * File's blocksize is already larger than the
+ * "recordsize" property. Only let it grow to
+ * the next power of 2.
+ */
ASSERT(!ISP2(zp->z_blksz));
- newblksz = MIN(end, SPA_MAXBLOCKSIZE);
+ newblksz = MIN(end, 1 << highbit64(zp->z_blksz));
} else {
newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz);
}
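
The zfs_extend() hunk above no longer jumps straight to SPA_MAXBLOCKSIZE; an already-oversized, non-power-of-2 block size may only grow to the next power of two via 1 << highbit64(). A stand-alone illustration of that rounding, where next_pow2() is an invented helper:

    #include <stdint.h>
    #include <stdio.h>

    /*
     * Invented helper; the kernel code uses 1 << highbit64(blksz).  For a
     * non-power-of-2 input (the only case the caller allows, see the
     * ASSERT(!ISP2()) above) both yield the next power of two.
     */
    static uint64_t
    next_pow2(uint64_t x)
    {
        uint64_t p = 1;

        while (p < x)
            p <<= 1;
        return (p);
    }

    int
    main(void)
    {
        /* A 132K block (not a power of two) is allowed to grow to 256K. */
        printf("%llu\n", (unsigned long long)next_pow2(132 * 1024));
        return (0);
    }
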
@@ -1580,13 +1558,8 @@
newblksz = 0;
}
- error = dmu_tx_assign(tx, TXG_NOWAIT);
+ error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
- if (error == ERESTART) {
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- goto top;
- }
dmu_tx_abort(tx);
zfs_range_unlock(rl);
return (error);
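
This hunk, like the ones in zfs_trunc() and zfs_freesp() further down, replaces the old non-blocking dmu_tx_assign(TXG_NOWAIT) plus wait-and-goto-top retry with a single blocking dmu_tx_assign(TXG_WAIT). The toy model below contrasts the two shapes; tx_assign(), dirty and dirty_max are invented and are not DMU interfaces:

    #include <errno.h>
    #include <stdio.h>

    /*
     * Toy model: a "transaction group" that can absorb only dirty_max
     * assignments before it must sync.  Everything here is invented for
     * illustration; none of it is the DMU API.
     */
    static int dirty = 4;               /* start "full" so the first try fails */
    static const int dirty_max = 4;

    static int
    tx_assign(int blocking)
    {
        if (dirty >= dirty_max) {
            if (!blocking)
                return (ERESTART);      /* caller must wait and retry */
            dirty = 0;                  /* pretend the full txg synced */
        }
        dirty++;
        return (0);
    }

    int
    main(void)
    {
        int error;

        /* Old shape: non-blocking assign plus an explicit retry loop. */
        while ((error = tx_assign(0)) == ERESTART)
            dirty = 0;                  /* stand-in for dmu_tx_wait() */

        /* New shape: one blocking assign, as with TXG_WAIT above. */
        error = tx_assign(1);
        printf("blocking assign returned %d\n", error);
        return (0);
    }
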
@@ -1616,7 +1589,7 @@
* off - start of section to free.
* len - length of section to free.
*
- * RETURN: 0 on success, error code on failure
+ * RETURN: 0 on success, error code on failure
*/
static int
zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
@@ -1663,7 +1636,7 @@
* IN: zp - znode of file to free data in.
* end - new end-of-file.
*
- * RETURN: 0 on success, error code on failure
+ * RETURN: 0 on success, error code on failure
*/
static int
zfs_trunc(znode_t *zp, uint64_t end)
@@ -1694,17 +1667,12 @@
zfs_range_unlock(rl);
return (error);
}
-top:
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
zfs_sa_upgrade_txholds(tx, zp);
- error = dmu_tx_assign(tx, TXG_NOWAIT);
+ dmu_tx_mark_netfree(tx);
+ error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
- if (error == ERESTART) {
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- goto top;
- }
dmu_tx_abort(tx);
zfs_range_unlock(rl);
return (error);
@@ -1745,7 +1713,7 @@
* flag - current file open mode flags.
* log - TRUE if this action should be logged
*
- * RETURN: 0 on success, error code on failure
+ * RETURN: 0 on success, error code on failure
*/
int
zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
@@ -1795,13 +1763,8 @@
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
zfs_sa_upgrade_txholds(tx, zp);
- error = dmu_tx_assign(tx, TXG_NOWAIT);
+ error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
- if (error == ERESTART) {
- dmu_tx_wait(tx);
- dmu_tx_abort(tx);
- goto log;
- }
dmu_tx_abort(tx);
return (error);
}
@@ -1823,7 +1786,6 @@
void
zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx)
{
- zfsvfs_t zfsvfs;
uint64_t moid, obj, sa_obj, version;
uint64_t sense = ZFS_CASE_SENSITIVE;
uint64_t norm = 0;
@@ -1831,7 +1793,7 @@
int error;
int i;
znode_t *rootzp = NULL;
- vnode_t vnode;
+ zfsvfs_t *zfsvfs;
vattr_t vattr;
znode_t *zp;
zfs_acl_ids_t acl_ids;
@@ -1907,10 +1869,9 @@
vattr.va_uid = crgetuid(cr);
vattr.va_gid = crgetgid(cr);
- bzero(&zfsvfs, sizeof (zfsvfs_t));
+ zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP);
- zfs_znode_cache_constructor(rootzp, NULL, 0);
ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs));
rootzp->z_moved = 0;
rootzp->z_unlinked = 0;
@@ -1917,19 +1878,15 @@
rootzp->z_atime_dirty = 0;
rootzp->z_is_sa = USE_SA(version, os);
- vnode.v_type = VDIR;
- vnode.v_data = rootzp;
- rootzp->z_vnode = &vnode;
+ zfsvfs->z_os = os;
+ zfsvfs->z_parent = zfsvfs;
+ zfsvfs->z_version = version;
+ zfsvfs->z_use_fuids = USE_FUIDS(version, os);
+ zfsvfs->z_use_sa = USE_SA(version, os);
+ zfsvfs->z_norm = norm;
- zfsvfs.z_os = os;
- zfsvfs.z_parent = &zfsvfs;
- zfsvfs.z_version = version;
- zfsvfs.z_use_fuids = USE_FUIDS(version, os);
- zfsvfs.z_use_sa = USE_SA(version, os);
- zfsvfs.z_norm = norm;
-
error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
- &zfsvfs.z_attr_table);
+ &zfsvfs->z_attr_table);
ASSERT(error == 0);
@@ -1938,16 +1895,16 @@
* insensitive.
*/
if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED)
- zfsvfs.z_norm |= U8_TEXTPREP_TOUPPER;
+ zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
- mutex_init(&zfsvfs.z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
- list_create(&zfsvfs.z_all_znodes, sizeof (znode_t),
+ mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
offsetof(znode_t, z_link_node));
for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
- mutex_init(&zfsvfs.z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
- rootzp->z_zfsvfs = &zfsvfs;
+ rootzp->z_zfsvfs = zfsvfs;
VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr,
cr, NULL, &acl_ids));
zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids);
@@ -1958,7 +1915,6 @@
POINTER_INVALIDATE(&rootzp->z_zfsvfs);
sa_handle_destroy(rootzp->z_sa_hdl);
- rootzp->z_vnode = NULL;
kmem_cache_free(znode_cache, rootzp);
/*
@@ -1965,14 +1921,14 @@
* Create shares directory
*/
- error = zfs_create_share_dir(&zfsvfs, tx);
+ error = zfs_create_share_dir(zfsvfs, tx);
ASSERT(error == 0);
for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
- mutex_destroy(&zfsvfs.z_hold_mtx[i]);
+ mutex_destroy(&zfsvfs->z_hold_mtx[i]);
+ kmem_free(zfsvfs, sizeof (zfsvfs_t));
}
-
#endif /* _KERNEL */
static int
@@ -2228,3 +2184,35 @@
zfs_release_sa_handle(hdl, db, FTAG);
return (error);
}
+
+#ifdef _KERNEL
+int
+zfs_znode_parent_and_name(znode_t *zp, znode_t **dzpp, char *buf)
+{
+ zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+ uint64_t parent;
+ int is_xattrdir;
+ int err;
+
+ /* Extended attributes should not be visible as regular files. */
+ if ((zp->z_pflags & ZFS_XATTR) != 0)
+ return (SET_ERROR(EINVAL));
+
+ err = zfs_obj_to_pobj(zfsvfs->z_os, zp->z_sa_hdl, zfsvfs->z_attr_table,
+ &parent, &is_xattrdir);
+ if (err != 0)
+ return (err);
+ ASSERT0(is_xattrdir);
+
+ /* No name as this is a root object. */
+ if (parent == zp->z_id)
+ return (SET_ERROR(EINVAL));
+
+ err = zap_value_search(zfsvfs->z_os, parent, zp->z_id,
+ ZFS_DIRENT_OBJ(-1ULL), buf);
+ if (err != 0)
+ return (err);
+ err = zfs_zget(zfsvfs, parent, dzpp);
+ return (err);
+}
+#endif /* _KERNEL */
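
zfs_znode_parent_and_name() above reports, for one object, its parent and its name under that parent, so repeating the call walks up to the root and yields a full path. The self-contained sketch below does the same walk over an invented in-memory table; ent_t, lookup() and the sample entries are illustrative only:

    #include <stdio.h>
    #include <string.h>

    /* Invented table: each entry knows its parent and its name. */
    typedef struct ent {
        int         e_id;
        int         e_parent;       /* parent == id marks the root */
        const char *e_name;
    } ent_t;

    static const ent_t table[] = {
        { 1, 1, "/" },
        { 2, 1, "usr" },
        { 3, 2, "home" },
        { 4, 3, "file.txt" },
    };

    static const ent_t *
    lookup(int id)
    {
        for (size_t i = 0; i < sizeof (table) / sizeof (table[0]); i++)
            if (table[i].e_id == id)
                return (&table[i]);
        return (NULL);
    }

    int
    main(void)
    {
        char path[256] = "";
        const ent_t *e = lookup(4);

        /* Prepend "name" at each step until the root is reached. */
        while (e != NULL && e->e_parent != e->e_id) {
            char tmp[256];

            snprintf(tmp, sizeof (tmp), "/%s%s", e->e_name, path);
            strcpy(path, tmp);
            e = lookup(e->e_parent);
        }
        printf("%s\n", path);       /* /usr/home/file.txt */
        return (0);
    }
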
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,7 +21,8 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
*/
/* Portions Copyright 2010 Robert Milkowski */
@@ -90,10 +91,17 @@
SYSCTL_INT(_vfs_zfs_trim, OID_AUTO, enabled, CTLFLAG_RDTUN, &zfs_trim_enabled, 0,
"Enable ZFS TRIM");
+/*
+ * Limit SLOG write size per commit executed with synchronous priority.
+ * Any writes above that are executed with lower (asynchronous) priority to
+ * limit potential SLOG device abuse by a single active ZIL writer.
+ */
+uint64_t zil_slog_limit = 768 * 1024;
+SYSCTL_QUAD(_vfs_zfs, OID_AUTO, zil_slog_limit, CTLFLAG_RWTUN,
+ &zil_slog_limit, 0, "Maximal SLOG commit size with sync priority");
+
static kmem_cache_t *zil_lwb_cache;
-static void zil_async_to_sync(zilog_t *zilog, uint64_t foid);
-
#define LWB_EMPTY(lwb) ((BP_GET_LSIZE(&lwb->lwb_blk) - \
sizeof (zil_chain_t)) == (lwb->lwb_sz - lwb->lwb_nused))
@@ -150,10 +158,15 @@
zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp)
{
avl_tree_t *t = &zilog->zl_bp_tree;
- const dva_t *dva = BP_IDENTITY(bp);
+ const dva_t *dva;
zil_bp_node_t *zn;
avl_index_t where;
+ if (BP_IS_EMBEDDED(bp))
+ return (0);
+
+ dva = BP_IDENTITY(bp);
+
if (avl_find(t, dva, &where) != NULL)
return (SET_ERROR(EEXIST));
@@ -189,9 +202,9 @@
char **end)
{
enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
- uint32_t aflags = ARC_WAIT;
+ arc_flags_t aflags = ARC_FLAG_WAIT;
arc_buf_t *abuf = NULL;
- zbookmark_t zb;
+ zbookmark_phys_t zb;
int error;
if (zilog->zl_header->zh_claim_txg == 0)
@@ -228,6 +241,7 @@
sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) {
error = SET_ERROR(ECKSUM);
} else {
+ ASSERT3U(len, <=, SPA_OLD_MAXBLOCKSIZE);
bcopy(lr, dst, len);
*end = (char *)dst + len;
*nbp = zilc->zc_next_blk;
@@ -242,6 +256,8 @@
(zilc->zc_nused > (size - sizeof (*zilc)))) {
error = SET_ERROR(ECKSUM);
} else {
+ ASSERT3U(zilc->zc_nused, <=,
+ SPA_OLD_MAXBLOCKSIZE);
bcopy(lr, dst, zilc->zc_nused);
*end = (char *)dst + zilc->zc_nused;
*nbp = zilc->zc_next_blk;
@@ -248,7 +264,7 @@
}
}
- VERIFY(arc_buf_remove_ref(abuf, &abuf));
+ arc_buf_destroy(abuf, &abuf);
}
return (error);
@@ -262,9 +278,9 @@
{
enum zio_flag zio_flags = ZIO_FLAG_CANFAIL;
const blkptr_t *bp = &lr->lr_blkptr;
- uint32_t aflags = ARC_WAIT;
+ arc_flags_t aflags = ARC_FLAG_WAIT;
arc_buf_t *abuf = NULL;
- zbookmark_t zb;
+ zbookmark_phys_t zb;
int error;
if (BP_IS_HOLE(bp)) {
@@ -285,7 +301,7 @@
if (error == 0) {
if (wbuf != NULL)
bcopy(abuf->b_data, wbuf, arc_buf_size(abuf));
- (void) arc_buf_remove_ref(abuf, &abuf);
+ arc_buf_destroy(abuf, &abuf);
}
return (error);
@@ -325,7 +341,7 @@
* If the log has been claimed, stop if we encounter a sequence
* number greater than the highest claimed sequence number.
*/
- lrbuf = zio_buf_alloc(SPA_MAXBLOCKSIZE);
+ lrbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE);
zil_bp_tree_init(zilog);
for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) {
@@ -372,7 +388,7 @@
(max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq));
zil_bp_tree_fini(zilog);
- zio_buf_free(lrbuf, SPA_MAXBLOCKSIZE);
+ zio_buf_free(lrbuf, SPA_OLD_MAXBLOCKSIZE);
return (error);
}
@@ -384,7 +400,8 @@
* Claim log block if not already committed and not already claimed.
* If tx == NULL, just verify that the block is claimable.
*/
- if (bp->blk_birth < first_txg || zil_bp_tree_add(zilog, bp) != 0)
+ if (BP_IS_HOLE(bp) || bp->blk_birth < first_txg ||
+ zil_bp_tree_add(zilog, bp) != 0)
return (0);
return (zio_wait(zio_claim(NULL, zilog->zl_spa,
@@ -434,7 +451,8 @@
* If we previously claimed it, we need to free it.
*/
if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE &&
- bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0)
+ bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0 &&
+ !BP_IS_HOLE(bp))
zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
return (0);
@@ -441,7 +459,7 @@
}
static lwb_t *
-zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, uint64_t txg)
+zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg)
{
lwb_t *lwb;
@@ -448,6 +466,7 @@
lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
lwb->lwb_zilog = zilog;
lwb->lwb_blk = *bp;
+ lwb->lwb_slog = slog;
lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp));
lwb->lwb_max_txg = txg;
lwb->lwb_zio = NULL;
@@ -477,7 +496,7 @@
dsl_pool_t *dp = zilog->zl_dmu_pool;
dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os);
- if (dsl_dataset_is_snapshot(ds))
+ if (ds->ds_is_snapshot)
panic("dirtying snapshot!");
if (txg_list_add(&dp->dp_dirty_zilogs, zilog, txg)) {
@@ -486,7 +505,28 @@
}
}
+/*
+ * Determine if the zil is dirty in the specified txg. Callers wanting to
+ * ensure that the dirty state does not change must hold the itxg_lock for
+ * the specified txg. Holding the lock will ensure that the zil cannot be
+ * dirtied (zil_itx_assign) or cleaned (zil_clean) while we check its current
+ * state.
+ */
boolean_t
+zilog_is_dirty_in_txg(zilog_t *zilog, uint64_t txg)
+{
+ dsl_pool_t *dp = zilog->zl_dmu_pool;
+
+ if (txg_list_member(&dp->dp_dirty_zilogs, zilog, txg & TXG_MASK))
+ return (B_TRUE);
+ return (B_FALSE);
+}
+
+/*
+ * Determine if the zil is dirty. The zil is considered dirty if it has
+ * any pending itx records that have not been cleaned by zil_clean().
+ */
+boolean_t
zilog_is_dirty(zilog_t *zilog)
{
dsl_pool_t *dp = zilog->zl_dmu_pool;
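
zilog_is_dirty_in_txg() above, like the itxg code later in this file, indexes per-txg state with (txg & TXG_MASK): only TXG_SIZE transaction groups can be in flight at once, so a small ring of slots is enough. A minimal illustration, where TXG_SIZE and TXG_MASK mirror the ZFS constants and slot_txg[] is invented:

    #include <stdint.h>
    #include <stdio.h>

    #define TXG_SIZE    4               /* mirrors the ZFS constant */
    #define TXG_MASK    (TXG_SIZE - 1)

    static uint64_t slot_txg[TXG_SIZE]; /* invented per-slot state */

    int
    main(void)
    {
        for (uint64_t txg = 100; txg < 108; txg++) {
            /* Each slot is reused once its previous txg has synced. */
            slot_txg[txg & TXG_MASK] = txg;
            printf("txg %llu -> slot %llu\n",
                (unsigned long long)txg,
                (unsigned long long)(txg & TXG_MASK));
        }
        return (0);
    }
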
@@ -510,6 +550,7 @@
dmu_tx_t *tx = NULL;
blkptr_t blk;
int error = 0;
+ boolean_t slog = FALSE;
/*
* Wait for any previous destroy to complete.
@@ -538,7 +579,7 @@
}
error = zio_alloc_zil(zilog->zl_spa, txg, &blk, NULL,
- ZIL_MIN_BLKSZ, zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
+ ZIL_MIN_BLKSZ, &slog);
if (error == 0)
zil_init_log_chain(zilog, &blk);
@@ -548,7 +589,7 @@
* Allocate a log write buffer (lwb) for the first log block.
*/
if (error == 0)
- lwb = zil_alloc_lwb(zilog, &blk, txg);
+ lwb = zil_alloc_lwb(zilog, &blk, slog, txg);
/*
* If we just allocated the first log block, commit our transaction
@@ -630,7 +671,7 @@
}
int
-zil_claim(const char *osname, void *txarg)
+zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg)
{
dmu_tx_t *tx = txarg;
uint64_t first_txg = dmu_tx_get_txg(tx);
@@ -639,9 +680,17 @@
objset_t *os;
int error;
- error = dmu_objset_own(osname, DMU_OST_ANY, B_FALSE, FTAG, &os);
+ error = dmu_objset_own_obj(dp, ds->ds_object,
+ DMU_OST_ANY, B_FALSE, FTAG, &os);
if (error != 0) {
- cmn_err(CE_WARN, "can't open objset for %s", osname);
+ /*
+ * EBUSY indicates that the objset is inconsistent, in which
+ * case it can not have a ZIL.
+ */
+ if (error != EBUSY) {
+ cmn_err(CE_WARN, "can't open objset for %llu, error %u",
+ (unsigned long long)ds->ds_object, error);
+ }
return (0);
}
@@ -687,8 +736,9 @@
* Checksum errors are ok as they indicate the end of the chain.
* Any other error (no device or read failure) returns an error.
*/
+/* ARGSUSED */
int
-zil_check_log_chain(const char *osname, void *tx)
+zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx)
{
zilog_t *zilog;
objset_t *os;
@@ -697,9 +747,10 @@
ASSERT(tx == NULL);
- error = dmu_objset_hold(osname, FTAG, &os);
+ error = dmu_objset_from_ds(ds, &os);
if (error != 0) {
- cmn_err(CE_WARN, "can't open objset for %s", osname);
+ cmn_err(CE_WARN, "can't open objset %llu, error %d",
+ (unsigned long long)ds->ds_object, error);
return (0);
}
@@ -722,10 +773,8 @@
valid = vdev_log_state_valid(vd);
spa_config_exit(os->os_spa, SCL_STATE, FTAG);
- if (!valid) {
- dmu_objset_rele(os, FTAG);
+ if (!valid)
return (0);
- }
}
/*
@@ -738,8 +787,6 @@
error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx,
zilog->zl_header->zh_claim_txg ? -1ULL : spa_first_txg(os->os_spa));
- dmu_objset_rele(os, FTAG);
-
return ((error == ECKSUM || error == ENOENT) ? 0 : error);
}
@@ -795,7 +842,7 @@
avl_tree_t *t = &zilog->zl_vdev_tree;
void *cookie = NULL;
zil_vdev_node_t *zv;
- zio_t *zio;
+ zio_t *zio = NULL;
ASSERT(zilog->zl_writer);
@@ -808,12 +855,13 @@
spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
- zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
-
while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) {
vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev);
- if (vd != NULL)
+ if (vd != NULL && !vd->vdev_nowritecache) {
+ if (zio == NULL)
+ zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
zio_flush(zio, vd);
+ }
kmem_free(zv, sizeof (*zv));
}
@@ -821,7 +869,8 @@
* Wait for all the flushes to complete. Not all devices actually
* support the DKIOCFLUSHWRITECACHE ioctl, so it's OK if it fails.
*/
- (void) zio_wait(zio);
+ if (zio)
+ (void) zio_wait(zio);
spa_config_exit(spa, SCL_STATE, FTAG);
}
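
The zil_flush_vdevs() hunk above creates the aggregating root zio lazily, only once a vdev actually needs its write cache flushed, and waits on it only if it was created. The same pattern in miniature, with an invented work_t standing in for the zio tree:

    #include <stdio.h>
    #include <stdlib.h>

    /* work_t stands in for the aggregating root zio; it is invented. */
    typedef struct work { int issued; } work_t;

    int
    main(void)
    {
        int needs_flush[] = { 0, 1, 0, 1, 1 };
        work_t *w = NULL;

        for (int i = 0; i < 5; i++) {
            if (!needs_flush[i])
                continue;
            if (w == NULL)              /* create the parent on first use */
                w = calloc(1, sizeof (*w));
            if (w != NULL)
                w->issued++;            /* issue a child flush */
        }
        if (w != NULL) {                /* wait only if something was issued */
            printf("flushed %d devices\n", w->issued);
            free(w);
        }
        return (0);
    }
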
@@ -842,7 +891,7 @@
ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER);
ASSERT(!BP_IS_GANG(zio->io_bp));
ASSERT(!BP_IS_HOLE(zio->io_bp));
- ASSERT(zio->io_bp->blk_fill == 0);
+ ASSERT(BP_GET_FILL(zio->io_bp) == 0);
/*
* Ensure the lwb buffer pointer is cleared before releasing
@@ -872,7 +921,8 @@
static void
zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
{
- zbookmark_t zb;
+ zbookmark_phys_t zb;
+ zio_priority_t prio;
SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET],
ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
@@ -883,9 +933,13 @@
ZIO_FLAG_CANFAIL);
}
if (lwb->lwb_zio == NULL) {
+ if (zilog->zl_cur_used <= zil_slog_limit || !lwb->lwb_slog)
+ prio = ZIO_PRIORITY_SYNC_WRITE;
+ else
+ prio = ZIO_PRIORITY_ASYNC_WRITE;
lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa,
0, &lwb->lwb_blk, lwb->lwb_buf, BP_GET_LSIZE(&lwb->lwb_blk),
- zil_lwb_write_done, lwb, ZIO_PRIORITY_LOG_WRITE,
+ zil_lwb_write_done, lwb, prio,
ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, &zb);
}
}
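
Together with the zil_slog_limit tunable added near the top of this file, the hunk above demotes ZIL writes from synchronous to asynchronous priority once a single commit has pushed more than the limit through a dedicated log device. A compact sketch of that decision; prio_t, pick_priority() and on_slog are invented names, and 768 KB mirrors the tunable's default:

    #include <stdint.h>
    #include <stdio.h>

    typedef enum { PRIO_SYNC_WRITE, PRIO_ASYNC_WRITE } prio_t;

    static uint64_t slog_limit = 768 * 1024;    /* mirrors zil_slog_limit */

    /*
     * Small commits to a dedicated log device stay synchronous; large
     * bursts are demoted so one heavy writer cannot monopolize the SLOG.
     */
    static prio_t
    pick_priority(uint64_t cur_used, int on_slog)
    {
        if (cur_used <= slog_limit || !on_slog)
            return (PRIO_SYNC_WRITE);
        return (PRIO_ASYNC_WRITE);
    }

    int
    main(void)
    {
        printf("%d\n", pick_priority(64 * 1024, 1));        /* 0: sync */
        printf("%d\n", pick_priority(4 * 1024 * 1024, 1));  /* 1: async */
        return (0);
    }
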
@@ -895,7 +949,7 @@
*
* These must be a multiple of 4KB. Note only the amount used (again
* aligned to 4KB) actually gets written. However, we can't always just
- * allocate SPA_MAXBLOCKSIZE as the slog space could be exhausted.
+ * allocate SPA_OLD_MAXBLOCKSIZE as the slog space could be exhausted.
*/
uint64_t zil_block_buckets[] = {
4096, /* non TX_WRITE */
@@ -905,21 +959,11 @@
};
/*
- * Use the slog as long as the logbias is 'latency' and the current commit size
- * is less than the limit or the total list size is less than 2X the limit.
- * Limit checking is disabled by setting zil_slog_limit to UINT64_MAX.
- */
-uint64_t zil_slog_limit = 1024 * 1024;
-#define USE_SLOG(zilog) (((zilog)->zl_logbias == ZFS_LOGBIAS_LATENCY) && \
- (((zilog)->zl_cur_used < zil_slog_limit) || \
- ((zilog)->zl_itx_list_sz < (zil_slog_limit << 1))))
-
-/*
* Start a log block write and advance to the next log block.
* Calls are serialized.
*/
static lwb_t *
-zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
+zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb, boolean_t last)
{
lwb_t *nlwb = NULL;
zil_chain_t *zilc;
@@ -929,6 +973,7 @@
uint64_t txg;
uint64_t zil_blksz, wsz;
int i, error;
+ boolean_t slog;
if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) {
zilc = (zil_chain_t *)lwb->lwb_buf;
@@ -950,7 +995,15 @@
* to clean up in the event of allocation failure or I/O failure.
*/
tx = dmu_tx_create(zilog->zl_os);
- VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
+
+ /*
+ * Since we are not going to create any new dirty data, and we
+ * can even help with clearing the existing dirty data, we
+ * should not be subject to the dirty data based delays. We
+ * use TXG_NOTHROTTLE to bypass the delay mechanism.
+ */
+ VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE));
+
dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
txg = dmu_tx_get_txg(tx);
@@ -977,7 +1030,7 @@
continue;
zil_blksz = zil_block_buckets[i];
if (zil_blksz == UINT64_MAX)
- zil_blksz = SPA_MAXBLOCKSIZE;
+ zil_blksz = SPA_OLD_MAXBLOCKSIZE;
zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz;
for (i = 0; i < ZIL_PREV_BLKS; i++)
zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]);
@@ -985,8 +1038,7 @@
BP_ZERO(bp);
/* pass the old blkptr in order to spread log blocks across devs */
- error = zio_alloc_zil(spa, txg, bp, &lwb->lwb_blk, zil_blksz,
- USE_SLOG(zilog));
+ error = zio_alloc_zil(spa, txg, bp, &lwb->lwb_blk, zil_blksz, &slog);
if (error == 0) {
ASSERT3U(bp->blk_birth, ==, txg);
bp->blk_cksum = lwb->lwb_blk.blk_cksum;
@@ -995,7 +1047,7 @@
/*
* Allocate a new log write buffer (lwb).
*/
- nlwb = zil_alloc_lwb(zilog, bp, txg);
+ nlwb = zil_alloc_lwb(zilog, bp, slog, txg);
/* Record the block for later vdev flushing */
zil_add_block(zilog, &lwb->lwb_blk);
@@ -1020,6 +1072,8 @@
*/
bzero(lwb->lwb_buf + lwb->lwb_nused, wsz - lwb->lwb_nused);
+ if (last)
+ lwb->lwb_zio->io_pipeline &= ~ZIO_STAGE_ISSUE_ASYNC;
zio_nowait(lwb->lwb_zio); /* Kick off the write for the old log block */
/*
@@ -1032,19 +1086,18 @@
static lwb_t *
zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
{
- lr_t *lrc = &itx->itx_lr; /* common log record */
- lr_write_t *lrw = (lr_write_t *)lrc;
+ lr_t *lrcb, *lrc = &itx->itx_lr; /* common log record */
+ lr_write_t *lrwb, *lrw = (lr_write_t *)lrc;
char *lr_buf;
uint64_t txg = lrc->lrc_txg;
uint64_t reclen = lrc->lrc_reclen;
uint64_t dlen = 0;
+ uint64_t dnow, lwb_sp;
if (lwb == NULL)
return (NULL);
ASSERT(lwb->lwb_buf != NULL);
- ASSERT(zilog_is_dirty(zilog) ||
- spa_freeze_txg(zilog->zl_spa) != UINT64_MAX);
if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY)
dlen = P2ROUNDUP_TYPED(
@@ -1054,25 +1107,30 @@
zil_lwb_write_init(zilog, lwb);
+cont:
/*
* If this record won't fit in the current log block, start a new one.
+ * For WR_NEED_COPY, optimize the layout for a minimal number of chunks,
+ * but try to keep wasted space within a reasonable range (12%).
*/
- if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) {
- lwb = zil_lwb_write_start(zilog, lwb);
+ lwb_sp = lwb->lwb_sz - lwb->lwb_nused;
+ if (reclen > lwb_sp || (reclen + dlen > lwb_sp &&
+ lwb_sp < ZIL_MAX_LOG_DATA / 8 && (dlen % ZIL_MAX_LOG_DATA == 0 ||
+ lwb_sp < reclen + dlen % ZIL_MAX_LOG_DATA))) {
+ lwb = zil_lwb_write_start(zilog, lwb, B_FALSE);
if (lwb == NULL)
return (NULL);
zil_lwb_write_init(zilog, lwb);
ASSERT(LWB_EMPTY(lwb));
- if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) {
- txg_wait_synced(zilog->zl_dmu_pool, txg);
- return (lwb);
- }
+ lwb_sp = lwb->lwb_sz - lwb->lwb_nused;
+ ASSERT3U(reclen + MIN(dlen, sizeof(uint64_t)), <=, lwb_sp);
}
+ dnow = MIN(dlen, lwb_sp - reclen);
lr_buf = lwb->lwb_buf + lwb->lwb_nused;
bcopy(lrc, lr_buf, reclen);
- lrc = (lr_t *)lr_buf;
- lrw = (lr_write_t *)lrc;
+ lrcb = (lr_t *)lr_buf;
+ lrwb = (lr_write_t *)lrcb;
/*
* If it's a write, fetch the data or get its blkptr as appropriate.
@@ -1084,16 +1142,19 @@
char *dbuf;
int error;
- if (dlen) {
- ASSERT(itx->itx_wr_state == WR_NEED_COPY);
+ if (itx->itx_wr_state == WR_NEED_COPY) {
dbuf = lr_buf + reclen;
- lrw->lr_common.lrc_reclen += dlen;
+ lrcb->lrc_reclen += dnow;
+ if (lrwb->lr_length > dnow)
+ lrwb->lr_length = dnow;
+ lrw->lr_offset += dnow;
+ lrw->lr_length -= dnow;
} else {
ASSERT(itx->itx_wr_state == WR_INDIRECT);
dbuf = NULL;
}
error = zilog->zl_get_data(
- itx->itx_private, lrw, dbuf, lwb->lwb_zio);
+ itx->itx_private, lrwb, dbuf, lwb->lwb_zio);
if (error == EIO) {
txg_wait_synced(zilog->zl_dmu_pool, txg);
return (lwb);
@@ -1112,12 +1173,18 @@
* equal to the itx sequence number because not all transactions
* are synchronous, and sometimes spa_sync() gets there first.
*/
- lrc->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */
- lwb->lwb_nused += reclen + dlen;
+ lrcb->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */
+ lwb->lwb_nused += reclen + dnow;
lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg);
ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz);
ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)));
+ dlen -= dnow;
+ if (dlen > 0) {
+ zilog->zl_cur_used += reclen;
+ goto cont;
+ }
+
return (lwb);
}
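
The reworked zil_lwb_commit() can now split a single WR_NEED_COPY record across several log blocks: each chunk re-copies the record header and carries dnow = MIN(remaining data, space left) bytes of payload, with the 12% waste heuristic from the comment above layered on top. The toy loop below walks the same arithmetic with made-up sizes (BUFSZ, reclen, dlen), assuming the header always fits in an empty buffer:

    #include <stdint.h>
    #include <stdio.h>

    #define BUFSZ   4096U               /* invented log-buffer size */

    int
    main(void)
    {
        uint64_t reclen = 192;          /* header re-copied per chunk */
        uint64_t dlen = 10000;          /* payload still to be copied */
        uint64_t used = 0;              /* bytes used in the current buffer */
        int nbuf = 1;

        while (dlen > 0) {
            uint64_t space = BUFSZ - used;

            if (space < reclen + 1) {   /* no room for header + any data */
                nbuf++;                 /* start a new log buffer */
                used = 0;
                space = BUFSZ;
            }
            uint64_t dnow = dlen < space - reclen ? dlen : space - reclen;

            used += reclen + dnow;
            dlen -= dnow;
        }
        printf("record spread over %d log buffers\n", nbuf);
        return (0);
    }
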
@@ -1131,7 +1198,6 @@
itx = kmem_alloc(offsetof(itx_t, itx_lr) + lrsize, KM_SLEEP);
itx->itx_lr.lrc_txtype = txtype;
itx->itx_lr.lrc_reclen = lrsize;
- itx->itx_sod = lrsize; /* if write & WR_NEED_COPY will be increased */
itx->itx_lr.lrc_seq = 0; /* defensive */
itx->itx_sync = B_TRUE; /* default is synchronous */
@@ -1280,11 +1346,8 @@
* this itxg. Save the itxs for release below.
* This should be rare.
*/
- atomic_add_64(&zilog->zl_itx_list_sz, -itxg->itxg_sod);
- itxg->itxg_sod = 0;
clean = itxg->itxg_itxs;
}
- ASSERT(itxg->itxg_sod == 0);
itxg->itxg_txg = txg;
itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t), KM_SLEEP);
@@ -1296,8 +1359,6 @@
}
if (itx->itx_sync) {
list_insert_tail(&itxs->i_sync_list, itx);
- atomic_add_64(&zilog->zl_itx_list_sz, itx->itx_sod);
- itxg->itxg_sod += itx->itx_sod;
} else {
avl_tree_t *t = &itxs->i_async_tree;
uint64_t foid = ((lr_ooo_t *)&itx->itx_lr)->lr_foid;
@@ -1345,8 +1406,6 @@
ASSERT3U(itxg->itxg_txg, <=, synced_txg);
ASSERT(itxg->itxg_txg != 0);
ASSERT(zilog->zl_clean_taskq != NULL);
- atomic_add_64(&zilog->zl_itx_list_sz, -itxg->itxg_sod);
- itxg->itxg_sod = 0;
clean_me = itxg->itxg_itxs;
itxg->itxg_itxs = NULL;
itxg->itxg_txg = 0;
@@ -1370,7 +1429,6 @@
{
uint64_t otxg, txg;
list_t *commit_list = &zilog->zl_itx_commit_list;
- uint64_t push_sod = 0;
if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
otxg = ZILTEST_TXG;
@@ -1377,6 +1435,11 @@
else
otxg = spa_last_synced_txg(zilog->zl_spa) + 1;
+ /*
+ * This is inherently racy, since there is nothing to prevent
+ * the last synced txg from changing. That's okay since we'll
+ * only commit things in the future.
+ */
for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
@@ -1386,19 +1449,26 @@
continue;
}
+ /*
+ * If we're adding itx records to the zl_itx_commit_list,
+ * then the zil better be dirty in this "txg". We can assert
+ * that here since we're holding the itxg_lock which will
+ * prevent spa_sync from cleaning it. Once we add the itxs
+ * to the zl_itx_commit_list we must commit it to disk even
+ * if it's unnecessary (i.e. the txg was synced).
+ */
+ ASSERT(zilog_is_dirty_in_txg(zilog, txg) ||
+ spa_freeze_txg(zilog->zl_spa) != UINT64_MAX);
list_move_tail(commit_list, &itxg->itxg_itxs->i_sync_list);
- push_sod += itxg->itxg_sod;
- itxg->itxg_sod = 0;
mutex_exit(&itxg->itxg_lock);
}
- atomic_add_64(&zilog->zl_itx_list_sz, -push_sod);
}
/*
* Move the async itxs for a specified object to commit into sync lists.
*/
-static void
+void
zil_async_to_sync(zilog_t *zilog, uint64_t foid)
{
uint64_t otxg, txg;
@@ -1411,6 +1481,10 @@
else
otxg = spa_last_synced_txg(zilog->zl_spa) + 1;
+ /*
+ * This is inherently racy, since there is nothing to prevent
+ * the last synced txg from changing.
+ */
for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
@@ -1482,8 +1556,14 @@
DTRACE_PROBE1(zil__cw1, zilog_t *, zilog);
while (itx = list_head(&zilog->zl_itx_commit_list)) {
txg = itx->itx_lr.lrc_txg;
- ASSERT(txg);
+ ASSERT3U(txg, !=, 0);
+ /*
+ * This is inherently racy and may result in us writing
+ * out a log block for a txg that was just synced. This is
+ * ok since we'll end up cleaning up that log block the next
+ * time we call zil_sync().
+ */
if (txg > spa_last_synced_txg(spa) || txg > spa_freeze_txg(spa))
lwb = zil_lwb_commit(zilog, itx, lwb);
list_remove(&zilog->zl_itx_commit_list, itx);
@@ -1494,7 +1574,7 @@
/* write the last block out */
if (lwb != NULL && lwb->lwb_zio != NULL)
- lwb = zil_lwb_write_start(zilog, lwb);
+ lwb = zil_lwb_write_start(zilog, lwb, B_TRUE);
zilog->zl_cur_used = 0;
@@ -1800,8 +1880,11 @@
mutex_exit(&zilog->zl_lock);
if (txg)
txg_wait_synced(zilog->zl_dmu_pool, txg);
- ASSERT(!zilog_is_dirty(zilog));
+ if (zilog_is_dirty(zilog))
+ zfs_dbgmsg("zil (%p) is dirty, txg %llu", zilog, txg);
+ VERIFY(!zilog_is_dirty(zilog));
+
taskq_destroy(zilog->zl_clean_taskq);
zilog->zl_clean_taskq = NULL;
zilog->zl_get_data = NULL;
@@ -1957,7 +2040,7 @@
static int
zil_replay_error(zilog_t *zilog, lr_t *lr, int error)
{
- char name[MAXNAMELEN];
+ char name[ZFS_MAX_DATASET_NAME_LEN];
zilog->zl_replaying_seq--; /* didn't actually replay this one */
@@ -2077,7 +2160,6 @@
zil_destroy(zilog, B_TRUE);
return;
}
- //printf("ZFS: Replaying ZIL on %s...\n", os->os->os_spa->spa_name);
zr.zr_replay = replay_func;
zr.zr_arg = arg;
@@ -2099,7 +2181,6 @@
zil_destroy(zilog, B_FALSE);
txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
zilog->zl_replay = B_FALSE;
- //printf("ZFS: Replay of ZIL on %s finished.\n", os->os->os_spa->spa_name);
}
boolean_t
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,9 +21,12 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
*/
+#include <sys/sysmacros.h>
#include <sys/zfs_context.h>
#include <sys/fm/fs/zfs.h>
#include <sys/spa.h>
@@ -37,13 +40,24 @@
#include <sys/arc.h>
#include <sys/ddt.h>
#include <sys/trim_map.h>
+#include <sys/blkptr.h>
+#include <sys/zfeature.h>
+#include <sys/metaslab_impl.h>
SYSCTL_DECL(_vfs_zfs);
SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO");
+#if defined(__amd64__)
+static int zio_use_uma = 1;
+#else
static int zio_use_uma = 0;
+#endif
TUNABLE_INT("vfs.zfs.zio.use_uma", &zio_use_uma);
SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN, &zio_use_uma, 0,
"Use uma(9) for ZIO allocations");
+static int zio_exclude_metadata = 0;
+TUNABLE_INT("vfs.zfs.zio.exclude_metadata", &zio_exclude_metadata);
+SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN, &zio_exclude_metadata, 0,
+ "Exclude metadata buffers from dumps as well");
zio_trim_stats_t zio_trim_stats = {
{ "bytes", KSTAT_DATA_UINT64,
@@ -60,35 +74,18 @@
/*
* ==========================================================================
- * I/O priority table
- * ==========================================================================
- */
-uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
- 0, /* ZIO_PRIORITY_NOW */
- 0, /* ZIO_PRIORITY_SYNC_READ */
- 0, /* ZIO_PRIORITY_SYNC_WRITE */
- 0, /* ZIO_PRIORITY_LOG_WRITE */
- 1, /* ZIO_PRIORITY_CACHE_FILL */
- 1, /* ZIO_PRIORITY_AGG */
- 4, /* ZIO_PRIORITY_FREE */
- 4, /* ZIO_PRIORITY_ASYNC_WRITE */
- 6, /* ZIO_PRIORITY_ASYNC_READ */
- 10, /* ZIO_PRIORITY_RESILVER */
- 20, /* ZIO_PRIORITY_SCRUB */
- 2, /* ZIO_PRIORITY_DDT_PREFETCH */
- 30, /* ZIO_PRIORITY_TRIM */
-};
-
-/*
- * ==========================================================================
* I/O type descriptions
* ==========================================================================
*/
-char *zio_type_name[ZIO_TYPES] = {
+const char *zio_type_name[ZIO_TYPES] = {
"zio_null", "zio_read", "zio_write", "zio_free", "zio_claim",
"zio_ioctl"
};
+boolean_t zio_dva_throttle_enabled = B_TRUE;
+SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, dva_throttle_enabled, CTLFLAG_RDTUN,
+ &zio_dva_throttle_enabled, 0, "");
+
/*
* ==========================================================================
* I/O kmem caches
@@ -102,8 +99,13 @@
#ifdef _KERNEL
extern vmem_t *zio_alloc_arena;
#endif
-extern int zfs_mg_alloc_failures;
+#define ZIO_PIPELINE_CONTINUE 0x100
+#define ZIO_PIPELINE_STOP 0x101
+
+#define BP_SPANB(indblkshift, level) \
+ (((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT)))
+#define COMPARE_META_LEVEL 0x80000000ul
/*
* The following actions directly affect the spa's sync-to-convergence logic.
* The values below define the sync pass when we start performing the action.
@@ -137,12 +139,16 @@
boolean_t zio_requeue_io_start_cut_in_line = B_TRUE;
+#ifdef illumos
#ifdef ZFS_DEBUG
int zio_buf_debug_limit = 16384;
#else
int zio_buf_debug_limit = 0;
#endif
+#endif
+static void zio_taskq_dispatch(zio_t *, zio_taskq_type_t, boolean_t);
+
void
zio_init(void)
{
@@ -151,20 +157,21 @@
sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
zio_link_cache = kmem_cache_create("zio_link_cache",
sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+ if (!zio_use_uma)
+ goto out;
/*
* For small buffers, we want a cache for each multiple of
- * SPA_MINBLOCKSIZE. For medium-size buffers, we want a cache
- * for each quarter-power of 2. For large buffers, we want
- * a cache for each multiple of PAGESIZE.
+ * SPA_MINBLOCKSIZE. For larger buffers, we want a cache
+ * for each quarter-power of 2.
*/
for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
size_t p2 = size;
size_t align = 0;
- size_t cflags = (size > zio_buf_debug_limit) ? (KMC_NODEBUG|KMC_NOTOUCH) : 0;
+ int cflags = zio_exclude_metadata ? KMC_NODEBUG : 0;
- while (p2 & (p2 - 1))
+ while (!ISP2(p2))
p2 &= p2 - 1;
#ifdef illumos
@@ -181,10 +188,8 @@
#endif /* illumos */
if (size <= 4 * SPA_MINBLOCKSIZE) {
align = SPA_MINBLOCKSIZE;
- } else if (IS_P2ALIGNED(size, PAGESIZE)) {
- align = PAGESIZE;
} else if (IS_P2ALIGNED(size, p2 >> 2)) {
- align = p2 >> 2;
+ align = MIN(p2 >> 2, PAGESIZE);
}
if (align != 0) {
@@ -201,7 +206,7 @@
(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
zio_data_buf_cache[c] = kmem_cache_create(name, size,
align, NULL, NULL, NULL, NULL, NULL,
- cflags | KMC_NOTOUCH);
+ cflags | KMC_NOTOUCH | KMC_NODEBUG);
}
}
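
zio_init() above builds one kmem cache per 512-byte size class, and zio_buf_alloc() further down picks a cache with (size - 1) >> SPA_MINBLOCKSHIFT. The index arithmetic in isolation; MINBLOCKSHIFT and cache_index() here are illustrative stand-ins:

    #include <stdio.h>

    #define MINBLOCKSHIFT   9   /* 512-byte granularity, like SPA_MINBLOCKSHIFT */

    /*
     * Every 512-byte multiple gets its own cache slot, so a request of
     * 'size' bytes maps to slot (size - 1) >> MINBLOCKSHIFT.
     */
    static size_t
    cache_index(size_t size)
    {
        return ((size - 1) >> MINBLOCKSHIFT);
    }

    int
    main(void)
    {
        printf("%zu\n", cache_index(512));      /* 0 */
        printf("%zu\n", cache_index(513));      /* 1 */
        printf("%zu\n", cache_index(4096));     /* 7 */
        return (0);
    }
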
@@ -214,16 +219,8 @@
if (zio_data_buf_cache[c - 1] == NULL)
zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
}
+out:
- /*
- * The zio write taskqs have 1 thread per cpu, allow 1/2 of the taskqs
- * to fail 3 times per txg or 8 failures, whichever is greater.
- */
- if (zfs_mg_alloc_failures == 0)
- zfs_mg_alloc_failures = MAX((3 * max_ncpus / 2), 8);
- else if (zfs_mg_alloc_failures < 8)
- zfs_mg_alloc_failures = 8;
-
zio_inject_init();
zio_trim_ksp = kstat_create("zfs", 0, "zio_trim", "misc",
@@ -281,19 +278,35 @@
* useful to inspect ZFS metadata, but if possible, we should avoid keeping
* excess / transient data in-core during a crashdump.
*/
-void *
-zio_buf_alloc(size_t size)
+static void *
+zio_buf_alloc_impl(size_t size, boolean_t canwait)
{
size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
+ int flags = zio_exclude_metadata ? KM_NODEBUG : 0;
- ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+ VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
- if (zio_use_uma)
- return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
- else
- return (kmem_alloc(size, KM_SLEEP));
+ if (zio_use_uma) {
+ return (kmem_cache_alloc(zio_buf_cache[c],
+ canwait ? KM_PUSHPAGE : KM_NOSLEEP));
+ } else {
+ return (kmem_alloc(size,
+ (canwait ? KM_SLEEP : KM_NOSLEEP) | flags));
+ }
}
+void *
+zio_buf_alloc(size_t size)
+{
+ return (zio_buf_alloc_impl(size, B_TRUE));
+}
+
+void *
+zio_buf_alloc_nowait(size_t size)
+{
+ return (zio_buf_alloc_impl(size, B_FALSE));
+}
+
/*
* Use zio_data_buf_alloc to allocate data. The data will not appear in a
* crashdump if the kernel panics. This exists so that we will limit the amount
@@ -305,7 +318,7 @@
{
size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
- ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+ VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
if (zio_use_uma)
return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
@@ -318,7 +331,7 @@
{
size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
- ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+ VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
if (zio_use_uma)
kmem_cache_free(zio_buf_cache[c], buf);
@@ -331,7 +344,7 @@
{
size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
- ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
+ VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
if (zio_use_uma)
kmem_cache_free(zio_data_buf_cache[c], buf);
@@ -344,9 +357,9 @@
* Push and pop I/O transform buffers
* ==========================================================================
*/
-static void
+void
zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
- zio_transform_func_t *transform)
+ zio_transform_func_t *transform)
{
zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
@@ -362,7 +375,7 @@
zio->io_size = size;
}
-static void
+void
zio_pop_transforms(zio_t *zio)
{
zio_transform_t *zt;
@@ -411,52 +424,39 @@
* I/O parent/child relationships and pipeline interlocks
* ==========================================================================
*/
-/*
- * NOTE - Callers to zio_walk_parents() and zio_walk_children must
- * continue calling these functions until they return NULL.
- * Otherwise, the next caller will pick up the list walk in
- * some indeterminate state. (Otherwise every caller would
- * have to pass in a cookie to keep the state represented by
- * io_walk_link, which gets annoying.)
- */
zio_t *
-zio_walk_parents(zio_t *cio)
+zio_walk_parents(zio_t *cio, zio_link_t **zl)
{
- zio_link_t *zl = cio->io_walk_link;
list_t *pl = &cio->io_parent_list;
- zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl);
- cio->io_walk_link = zl;
-
- if (zl == NULL)
+ *zl = (*zl == NULL) ? list_head(pl) : list_next(pl, *zl);
+ if (*zl == NULL)
return (NULL);
- ASSERT(zl->zl_child == cio);
- return (zl->zl_parent);
+ ASSERT((*zl)->zl_child == cio);
+ return ((*zl)->zl_parent);
}
zio_t *
-zio_walk_children(zio_t *pio)
+zio_walk_children(zio_t *pio, zio_link_t **zl)
{
- zio_link_t *zl = pio->io_walk_link;
list_t *cl = &pio->io_child_list;
- zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl);
- pio->io_walk_link = zl;
-
- if (zl == NULL)
+ *zl = (*zl == NULL) ? list_head(cl) : list_next(cl, *zl);
+ if (*zl == NULL)
return (NULL);
- ASSERT(zl->zl_parent == pio);
- return (zl->zl_child);
+ ASSERT((*zl)->zl_parent == pio);
+ return ((*zl)->zl_child);
}
zio_t *
zio_unique_parent(zio_t *cio)
{
- zio_t *pio = zio_walk_parents(cio);
+ zio_link_t *zl = NULL;
+ zio_t *pio = zio_walk_parents(cio, &zl);
- VERIFY(zio_walk_parents(cio) == NULL);
+ VERIFY3P(zio_walk_parents(cio, &zl), ==, NULL);
return (pio);
}
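
zio_walk_parents() and zio_walk_children() above now take an explicit zio_link_t ** cursor instead of stashing the walk position in the zio itself, so two walks over the same zio can no longer disturb each other. The same cursor-in-the-caller idiom over a plain singly linked list; link_t and walk() are invented:

    #include <stddef.h>
    #include <stdio.h>

    typedef struct link {
        struct link *l_next;
        int          l_value;
    } link_t;

    /* Iteration state lives entirely in the caller-owned cursor. */
    static link_t *
    walk(link_t *head, link_t **cursor)
    {
        *cursor = (*cursor == NULL) ? head : (*cursor)->l_next;
        return (*cursor);
    }

    int
    main(void)
    {
        link_t c = { NULL, 3 }, b = { &c, 2 }, a = { &b, 1 };
        link_t *cur = NULL;
        link_t *l;

        while ((l = walk(&a, &cur)) != NULL)
            printf("%d\n", l->l_value);
        return (0);
    }
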
@@ -516,20 +516,26 @@
}
static boolean_t
-zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
+zio_wait_for_children(zio_t *zio, uint8_t childbits, enum zio_wait_type wait)
{
- uint64_t *countp = &zio->io_children[child][wait];
boolean_t waiting = B_FALSE;
mutex_enter(&zio->io_lock);
ASSERT(zio->io_stall == NULL);
- if (*countp != 0) {
- zio->io_stage >>= 1;
- zio->io_stall = countp;
- waiting = B_TRUE;
+ for (int c = 0; c < ZIO_CHILD_TYPES; c++) {
+ if (!(ZIO_CHILD_BIT_IS_SET(childbits, c)))
+ continue;
+
+ uint64_t *countp = &zio->io_children[c][wait];
+ if (*countp != 0) {
+ zio->io_stage >>= 1;
+ ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN);
+ zio->io_stall = countp;
+ waiting = B_TRUE;
+ break;
+ }
}
mutex_exit(&zio->io_lock);
-
return (waiting);
}
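
zio_wait_for_children() above now accepts a bit mask of child types, so one call can cover both logical and gang children as zio_write_compress() does later in this diff. A reduced model of scanning the masked counters; NCHILD, CHILD_BIT() and counts[] are invented:

    #include <stdint.h>
    #include <stdio.h>

    #define NCHILD          4
    #define CHILD_BIT(c)    (1U << (c))

    /* Invented stand-in for the per-child-type wait counters. */
    static uint64_t counts[NCHILD] = { 0, 2, 0, 1 };

    static int
    still_waiting(uint8_t childbits)
    {
        for (int c = 0; c < NCHILD; c++) {
            if (!(childbits & CHILD_BIT(c)))
                continue;
            if (counts[c] != 0)
                return (1);     /* outstanding children of this type */
        }
        return (0);
    }

    int
    main(void)
    {
        printf("%d\n", still_waiting(CHILD_BIT(0) | CHILD_BIT(2)));  /* 0 */
        printf("%d\n", still_waiting(CHILD_BIT(1)));                 /* 1 */
        return (0);
    }
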
@@ -544,10 +550,22 @@
*errorp = zio_worst_error(*errorp, zio->io_error);
pio->io_reexecute |= zio->io_reexecute;
ASSERT3U(*countp, >, 0);
- if (--*countp == 0 && pio->io_stall == countp) {
+
+ (*countp)--;
+
+ if (*countp == 0 && pio->io_stall == countp) {
+ zio_taskq_type_t type =
+ pio->io_stage < ZIO_STAGE_VDEV_IO_START ? ZIO_TASKQ_ISSUE :
+ ZIO_TASKQ_INTERRUPT;
pio->io_stall = NULL;
mutex_exit(&pio->io_lock);
- zio_execute(pio);
+ /*
+ * Dispatch the parent zio in its own taskq so that
+ * the child can continue to make progress. This also
+ * prevents overflowing the stack when we have deeply nested
+ * parent-child relationships.
+ */
+ zio_taskq_dispatch(pio, type, B_FALSE);
} else {
mutex_exit(&pio->io_lock);
}
@@ -560,6 +578,45 @@
zio->io_error = zio->io_child_error[c];
}
+int
+zio_timestamp_compare(const void *x1, const void *x2)
+{
+ const zio_t *z1 = x1;
+ const zio_t *z2 = x2;
+
+ if (z1->io_queued_timestamp < z2->io_queued_timestamp)
+ return (-1);
+ if (z1->io_queued_timestamp > z2->io_queued_timestamp)
+ return (1);
+
+ if (z1->io_bookmark.zb_objset < z2->io_bookmark.zb_objset)
+ return (-1);
+ if (z1->io_bookmark.zb_objset > z2->io_bookmark.zb_objset)
+ return (1);
+
+ if (z1->io_bookmark.zb_object < z2->io_bookmark.zb_object)
+ return (-1);
+ if (z1->io_bookmark.zb_object > z2->io_bookmark.zb_object)
+ return (1);
+
+ if (z1->io_bookmark.zb_level < z2->io_bookmark.zb_level)
+ return (-1);
+ if (z1->io_bookmark.zb_level > z2->io_bookmark.zb_level)
+ return (1);
+
+ if (z1->io_bookmark.zb_blkid < z2->io_bookmark.zb_blkid)
+ return (-1);
+ if (z1->io_bookmark.zb_blkid > z2->io_bookmark.zb_blkid)
+ return (1);
+
+ if (z1 < z2)
+ return (-1);
+ if (z1 > z2)
+ return (1);
+
+ return (0);
+}
+
/*
* ==========================================================================
* Create the various types of I/O (read, write, free, etc)
@@ -568,8 +625,8 @@
static zio_t *
zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
void *data, uint64_t size, zio_done_func_t *done, void *private,
- zio_type_t type, int priority, enum zio_flag flags,
- vdev_t *vd, uint64_t offset, const zbookmark_t *zb,
+ zio_type_t type, zio_priority_t priority, enum zio_flag flags,
+ vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb,
enum zio_stage stage, enum zio_stage pipeline)
{
zio_t *zio;
@@ -628,6 +685,7 @@
zio->io_orig_flags = zio->io_flags = flags;
zio->io_orig_stage = zio->io_stage = stage;
zio->io_orig_pipeline = zio->io_pipeline = pipeline;
+ zio->io_pipeline_trace = ZIO_STAGE_OPEN;
zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);
@@ -675,13 +733,97 @@
return (zio_null(NULL, spa, NULL, done, private, flags));
}
+void
+zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp)
+{
+ if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) {
+ zfs_panic_recover("blkptr at %p has invalid TYPE %llu",
+ bp, (longlong_t)BP_GET_TYPE(bp));
+ }
+ if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS ||
+ BP_GET_CHECKSUM(bp) <= ZIO_CHECKSUM_ON) {
+ zfs_panic_recover("blkptr at %p has invalid CHECKSUM %llu",
+ bp, (longlong_t)BP_GET_CHECKSUM(bp));
+ }
+ if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS ||
+ BP_GET_COMPRESS(bp) <= ZIO_COMPRESS_ON) {
+ zfs_panic_recover("blkptr at %p has invalid COMPRESS %llu",
+ bp, (longlong_t)BP_GET_COMPRESS(bp));
+ }
+ if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) {
+ zfs_panic_recover("blkptr at %p has invalid LSIZE %llu",
+ bp, (longlong_t)BP_GET_LSIZE(bp));
+ }
+ if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) {
+ zfs_panic_recover("blkptr at %p has invalid PSIZE %llu",
+ bp, (longlong_t)BP_GET_PSIZE(bp));
+ }
+
+ if (BP_IS_EMBEDDED(bp)) {
+ if (BPE_GET_ETYPE(bp) > NUM_BP_EMBEDDED_TYPES) {
+ zfs_panic_recover("blkptr at %p has invalid ETYPE %llu",
+ bp, (longlong_t)BPE_GET_ETYPE(bp));
+ }
+ }
+
+ /*
+ * Pool-specific checks.
+ *
+ * Note: it would be nice to verify that the blk_birth and
+ * BP_PHYSICAL_BIRTH() are not too large. However, spa_freeze()
+ * allows the birth time of log blocks (and dmu_sync()-ed blocks
+ * that are in the log) to be arbitrarily large.
+ */
+ for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
+ uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[i]);
+ if (vdevid >= spa->spa_root_vdev->vdev_children) {
+ zfs_panic_recover("blkptr at %p DVA %u has invalid "
+ "VDEV %llu",
+ bp, i, (longlong_t)vdevid);
+ continue;
+ }
+ vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid];
+ if (vd == NULL) {
+ zfs_panic_recover("blkptr at %p DVA %u has invalid "
+ "VDEV %llu",
+ bp, i, (longlong_t)vdevid);
+ continue;
+ }
+ if (vd->vdev_ops == &vdev_hole_ops) {
+ zfs_panic_recover("blkptr at %p DVA %u has hole "
+ "VDEV %llu",
+ bp, i, (longlong_t)vdevid);
+ continue;
+ }
+ if (vd->vdev_ops == &vdev_missing_ops) {
+ /*
+ * "missing" vdevs are valid during import, but we
+ * don't have their detailed info (e.g. asize), so
+ * we can't perform any more checks on them.
+ */
+ continue;
+ }
+ uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
+ uint64_t asize = DVA_GET_ASIZE(&bp->blk_dva[i]);
+ if (BP_IS_GANG(bp))
+ asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
+ if (offset + asize > vd->vdev_asize) {
+ zfs_panic_recover("blkptr at %p DVA %u has invalid "
+ "OFFSET %llu",
+ bp, i, (longlong_t)offset);
+ }
+ }
+}
+
zio_t *
zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
void *data, uint64_t size, zio_done_func_t *done, void *private,
- int priority, enum zio_flag flags, const zbookmark_t *zb)
+ zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
{
zio_t *zio;
+ zfs_blkptr_verify(spa, bp);
+
zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
data, size, done, private,
ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
@@ -694,8 +836,10 @@
zio_t *
zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
void *data, uint64_t size, const zio_prop_t *zp,
- zio_done_func_t *ready, zio_done_func_t *done, void *private,
- int priority, enum zio_flag flags, const zbookmark_t *zb)
+ zio_done_func_t *ready, zio_done_func_t *children_ready,
+ zio_done_func_t *physdone, zio_done_func_t *done,
+ void *private, zio_priority_t priority, enum zio_flag flags,
+ const zbookmark_phys_t *zb)
{
zio_t *zio;
@@ -714,20 +858,32 @@
ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);
zio->io_ready = ready;
+ zio->io_children_ready = children_ready;
+ zio->io_physdone = physdone;
zio->io_prop = *zp;
+ /*
+ * Data can be NULL if we are going to call zio_write_override() to
+ * provide the already-allocated BP. But we may need the data to
+ * verify a dedup hit (if requested). In this case, don't try to
+ * dedup (just take the already-allocated BP verbatim).
+ */
+ if (data == NULL && zio->io_prop.zp_dedup_verify) {
+ zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE;
+ }
+
return (zio);
}
zio_t *
zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
- uint64_t size, zio_done_func_t *done, void *private, int priority,
- enum zio_flag flags, zbookmark_t *zb)
+ uint64_t size, zio_done_func_t *done, void *private,
+ zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb)
{
zio_t *zio;
zio = zio_create(pio, spa, txg, bp, data, size, done, private,
- ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
+ ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_IO_REWRITE, NULL, 0, zb,
ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
return (zio);
@@ -755,8 +911,30 @@
void
zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
{
+
+ /*
+ * The check for EMBEDDED is a performance optimization. We
+ * process the free here (by ignoring it) rather than
+ * putting it on the list and then processing it in zio_free_sync().
+ */
+ if (BP_IS_EMBEDDED(bp))
+ return;
metaslab_check_free(spa, bp);
- bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
+
+ /*
+ * Frees that are for the currently-syncing txg, are not going to be
+ * deferred, and which will not need to do a read (i.e. not GANG or
+ * DEDUP), can be processed immediately. Otherwise, put them on the
+ * in-memory list for later processing.
+ */
+ if (zfs_trim_enabled || BP_IS_GANG(bp) || BP_GET_DEDUP(bp) ||
+ txg != spa->spa_syncing_txg ||
+ spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) {
+ bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
+ } else {
+ VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp,
+ BP_GET_PSIZE(bp), 0)));
+ }
}
zio_t *
@@ -764,20 +942,34 @@
uint64_t size, enum zio_flag flags)
{
zio_t *zio;
+ enum zio_stage stage = ZIO_FREE_PIPELINE;
- dprintf_bp(bp, "freeing in txg %llu, pass %u",
- (longlong_t)txg, spa->spa_sync_pass);
-
ASSERT(!BP_IS_HOLE(bp));
ASSERT(spa_syncing_txg(spa) == txg);
ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);
+ if (BP_IS_EMBEDDED(bp))
+ return (zio_null(pio, spa, NULL, NULL, NULL, 0));
+
metaslab_check_free(spa, bp);
arc_freed(spa, bp);
+ if (zfs_trim_enabled)
+ stage |= ZIO_STAGE_ISSUE_ASYNC | ZIO_STAGE_VDEV_IO_START |
+ ZIO_STAGE_VDEV_IO_ASSESS;
+ /*
+ * GANG and DEDUP blocks can induce a read (for the gang block header,
+ * or the DDT), so issue them asynchronously so that this thread is
+ * not tied up.
+ */
+ else if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp))
+ stage |= ZIO_STAGE_ISSUE_ASYNC;
+
+ flags |= ZIO_FLAG_DONT_QUEUE;
+
zio = zio_create(pio, spa, txg, bp, NULL, size,
- NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags,
- NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE);
+ NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags,
+ NULL, 0, NULL, ZIO_STAGE_OPEN, stage);
return (zio);
}
@@ -788,6 +980,11 @@
{
zio_t *zio;
+ dprintf_bp(bp, "claiming in txg %llu", txg);
+
+ if (BP_IS_EMBEDDED(bp))
+ return (zio_null(pio, spa, NULL, NULL, NULL, 0));
+
/*
* A claim is an allocation of a specific block. Claims are needed
* to support immediate writes in the intent log. The issue is that
@@ -807,6 +1004,7 @@
zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
+ ASSERT0(zio->io_queued_timestamp);
return (zio);
}
@@ -813,8 +1011,8 @@
zio_t *
zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, uint64_t offset,
- uint64_t size, zio_done_func_t *done, void *private, int priority,
- enum zio_flag flags)
+ uint64_t size, zio_done_func_t *done, void *private,
+ zio_priority_t priority, enum zio_flag flags)
{
zio_t *zio;
int c;
@@ -839,7 +1037,7 @@
zio_t *
zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
void *data, int checksum, zio_done_func_t *done, void *private,
- int priority, enum zio_flag flags, boolean_t labels)
+ zio_priority_t priority, enum zio_flag flags, boolean_t labels)
{
zio_t *zio;
@@ -849,8 +1047,8 @@
ASSERT3U(offset + size, <=, vd->vdev_psize);
zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
- ZIO_TYPE_READ, priority, flags, vd, offset, NULL,
- ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
+ ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
+ NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
zio->io_prop.zp_checksum = checksum;
@@ -860,7 +1058,7 @@
zio_t *
zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
void *data, int checksum, zio_done_func_t *done, void *private,
- int priority, enum zio_flag flags, boolean_t labels)
+ zio_priority_t priority, enum zio_flag flags, boolean_t labels)
{
zio_t *zio;
@@ -870,12 +1068,12 @@
ASSERT3U(offset + size, <=, vd->vdev_psize);
zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
- ZIO_TYPE_WRITE, priority, flags, vd, offset, NULL,
- ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
+ ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
+ NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
zio->io_prop.zp_checksum = checksum;
- if (zio_checksum_table[checksum].ci_eck) {
+ if (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
/*
* zec checksums are necessarily destructive -- they modify
* the end of the write buffer to hold the verifier/checksum.
@@ -895,8 +1093,8 @@
*/
zio_t *
zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
- void *data, uint64_t size, int type, int priority, enum zio_flag flags,
- zio_done_func_t *done, void *private)
+ void *data, uint64_t size, int type, zio_priority_t priority,
+ enum zio_flag flags, zio_done_func_t *done, void *private)
{
enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
zio_t *zio;
@@ -915,6 +1113,10 @@
pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
}
+ /* Not all IO types require vdev io done stage e.g. free */
+ if (!(pio->io_pipeline & ZIO_STAGE_VDEV_IO_DONE))
+ pipeline &= ~ZIO_STAGE_VDEV_IO_DONE;
+
if (vd->vdev_children == 0)
offset += VDEV_LABEL_START_SIZE;
@@ -927,17 +1129,42 @@
if (flags & ZIO_FLAG_IO_REPAIR)
flags &= ~ZIO_FLAG_SPECULATIVE;
+ /*
+ * If we're creating a child I/O that is not associated with a
+ * top-level vdev, then the child zio is not an allocating I/O.
+ * If this is a retried I/O then we ignore it since we will
+ * have already processed the original allocating I/O.
+ */
+ if (flags & ZIO_FLAG_IO_ALLOCATING &&
+ (vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) {
+ metaslab_class_t *mc = spa_normal_class(pio->io_spa);
+
+ ASSERT(mc->mc_alloc_throttle_enabled);
+ ASSERT(type == ZIO_TYPE_WRITE);
+ ASSERT(priority == ZIO_PRIORITY_ASYNC_WRITE);
+ ASSERT(!(flags & ZIO_FLAG_IO_REPAIR));
+ ASSERT(!(pio->io_flags & ZIO_FLAG_IO_REWRITE) ||
+ pio->io_child_type == ZIO_CHILD_GANG);
+
+ flags &= ~ZIO_FLAG_IO_ALLOCATING;
+ }
+
zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
+ ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
+ zio->io_physdone = pio->io_physdone;
+ if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
+ zio->io_logical->io_phys_children++;
+
return (zio);
}
zio_t *
zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
- int type, int priority, enum zio_flag flags,
- zio_done_func_t *done, void *private)
+ int type, zio_priority_t priority, enum zio_flag flags,
+ zio_done_func_t *done, void *private)
{
zio_t *zio;
@@ -945,7 +1172,7 @@
zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
data, size, done, private, type, priority,
- flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY,
+ flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED,
vd, offset, NULL,
ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);
@@ -966,9 +1193,10 @@
ASSERT(vd->vdev_ops->vdev_op_leaf);
- return zio_ioctl(zio, spa, vd, DKIOCTRIM, offset, size,
- NULL, NULL, ZIO_PRIORITY_TRIM,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY);
+ return (zio_create(zio, spa, 0, NULL, NULL, size, NULL, NULL,
+ ZIO_TYPE_FREE, ZIO_PRIORITY_TRIM, ZIO_FLAG_DONT_AGGREGATE |
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY,
+ vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PHYS_PIPELINE));
}
void
@@ -1002,12 +1230,20 @@
if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
zio->io_child_type == ZIO_CHILD_LOGICAL &&
!(zio->io_flags & ZIO_FLAG_RAW)) {
- uint64_t psize = BP_GET_PSIZE(bp);
+ uint64_t psize =
+ BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp);
void *cbuf = zio_buf_alloc(psize);
zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
}
+ if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) {
+ zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+ decode_embedded_bp_compressed(bp, zio->io_data);
+ } else {
+ ASSERT(!BP_IS_EMBEDDED(bp));
+ }
+
if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
zio->io_flags |= ZIO_FLAG_DONT_CACHE;
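
For readers following the embedded_data hunks above (and in zio_write_compress() further down): the feature stores a sufficiently small compressed payload directly inside the block pointer, so the read path can decode it in place without ever touching a vdev. A minimal standalone sketch of that idea, using an invented struct and payload limit rather than the real blkptr_t layout or BPE_PAYLOAD_SIZE:

/* Illustrative only: not the on-disk blkptr_t layout. */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define EMBED_PAYLOAD_MAX 112   /* hypothetical inline-payload limit */

typedef struct embed_bp {
	uint8_t  eb_payload[EMBED_PAYLOAD_MAX]; /* compressed data lives here */
	uint32_t eb_psize;                      /* compressed (physical) size */
	uint32_t eb_lsize;                      /* logical size before compression */
} embed_bp_t;

/* Write side: stash a small compressed buffer inside the pointer itself. */
static int
embed_encode(embed_bp_t *bp, const void *cbuf, uint32_t psize, uint32_t lsize)
{
	if (psize > EMBED_PAYLOAD_MAX)
		return (-1);            /* too big: fall back to a normal write */
	memcpy(bp->eb_payload, cbuf, psize);
	bp->eb_psize = psize;
	bp->eb_lsize = lsize;
	return (0);
}

/* Read side: no device I/O needed, just copy the payload back out. */
static void
embed_decode(const embed_bp_t *bp, void *dst)
{
	memcpy(dst, bp->eb_payload, bp->eb_psize);
	/* decompression from psize back up to lsize would happen here */
}

int
main(void)
{
	embed_bp_t bp;
	char out[EMBED_PAYLOAD_MAX];

	if (embed_encode(&bp, "tiny compressed block", 22, 64) == 0) {
		embed_decode(&bp, out);
		printf("decoded %u of %u logical bytes: %s\n",
		    bp.eb_psize, bp.eb_lsize, out);
	}
	return (0);
}

The real encode/decode additionally packs the sizes and the compression function into spare fields of the block pointer, which this sketch glosses over.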
@@ -1023,22 +1259,6 @@
static int
zio_write_bp_init(zio_t *zio)
{
- spa_t *spa = zio->io_spa;
- zio_prop_t *zp = &zio->io_prop;
- enum zio_compress compress = zp->zp_compress;
- blkptr_t *bp = zio->io_bp;
- uint64_t lsize = zio->io_size;
- uint64_t psize = lsize;
- int pass = 1;
-
- /*
- * If our children haven't all reached the ready stage,
- * wait for them and then repeat this pipeline stage.
- */
- if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
- zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
- return (ZIO_PIPELINE_STOP);
-
if (!IO_IS_ALLOCATING(zio))
return (ZIO_PIPELINE_CONTINUE);
@@ -1045,6 +1265,9 @@
ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
if (zio->io_bp_override) {
+ blkptr_t *bp = zio->io_bp;
+ zio_prop_t *zp = &zio->io_prop;
+
ASSERT(bp->blk_birth != zio->io_txg);
ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);
@@ -1051,6 +1274,9 @@
*bp = *zio->io_bp_override;
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+ if (BP_IS_EMBEDDED(bp))
+ return (ZIO_PIPELINE_CONTINUE);
+
/*
* If we've been overridden and nopwrite is set then
* set the flag accordingly to indicate that a nopwrite
@@ -1058,6 +1284,7 @@
*/
if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
ASSERT(!zp->zp_dedup);
+ ASSERT3U(BP_GET_CHECKSUM(bp), ==, zp->zp_checksum);
zio->io_flags |= ZIO_FLAG_NOPWRITE;
return (ZIO_PIPELINE_CONTINUE);
}
@@ -1067,8 +1294,8 @@
if (BP_IS_HOLE(bp) || !zp->zp_dedup)
return (ZIO_PIPELINE_CONTINUE);
- ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup ||
- zp->zp_dedup_verify);
+ ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags &
+ ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify);
if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
BP_SET_DEDUP(bp, 1);
@@ -1075,12 +1302,57 @@
zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
return (ZIO_PIPELINE_CONTINUE);
}
+
+ /*
+ * We were unable to handle this as an override bp, treat
+ * it as a regular write I/O.
+ */
zio->io_bp_override = NULL;
- BP_ZERO(bp);
+ *bp = zio->io_bp_orig;
+ zio->io_pipeline = zio->io_orig_pipeline;
}
- if (bp->blk_birth == zio->io_txg) {
+ return (ZIO_PIPELINE_CONTINUE);
+}
+
+static int
+zio_write_compress(zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+ zio_prop_t *zp = &zio->io_prop;
+ enum zio_compress compress = zp->zp_compress;
+ blkptr_t *bp = zio->io_bp;
+ uint64_t lsize = zio->io_size;
+ uint64_t psize = lsize;
+ int pass = 1;
+
+ /*
+ * If our children haven't all reached the ready stage,
+ * wait for them and then repeat this pipeline stage.
+ */
+ if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT |
+ ZIO_CHILD_GANG_BIT, ZIO_WAIT_READY)) {
+ return (ZIO_PIPELINE_STOP);
+ }
+
+ if (!IO_IS_ALLOCATING(zio))
+ return (ZIO_PIPELINE_CONTINUE);
+
+ if (zio->io_children_ready != NULL) {
/*
+ * Now that all our children are ready, run the callback
+ * associated with this zio in case it wants to modify the
+ * data to be written.
+ */
+ ASSERT3U(zp->zp_level, >, 0);
+ zio->io_children_ready(zio);
+ }
+
+ ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
+ ASSERT(zio->io_bp_override == NULL);
+
+ if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) {
+ /*
* We're rewriting an existing block, which means we're
* working on behalf of spa_sync(). For spa_sync() to
* converge, it must eventually be the case that we don't
@@ -1099,7 +1371,7 @@
compress = ZIO_COMPRESS_OFF;
/* Make sure someone doesn't change their mind on overwrites */
- ASSERT(MIN(zp->zp_copies + BP_IS_GANG(bp),
+ ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp),
spa_max_replication(spa)) == BP_GET_NDVAS(bp));
}
@@ -1109,10 +1381,51 @@
if (psize == 0 || psize == lsize) {
compress = ZIO_COMPRESS_OFF;
zio_buf_free(cbuf, lsize);
+ } else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE &&
+ zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) &&
+ spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) {
+ encode_embedded_bp_compressed(bp,
+ cbuf, compress, lsize, psize);
+ BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA);
+ BP_SET_TYPE(bp, zio->io_prop.zp_type);
+ BP_SET_LEVEL(bp, zio->io_prop.zp_level);
+ zio_buf_free(cbuf, lsize);
+ bp->blk_birth = zio->io_txg;
+ zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+ ASSERT(spa_feature_is_active(spa,
+ SPA_FEATURE_EMBEDDED_DATA));
+ return (ZIO_PIPELINE_CONTINUE);
} else {
- ASSERT(psize < lsize);
- zio_push_transform(zio, cbuf, psize, lsize, NULL);
+ /*
+ * Round the compressed size up to the ashift
+ * of the smallest-ashift device, and zero the tail.
+ * This ensures that the compressed size of the BP
+ * (and thus the compressratio property) is correct,
+ * in that we charge for the padding used to fill out
+ * the last sector.
+ */
+ ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
+ size_t rounded = (size_t)P2ROUNDUP(psize,
+ 1ULL << spa->spa_min_ashift);
+ if (rounded >= lsize) {
+ compress = ZIO_COMPRESS_OFF;
+ zio_buf_free(cbuf, lsize);
+ psize = lsize;
+ } else {
+ bzero((char *)cbuf + psize, rounded - psize);
+ psize = rounded;
+ zio_push_transform(zio, cbuf,
+ psize, lsize, NULL);
+ }
}
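
The rounding branch above is just power-of-two arithmetic plus a tail bzero(); a small userland sketch of the same decision, assuming an example 4 KB smallest ashift and with P2ROUNDUP written out explicitly:

#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>

/* Round x up to the next multiple of the power-of-two 'align'. */
static uint64_t
roundup_p2(uint64_t x, uint64_t align)
{
	return ((x + align - 1) & ~(align - 1));
}

int
main(void)
{
	uint64_t lsize = 16384;         /* logical (uncompressed) size */
	uint64_t psize = 5000;          /* size after compression */
	uint64_t ashift = 12;           /* smallest-ashift device: 4 KB sectors */
	uint64_t align = 1ULL << ashift;

	char *cbuf = calloc(1, lsize);  /* stand-in compressed buffer */
	uint64_t rounded = roundup_p2(psize, align);

	if (rounded >= lsize) {
		/* Padding ate the gain; store the block uncompressed. */
		printf("compression dropped (%llu >= %llu)\n",
		    (unsigned long long)rounded, (unsigned long long)lsize);
	} else {
		/* Zero the tail so the padded sector is deterministic. */
		memset(cbuf + psize, 0, rounded - psize);
		printf("psize %llu -> %llu after ashift rounding\n",
		    (unsigned long long)psize, (unsigned long long)rounded);
	}
	free(cbuf);
	return (0);
}

If the rounded size reaches lsize the compression no longer pays for itself, which is why the branch above falls back to ZIO_COMPRESS_OFF in that case.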
+
+ /*
+ * We were unable to handle this as an override bp, treat
+ * it as a regular write I/O.
+ */
+ zio->io_bp_override = NULL;
+ *bp = zio->io_bp_orig;
+ zio->io_pipeline = zio->io_orig_pipeline;
}
/*
@@ -1123,7 +1436,8 @@
* spa_sync() to allocate new blocks, but force rewrites after that.
* There should only be a handful of blocks after pass 1 in any case.
*/
- if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize &&
+ if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg &&
+ BP_GET_PSIZE(bp) == psize &&
pass >= zfs_sync_pass_rewrite) {
ASSERT(psize != 0);
enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
@@ -1135,15 +1449,22 @@
}
if (psize == 0) {
+ if (zio->io_bp_orig.blk_birth != 0 &&
+ spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
+ BP_SET_LSIZE(bp, lsize);
+ BP_SET_TYPE(bp, zp->zp_type);
+ BP_SET_LEVEL(bp, zp->zp_level);
+ BP_SET_BIRTH(bp, zio->io_txg, 0);
+ }
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
} else {
ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
BP_SET_LSIZE(bp, lsize);
+ BP_SET_TYPE(bp, zp->zp_type);
+ BP_SET_LEVEL(bp, zp->zp_level);
BP_SET_PSIZE(bp, psize);
BP_SET_COMPRESS(bp, compress);
BP_SET_CHECKSUM(bp, zp->zp_checksum);
- BP_SET_TYPE(bp, zp->zp_type);
- BP_SET_LEVEL(bp, zp->zp_level);
BP_SET_DEDUP(bp, zp->zp_dedup);
BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
if (zp->zp_dedup) {
@@ -1157,7 +1478,6 @@
zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
}
}
-
return (ZIO_PIPELINE_CONTINUE);
}
@@ -1181,11 +1501,11 @@
*/
static void
-zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q, boolean_t cutinline)
+zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
{
spa_t *spa = zio->io_spa;
zio_type_t t = zio->io_type;
- int flags = TQ_SLEEP | (cutinline ? TQ_FRONT : 0);
+ int flags = (cutinline ? TQ_FRONT : 0);
ASSERT(q == ZIO_TASKQ_ISSUE || q == ZIO_TASKQ_INTERRUPT);
@@ -1204,31 +1524,43 @@
t = ZIO_TYPE_NULL;
/*
- * If this is a high priority I/O, then use the high priority taskq.
+ * If this is a high priority I/O, then use the high priority taskq if
+ * available.
*/
if (zio->io_priority == ZIO_PRIORITY_NOW &&
- spa->spa_zio_taskq[t][q + 1] != NULL)
+ spa->spa_zio_taskq[t][q + 1].stqs_count != 0)
q++;
ASSERT3U(q, <, ZIO_TASKQ_TYPES);
-#ifdef _KERNEL
- (void) taskq_dispatch_safe(spa->spa_zio_taskq[t][q],
- (task_func_t *)zio_execute, zio, flags, &zio->io_task);
+
+ /*
+ * NB: We are assuming that the zio can only be dispatched
+ * to a single taskq at a time. It would be a grievous error
+ * to dispatch the zio to another taskq at the same time.
+ */
+#if defined(illumos) || !defined(_KERNEL)
+ ASSERT(zio->io_tqent.tqent_next == NULL);
#else
- (void) taskq_dispatch(spa->spa_zio_taskq[t][q],
- (task_func_t *)zio_execute, zio, flags);
+ ASSERT(zio->io_tqent.tqent_task.ta_pending == 0);
#endif
+ spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio,
+ flags, &zio->io_tqent);
}
static boolean_t
-zio_taskq_member(zio_t *zio, enum zio_taskq_type q)
+zio_taskq_member(zio_t *zio, zio_taskq_type_t q)
{
kthread_t *executor = zio->io_executor;
spa_t *spa = zio->io_spa;
- for (zio_type_t t = 0; t < ZIO_TYPES; t++)
- if (taskq_member(spa->spa_zio_taskq[t][q], executor))
- return (B_TRUE);
+ for (zio_type_t t = 0; t < ZIO_TYPES; t++) {
+ spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
+ uint_t i;
+ for (i = 0; i < tqs->stqs_count; i++) {
+ if (taskq_member(tqs->stqs_taskq[i], executor))
+ return (B_TRUE);
+ }
+ }
return (B_FALSE);
}
@@ -1247,6 +1579,58 @@
zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
}
+void
+zio_delay_interrupt(zio_t *zio)
+{
+ /*
+ * The timeout_generic() function isn't defined in userspace, so
+ * rather than trying to implement the function, the zio delay
+ * functionality has been disabled for userspace builds.
+ */
+
+#ifdef _KERNEL
+ /*
+ * If io_target_timestamp is zero, then no delay has been registered
+ * for this IO, thus jump to the end of this function and "skip" the
+ * delay; issuing it directly to the zio layer.
+ */
+ if (zio->io_target_timestamp != 0) {
+ hrtime_t now = gethrtime();
+
+ if (now >= zio->io_target_timestamp) {
+ /*
+ * This IO has already taken longer than the target
+ * delay to complete, so we don't want to delay it
+ * any longer; we "miss" the delay and issue it
+ * directly to the zio layer. This is likely due to
+ * the target latency being set to a value less than
+ * the underlying hardware can satisfy (e.g. delay
+ * set to 1ms, but the disks take 10ms to complete an
+ * IO request).
+ */
+
+ DTRACE_PROBE2(zio__delay__miss, zio_t *, zio,
+ hrtime_t, now);
+
+ zio_interrupt(zio);
+ } else {
+ hrtime_t diff = zio->io_target_timestamp - now;
+
+ DTRACE_PROBE3(zio__delay__hit, zio_t *, zio,
+ hrtime_t, now, hrtime_t, diff);
+
+ (void) timeout_generic(CALLOUT_NORMAL,
+ (void (*)(void *))zio_interrupt, zio, diff, 1, 0);
+ }
+
+ return;
+ }
+#endif
+
+ DTRACE_PROBE1(zio__delay__skip, zio_t *, zio);
+ zio_interrupt(zio);
+}
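
A rough userland analogue of the decision zio_delay_interrupt() makes above, with nanosleep() standing in for the kernel-only timeout_generic() callout and an arbitrary 5 ms target; this is a sketch of the hit/miss/skip logic only, not the in-kernel mechanism:

#include <stdio.h>
#include <stdint.h>
#include <time.h>

typedef void (*done_cb_t)(void *);

static uint64_t
now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ((uint64_t)ts.tv_sec * 1000000000ULL + ts.tv_nsec);
}

/* Fire 'cb' at target_ns: immediately if we are already late ("miss") or
 * no target was set ("skip"), otherwise after the remaining difference. */
static void
delay_then_complete(uint64_t target_ns, done_cb_t cb, void *arg)
{
	uint64_t now = now_ns();

	if (target_ns == 0 || now >= target_ns) {
		cb(arg);                /* skip or miss: no extra delay */
		return;
	}
	uint64_t diff = target_ns - now;
	struct timespec ts = { (time_t)(diff / 1000000000ULL),
	    (long)(diff % 1000000000ULL) };
	nanosleep(&ts, NULL);           /* the kernel code uses a callout instead */
	cb(arg);
}

static void
done(void *arg)
{
	printf("completed %s\n", (const char *)arg);
}

int
main(void)
{
	delay_then_complete(now_ns() + 5000000ULL, done, "delayed io"); /* 5 ms */
	delay_then_complete(0, done, "undelayed io");
	return (0);
}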
+
/*
* Execute the I/O pipeline until one of the following occurs:
*
@@ -1270,6 +1654,8 @@
{
zio->io_executor = curthread;
+ ASSERT3U(zio->io_queued_timestamp, >, 0);
+
while (zio->io_stage < ZIO_STAGE_DONE) {
enum zio_stage pipeline = zio->io_pipeline;
enum zio_stage stage = zio->io_stage;
@@ -1303,7 +1689,8 @@
}
zio->io_stage = stage;
- rv = zio_pipeline[highbit(stage) - 1](zio);
+ zio->io_pipeline_trace |= zio->io_stage;
+ rv = zio_pipeline[highbit64(stage) - 1](zio);
if (rv == ZIO_PIPELINE_STOP)
return;
@@ -1326,6 +1713,8 @@
ASSERT(zio->io_executor == NULL);
zio->io_waiter = curthread;
+ ASSERT0(zio->io_queued_timestamp);
+ zio->io_queued_timestamp = gethrtime();
zio_execute(zio);
@@ -1354,9 +1743,11 @@
*/
spa_t *spa = zio->io_spa;
- zio_add_child(spa->spa_async_zio_root, zio);
+ zio_add_child(spa->spa_async_zio_root[CPU_SEQID], zio);
}
+ ASSERT0(zio->io_queued_timestamp);
+ zio->io_queued_timestamp = gethrtime();
zio_execute(zio);
}
@@ -1381,6 +1772,7 @@
pio->io_pipeline = pio->io_orig_pipeline;
pio->io_reexecute = 0;
pio->io_flags |= ZIO_FLAG_REEXECUTED;
+ pio->io_pipeline_trace = 0;
pio->io_error = 0;
for (int w = 0; w < ZIO_WAIT_TYPES; w++)
pio->io_state[w] = 0;
@@ -1397,8 +1789,9 @@
* the remainder of pio's io_child_list, from 'cio_next' onward,
* cannot be affected by any side effects of reexecuting 'cio'.
*/
- for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
- cio_next = zio_walk_children(pio);
+ zio_link_t *zl = NULL;
+ for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) {
+ cio_next = zio_walk_children(pio, &zl);
mutex_enter(&pio->io_lock);
for (int w = 0; w < ZIO_WAIT_TYPES; w++)
pio->io_children[cio->io_child_type][w]++;
@@ -1411,8 +1804,10 @@
* We don't reexecute "The Godfather" I/O here as it's the
* responsibility of the caller to wait on him.
*/
- if (!(pio->io_flags & ZIO_FLAG_GODFATHER))
+ if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) {
+ pio->io_queued_timestamp = gethrtime();
zio_execute(pio);
+ }
}
void
@@ -1756,8 +2151,9 @@
{
blkptr_t *bp = zio->io_bp;
- if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE))
+ if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT, ZIO_WAIT_DONE)) {
return (ZIO_PIPELINE_STOP);
+ }
ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
@@ -1806,6 +2202,7 @@
zio_write_gang_block(zio_t *pio)
{
spa_t *spa = pio->io_spa;
+ metaslab_class_t *mc = spa_normal_class(spa);
blkptr_t *bp = pio->io_bp;
zio_t *gio = pio->io_gang_leader;
zio_t *zio;
@@ -1819,10 +2216,43 @@
zio_prop_t zp;
int error;
- error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE,
- bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp,
- METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER);
+ int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER;
+ if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
+ ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
+ ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA));
+
+ flags |= METASLAB_ASYNC_ALLOC;
+ VERIFY(refcount_held(&mc->mc_alloc_slots, pio));
+
+ /*
+ * The logical zio has already placed a reservation for
+ * 'copies' allocation slots but gang blocks may require
+ * additional copies. These additional copies
+ * (i.e. gbh_copies - copies) are guaranteed to succeed
+ * since metaslab_class_throttle_reserve() always allows
+ * additional reservations for gang blocks.
+ */
+ VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies,
+ pio, flags));
+ }
+
+ error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE,
+ bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags, pio);
if (error) {
+ if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
+ ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
+ ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA));
+
+ /*
+ * If we failed to allocate the gang block header then
+ * we remove any additional allocation reservations that
+ * we placed here. The original reservation will
+ * be removed when the logical I/O goes to the ready
+ * stage.
+ */
+ metaslab_class_throttle_unreserve(mc,
+ gbh_copies - copies, pio);
+ }
pio->io_error = error;
return (ZIO_PIPELINE_CONTINUE);
}
@@ -1861,11 +2291,25 @@
zp.zp_dedup_verify = B_FALSE;
zp.zp_nopwrite = B_FALSE;
- zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
+ zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
(char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
- zio_write_gang_member_ready, NULL, &gn->gn_child[g],
- pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
- &pio->io_bookmark));
+ zio_write_gang_member_ready, NULL, NULL, NULL,
+ &gn->gn_child[g], pio->io_priority,
+ ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
+
+ if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
+ ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
+ ASSERT(!(pio->io_flags & ZIO_FLAG_NODATA));
+
+ /*
+ * Gang children won't throttle but we should
+ * account for their work, so reserve an allocation
+ * slot for them here.
+ */
+ VERIFY(metaslab_class_throttle_reserve(mc,
+ zp.zp_copies, cio, flags));
+ }
+ zio_nowait(cio);
}
/*
@@ -1879,12 +2323,22 @@
}
/*
- * The zio_nop_write stage in the pipeline determines if allocating
- * a new bp is necessary. By leveraging a cryptographically secure checksum,
- * such as SHA256, we can compare the checksums of the new data and the old
- * to determine if allocating a new block is required. The nopwrite
- * feature can handle writes in either syncing or open context (i.e. zil
- * writes) and as a result is mutually exclusive with dedup.
+ * The zio_nop_write stage in the pipeline determines if allocating a
+ * new bp is necessary. The nopwrite feature can handle writes in
+ * either syncing or open context (i.e. zil writes) and as a result is
+ * mutually exclusive with dedup.
+ *
+ * By leveraging a cryptographically secure checksum, such as SHA256, we
+ * can compare the checksums of the new data and the old to determine if
+ * allocating a new block is required. Note that our requirements for
+ * cryptographic strength are fairly weak: there can't be any accidental
+ * hash collisions, but we don't need to be secure against intentional
+ * (malicious) collisions. To trigger a nopwrite, you have to be able
+ * to write the file to begin with, and triggering an incorrect (hash
+ * collision) nopwrite is no worse than simply writing to the file.
+ * That said, there are no known attacks against the checksum algorithms
+ * used for nopwrite, assuming that the salt and the checksums
+ * themselves remain secret.
*/
static int
zio_nop_write(zio_t *zio)
@@ -1907,7 +2361,8 @@
* allocate a new bp.
*/
if (BP_IS_HOLE(bp_orig) ||
- !zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_dedup ||
+ !(zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_flags &
+ ZCHECKSUM_FLAG_NOPWRITE) ||
BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) ||
BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) ||
BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) ||
@@ -1919,7 +2374,8 @@
* avoid allocating a new bp and issuing any I/O.
*/
if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) {
- ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup);
+ ASSERT(zio_checksum_table[zp->zp_checksum].ci_flags &
+ ZCHECKSUM_FLAG_NOPWRITE);
ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig));
ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig));
ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF);
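
To make the nopwrite test concrete, here is a simplified standalone sketch of the core comparison: identical strong checksums plus unchanged write properties mean the existing block pointer can be reused. The checksum here is a toy FNV-style fold rather than SHA-256, and the property struct is invented for illustration:

#include <stdint.h>
#include <string.h>
#include <stdio.h>

typedef struct cksum { uint64_t word[4]; } cksum_t;

typedef struct blk_props {
	cksum_t checksum;       /* strong checksum of the block's contents */
	int     compress;
	int     cksum_alg;
	int     copies;
} blk_props_t;

/* Stand-in "strong" checksum: FNV-1a folded into four words.  The real
 * code requires an algorithm with the nopwrite flag set (e.g. sha256). */
static cksum_t
toy_checksum(const void *buf, size_t len)
{
	const uint8_t *p = buf;
	cksum_t c = { { 0xcbf29ce484222325ULL, 1, 2, 3 } };

	for (size_t i = 0; i < len; i++) {
		c.word[i & 3] ^= p[i];
		c.word[i & 3] *= 0x100000001b3ULL;
	}
	return (c);
}

/* Returns 1 if the new write can be turned into a nopwrite. */
static int
can_nopwrite(const blk_props_t *old, const void *newdata, size_t len,
    const blk_props_t *newprops)
{
	cksum_t newck = toy_checksum(newdata, len);

	if (old->compress != newprops->compress ||
	    old->cksum_alg != newprops->cksum_alg ||
	    old->copies != newprops->copies)
		return (0);
	return (memcmp(&newck, &old->checksum, sizeof (cksum_t)) == 0);
}

int
main(void)
{
	const char data[] = "unchanged block contents";
	blk_props_t old = { toy_checksum(data, sizeof (data)), 1, 1, 1 };
	blk_props_t newp = { { { 0 } }, 1, 1, 1 };

	printf("nopwrite: %s\n",
	    can_nopwrite(&old, data, sizeof (data), &newp) ? "yes" : "no");
	return (0);
}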
@@ -2006,8 +2462,9 @@
{
blkptr_t *bp = zio->io_bp;
- if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE))
+ if (zio_wait_for_children(zio, ZIO_CHILD_DDT_BIT, ZIO_WAIT_DONE)) {
return (ZIO_PIPELINE_STOP);
+ }
ASSERT(BP_GET_DEDUP(bp));
ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
@@ -2064,7 +2521,7 @@
if (ddp->ddp_phys_birth != 0) {
arc_buf_t *abuf = NULL;
- uint32_t aflags = ARC_WAIT;
+ arc_flags_t aflags = ARC_FLAG_WAIT;
blkptr_t blk = *zio->io_bp;
int error;
@@ -2082,7 +2539,7 @@
bcmp(abuf->b_data, zio->io_orig_data,
zio->io_orig_size) != 0)
error = SET_ERROR(EEXIST);
- VERIFY(arc_buf_remove_ref(abuf, &abuf));
+ arc_buf_destroy(abuf, &abuf);
}
ddt_enter(ddt);
@@ -2111,7 +2568,8 @@
ddt_phys_fill(ddp, zio->io_bp);
- while ((pio = zio_walk_parents(zio)) != NULL)
+ zio_link_t *zl = NULL;
+ while ((pio = zio_walk_parents(zio, &zl)) != NULL)
ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);
ddt_exit(ddt);
@@ -2132,7 +2590,8 @@
dde->dde_lead_zio[p] = NULL;
if (zio->io_error == 0) {
- while (zio_walk_parents(zio) != NULL)
+ zio_link_t *zl = NULL;
+ while (zio_walk_parents(zio, &zl) != NULL)
ddt_phys_addref(ddp);
} else {
ddt_phys_clear(ddp);
@@ -2200,7 +2659,8 @@
* we can't resolve it, so just convert to an ordinary write.
* (And automatically e-mail a paper to Nature?)
*/
- if (!zio_checksum_table[zp->zp_checksum].ci_dedup) {
+ if (!(zio_checksum_table[zp->zp_checksum].ci_flags &
+ ZCHECKSUM_FLAG_DEDUP)) {
zp->zp_checksum = spa_dedup_checksum(spa);
zio_pop_transforms(zio);
zio->io_stage = ZIO_STAGE_OPEN;
@@ -2240,8 +2700,8 @@
}
dio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
- zio->io_orig_size, &czp, NULL,
- zio_ddt_ditto_write_done, dde, zio->io_priority,
+ zio->io_orig_size, &czp, NULL, NULL,
+ NULL, zio_ddt_ditto_write_done, dde, zio->io_priority,
ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL);
@@ -2262,7 +2722,8 @@
ddt_phys_addref(ddp);
} else {
cio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
- zio->io_orig_size, zp, zio_ddt_child_write_ready,
+ zio->io_orig_size, zp,
+ zio_ddt_child_write_ready, NULL, NULL,
zio_ddt_child_write_done, dde, zio->io_priority,
ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
@@ -2308,7 +2769,98 @@
* Allocate and free blocks
* ==========================================================================
*/
+
+static zio_t *
+zio_io_to_allocate(spa_t *spa)
+{
+ zio_t *zio;
+
+ ASSERT(MUTEX_HELD(&spa->spa_alloc_lock));
+
+ zio = avl_first(&spa->spa_alloc_tree);
+ if (zio == NULL)
+ return (NULL);
+
+ ASSERT(IO_IS_ALLOCATING(zio));
+
+ /*
+ * Try to place a reservation for this zio. If we're unable to
+ * reserve then we throttle.
+ */
+ if (!metaslab_class_throttle_reserve(spa_normal_class(spa),
+ zio->io_prop.zp_copies, zio, 0)) {
+ return (NULL);
+ }
+
+ avl_remove(&spa->spa_alloc_tree, zio);
+ ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE);
+
+ return (zio);
+}
+
static int
+zio_dva_throttle(zio_t *zio)
+{
+ spa_t *spa = zio->io_spa;
+ zio_t *nio;
+
+ if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE ||
+ !spa_normal_class(zio->io_spa)->mc_alloc_throttle_enabled ||
+ zio->io_child_type == ZIO_CHILD_GANG ||
+ zio->io_flags & ZIO_FLAG_NODATA) {
+ return (ZIO_PIPELINE_CONTINUE);
+ }
+
+ ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
+
+ ASSERT3U(zio->io_queued_timestamp, >, 0);
+ ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE);
+
+ mutex_enter(&spa->spa_alloc_lock);
+
+ ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+ avl_add(&spa->spa_alloc_tree, zio);
+
+ nio = zio_io_to_allocate(zio->io_spa);
+ mutex_exit(&spa->spa_alloc_lock);
+
+ if (nio == zio)
+ return (ZIO_PIPELINE_CONTINUE);
+
+ if (nio != NULL) {
+ ASSERT3U(nio->io_queued_timestamp, <=,
+ zio->io_queued_timestamp);
+ ASSERT(nio->io_stage == ZIO_STAGE_DVA_THROTTLE);
+ /*
+ * We are passing control to a new zio so make sure that
+ * it is processed by a different thread. We do this to
+ * avoid stack overflows that can occur when parents are
+ * throttled and children are making progress. We allow
+ * it to go to the head of the taskq since it's already
+ * been waiting.
+ */
+ zio_taskq_dispatch(nio, ZIO_TASKQ_ISSUE, B_TRUE);
+ }
+ return (ZIO_PIPELINE_STOP);
+}
+
+void
+zio_allocate_dispatch(spa_t *spa)
+{
+ zio_t *zio;
+
+ mutex_enter(&spa->spa_alloc_lock);
+ zio = zio_io_to_allocate(spa);
+ mutex_exit(&spa->spa_alloc_lock);
+ if (zio == NULL)
+ return;
+
+ ASSERT3U(zio->io_stage, ==, ZIO_STAGE_DVA_THROTTLE);
+ ASSERT0(zio->io_error);
+ zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_TRUE);
+}
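
zio_io_to_allocate(), zio_dva_throttle() and zio_allocate_dispatch() together implement a slot-based allocation throttle: writes queue in arrival order, each must reserve a slot before it may allocate, and a finished allocation hands its slot to the oldest waiter. A single-threaded toy version of that accounting (a plain array instead of the AVL tree, no locking, made-up limits):

#include <stdio.h>

#define MAX_SLOTS   2           /* hypothetical reservation limit */
#define MAX_WAITERS 8

static int slots_used;
static int waiters[MAX_WAITERS];        /* pending write ids, FIFO order */
static int nwaiters;

/* Try to reserve a slot; on failure the write stays queued (throttled). */
static int
throttle_reserve(int id)
{
	if (slots_used >= MAX_SLOTS) {
		waiters[nwaiters++] = id;
		printf("write %d throttled\n", id);
		return (0);
	}
	slots_used++;
	printf("write %d issued (slots used %d)\n", id, slots_used);
	return (1);
}

/* A finished allocation returns its slot and kicks the oldest waiter. */
static void
throttle_unreserve(void)
{
	slots_used--;
	if (nwaiters > 0) {
		int next = waiters[0];

		for (int i = 1; i < nwaiters; i++)
			waiters[i - 1] = waiters[i];
		nwaiters--;
		(void) throttle_reserve(next);  /* a slot is now free, so it fits */
	}
}

int
main(void)
{
	for (int id = 1; id <= 4; id++)
		(void) throttle_reserve(id);    /* writes 3 and 4 get queued */
	throttle_unreserve();                   /* frees a slot, issues write 3 */
	throttle_unreserve();                   /* frees a slot, issues write 4 */
	return (0);
}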
+
+static int
zio_dva_allocate(zio_t *zio)
{
spa_t *spa = zio->io_spa;
@@ -2328,18 +2880,20 @@
ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
- /*
- * The dump device does not support gang blocks so allocation on
- * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid
- * the "fast" gang feature.
- */
- flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0;
- flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ?
- METASLAB_GANG_CHILD : 0;
+ if (zio->io_flags & ZIO_FLAG_NODATA) {
+ flags |= METASLAB_DONT_THROTTLE;
+ }
+ if (zio->io_flags & ZIO_FLAG_GANG_CHILD) {
+ flags |= METASLAB_GANG_CHILD;
+ }
+ if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE) {
+ flags |= METASLAB_ASYNC_ALLOC;
+ }
+
error = metaslab_alloc(spa, mc, zio->io_size, bp,
- zio->io_prop.zp_copies, zio->io_txg, NULL, flags);
+ zio->io_prop.zp_copies, zio->io_txg, NULL, flags, zio);
- if (error) {
+ if (error != 0) {
spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, "
"size %llu, error %d", spa_name(spa), zio, zio->io_size,
error);
@@ -2398,27 +2952,21 @@
*/
int
zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp,
- uint64_t size, boolean_t use_slog)
+ uint64_t size, boolean_t *slog)
{
int error = 1;
ASSERT(txg > spa_syncing_txg(spa));
- /*
- * ZIL blocks are always contiguous (i.e. not gang blocks) so we
- * set the METASLAB_GANG_AVOID flag so that they don't "fast gang"
- * when allocating them.
- */
- if (use_slog) {
- error = metaslab_alloc(spa, spa_log_class(spa), size,
- new_bp, 1, txg, old_bp,
- METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID);
- }
-
- if (error) {
+ error = metaslab_alloc(spa, spa_log_class(spa), size,
+ new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID, NULL);
+ if (error == 0) {
+ *slog = TRUE;
+ } else {
error = metaslab_alloc(spa, spa_normal_class(spa), size,
- new_bp, 1, txg, old_bp,
- METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID);
+ new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID, NULL);
+ if (error == 0)
+ *slog = FALSE;
}
if (error == 0) {
@@ -2454,6 +3002,18 @@
* Read, write and delete to physical devices
* ==========================================================================
*/
+
+
+/*
+ * Issue an I/O to the underlying vdev. Typically the issue pipeline
+ * stops after this stage and will resume upon I/O completion.
+ * However, there are instances where the vdev layer may need to
+ * continue the pipeline when an I/O was not issued. Since the I/O
+ * that was sent to the vdev layer might be different than the one
+ * currently active in the pipeline (see vdev_queue_io()), we explicitly
+ * force the underlying vdev layers to call either zio_execute() or
+ * zio_interrupt() to ensure that the pipeline continues with the correct I/O.
+ */
static int
zio_vdev_io_start(zio_t *zio)
{
@@ -2460,6 +3020,7 @@
vdev_t *vd = zio->io_vd;
uint64_t align;
spa_t *spa = zio->io_spa;
+ int ret;
ASSERT(zio->io_error == 0);
ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);
@@ -2471,20 +3032,24 @@
/*
* The mirror_ops handle multiple DVAs in a single BP.
*/
- return (vdev_mirror_ops.vdev_op_io_start(zio));
+ vdev_mirror_ops.vdev_op_io_start(zio);
+ return (ZIO_PIPELINE_STOP);
}
- if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE) {
+ if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_FREE &&
+ zio->io_priority == ZIO_PRIORITY_NOW) {
trim_map_free(vd, zio->io_offset, zio->io_size, zio->io_txg);
return (ZIO_PIPELINE_CONTINUE);
}
+ ASSERT3P(zio->io_logical, !=, zio);
+
/*
* We keep track of time-sensitive I/Os so that the scan thread
* can quickly react to certain workloads. In particular, we care
* about non-scrubbing, top-level reads and writes with the following
* characteristics:
- * - synchronous writes of user data to non-slog devices
+ * - synchronous writes of user data to non-slog devices
* - any reads of user data
* When these conditions are met, adjust the timestamp of spa_last_io
* which allows the scan thread to adjust its workload accordingly.
@@ -2501,7 +3066,9 @@
align = 1ULL << vd->vdev_top->vdev_ashift;
- if (P2PHASE(zio->io_size, align) != 0) {
+ if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
+ P2PHASE(zio->io_size, align) != 0) {
+ /* Transform logical writes to be a full physical block size. */
uint64_t asize = P2ROUNDUP(zio->io_size, align);
char *abuf = NULL;
if (zio->io_type == ZIO_TYPE_READ ||
@@ -2516,8 +3083,24 @@
zio_subblock);
}
- ASSERT(P2PHASE(zio->io_offset, align) == 0);
- ASSERT(P2PHASE(zio->io_size, align) == 0);
+ /*
+ * If this is not a physical io, make sure that it is properly aligned
+ * before proceeding.
+ */
+ if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) {
+ ASSERT0(P2PHASE(zio->io_offset, align));
+ ASSERT0(P2PHASE(zio->io_size, align));
+ } else {
+ /*
+ * For the physical io we allow alignment
+ * to a logical block size.
+ */
+ uint64_t log_align =
+ 1ULL << vd->vdev_top->vdev_logical_ashift;
+ ASSERT0(P2PHASE(zio->io_offset, log_align));
+ ASSERT0(P2PHASE(zio->io_size, log_align));
+ }
+
VERIFY(zio->io_type == ZIO_TYPE_READ || spa_writeable(spa));
/*
@@ -2542,34 +3125,37 @@
return (ZIO_PIPELINE_CONTINUE);
}
- if (vd->vdev_ops->vdev_op_leaf &&
- (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {
+ if (vd->vdev_ops->vdev_op_leaf) {
+ switch (zio->io_type) {
+ case ZIO_TYPE_READ:
+ if (vdev_cache_read(zio))
+ return (ZIO_PIPELINE_CONTINUE);
+ /* FALLTHROUGH */
+ case ZIO_TYPE_WRITE:
+ case ZIO_TYPE_FREE:
+ if ((zio = vdev_queue_io(zio)) == NULL)
+ return (ZIO_PIPELINE_STOP);
- if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
- return (ZIO_PIPELINE_CONTINUE);
-
- if ((zio = vdev_queue_io(zio)) == NULL)
- return (ZIO_PIPELINE_STOP);
-
- if (!vdev_accessible(vd, zio)) {
- zio->io_error = SET_ERROR(ENXIO);
- zio_interrupt(zio);
- return (ZIO_PIPELINE_STOP);
+ if (!vdev_accessible(vd, zio)) {
+ zio->io_error = SET_ERROR(ENXIO);
+ zio_interrupt(zio);
+ return (ZIO_PIPELINE_STOP);
+ }
+ break;
}
- }
-
- /*
- * Note that we ignore repair writes for TRIM because they can conflict
- * with normal writes. This isn't an issue because, by definition, we
- * only repair blocks that aren't freed.
- */
- if (vd->vdev_ops->vdev_op_leaf && zio->io_type == ZIO_TYPE_WRITE &&
- !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
- if (!trim_map_write_start(zio))
+ /*
+ * Note that we ignore repair writes for TRIM because they can
+ * conflict with normal writes. This isn't an issue because, by
+ * definition, we only repair blocks that aren't freed.
+ */
+ if (zio->io_type == ZIO_TYPE_WRITE &&
+ !(zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
+ !trim_map_write_start(zio))
return (ZIO_PIPELINE_STOP);
}
- return (vd->vdev_ops->vdev_op_io_start(zio));
+ vd->vdev_ops->vdev_op_io_start(zio);
+ return (ZIO_PIPELINE_STOP);
}
static int
@@ -2579,14 +3165,16 @@
vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops;
boolean_t unexpected_error = B_FALSE;
- if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
+ if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) {
return (ZIO_PIPELINE_STOP);
+ }
ASSERT(zio->io_type == ZIO_TYPE_READ ||
zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_FREE);
if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
- (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {
+ (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE ||
+ zio->io_type == ZIO_TYPE_FREE)) {
if (zio->io_type == ZIO_TYPE_WRITE &&
!(zio->io_flags & ZIO_FLAG_IO_REPAIR))
@@ -2605,7 +3193,10 @@
zio->io_error = zio_handle_label_injection(zio, EIO);
if (zio->io_error) {
- if (!vdev_accessible(vd, zio)) {
+ if (zio->io_error == ENOTSUP &&
+ zio->io_type == ZIO_TYPE_FREE) {
+ /* Not all devices support TRIM. */
+ } else if (!vdev_accessible(vd, zio)) {
zio->io_error = SET_ERROR(ENXIO);
} else {
unexpected_error = B_TRUE;
@@ -2652,8 +3243,9 @@
{
vdev_t *vd = zio->io_vd;
- if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
+ if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) {
return (ZIO_PIPELINE_STOP);
+ }
if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
spa_config_exit(zio->io_spa, SCL_ZIO, zio);
@@ -2666,7 +3258,8 @@
if (zio_injection_enabled && zio->io_error == 0)
zio->io_error = zio_handle_fault_injection(zio, EIO);
- if (zio->io_type == ZIO_TYPE_IOCTL && zio->io_cmd == DKIOCTRIM)
+ if (zio->io_type == ZIO_TYPE_FREE &&
+ zio->io_priority != ZIO_PRIORITY_NOW) {
switch (zio->io_error) {
case 0:
ZIO_TRIM_STAT_INCR(bytes, zio->io_size);
@@ -2679,6 +3272,7 @@
ZIO_TRIM_STAT_BUMP(failed);
break;
}
+ }
/*
* If the I/O failed, determine whether we should attempt to retry it.
@@ -2719,6 +3313,13 @@
if (zio->io_error)
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+ if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
+ zio->io_physdone != NULL) {
+ ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED));
+ ASSERT(zio->io_child_type == ZIO_CHILD_VDEV);
+ zio->io_physdone(zio->io_logical);
+ }
+
return (ZIO_PIPELINE_CONTINUE);
}
@@ -2807,7 +3408,8 @@
if ((error = zio_checksum_error(zio, &info)) != 0) {
zio->io_error = error;
- if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
+ if (error == ECKSUM &&
+ !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
zfs_ereport_start_checksum(zio->io_spa,
zio->io_vd, zio, zio->io_offset,
zio->io_size, NULL, &info);
@@ -2829,7 +3431,7 @@
/*
* ==========================================================================
* Error rank. Error are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
- * An error of 0 indictes success. ENXIO indicates whole-device failure,
+ * An error of 0 indicates success. ENXIO indicates whole-device failure,
* which may be transient (e.g. unplugged) or permanent. ECKSUM and EIO
* indicate errors that are specific to one I/O, and most likely permanent.
* Any other error is presumed to be worse because we weren't expecting it.
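
The ranking described here is applied by a small worst-of comparison; a reconstruction of the idea as a standalone program (ECKSUM is ZFS-internal, so EBADMSG stands in for it here):

#include <errno.h>
#include <stdio.h>

#ifndef ECKSUM
#define ECKSUM EBADMSG          /* stand-in: ECKSUM is a ZFS-internal errno */
#endif

/* Rank table: lower index = less severe.  Any error not listed
 * ranks worse than everything in the table. */
static const int error_rank[] = { 0, ENXIO, ECKSUM, EIO };
#define NRANKS (int)(sizeof (error_rank) / sizeof (error_rank[0]))

static int
rank_of(int err)
{
	for (int r = 0; r < NRANKS; r++)
		if (error_rank[r] == err)
			return (r);
	return (NRANKS);        /* unexpected errors are the worst */
}

/* Return whichever of the two errors is more severe. */
static int
worst_error(int e1, int e2)
{
	return (rank_of(e1) >= rank_of(e2) ? e1 : e2);
}

int
main(void)
{
	printf("%d\n", worst_error(ENXIO, ECKSUM));     /* ECKSUM wins */
	printf("%d\n", worst_error(EIO, 0));            /* EIO wins */
	return (0);
}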
@@ -2862,10 +3464,12 @@
{
blkptr_t *bp = zio->io_bp;
zio_t *pio, *pio_next;
+ zio_link_t *zl = NULL;
- if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
- zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY))
+ if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT,
+ ZIO_WAIT_READY)) {
return (ZIO_PIPELINE_STOP);
+ }
if (zio->io_ready) {
ASSERT(IO_IS_ALLOCATING(zio));
@@ -2879,12 +3483,26 @@
if (bp != NULL && bp != &zio->io_bp_copy)
zio->io_bp_copy = *bp;
- if (zio->io_error)
+ if (zio->io_error != 0) {
zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+ if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
+ ASSERT(IO_IS_ALLOCATING(zio));
+ ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
+ /*
+ * We were unable to allocate anything, unreserve and
+ * issue the next I/O to allocate.
+ */
+ metaslab_class_throttle_unreserve(
+ spa_normal_class(zio->io_spa),
+ zio->io_prop.zp_copies, zio);
+ zio_allocate_dispatch(zio->io_spa);
+ }
+ }
+
mutex_enter(&zio->io_lock);
zio->io_state[ZIO_WAIT_READY] = 1;
- pio = zio_walk_parents(zio);
+ pio = zio_walk_parents(zio, &zl);
mutex_exit(&zio->io_lock);
/*
@@ -2895,7 +3513,7 @@
* all parents must wait for us to be done before they can be done.
*/
for (; pio != NULL; pio = pio_next) {
- pio_next = zio_walk_parents(zio);
+ pio_next = zio_walk_parents(zio, &zl);
zio_notify_parent(pio, zio, ZIO_WAIT_READY);
}
@@ -2915,6 +3533,66 @@
return (ZIO_PIPELINE_CONTINUE);
}
+/*
+ * Update the allocation throttle accounting.
+ */
+static void
+zio_dva_throttle_done(zio_t *zio)
+{
+ zio_t *lio = zio->io_logical;
+ zio_t *pio = zio_unique_parent(zio);
+ vdev_t *vd = zio->io_vd;
+ int flags = METASLAB_ASYNC_ALLOC;
+
+ ASSERT3P(zio->io_bp, !=, NULL);
+ ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
+ ASSERT3U(zio->io_priority, ==, ZIO_PRIORITY_ASYNC_WRITE);
+ ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV);
+ ASSERT(vd != NULL);
+ ASSERT3P(vd, ==, vd->vdev_top);
+ ASSERT(!(zio->io_flags & (ZIO_FLAG_IO_REPAIR | ZIO_FLAG_IO_RETRY)));
+ ASSERT(zio->io_flags & ZIO_FLAG_IO_ALLOCATING);
+ ASSERT(!(lio->io_flags & ZIO_FLAG_IO_REWRITE));
+ ASSERT(!(lio->io_orig_flags & ZIO_FLAG_NODATA));
+
+ /*
+ * Parents of gang children can have two flavors -- ones that
+ * allocated the gang header (will have ZIO_FLAG_IO_REWRITE set)
+ * and ones that allocated the constituent blocks. The allocation
+ * throttle needs to know the allocating parent zio so we must find
+ * it here.
+ */
+ if (pio->io_child_type == ZIO_CHILD_GANG) {
+ /*
+ * If our parent is a rewrite gang child then our grandparent
+ * would have been the one that performed the allocation.
+ */
+ if (pio->io_flags & ZIO_FLAG_IO_REWRITE)
+ pio = zio_unique_parent(pio);
+ flags |= METASLAB_GANG_CHILD;
+ }
+
+ ASSERT(IO_IS_ALLOCATING(pio));
+ ASSERT3P(zio, !=, zio->io_logical);
+ ASSERT(zio->io_logical != NULL);
+ ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR));
+ ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE);
+
+ mutex_enter(&pio->io_lock);
+ metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags);
+ mutex_exit(&pio->io_lock);
+
+ metaslab_class_throttle_unreserve(spa_normal_class(zio->io_spa),
+ 1, pio);
+
+ /*
+ * Call into the pipeline to see if there is more work that
+ * needs to be done. If there is work to be done it will be
+ * dispatched to another taskq thread.
+ */
+ zio_allocate_dispatch(zio->io_spa);
+}
+
static int
zio_done(zio_t *zio)
{
@@ -2924,22 +3602,46 @@
vdev_t *vd = zio->io_vd;
uint64_t psize = zio->io_size;
zio_t *pio, *pio_next;
+ metaslab_class_t *mc = spa_normal_class(spa);
+ zio_link_t *zl = NULL;
/*
* If our children haven't all completed,
* wait for them and then repeat this pipeline stage.
*/
- if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) ||
- zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) ||
- zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) ||
- zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
+ if (zio_wait_for_children(zio, ZIO_CHILD_ALL_BITS, ZIO_WAIT_DONE)) {
return (ZIO_PIPELINE_STOP);
+ }
+ /*
+ * If the allocation throttle is enabled, then update the accounting.
+ * We only track child I/Os that are part of an allocating async
+ * write. We must do this since the allocation is performed
+ * by the logical I/O but the actual write is done by child I/Os.
+ */
+ if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING &&
+ zio->io_child_type == ZIO_CHILD_VDEV) {
+ ASSERT(mc->mc_alloc_throttle_enabled);
+ zio_dva_throttle_done(zio);
+ }
+
+ /*
+ * If the allocation throttle is enabled, verify that
+ * we have decremented the refcounts for every I/O that was throttled.
+ */
+ if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) {
+ ASSERT(zio->io_type == ZIO_TYPE_WRITE);
+ ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE);
+ ASSERT(bp != NULL);
+ metaslab_group_alloc_verify(spa, zio->io_bp, zio);
+ VERIFY(refcount_not_held(&mc->mc_alloc_slots, zio));
+ }
+
for (int c = 0; c < ZIO_CHILD_TYPES; c++)
for (int w = 0; w < ZIO_WAIT_TYPES; w++)
ASSERT(zio->io_children[c][w] == 0);
- if (bp != NULL) {
+ if (bp != NULL && !BP_IS_EMBEDDED(bp)) {
ASSERT(bp->blk_pad[0] == 0);
ASSERT(bp->blk_pad[1] == 0);
ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 ||
@@ -3104,13 +3806,15 @@
* trouble (e.g. suspended). This allows "The Godfather"
* I/O to return status without blocking.
*/
- for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
- zio_link_t *zl = zio->io_walk_link;
- pio_next = zio_walk_parents(zio);
+ zl = NULL;
+ for (pio = zio_walk_parents(zio, &zl); pio != NULL;
+ pio = pio_next) {
+ zio_link_t *remove_zl = zl;
+ pio_next = zio_walk_parents(zio, &zl);
if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
(zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
- zio_remove_child(pio, zio, zl);
+ zio_remove_child(pio, zio, remove_zl);
zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
}
}
@@ -3135,16 +3839,14 @@
* Reexecution is potentially a huge amount of work.
* Hand it off to the otherwise-unused claim taskq.
*/
-#ifdef _KERNEL
- (void) taskq_dispatch_safe(
- spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE],
- (task_func_t *)zio_reexecute, zio, TQ_SLEEP,
- &zio->io_task);
+#if defined(illumos) || !defined(_KERNEL)
+ ASSERT(zio->io_tqent.tqent_next == NULL);
#else
- (void) taskq_dispatch(
- spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE],
- (task_func_t *)zio_reexecute, zio, TQ_SLEEP);
+ ASSERT(zio->io_tqent.tqent_task.ta_pending == 0);
#endif
+ spa_taskq_dispatch_ent(spa, ZIO_TYPE_CLAIM,
+ ZIO_TASKQ_ISSUE, (task_func_t *)zio_reexecute, zio,
+ 0, &zio->io_tqent);
}
return (ZIO_PIPELINE_STOP);
}
@@ -3176,10 +3878,11 @@
zio->io_state[ZIO_WAIT_DONE] = 1;
mutex_exit(&zio->io_lock);
- for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
- zio_link_t *zl = zio->io_walk_link;
- pio_next = zio_walk_parents(zio);
- zio_remove_child(pio, zio, zl);
+ zl = NULL;
+ for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) {
+ zio_link_t *remove_zl = zl;
+ pio_next = zio_walk_parents(zio, &zl);
+ zio_remove_child(pio, zio, remove_zl);
zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
}
@@ -3203,9 +3906,10 @@
static zio_pipe_stage_t *zio_pipeline[] = {
NULL,
zio_read_bp_init,
+ zio_write_bp_init,
zio_free_bp_init,
zio_issue_async,
- zio_write_bp_init,
+ zio_write_compress,
zio_checksum_generate,
zio_nop_write,
zio_ddt_read_start,
@@ -3214,6 +3918,7 @@
zio_ddt_free,
zio_gang_assemble,
zio_gang_issue,
+ zio_dva_throttle,
zio_dva_allocate,
zio_dva_free,
zio_dva_claim,
@@ -3225,44 +3930,127 @@
zio_done
};
-/* dnp is the dnode for zb1->zb_object */
-boolean_t
-zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1,
- const zbookmark_t *zb2)
-{
- uint64_t zb1nextL0, zb2thisobj;
- ASSERT(zb1->zb_objset == zb2->zb_objset);
- ASSERT(zb2->zb_level == 0);
+
+/*
+ * Compare two zbookmark_phys_t's to see which we would reach first in a
+ * pre-order traversal of the object tree.
+ *
+ * This is simple in every case aside from the meta-dnode object. For all other
+ * objects, we traverse them in order (object 1 before object 2, and so on).
+ * However, all of these objects are traversed while traversing object 0, since
+ * the data it points to is the list of objects. Thus, we need to convert to a
+ * canonical representation so we can compare meta-dnode bookmarks to
+ * non-meta-dnode bookmarks.
+ *
+ * We do this by calculating "equivalents" for each field of the zbookmark.
+ * zbookmarks outside of the meta-dnode use their own object and level, and
+ * calculate the level 0 equivalent (the first L0 blkid that is contained in the
+ * blocks this bookmark refers to) by multiplying their blkid by their span
+ * (the number of L0 blocks contained within one block at their level).
+ * zbookmarks inside the meta-dnode calculate their object equivalent
+ * (which is L0equiv * dnodes per data block), use 0 for their L0equiv, and use
+ * level + 1<<31 (any value larger than a level could ever be) for their level.
+ * This causes them to always compare before a bookmark in their object
+ * equivalent, compare appropriately to bookmarks in other objects, and to
+ * compare appropriately to other bookmarks in the meta-dnode.
+ */
+int
+zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2,
+ const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2)
+{
/*
- * A bookmark in the deadlist is considered to be after
- * everything else.
+ * These variables represent the "equivalent" values for the zbookmark,
+ * after converting zbookmarks inside the meta dnode to their
+ * normal-object equivalents.
*/
- if (zb2->zb_object == DMU_DEADLIST_OBJECT)
- return (B_TRUE);
+ uint64_t zb1obj, zb2obj;
+ uint64_t zb1L0, zb2L0;
+ uint64_t zb1level, zb2level;
- /* The objset_phys_t isn't before anything. */
- if (dnp == NULL)
- return (B_FALSE);
+ if (zb1->zb_object == zb2->zb_object &&
+ zb1->zb_level == zb2->zb_level &&
+ zb1->zb_blkid == zb2->zb_blkid)
+ return (0);
- zb1nextL0 = (zb1->zb_blkid + 1) <<
- ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
+ /*
+ * BP_SPANB calculates the span in blocks.
+ */
+ zb1L0 = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level);
+ zb2L0 = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level);
- zb2thisobj = zb2->zb_object ? zb2->zb_object :
- zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
-
if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
- uint64_t nextobj = zb1nextL0 *
- (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
- return (nextobj <= zb2thisobj);
+ zb1obj = zb1L0 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
+ zb1L0 = 0;
+ zb1level = zb1->zb_level + COMPARE_META_LEVEL;
+ } else {
+ zb1obj = zb1->zb_object;
+ zb1level = zb1->zb_level;
}
- if (zb1->zb_object < zb2thisobj)
- return (B_TRUE);
- if (zb1->zb_object > zb2thisobj)
+ if (zb2->zb_object == DMU_META_DNODE_OBJECT) {
+ zb2obj = zb2L0 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT));
+ zb2L0 = 0;
+ zb2level = zb2->zb_level + COMPARE_META_LEVEL;
+ } else {
+ zb2obj = zb2->zb_object;
+ zb2level = zb2->zb_level;
+ }
+
+ /* Now that we have a canonical representation, do the comparison. */
+ if (zb1obj != zb2obj)
+ return (zb1obj < zb2obj ? -1 : 1);
+ else if (zb1L0 != zb2L0)
+ return (zb1L0 < zb2L0 ? -1 : 1);
+ else if (zb1level != zb2level)
+ return (zb1level > zb2level ? -1 : 1);
+ /*
+ * This can (theoretically) happen if the bookmarks have the same object
+ * and level, but different blkids, if the block sizes are not the same.
+ * There is presently no way to change the indirect block sizes
+ */
+ return (0);
+}
+
+/*
+ * This function checks the following: given that last_block is the place that
+ * our traversal stopped last time, does that guarantee that we've visited
+ * every node under subtree_root? Therefore, we can't just use the raw output
+ * of zbookmark_compare. We have to pass in a modified version of
+ * subtree_root; by incrementing the block id, and then checking whether
+ * last_block is before or equal to that, we can tell whether or not having
+ * visited last_block implies that all of subtree_root's children have been
+ * visited.
+ */
+boolean_t
+zbookmark_subtree_completed(const dnode_phys_t *dnp,
+ const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block)
+{
+ zbookmark_phys_t mod_zb = *subtree_root;
+ mod_zb.zb_blkid++;
+ ASSERT(last_block->zb_level == 0);
+
+ /* The objset_phys_t isn't before anything. */
+ if (dnp == NULL)
return (B_FALSE);
- if (zb2->zb_object == DMU_META_DNODE_OBJECT)
- return (B_FALSE);
- return (zb1nextL0 <= zb2->zb_blkid);
+
+ /*
+ * We pass in 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT) for the
+ * data block size in sectors, because that variable is only used if
+ * the bookmark refers to a block in the meta-dnode. Since we don't
+ * know without examining it what object it refers to, and there's no
+ * harm in passing in this value in other cases, we always pass it in.
+ *
+ * We pass in 0 for the indirect block size shift because zb2 must be
+ * level 0. The indirect block size is only used to calculate the span
+ * of the bookmark, but since the bookmark must be level 0, the span is
+ * always 1, so the math works out.
+ *
+ * If you make changes to how the zbookmark_compare code works, be sure
+ * to make sure that this code still works afterwards.
+ */
+ return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift,
+ 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &mod_zb,
+ last_block) <= 0);
}
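
A standalone illustration of the non-meta-dnode part of the comparison above: compute each bookmark's first covered L0 block id, order by that, and break ties by visiting the higher (parent) level first. The 1024 block pointers per indirect block is an example figure (a 128K indirect block of 128-byte block pointers), not something this sketch derives from dnode fields the way zbookmark_compare() does:

#include <stdint.h>
#include <stdio.h>

typedef struct bmark {
	uint64_t obj;           /* object number */
	int64_t  level;         /* 0 = data block, >0 = indirect */
	uint64_t blkid;         /* block id at that level */
} bmark_t;

#define BLKPTRS_PER_INDIRECT 1024ULL    /* example: 128K / 128-byte blkptrs */

/* First L0 block id covered by this bookmark. */
static uint64_t
l0_equiv(const bmark_t *zb)
{
	uint64_t span = 1;

	for (int64_t l = 0; l < zb->level; l++)
		span *= BLKPTRS_PER_INDIRECT;
	return (zb->blkid * span);
}

/* <0 if a is visited first in a pre-order traversal, >0 if b is. */
static int
bmark_compare(const bmark_t *a, const bmark_t *b)
{
	if (a->obj != b->obj)
		return (a->obj < b->obj ? -1 : 1);
	if (l0_equiv(a) != l0_equiv(b))
		return (l0_equiv(a) < l0_equiv(b) ? -1 : 1);
	if (a->level != b->level)
		return (a->level > b->level ? -1 : 1);  /* parent visited first */
	return (0);
}

int
main(void)
{
	bmark_t indirect = { 5, 1, 2 };     /* covers L0 blkids 2048..3071 */
	bmark_t data = { 5, 0, 2500 };      /* one L0 block inside that range */

	/* The indirect block is reached before the data blocks under it. */
	printf("%d\n", bmark_compare(&indirect, &data));    /* prints -1 */
	return (0);
}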
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_checksum.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,11 +21,14 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
*/
#include <sys/zfs_context.h>
#include <sys/spa.h>
+#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zil.h>
@@ -59,28 +62,105 @@
* checksum function of the appropriate strength. When reading a block,
* we compare the expected checksum against the actual checksum, which we
* compute via the checksum function specified by BP_GET_CHECKSUM(bp).
+ *
+ * SALTED CHECKSUMS
+ *
+ * To enable the use of less secure hash algorithms with dedup, we
+ * introduce the notion of salted checksums (MACs, really). A salted
+ * checksum is fed both a random 256-bit value (the salt) and the data
+ * to be checksummed. This salt is kept secret (stored on the pool, but
+ * never shown to the user). Thus even if an attacker knew of collision
+ * weaknesses in the hash algorithm, they won't be able to mount a known
+ * plaintext attack on the DDT, since the actual hash value cannot be
+ * known ahead of time. How the salt is used is algorithm-specific
+ * (some might simply prefix it to the data block, others might need to
+ * utilize a full-blown HMAC). On disk the salt is stored in a ZAP
+ * object in the MOS (DMU_POOL_CHECKSUM_SALT).
+ *
+ * CONTEXT TEMPLATES
+ *
+ * Some hashing algorithms need to perform a substantial amount of
+ * initialization work (e.g. salted checksums above may need to pre-hash
+ * the salt) before being able to process data. Performing this
+ * redundant work for each block would be wasteful, so we instead allow
+ * a checksum algorithm to do the work once (the first time it's used)
+ * and then keep this pre-initialized context as a template inside the
+ * spa_t (spa_cksum_tmpls). If the zio_checksum_info_t contains
+ * non-NULL ci_tmpl_init and ci_tmpl_free callbacks, they are used to
+ * construct and destruct the pre-initialized checksum context. The
+ * pre-initialized context is then reused during each checksum
+ * invocation and passed to the checksum function.
*/
/*ARGSUSED*/
static void
-zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp)
+zio_checksum_off(const void *buf, uint64_t size,
+ const void *ctx_template, zio_cksum_t *zcp)
{
ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}
zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = {
- {{NULL, NULL}, 0, 0, 0, "inherit"},
- {{NULL, NULL}, 0, 0, 0, "on"},
- {{zio_checksum_off, zio_checksum_off}, 0, 0, 0, "off"},
- {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, 0, "label"},
- {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, 0, "gang_header"},
- {{fletcher_2_native, fletcher_2_byteswap}, 0, 1, 0, "zilog"},
- {{fletcher_2_native, fletcher_2_byteswap}, 0, 0, 0, "fletcher2"},
- {{fletcher_4_native, fletcher_4_byteswap}, 1, 0, 0, "fletcher4"},
- {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 0, 1, "sha256"},
- {{fletcher_4_native, fletcher_4_byteswap}, 0, 1, 0, "zilog2"},
+ {{NULL, NULL}, NULL, NULL, 0, "inherit"},
+ {{NULL, NULL}, NULL, NULL, 0, "on"},
+ {{zio_checksum_off, zio_checksum_off},
+ NULL, NULL, 0, "off"},
+ {{zio_checksum_SHA256, zio_checksum_SHA256},
+ NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED,
+ "label"},
+ {{zio_checksum_SHA256, zio_checksum_SHA256},
+ NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED,
+ "gang_header"},
+ {{fletcher_2_native, fletcher_2_byteswap},
+ NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog"},
+ {{fletcher_2_native, fletcher_2_byteswap},
+ NULL, NULL, 0, "fletcher2"},
+ {{fletcher_4_native, fletcher_4_byteswap},
+ NULL, NULL, ZCHECKSUM_FLAG_METADATA, "fletcher4"},
+ {{zio_checksum_SHA256, zio_checksum_SHA256},
+ NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
+ ZCHECKSUM_FLAG_NOPWRITE, "sha256"},
+ {{fletcher_4_native, fletcher_4_byteswap},
+ NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog2"},
+ {{zio_checksum_off, zio_checksum_off},
+ NULL, NULL, 0, "noparity"},
+#ifdef illumos
+ {{zio_checksum_SHA512_native, zio_checksum_SHA512_byteswap},
+ NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
+ ZCHECKSUM_FLAG_NOPWRITE, "sha512"},
+ {{zio_checksum_skein_native, zio_checksum_skein_byteswap},
+ zio_checksum_skein_tmpl_init, zio_checksum_skein_tmpl_free,
+ ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP |
+ ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"},
+ {{zio_checksum_edonr_native, zio_checksum_edonr_byteswap},
+ zio_checksum_edonr_tmpl_init, zio_checksum_edonr_tmpl_free,
+ ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED |
+ ZCHECKSUM_FLAG_NOPWRITE, "edonr"},
+#endif
};
+/*
+ * The flag corresponding to the "verify" in dedup=[checksum,]verify
+ * must be cleared first, so callers should use ZIO_CHECKSUM_MASK.
+ */
+spa_feature_t
+zio_checksum_to_feature(enum zio_checksum cksum)
+{
+#ifdef illumos
+ VERIFY((cksum & ~ZIO_CHECKSUM_MASK) == 0);
+
+ switch (cksum) {
+ case ZIO_CHECKSUM_SHA512:
+ return (SPA_FEATURE_SHA512);
+ case ZIO_CHECKSUM_SKEIN:
+ return (SPA_FEATURE_SKEIN);
+ case ZIO_CHECKSUM_EDONR:
+ return (SPA_FEATURE_EDONR);
+ }
+#endif
+ return (SPA_FEATURE_NONE);
+}
+
enum zio_checksum
zio_checksum_select(enum zio_checksum child, enum zio_checksum parent)
{
@@ -114,7 +194,8 @@
if (child == (ZIO_CHECKSUM_ON | ZIO_CHECKSUM_VERIFY))
return (spa_dedup_checksum(spa) | ZIO_CHECKSUM_VERIFY);
- ASSERT(zio_checksum_table[child & ZIO_CHECKSUM_MASK].ci_dedup ||
+ ASSERT((zio_checksum_table[child & ZIO_CHECKSUM_MASK].ci_flags &
+ ZCHECKSUM_FLAG_DEDUP) ||
(child & ZIO_CHECKSUM_VERIFY) || child == ZIO_CHECKSUM_OFF);
return (child);
@@ -147,21 +228,48 @@
}
/*
+ * Calls the template init function of a checksum which supports context
+ * templates and installs the template into the spa_t.
+ */
+static void
+zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa)
+{
+ zio_checksum_info_t *ci = &zio_checksum_table[checksum];
+
+ if (ci->ci_tmpl_init == NULL)
+ return;
+ if (spa->spa_cksum_tmpls[checksum] != NULL)
+ return;
+
+ VERIFY(ci->ci_tmpl_free != NULL);
+ mutex_enter(&spa->spa_cksum_tmpls_lock);
+ if (spa->spa_cksum_tmpls[checksum] == NULL) {
+ spa->spa_cksum_tmpls[checksum] =
+ ci->ci_tmpl_init(&spa->spa_cksum_salt);
+ VERIFY(spa->spa_cksum_tmpls[checksum] != NULL);
+ }
+ mutex_exit(&spa->spa_cksum_tmpls_lock);
+}
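
zio_checksum_template_init() above is an initialize-once pattern: an unlocked fast-path check followed by a re-check under the lock before paying the construction cost. A generic userland sketch of the same pattern with pthreads and a made-up template constructor; note that the unlocked read leans on the same single-pointer-store assumption the kernel code makes:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct tmpl { int initialized; } tmpl_t;

static tmpl_t *cached_tmpl;     /* one slot; the real code keeps one per algorithm */
static pthread_mutex_t tmpl_lock = PTHREAD_MUTEX_INITIALIZER;

static tmpl_t *
tmpl_construct(void)
{
	tmpl_t *t = malloc(sizeof (*t));        /* the expensive one-time setup */

	t->initialized = 1;
	return (t);
}

static tmpl_t *
tmpl_get(void)
{
	/* Fast path: already built, no lock taken. */
	if (cached_tmpl != NULL)
		return (cached_tmpl);

	pthread_mutex_lock(&tmpl_lock);
	/* Re-check: another thread may have built it while we waited. */
	if (cached_tmpl == NULL)
		cached_tmpl = tmpl_construct();
	pthread_mutex_unlock(&tmpl_lock);
	return (cached_tmpl);
}

int
main(void)
{
	printf("template %s\n", tmpl_get()->initialized ? "ready" : "missing");
	printf("second call reuses it: %p\n", (void *)tmpl_get());
	free(cached_tmpl);
	return (0);
}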
+
+/*
* Generate the checksum.
*/
void
zio_checksum_compute(zio_t *zio, enum zio_checksum checksum,
- void *data, uint64_t size)
+ void *data, uint64_t size)
{
blkptr_t *bp = zio->io_bp;
uint64_t offset = zio->io_offset;
zio_checksum_info_t *ci = &zio_checksum_table[checksum];
zio_cksum_t cksum;
+ spa_t *spa = zio->io_spa;
ASSERT((uint_t)checksum < ZIO_CHECKSUM_FUNCTIONS);
ASSERT(ci->ci_func[0] != NULL);
- if (ci->ci_eck) {
+ zio_checksum_template_init(checksum, spa);
+
+ if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
zio_eck_t *eck;
if (checksum == ZIO_CHECKSUM_ZILOG2) {
@@ -180,33 +288,31 @@
else
bp->blk_cksum = eck->zec_cksum;
eck->zec_magic = ZEC_MAGIC;
- ci->ci_func[0](data, size, &cksum);
+ ci->ci_func[0](data, size, spa->spa_cksum_tmpls[checksum],
+ &cksum);
eck->zec_cksum = cksum;
} else {
- ci->ci_func[0](data, size, &bp->blk_cksum);
+ ci->ci_func[0](data, size, spa->spa_cksum_tmpls[checksum],
+ &bp->blk_cksum);
}
}
int
-zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info)
+zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum,
+ void *data, uint64_t size, uint64_t offset, zio_bad_cksum_t *info)
{
- blkptr_t *bp = zio->io_bp;
- uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum :
- (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
+ zio_checksum_info_t *ci = &zio_checksum_table[checksum];
+ zio_cksum_t actual_cksum, expected_cksum;
int byteswap;
- int error;
- uint64_t size = (bp == NULL ? zio->io_size :
- (BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp)));
- uint64_t offset = zio->io_offset;
- void *data = zio->io_data;
- zio_checksum_info_t *ci = &zio_checksum_table[checksum];
- zio_cksum_t actual_cksum, expected_cksum, verifier;
if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL)
return (SET_ERROR(EINVAL));
- if (ci->ci_eck) {
+ zio_checksum_template_init(checksum, spa);
+
+ if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) {
zio_eck_t *eck;
+ zio_cksum_t verifier;
if (checksum == ZIO_CHECKSUM_ZILOG2) {
zil_chain_t *zilc = data;
@@ -242,35 +348,76 @@
expected_cksum = eck->zec_cksum;
eck->zec_cksum = verifier;
- ci->ci_func[byteswap](data, size, &actual_cksum);
+ ci->ci_func[byteswap](data, size,
+ spa->spa_cksum_tmpls[checksum], &actual_cksum);
eck->zec_cksum = expected_cksum;
- if (byteswap)
+ if (byteswap) {
byteswap_uint64_array(&expected_cksum,
sizeof (zio_cksum_t));
+ }
} else {
- ASSERT(!BP_IS_GANG(bp));
byteswap = BP_SHOULD_BYTESWAP(bp);
expected_cksum = bp->blk_cksum;
- ci->ci_func[byteswap](data, size, &actual_cksum);
+ ci->ci_func[byteswap](data, size,
+ spa->spa_cksum_tmpls[checksum], &actual_cksum);
}
- info->zbc_expected = expected_cksum;
- info->zbc_actual = actual_cksum;
- info->zbc_checksum_name = ci->ci_name;
- info->zbc_byteswapped = byteswap;
- info->zbc_injected = 0;
- info->zbc_has_cksum = 1;
+ if (info != NULL) {
+ info->zbc_expected = expected_cksum;
+ info->zbc_actual = actual_cksum;
+ info->zbc_checksum_name = ci->ci_name;
+ info->zbc_byteswapped = byteswap;
+ info->zbc_injected = 0;
+ info->zbc_has_cksum = 1;
+ }
if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum))
return (SET_ERROR(ECKSUM));
- if (zio_injection_enabled && !zio->io_error &&
+ return (0);
+}
+
+int
+zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info)
+{
+ blkptr_t *bp = zio->io_bp;
+ uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum :
+ (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
+ int error;
+ uint64_t size = (bp == NULL ? zio->io_size :
+ (BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp)));
+ uint64_t offset = zio->io_offset;
+ void *data = zio->io_data;
+ spa_t *spa = zio->io_spa;
+
+ error = zio_checksum_error_impl(spa, bp, checksum, data, size,
+ offset, info);
+ if (error != 0 && zio_injection_enabled && !zio->io_error &&
(error = zio_handle_fault_injection(zio, ECKSUM)) != 0) {
info->zbc_injected = 1;
return (error);
}
+ return (error);
+}
- return (0);
+/*
+ * Called by a spa_t that's about to be deallocated. This steps through
+ * all of the checksum context templates and deallocates any that were
+ * initialized using the algorithm-specific template init function.
+ */
+void
+zio_checksum_templates_free(spa_t *spa)
+{
+ for (enum zio_checksum checksum = 0;
+ checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) {
+ if (spa->spa_cksum_tmpls[checksum] != NULL) {
+ zio_checksum_info_t *ci = &zio_checksum_table[checksum];
+
+ VERIFY(ci->ci_tmpl_free != NULL);
+ ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]);
+ spa->spa_cksum_tmpls[checksum] = NULL;
+ }
+ }
}
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -34,10 +34,31 @@
#include <sys/zfs_context.h>
#include <sys/compress.h>
+#include <sys/kstat.h>
#include <sys/spa.h>
+#include <sys/zfeature.h>
#include <sys/zio.h>
#include <sys/zio_compress.h>
+typedef struct zcomp_stats {
+ kstat_named_t zcompstat_attempts;
+ kstat_named_t zcompstat_empty;
+ kstat_named_t zcompstat_skipped_insufficient_gain;
+} zcomp_stats_t;
+
+static zcomp_stats_t zcomp_stats = {
+ { "attempts", KSTAT_DATA_UINT64 },
+ { "empty", KSTAT_DATA_UINT64 },
+ { "skipped_insufficient_gain", KSTAT_DATA_UINT64 }
+};
+
+#define ZCOMPSTAT_INCR(stat, val) \
+ atomic_add_64(&zcomp_stats.stat.value.ui64, (val));
+
+#define ZCOMPSTAT_BUMP(stat) ZCOMPSTAT_INCR(stat, 1);
+
+kstat_t *zcomp_ksp;
+
/*
* Compression vectors.
*/
@@ -62,19 +83,27 @@
};
enum zio_compress
-zio_compress_select(enum zio_compress child, enum zio_compress parent)
+zio_compress_select(spa_t *spa, enum zio_compress child,
+ enum zio_compress parent)
{
+ enum zio_compress result;
+
ASSERT(child < ZIO_COMPRESS_FUNCTIONS);
ASSERT(parent < ZIO_COMPRESS_FUNCTIONS);
- ASSERT(parent != ZIO_COMPRESS_INHERIT && parent != ZIO_COMPRESS_ON);
+ ASSERT(parent != ZIO_COMPRESS_INHERIT);
- if (child == ZIO_COMPRESS_INHERIT)
- return (parent);
+ result = child;
+ if (result == ZIO_COMPRESS_INHERIT)
+ result = parent;
- if (child == ZIO_COMPRESS_ON)
- return (ZIO_COMPRESS_ON_VALUE);
+ if (result == ZIO_COMPRESS_ON) {
+ if (spa_feature_is_active(spa, SPA_FEATURE_LZ4_COMPRESS))
+ result = ZIO_COMPRESS_LZ4_ON_VALUE;
+ else
+ result = ZIO_COMPRESS_LEGACY_ON_VALUE;
+ }
- return (child);
+ return (result);
}
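
Illustration only (not part of the patch): the effective-compression decision zio_compress_select() now makes, with stand-in enum values for the real ZIO_COMPRESS_* constants. With this change "compression=on" resolves to LZ4 once the lz4_compress feature is active on the pool and to the legacy default otherwise.

#include <stdio.h>

enum comp { INHERIT, ON, OFF, LEGACY_ON, LZ4_ON };      /* stand-ins */

static enum comp
select_compress(int lz4_feature_active, enum comp child, enum comp parent)
{
        enum comp result = (child == INHERIT) ? parent : child;

        /* "on" maps to LZ4 only once the pool feature is active. */
        if (result == ON)
                result = lz4_feature_active ? LZ4_ON : LEGACY_ON;
        return (result);
}

int
main(void)
{
        /* A dataset inheriting "on" from its parent on an LZ4-enabled pool. */
        printf("%d\n", select_compress(1, INHERIT, ON));        /* 4 == LZ4_ON */
        return (0);
}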
size_t
@@ -81,12 +110,14 @@
zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len)
{
uint64_t *word, *word_end;
- size_t c_len, d_len, r_len;
+ size_t c_len, d_len;
zio_compress_info_t *ci = &zio_compress_table[c];
ASSERT((uint_t)c < ZIO_COMPRESS_FUNCTIONS);
ASSERT((uint_t)c == ZIO_COMPRESS_EMPTY || ci->ci_compress != NULL);
+ ZCOMPSTAT_BUMP(zcompstat_attempts);
+
/*
* If the data is all zeroes, we don't even need to allocate
* a block for it. We indicate this by returning zero size.
@@ -96,35 +127,24 @@
if (*word != 0)
break;
- if (word == word_end)
- return (0);
+ if (word == word_end) {
+ ZCOMPSTAT_BUMP(zcompstat_empty);
+ return (0);
+ }
if (c == ZIO_COMPRESS_EMPTY)
return (s_len);
/* Compress at least 12.5% */
- d_len = P2ALIGN(s_len - (s_len >> 3), (size_t)SPA_MINBLOCKSIZE);
- if (d_len == 0)
- return (s_len);
-
+ d_len = s_len - (s_len >> 3);
c_len = ci->ci_compress(src, dst, s_len, d_len, ci->ci_level);
- if (c_len > d_len)
+ if (c_len > d_len) {
+ ZCOMPSTAT_BUMP(zcompstat_skipped_insufficient_gain);
return (s_len);
-
- /*
- * Cool. We compressed at least as much as we were hoping to.
- * For both security and repeatability, pad out the last sector.
- */
- r_len = P2ROUNDUP(c_len, (size_t)SPA_MINBLOCKSIZE);
- if (r_len > c_len) {
- bzero((char *)dst + c_len, r_len - c_len);
- c_len = r_len;
}
ASSERT3U(c_len, <=, d_len);
- ASSERT(P2PHASE(c_len, (size_t)SPA_MINBLOCKSIZE) == 0);
-
return (c_len);
}
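
The SPA_MINBLOCKSIZE rounding and the zero-padding of the compressed buffer are gone; the only remaining gate is the 12.5% minimum gain, d_len = s_len - (s_len >> 3). A quick worked example (not part of the patch):

#include <stdio.h>

int
main(void)
{
        size_t s_len = 131072;                  /* 128 KiB source block */
        size_t d_len = s_len - (s_len >> 3);    /* must compress below this */

        printf("s_len=%zu d_len=%zu (%.1f%% of source)\n",
            s_len, d_len, 100.0 * d_len / s_len);
        /* Prints: s_len=131072 d_len=114688 (87.5% of source) */
        return (0);
}

So for a 128 KiB source block the compressed output must fit in 114688 bytes (112 KiB); otherwise zio_compress_data() returns s_len and the block is stored uncompressed.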
@@ -139,3 +159,26 @@
return (ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level));
}
+
+void
+zio_compress_init(void)
+{
+
+ zcomp_ksp = kstat_create("zfs", 0, "zcompstats", "misc",
+ KSTAT_TYPE_NAMED, sizeof (zcomp_stats) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+
+ if (zcomp_ksp != NULL) {
+ zcomp_ksp->ks_data = &zcomp_stats;
+ kstat_install(zcomp_ksp);
+ }
+}
+
+void
+zio_compress_fini(void)
+{
+ if (zcomp_ksp != NULL) {
+ kstat_delete(zcomp_ksp);
+ zcomp_ksp = NULL;
+ }
+}
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,7 +21,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
*/
/*
@@ -50,15 +50,53 @@
uint32_t zio_injection_enabled;
+/*
+ * Data describing each zinject handler registered on the system; it also
+ * contains the list node linking the handler into the global zinject
+ * handler list.
+ */
typedef struct inject_handler {
int zi_id;
spa_t *zi_spa;
zinject_record_t zi_record;
+ uint64_t *zi_lanes;
+ int zi_next_lane;
list_node_t zi_link;
} inject_handler_t;
+/*
+ * List of all zinject handlers registered on the system, protected by
+ * the inject_lock defined below.
+ */
static list_t inject_handlers;
+
+/*
+ * This protects insertion into, and traversal of, the inject handler
+ * list defined above; as well as the inject_delay_count. Any time a
+ * handler is inserted or removed from the list, this lock should be
+ * taken as a RW_WRITER; and any time traversal is done over the list
+ * (without modification to it) this lock should be taken as a RW_READER.
+ */
static krwlock_t inject_lock;
+
+/*
+ * This holds the number of zinject delay handlers that have been
+ * registered on the system. It is protected by the inject_lock defined
+ * above. Thus modifications to this count must be a RW_WRITER of the
+ * inject_lock, and reads of this count must be (at least) a RW_READER
+ * of the lock.
+ */
+static int inject_delay_count = 0;
+
+/*
+ * This lock is used only in zio_handle_io_delay(), refer to the comment
+ * in that function for more details.
+ */
+static kmutex_t inject_delay_mtx;
+
+/*
+ * Used to assign unique identifying numbers to each new zinject handler.
+ */
static int inject_next_id = 1;
/*
@@ -65,7 +103,7 @@
* Returns true if the given record matches the I/O in progress.
*/
static boolean_t
-zio_match_handler(zbookmark_t *zb, uint64_t type,
+zio_match_handler(zbookmark_phys_t *zb, uint64_t type,
zinject_record_t *record, int error)
{
/*
@@ -361,32 +399,164 @@
rw_exit(&inject_lock);
}
-uint64_t
+hrtime_t
zio_handle_io_delay(zio_t *zio)
{
vdev_t *vd = zio->io_vd;
- inject_handler_t *handler;
- uint64_t seconds = 0;
+ inject_handler_t *min_handler = NULL;
+ hrtime_t min_target = 0;
- if (zio_injection_enabled == 0)
+ rw_enter(&inject_lock, RW_READER);
+
+ /*
+ * inject_delay_count is a subset of zio_injection_enabled that
+ * is only incremented for delay handlers. These checks are
+ * mainly added to remind the reader why we're not explicitly
+ * checking zio_injection_enabled like the other functions.
+ */
+ IMPLY(inject_delay_count > 0, zio_injection_enabled > 0);
+ IMPLY(zio_injection_enabled == 0, inject_delay_count == 0);
+
+ /*
+ * If there aren't any inject delay handlers registered, then we
+ * can short circuit and simply return 0 here. A value of zero
+ * informs zio_delay_interrupt() that this request should not be
+ * delayed. This short circuit keeps us from acquiring the
+ * inject_delay_mutex unnecessarily.
+ */
+ if (inject_delay_count == 0) {
+ rw_exit(&inject_lock);
return (0);
+ }
- rw_enter(&inject_lock, RW_READER);
+ /*
+ * Each inject handler has a number of "lanes" associated with
+ * it. Each lane is able to handle requests independently of one
+ * another, and at a latency defined by the inject handler
+ * record's zi_timer field. Thus if a handler is configured with
+ * a single lane with a 10ms latency, it will delay requests
+ * such that only a single request is completed every 10ms. So,
+ * if more than one request is attempted per each 10ms interval,
+ * the average latency of the requests will be greater than
+ * 10ms; but if only a single request is submitted each 10ms
+ * interval the average latency will be 10ms.
+ *
+ * We need to acquire this mutex to prevent multiple concurrent
+ * threads being assigned to the same lane of a given inject
+ * handler. The mutex allows us to perform the following two
+ * operations atomically:
+ *
+ * 1. determine the minimum handler and minimum target
+ * value of all the possible handlers
+ * 2. update that minimum handler's lane array
+ *
+ * Without atomicity, two (or more) threads could pick the same
+ * lane in step (1), and then conflict with each other in step
+ * (2). This could allow a single lane handler to process
+ * multiple requests simultaneously, which shouldn't be possible.
+ */
+ mutex_enter(&inject_delay_mtx);
- for (handler = list_head(&inject_handlers); handler != NULL;
- handler = list_next(&inject_handlers, handler)) {
-
+ for (inject_handler_t *handler = list_head(&inject_handlers);
+ handler != NULL; handler = list_next(&inject_handlers, handler)) {
if (handler->zi_record.zi_cmd != ZINJECT_DELAY_IO)
continue;
- if (vd->vdev_guid == handler->zi_record.zi_guid) {
- seconds = handler->zi_record.zi_timer;
- break;
+ if (vd->vdev_guid != handler->zi_record.zi_guid)
+ continue;
+
+ /*
+ * Defensive; should never happen as the array allocation
+ * occurs prior to inserting this handler on the list.
+ */
+ ASSERT3P(handler->zi_lanes, !=, NULL);
+
+ /*
+ * This should never happen, the zinject command should
+ * prevent a user from setting an IO delay with zero lanes.
+ */
+ ASSERT3U(handler->zi_record.zi_nlanes, !=, 0);
+
+ ASSERT3U(handler->zi_record.zi_nlanes, >,
+ handler->zi_next_lane);
+
+ /*
+ * We want to issue this IO to the lane that will become
+ * idle the soonest, so we compare the soonest this
+ * specific handler can complete the IO with all other
+ * handlers, to find the lowest value of all possible
+ * lanes. We then use this lane to submit the request.
+ *
+ * Since each handler has a constant value for its
+ * delay, we can just use the "next" lane for that
+ * handler; as it will always be the lane with the
+ * lowest value for that particular handler (i.e. the
+ * lane that will become idle the soonest). This saves a
+ * scan of each handler's lanes array.
+ *
+ * There are two cases to consider when determining when
+ * this specific IO request should complete. If this
+ * lane is idle, we want to "submit" the request now so
+ * it will complete after zi_timer milliseconds. Thus,
+ * we set the target to now + zi_timer.
+ *
+ * If the lane is busy, we want this request to complete
+ * zi_timer milliseconds after the lane becomes idle.
+ * Since the 'zi_lanes' array holds the time at which
+ * each lane will become idle, we use that value to
+ * determine when this request should complete.
+ */
+ hrtime_t idle = handler->zi_record.zi_timer + gethrtime();
+ hrtime_t busy = handler->zi_record.zi_timer +
+ handler->zi_lanes[handler->zi_next_lane];
+ hrtime_t target = MAX(idle, busy);
+
+ if (min_handler == NULL) {
+ min_handler = handler;
+ min_target = target;
+ continue;
}
+ ASSERT3P(min_handler, !=, NULL);
+ ASSERT3U(min_target, !=, 0);
+
+ /*
+ * We don't yet increment the "next lane" variable since
+ * we still might find a lower value lane in another
+ * handler during any remaining iterations. Once we're
+ * sure we've selected the absolute minimum, we'll claim
+ * the lane and increment the handler's "next lane"
+ * field below.
+ */
+
+ if (target < min_target) {
+ min_handler = handler;
+ min_target = target;
+ }
}
+
+ /*
+ * 'min_handler' will be NULL if no IO delays are registered for
+ * this vdev, otherwise it will point to the handler containing
+ * the lane that will become idle the soonest.
+ */
+ if (min_handler != NULL) {
+ ASSERT3U(min_target, !=, 0);
+ min_handler->zi_lanes[min_handler->zi_next_lane] = min_target;
+
+ /*
+ * If we've used all possible lanes for this handler,
+ * loop back and start using the first lane again;
+ * otherwise, just increment the lane index.
+ */
+ min_handler->zi_next_lane = (min_handler->zi_next_lane + 1) %
+ min_handler->zi_record.zi_nlanes;
+ }
+
+ mutex_exit(&inject_delay_mtx);
rw_exit(&inject_lock);
- return (seconds);
+
+ return (min_target);
}
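
For readers following the long comment above, a tiny userland model (not the committed code; the constants are made up) of the lane bookkeeping: each handler keeps one idle timestamp per lane, an I/O is assigned the lane that goes idle soonest, its completion target is that idle time plus zi_timer, and the next-lane index advances round-robin.

#include <stdio.h>
#include <stdint.h>

#define NLANES  2
#define DELAY   10      /* plays the role of zi_timer, arbitrary units */

static uint64_t lanes[NLANES];  /* time at which each lane goes idle */
static int next_lane;

static uint64_t
delay_target(uint64_t now)
{
        uint64_t idle = now + DELAY;                    /* lane is free now */
        uint64_t busy = lanes[next_lane] + DELAY;       /* queue behind it */
        uint64_t target = (idle > busy) ? idle : busy;

        lanes[next_lane] = target;                      /* claim the lane */
        next_lane = (next_lane + 1) % NLANES;           /* round-robin */
        return (target);
}

int
main(void)
{
        /* Four back-to-back requests at t=0, two lanes, 10-unit latency. */
        for (int i = 0; i < 4; i++)
                printf("request %d completes at t=%llu\n", i,
                    (unsigned long long)delay_target(0));
        /* Lanes alternate: completions at t=10, 10, 20, 20. */
        return (0);
}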
/*
@@ -410,6 +580,24 @@
if ((error = spa_reset(name)) != 0)
return (error);
+ if (record->zi_cmd == ZINJECT_DELAY_IO) {
+ /*
+ * A value of zero for the number of lanes or for the
+ * delay time doesn't make sense.
+ */
+ if (record->zi_timer == 0 || record->zi_nlanes == 0)
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * The number of lanes is directly mapped to the size of
+ * an array used by the handler. Thus, to ensure the
+ * user doesn't trigger an allocation that's "too large"
+ * we cap the number of lanes here.
+ */
+ if (record->zi_nlanes >= UINT16_MAX)
+ return (SET_ERROR(EINVAL));
+ }
+
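
Illustration only (not in the patch): the bounds the ioctl path now enforces before a ZINJECT_DELAY_IO handler is accepted, shown as a standalone check that keeps the real zi_timer/zi_nlanes semantics but uses stand-in plumbing.

#include <stdio.h>
#include <stdint.h>
#include <errno.h>

static int
validate_delay_record(uint64_t zi_timer, uint32_t zi_nlanes)
{
        /* Zero delay or zero lanes is meaningless. */
        if (zi_timer == 0 || zi_nlanes == 0)
                return (EINVAL);
        /* The lane count sizes a kernel allocation, so cap it. */
        if (zi_nlanes >= UINT16_MAX)
                return (EINVAL);
        return (0);
}

int
main(void)
{
        printf("%d %d %d\n",
            validate_delay_record(0, 1),        /* EINVAL */
            validate_delay_record(10, 0),       /* EINVAL */
            validate_delay_record(10, 4));      /* 0 */
        return (0);
}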
if (!(flags & ZINJECT_NULL)) {
/*
* spa_inject_ref() will add an injection reference, which will
@@ -421,13 +609,36 @@
handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP);
+ handler->zi_spa = spa;
+ handler->zi_record = *record;
+
+ if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
+ handler->zi_lanes = kmem_zalloc(
+ sizeof (*handler->zi_lanes) *
+ handler->zi_record.zi_nlanes, KM_SLEEP);
+ handler->zi_next_lane = 0;
+ } else {
+ handler->zi_lanes = NULL;
+ handler->zi_next_lane = 0;
+ }
+
rw_enter(&inject_lock, RW_WRITER);
+ /*
+ * We can't move this increment into the conditional
+ * above because we need to hold the RW_WRITER lock of
+ * inject_lock, and we don't want to hold that while
+ * allocating the handler's zi_lanes array.
+ */
+ if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
+ ASSERT3S(inject_delay_count, >=, 0);
+ inject_delay_count++;
+ ASSERT3S(inject_delay_count, >, 0);
+ }
+
*id = handler->zi_id = inject_next_id++;
- handler->zi_spa = spa;
- handler->zi_record = *record;
list_insert_tail(&inject_handlers, handler);
- atomic_add_32(&zio_injection_enabled, 1);
+ atomic_inc_32(&zio_injection_enabled);
rw_exit(&inject_lock);
}
@@ -439,7 +650,11 @@
* fault injection isn't a performance critical path.
*/
if (flags & ZINJECT_FLUSH_ARC)
- arc_flush(NULL);
+ /*
+ * We must use FALSE to ensure arc_flush returns, since
+ * we're not preventing concurrent ARC insertions.
+ */
+ arc_flush(NULL, FALSE);
return (0);
}
@@ -499,12 +714,26 @@
return (SET_ERROR(ENOENT));
}
+ if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
+ ASSERT3S(inject_delay_count, >, 0);
+ inject_delay_count--;
+ ASSERT3S(inject_delay_count, >=, 0);
+ }
+
list_remove(&inject_handlers, handler);
rw_exit(&inject_lock);
+ if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
+ ASSERT3P(handler->zi_lanes, !=, NULL);
+ kmem_free(handler->zi_lanes, sizeof (*handler->zi_lanes) *
+ handler->zi_record.zi_nlanes);
+ } else {
+ ASSERT3P(handler->zi_lanes, ==, NULL);
+ }
+
spa_inject_delref(handler->zi_spa);
kmem_free(handler, sizeof (inject_handler_t));
- atomic_add_32(&zio_injection_enabled, -1);
+ atomic_dec_32(&zio_injection_enabled);
return (0);
}
@@ -513,6 +742,7 @@
zio_inject_init(void)
{
rw_init(&inject_lock, NULL, RW_DEFAULT, NULL);
+ mutex_init(&inject_delay_mtx, NULL, MUTEX_DEFAULT, NULL);
list_create(&inject_handlers, sizeof (inject_handler_t),
offsetof(inject_handler_t, zi_link));
}
@@ -521,5 +751,6 @@
zio_inject_fini(void)
{
list_destroy(&inject_handlers);
+ mutex_destroy(&inject_delay_mtx);
rw_destroy(&inject_lock);
}
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zrlock.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zrlock.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zrlock.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -21,6 +21,8 @@
*/
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, 2015 by Delphix. All rights reserved.
+ * Copyright 2016 The MathWorks, Inc. All rights reserved.
*/
/*
@@ -43,7 +45,7 @@
* A ZRL can be locked only while there are zero references, so ZRL_LOCKED is
* treated as zero references.
*/
-#define ZRL_LOCKED ((uint32_t)-1)
+#define ZRL_LOCKED -1
#define ZRL_DESTROYED -2
void
@@ -61,7 +63,7 @@
void
zrl_destroy(zrlock_t *zrl)
{
- ASSERT(zrl->zr_refcount == 0);
+ ASSERT0(zrl->zr_refcount);
mutex_destroy(&zrl->zr_mtx);
zrl->zr_refcount = ZRL_DESTROYED;
@@ -69,43 +71,34 @@
}
void
+zrl_add_impl(zrlock_t *zrl, const char *zc)
+{
+ for (;;) {
+ uint32_t n = (uint32_t)zrl->zr_refcount;
+ while (n != ZRL_LOCKED) {
+ uint32_t cas = atomic_cas_32(
+ (uint32_t *)&zrl->zr_refcount, n, n + 1);
+ if (cas == n) {
+ ASSERT3S((int32_t)n, >=, 0);
#ifdef ZFS_DEBUG
-zrl_add_debug(zrlock_t *zrl, const char *zc)
-#else
-zrl_add(zrlock_t *zrl)
+ if (zrl->zr_owner == curthread) {
+ DTRACE_PROBE2(zrlock__reentry,
+ zrlock_t *, zrl, uint32_t, n);
+ }
+ zrl->zr_owner = curthread;
+ zrl->zr_caller = zc;
#endif
-{
- uint32_t n = (uint32_t)zrl->zr_refcount;
-
- while (n != ZRL_LOCKED) {
- uint32_t cas = atomic_cas_32(
- (uint32_t *)&zrl->zr_refcount, n, n + 1);
- if (cas == n) {
- ASSERT((int32_t)n >= 0);
-#ifdef ZFS_DEBUG
- if (zrl->zr_owner == curthread) {
- DTRACE_PROBE2(zrlock__reentry,
- zrlock_t *, zrl, uint32_t, n);
+ return;
}
- zrl->zr_owner = curthread;
- zrl->zr_caller = zc;
-#endif
- return;
+ n = cas;
}
- n = cas;
- }
- mutex_enter(&zrl->zr_mtx);
- while (zrl->zr_refcount == ZRL_LOCKED) {
- cv_wait(&zrl->zr_cv, &zrl->zr_mtx);
+ mutex_enter(&zrl->zr_mtx);
+ while (zrl->zr_refcount == ZRL_LOCKED) {
+ cv_wait(&zrl->zr_cv, &zrl->zr_mtx);
+ }
+ mutex_exit(&zrl->zr_mtx);
}
- ASSERT(zrl->zr_refcount >= 0);
- zrl->zr_refcount++;
-#ifdef ZFS_DEBUG
- zrl->zr_owner = curthread;
- zrl->zr_caller = zc;
-#endif
- mutex_exit(&zrl->zr_mtx);
}
void
@@ -113,8 +106,6 @@
{
uint32_t n;
- n = atomic_dec_32_nv((uint32_t *)&zrl->zr_refcount);
- ASSERT((int32_t)n >= 0);
#ifdef ZFS_DEBUG
if (zrl->zr_owner == curthread) {
zrl->zr_owner = NULL;
@@ -121,6 +112,8 @@
zrl->zr_caller = NULL;
}
#endif
+ n = atomic_dec_32_nv((uint32_t *)&zrl->zr_refcount);
+ ASSERT3S((int32_t)n, >=, 0);
}
int
@@ -133,7 +126,7 @@
(uint32_t *)&zrl->zr_refcount, 0, ZRL_LOCKED);
if (cas == 0) {
#ifdef ZFS_DEBUG
- ASSERT(zrl->zr_owner == NULL);
+ ASSERT3P(zrl->zr_owner, ==, NULL);
zrl->zr_owner = curthread;
#endif
return (1);
@@ -140,7 +133,7 @@
}
}
- ASSERT((int32_t)n > ZRL_DESTROYED);
+ ASSERT3S((int32_t)n, >, ZRL_DESTROYED);
return (0);
}
@@ -148,11 +141,11 @@
void
zrl_exit(zrlock_t *zrl)
{
- ASSERT(zrl->zr_refcount == ZRL_LOCKED);
+ ASSERT3S(zrl->zr_refcount, ==, ZRL_LOCKED);
mutex_enter(&zrl->zr_mtx);
#ifdef ZFS_DEBUG
- ASSERT(zrl->zr_owner == curthread);
+ ASSERT3P(zrl->zr_owner, ==, curthread);
zrl->zr_owner = NULL;
membar_producer(); /* make sure the owner store happens first */
#endif
@@ -164,7 +157,7 @@
int
zrl_refcount(zrlock_t *zrl)
{
- ASSERT(zrl->zr_refcount > ZRL_DESTROYED);
+ ASSERT3S(zrl->zr_refcount, >, ZRL_DESTROYED);
int n = (int)zrl->zr_refcount;
return (n <= 0 ? 0 : n);
@@ -173,7 +166,7 @@
int
zrl_is_zero(zrlock_t *zrl)
{
- ASSERT(zrl->zr_refcount > ZRL_DESTROYED);
+ ASSERT3S(zrl->zr_refcount, >, ZRL_DESTROYED);
return (zrl->zr_refcount <= 0);
}
@@ -181,7 +174,7 @@
int
zrl_is_locked(zrlock_t *zrl)
{
- ASSERT(zrl->zr_refcount > ZRL_DESTROYED);
+ ASSERT3S(zrl->zr_refcount, >, ZRL_DESTROYED);
return (zrl->zr_refcount == ZRL_LOCKED);
}
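
A simplified single-file model (not the kernel code) of the reworked zrl_add_impl() control flow above: try to CAS-increment the reference count, and if a writer holds it at ZRL_LOCKED, wait it out and retry the whole loop instead of bumping the count under the mutex as the old code did.

#include <stdatomic.h>
#include <stdio.h>

#define LOCKED  (-1)            /* plays the role of ZRL_LOCKED */

static _Atomic int refcount;

static void
ref_add(void)
{
        for (;;) {
                int n = atomic_load(&refcount);

                while (n != LOCKED) {
                        /* Success: the count moved from n to n + 1. */
                        if (atomic_compare_exchange_weak(&refcount, &n, n + 1))
                                return;
                        /* Failure updated n; retry unless a writer locked it. */
                }
                /* Locked: the kernel code cv_wait()s here, then retries. */
        }
}

int
main(void)
{
        ref_add();
        ref_add();
        printf("refcount=%d\n", atomic_load(&refcount));        /* 2 */
        return (0);
}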
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zvol.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -24,10 +24,15 @@
*
* Copyright (c) 2006-2010 Pawel Jakub Dawidek <pjd at FreeBSD.org>
* All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ *
+ * Portions Copyright 2010 Robert Milkowski
+ *
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
*/
-/* Portions Copyright 2010 Robert Milkowski */
/* Portions Copyright 2011 Martin Matuska <mm at FreeBSD.org> */
/*
@@ -61,7 +66,9 @@
#include <sys/stat.h>
#include <sys/zap.h>
#include <sys/spa.h>
+#include <sys/spa_impl.h>
#include <sys/zio.h>
+#include <sys/disk.h>
#include <sys/dmu_traverse.h>
#include <sys/dnode.h>
#include <sys/dsl_dataset.h>
@@ -71,6 +78,7 @@
#include <sys/sunddi.h>
#include <sys/dirent.h>
#include <sys/policy.h>
+#include <sys/queue.h>
#include <sys/fs/zfs.h>
#include <sys/zfs_ioctl.h>
#include <sys/zil.h>
@@ -78,13 +86,20 @@
#include <sys/zfs_znode.h>
#include <sys/zfs_rlock.h>
#include <sys/vdev_impl.h>
+#include <sys/vdev_raidz.h>
#include <sys/zvol.h>
#include <sys/zil_impl.h>
#include <sys/dbuf.h>
+#include <sys/dmu_tx.h>
+#include <sys/zfeature.h>
+#include <sys/zio_checksum.h>
+#include <sys/filio.h>
+
#include <geom/geom.h>
#include "zfs_namecheck.h"
+#ifndef illumos
struct g_class zfs_zvol_class = {
.name = "ZFS::ZVOL",
.version = G_VERSION,
@@ -92,6 +107,7 @@
DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol);
+#endif
void *zfsdev_state;
static char *zvol_tag = "zvol_tag";
@@ -98,13 +114,31 @@
#define ZVOL_DUMPSIZE "dumpsize"
/*
- * The spa_namespace_lock protects the zfsdev_state structure from being
- * modified while it's being used, e.g. an open that comes in before a
- * create finishes. It also protects temporary opens of the dataset so that,
+ * This lock protects the zfsdev_state structure from being modified
+ * while it's being used, e.g. an open that comes in before a create
+ * finishes. It also protects temporary opens of the dataset so that,
* e.g., an open doesn't get a spurious EBUSY.
*/
+#ifdef illumos
+kmutex_t zfsdev_state_lock;
+#else
+/*
+ * In FreeBSD we've replaced the upstream zfsdev_state_lock with the
+ * spa_namespace_lock in the ZVOL code.
+ */
+#define zfsdev_state_lock spa_namespace_lock
+#endif
static uint32_t zvol_minors;
+#ifndef illumos
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME");
+static int volmode = ZFS_VOLMODE_GEOM;
+TUNABLE_INT("vfs.zfs.vol.mode", &volmode);
+SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &volmode, 0,
+ "Expose as GEOM providers (1), device files (2) or neither");
+
+#endif
typedef struct zvol_extent {
list_node_t ze_node;
dva_t ze_dva; /* dva associated with this extent */
@@ -115,23 +149,41 @@
* The in-core state of each volume.
*/
typedef struct zvol_state {
+#ifndef illumos
+ LIST_ENTRY(zvol_state) zv_links;
+#endif
char zv_name[MAXPATHLEN]; /* pool/dd name */
uint64_t zv_volsize; /* amount of space we advertise */
uint64_t zv_volblocksize; /* volume block size */
+#ifdef illumos
+ minor_t zv_minor; /* minor number */
+#else
+ struct cdev *zv_dev; /* non-GEOM device */
struct g_provider *zv_provider; /* GEOM provider */
+#endif
uint8_t zv_min_bs; /* minimum addressable block shift */
uint8_t zv_flags; /* readonly, dumpified, etc. */
objset_t *zv_objset; /* objset handle */
+#ifdef illumos
+ uint32_t zv_open_count[OTYPCNT]; /* open counts */
+#endif
uint32_t zv_total_opens; /* total open count */
+ uint32_t zv_sync_cnt; /* synchronous open count */
zilog_t *zv_zilog; /* ZIL handle */
list_t zv_extents; /* List of extents for dump */
znode_t zv_znode; /* for range locking */
dmu_buf_t *zv_dbuf; /* bonus handle */
+#ifndef illumos
int zv_state;
+ int zv_volmode; /* Provide GEOM or cdev */
struct bio_queue_head zv_queue;
struct mtx zv_queue_mtx; /* zv_queue mutex */
+#endif
} zvol_state_t;
+#ifndef illumos
+static LIST_HEAD(, zvol_state) all_zvols;
+#endif
/*
* zvol specific flags
*/
@@ -145,6 +197,43 @@
*/
int zvol_maxphys = DMU_MAX_ACCESS/2;
+/*
+ * Toggle unmap functionality.
+ */
+boolean_t zvol_unmap_enabled = B_TRUE;
+#ifndef illumos
+SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN,
+ &zvol_unmap_enabled, 0,
+ "Enable UNMAP functionality");
+
+static d_open_t zvol_d_open;
+static d_close_t zvol_d_close;
+static d_read_t zvol_read;
+static d_write_t zvol_write;
+static d_ioctl_t zvol_d_ioctl;
+static d_strategy_t zvol_strategy;
+
+static struct cdevsw zvol_cdevsw = {
+ .d_version = D_VERSION,
+ .d_open = zvol_d_open,
+ .d_close = zvol_d_close,
+ .d_read = zvol_read,
+ .d_write = zvol_write,
+ .d_ioctl = zvol_d_ioctl,
+ .d_strategy = zvol_strategy,
+ .d_name = "zvol",
+ .d_flags = D_DISK | D_TRACKCLOSE,
+};
+
+static void zvol_geom_run(zvol_state_t *zv);
+static void zvol_geom_destroy(zvol_state_t *zv);
+static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
+static void zvol_geom_start(struct bio *bp);
+static void zvol_geom_worker(void *arg);
+static void zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off,
+ uint64_t len, boolean_t sync);
+#endif /* !illumos */
+
extern int zfs_set_prop_nvlist(const char *, zprop_source_t,
nvlist_t *, nvlist_t *);
static int zvol_remove_zv(zvol_state_t *);
@@ -153,19 +242,13 @@
static int zvol_dump_fini(zvol_state_t *zv);
static int zvol_dump_init(zvol_state_t *zv, boolean_t resize);
-static zvol_state_t *zvol_geom_create(const char *name);
-static void zvol_geom_run(zvol_state_t *zv);
-static void zvol_geom_destroy(zvol_state_t *zv);
-static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace);
-static void zvol_geom_start(struct bio *bp);
-static void zvol_geom_worker(void *arg);
-
static void
-zvol_size_changed(zvol_state_t *zv)
+zvol_size_changed(zvol_state_t *zv, uint64_t volsize)
{
-#ifdef sun
- dev_t dev = makedevice(maj, min);
+#ifdef illumos
+ dev_t dev = makedevice(ddi_driver_major(zfs_dip), zv->zv_minor);
+ zv->zv_volsize = volsize;
VERIFY(ddi_prop_update_int64(dev, zfs_dip,
"Size", volsize) == DDI_SUCCESS);
VERIFY(ddi_prop_update_int64(dev, zfs_dip,
@@ -174,22 +257,19 @@
/* Notify specfs to invalidate the cached size */
spec_size_invalidate(dev, VBLK);
spec_size_invalidate(dev, VCHR);
-#else /* !sun */
- struct g_provider *pp;
+#else /* !illumos */
+ zv->zv_volsize = volsize;
+ if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
+ struct g_provider *pp;
- pp = zv->zv_provider;
- if (pp == NULL)
- return;
- if (zv->zv_volsize == pp->mediasize)
- return;
- /*
- * Changing provider size is not really supported by GEOM, but it
- * should be safe when provider is closed.
- */
- if (zv->zv_total_opens > 0)
- return;
- pp->mediasize = zv->zv_volsize;
-#endif /* !sun */
+ pp = zv->zv_provider;
+ if (pp == NULL)
+ return;
+ g_topology_lock();
+ g_resize_provider(pp, zv->zv_volsize);
+ g_topology_unlock();
+ }
+#endif /* illumos */
}
int
@@ -212,7 +292,7 @@
zvol_check_volblocksize(uint64_t volblocksize)
{
if (volblocksize < SPA_MINBLOCKSIZE ||
- volblocksize > SPA_MAXBLOCKSIZE ||
+ volblocksize > SPA_OLD_MAXBLOCKSIZE ||
!ISP2(volblocksize))
return (SET_ERROR(EDOM));
@@ -245,26 +325,26 @@
static zvol_state_t *
zvol_minor_lookup(const char *name)
{
- struct g_provider *pp;
- struct g_geom *gp;
- zvol_state_t *zv = NULL;
+#ifdef illumos
+ minor_t minor;
+#endif
+ zvol_state_t *zv;
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(MUTEX_HELD(&zfsdev_state_lock));
- g_topology_lock();
- LIST_FOREACH(gp, &zfs_zvol_class.geom, geom) {
- pp = LIST_FIRST(&gp->provider);
- if (pp == NULL)
- continue;
- zv = pp->private;
+#ifdef illumos
+ for (minor = 1; minor <= ZFSDEV_MAX_MINOR; minor++) {
+ zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
if (zv == NULL)
continue;
+#else
+ LIST_FOREACH(zv, &all_zvols, zv_links) {
+#endif
if (strcmp(zv->zv_name, name) == 0)
- break;
+ return (zv);
}
- g_topology_unlock();
- return (gp != NULL ? zv : NULL);
+ return (NULL);
}
/* extent mapping arg */
@@ -276,15 +356,18 @@
/*ARGSUSED*/
static int
zvol_map_block(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
- const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
+ const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
struct maparg *ma = arg;
zvol_extent_t *ze;
int bs = ma->ma_zv->zv_volblocksize;
- if (bp == NULL || zb->zb_object != ZVOL_OBJ || zb->zb_level != 0)
+ if (bp == NULL || BP_IS_HOLE(bp) ||
+ zb->zb_object != ZVOL_OBJ || zb->zb_level != 0)
return (0);
+ VERIFY(!BP_IS_EMBEDDED(bp));
+
VERIFY3U(ma->ma_blks, ==, zb->zb_blkid);
ma->ma_blks++;
@@ -385,6 +468,24 @@
}
/*
+ * Replay a TX_TRUNCATE ZIL transaction if asked. TX_TRUNCATE is how we
+ * implement DKIOCFREE/free-long-range.
+ */
+static int
+zvol_replay_truncate(zvol_state_t *zv, lr_truncate_t *lr, boolean_t byteswap)
+{
+ uint64_t offset, length;
+
+ if (byteswap)
+ byteswap_uint64_array(lr, sizeof (*lr));
+
+ offset = lr->lr_offset;
+ length = lr->lr_length;
+
+ return (dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset, length));
+}
+
+/*
* Replay a TX_WRITE ZIL transaction that didn't get committed
* after a system failure
*/
@@ -434,7 +535,7 @@
/*
* Callback vectors for replaying records.
- * Only TX_WRITE is needed for zvol.
+ * Only TX_WRITE and TX_TRUNCATE are needed for zvol.
*/
zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = {
zvol_replay_err, /* 0 no such transaction type */
@@ -447,7 +548,7 @@
zvol_replay_err, /* TX_LINK */
zvol_replay_err, /* TX_RENAME */
zvol_replay_write, /* TX_WRITE */
- zvol_replay_err, /* TX_TRUNCATE */
+ zvol_replay_truncate, /* TX_TRUNCATE */
zvol_replay_err, /* TX_SETATTR */
zvol_replay_err, /* TX_ACL */
zvol_replay_err, /* TX_CREATE_ACL */
@@ -459,20 +560,20 @@
zvol_replay_err, /* TX_WRITE2 */
};
-#ifdef sun
+#ifdef illumos
int
zvol_name2minor(const char *name, minor_t *minor)
{
zvol_state_t *zv;
- mutex_enter(&spa_namespace_lock);
+ mutex_enter(&zfsdev_state_lock);
zv = zvol_minor_lookup(name);
if (minor && zv)
*minor = zv->zv_minor;
- mutex_exit(&spa_namespace_lock);
+ mutex_exit(&zfsdev_state_lock);
return (zv ? 0 : -1);
}
-#endif /* sun */
+#endif /* illumos */
/*
* Create a minor node (plus a whole lot more) for the specified volume.
@@ -484,15 +585,24 @@
zvol_state_t *zv;
objset_t *os;
dmu_object_info_t doi;
- uint64_t volsize;
+#ifdef illumos
+ minor_t minor = 0;
+ char chrbuf[30], blkbuf[30];
+#else
+ struct g_provider *pp;
+ struct g_geom *gp;
+ uint64_t volsize, mode;
+#endif
int error;
+#ifndef illumos
ZFS_LOG(1, "Creating ZVOL %s...", name);
+#endif
- mutex_enter(&spa_namespace_lock);
+ mutex_enter(&zfsdev_state_lock);
if (zvol_minor_lookup(name) != NULL) {
- mutex_exit(&spa_namespace_lock);
+ mutex_exit(&zfsdev_state_lock);
return (SET_ERROR(EEXIST));
}
@@ -500,20 +610,20 @@
error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, FTAG, &os);
if (error) {
- mutex_exit(&spa_namespace_lock);
+ mutex_exit(&zfsdev_state_lock);
return (error);
}
-#ifdef sun
+#ifdef illumos
if ((minor = zfsdev_minor_alloc()) == 0) {
dmu_objset_disown(os, FTAG);
- mutex_exit(&spa_namespace_lock);
+ mutex_exit(&zfsdev_state_lock);
return (SET_ERROR(ENXIO));
}
if (ddi_soft_state_zalloc(zfsdev_state, minor) != DDI_SUCCESS) {
dmu_objset_disown(os, FTAG);
- mutex_exit(&spa_namespace_lock);
+ mutex_exit(&zfsdev_state_lock);
return (SET_ERROR(EAGAIN));
}
(void) ddi_prop_update_string(minor, zfs_dip, ZVOL_PROP_NAME,
@@ -525,7 +635,7 @@
minor, DDI_PSEUDO, 0) == DDI_FAILURE) {
ddi_soft_state_free(zfsdev_state, minor);
dmu_objset_disown(os, FTAG);
- mutex_exit(&spa_namespace_lock);
+ mutex_exit(&zfsdev_state_lock);
return (SET_ERROR(EAGAIN));
}
@@ -536,7 +646,7 @@
ddi_remove_minor_node(zfs_dip, chrbuf);
ddi_soft_state_free(zfsdev_state, minor);
dmu_objset_disown(os, FTAG);
- mutex_exit(&spa_namespace_lock);
+ mutex_exit(&zfsdev_state_lock);
return (SET_ERROR(EAGAIN));
}
@@ -543,26 +653,68 @@
zs = ddi_get_soft_state(zfsdev_state, minor);
zs->zss_type = ZSST_ZVOL;
zv = zs->zss_data = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP);
-#else /* !sun */
+#else /* !illumos */
+ zv = kmem_zalloc(sizeof(*zv), KM_SLEEP);
+ zv->zv_state = 0;
error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
if (error) {
- ASSERT(error == 0);
+ kmem_free(zv, sizeof(*zv));
dmu_objset_disown(os, zvol_tag);
- mutex_exit(&spa_namespace_lock);
+ mutex_exit(&zfsdev_state_lock);
return (error);
}
+ error = dsl_prop_get_integer(name,
+ zfs_prop_to_name(ZFS_PROP_VOLMODE), &mode, NULL);
+ if (error != 0 || mode == ZFS_VOLMODE_DEFAULT)
+ mode = volmode;
DROP_GIANT();
- g_topology_lock();
- zv = zvol_geom_create(name);
zv->zv_volsize = volsize;
- zv->zv_provider->mediasize = zv->zv_volsize;
+ zv->zv_volmode = mode;
+ if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
+ g_topology_lock();
+ gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
+ gp->start = zvol_geom_start;
+ gp->access = zvol_geom_access;
+ pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
+ pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
+ pp->sectorsize = DEV_BSIZE;
+ pp->mediasize = zv->zv_volsize;
+ pp->private = zv;
-#endif /* !sun */
+ zv->zv_provider = pp;
+ bioq_init(&zv->zv_queue);
+ mtx_init(&zv->zv_queue_mtx, "zvol", NULL, MTX_DEF);
+ } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
+ struct make_dev_args args;
+ make_dev_args_init(&args);
+ args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
+ args.mda_devsw = &zvol_cdevsw;
+ args.mda_cr = NULL;
+ args.mda_uid = UID_ROOT;
+ args.mda_gid = GID_OPERATOR;
+ args.mda_mode = 0640;
+ args.mda_si_drv2 = zv;
+ error = make_dev_s(&args, &zv->zv_dev,
+ "%s/%s", ZVOL_DRIVER, name);
+ if (error != 0) {
+ kmem_free(zv, sizeof(*zv));
+ dmu_objset_disown(os, FTAG);
+ mutex_exit(&zfsdev_state_lock);
+ return (error);
+ }
+ zv->zv_dev->si_iosize_max = MAXPHYS;
+ }
+ LIST_INSERT_HEAD(&all_zvols, zv, zv_links);
+#endif /* illumos */
+
(void) strlcpy(zv->zv_name, name, MAXPATHLEN);
zv->zv_min_bs = DEV_BSHIFT;
+#ifdef illumos
+ zv->zv_minor = minor;
+#endif
zv->zv_objset = os;
if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os)))
zv->zv_flags |= ZVOL_RDONLY;
@@ -587,14 +739,16 @@
zvol_minors++;
- mutex_exit(&spa_namespace_lock);
-
- zvol_geom_run(zv);
-
- g_topology_unlock();
+ mutex_exit(&zfsdev_state_lock);
+#ifndef illumos
+ if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
+ zvol_geom_run(zv);
+ g_topology_unlock();
+ }
PICKUP_GIANT();
ZFS_LOG(1, "ZVOL %s created.", name);
+#endif
return (0);
}
@@ -605,26 +759,42 @@
static int
zvol_remove_zv(zvol_state_t *zv)
{
-#ifdef sun
+#ifdef illumos
+ char nmbuf[20];
minor_t minor = zv->zv_minor;
#endif
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(MUTEX_HELD(&zfsdev_state_lock));
if (zv->zv_total_opens != 0)
return (SET_ERROR(EBUSY));
- ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);
-
-#ifdef sun
+#ifdef illumos
(void) snprintf(nmbuf, sizeof (nmbuf), "%u,raw", minor);
ddi_remove_minor_node(zfs_dip, nmbuf);
-#endif /* sun */
+ (void) snprintf(nmbuf, sizeof (nmbuf), "%u", minor);
+ ddi_remove_minor_node(zfs_dip, nmbuf);
+#else
+ ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name);
+
+ LIST_REMOVE(zv, zv_links);
+ if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
+ g_topology_lock();
+ zvol_geom_destroy(zv);
+ g_topology_unlock();
+ } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
+ if (zv->zv_dev != NULL)
+ destroy_dev(zv->zv_dev);
+ }
+#endif
+
avl_destroy(&zv->zv_znode.z_range_avl);
mutex_destroy(&zv->zv_znode.z_range_lock);
- zvol_geom_destroy(zv);
-
+ kmem_free(zv, sizeof (zvol_state_t));
+#ifdef illumos
+ ddi_soft_state_free(zfsdev_state, minor);
+#endif
zvol_minors--;
return (0);
}
@@ -635,15 +805,13 @@
zvol_state_t *zv;
int rc;
- mutex_enter(&spa_namespace_lock);
+ mutex_enter(&zfsdev_state_lock);
if ((zv = zvol_minor_lookup(name)) == NULL) {
- mutex_exit(&spa_namespace_lock);
+ mutex_exit(&zfsdev_state_lock);
return (SET_ERROR(ENXIO));
}
- g_topology_lock();
rc = zvol_remove_zv(zv);
- g_topology_unlock();
- mutex_exit(&spa_namespace_lock);
+ mutex_exit(&zfsdev_state_lock);
return (rc);
}
@@ -661,6 +829,7 @@
if (error)
return (error);
+ zv->zv_objset = os;
error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
if (error) {
ASSERT(error == 0);
@@ -667,15 +836,15 @@
dmu_objset_disown(os, zvol_tag);
return (error);
}
- zv->zv_objset = os;
+
error = dmu_bonus_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dbuf);
if (error) {
dmu_objset_disown(os, zvol_tag);
return (error);
}
- zv->zv_volsize = volsize;
+
+ zvol_size_changed(zv, volsize);
zv->zv_zilog = zil_open(os, zvol_get_data);
- zvol_size_changed(zv);
VERIFY(dsl_prop_get_integer(zv->zv_name, "readonly", &readonly,
NULL) == 0);
@@ -708,7 +877,7 @@
zv->zv_objset = NULL;
}
-#ifdef sun
+#ifdef illumos
int
zvol_prealloc(zvol_state_t *zv)
{
@@ -728,7 +897,7 @@
while (resid != 0) {
int error;
- uint64_t bytes = MIN(resid, SPA_MAXBLOCKSIZE);
+ uint64_t bytes = MIN(resid, SPA_OLD_MAXBLOCKSIZE);
tx = dmu_tx_create(os);
dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
@@ -747,7 +916,7 @@
return (0);
}
-#endif /* sun */
+#endif /* illumos */
static int
zvol_update_volsize(objset_t *os, uint64_t volsize)
@@ -755,10 +924,11 @@
dmu_tx_t *tx;
int error;
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(MUTEX_HELD(&zfsdev_state_lock));
tx = dmu_tx_create(os);
dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
+ dmu_tx_mark_netfree(tx);
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
dmu_tx_abort(tx);
@@ -778,95 +948,86 @@
void
zvol_remove_minors(const char *name)
{
- struct g_geom *gp, *gptmp;
- struct g_provider *pp;
+#ifdef illumos
zvol_state_t *zv;
+ char *namebuf;
+ minor_t minor;
+
+ namebuf = kmem_zalloc(strlen(name) + 2, KM_SLEEP);
+ (void) strncpy(namebuf, name, strlen(name));
+ (void) strcat(namebuf, "/");
+ mutex_enter(&zfsdev_state_lock);
+ for (minor = 1; minor <= ZFSDEV_MAX_MINOR; minor++) {
+
+ zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
+ if (zv == NULL)
+ continue;
+ if (strncmp(namebuf, zv->zv_name, strlen(namebuf)) == 0)
+ (void) zvol_remove_zv(zv);
+ }
+ kmem_free(namebuf, strlen(name) + 2);
+
+ mutex_exit(&zfsdev_state_lock);
+#else /* !illumos */
+ zvol_state_t *zv, *tzv;
size_t namelen;
namelen = strlen(name);
DROP_GIANT();
- mutex_enter(&spa_namespace_lock);
- g_topology_lock();
+ mutex_enter(&zfsdev_state_lock);
- LIST_FOREACH_SAFE(gp, &zfs_zvol_class.geom, geom, gptmp) {
- pp = LIST_FIRST(&gp->provider);
- if (pp == NULL)
- continue;
- zv = pp->private;
- if (zv == NULL)
- continue;
+ LIST_FOREACH_SAFE(zv, &all_zvols, zv_links, tzv) {
if (strcmp(zv->zv_name, name) == 0 ||
(strncmp(zv->zv_name, name, namelen) == 0 &&
- zv->zv_name[namelen] == '/')) {
+ strlen(zv->zv_name) > namelen && (zv->zv_name[namelen] == '/' ||
+ zv->zv_name[namelen] == '@'))) {
(void) zvol_remove_zv(zv);
}
}
- g_topology_unlock();
- mutex_exit(&spa_namespace_lock);
+ mutex_exit(&zfsdev_state_lock);
PICKUP_GIANT();
+#endif /* illumos */
}
-int
-zvol_set_volsize(const char *name, major_t maj, uint64_t volsize)
+static int
+zvol_update_live_volsize(zvol_state_t *zv, uint64_t volsize)
{
- zvol_state_t *zv = NULL;
- objset_t *os;
- int error;
- dmu_object_info_t doi;
uint64_t old_volsize = 0ULL;
- uint64_t readonly;
+ int error = 0;
- mutex_enter(&spa_namespace_lock);
- zv = zvol_minor_lookup(name);
- if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) {
- mutex_exit(&spa_namespace_lock);
- return (error);
- }
+ ASSERT(MUTEX_HELD(&zfsdev_state_lock));
- if ((error = dmu_object_info(os, ZVOL_OBJ, &doi)) != 0 ||
- (error = zvol_check_volsize(volsize,
- doi.doi_data_block_size)) != 0)
- goto out;
-
- VERIFY(dsl_prop_get_integer(name, "readonly", &readonly,
- NULL) == 0);
- if (readonly) {
- error = EROFS;
- goto out;
- }
-
- error = zvol_update_volsize(os, volsize);
/*
* Reinitialize the dump area to the new size. If we
* failed to resize the dump area then restore it back to
- * its original size.
+ * its original size. We must set the new volsize prior
+ * to calling dumpvp_resize() to ensure that the devices'
+ * size(9P) is not visible by the dump subsystem.
*/
- if (zv && error == 0) {
+ old_volsize = zv->zv_volsize;
+ zvol_size_changed(zv, volsize);
+
#ifdef ZVOL_DUMP
- if (zv->zv_flags & ZVOL_DUMPIFIED) {
- old_volsize = zv->zv_volsize;
- zv->zv_volsize = volsize;
- if ((error = zvol_dumpify(zv)) != 0 ||
- (error = dumpvp_resize()) != 0) {
- (void) zvol_update_volsize(os, old_volsize);
- zv->zv_volsize = old_volsize;
- error = zvol_dumpify(zv);
- }
+ if (zv->zv_flags & ZVOL_DUMPIFIED) {
+ if ((error = zvol_dumpify(zv)) != 0 ||
+ (error = dumpvp_resize()) != 0) {
+ int dumpify_error;
+
+ (void) zvol_update_volsize(zv->zv_objset, old_volsize);
+ zvol_size_changed(zv, old_volsize);
+ dumpify_error = zvol_dumpify(zv);
+ error = dumpify_error ? dumpify_error : error;
}
+ }
#endif /* ZVOL_DUMP */
- if (error == 0) {
- zv->zv_volsize = volsize;
- zvol_size_changed(zv);
- }
- }
-#ifdef sun
+#ifdef illumos
/*
* Generate a LUN expansion event.
*/
- if (zv && error == 0) {
+ if (error == 0) {
sysevent_id_t eid;
nvlist_t *attr;
char *physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
@@ -883,44 +1044,79 @@
nvlist_free(attr);
kmem_free(physpath, MAXPATHLEN);
}
-#endif /* sun */
+#endif /* illumos */
+ return (error);
+}
-out:
- dmu_objset_rele(os, FTAG);
+int
+zvol_set_volsize(const char *name, uint64_t volsize)
+{
+ zvol_state_t *zv = NULL;
+ objset_t *os;
+ int error;
+ dmu_object_info_t doi;
+ uint64_t readonly;
+ boolean_t owned = B_FALSE;
- mutex_exit(&spa_namespace_lock);
+ error = dsl_prop_get_integer(name,
+ zfs_prop_to_name(ZFS_PROP_READONLY), &readonly, NULL);
+ if (error != 0)
+ return (error);
+ if (readonly)
+ return (SET_ERROR(EROFS));
+ mutex_enter(&zfsdev_state_lock);
+ zv = zvol_minor_lookup(name);
+
+ if (zv == NULL || zv->zv_objset == NULL) {
+ if ((error = dmu_objset_own(name, DMU_OST_ZVOL, B_FALSE,
+ FTAG, &os)) != 0) {
+ mutex_exit(&zfsdev_state_lock);
+ return (error);
+ }
+ owned = B_TRUE;
+ if (zv != NULL)
+ zv->zv_objset = os;
+ } else {
+ os = zv->zv_objset;
+ }
+
+ if ((error = dmu_object_info(os, ZVOL_OBJ, &doi)) != 0 ||
+ (error = zvol_check_volsize(volsize, doi.doi_data_block_size)) != 0)
+ goto out;
+
+ error = zvol_update_volsize(os, volsize);
+
+ if (error == 0 && zv != NULL)
+ error = zvol_update_live_volsize(zv, volsize);
+out:
+ if (owned) {
+ dmu_objset_disown(os, FTAG);
+ if (zv != NULL)
+ zv->zv_objset = NULL;
+ }
+ mutex_exit(&zfsdev_state_lock);
return (error);
}
/*ARGSUSED*/
+#ifdef illumos
+int
+zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr)
+#else
static int
zvol_open(struct g_provider *pp, int flag, int count)
+#endif
{
zvol_state_t *zv;
int err = 0;
- boolean_t locked = B_FALSE;
+#ifdef illumos
- /*
- * Protect against recursively entering spa_namespace_lock
- * when spa_open() is used for a pool on a (local) ZVOL(s).
- * This is needed since we replaced upstream zfsdev_state_lock
- * with spa_namespace_lock in the ZVOL code.
- * We are using the same trick as spa_open().
- * Note that calls in zvol_first_open which need to resolve
- * pool name to a spa object will enter spa_open()
- * recursively, but that function already has all the
- * necessary protection.
- */
- if (!MUTEX_HELD(&spa_namespace_lock)) {
- mutex_enter(&spa_namespace_lock);
- locked = B_TRUE;
- }
+ mutex_enter(&zfsdev_state_lock);
- zv = pp->private;
+ zv = zfsdev_get_soft_state(getminor(*devp), ZSST_ZVOL);
if (zv == NULL) {
- if (locked)
- mutex_exit(&spa_namespace_lock);
+ mutex_exit(&zfsdev_state_lock);
return (SET_ERROR(ENXIO));
}
@@ -927,10 +1123,41 @@
if (zv->zv_total_opens == 0)
err = zvol_first_open(zv);
if (err) {
- if (locked)
- mutex_exit(&spa_namespace_lock);
+ mutex_exit(&zfsdev_state_lock);
return (err);
}
+#else /* !illumos */
+ if (tsd_get(zfs_geom_probe_vdev_key) != NULL) {
+ /*
+ * if zfs_geom_probe_vdev_key is set, that means that zfs is
+ * attempting to probe geom providers while looking for a
+ * replacement for a missing VDEV. In this case, the
+ * spa_namespace_lock will not be held, but it is still illegal
+ * to use a zvol as a vdev. Deadlocks can result if another
+ * thread has spa_namespace_lock.
+ */
+ return (EOPNOTSUPP);
+ }
+
+ mutex_enter(&zfsdev_state_lock);
+
+ zv = pp->private;
+ if (zv == NULL) {
+ mutex_exit(&zfsdev_state_lock);
+ return (SET_ERROR(ENXIO));
+ }
+
+ if (zv->zv_total_opens == 0) {
+ err = zvol_first_open(zv);
+ if (err) {
+ mutex_exit(&zfsdev_state_lock);
+ return (err);
+ }
+ pp->mediasize = zv->zv_volsize;
+ pp->stripeoffset = 0;
+ pp->stripesize = zv->zv_volblocksize;
+ }
+#endif /* illumos */
if ((flag & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
err = SET_ERROR(EROFS);
goto out;
@@ -949,20 +1176,44 @@
}
#endif
+#ifdef illumos
+ if (zv->zv_open_count[otyp] == 0 || otyp == OTYP_LYR) {
+ zv->zv_open_count[otyp]++;
+ zv->zv_total_opens++;
+ }
+ mutex_exit(&zfsdev_state_lock);
+#else
zv->zv_total_opens += count;
- if (locked)
- mutex_exit(&spa_namespace_lock);
+ mutex_exit(&zfsdev_state_lock);
+#endif
return (err);
out:
if (zv->zv_total_opens == 0)
zvol_last_close(zv);
- if (locked)
- mutex_exit(&spa_namespace_lock);
+#ifdef illumos
+ mutex_exit(&zfsdev_state_lock);
+#else
+ mutex_exit(&zfsdev_state_lock);
+#endif
return (err);
}
/*ARGSUSED*/
+#ifdef illumos
+int
+zvol_close(dev_t dev, int flag, int otyp, cred_t *cr)
+{
+ minor_t minor = getminor(dev);
+ zvol_state_t *zv;
+ int error = 0;
+
+ mutex_enter(&zfsdev_state_lock);
+
+ zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
+ if (zv == NULL) {
+ mutex_exit(&zfsdev_state_lock);
+#else /* !illumos */
static int
zvol_close(struct g_provider *pp, int flag, int count)
{
@@ -971,8 +1222,8 @@
boolean_t locked = B_FALSE;
/* See comment in zvol_open(). */
- if (!MUTEX_HELD(&spa_namespace_lock)) {
- mutex_enter(&spa_namespace_lock);
+ if (!MUTEX_HELD(&zfsdev_state_lock)) {
+ mutex_enter(&zfsdev_state_lock);
locked = B_TRUE;
}
@@ -979,7 +1230,8 @@
zv = pp->private;
if (zv == NULL) {
if (locked)
- mutex_exit(&spa_namespace_lock);
+ mutex_exit(&zfsdev_state_lock);
+#endif /* illumos */
return (SET_ERROR(ENXIO));
}
@@ -992,18 +1244,30 @@
* If the open count is zero, this is a spurious close.
* That indicates a bug in the kernel / DDI framework.
*/
+#ifdef illumos
+ ASSERT(zv->zv_open_count[otyp] != 0);
+#endif
ASSERT(zv->zv_total_opens != 0);
/*
* You may get multiple opens, but only one close.
*/
+#ifdef illumos
+ zv->zv_open_count[otyp]--;
+ zv->zv_total_opens--;
+#else
zv->zv_total_opens -= count;
+#endif
if (zv->zv_total_opens == 0)
zvol_last_close(zv);
+#ifdef illumos
+ mutex_exit(&zfsdev_state_lock);
+#else
if (locked)
- mutex_exit(&spa_namespace_lock);
+ mutex_exit(&zfsdev_state_lock);
+#endif
return (error);
}
@@ -1042,7 +1306,6 @@
zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
zgd->zgd_zilog = zv->zv_zilog;
- zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_READER);
/*
* Write records come in two flavors: immediate and indirect.
@@ -1051,12 +1314,22 @@
* sync the data and get a pointer to it (indirect) so that
* we don't have to write the data twice.
*/
- if (buf != NULL) { /* immediate write */
+ if (buf != NULL) { /* immediate write */
+ zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size,
+ RL_READER);
error = dmu_read(os, object, offset, size, buf,
DMU_READ_NO_PREFETCH);
- } else {
+ } else { /* indirect write */
+ /*
+ * Have to lock the whole block to ensure when it's written out
+ * and its checksum is being calculated that no one can change
+ * the data. Contrary to zfs_get_data, we need not re-check
+ * blocksize after we get the lock because it cannot be changed.
+ */
size = zv->zv_volblocksize;
offset = P2ALIGN(offset, size);
+ zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size,
+ RL_READER);
error = dmu_buf_hold(os, object, offset, zgd, &db,
DMU_READ_NO_PREFETCH);
if (error == 0) {
@@ -1099,54 +1372,44 @@
{
uint32_t blocksize = zv->zv_volblocksize;
zilog_t *zilog = zv->zv_zilog;
- boolean_t slogging;
- ssize_t immediate_write_sz;
+ itx_wr_state_t write_state;
if (zil_replaying(zilog, tx))
return;
- immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
- ? 0 : zvol_immediate_write_sz;
+ if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
+ write_state = WR_INDIRECT;
+ else if (!spa_has_slogs(zilog->zl_spa) &&
+ resid >= blocksize && blocksize > zvol_immediate_write_sz)
+ write_state = WR_INDIRECT;
+ else if (sync)
+ write_state = WR_COPIED;
+ else
+ write_state = WR_NEED_COPY;
- slogging = spa_has_slogs(zilog->zl_spa) &&
- (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
-
while (resid) {
itx_t *itx;
lr_write_t *lr;
- ssize_t len;
- itx_wr_state_t write_state;
+ itx_wr_state_t wr_state = write_state;
+ ssize_t len = resid;
- /*
- * Unlike zfs_log_write() we can be called with
- * upto DMU_MAX_ACCESS/2 (5MB) writes.
- */
- if (blocksize > immediate_write_sz && !slogging &&
- resid >= blocksize && off % blocksize == 0) {
- write_state = WR_INDIRECT; /* uses dmu_sync */
- len = blocksize;
- } else if (sync) {
- write_state = WR_COPIED;
- len = MIN(ZIL_MAX_LOG_DATA, resid);
- } else {
- write_state = WR_NEED_COPY;
- len = MIN(ZIL_MAX_LOG_DATA, resid);
- }
+ if (wr_state == WR_COPIED && resid > ZIL_MAX_COPIED_DATA)
+ wr_state = WR_NEED_COPY;
+ else if (wr_state == WR_INDIRECT)
+ len = MIN(blocksize - P2PHASE(off, blocksize), resid);
itx = zil_itx_create(TX_WRITE, sizeof (*lr) +
- (write_state == WR_COPIED ? len : 0));
+ (wr_state == WR_COPIED ? len : 0));
lr = (lr_write_t *)&itx->itx_lr;
- if (write_state == WR_COPIED && dmu_read(zv->zv_objset,
+ if (wr_state == WR_COPIED && dmu_read(zv->zv_objset,
ZVOL_OBJ, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
zil_itx_destroy(itx);
itx = zil_itx_create(TX_WRITE, sizeof (*lr));
lr = (lr_write_t *)&itx->itx_lr;
- write_state = WR_NEED_COPY;
+ wr_state = WR_NEED_COPY;
}
- itx->itx_wr_state = write_state;
- if (write_state == WR_NEED_COPY)
- itx->itx_sod += len;
+ itx->itx_wr_state = wr_state;
lr->lr_foid = ZVOL_OBJ;
lr->lr_offset = off;
lr->lr_length = len;
@@ -1154,8 +1417,10 @@
BP_ZERO(&lr->lr_blkptr);
itx->itx_private = zv;
- itx->itx_sync = sync;
+ if (!sync && (zv->zv_sync_cnt == 0))
+ itx->itx_sync = B_FALSE;
+
zil_itx_assign(zilog, itx, tx);
off += len;
@@ -1163,29 +1428,30 @@
}
}
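
The rewritten zvol_log_write() hoists the record-type decision out of the per-chunk loop and only downgrades WR_COPIED to WR_NEED_COPY when a chunk exceeds ZIL_MAX_COPIED_DATA, while indirect chunks are clipped to block boundaries. A standalone sketch of that initial decision, with stand-in parameter values, not the committed code:

#include <stdio.h>
#include <stdint.h>

enum wr_state { WR_INDIRECT, WR_COPIED, WR_NEED_COPY };

static enum wr_state
pick_write_state(int logbias_throughput, int has_slog, uint64_t resid,
    uint64_t blocksize, uint64_t immediate_write_sz, int sync)
{
        if (logbias_throughput)
                return (WR_INDIRECT);   /* always point at the data block */
        if (!has_slog && resid >= blocksize &&
            blocksize > immediate_write_sz)
                return (WR_INDIRECT);   /* large write, no slog: dmu_sync it */
        if (sync)
                return (WR_COPIED);     /* copy the data into the log record */
        return (WR_NEED_COPY);          /* defer the copy until commit time */
}

int
main(void)
{
        const char *names[] = { "WR_INDIRECT", "WR_COPIED", "WR_NEED_COPY" };

        /* 128K sync write, 64K volblocksize, no slog, 32K immediate limit. */
        printf("%s\n", names[pick_write_state(0, 0, 131072, 65536, 32768, 1)]);
        /* Small sync write on a pool that does have a separate log device. */
        printf("%s\n", names[pick_write_state(0, 1, 4096, 65536, 32768, 1)]);
        return (0);
}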
-#ifdef sun
+#ifdef illumos
static int
-zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t size,
- boolean_t doread, boolean_t isdump)
+zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t origoffset,
+ uint64_t size, boolean_t doread, boolean_t isdump)
{
vdev_disk_t *dvd;
int c;
int numerrors = 0;
- for (c = 0; c < vd->vdev_children; c++) {
- ASSERT(vd->vdev_ops == &vdev_mirror_ops ||
- vd->vdev_ops == &vdev_replacing_ops ||
- vd->vdev_ops == &vdev_spare_ops);
- int err = zvol_dumpio_vdev(vd->vdev_child[c],
- addr, offset, size, doread, isdump);
- if (err != 0) {
- numerrors++;
- } else if (doread) {
- break;
+ if (vd->vdev_ops == &vdev_mirror_ops ||
+ vd->vdev_ops == &vdev_replacing_ops ||
+ vd->vdev_ops == &vdev_spare_ops) {
+ for (c = 0; c < vd->vdev_children; c++) {
+ int err = zvol_dumpio_vdev(vd->vdev_child[c],
+ addr, offset, origoffset, size, doread, isdump);
+ if (err != 0) {
+ numerrors++;
+ } else if (doread) {
+ break;
+ }
}
}
- if (!vd->vdev_ops->vdev_op_leaf)
+ if (!vd->vdev_ops->vdev_op_leaf && vd->vdev_ops != &vdev_raidz_ops)
return (numerrors < vd->vdev_children ? 0 : EIO);
if (doread && !vdev_readable(vd))
@@ -1193,8 +1459,11 @@
else if (!doread && !vdev_writeable(vd))
return (SET_ERROR(EIO));
- dvd = vd->vdev_tsd;
- ASSERT3P(dvd, !=, NULL);
+ if (vd->vdev_ops == &vdev_raidz_ops) {
+ return (vdev_raidz_physio(vd,
+ addr, size, offset, origoffset, doread, isdump));
+ }
+
offset += VDEV_LABEL_START_SIZE;
if (ddi_in_panic() || isdump) {
@@ -1201,11 +1470,15 @@
ASSERT(!doread);
if (doread)
return (SET_ERROR(EIO));
+ dvd = vd->vdev_tsd;
+ ASSERT3P(dvd, !=, NULL);
return (ldi_dump(dvd->vd_lh, addr, lbtodb(offset),
lbtodb(size)));
} else {
- return (vdev_disk_physio(dvd->vd_lh, addr, size, offset,
- doread ? B_READ : B_WRITE));
+ dvd = vd->vdev_tsd;
+ ASSERT3P(dvd, !=, NULL);
+ return (vdev_disk_ldi_physio(dvd->vd_lh, addr, size,
+ offset, doread ? B_READ : B_WRITE));
}
}
@@ -1240,7 +1513,8 @@
vd = vdev_lookup_top(spa, DVA_GET_VDEV(&ze->ze_dva));
offset += DVA_GET_OFFSET(&ze->ze_dva);
- error = zvol_dumpio_vdev(vd, addr, offset, size, doread, isdump);
+ error = zvol_dumpio_vdev(vd, addr, offset, DVA_GET_OFFSET(&ze->ze_dva),
+ size, doread, isdump);
if (!ddi_in_panic())
spa_config_exit(spa, SCL_STATE, FTAG);
@@ -1247,12 +1521,17 @@
return (error);
}
-#endif /* sun */
int
+zvol_strategy(buf_t *bp)
+{
+ zfs_soft_state_t *zs = NULL;
+#else /* !illumos */
+void
zvol_strategy(struct bio *bp)
{
- zvol_state_t *zv = bp->bio_to->private;
+#endif /* illumos */
+ zvol_state_t *zv;
uint64_t off, volsize;
size_t resid;
char *addr;
@@ -1259,34 +1538,105 @@
objset_t *os;
rl_t *rl;
int error = 0;
- boolean_t doread = (bp->bio_cmd == BIO_READ);
+#ifdef illumos
+ boolean_t doread = bp->b_flags & B_READ;
+#else
+ boolean_t doread = 0;
+#endif
+ boolean_t is_dumpified;
boolean_t sync;
- if (zv == NULL) {
- g_io_deliver(bp, ENXIO);
+#ifdef illumos
+ if (getminor(bp->b_edev) == 0) {
+ error = SET_ERROR(EINVAL);
+ } else {
+ zs = ddi_get_soft_state(zfsdev_state, getminor(bp->b_edev));
+ if (zs == NULL)
+ error = SET_ERROR(ENXIO);
+ else if (zs->zss_type != ZSST_ZVOL)
+ error = SET_ERROR(EINVAL);
+ }
+
+ if (error) {
+ bioerror(bp, error);
+ biodone(bp);
return (0);
}
- if (bp->bio_cmd != BIO_READ && (zv->zv_flags & ZVOL_RDONLY)) {
- g_io_deliver(bp, EROFS);
+ zv = zs->zss_data;
+
+ if (!(bp->b_flags & B_READ) && (zv->zv_flags & ZVOL_RDONLY)) {
+ bioerror(bp, EROFS);
+ biodone(bp);
return (0);
}
+ off = ldbtob(bp->b_blkno);
+#else /* !illumos */
+ if (bp->bio_to)
+ zv = bp->bio_to->private;
+ else
+ zv = bp->bio_dev->si_drv2;
+
+ if (zv == NULL) {
+ error = SET_ERROR(ENXIO);
+ goto out;
+ }
+
+ if (bp->bio_cmd != BIO_READ && (zv->zv_flags & ZVOL_RDONLY)) {
+ error = SET_ERROR(EROFS);
+ goto out;
+ }
+
+ switch (bp->bio_cmd) {
+ case BIO_FLUSH:
+ goto sync;
+ case BIO_READ:
+ doread = 1;
+ case BIO_WRITE:
+ case BIO_DELETE:
+ break;
+ default:
+ error = EOPNOTSUPP;
+ goto out;
+ }
+
off = bp->bio_offset;
+#endif /* illumos */
volsize = zv->zv_volsize;
os = zv->zv_objset;
ASSERT(os != NULL);
+#ifdef illumos
+ bp_mapin(bp);
+ addr = bp->b_un.b_addr;
+ resid = bp->b_bcount;
+
+ if (resid > 0 && (off < 0 || off >= volsize)) {
+ bioerror(bp, EIO);
+ biodone(bp);
+ return (0);
+ }
+
+ is_dumpified = zv->zv_flags & ZVOL_DUMPIFIED;
+ sync = ((!(bp->b_flags & B_ASYNC) &&
+ !(zv->zv_flags & ZVOL_WCE)) ||
+ (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS)) &&
+ !doread && !is_dumpified;
+#else /* !illumos */
addr = bp->bio_data;
resid = bp->bio_length;
if (resid > 0 && (off < 0 || off >= volsize)) {
- g_io_deliver(bp, EIO);
- return (0);
+ error = SET_ERROR(EIO);
+ goto out;
}
- sync = !doread && zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
+ is_dumpified = B_FALSE;
+ sync = !doread && !is_dumpified &&
+ zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS;
+#endif /* illumos */
/*
* There must be no buffer changes when doing a dmu_sync() because
@@ -1295,9 +1645,33 @@
rl = zfs_range_lock(&zv->zv_znode, off, resid,
doread ? RL_READER : RL_WRITER);
+#ifndef illumos
+ if (bp->bio_cmd == BIO_DELETE) {
+ dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error != 0) {
+ dmu_tx_abort(tx);
+ } else {
+ zvol_log_truncate(zv, tx, off, resid, sync);
+ dmu_tx_commit(tx);
+ error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
+ off, resid);
+ resid = 0;
+ }
+ goto unlock;
+ }
+#endif
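
A toy model (illustration only, nothing here is kernel code) of the ordering the new BIO_DELETE path relies on: the truncate is recorded in the ZIL via zvol_log_truncate() before dmu_free_long_range() punches the hole, so replaying the log after a crash reproduces the free.

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define VOLSIZE 64

static char vol[VOLSIZE];                        /* pretend zvol contents */
static struct { uint64_t off, len; } log_rec[8]; /* pretend ZIL */
static int nlog;

static void
delete_range(uint64_t off, uint64_t len)
{
        log_rec[nlog].off = off;                /* zvol_log_truncate() step */
        log_rec[nlog].len = len;
        nlog++;
        memset(vol + off, 0, len);              /* dmu_free_long_range() step */
}

static void
replay(char *img)
{
        /* zvol_replay_truncate() analogue: re-issue every recorded free. */
        for (int i = 0; i < nlog; i++)
                memset(img + log_rec[i].off, 0, log_rec[i].len);
}

int
main(void)
{
        char crashed[VOLSIZE];

        memset(vol, 'x', sizeof (vol));
        memcpy(crashed, vol, sizeof (crashed)); /* image before the free landed */
        delete_range(16, 8);
        replay(crashed);                        /* replay recreates the hole */
        printf("images match: %d\n", memcmp(vol, crashed, sizeof (vol)) == 0);
        return (0);
}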
while (resid != 0 && off < volsize) {
size_t size = MIN(resid, zvol_maxphys);
+#ifdef illumos
+ if (is_dumpified) {
+ size = MIN(size, P2END(off, zv->zv_volblocksize) - off);
+ error = zvol_dumpio(zv, addr, off, size,
+ doread, B_FALSE);
+ } else if (doread) {
+#else
if (doread) {
+#endif
error = dmu_read(os, ZVOL_OBJ, off, size, addr,
DMU_READ_PREFETCH);
} else {
@@ -1322,20 +1696,38 @@
addr += size;
resid -= size;
}
+#ifndef illumos
+unlock:
+#endif
zfs_range_unlock(rl);
- bp->bio_completed = bp->bio_length - resid;
- if (bp->bio_completed < bp->bio_length)
- bp->bio_error = (off > volsize ? EINVAL : error);
+#ifdef illumos
+ if ((bp->b_resid = resid) == bp->b_bcount)
+ bioerror(bp, off > volsize ? EINVAL : error);
if (sync)
zil_commit(zv->zv_zilog, ZVOL_OBJ);
- g_io_deliver(bp, 0);
+ biodone(bp);
return (0);
+#else /* !illumos */
+ bp->bio_completed = bp->bio_length - resid;
+ if (bp->bio_completed < bp->bio_length && off > volsize)
+ error = EINVAL;
+
+ if (sync) {
+sync:
+ zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ }
+out:
+ if (bp->bio_to)
+ g_io_deliver(bp, error);
+ else
+ biofinish(bp, NULL, error);
+#endif /* illumos */
}
-#ifdef sun
+#ifdef illumos
/*
* Set the buffer count to the zvol maximum transfer.
* Using our own routine instead of the default minphys()
@@ -1391,25 +1783,37 @@
zvol_read(dev_t dev, uio_t *uio, cred_t *cr)
{
minor_t minor = getminor(dev);
+#else /* !illumos */
+int
+zvol_read(struct cdev *dev, struct uio *uio, int ioflag)
+{
+#endif /* illumos */
zvol_state_t *zv;
uint64_t volsize;
rl_t *rl;
int error = 0;
+#ifdef illumos
zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
if (zv == NULL)
return (SET_ERROR(ENXIO));
+#else
+ zv = dev->si_drv2;
+#endif
volsize = zv->zv_volsize;
+ /* uio_loffset == volsize isn't an error as it's required for EOF processing. */
if (uio->uio_resid > 0 &&
- (uio->uio_loffset < 0 || uio->uio_loffset >= volsize))
+ (uio->uio_loffset < 0 || uio->uio_loffset > volsize))
return (SET_ERROR(EIO));
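
With the relaxed bound above (> volsize instead of >= volsize), a transfer that starts exactly at the end of the volume is no longer rejected with EIO; it simply moves zero bytes, which is the EOF behaviour the comment asks for. A small standalone check (not part of the patch), also covering the identical test in zvol_write() below:

#include <stdio.h>
#include <stdint.h>
#include <errno.h>

static int
check_bounds(int64_t loffset, uint64_t resid, uint64_t volsize)
{
        if (resid > 0 && (loffset < 0 || (uint64_t)loffset > volsize))
                return (EIO);
        return (0);
}

int
main(void)
{
        uint64_t volsize = 1ULL << 20;          /* a 1 MB volume */

        printf("%d %d %d\n",
            check_bounds(volsize - 512, 512, volsize),  /* 0: last sector */
            check_bounds(volsize, 512, volsize),        /* 0: EOF (was EIO) */
            check_bounds(volsize + 512, 512, volsize)); /* EIO */
        return (0);
}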
+#ifdef illumos
if (zv->zv_flags & ZVOL_DUMPIFIED) {
error = physio(zvol_strategy, NULL, dev, B_READ,
zvol_minphys, uio);
return (error);
}
+#endif
rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
RL_READER);
@@ -1420,7 +1824,7 @@
if (bytes > volsize - uio->uio_loffset)
bytes = volsize - uio->uio_loffset;
- error = dmu_read_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes);
+ error = dmu_read_uio_dbuf(zv->zv_dbuf, uio, bytes);
if (error) {
/* convert checksum errors into IO errors */
if (error == ECKSUM)
@@ -1432,11 +1836,17 @@
return (error);
}
+#ifdef illumos
/*ARGSUSED*/
int
zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
{
minor_t minor = getminor(dev);
+#else /* !illumos */
+int
+zvol_write(struct cdev *dev, struct uio *uio, int ioflag)
+{
+#endif /* illumos */
zvol_state_t *zv;
uint64_t volsize;
rl_t *rl;
@@ -1443,15 +1853,21 @@
int error = 0;
boolean_t sync;
+#ifdef illumos
zv = zfsdev_get_soft_state(minor, ZSST_ZVOL);
if (zv == NULL)
return (SET_ERROR(ENXIO));
+#else
+ zv = dev->si_drv2;
+#endif
volsize = zv->zv_volsize;
+	/* uio_loffset == volsize isn't an error as it's required for EOF processing. */
if (uio->uio_resid > 0 &&
- (uio->uio_loffset < 0 || uio->uio_loffset >= volsize))
+ (uio->uio_loffset < 0 || uio->uio_loffset > volsize))
return (SET_ERROR(EIO));
+#ifdef illumos
if (zv->zv_flags & ZVOL_DUMPIFIED) {
error = physio(zvol_strategy, NULL, dev, B_WRITE,
zvol_minphys, uio);
@@ -1459,6 +1875,9 @@
}
sync = !(zv->zv_flags & ZVOL_WCE) ||
+#else
+ sync = (ioflag & IO_SYNC) ||
+#endif
(zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid,
@@ -1491,6 +1910,7 @@
return (error);
}
+#ifdef illumos
int
zvol_getefi(void *arg, int flag, uint64_t vs, uint8_t bs)
{
@@ -1619,9 +2039,36 @@
/*
* END entry points to allow external callers access to the volume.
*/
+#endif /* illumos */
/*
+ * Log a DKIOCFREE/free-long-range to the ZIL with TX_TRUNCATE.
+ */
+static void
+zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len,
+ boolean_t sync)
+{
+ itx_t *itx;
+ lr_truncate_t *lr;
+ zilog_t *zilog = zv->zv_zilog;
+
+ if (zil_replaying(zilog, tx))
+ return;
+
+ itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr));
+ lr = (lr_truncate_t *)&itx->itx_lr;
+ lr->lr_foid = ZVOL_OBJ;
+ lr->lr_offset = off;
+ lr->lr_length = len;
+
+ itx->itx_sync = (sync || zv->zv_sync_cnt != 0);
+ zil_itx_assign(zilog, itx, tx);
+}
+
+#ifdef illumos
+/*
* Dirtbag ioctls to support mkfs(1M) for UFS filesystems. See dkio(7I).
+ * Also a dirtbag dkio ioctl for unmap/free-block functionality.
*/
/*ARGSUSED*/
int
@@ -1628,18 +2075,16 @@
zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp)
{
zvol_state_t *zv;
- struct dk_cinfo dki;
- struct dk_minfo dkm;
struct dk_callback *dkc;
int error = 0;
rl_t *rl;
- mutex_enter(&spa_namespace_lock);
+ mutex_enter(&zfsdev_state_lock);
zv = zfsdev_get_soft_state(getminor(dev), ZSST_ZVOL);
if (zv == NULL) {
- mutex_exit(&spa_namespace_lock);
+ mutex_exit(&zfsdev_state_lock);
return (SET_ERROR(ENXIO));
}
ASSERT(zv->zv_total_opens > 0);
@@ -1647,40 +2092,64 @@
switch (cmd) {
case DKIOCINFO:
+ {
+ struct dk_cinfo dki;
+
bzero(&dki, sizeof (dki));
(void) strcpy(dki.dki_cname, "zvol");
(void) strcpy(dki.dki_dname, "zvol");
dki.dki_ctype = DKC_UNKNOWN;
dki.dki_unit = getminor(dev);
- dki.dki_maxtransfer = 1 << (SPA_MAXBLOCKSHIFT - zv->zv_min_bs);
- mutex_exit(&spa_namespace_lock);
+ dki.dki_maxtransfer =
+ 1 << (SPA_OLD_MAXBLOCKSHIFT - zv->zv_min_bs);
+ mutex_exit(&zfsdev_state_lock);
if (ddi_copyout(&dki, (void *)arg, sizeof (dki), flag))
error = SET_ERROR(EFAULT);
return (error);
+ }
case DKIOCGMEDIAINFO:
+ {
+ struct dk_minfo dkm;
+
bzero(&dkm, sizeof (dkm));
dkm.dki_lbsize = 1U << zv->zv_min_bs;
dkm.dki_capacity = zv->zv_volsize >> zv->zv_min_bs;
dkm.dki_media_type = DK_UNKNOWN;
- mutex_exit(&spa_namespace_lock);
+ mutex_exit(&zfsdev_state_lock);
if (ddi_copyout(&dkm, (void *)arg, sizeof (dkm), flag))
error = SET_ERROR(EFAULT);
return (error);
+ }
+ case DKIOCGMEDIAINFOEXT:
+ {
+ struct dk_minfo_ext dkmext;
+
+ bzero(&dkmext, sizeof (dkmext));
+ dkmext.dki_lbsize = 1U << zv->zv_min_bs;
+ dkmext.dki_pbsize = zv->zv_volblocksize;
+ dkmext.dki_capacity = zv->zv_volsize >> zv->zv_min_bs;
+ dkmext.dki_media_type = DK_UNKNOWN;
+ mutex_exit(&zfsdev_state_lock);
+ if (ddi_copyout(&dkmext, (void *)arg, sizeof (dkmext), flag))
+ error = SET_ERROR(EFAULT);
+ return (error);
+ }
+
case DKIOCGETEFI:
- {
- uint64_t vs = zv->zv_volsize;
- uint8_t bs = zv->zv_min_bs;
+ {
+ uint64_t vs = zv->zv_volsize;
+ uint8_t bs = zv->zv_min_bs;
- mutex_exit(&spa_namespace_lock);
- error = zvol_getefi((void *)arg, flag, vs, bs);
- return (error);
- }
+ mutex_exit(&zfsdev_state_lock);
+ error = zvol_getefi((void *)arg, flag, vs, bs);
+ return (error);
+ }
case DKIOCFLUSHWRITECACHE:
dkc = (struct dk_callback *)arg;
- mutex_exit(&spa_namespace_lock);
+ mutex_exit(&zfsdev_state_lock);
zil_commit(zv->zv_zilog, ZVOL_OBJ);
if ((flag & FKIOCTL) && dkc != NULL && dkc->dkc_callback) {
(*dkc->dkc_callback)(dkc->dkc_cookie, error);
@@ -1689,31 +2158,31 @@
return (error);
case DKIOCGETWCE:
- {
- int wce = (zv->zv_flags & ZVOL_WCE) ? 1 : 0;
- if (ddi_copyout(&wce, (void *)arg, sizeof (int),
- flag))
- error = SET_ERROR(EFAULT);
+ {
+ int wce = (zv->zv_flags & ZVOL_WCE) ? 1 : 0;
+ if (ddi_copyout(&wce, (void *)arg, sizeof (int),
+ flag))
+ error = SET_ERROR(EFAULT);
+ break;
+ }
+ case DKIOCSETWCE:
+ {
+ int wce;
+ if (ddi_copyin((void *)arg, &wce, sizeof (int),
+ flag)) {
+ error = SET_ERROR(EFAULT);
break;
}
- case DKIOCSETWCE:
- {
- int wce;
- if (ddi_copyin((void *)arg, &wce, sizeof (int),
- flag)) {
- error = SET_ERROR(EFAULT);
- break;
- }
- if (wce) {
- zv->zv_flags |= ZVOL_WCE;
- mutex_exit(&spa_namespace_lock);
- } else {
- zv->zv_flags &= ~ZVOL_WCE;
- mutex_exit(&spa_namespace_lock);
- zil_commit(zv->zv_zilog, ZVOL_OBJ);
- }
- return (0);
+ if (wce) {
+ zv->zv_flags |= ZVOL_WCE;
+ mutex_exit(&zfsdev_state_lock);
+ } else {
+ zv->zv_flags &= ~ZVOL_WCE;
+ mutex_exit(&zfsdev_state_lock);
+ zil_commit(zv->zv_zilog, ZVOL_OBJ);
}
+ return (0);
+ }
case DKIOCGGEOM:
case DKIOCGVTOC:
@@ -1745,6 +2214,9 @@
dkioc_free_t df;
dmu_tx_t *tx;
+ if (!zvol_unmap_enabled)
+ break;
+
if (ddi_copyin((void *)arg, &df, sizeof (df), flag)) {
error = SET_ERROR(EFAULT);
break;
@@ -1757,12 +2229,13 @@
*/
if (df.df_start >= zv->zv_volsize)
break; /* No need to do anything... */
- if (df.df_start + df.df_length > zv->zv_volsize)
- df.df_length = DMU_OBJECT_END;
+ mutex_exit(&zfsdev_state_lock);
+
rl = zfs_range_lock(&zv->zv_znode, df.df_start, df.df_length,
RL_WRITER);
tx = dmu_tx_create(zv->zv_objset);
+ dmu_tx_mark_netfree(tx);
error = dmu_tx_assign(tx, TXG_WAIT);
if (error != 0) {
dmu_tx_abort(tx);
@@ -1796,7 +2269,7 @@
dmu_objset_pool(zv->zv_objset), 0);
}
}
- break;
+ return (error);
}
default:
@@ -1804,10 +2277,10 @@
break;
}
- mutex_exit(&spa_namespace_lock);
+ mutex_exit(&zfsdev_state_lock);
return (error);
}
-#endif /* sun */
+#endif /* illumos */
int
zvol_busy(void)
@@ -1820,37 +2293,117 @@
{
VERIFY(ddi_soft_state_init(&zfsdev_state, sizeof (zfs_soft_state_t),
1) == 0);
+#ifdef illumos
+ mutex_init(&zfsdev_state_lock, NULL, MUTEX_DEFAULT, NULL);
+#else
ZFS_LOG(1, "ZVOL Initialized.");
+#endif
}
void
zvol_fini(void)
{
+#ifdef illumos
+ mutex_destroy(&zfsdev_state_lock);
+#endif
ddi_soft_state_fini(&zfsdev_state);
ZFS_LOG(1, "ZVOL Deinitialized.");
}
-#ifdef sun
+#ifdef illumos
+/*ARGSUSED*/
static int
+zfs_mvdev_dump_feature_check(void *arg, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+
+ if (spa_feature_is_active(spa, SPA_FEATURE_MULTI_VDEV_CRASH_DUMP))
+ return (1);
+ return (0);
+}
+
+/*ARGSUSED*/
+static void
+zfs_mvdev_dump_activate_feature_sync(void *arg, dmu_tx_t *tx)
+{
+ spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+
+ spa_feature_incr(spa, SPA_FEATURE_MULTI_VDEV_CRASH_DUMP, tx);
+}
+
+static int
zvol_dump_init(zvol_state_t *zv, boolean_t resize)
{
dmu_tx_t *tx;
- int error = 0;
+ int error;
objset_t *os = zv->zv_objset;
+ spa_t *spa = dmu_objset_spa(os);
+ vdev_t *vd = spa->spa_root_vdev;
nvlist_t *nv = NULL;
- uint64_t version = spa_version(dmu_objset_spa(zv->zv_objset));
+ uint64_t version = spa_version(spa);
+ uint64_t checksum, compress, refresrv, vbs, dedup;
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
+ ASSERT(MUTEX_HELD(&zfsdev_state_lock));
+ ASSERT(vd->vdev_ops == &vdev_root_ops);
+
error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, 0,
DMU_OBJECT_END);
+ if (error != 0)
+ return (error);
/* wait for dmu_free_long_range to actually free the blocks */
txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
+ /*
+ * If the pool on which the dump device is being initialized has more
+ * than one child vdev, check that the MULTI_VDEV_CRASH_DUMP feature is
+ * enabled. If so, bump that feature's counter to indicate that the
+ * feature is active. We also check the vdev type to handle the
+ * following case:
+ * # zpool create test raidz disk1 disk2 disk3
+ * Now have spa_root_vdev->vdev_children == 1 (the raidz vdev),
+ * and the raidz vdev itself has 3 children.
+ */
+ if (vd->vdev_children > 1 || vd->vdev_ops == &vdev_raidz_ops) {
+ if (!spa_feature_is_enabled(spa,
+ SPA_FEATURE_MULTI_VDEV_CRASH_DUMP))
+ return (SET_ERROR(ENOTSUP));
+ (void) dsl_sync_task(spa_name(spa),
+ zfs_mvdev_dump_feature_check,
+ zfs_mvdev_dump_activate_feature_sync, NULL,
+ 2, ZFS_SPACE_CHECK_RESERVED);
+ }
+
+ if (!resize) {
+ error = dsl_prop_get_integer(zv->zv_name,
+ zfs_prop_to_name(ZFS_PROP_COMPRESSION), &compress, NULL);
+ if (error == 0) {
+ error = dsl_prop_get_integer(zv->zv_name,
+ zfs_prop_to_name(ZFS_PROP_CHECKSUM), &checksum,
+ NULL);
+ }
+ if (error == 0) {
+ error = dsl_prop_get_integer(zv->zv_name,
+ zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
+ &refresrv, NULL);
+ }
+ if (error == 0) {
+ error = dsl_prop_get_integer(zv->zv_name,
+ zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &vbs,
+ NULL);
+ }
+ if (version >= SPA_VERSION_DEDUP && error == 0) {
+ error = dsl_prop_get_integer(zv->zv_name,
+ zfs_prop_to_name(ZFS_PROP_DEDUP), &dedup, NULL);
+ }
+ }
+ if (error != 0)
+ return (error);
+
tx = dmu_tx_create(os);
dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
dmu_tx_hold_bonus(tx, ZVOL_OBJ);
error = dmu_tx_assign(tx, TXG_WAIT);
- if (error) {
+ if (error != 0) {
dmu_tx_abort(tx);
return (error);
}
@@ -1866,42 +2419,35 @@
zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
&zv->zv_volsize, tx);
} else {
- uint64_t checksum, compress, refresrv, vbs, dedup;
-
- error = dsl_prop_get_integer(zv->zv_name,
- zfs_prop_to_name(ZFS_PROP_COMPRESSION), &compress, NULL);
- error = error ? error : dsl_prop_get_integer(zv->zv_name,
- zfs_prop_to_name(ZFS_PROP_CHECKSUM), &checksum, NULL);
- error = error ? error : dsl_prop_get_integer(zv->zv_name,
- zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &refresrv, NULL);
- error = error ? error : dsl_prop_get_integer(zv->zv_name,
- zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &vbs, NULL);
- if (version >= SPA_VERSION_DEDUP) {
- error = error ? error :
- dsl_prop_get_integer(zv->zv_name,
- zfs_prop_to_name(ZFS_PROP_DEDUP), &dedup, NULL);
- }
-
- error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
+ error = zap_update(os, ZVOL_ZAP_OBJ,
zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1,
&compress, tx);
- error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
- zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1, &checksum, tx);
- error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
- zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
- &refresrv, tx);
- error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
- zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1,
- &vbs, tx);
- error = error ? error : dmu_object_set_blocksize(
- os, ZVOL_OBJ, SPA_MAXBLOCKSIZE, 0, tx);
- if (version >= SPA_VERSION_DEDUP) {
- error = error ? error : zap_update(os, ZVOL_ZAP_OBJ,
+ if (error == 0) {
+ error = zap_update(os, ZVOL_ZAP_OBJ,
+ zfs_prop_to_name(ZFS_PROP_CHECKSUM), 8, 1,
+ &checksum, tx);
+ }
+ if (error == 0) {
+ error = zap_update(os, ZVOL_ZAP_OBJ,
+ zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1,
+ &refresrv, tx);
+ }
+ if (error == 0) {
+ error = zap_update(os, ZVOL_ZAP_OBJ,
+ zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1,
+ &vbs, tx);
+ }
+ if (error == 0) {
+ error = dmu_object_set_blocksize(
+ os, ZVOL_OBJ, SPA_OLD_MAXBLOCKSIZE, 0, tx);
+ }
+ if (version >= SPA_VERSION_DEDUP && error == 0) {
+ error = zap_update(os, ZVOL_ZAP_OBJ,
zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1,
&dedup, tx);
}
if (error == 0)
- zv->zv_volblocksize = SPA_MAXBLOCKSIZE;
+ zv->zv_volblocksize = SPA_OLD_MAXBLOCKSIZE;
}
dmu_tx_commit(tx);
@@ -1909,7 +2455,15 @@
* We only need update the zvol's property if we are initializing
* the dump area for the first time.
*/
- if (!resize) {
+ if (error == 0 && !resize) {
+ /*
+ * If MULTI_VDEV_CRASH_DUMP is active, use the NOPARITY checksum
+ * function. Otherwise, use the old default -- OFF.
+ */
+ checksum = spa_feature_is_active(spa,
+ SPA_FEATURE_MULTI_VDEV_CRASH_DUMP) ? ZIO_CHECKSUM_NOPARITY :
+ ZIO_CHECKSUM_OFF;
+
VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
VERIFY(nvlist_add_uint64(nv,
zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 0) == 0);
@@ -1918,7 +2472,7 @@
ZIO_COMPRESS_OFF) == 0);
VERIFY(nvlist_add_uint64(nv,
zfs_prop_to_name(ZFS_PROP_CHECKSUM),
- ZIO_CHECKSUM_OFF) == 0);
+ checksum) == 0);
if (version >= SPA_VERSION_DEDUP) {
VERIFY(nvlist_add_uint64(nv,
zfs_prop_to_name(ZFS_PROP_DEDUP),
@@ -1928,13 +2482,11 @@
error = zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL,
nv, NULL);
nvlist_free(nv);
-
- if (error)
- return (error);
}
/* Allocate the space for the dump */
- error = zvol_prealloc(zv);
+ if (error == 0)
+ error = zvol_prealloc(zv);
return (error);
}
@@ -2062,32 +2614,8 @@
return (0);
}
-#endif /* sun */
+#else /* !illumos */
-static zvol_state_t *
-zvol_geom_create(const char *name)
-{
- struct g_provider *pp;
- struct g_geom *gp;
- zvol_state_t *zv;
-
- gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name);
- gp->start = zvol_geom_start;
- gp->access = zvol_geom_access;
- pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name);
- pp->sectorsize = DEV_BSIZE;
-
- zv = kmem_zalloc(sizeof(*zv), KM_SLEEP);
- zv->zv_provider = pp;
- zv->zv_state = 0;
- bioq_init(&zv->zv_queue);
- mtx_init(&zv->zv_queue_mtx, "zvol", NULL, MTX_DEF);
-
- pp->private = zv;
-
- return (zv);
-}
-
static void
zvol_geom_run(zvol_state_t *zv)
{
@@ -2118,8 +2646,6 @@
zv->zv_provider = NULL;
pp->private = NULL;
g_wither_geom(pp->geom, ENXIO);
-
- kmem_free(zv, sizeof(*zv));
}
static int
@@ -2178,25 +2704,67 @@
zvol_state_t *zv;
boolean_t first;
+ zv = bp->bio_to->private;
+ ASSERT(zv != NULL);
switch (bp->bio_cmd) {
+ case BIO_FLUSH:
+ if (!THREAD_CAN_SLEEP())
+ goto enqueue;
+ zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ g_io_deliver(bp, 0);
+ break;
case BIO_READ:
case BIO_WRITE:
- case BIO_FLUSH:
- zv = bp->bio_to->private;
- ASSERT(zv != NULL);
- mtx_lock(&zv->zv_queue_mtx);
- first = (bioq_first(&zv->zv_queue) == NULL);
- bioq_insert_tail(&zv->zv_queue, bp);
- mtx_unlock(&zv->zv_queue_mtx);
- if (first)
- wakeup_one(&zv->zv_queue);
+ case BIO_DELETE:
+ if (!THREAD_CAN_SLEEP())
+ goto enqueue;
+ zvol_strategy(bp);
break;
- case BIO_GETATTR:
- case BIO_DELETE:
+ case BIO_GETATTR: {
+ spa_t *spa = dmu_objset_spa(zv->zv_objset);
+ uint64_t refd, avail, usedobjs, availobjs, val;
+
+ if (g_handleattr_int(bp, "GEOM::candelete", 1))
+ return;
+ if (strcmp(bp->bio_attribute, "blocksavail") == 0) {
+ dmu_objset_space(zv->zv_objset, &refd, &avail,
+ &usedobjs, &availobjs);
+ if (g_handleattr_off_t(bp, "blocksavail",
+ avail / DEV_BSIZE))
+ return;
+ } else if (strcmp(bp->bio_attribute, "blocksused") == 0) {
+ dmu_objset_space(zv->zv_objset, &refd, &avail,
+ &usedobjs, &availobjs);
+ if (g_handleattr_off_t(bp, "blocksused",
+ refd / DEV_BSIZE))
+ return;
+ } else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) {
+ avail = metaslab_class_get_space(spa_normal_class(spa));
+ avail -= metaslab_class_get_alloc(spa_normal_class(spa));
+ if (g_handleattr_off_t(bp, "poolblocksavail",
+ avail / DEV_BSIZE))
+ return;
+ } else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) {
+ refd = metaslab_class_get_alloc(spa_normal_class(spa));
+ if (g_handleattr_off_t(bp, "poolblocksused",
+ refd / DEV_BSIZE))
+ return;
+ }
+ /* FALLTHROUGH */
+ }
default:
g_io_deliver(bp, EOPNOTSUPP);
break;
}
+ return;
+
+enqueue:
+ mtx_lock(&zv->zv_queue_mtx);
+ first = (bioq_first(&zv->zv_queue) == NULL);
+ bioq_insert_tail(&zv->zv_queue, bp);
+ mtx_unlock(&zv->zv_queue_mtx);
+ if (first)
+ wakeup_one(&zv->zv_queue);
}
static void
@@ -2232,8 +2800,12 @@
break;
case BIO_READ:
case BIO_WRITE:
+ case BIO_DELETE:
zvol_strategy(bp);
break;
+ default:
+ g_io_deliver(bp, EOPNOTSUPP);
+ break;
}
}
}
@@ -2273,7 +2845,8 @@
break;
}
- if ((error = zvol_create_minor(sname)) != 0) {
+ error = zvol_create_minor(sname);
+ if (error != 0 && error != EEXIST) {
printf("ZFS WARNING: Unable to create ZVOL %s (error=%d).\n",
sname, error);
break;
@@ -2303,9 +2876,10 @@
if (dmu_objset_type(os) == DMU_OST_ZVOL) {
dsl_dataset_long_hold(os->os_dsl_dataset, FTAG);
dsl_pool_rele(dmu_objset_pool(os), FTAG);
- if ((error = zvol_create_minor(name)) == 0)
+ error = zvol_create_minor(name);
+ if (error == 0 || error == EEXIST) {
error = zvol_create_snapshots(os, name);
- else {
+ } else {
printf("ZFS WARNING: Unable to create ZVOL %s (error=%d).\n",
name, error);
}
@@ -2354,29 +2928,58 @@
}
static void
-zvol_rename_minor(struct g_geom *gp, const char *newname)
+zvol_rename_minor(zvol_state_t *zv, const char *newname)
{
+ struct g_geom *gp;
struct g_provider *pp;
- zvol_state_t *zv;
+ struct cdev *dev;
- ASSERT(MUTEX_HELD(&spa_namespace_lock));
- g_topology_assert();
+ ASSERT(MUTEX_HELD(&zfsdev_state_lock));
- pp = LIST_FIRST(&gp->provider);
- ASSERT(pp != NULL);
- zv = pp->private;
- ASSERT(zv != NULL);
+ if (zv->zv_volmode == ZFS_VOLMODE_GEOM) {
+ g_topology_lock();
+ pp = zv->zv_provider;
+ ASSERT(pp != NULL);
+ gp = pp->geom;
+ ASSERT(gp != NULL);
- zv->zv_provider = NULL;
- g_wither_provider(pp, ENXIO);
+ zv->zv_provider = NULL;
+ g_wither_provider(pp, ENXIO);
- pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
- pp->sectorsize = DEV_BSIZE;
- pp->mediasize = zv->zv_volsize;
- pp->private = zv;
- zv->zv_provider = pp;
+ pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname);
+ pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND;
+ pp->sectorsize = DEV_BSIZE;
+ pp->mediasize = zv->zv_volsize;
+ pp->private = zv;
+ zv->zv_provider = pp;
+ g_error_provider(pp, 0);
+ g_topology_unlock();
+ } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) {
+ struct make_dev_args args;
+
+ if ((dev = zv->zv_dev) != NULL) {
+ zv->zv_dev = NULL;
+ destroy_dev(dev);
+ if (zv->zv_total_opens > 0) {
+ zv->zv_flags &= ~ZVOL_EXCL;
+ zv->zv_total_opens = 0;
+ zvol_last_close(zv);
+ }
+ }
+
+ make_dev_args_init(&args);
+ args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK;
+ args.mda_devsw = &zvol_cdevsw;
+ args.mda_cr = NULL;
+ args.mda_uid = UID_ROOT;
+ args.mda_gid = GID_OPERATOR;
+ args.mda_mode = 0640;
+ args.mda_si_drv2 = zv;
+ if (make_dev_s(&args, &zv->zv_dev,
+ "%s/%s", ZVOL_DRIVER, newname) == 0)
+ zv->zv_dev->si_iosize_max = MAXPHYS;
+ }
strlcpy(zv->zv_name, newname, sizeof(zv->zv_name));
- g_error_provider(pp, 0);
}
void
@@ -2388,23 +2991,21 @@
size_t oldnamelen, newnamelen;
zvol_state_t *zv;
char *namebuf;
+ boolean_t locked = B_FALSE;
oldnamelen = strlen(oldname);
newnamelen = strlen(newname);
DROP_GIANT();
- mutex_enter(&spa_namespace_lock);
- g_topology_lock();
+ /* See comment in zvol_open(). */
+ if (!MUTEX_HELD(&zfsdev_state_lock)) {
+ mutex_enter(&zfsdev_state_lock);
+ locked = B_TRUE;
+ }
- LIST_FOREACH(gp, &zfs_zvol_class.geom, geom) {
- pp = LIST_FIRST(&gp->provider);
- if (pp == NULL)
- continue;
- zv = pp->private;
- if (zv == NULL)
- continue;
+ LIST_FOREACH(zv, &all_zvols, zv_links) {
if (strcmp(zv->zv_name, oldname) == 0) {
- zvol_rename_minor(gp, newname);
+ zvol_rename_minor(zv, newname);
} else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 &&
(zv->zv_name[oldnamelen] == '/' ||
zv->zv_name[oldnamelen] == '@')) {
@@ -2411,11 +3012,198 @@
snprintf(name, sizeof(name), "%s%c%s", newname,
zv->zv_name[oldnamelen],
zv->zv_name + oldnamelen + 1);
- zvol_rename_minor(gp, name);
+ zvol_rename_minor(zv, name);
}
}
- g_topology_unlock();
- mutex_exit(&spa_namespace_lock);
+ if (locked)
+ mutex_exit(&zfsdev_state_lock);
PICKUP_GIANT();
}
+
+static int
+zvol_d_open(struct cdev *dev, int flags, int fmt, struct thread *td)
+{
+ zvol_state_t *zv = dev->si_drv2;
+ int err = 0;
+
+ mutex_enter(&zfsdev_state_lock);
+ if (zv->zv_total_opens == 0)
+ err = zvol_first_open(zv);
+ if (err) {
+ mutex_exit(&zfsdev_state_lock);
+ return (err);
+ }
+ if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) {
+ err = SET_ERROR(EROFS);
+ goto out;
+ }
+ if (zv->zv_flags & ZVOL_EXCL) {
+ err = SET_ERROR(EBUSY);
+ goto out;
+ }
+#ifdef FEXCL
+ if (flags & FEXCL) {
+ if (zv->zv_total_opens != 0) {
+ err = SET_ERROR(EBUSY);
+ goto out;
+ }
+ zv->zv_flags |= ZVOL_EXCL;
+ }
+#endif
+
+ zv->zv_total_opens++;
+ if (flags & (FSYNC | FDSYNC)) {
+ zv->zv_sync_cnt++;
+ if (zv->zv_sync_cnt == 1)
+ zil_async_to_sync(zv->zv_zilog, ZVOL_OBJ);
+ }
+ mutex_exit(&zfsdev_state_lock);
+ return (err);
+out:
+ if (zv->zv_total_opens == 0)
+ zvol_last_close(zv);
+ mutex_exit(&zfsdev_state_lock);
+ return (err);
+}
+
+static int
+zvol_d_close(struct cdev *dev, int flags, int fmt, struct thread *td)
+{
+ zvol_state_t *zv = dev->si_drv2;
+
+ mutex_enter(&zfsdev_state_lock);
+ if (zv->zv_flags & ZVOL_EXCL) {
+ ASSERT(zv->zv_total_opens == 1);
+ zv->zv_flags &= ~ZVOL_EXCL;
+ }
+
+ /*
+ * If the open count is zero, this is a spurious close.
+ * That indicates a bug in the kernel / DDI framework.
+ */
+ ASSERT(zv->zv_total_opens != 0);
+
+ /*
+ * You may get multiple opens, but only one close.
+ */
+ zv->zv_total_opens--;
+ if (flags & (FSYNC | FDSYNC))
+ zv->zv_sync_cnt--;
+
+ if (zv->zv_total_opens == 0)
+ zvol_last_close(zv);
+
+ mutex_exit(&zfsdev_state_lock);
+ return (0);
+}
+
+static int
+zvol_d_ioctl(struct cdev *dev, u_long cmd, caddr_t data, int fflag, struct thread *td)
+{
+ zvol_state_t *zv;
+ rl_t *rl;
+ off_t offset, length;
+ int i, error;
+ boolean_t sync;
+
+ zv = dev->si_drv2;
+
+ error = 0;
+ KASSERT(zv->zv_total_opens > 0,
+ ("Device with zero access count in zvol_d_ioctl"));
+
+ i = IOCPARM_LEN(cmd);
+ switch (cmd) {
+ case DIOCGSECTORSIZE:
+ *(u_int *)data = DEV_BSIZE;
+ break;
+ case DIOCGMEDIASIZE:
+ *(off_t *)data = zv->zv_volsize;
+ break;
+ case DIOCGFLUSH:
+ zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ break;
+ case DIOCGDELETE:
+ if (!zvol_unmap_enabled)
+ break;
+
+ offset = ((off_t *)data)[0];
+ length = ((off_t *)data)[1];
+ if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 ||
+ offset < 0 || offset >= zv->zv_volsize ||
+ length <= 0) {
+ printf("%s: offset=%jd length=%jd\n", __func__, offset,
+ length);
+ error = EINVAL;
+ break;
+ }
+
+ rl = zfs_range_lock(&zv->zv_znode, offset, length, RL_WRITER);
+ dmu_tx_t *tx = dmu_tx_create(zv->zv_objset);
+ error = dmu_tx_assign(tx, TXG_WAIT);
+ if (error != 0) {
+ sync = FALSE;
+ dmu_tx_abort(tx);
+ } else {
+ sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS);
+ zvol_log_truncate(zv, tx, offset, length, sync);
+ dmu_tx_commit(tx);
+ error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
+ offset, length);
+ }
+ zfs_range_unlock(rl);
+ if (sync)
+ zil_commit(zv->zv_zilog, ZVOL_OBJ);
+ break;
+ case DIOCGSTRIPESIZE:
+ *(off_t *)data = zv->zv_volblocksize;
+ break;
+ case DIOCGSTRIPEOFFSET:
+ *(off_t *)data = 0;
+ break;
+ case DIOCGATTR: {
+ spa_t *spa = dmu_objset_spa(zv->zv_objset);
+ struct diocgattr_arg *arg = (struct diocgattr_arg *)data;
+ uint64_t refd, avail, usedobjs, availobjs;
+
+ if (strcmp(arg->name, "GEOM::candelete") == 0)
+ arg->value.i = 1;
+ else if (strcmp(arg->name, "blocksavail") == 0) {
+ dmu_objset_space(zv->zv_objset, &refd, &avail,
+ &usedobjs, &availobjs);
+ arg->value.off = avail / DEV_BSIZE;
+ } else if (strcmp(arg->name, "blocksused") == 0) {
+ dmu_objset_space(zv->zv_objset, &refd, &avail,
+ &usedobjs, &availobjs);
+ arg->value.off = refd / DEV_BSIZE;
+ } else if (strcmp(arg->name, "poolblocksavail") == 0) {
+ avail = metaslab_class_get_space(spa_normal_class(spa));
+ avail -= metaslab_class_get_alloc(spa_normal_class(spa));
+ arg->value.off = avail / DEV_BSIZE;
+ } else if (strcmp(arg->name, "poolblocksused") == 0) {
+ refd = metaslab_class_get_alloc(spa_normal_class(spa));
+ arg->value.off = refd / DEV_BSIZE;
+ } else
+ error = ENOIOCTL;
+ break;
+ }
+ case FIOSEEKHOLE:
+ case FIOSEEKDATA: {
+ off_t *off = (off_t *)data;
+ uint64_t noff;
+ boolean_t hole;
+
+ hole = (cmd == FIOSEEKHOLE);
+ noff = *off;
+ error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff);
+ *off = noff;
+ break;
+ }
+ default:
+ error = ENOIOCTL;
+ }
+
+ return (error);
+}
+#endif /* illumos */
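
For readers following the FreeBSD side of this zvol.c sync: the new BIO_DELETE
handling, zvol_log_truncate(), and the DIOCGDELETE case in zvol_d_ioctl()
together give zvols unmap/TRIM support. For illustration only (not part of this
diff), a minimal userland sketch exercising that path could look like the
following; the device path is hypothetical, and the vfs.zfs.vol.unmap_enabled
sysctl name is an assumption for the zvol_unmap_enabled knob seen above.

#include <sys/types.h>
#include <sys/disk.h>
#include <sys/ioctl.h>

#include <err.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	const char *dev = "/dev/zvol/tank/vol0";	/* hypothetical zvol */
	off_t arg[2], mediasize;
	u_int sectorsize;
	int fd;

	if ((fd = open(dev, O_RDWR)) == -1)
		err(1, "open %s", dev);
	if (ioctl(fd, DIOCGSECTORSIZE, &sectorsize) == -1)
		err(1, "DIOCGSECTORSIZE");
	if (ioctl(fd, DIOCGMEDIASIZE, &mediasize) == -1)
		err(1, "DIOCGMEDIASIZE");
	printf("%ju bytes, %u-byte sectors\n", (uintmax_t)mediasize,
	    sectorsize);

	/* Ask the zvol to free the first 1 MB; offset and length sector-aligned. */
	arg[0] = 0;			/* offset */
	arg[1] = 1024 * 1024;		/* length */
	if (ioctl(fd, DIOCGDELETE, arg) == -1)
		err(1, "DIOCGDELETE");
	return (0);
}
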
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/os/callb.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/os/callb.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/os/callb.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -368,7 +369,7 @@
mutex_exit(&ct->ct_lock);
}
-#ifdef sun
+#ifdef illumos
/*
* Return a boolean value indicating whether a particular kernel thread is
* stopped in accordance with the cpr callback protocol. If returning
@@ -432,7 +433,7 @@
mutex_exit(&ct->ct_lock);
return (ret_val);
}
-#endif /* sun */
+#endif /* illumos */
SYSINIT(sol_callb, SI_SUB_DRIVERS, SI_ORDER_FIRST, callb_init, NULL);
SYSUNINIT(sol_callb, SI_SUB_DRIVERS, SI_ORDER_FIRST, callb_fini, NULL);
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/os/fm.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/os/fm.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/os/fm.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -79,7 +80,7 @@
static const char *fm_msgid = "SUNOS-8000-0G";
static char *volatile fm_panicstr = NULL;
-#ifdef sun
+#ifdef illumos
errorq_t *ereport_errorq;
#endif
void *ereport_dumpbuf;
@@ -112,7 +113,7 @@
{ "payload-set-failed", KSTAT_DATA_UINT64 }
};
-#ifdef sun
+#ifdef illumos
/*ARGSUSED*/
static void
fm_drain(void *private, void *data, errorq_elem_t *eep)
@@ -131,7 +132,7 @@
{
kstat_t *ksp;
-#ifdef sun
+#ifdef illumos
(void) sysevent_evc_bind(FM_ERROR_CHAN,
&ereport_chan, EVCH_CREAT | EVCH_HOLD_PEND);
@@ -145,7 +146,7 @@
if (ereport_size == 0)
ereport_size = ERPT_DATA_SZ;
-#ifdef sun
+#ifdef illumos
ereport_errorq = errorq_nvcreate("fm_ereport_queue",
(errorq_func_t)fm_drain, NULL, ereport_qlen, ereport_size,
FM_ERR_PIL, ERRORQ_VITAL);
@@ -170,7 +171,7 @@
}
}
-#ifdef sun
+#ifdef illumos
/*
* Formatting utility function for fm_nvprintr. We attempt to wrap chunks of
* output so they aren't split across console lines, and return the end column.
@@ -379,7 +380,7 @@
{
va_list ap;
- (void) casptr((void *)&fm_panicstr, NULL, (void *)format);
+ (void) atomic_cas_ptr((void *)&fm_panicstr, NULL, (void *)format);
#if defined(__i386) || defined(__amd64)
fastreboot_disable_highpil();
#endif /* __i386 || __amd64 */
@@ -524,20 +525,20 @@
(void) nvlist_size(ereport, &nvl_size, NV_ENCODE_NATIVE);
if (nvl_size > ERPT_DATA_SZ || nvl_size == 0) {
- atomic_add_64(&erpt_kstat_data.erpt_dropped.value.ui64, 1);
+ atomic_inc_64(&erpt_kstat_data.erpt_dropped.value.ui64);
return;
}
-#ifdef sun
+#ifdef illumos
if (sysevent_evc_bind(FM_ERROR_CHAN, &error_chan,
EVCH_CREAT|EVCH_HOLD_PEND) != 0) {
- atomic_add_64(&erpt_kstat_data.erpt_dropped.value.ui64, 1);
+ atomic_inc_64(&erpt_kstat_data.erpt_dropped.value.ui64);
return;
}
if (sysevent_evc_publish(error_chan, EC_FM, ESC_FM_ERROR,
SUNW_VENDOR, FM_PUB, ereport, evc_flag) != 0) {
- atomic_add_64(&erpt_kstat_data.erpt_dropped.value.ui64, 1);
+ atomic_inc_64(&erpt_kstat_data.erpt_dropped.value.ui64);
(void) sysevent_evc_unbind(error_chan);
return;
}
@@ -803,8 +804,7 @@
va_end(ap);
if (ret)
- atomic_add_64(
- &erpt_kstat_data.payload_set_failed.value.ui64, 1);
+ atomic_inc_64(&erpt_kstat_data.payload_set_failed.value.ui64);
}
/*
@@ -837,7 +837,7 @@
int ret;
if (version != FM_EREPORT_VERS0) {
- atomic_add_64(&erpt_kstat_data.erpt_set_failed.value.ui64, 1);
+ atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
return;
}
@@ -844,17 +844,17 @@
(void) snprintf(ereport_class, FM_MAX_CLASS, "%s.%s",
FM_EREPORT_CLASS, erpt_class);
if (nvlist_add_string(ereport, FM_CLASS, ereport_class) != 0) {
- atomic_add_64(&erpt_kstat_data.erpt_set_failed.value.ui64, 1);
+ atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
return;
}
if (nvlist_add_uint64(ereport, FM_EREPORT_ENA, ena)) {
- atomic_add_64(&erpt_kstat_data.erpt_set_failed.value.ui64, 1);
+ atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
}
if (nvlist_add_nvlist(ereport, FM_EREPORT_DETECTOR,
(nvlist_t *)detector) != 0) {
- atomic_add_64(&erpt_kstat_data.erpt_set_failed.value.ui64, 1);
+ atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
}
va_start(ap, detector);
@@ -863,7 +863,7 @@
va_end(ap);
if (ret)
- atomic_add_64(&erpt_kstat_data.erpt_set_failed.value.ui64, 1);
+ atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
}
/*
@@ -886,19 +886,19 @@
fm_fmri_hc_set_common(nvlist_t *fmri, int version, const nvlist_t *auth)
{
if (version != FM_HC_SCHEME_VERSION) {
- atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
return (0);
}
if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0 ||
nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC) != 0) {
- atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
return (0);
}
if (auth != NULL && nvlist_add_nvlist(fmri, FM_FMRI_AUTHORITY,
(nvlist_t *)auth) != 0) {
- atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
return (0);
}
@@ -930,14 +930,14 @@
pairs[i] = fm_nvlist_create(nva);
if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, name) != 0 ||
nvlist_add_string(pairs[i], FM_FMRI_HC_ID, idstr) != 0) {
- atomic_add_64(
- &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ atomic_inc_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64);
}
}
va_end(ap);
if (nvlist_add_nvlist_array(fmri, FM_FMRI_HC_LIST, pairs, npairs) != 0)
- atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
for (i = 0; i < npairs; i++)
fm_nvlist_destroy(pairs[i], FM_NVA_RETAIN);
@@ -944,8 +944,8 @@
if (snvl != NULL) {
if (nvlist_add_nvlist(fmri, FM_FMRI_HC_SPECIFIC, snvl) != 0) {
- atomic_add_64(
- &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ atomic_inc_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64);
}
}
}
@@ -970,7 +970,7 @@
int err = 0;
if (version != DEV_SCHEME_VERSION0) {
- atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
return;
}
@@ -991,7 +991,7 @@
err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_TGTPTLUN0, tpl0);
if (err)
- atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
}
@@ -1016,35 +1016,35 @@
uint64_t *failedp = &erpt_kstat_data.fmri_set_failed.value.ui64;
if (version < CPU_SCHEME_VERSION1) {
- atomic_add_64(failedp, 1);
+ atomic_inc_64(failedp);
return;
}
if (nvlist_add_uint8(fmri_cpu, FM_VERSION, version) != 0) {
- atomic_add_64(failedp, 1);
+ atomic_inc_64(failedp);
return;
}
if (nvlist_add_string(fmri_cpu, FM_FMRI_SCHEME,
FM_FMRI_SCHEME_CPU) != 0) {
- atomic_add_64(failedp, 1);
+ atomic_inc_64(failedp);
return;
}
if (auth != NULL && nvlist_add_nvlist(fmri_cpu, FM_FMRI_AUTHORITY,
(nvlist_t *)auth) != 0)
- atomic_add_64(failedp, 1);
+ atomic_inc_64(failedp);
if (nvlist_add_uint32(fmri_cpu, FM_FMRI_CPU_ID, cpu_id) != 0)
- atomic_add_64(failedp, 1);
+ atomic_inc_64(failedp);
if (cpu_maskp != NULL && nvlist_add_uint8(fmri_cpu, FM_FMRI_CPU_MASK,
*cpu_maskp) != 0)
- atomic_add_64(failedp, 1);
+ atomic_inc_64(failedp);
if (serial_idp == NULL || nvlist_add_string(fmri_cpu,
FM_FMRI_CPU_SERIAL_ID, (char *)serial_idp) != 0)
- atomic_add_64(failedp, 1);
+ atomic_inc_64(failedp);
}
/*
@@ -1065,22 +1065,22 @@
const char *unum, const char *serial, uint64_t offset)
{
if (version != MEM_SCHEME_VERSION0) {
- atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
return;
}
if (!serial && (offset != (uint64_t)-1)) {
- atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
return;
}
if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0) {
- atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
return;
}
if (nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_MEM) != 0) {
- atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
return;
}
@@ -1087,27 +1087,25 @@
if (auth != NULL) {
if (nvlist_add_nvlist(fmri, FM_FMRI_AUTHORITY,
(nvlist_t *)auth) != 0) {
- atomic_add_64(
- &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ atomic_inc_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64);
}
}
if (nvlist_add_string(fmri, FM_FMRI_MEM_UNUM, unum) != 0) {
- atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
}
if (serial != NULL) {
if (nvlist_add_string_array(fmri, FM_FMRI_MEM_SERIAL_ID,
(char **)&serial, 1) != 0) {
- atomic_add_64(
- &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ atomic_inc_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64);
}
- if (offset != (uint64_t)-1) {
- if (nvlist_add_uint64(fmri, FM_FMRI_MEM_OFFSET,
- offset) != 0) {
- atomic_add_64(&erpt_kstat_data.
- fmri_set_failed.value.ui64, 1);
- }
+ if (offset != (uint64_t)-1 && nvlist_add_uint64(fmri,
+ FM_FMRI_MEM_OFFSET, offset) != 0) {
+ atomic_inc_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64);
}
}
}
@@ -1117,28 +1115,28 @@
uint64_t vdev_guid)
{
if (version != ZFS_SCHEME_VERSION0) {
- atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
return;
}
if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0) {
- atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
return;
}
if (nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS) != 0) {
- atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
return;
}
if (nvlist_add_uint64(fmri, FM_FMRI_ZFS_POOL, pool_guid) != 0) {
- atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
}
if (vdev_guid != 0) {
if (nvlist_add_uint64(fmri, FM_FMRI_ZFS_VDEV, vdev_guid) != 0) {
- atomic_add_64(
- &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ atomic_inc_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64);
}
}
}
@@ -1265,7 +1263,7 @@
return (time);
}
-#ifdef sun
+#ifdef illumos
/*
* Convert a getpcstack() trace to symbolic name+offset, and add the resulting
* string array to a Fault Management ereport as FM_EREPORT_PAYLOAD_NAME_STACK.
@@ -1293,7 +1291,7 @@
}
#endif
-#ifdef sun
+#ifdef illumos
void
print_msg_hwerr(ctid_t ct_id, proc_t *p)
{
@@ -1322,7 +1320,7 @@
*/
if (nvlist_lookup_nvlist_array(bboard, FM_FMRI_HC_LIST, &hcl, &n)
!= 0) {
- atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
return;
}
@@ -1329,13 +1327,13 @@
for (i = 0; i < n; i++) {
if (nvlist_lookup_string(hcl[i], FM_FMRI_HC_NAME,
&hcname) != 0) {
- atomic_add_64(
- &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ atomic_inc_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64);
return;
}
if (nvlist_lookup_string(hcl[i], FM_FMRI_HC_ID, &hcid) != 0) {
- atomic_add_64(
- &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ atomic_inc_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64);
return;
}
@@ -1347,8 +1345,8 @@
fm_nvlist_destroy(pairs[j],
FM_NVA_RETAIN);
}
- atomic_add_64(
- &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ atomic_inc_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64);
return;
}
}
@@ -1372,8 +1370,8 @@
fm_nvlist_destroy(pairs[j],
FM_NVA_RETAIN);
}
- atomic_add_64(
- &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ atomic_inc_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64);
return;
}
}
@@ -1384,7 +1382,7 @@
*/
if (nvlist_add_nvlist_array(fmri, FM_FMRI_HC_LIST, pairs,
npairs + n) != 0) {
- atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
return;
}
@@ -1394,8 +1392,8 @@
if (snvl != NULL) {
if (nvlist_add_nvlist(fmri, FM_FMRI_HC_SPECIFIC, snvl) != 0) {
- atomic_add_64(
- &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ atomic_inc_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64);
return;
}
}
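
The fm.c hunks above are a mechanical conversion from atomic_add_64(&counter, 1)
to atomic_inc_64(&counter), plus the sun -> illumos guard rename. A minimal
sketch of the pattern, for illustration only:

#include <sys/atomic.h>

static volatile uint64_t dropped;

static void
note_drop(void)
{
	/* Old spelling: atomic_add_64(&dropped, 1); */
	atomic_inc_64(&dropped);
}
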
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/os/list.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/os/list.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/os/list.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/os/nvpair_alloc_system.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/os/nvpair_alloc_system.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/os/nvpair_alloc_system.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/acl.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/acl.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/acl.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -19,6 +20,8 @@
* CDDL HEADER END
*/
/*
+ * Copyright 2014 Garrett D'Amore <garrett at damore.org>
+ *
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -187,11 +190,12 @@
ACE_DIRECTORY_INHERIT_ACE | \
ACE_NO_PROPAGATE_INHERIT_ACE | \
ACE_INHERIT_ONLY_ACE | \
+ ACE_INHERITED_ACE | \
ACE_IDENTIFIER_GROUP)
#define ACE_TYPE_FLAGS (ACE_OWNER|ACE_GROUP|ACE_EVERYONE| \
ACE_IDENTIFIER_GROUP)
-#define ACE_INHERIT_FLAGS (ACE_FILE_INHERIT_ACE| \
+#define ACE_INHERIT_FLAGS (ACE_FILE_INHERIT_ACE| ACL_INHERITED_ACE| \
ACE_DIRECTORY_INHERIT_ACE|ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE)
/* cmd args to acl(2) for aclent_t */
@@ -299,13 +303,8 @@
#endif /* !defined(_KERNEL) */
-#if defined(__STDC__)
extern int acl(const char *path, int cmd, int cnt, void *buf);
extern int facl(int fd, int cmd, int cnt, void *buf);
-#else /* !__STDC__ */
-extern int acl();
-extern int facl();
-#endif /* defined(__STDC__) */
#ifdef __cplusplus
}
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/acl_impl.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/acl_impl.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/acl_impl.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/asm_linkage.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/asm_linkage.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/asm_linkage.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/avl.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/avl.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/avl.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -23,6 +24,10 @@
* Use is subject to license terms.
*/
+/*
+ * Copyright (c) 2014 by Delphix. All rights reserved.
+ */
+
#ifndef _AVL_H
#define _AVL_H
@@ -39,7 +44,7 @@
#include <sys/avl_impl.h>
/*
- * This is a generic implemenatation of AVL trees for use in the Solaris kernel.
+ * This is a generic implementation of AVL trees for use in the Solaris kernel.
* The interfaces provide an efficient way of implementing an ordered set of
* data structures.
*
@@ -175,7 +180,7 @@
* Insert "new_data" in "tree" in the given "direction" either after
* or before the data "here".
*
- * This might be usefull for avl clients caching recently accessed
+ * This might be useful for avl clients caching recently accessed
* data to avoid doing avl_find() again for insertion.
*
* new_data - new data to insert
@@ -260,6 +265,11 @@
extern boolean_t avl_update_gt(avl_tree_t *, void *);
/*
+ * Swaps the contents of the two trees.
+ */
+extern void avl_swap(avl_tree_t *tree1, avl_tree_t *tree2);
+
+/*
* Return the number of nodes in the tree
*/
extern ulong_t avl_numnodes(avl_tree_t *tree);
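
The avl.h hunk above declares the new avl_swap(). A hedged usage sketch follows
(illustration only; the item type, comparator, and locking convention are
assumptions, not taken from this commit):

#include <sys/avl.h>

typedef struct item {
	uint64_t	i_key;
	avl_node_t	i_link;
} item_t;

static int
item_compare(const void *a, const void *b)
{
	const item_t *ia = a, *ib = b;

	if (ia->i_key < ib->i_key)
		return (-1);
	return (ia->i_key > ib->i_key);
}

/*
 * Exchange a fully built "staged" tree with the "active" one.  avl_swap()
 * does no locking itself, so the caller is assumed to hold whatever lock
 * guards both trees.  Both trees must have been created with the same
 * comparator and node offset, e.g.:
 *	avl_create(&t, item_compare, sizeof (item_t),
 *	    offsetof(item_t, i_link));
 */
static void
publish_staged(avl_tree_t *active, avl_tree_t *staged)
{
	avl_swap(active, staged);
}
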
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/avl_impl.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/avl_impl.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/avl_impl.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/bitmap.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/bitmap.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/bitmap.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -24,6 +25,10 @@
* Use is subject to license terms.
*/
+/*
+ * Copyright (c) 2014 by Delphix. All rights reserved.
+ */
+
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
/* All Rights Reserved */
@@ -31,8 +36,6 @@
#ifndef _SYS_BITMAP_H
#define _SYS_BITMAP_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -152,6 +155,7 @@
* Low order bit is 0, high order bit is 31.
*/
extern int highbit(ulong_t);
+extern int highbit64(uint64_t);
extern int lowbit(ulong_t);
extern int bt_getlowbit(ulong_t *bitmap, size_t start, size_t stop);
extern void bt_copy(ulong_t *, ulong_t *, ulong_t);
@@ -168,9 +172,9 @@
* to 0 otherwise.
*/
#define BT_ATOMIC_SET(bitmap, bitindex) \
- { atomic_or_long(&(BT_WIM(bitmap, bitindex)), BT_BIW(bitindex)); }
+ { atomic_or_ulong(&(BT_WIM(bitmap, bitindex)), BT_BIW(bitindex)); }
#define BT_ATOMIC_CLEAR(bitmap, bitindex) \
- { atomic_and_long(&(BT_WIM(bitmap, bitindex)), ~BT_BIW(bitindex)); }
+ { atomic_and_ulong(&(BT_WIM(bitmap, bitindex)), ~BT_BIW(bitindex)); }
#define BT_ATOMIC_SET_EXCL(bitmap, bitindex, result) \
{ result = atomic_set_long_excl(&(BT_WIM(bitmap, bitindex)), \
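
bitmap.h also grows a highbit64() declaration above. Based on the existing
highbit() contract (an assumption; this hunk does not spell it out), it should
return the 1-based position of the most-significant set bit and 0 for a zero
argument, so highbit64(x) - 1 is floor(log2(x)) for x > 0. A portable reference
sketch, for illustration only and not the kernel implementation:

#include <stdint.h>

static int
highbit64_ref(uint64_t v)
{
	int h = 0;

	while (v != 0) {
		v >>= 1;
		h++;
	}
	return (h);
}
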
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/callb.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/callb.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/callb.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/ccompile.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/ccompile.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/ccompile.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/cmn_err.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/cmn_err.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/cmn_err.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/compress.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/compress.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/compress.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/cpupart.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/cpupart.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/cpupart.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/cpuvar.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/cpuvar.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/cpuvar.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -21,6 +22,7 @@
/*
* Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2014 Igor Kozhukhov <ikozhukhov at gmail.com>.
*/
#ifndef _SYS_CPUVAR_H
@@ -31,6 +33,7 @@
#include <sys/disp.h>
#include <sys/processor.h>
+#include <sys/loadavg.h>
#if (defined(_KERNEL) || defined(_KMEMUSER)) && defined(_MACHDEP)
#include <sys/machcpuvar.h>
#endif
@@ -52,16 +55,7 @@
struct squeue_set_s;
#define CPU_CACHE_COHERENCE_SIZE 64
-#define S_LOADAVG_SZ 11
-#define S_MOVAVG_SZ 10
-struct loadavg_s {
- int lg_cur; /* current loadavg entry */
- unsigned int lg_len; /* number entries recorded */
- hrtime_t lg_total; /* used to temporarily hold load totals */
- hrtime_t lg_loads[S_LOADAVG_SZ]; /* table of recorded entries */
-};
-
/*
* For fast event tracing.
*/
@@ -524,8 +518,8 @@
largest = (uint_t)(highbit(set) - 1); \
}
-#define CPUSET_ATOMIC_DEL(set, cpu) atomic_and_long(&(set), ~CPUSET(cpu))
-#define CPUSET_ATOMIC_ADD(set, cpu) atomic_or_long(&(set), CPUSET(cpu))
+#define CPUSET_ATOMIC_DEL(set, cpu) atomic_and_ulong(&(set), ~CPUSET(cpu))
+#define CPUSET_ATOMIC_ADD(set, cpu) atomic_or_ulong(&(set), CPUSET(cpu))
#define CPUSET_ATOMIC_XADD(set, cpu, result) \
{ result = atomic_set_long_excl(&(set), (cpu)); }
@@ -655,7 +649,7 @@
void mach_cpu_pause(volatile char *);
-void pause_cpus(cpu_t *off_cp);
+void pause_cpus(cpu_t *off_cp, void *(*func)(void *));
void start_cpus(void);
int cpus_paused(void);
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/cred.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/cred.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/cred.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/ctf.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/ctf.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/ctf.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -27,7 +28,7 @@
#ifndef _CTF_H
#define _CTF_H
-#if defined(sun)
+#ifdef illumos
#pragma ident "%Z%%M% %I% %E% SMI"
#endif
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/ctf_api.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/ctf_api.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/ctf_api.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -23,6 +24,9 @@
* Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
+/*
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ */
/*
* This header file defines the interfaces available from the CTF debugger
@@ -40,8 +44,6 @@
#ifndef _CTF_API_H
#define _CTF_API_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/types.h>
#include <sys/param.h>
#include <sys/elf.h>
@@ -68,7 +70,7 @@
const char *cts_name; /* section name (if any) */
ulong_t cts_type; /* section type (ELF SHT_... value) */
ulong_t cts_flags; /* section flags (ELF SHF_... value) */
-#if defined(sun)
+#ifdef illumos
const void *cts_data; /* pointer to section data */
#else
void *cts_data; /* pointer to section data */
@@ -154,6 +156,7 @@
extern ctf_file_t *ctf_fdopen(int, int *);
extern ctf_file_t *ctf_open(const char *, int *);
extern ctf_file_t *ctf_create(int *);
+extern ctf_file_t *ctf_dup(ctf_file_t *);
extern void ctf_close(ctf_file_t *);
extern ctf_file_t *ctf_parent_file(ctf_file_t *);
@@ -179,6 +182,8 @@
extern ctf_id_t ctf_type_resolve(ctf_file_t *, ctf_id_t);
extern ssize_t ctf_type_lname(ctf_file_t *, ctf_id_t, char *, size_t);
extern char *ctf_type_name(ctf_file_t *, ctf_id_t, char *, size_t);
+extern char *ctf_type_qname(ctf_file_t *, ctf_id_t, char *, size_t,
+ const char *);
extern ssize_t ctf_type_size(ctf_file_t *, ctf_id_t);
extern ssize_t ctf_type_align(ctf_file_t *, ctf_id_t);
extern int ctf_type_kind(ctf_file_t *, ctf_id_t);
@@ -227,6 +232,8 @@
extern int ctf_set_array(ctf_file_t *, ctf_id_t, const ctf_arinfo_t *);
+extern int ctf_delete_type(ctf_file_t *, ctf_id_t);
+
extern int ctf_update(ctf_file_t *);
extern int ctf_discard(ctf_file_t *);
extern int ctf_write(ctf_file_t *, int);
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/debug.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/debug.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/debug.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -19,6 +20,8 @@
* CDDL HEADER END
*/
/*
+ * Copyright 2014 Garrett D'Amore <garrett at damore.org>
+ *
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -25,6 +28,7 @@
/*
* Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
*/
/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
@@ -47,7 +51,6 @@
* ASSERT and is evaluated on both debug and non-debug kernels.
*/
-#if defined(__STDC__)
extern int assfail(const char *, const char *, int);
#define VERIFY(EX) ((void)((EX) || assfail(#EX, __FILE__, __LINE__)))
#ifdef DEBUG
@@ -55,15 +58,6 @@
#else
#define ASSERT(x) ((void)0)
#endif
-#else /* defined(__STDC__) */
-extern int assfail();
-#define VERIFY(EX) ((void)((EX) || assfail("EX", __FILE__, __LINE__)))
-#ifdef DEBUG
-#define ASSERT(EX) ((void)((EX) || assfail("EX", __FILE__, __LINE__)))
-#else
-#define ASSERT(x) ((void)0)
-#endif
-#endif /* defined(__STDC__) */
/*
* Assertion variants sensitive to the compilation data model
@@ -131,6 +125,16 @@
#define ASSERT0(x) ((void)0)
#endif
+/*
+ * Compile-time assertion. The condition 'x' must be constant.
+ */
+#ifndef CTASSERT
+#define CTASSERT(x) _CTASSERT(x, __LINE__)
+#define _CTASSERT(x, y) __CTASSERT(x, y)
+#define __CTASSERT(x, y) \
+ typedef char __compile_time_assertion__ ## y [(x) ? 1 : -1]
+#endif
+
#ifdef _KERNEL
extern void abort_sequence_enter(char *);
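
The CTASSERT() added to debug.h above fails the build by declaring a char array
of negative size whenever its constant condition is false. A usage sketch,
for illustration only:

CTASSERT(sizeof (uint64_t) == 8);	/* compiles */
/* CTASSERT(sizeof (uint32_t) == 8);	would fail: array size -1 */
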
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -25,14 +26,13 @@
*/
/*
- * Copyright (c) 2011, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
*/
#ifndef _SYS_DTRACE_H
#define _SYS_DTRACE_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -53,9 +53,10 @@
#include <sys/types.h>
#include <sys/modctl.h>
#include <sys/processor.h>
-#if defined(sun)
+#ifdef illumos
#include <sys/systm.h>
#else
+#include <sys/cpuvar.h>
#include <sys/param.h>
#include <sys/linker.h>
#include <sys/ioccom.h>
@@ -63,8 +64,8 @@
typedef int model_t;
#endif
#include <sys/ctf_api.h>
+#ifdef illumos
#include <sys/cyclic.h>
-#if defined(sun)
#include <sys/int_limits.h>
#else
#include <sys/stdint.h>
@@ -255,7 +256,7 @@
#define DIF_VAR_ERRNO 0x0120 /* thread errno */
#define DIF_VAR_EXECARGS 0x0121 /* process arguments */
-#if !defined(sun)
+#ifndef illumos
#define DIF_VAR_CPU 0x0200
#endif
@@ -310,9 +311,12 @@
#define DIF_SUBR_SX_SHARED_HELD 48
#define DIF_SUBR_SX_EXCLUSIVE_HELD 49
#define DIF_SUBR_SX_ISEXCLUSIVE 50
+#define DIF_SUBR_MEMSTR 51
+#define DIF_SUBR_GETF 52
+#define DIF_SUBR_JSON 53
+#define DIF_SUBR_STRTOLL 54
+#define DIF_SUBR_MAX 54 /* max subroutine value */
-#define DIF_SUBR_MAX 50 /* max subroutine value */
-
typedef uint32_t dif_instr_t;
#define DIF_INSTR_OP(i) (((i) >> 24) & 0xff)
@@ -373,6 +377,7 @@
#define DIF_TYPE_STRING 1 /* type is a D string */
#define DIF_TF_BYREF 0x1 /* type is passed by reference */
+#define DIF_TF_BYUREF 0x2 /* user type is passed by reference */
/*
* A DTrace Intermediate Format variable record is used to describe each of the
@@ -722,6 +727,20 @@
#define DOF_SECF_LOAD 1 /* section should be loaded */
+#define DOF_SEC_ISLOADABLE(x) \
+ (((x) == DOF_SECT_ECBDESC) || ((x) == DOF_SECT_PROBEDESC) || \
+ ((x) == DOF_SECT_ACTDESC) || ((x) == DOF_SECT_DIFOHDR) || \
+ ((x) == DOF_SECT_DIF) || ((x) == DOF_SECT_STRTAB) || \
+ ((x) == DOF_SECT_VARTAB) || ((x) == DOF_SECT_RELTAB) || \
+ ((x) == DOF_SECT_TYPTAB) || ((x) == DOF_SECT_URELHDR) || \
+ ((x) == DOF_SECT_KRELHDR) || ((x) == DOF_SECT_OPTDESC) || \
+ ((x) == DOF_SECT_PROVIDER) || ((x) == DOF_SECT_PROBES) || \
+ ((x) == DOF_SECT_PRARGS) || ((x) == DOF_SECT_PROFFS) || \
+ ((x) == DOF_SECT_INTTAB) || ((x) == DOF_SECT_XLTAB) || \
+ ((x) == DOF_SECT_XLMEMBERS) || ((x) == DOF_SECT_XLIMPORT) || \
+ ((x) == DOF_SECT_XLIMPORT) || ((x) == DOF_SECT_XLEXPORT) || \
+ ((x) == DOF_SECT_PREXPORT) || ((x) == DOF_SECT_PRENOFFS))
+
typedef struct dof_ecbdesc {
dof_secidx_t dofe_probes; /* link to DOF_SECT_PROBEDESC */
dof_secidx_t dofe_pred; /* link to DOF_SECT_DIFOHDR */
@@ -930,10 +949,10 @@
* DTrace Metadata Description Structures
*
* DTrace separates the trace data stream from the metadata stream. The only
- * metadata tokens placed in the data stream are enabled probe identifiers
- * (EPIDs) or (in the case of aggregations) aggregation identifiers. In order
- * to determine the structure of the data, DTrace consumers pass the token to
- * the kernel, and receive in return a corresponding description of the enabled
+ * metadata tokens placed in the data stream are the dtrace_rechdr_t (EPID +
+ * timestamp) or (in the case of aggregations) aggregation identifiers. To
+ * determine the structure of the data, DTrace consumers pass the token to the
+ * kernel, and receive in return a corresponding description of the enabled
* probe (via the dtrace_eprobedesc structure) or the aggregation (via the
* dtrace_aggdesc structure). Both of these structures are expressed in terms
* of record descriptions (via the dtrace_recdesc structure) that describe the
@@ -1028,7 +1047,12 @@
#define DTRACEOPT_AGGSORTREV 24 /* reverse-sort aggregations */
#define DTRACEOPT_AGGSORTPOS 25 /* agg. position to sort on */
#define DTRACEOPT_AGGSORTKEYPOS 26 /* agg. key position to sort on */
-#define DTRACEOPT_MAX 27 /* number of options */
+#define DTRACEOPT_TEMPORAL 27 /* temporally ordered output */
+#define DTRACEOPT_AGGHIST 28 /* histogram aggregation output */
+#define DTRACEOPT_AGGPACK 29 /* packed aggregation output */
+#define DTRACEOPT_AGGZOOM 30 /* zoomed aggregation scaling */
+#define DTRACEOPT_ZONE 31 /* zone in which to enable probes */
+#define DTRACEOPT_MAX 32 /* number of options */
#define DTRACEOPT_UNSET (dtrace_optval_t)-2 /* unset option */
@@ -1048,7 +1072,9 @@
* where user-level wishes the kernel to snapshot the buffer to (the
* dtbd_data field). The kernel uses the same structure to pass back some
* information regarding the buffer: the size of data actually copied out, the
- * number of drops, the number of errors, and the offset of the oldest record.
+ * number of drops, the number of errors, the offset of the oldest record,
+ * and the time of the snapshot.
+ *
* If the buffer policy is a "switch" policy, taking a snapshot of the
* principal buffer has the additional effect of switching the active and
* inactive buffers. Taking a snapshot of the aggregation buffer _always_ has
@@ -1061,9 +1087,30 @@
uint64_t dtbd_drops; /* number of drops */
DTRACE_PTR(char, dtbd_data); /* data */
uint64_t dtbd_oldest; /* offset of oldest record */
+ uint64_t dtbd_timestamp; /* hrtime of snapshot */
} dtrace_bufdesc_t;
/*
+ * Each record in the buffer (dtbd_data) begins with a header that includes
+ * the epid and a timestamp. The timestamp is split into two 4-byte parts
+ * so that we do not require 8-byte alignment.
+ */
+typedef struct dtrace_rechdr {
+ dtrace_epid_t dtrh_epid; /* enabled probe id */
+ uint32_t dtrh_timestamp_hi; /* high bits of hrtime_t */
+ uint32_t dtrh_timestamp_lo; /* low bits of hrtime_t */
+} dtrace_rechdr_t;
+
+#define DTRACE_RECORD_LOAD_TIMESTAMP(dtrh) \
+ ((dtrh)->dtrh_timestamp_lo + \
+ ((uint64_t)(dtrh)->dtrh_timestamp_hi << 32))
+
+#define DTRACE_RECORD_STORE_TIMESTAMP(dtrh, hrtime) { \
+ (dtrh)->dtrh_timestamp_lo = (uint32_t)hrtime; \
+ (dtrh)->dtrh_timestamp_hi = hrtime >> 32; \
+}
+
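For reference, the split-timestamp encoding above round-trips a 64-bit hrtime through the two 32-bit halves. The user-space sketch below (not part of this change) copies the struct and the two macros from the header; the stand-in dtrace_epid_t typedef, the main() function, and the sample value are illustrative only.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef uint32_t dtrace_epid_t;         /* stand-in for the kernel typedef */

typedef struct dtrace_rechdr {
        dtrace_epid_t dtrh_epid;        /* enabled probe id */
        uint32_t dtrh_timestamp_hi;     /* high bits of hrtime_t */
        uint32_t dtrh_timestamp_lo;     /* low bits of hrtime_t */
} dtrace_rechdr_t;

#define DTRACE_RECORD_LOAD_TIMESTAMP(dtrh)                      \
        ((dtrh)->dtrh_timestamp_lo +                            \
        ((uint64_t)(dtrh)->dtrh_timestamp_hi << 32))

#define DTRACE_RECORD_STORE_TIMESTAMP(dtrh, hrtime) {           \
        (dtrh)->dtrh_timestamp_lo = (uint32_t)hrtime;           \
        (dtrh)->dtrh_timestamp_hi = hrtime >> 32;               \
}

int
main(void)
{
        dtrace_rechdr_t hdr = { .dtrh_epid = 7 };
        uint64_t now = 0x123456789abcdef0ULL;   /* pretend hrtime_t value */

        DTRACE_RECORD_STORE_TIMESTAMP(&hdr, now);
        /* The value reassembles exactly from the two 32-bit fields. */
        assert(DTRACE_RECORD_LOAD_TIMESTAMP(&hdr) == now);
        printf("epid %u ts %#llx\n", hdr.dtrh_epid,
            (unsigned long long)DTRACE_RECORD_LOAD_TIMESTAMP(&hdr));
        return (0);
}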
+/*
* DTrace Status
*
* The status of DTrace is relayed via the dtrace_status structure. This
@@ -1229,7 +1276,7 @@
* pseudodevice driver. These ioctls comprise the user-kernel interface to
* DTrace.
*/
-#if defined(sun)
+#ifdef illumos
#define DTRACEIOC (('d' << 24) | ('t' << 16) | ('r' << 8))
#define DTRACEIOC_PROVIDER (DTRACEIOC | 1) /* provider query */
#define DTRACEIOC_PROBES (DTRACEIOC | 2) /* probe query */
@@ -1360,7 +1407,7 @@
* helpers and should no longer be used. No other ioctls are valid on the
* helper minor node.
*/
-#if defined(sun)
+#ifdef illumos
#define DTRACEHIOC (('d' << 24) | ('t' << 16) | ('h' << 8))
#define DTRACEHIOC_ADD (DTRACEHIOC | 1) /* add helper */
#define DTRACEHIOC_REMOVE (DTRACEHIOC | 2) /* remove helper */
@@ -1375,7 +1422,7 @@
char dofhp_mod[DTRACE_MODNAMELEN]; /* executable or library name */
uint64_t dofhp_addr; /* base address of object */
uint64_t dofhp_dof; /* address of helper DOF */
-#if !defined(sun)
+#ifndef illumos
int gen;
#endif
} dof_helper_t;
@@ -1680,7 +1727,22 @@
*
* 1.10.3 Return value
*
- * A boolean value.
+ * A bitwise OR that encapsulates both the mode (either DTRACE_MODE_KERNEL
+ * or DTRACE_MODE_USER) and the policy when the privilege of the enabling
+ * is insufficient for that mode (a combination of DTRACE_MODE_NOPRIV_DROP,
+ * DTRACE_MODE_NOPRIV_RESTRICT, and DTRACE_MODE_LIMITEDPRIV_RESTRICT). If the
+ * DTRACE_MODE_NOPRIV_DROP bit is set, insufficient privilege will result
+ * in the probe firing being silently ignored for the enabling; if the
+ * DTRACE_MODE_NOPRIV_RESTRICT bit is set, insufficient privilege will not
+ * prevent probe processing for the enabling, but restrictions will be in
+ * place that induce a UPRIV fault upon attempt to examine probe arguments
+ * or current process state. If the DTRACE_MODE_LIMITEDPRIV_RESTRICT bit
+ * is set, similar restrictions will be placed upon operation if the
+ * privilege is sufficient to process the enabling, but does not otherwise
+ * entitle the enabling to all zones. The DTRACE_MODE_NOPRIV_DROP and
+ * DTRACE_MODE_NOPRIV_RESTRICT are mutually exclusive (and one of these
+ * two policies must be specified), but either may be combined (or not)
+ * with DTRACE_MODE_LIMITEDPRIV_RESTRICT.
*
* 1.10.4 Caller's context
*
@@ -2075,6 +2137,12 @@
void (*dtps_destroy)(void *arg, dtrace_id_t id, void *parg);
} dtrace_pops_t;
+#define DTRACE_MODE_KERNEL 0x01
+#define DTRACE_MODE_USER 0x02
+#define DTRACE_MODE_NOPRIV_DROP 0x10
+#define DTRACE_MODE_NOPRIV_RESTRICT 0x20
+#define DTRACE_MODE_LIMITEDPRIV_RESTRICT 0x40
+
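To show how the dtps_mode return value described in section 1.10.3 is assembled from these bits, here is a hedged user-space sketch. The flag values are copied from the header; the function name and its simplified signature (the real callback takes provider and probe arguments) are illustrative assumptions only.

#include <stdint.h>
#include <stdio.h>

/* Flag values copied from the header for a standalone illustration. */
#define DTRACE_MODE_KERNEL                      0x01
#define DTRACE_MODE_USER                        0x02
#define DTRACE_MODE_NOPRIV_DROP                 0x10
#define DTRACE_MODE_NOPRIV_RESTRICT             0x20
#define DTRACE_MODE_LIMITEDPRIV_RESTRICT        0x40

/*
 * Hypothetical mode callback for a user-land provider: probe arguments
 * live in user context, and enablings without sufficient privilege are
 * silently dropped rather than restricted.
 */
static uint32_t
example_pid_mode(void)
{
        return (DTRACE_MODE_USER | DTRACE_MODE_NOPRIV_DROP);
}

int
main(void)
{
        printf("mode %#x\n", (unsigned)example_pid_mode());
        return (0);
}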
typedef uintptr_t dtrace_provider_id_t;
extern int dtrace_register(const char *, const dtrace_pattr_t *, uint32_t,
@@ -2256,7 +2324,7 @@
DTRACE_VTIME_ACTIVE_TNF /* DTrace virtual time _and_ TNF */
} dtrace_vtime_state_t;
-#if defined(sun)
+#ifdef illumos
extern dtrace_vtime_state_t dtrace_vtime_active;
#endif
extern void dtrace_vtime_switch(kthread_t *next);
@@ -2268,7 +2336,7 @@
struct regs;
struct reg;
-#if defined(sun)
+#ifdef illumos
extern int (*dtrace_pid_probe_ptr)(struct reg *);
extern int (*dtrace_return_probe_ptr)(struct reg *);
extern void (*dtrace_fasttrap_fork_ptr)(proc_t *, proc_t *);
@@ -2287,18 +2355,21 @@
extern void dtrace_membar_consumer(void);
extern void (*dtrace_cpu_init)(processorid_t);
+#ifdef illumos
extern void (*dtrace_modload)(modctl_t *);
extern void (*dtrace_modunload)(modctl_t *);
+#endif
extern void (*dtrace_helpers_cleanup)(void);
extern void (*dtrace_helpers_fork)(proc_t *parent, proc_t *child);
extern void (*dtrace_cpustart_init)(void);
extern void (*dtrace_cpustart_fini)(void);
+extern void (*dtrace_closef)(void);
extern void (*dtrace_debugger_init)(void);
extern void (*dtrace_debugger_fini)(void);
extern dtrace_cacheid_t dtrace_predcache_id;
-#if defined(sun)
+#ifdef illumos
extern hrtime_t dtrace_gethrtime(void);
#else
void dtrace_debug_printf(const char *, ...) __printflike(1, 2);
@@ -2317,10 +2388,10 @@
#if defined(__i386) || defined(__amd64)
extern int dtrace_instr_size(uchar_t *instr);
extern int dtrace_instr_size_isa(uchar_t *, model_t, int *);
+extern void dtrace_invop_callsite(void);
+#endif
extern void dtrace_invop_add(int (*)(uintptr_t, uintptr_t *, uintptr_t));
extern void dtrace_invop_remove(int (*)(uintptr_t, uintptr_t *, uintptr_t));
-extern void dtrace_invop_callsite(void);
-#endif
#ifdef __sparc
extern int dtrace_blksuword32(uintptr_t, uint32_t *, int);
@@ -2327,7 +2398,7 @@
extern void dtrace_getfsr(uint64_t *);
#endif
-#if !defined(sun)
+#ifndef illumos
extern void dtrace_helpers_duplicate(proc_t *, proc_t *);
extern void dtrace_helpers_destroy(proc_t *);
#endif
@@ -2353,6 +2424,15 @@
#define DTRACE_INVOP_NOP 4
#define DTRACE_INVOP_RET 5
+#elif defined(__powerpc__)
+
+#define DTRACE_INVOP_RET 1
+#define DTRACE_INVOP_BCTR 2
+#define DTRACE_INVOP_BLR 3
+#define DTRACE_INVOP_JUMP 4
+#define DTRACE_INVOP_MFLR_R0 5
+#define DTRACE_INVOP_NOP 6
+
#endif
#ifdef __cplusplus
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace_impl.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace_impl.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace_impl.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -18,11 +19,12 @@
*
* CDDL HEADER END
*
- * $FreeBSD: release/9.2.0/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace_impl.h 250484 2013-05-10 21:12:55Z pfg $
+ * $FreeBSD: stable/10/sys/cddl/contrib/opensolaris/uts/common/sys/dtrace_impl.h 313486 2017-02-09 22:04:56Z ngie $
*/
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
* Use is subject to license terms.
*/
@@ -49,7 +51,7 @@
*/
#include <sys/dtrace.h>
-#if !defined(sun)
+#ifndef illumos
#ifdef __sparcv9
typedef uint32_t pc_t;
#else
@@ -209,15 +211,18 @@
* predicate is non-NULL, the DIF object is executed. If the result is
* non-zero, the action list is processed, with each action being executed
* accordingly. When the action list has been completely executed, processing
- * advances to the next ECB. processing advances to the next ECB. If the
- * result is non-zero; For each ECB, it first determines the The ECB
- * abstraction allows disjoint consumers to multiplex on single probes.
+ * advances to the next ECB. The ECB abstraction allows disjoint consumers
+ * to multiplex on single probes.
+ *
+ * Execution of the ECB results in consuming dte_size bytes in the buffer
+ * to record data. During execution, dte_needed bytes must be available in
+ * the buffer. This space is used for both recorded data and tuple data.
*/
struct dtrace_ecb {
dtrace_epid_t dte_epid; /* enabled probe ID */
uint32_t dte_alignment; /* required alignment */
- size_t dte_needed; /* bytes needed */
- size_t dte_size; /* total size of payload */
+ size_t dte_needed; /* space needed for execution */
+ size_t dte_size; /* size of recorded payload */
dtrace_predicate_t *dte_predicate; /* predicate, if any */
dtrace_action_t *dte_action; /* actions, if any */
dtrace_ecb_t *dte_next; /* next ECB on probe */
@@ -275,27 +280,30 @@
* the EPID, the consumer can determine the data layout. (The data buffer
* layout is shown schematically below.) By assuring that one can determine
* data layout from the EPID, the metadata stream can be separated from the
- * data stream -- simplifying the data stream enormously.
+ * data stream -- simplifying the data stream enormously. Each record is
+ * preceded by a dtrace_rechdr_t header that includes the EPID and a
+ * high-resolution timestamp used for output ordering consistency, as
+ * shown in the layout below.
*
- * base of data buffer ---> +------+--------------------+------+
- * | EPID | data | EPID |
- * +------+--------+------+----+------+
- * | data | EPID | data |
- * +---------------+------+-----------+
- * | data, cont. |
- * +------+--------------------+------+
- * | EPID | data | |
- * +------+--------------------+ |
- * | || |
- * | || |
- * | \/ |
- * : :
- * . .
- * . .
- * . .
- * : :
- * | |
- * limit of data buffer ---> +----------------------------------+
+ * base of data buffer ---> +--------+--------------------+--------+
+ * | rechdr | data | rechdr |
+ * +--------+------+--------+----+--------+
+ * | data | rechdr | data |
+ * +---------------+--------+-------------+
+ * | data, cont. |
+ * +--------+--------------------+--------+
+ * | rechdr | data | |
+ * +--------+--------------------+ |
+ * | || |
+ * | || |
+ * | \/ |
+ * : :
+ * . .
+ * . .
+ * . .
+ * : :
+ * | |
+ * limit of data buffer ---> +--------------------------------------+
*
* When evaluating an ECB, dtrace_probe() determines if the ECB's needs of the
* principal buffer (both scratch and payload) exceed the available space. If
@@ -927,6 +935,7 @@
uintptr_t dtms_strtok; /* saved strtok() pointer */
uint32_t dtms_access; /* memory access rights */
dtrace_difo_t *dtms_difo; /* current dif object */
+ file_t *dtms_getf; /* cached rval of getf() */
} dtrace_mstate_t;
#define DTRACE_COND_OWNER 0x1
@@ -1114,7 +1123,7 @@
* dtrace_state structure.
*/
struct dtrace_state {
-#if defined(sun)
+#ifdef illumos
dev_t dts_dev; /* device */
#else
struct cdev *dts_dev; /* device */
@@ -1132,7 +1141,7 @@
int dts_nspeculations; /* number of speculations */
int dts_naggregations; /* number of aggregations */
dtrace_aggregation_t **dts_aggregations; /* aggregation array */
-#if defined(sun)
+#ifdef illumos
vmem_t *dts_aggid_arena; /* arena for aggregation IDs */
#else
struct unrhdr *dts_aggid_arena; /* arena for aggregation IDs */
@@ -1144,7 +1153,7 @@
uint32_t dts_dblerrors; /* errors in ERROR probes */
uint32_t dts_reserve; /* space reserved for END */
hrtime_t dts_laststatus; /* time of last status */
-#if defined(sun)
+#ifdef illumos
cyclic_id_t dts_cleaner; /* cleaning cyclic */
cyclic_id_t dts_deadman; /* deadman cyclic */
#else
@@ -1159,6 +1168,7 @@
dtrace_optval_t dts_options[DTRACEOPT_MAX]; /* options */
dtrace_cred_t dts_cred; /* credentials */
size_t dts_nretained; /* number of retained enabs */
+ int dts_getf; /* number of getf() calls */
};
struct dtrace_provider {
@@ -1261,7 +1271,11 @@
uintptr_t dtt_limit; /* limit of toxic range */
} dtrace_toxrange_t;
+#ifdef illumos
extern uint64_t dtrace_getarg(int, int);
+#else
+extern uint64_t __noinline dtrace_getarg(int, int);
+#endif
extern greg_t dtrace_getfp(void);
extern int dtrace_getipl(void);
extern uintptr_t dtrace_caller(int);
@@ -1287,7 +1301,7 @@
int, uintptr_t);
extern int dtrace_assfail(const char *, const char *, int);
extern int dtrace_attached(void);
-#if defined(sun)
+#ifdef illumos
extern hrtime_t dtrace_gethrestime(void);
#endif
@@ -1304,16 +1318,19 @@
/*
* DTrace Assertions
*
- * DTrace calls ASSERT from probe context. To assure that a failed ASSERT
- * does not induce a markedly more catastrophic failure (e.g., one from which
- * a dump cannot be gleaned), DTrace must define its own ASSERT to be one that
- * may safely be called from probe context. This header file must thus be
- * included by any DTrace component that calls ASSERT from probe context, and
- * _only_ by those components. (The only exception to this is kernel
- * debugging infrastructure at user-level that doesn't depend on calling
- * ASSERT.)
+ * DTrace calls ASSERT and VERIFY from probe context. To assure that a failed
+ * ASSERT or VERIFY does not induce a markedly more catastrophic failure (e.g.,
+ * one from which a dump cannot be gleaned), DTrace must define its own ASSERT
+ * and VERIFY macros to be ones that may safely be called from probe context.
+ * This header file must thus be included by any DTrace component that calls
+ * ASSERT and/or VERIFY from probe context, and _only_ by those components.
+ * (The only exception to this is kernel debugging infrastructure at user-level
+ * that doesn't depend on calling ASSERT.)
*/
#undef ASSERT
+#undef VERIFY
+#define VERIFY(EX) ((void)((EX) || \
+ dtrace_assfail(#EX, __FILE__, __LINE__)))
#ifdef DEBUG
#define ASSERT(EX) ((void)((EX) || \
dtrace_assfail(#EX, __FILE__, __LINE__)))
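The practical difference between the two macros: VERIFY is compiled in unconditionally, while ASSERT still vanishes unless DEBUG is defined. A standalone sketch, with dtrace_assfail() replaced by a hypothetical stub:

#include <stdio.h>

/* Stub standing in for the kernel's dtrace_assfail(). */
static int
dtrace_assfail(const char *expr, const char *file, int line)
{
        fprintf(stderr, "assertion failed: %s, %s:%d\n", expr, file, line);
        return (0);
}

#define VERIFY(EX)      ((void)((EX) || \
                        dtrace_assfail(#EX, __FILE__, __LINE__)))
#ifdef DEBUG
#define ASSERT(EX)      ((void)((EX) || \
                        dtrace_assfail(#EX, __FILE__, __LINE__)))
#else
#define ASSERT(EX)      ((void)0)
#endif

int
main(void)
{
        ASSERT(1 + 1 == 3);     /* compiled out unless built with -DDEBUG */
        VERIFY(1 + 1 == 3);     /* always checked; prints a diagnostic */
        return (0);
}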
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/errorq.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/errorq.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/errorq.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/extdirent.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/extdirent.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/extdirent.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fasttrap.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fasttrap.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fasttrap.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -37,13 +38,13 @@
extern "C" {
#endif
-#if defined(sun)
+#ifdef illumos
#define FASTTRAPIOC (('m' << 24) | ('r' << 16) | ('f' << 8))
#define FASTTRAPIOC_MAKEPROBE (FASTTRAPIOC | 1)
#define FASTTRAPIOC_GETINSTR (FASTTRAPIOC | 2)
#else
-#define FASTTRAPIOC_MAKEPROBE _IOW('f', 1, fasttrap_probe_spec_t)
#define FASTTRAPIOC_GETINSTR _IOWR('f', 2, uint8_t)
+#define FASTTRAPIOC_MAKEPROBE _IO('f', 3)
#endif
typedef enum fasttrap_probe_type {
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fasttrap_impl.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fasttrap_impl.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fasttrap_impl.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -32,6 +33,7 @@
#include <sys/types.h>
#include <sys/dtrace.h>
#include <sys/proc.h>
+#include <sys/queue.h>
#include <sys/fasttrap.h>
#include <sys/fasttrap_isa.h>
@@ -68,8 +70,30 @@
* then disabled, ownership of that tracepoint may be exchanged for an
* unused tracepoint belonging to another probe that was attached to the
* enabled tracepoint.
+ *
+ * On FreeBSD, fasttrap providers also maintain per-thread scratch space for use
+ * by the ISA-specific fasttrap code. The fasttrap_scrblock_t type stores the
+ * virtual address of a page-sized memory block that is mapped into a process'
+ * address space. Each block is carved up into chunks (fasttrap_scrspace_t) for
+ * use by individual threads, which keep the address of their scratch space
+ * chunk in their struct kdtrace_thread. A thread's scratch space isn't released
+ * until it exits.
*/
+#ifndef illumos
+typedef struct fasttrap_scrblock {
+ vm_offset_t ftsb_addr; /* address of a scratch block */
+ LIST_ENTRY(fasttrap_scrblock) ftsb_next;/* next block in list */
+} fasttrap_scrblock_t;
+#define FASTTRAP_SCRBLOCK_SIZE PAGE_SIZE
+
+typedef struct fasttrap_scrspace {
+ uintptr_t ftss_addr; /* scratch space address */
+ LIST_ENTRY(fasttrap_scrspace) ftss_next;/* next in list */
+} fasttrap_scrspace_t;
+#define FASTTRAP_SCRSPACE_SIZE 64
+#endif
+
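As an illustration of the bookkeeping described in the comment above, the sketch below carves one page-sized block into FASTTRAP_SCRSPACE_SIZE-byte chunks and threads them onto a free list. It is a user-space approximation only: the types are simplified stand-ins, and the kernel maps the block into the traced process' address space rather than calling malloc().

#include <sys/queue.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define FASTTRAP_SCRBLOCK_SIZE  4096    /* PAGE_SIZE stand-in */
#define FASTTRAP_SCRSPACE_SIZE  64

typedef struct fasttrap_scrspace {
        uintptr_t ftss_addr;                    /* scratch space address */
        LIST_ENTRY(fasttrap_scrspace) ftss_next;/* next in list */
} fasttrap_scrspace_t;

int
main(void)
{
        LIST_HEAD(, fasttrap_scrspace) free_list =
            LIST_HEAD_INITIALIZER(free_list);
        uintptr_t block = (uintptr_t)malloc(FASTTRAP_SCRBLOCK_SIZE);
        int n = 0;

        /* Carve the block into per-thread scratch chunks. */
        for (uintptr_t a = block; a + FASTTRAP_SCRSPACE_SIZE <=
            block + FASTTRAP_SCRBLOCK_SIZE; a += FASTTRAP_SCRSPACE_SIZE) {
                fasttrap_scrspace_t *s = malloc(sizeof(*s));
                s->ftss_addr = a;
                LIST_INSERT_HEAD(&free_list, s, ftss_next);
                n++;
        }
        printf("%d scratch chunks per block\n", n);     /* prints 64 */
        return (0);
}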
typedef struct fasttrap_proc {
pid_t ftpc_pid; /* process ID for this proc */
uint64_t ftpc_acount; /* count of active providers */
@@ -76,6 +100,11 @@
uint64_t ftpc_rcount; /* count of extant providers */
kmutex_t ftpc_mtx; /* lock on all but acount */
struct fasttrap_proc *ftpc_next; /* next proc in hash chain */
+#ifndef illumos
+ LIST_HEAD(, fasttrap_scrblock) ftpc_scrblks; /* mapped scratch blocks */
+ LIST_HEAD(, fasttrap_scrspace) ftpc_fscr; /* free scratch space */
+ LIST_HEAD(, fasttrap_scrspace) ftpc_ascr; /* used scratch space */
+#endif
} fasttrap_proc_t;
typedef struct fasttrap_provider {
@@ -158,22 +187,30 @@
*/
#define fasttrap_copyout copyout
#define fasttrap_fuword32 fuword32
-#define fasttrap_suword32(_k, _u) copyout((_k), (_u), sizeof(uint32_t))
-#define fasttrap_suword64(_k, _u) copyout((_k), (_u), sizeof(uint64_t))
+#define fasttrap_suword32 suword32
+#define fasttrap_suword64 suword64
#ifdef __amd64__
#define fasttrap_fulword fuword64
-#define fasttrap_sulword fasttrap_suword64
+#define fasttrap_sulword suword64
#else
#define fasttrap_fulword fuword32
-#define fasttrap_sulword fasttrap_suword32
+#define fasttrap_sulword suword32
#endif
extern void fasttrap_sigtrap(proc_t *, kthread_t *, uintptr_t);
+#ifndef illumos
+extern fasttrap_scrspace_t *fasttrap_scraddr(struct thread *,
+ fasttrap_proc_t *);
+#endif
extern dtrace_id_t fasttrap_probe_id;
extern fasttrap_hash_t fasttrap_tpoints;
+#ifndef illumos
+extern struct rmlock fasttrap_tp_lock;
+#endif
+
#define FASTTRAP_TPOINTS_INDEX(pid, pc) \
(((pc) / sizeof (fasttrap_instr_t) + (pid)) & fasttrap_tpoints.fth_mask)
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/feature_tests.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/feature_tests.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/feature_tests.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -20,6 +21,8 @@
*/
/*
+ * Copyright 2013 Garrett D'Amore <garrett at damore.org>
+ *
* Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -42,6 +45,7 @@
* 199309L POSIX.1b-1993 compilation (Real Time)
* 199506L POSIX.1c-1995 compilation (POSIX Threads)
* 200112L POSIX.1-2001 compilation (Austin Group Revision)
+ * 200809L POSIX.1-2008 compilation
*/
#if defined(_POSIX_SOURCE) && !defined(_POSIX_C_SOURCE)
#define _POSIX_C_SOURCE 1
@@ -48,9 +52,9 @@
#endif
/*
- * The feature test macros __XOPEN_OR_POSIX, _STRICT_STDC, and _STDC_C99
- * are Sun implementation specific macros created in order to compress
- * common standards specified feature test macros for easier reading.
+ * The feature test macros __XOPEN_OR_POSIX, _STRICT_STDC, _STRICT_SYMBOLS,
+ * and _STDC_C99 are Sun implementation specific macros created in order to
+ * compress common standards specified feature test macros for easier reading.
* These macros should not be used by the application developer as
* unexpected results may occur. Instead, the user should reference
* standards(5) for correct usage of the standards feature test macros.
@@ -76,6 +80,10 @@
* the C standard. A value of 199901L indicates a
* compiler that complies with ISO/IEC 9899:1999, other-
* wise known as the C99 standard.
+ *
+ * _STRICT_SYMBOLS Used in cases where symbol visibility is restricted
+ * by the standards, and the user has not explicitly
+ * relaxed the strictness via __EXTENSIONS__.
*/
#if defined(_XOPEN_SOURCE) || defined(_POSIX_C_SOURCE)
@@ -145,6 +153,14 @@
#endif
/*
+ * Use strict symbol visibility.
+ */
+#if (defined(_STRICT_STDC) || defined(__XOPEN_OR_POSIX)) && \
+ !defined(__EXTENSIONS__)
+#define _STRICT_SYMBOLS
+#endif
+
+/*
* Large file interfaces:
*
* _LARGEFILE_SOURCE
@@ -223,6 +239,8 @@
* X/Open CAE Specification, Issue 5 (XPG5)
* Open Group Technical Standard, Issue 6 (XPG6), also referred to as
* IEEE Std. 1003.1-2001 and ISO/IEC 9945:2002.
+ * Open Group Technical Standard, Issue 7 (XPG7), also referred to as
+ * IEEE Std. 1003.1-2008 and ISO/IEC 9945:2009.
*
* XPG4v2 is also referred to as UNIX 95 (SUS or SUSv1).
* XPG5 is also referred to as UNIX 98 or the Single Unix Specification,
@@ -230,6 +248,7 @@
* XPG6 is the result of a merge of the X/Open and POSIX specifications
* and as such is also referred to as IEEE Std. 1003.1-2001 in
* addition to UNIX 03 and SUSv3.
+ * XPG7 is also referred to as UNIX 08 and SUSv4.
*
* When writing a conforming X/Open application, as per the specification
* requirements, the appropriate feature test macros must be defined at
@@ -242,6 +261,7 @@
* _XOPEN_SOURCE && _XOPEN_SOURCE_EXTENDED = 1 XPG4v2
* _XOPEN_SOURCE = 500 XPG5
* _XOPEN_SOURCE = 600 (or POSIX_C_SOURCE=200112L) XPG6
+ * _XOPEN_SOURCE = 700 (or POSIX_C_SOURCE=200809L) XPG7
*
* In order to simplify the guards within the headers, the following
* implementation private test macros have been created. Applications
@@ -261,6 +281,7 @@
* _XPG4_2 X/Open CAE Specification, Issue 4, Version 2 (XPG4v2/UNIX 95/SUS)
* _XPG5 X/Open CAE Specification, Issue 5 (XPG5/UNIX 98/SUSv2)
* _XPG6 Open Group Technical Standard, Issue 6 (XPG6/UNIX 03/SUSv3)
+ * _XPG7 Open Group Technical Standard, Issue 7 (XPG7/UNIX 08/SUSv4)
*/
/* X/Open Portability Guide, Issue 3 */
@@ -295,6 +316,19 @@
#define _POSIX_C_SOURCE 200112L
#undef _XOPEN_SOURCE
#define _XOPEN_SOURCE 600
+
+/* Open Group Technical Standard, Issue 7 */
+#elif (_XOPEN_SOURCE - 0 == 700) || (_POSIX_C_SOURCE - 0 == 200809L)
+#define _XPG7
+#define _XPG6
+#define _XPG5
+#define _XPG4_2
+#define _XPG4
+#define _XPG3
+#undef _POSIX_C_SOURCE
+#define _POSIX_C_SOURCE 200809L
+#undef _XOPEN_SOURCE
+#define _XOPEN_SOURCE 700
#endif
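For completeness, an application opts into the new XPG7/SUSv4 level by defining the feature test macro before any header is included; with the Solaris-derived logic above this selects _XPG7 and an _XOPEN_VERSION of 700 (other platforms simply report whatever their libc provides). A minimal sketch:

/*
 * Request the 2008 interfaces before any system header is included.
 */
#define _XOPEN_SOURCE 700       /* or: #define _POSIX_C_SOURCE 200809L */

#include <unistd.h>
#include <stdio.h>

int
main(void)
{
#ifdef _XOPEN_VERSION
        printf("_XOPEN_VERSION = %d\n", _XOPEN_VERSION);
#endif
        return (0);
}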
/*
@@ -305,12 +339,15 @@
* with the value of 4 indicates an XPG4 or XPG4v2 (UNIX 95) application.
* _XOPEN_VERSION defined with a value of 500 indicates an XPG5 (UNIX 98)
* application and with a value of 600 indicates an XPG6 (UNIX 03)
- * application. The appropriate version is determined by the use of the
+ * application and with a value of 700 indicates an XPG7 (UNIX 08) application.
+ * The appropriate version is determined by the use of the
* feature test macros described earlier. The value of _XOPEN_VERSION
* defaults to 3 otherwise indicating support for XPG3 applications.
*/
#ifndef _XOPEN_VERSION
-#ifdef _XPG6
+#if defined(_XPG7)
+#define _XOPEN_VERSION 700
+#elif defined(_XPG6)
#define _XOPEN_VERSION 600
#elif defined(_XPG5)
#define _XOPEN_VERSION 500
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fm/fs/zfs.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fm/fs/zfs.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fm/fs/zfs.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fm/protocol.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fm/protocol.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fm/protocol.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fm/util.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fm/util.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fm/util.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -21,10 +22,11 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
* Copyright (c) 2012, Martin Matuska <mm at FreeBSD.org>. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
*/
/* Portions Copyright 2010 Robert Milkowski */
@@ -49,12 +51,17 @@
* combined into masks that can be passed to various functions.
*/
typedef enum {
- ZFS_TYPE_FILESYSTEM = 0x1,
- ZFS_TYPE_SNAPSHOT = 0x2,
- ZFS_TYPE_VOLUME = 0x4,
- ZFS_TYPE_POOL = 0x8
+ ZFS_TYPE_FILESYSTEM = (1 << 0),
+ ZFS_TYPE_SNAPSHOT = (1 << 1),
+ ZFS_TYPE_VOLUME = (1 << 2),
+ ZFS_TYPE_POOL = (1 << 3),
+ ZFS_TYPE_BOOKMARK = (1 << 4)
} zfs_type_t;
+/*
+ * NB: lzc_dataset_type should be updated whenever a new objset type is added,
+ * if it represents a real type of a dataset that can be created from userland.
+ */
typedef enum dmu_objset_type {
DMU_OST_NONE,
DMU_OST_META,
@@ -68,9 +75,13 @@
#define ZFS_TYPE_DATASET \
(ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME | ZFS_TYPE_SNAPSHOT)
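Because the zfs_type_t values are now explicit single bits, they can be OR'd into masks such as ZFS_TYPE_DATASET and tested with a bitwise AND. A small standalone sketch; the enum and mask are copied from the header, and the main() is illustrative only.

#include <stdio.h>

typedef enum {
        ZFS_TYPE_FILESYSTEM     = (1 << 0),
        ZFS_TYPE_SNAPSHOT       = (1 << 1),
        ZFS_TYPE_VOLUME         = (1 << 2),
        ZFS_TYPE_POOL           = (1 << 3),
        ZFS_TYPE_BOOKMARK       = (1 << 4)
} zfs_type_t;

#define ZFS_TYPE_DATASET \
        (ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME | ZFS_TYPE_SNAPSHOT)

int
main(void)
{
        int mask = ZFS_TYPE_DATASET;

        /* Bookmarks are not part of the traditional "dataset" mask. */
        printf("snapshot in mask: %d\n", (mask & ZFS_TYPE_SNAPSHOT) != 0);
        printf("bookmark in mask: %d\n", (mask & ZFS_TYPE_BOOKMARK) != 0);
        return (0);
}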
+/*
+ * All of these include the terminating NUL byte.
+ */
#define ZAP_MAXNAMELEN 256
#define ZAP_MAXVALUELEN (1024 * 8)
#define ZAP_OLDMAXVALUELEN 1024
+#define ZFS_MAX_DATASET_NAME_LEN 256
/*
* Dataset properties are identified by these constants and must be added to
@@ -143,6 +154,15 @@
ZFS_PROP_CLONES,
ZFS_PROP_LOGICALUSED,
ZFS_PROP_LOGICALREFERENCED,
+ ZFS_PROP_INCONSISTENT, /* not exposed to the user */
+ ZFS_PROP_VOLMODE,
+ ZFS_PROP_FILESYSTEM_LIMIT,
+ ZFS_PROP_SNAPSHOT_LIMIT,
+ ZFS_PROP_FILESYSTEM_COUNT,
+ ZFS_PROP_SNAPSHOT_COUNT,
+ ZFS_PROP_REDUNDANT_METADATA,
+ ZFS_PROP_PREV_SNAP,
+ ZFS_PROP_RECEIVE_RESUME_TOKEN,
ZFS_NUM_PROPS
} zfs_prop_t;
@@ -185,6 +205,10 @@
ZPOOL_PROP_COMMENT,
ZPOOL_PROP_EXPANDSZ,
ZPOOL_PROP_FREEING,
+ ZPOOL_PROP_FRAGMENTATION,
+ ZPOOL_PROP_LEAKED,
+ ZPOOL_PROP_MAXBLOCKSIZE,
+ ZPOOL_PROP_TNAME,
ZPOOL_NUM_PROPS
} zpool_prop_t;
@@ -335,7 +359,18 @@
ZFS_SYNC_DISABLED = 2
} zfs_sync_type_t;
+typedef enum {
+ ZFS_VOLMODE_DEFAULT = 0,
+ ZFS_VOLMODE_GEOM = 1,
+ ZFS_VOLMODE_DEV = 2,
+ ZFS_VOLMODE_NONE = 3
+} zfs_volmode_t;
+typedef enum {
+ ZFS_REDUNDANT_METADATA_ALL,
+ ZFS_REDUNDANT_METADATA_MOST
+} zfs_redundant_metadata_type_t;
+
/*
* On-disk version number.
*/
@@ -523,7 +558,7 @@
#define ZPOOL_CONFIG_SPLIT_GUID "split_guid"
#define ZPOOL_CONFIG_SPLIT_LIST "guid_list"
#define ZPOOL_CONFIG_REMOVING "removing"
-#define ZPOOL_CONFIG_RESILVERING "resilvering"
+#define ZPOOL_CONFIG_RESILVER_TXG "resilver_txg"
#define ZPOOL_CONFIG_COMMENT "comment"
#define ZPOOL_CONFIG_SUSPENDED "suspended" /* not stored on disk */
#define ZPOOL_CONFIG_TIMESTAMP "timestamp" /* not stored on disk */
@@ -574,10 +609,19 @@
/*
* This is needed in userland to report the minimum necessary device size.
+ *
+ * Note that the zfs test suite uses 64MB vdevs.
*/
#define SPA_MINDEVSIZE (64ULL << 20)
/*
+ * Set if the fragmentation has not yet been calculated. This can happen
+ * because the space maps have not been upgraded or the histogram feature
+ * is not enabled.
+ */
+#define ZFS_FRAG_INVALID UINT64_MAX
+
+/*
* The location of the pool configuration repository, shared between kernel and
* userland.
*/
@@ -620,7 +664,8 @@
VDEV_AUX_IO_FAILURE, /* experienced I/O failure */
VDEV_AUX_BAD_LOG, /* cannot read log chain(s) */
VDEV_AUX_EXTERNAL, /* external diagnosis */
- VDEV_AUX_SPLIT_POOL /* vdev was split off into another pool */
+ VDEV_AUX_SPLIT_POOL, /* vdev was split off into another pool */
+ VDEV_AUX_ASHIFT_TOO_BIG /* vdev's min block size is too large */
} vdev_aux_t;
/*
@@ -714,7 +759,14 @@
uint64_t vs_self_healed; /* self-healed bytes */
uint64_t vs_scan_removing; /* removing? */
uint64_t vs_scan_processed; /* scan processed bytes */
+ uint64_t vs_configured_ashift; /* TLV vdev_ashift */
+ uint64_t vs_logical_ashift; /* vdev_logical_ashift */
+ uint64_t vs_physical_ashift; /* vdev_physical_ashift */
+ uint64_t vs_fragmentation; /* device fragmentation */
} vdev_stat_t;
+#define VDEV_STAT_VALID(field, uint64_t_field_count) \
+ ((uint64_t_field_count * sizeof(uint64_t)) >= \
+ (offsetof(vdev_stat_t, field) + sizeof(((vdev_stat_t *)NULL)->field)))
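VDEV_STAT_VALID() lets a consumer that received the stats as an array of uint64_t decide whether the sender was new enough to include a given field. The sketch below uses a deliberately cut-down stand-in for vdev_stat_t purely to show the arithmetic; only the macro itself is copied from the diff.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Cut-down stand-in: just enough fields to show how the macro decides
 * whether a peer that sent "count" uint64_t values included a field.
 */
typedef struct vdev_stat {
        uint64_t vs_timestamp;
        uint64_t vs_state;
        uint64_t vs_fragmentation;      /* newer field, may be absent */
} vdev_stat_t;

#define VDEV_STAT_VALID(field, uint64_t_field_count) \
        ((uint64_t_field_count * sizeof(uint64_t)) >= \
        (offsetof(vdev_stat_t, field) + sizeof(((vdev_stat_t *)NULL)->field)))

int
main(void)
{
        /* An old producer sent only the first two fields. */
        printf("%d\n", (int)VDEV_STAT_VALID(vs_state, 2));              /* 1 */
        printf("%d\n", (int)VDEV_STAT_VALID(vs_fragmentation, 2));      /* 0 */
        printf("%d\n", (int)VDEV_STAT_VALID(vs_fragmentation, 3));      /* 1 */
        return (0);
}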
/*
* DDT statistics. Note: all fields should be 64-bit because this
@@ -745,6 +797,10 @@
#define ZFS_DRIVER "zfs"
#define ZFS_DEV_NAME "zfs"
#define ZFS_DEV "/dev/" ZFS_DEV_NAME
+#define ZFS_DISK_ROOT "/dev/dsk"
+#define ZFS_DISK_ROOTD ZFS_DISK_ROOT "/"
+#define ZFS_RDISK_ROOT "/dev/rdsk"
+#define ZFS_RDISK_ROOTD ZFS_RDISK_ROOT "/"
/* general zvol path */
#define ZVOL_DIR "/dev/zvol"
@@ -831,6 +887,10 @@
ZFS_IOC_SEND_NEW,
ZFS_IOC_SEND_SPACE,
ZFS_IOC_CLONE,
+ ZFS_IOC_BOOKMARK,
+ ZFS_IOC_GET_BOOKMARKS,
+ ZFS_IOC_DESTROY_BOOKMARKS,
+ ZFS_IOC_NEXTBOOT,
ZFS_IOC_LAST
} zfs_ioc_t;
@@ -843,7 +903,8 @@
SPA_LOAD_IMPORT, /* import in progress */
SPA_LOAD_TRYIMPORT, /* tryimport in progress */
SPA_LOAD_RECOVER, /* recovery requested */
- SPA_LOAD_ERROR /* load failed */
+ SPA_LOAD_ERROR, /* load failed */
+ SPA_LOAD_CREATE /* creation in progress */
} spa_load_state_t;
/*
@@ -892,6 +953,7 @@
#define ZFS_IMPORT_ANY_HOST 0x2
#define ZFS_IMPORT_MISSING_LOG 0x4
#define ZFS_IMPORT_ONLY 0x8
+#define ZFS_IMPORT_TEMP_NAME 0x20
/*
* Sysevent payload members. ZFS will generate the following sysevents with the
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zut.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zut.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zut.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/gfs.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/gfs.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/gfs.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/idmap.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/idmap.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/idmap.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/isa_defs.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/isa_defs.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/isa_defs.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -235,7 +236,7 @@
/*
* Define the appropriate "processor characteristics"
*/
-#if defined(sun)
+#ifdef illumos
#define _LITTLE_ENDIAN
#endif
#define _STACK_GROWS_DOWNWARD
@@ -302,7 +303,7 @@
/*
* Define the appropriate "processor characteristics"
*/
-#if defined(sun)
+#ifdef illumos
#define _LITTLE_ENDIAN
#endif
#define _STACK_GROWS_DOWNWARD
@@ -504,7 +505,7 @@
* Define the appropriate "processor characteristics" shared between
* all Solaris on SPARC systems.
*/
-#if defined(sun)
+#ifdef illumos
#define _BIG_ENDIAN
#endif
#define _STACK_GROWS_DOWNWARD
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/list.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/list.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/list.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/list_impl.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/list_impl.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/list_impl.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/note.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/note.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/note.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair_impl.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair_impl.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/nvpair_impl.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/processor.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/processor.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/processor.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -25,6 +26,8 @@
*/
/*
+ * Copyright 2014 Garrett D'Amore <garrett at damore.org>
+ *
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -112,7 +115,6 @@
* User-level system call interface prototypes
*/
#ifndef _KERNEL
-#ifdef __STDC__
extern int p_online(processorid_t processorid, int flag);
extern int processor_info(processorid_t processorid,
@@ -122,16 +124,6 @@
extern processorid_t getcpuid(void);
extern lgrpid_t gethomelgroup(void);
-#else
-
-extern int p_online();
-extern int processor_info();
-extern int processor_bind();
-extern processorid_t getcpuid();
-extern lgrpid_t gethomelgroup();
-
-#endif /* __STDC__ */
-
#else /* _KERNEL */
/*
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/procset.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/procset.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/procset.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -143,7 +144,7 @@
#endif /* !defined(_XPG4_2) || defined(__EXTENSIONS__) */
-#if defined(sun)
+#ifdef illumos
#ifdef _KERNEL
struct proc;
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/synch.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/synch.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/synch.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/dev.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/dev.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/dev.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/eventdefs.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/eventdefs.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent/eventdefs.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -20,7 +21,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
*/
#ifndef _SYS_SYSEVENT_EVENTDEFS_H
@@ -249,9 +250,16 @@
#define ESC_ZFS_RESILVER_START "ESC_ZFS_resilver_start"
#define ESC_ZFS_RESILVER_FINISH "ESC_ZFS_resilver_finish"
#define ESC_ZFS_VDEV_REMOVE "ESC_ZFS_vdev_remove"
+#define ESC_ZFS_VDEV_REMOVE_AUX "ESC_ZFS_vdev_remove_aux"
+#define ESC_ZFS_VDEV_REMOVE_DEV "ESC_ZFS_vdev_remove_dev"
+#define ESC_ZFS_POOL_CREATE "ESC_ZFS_pool_create"
#define ESC_ZFS_POOL_DESTROY "ESC_ZFS_pool_destroy"
+#define ESC_ZFS_POOL_IMPORT "ESC_ZFS_pool_import"
+#define ESC_ZFS_VDEV_ADD "ESC_ZFS_vdev_add"
+#define ESC_ZFS_VDEV_ATTACH "ESC_ZFS_vdev_attach"
#define ESC_ZFS_VDEV_CLEAR "ESC_ZFS_vdev_clear"
#define ESC_ZFS_VDEV_CHECK "ESC_ZFS_vdev_check"
+#define ESC_ZFS_VDEV_ONLINE "ESC_ZFS_vdev_online"
#define ESC_ZFS_CONFIG_SYNC "ESC_ZFS_config_sync"
#define ESC_ZFS_SCRUB_START "ESC_ZFS_scrub_start"
#define ESC_ZFS_SCRUB_FINISH "ESC_ZFS_scrub_finish"
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/sysevent.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -214,7 +215,7 @@
#define EVCH_SET_CHAN_LEN 3 /* Set event queue length */
#define EVCH_CMD_LAST EVCH_SET_CHAN_LEN /* Last command */
-#ifdef sun
+#ifdef illumos
/*
* Shared user/kernel event channel interface definitions
*/
@@ -228,11 +229,11 @@
extern int sysevent_evc_control(evchan_t *, int, ...);
extern int sysevent_evc_setpropnvl(evchan_t *, nvlist_t *);
extern int sysevent_evc_getpropnvl(evchan_t *, nvlist_t **);
-#endif /* sun */
+#endif /* illumos */
#ifndef _KERNEL
-#ifdef sun
+#ifdef illumos
/*
* Userland-only event channel interfaces
*/
@@ -254,7 +255,7 @@
extern int sysevent_evc_xsubscribe(evchan_t *, const char *, const char *,
int (*)(sysevent_t *, void *), void *, uint32_t, sysevent_subattr_t *);
-#endif /* sun */
+#endif /* illumos */
#else
@@ -270,7 +271,7 @@
extern void sysevent_free_attr(sysevent_attr_list_t *);
extern int sysevent_attach_attributes(sysevent_t *, sysevent_attr_list_t *);
extern void sysevent_detach_attributes(sysevent_t *);
-#ifdef sun
+#ifdef illumos
extern char *sysevent_get_class_name(sysevent_t *);
extern char *sysevent_get_subclass_name(sysevent_t *);
extern uint64_t sysevent_get_seq(sysevent_t *);
@@ -278,7 +279,7 @@
extern size_t sysevent_get_size(sysevent_t *);
extern char *sysevent_get_pub(sysevent_t *);
extern int sysevent_get_attr_list(sysevent_t *, nvlist_t **);
-#endif /* sun */
+#endif /* illumos */
#endif /* _KERNEL */
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/sysmacros.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/sysmacros.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/sysmacros.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -32,6 +33,9 @@
#include <sys/param.h>
#include <sys/isa_defs.h>
+#if defined(__FreeBSD__) && defined(_KERNEL)
+#include <sys/libkern.h>
+#endif
#ifdef __cplusplus
extern "C" {
@@ -112,7 +116,7 @@
#define L_MAXMIN L_MAXMIN32
#endif
-#ifdef sun
+#ifdef illumos
#ifdef _KERNEL
/* major part of a device internal to the kernel */
@@ -172,7 +176,7 @@
#define getemajor(x) (major_t)((((dev_t)(x) >> L_BITSMINOR) > L_MAXMAJ) ? \
NODEV : (((dev_t)(x) >> L_BITSMINOR) & L_MAXMAJ))
#define geteminor(x) (minor_t)((x) & L_MAXMIN)
-#endif /* sun */
+#endif /* illumos */
/*
* These are versions of the kernel routines for compressing and
@@ -382,7 +386,10 @@
static __inline int
highbit(ulong_t i)
{
- register int h = 1;
+#if defined(__FreeBSD__) && defined(_KERNEL) && defined(HAVE_INLINE_FLSL)
+ return (flsl(i));
+#else
+ int h = 1;
if (i == 0)
return (0);
@@ -407,8 +414,45 @@
h += 1;
}
return (h);
+#endif
}
+/*
+ * Find highest one bit set.
+ * Returns bit number + 1 of highest bit that is set, otherwise returns 0.
+ */
+static __inline int
+highbit64(uint64_t i)
+{
+#if defined(__FreeBSD__) && defined(_KERNEL) && defined(HAVE_INLINE_FLSLL)
+ return (flsll(i));
+#else
+ int h = 1;
+
+ if (i == 0)
+ return (0);
+ if (i & 0xffffffff00000000ULL) {
+ h += 32; i >>= 32;
+ }
+ if (i & 0xffff0000) {
+ h += 16; i >>= 16;
+ }
+ if (i & 0xff00) {
+ h += 8; i >>= 8;
+ }
+ if (i & 0xf0) {
+ h += 4; i >>= 4;
+ }
+ if (i & 0xc) {
+ h += 2; i >>= 2;
+ }
+ if (i & 0x2) {
+ h += 1;
+ }
+ return (h);
+#endif
+}
+
#ifdef __cplusplus
}
#endif
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/taskq.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/taskq.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/taskq.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -45,6 +46,12 @@
typedef uintptr_t taskqid_t;
typedef void (task_func_t)(void *);
+typedef struct taskq_ent {
+ struct task tqent_task;
+ task_func_t *tqent_func;
+ void *tqent_arg;
+} taskq_ent_t;
+
struct proc;
/*
@@ -70,24 +77,25 @@
extern taskq_t *system_taskq;
-extern void taskq_init(void);
-extern void taskq_mp_init(void);
+void taskq_init(void);
+void taskq_mp_init(void);
-extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t);
-extern taskq_t *taskq_create_instance(const char *, int, int, pri_t, int,
- int, uint_t);
-extern taskq_t *taskq_create_proc(const char *, int, pri_t, int, int,
+taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t);
+taskq_t *taskq_create_instance(const char *, int, int, pri_t, int, int, uint_t);
+taskq_t *taskq_create_proc(const char *, int, pri_t, int, int,
struct proc *, uint_t);
-extern taskq_t *taskq_create_sysdc(const char *, int, int, int,
+taskq_t *taskq_create_sysdc(const char *, int, int, int,
struct proc *, uint_t, uint_t);
-extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t);
-extern void nulltask(void *);
-extern void taskq_destroy(taskq_t *);
-extern void taskq_wait(taskq_t *);
-extern void taskq_suspend(taskq_t *);
-extern int taskq_suspended(taskq_t *);
-extern void taskq_resume(taskq_t *);
-extern int taskq_member(taskq_t *, kthread_t *);
+taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t);
+void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t,
+ taskq_ent_t *);
+void nulltask(void *);
+void taskq_destroy(taskq_t *);
+void taskq_wait(taskq_t *);
+void taskq_suspend(taskq_t *);
+int taskq_suspended(taskq_t *);
+void taskq_resume(taskq_t *);
+int taskq_member(taskq_t *, kthread_t *);
#endif /* _KERNEL */
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/u8_textprep.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/u8_textprep.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/u8_textprep.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -36,7 +37,7 @@
extern "C" {
#endif
-#ifdef sun
+#ifdef illumos
/*
* Unicode encoding conversion functions and their macros.
*/
@@ -58,7 +59,7 @@
extern int uconv_u32tou8(const uint32_t *, size_t *, uchar_t *, size_t *, int);
extern int uconv_u8tou16(const uchar_t *, size_t *, uint16_t *, size_t *, int);
extern int uconv_u8tou32(const uchar_t *, size_t *, uint32_t *, size_t *, int);
-#endif /* sun */
+#endif /* illumos */
/*
* UTF-8 text preparation functions and their macros.
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/u8_textprep_data.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/u8_textprep_data.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/u8_textprep_data.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/vnode.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/vnode.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/vnode.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/sys/zmod.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/sys/zmod.h 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/sys/zmod.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/zmod/opensolaris_crc32.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/zmod/opensolaris_crc32.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/zmod/opensolaris_crc32.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
Modified: trunk/sys/cddl/contrib/opensolaris/uts/common/zmod/zmod.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/common/zmod/zmod.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/common/zmod/zmod.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,4 +1,4 @@
-/* $MidnightBSD: src/sys/cddl/contrib/opensolaris/uts/common/zmod/zmod.c,v 1.2 2008/12/03 00:24:34 laffer1 Exp $ */
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
Modified: trunk/sys/cddl/contrib/opensolaris/uts/intel/dtrace/fasttrap_isa.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/intel/dtrace/fasttrap_isa.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/intel/dtrace/fasttrap_isa.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -28,7 +29,7 @@
* Use is subject to license terms.
*/
-#if defined(sun)
+#ifdef illumos
#pragma ident "%Z%%M% %I% %E% SMI"
#endif
@@ -37,7 +38,7 @@
#include <sys/dtrace.h>
#include <sys/dtrace_impl.h>
#include <sys/cmn_err.h>
-#if defined(sun)
+#ifdef illumos
#include <sys/regset.h>
#include <sys/privregs.h>
#include <sys/segments.h>
@@ -46,14 +47,15 @@
#include <cddl/dev/dtrace/dtrace_cddl.h>
#include <sys/types.h>
#include <sys/proc.h>
+#include <sys/rmlock.h>
#include <sys/dtrace_bsd.h>
-#include <cddl/dev/dtrace/i386/regset.h>
+#include <cddl/dev/dtrace/x86/regset.h>
#include <machine/segments.h>
#include <machine/reg.h>
#include <machine/pcb.h>
#endif
#include <sys/sysmacros.h>
-#if defined(sun)
+#ifdef illumos
#include <sys/trap.h>
#include <sys/archsystm.h>
#else
@@ -75,7 +77,7 @@
uio.uio_td = curthread;
uio.uio_rw = op;
PHOLD(p);
- if (proc_rwmem(p, &uio) < 0) {
+ if (proc_rwmem(p, &uio) != 0) {
PRELE(p);
return (-1);
}
@@ -97,7 +99,7 @@
return (proc_ops(UIO_WRITE, p, kaddr, uaddr, len));
}
-#endif /* sun */
+#endif /* illumos */
#ifdef __i386__
#define r_rax r_eax
#define r_rbx r_ebx
@@ -104,6 +106,7 @@
#define r_rip r_eip
#define r_rflags r_eflags
#define r_rsp r_esp
+#define r_rbp r_ebp
#endif
/*
@@ -272,7 +275,20 @@
* registers.
*/
if (argno < 6)
- return ((&rp->r_rdi)[argno]);
+ switch (argno) {
+ case 0:
+ return (rp->r_rdi);
+ case 1:
+ return (rp->r_rsi);
+ case 2:
+ return (rp->r_rdx);
+ case 3:
+ return (rp->r_rcx);
+ case 4:
+ return (rp->r_r8);
+ case 5:
+ return (rp->r_r9);
+ }
stack = (uintptr_t *)rp->r_rsp;
DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
@@ -733,13 +749,15 @@
fasttrap_tracepoint_t *tp;
fasttrap_bucket_t *bucket;
fasttrap_id_t *id;
-#if defined(sun)
+#ifdef illumos
kmutex_t *pid_mtx;
-#endif
-#if defined(sun)
pid_mtx = &cpu_core[CPU->cpu_id].cpuc_pid_lock;
mutex_enter(pid_mtx);
+#else
+ struct rm_priotracker tracker;
+
+ rm_rlock(&fasttrap_tp_lock, &tracker);
#endif
bucket = &fasttrap_tpoints.fth_table[FASTTRAP_TPOINTS_INDEX(pid, pc)];
@@ -755,8 +773,10 @@
* is not essential to the correct execution of the process.
*/
if (tp == NULL) {
-#if defined(sun)
+#ifdef illumos
mutex_exit(pid_mtx);
+#else
+ rm_runlock(&fasttrap_tp_lock, &tracker);
#endif
return;
}
@@ -778,8 +798,10 @@
rp->r_rax, rp->r_rbx, 0, 0);
}
-#if defined(sun)
+#ifdef illumos
mutex_exit(pid_mtx);
+#else
+ rm_runlock(&fasttrap_tp_lock, &tracker);
#endif
}
@@ -786,7 +808,7 @@
static void
fasttrap_sigsegv(proc_t *p, kthread_t *t, uintptr_t addr)
{
-#if defined(sun)
+#ifdef illumos
sigqueue_t *sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP);
sqp->sq_info.si_signo = SIGSEGV;
@@ -987,10 +1009,14 @@
fasttrap_pid_probe(struct reg *rp)
{
proc_t *p = curproc;
+#ifndef illumos
+ struct rm_priotracker tracker;
+ proc_t *pp;
+#endif
uintptr_t pc = rp->r_rip - 1;
uintptr_t new_pc = 0;
fasttrap_bucket_t *bucket;
-#if defined(sun)
+#ifdef illumos
kmutex_t *pid_mtx;
#endif
fasttrap_tracepoint_t *tp, tp_local;
@@ -1022,24 +1048,31 @@
curthread->t_dtrace_regv = 0;
#endif
-#if defined(sun)
/*
* Treat a child created by a call to vfork(2) as if it were its
* parent. We know that there's only one thread of control in such a
* process: this one.
*/
+#ifdef illumos
while (p->p_flag & SVFORK) {
p = p->p_parent;
}
-#endif
- PROC_LOCK(p);
- _PHOLD(p);
pid = p->p_pid;
-#if defined(sun)
pid_mtx = &cpu_core[CPU->cpu_id].cpuc_pid_lock;
mutex_enter(pid_mtx);
+#else
+ pp = p;
+ sx_slock(&proctree_lock);
+ while (pp->p_vmspace == pp->p_pptr->p_vmspace)
+ pp = pp->p_pptr;
+ pid = pp->p_pid;
+ sx_sunlock(&proctree_lock);
+ pp = NULL;
+
+ rm_rlock(&fasttrap_tp_lock, &tracker);
#endif
+
bucket = &fasttrap_tpoints.fth_table[FASTTRAP_TPOINTS_INDEX(pid, pc)];
/*
@@ -1057,11 +1090,11 @@
* fasttrap_ioctl), or somehow we have mislaid this tracepoint.
*/
if (tp == NULL) {
-#if defined(sun)
+#ifdef illumos
mutex_exit(pid_mtx);
+#else
+ rm_runlock(&fasttrap_tp_lock, &tracker);
#endif
- _PRELE(p);
- PROC_UNLOCK(p);
return (-1);
}
@@ -1183,9 +1216,10 @@
* tracepoint again later if we need to light up any return probes.
*/
tp_local = *tp;
- PROC_UNLOCK(p);
-#if defined(sun)
+#ifdef illumos
mutex_exit(pid_mtx);
+#else
+ rm_runlock(&fasttrap_tp_lock, &tracker);
#endif
tp = &tp_local;
@@ -1381,17 +1415,16 @@
case FASTTRAP_T_PUSHL_EBP:
{
int ret = 0;
- uintptr_t addr = 0;
#ifdef __amd64
if (p->p_model == DATAMODEL_NATIVE) {
- addr = rp->r_rsp - sizeof (uintptr_t);
- ret = fasttrap_sulword((void *)addr, &rp->r_rsp);
+ rp->r_rsp -= sizeof (uintptr_t);
+ ret = fasttrap_sulword((void *)rp->r_rsp, rp->r_rbp);
} else {
#endif
#ifdef __i386__
- addr = rp->r_rsp - sizeof (uint32_t);
- ret = fasttrap_suword32((void *)addr, &rp->r_rsp);
+ rp->r_rsp -= sizeof (uint32_t);
+ ret = fasttrap_suword32((void *)rp->r_rsp, rp->r_rbp);
#endif
#ifdef __amd64
}
@@ -1398,12 +1431,11 @@
#endif
if (ret == -1) {
- fasttrap_sigsegv(p, curthread, addr);
+ fasttrap_sigsegv(p, curthread, rp->r_rsp);
new_pc = pc;
break;
}
- rp->r_rsp = addr;
new_pc = pc + tp->ftt_size;
break;
}
@@ -1417,10 +1449,7 @@
if (tp->ftt_code == 0) {
new_pc = tp->ftt_dest;
} else {
-#ifdef __amd64
- uintptr_t value;
-#endif
- uintptr_t addr = tp->ftt_dest;
+ uintptr_t value, addr = tp->ftt_dest;
if (tp->ftt_base != FASTTRAP_NOREG)
addr += fasttrap_getreg(rp, tp->ftt_base);
@@ -1444,6 +1473,7 @@
#ifdef __amd64
if (p->p_model == DATAMODEL_NATIVE) {
+#endif
if ((value = fasttrap_fulword((void *)addr))
== -1) {
fasttrap_sigsegv(p, curthread,
@@ -1452,9 +1482,8 @@
break;
}
new_pc = value;
+#ifdef __amd64
} else {
-#endif
-#ifdef __i386__
uint32_t value32;
addr = (uintptr_t)(uint32_t)addr;
if ((value32 = fasttrap_fuword32((void *)addr))
@@ -1465,13 +1494,11 @@
break;
}
new_pc = value32;
+ }
#endif
- }
-#ifdef __amd64
} else {
new_pc = addr;
}
-#endif
}
/*
@@ -1487,14 +1514,12 @@
if (p->p_model == DATAMODEL_NATIVE) {
addr = rp->r_rsp - sizeof (uintptr_t);
pcps = pc + tp->ftt_size;
- ret = fasttrap_sulword((void *)addr, &pcps);
+ ret = fasttrap_sulword((void *)addr, pcps);
} else {
#endif
-#ifdef __i386__
addr = rp->r_rsp - sizeof (uint32_t);
pcps = (uint32_t)(pc + tp->ftt_size);
- ret = fasttrap_suword32((void *)addr, &pcps);
-#endif
+ ret = fasttrap_suword32((void *)addr, pcps);
#ifdef __amd64
}
#endif
@@ -1519,9 +1544,8 @@
uint8_t scratch[2 * FASTTRAP_MAX_INSTR_SIZE + 7];
#endif
uint_t i = 0;
-#if defined(sun)
+#ifdef illumos
klwp_t *lwp = ttolwp(curthread);
-#endif
/*
* Compute the address of the ulwp_t and step over the
@@ -1529,7 +1553,6 @@
* thread pointer is very different on 32- and 64-bit
* kernels.
*/
-#if defined(sun)
#if defined(__amd64)
if (p->p_model == DATAMODEL_LP64) {
addr = lwp->lwp_pcb.pcb_fsbase;
@@ -1542,13 +1565,23 @@
addr = USD_GETBASE(&lwp->lwp_pcb.pcb_gsdesc);
addr += sizeof (void *);
#endif
-#endif /* sun */
-#ifdef __i386__
- addr = USD_GETBASE(&curthread->td_pcb->pcb_gsd);
-#else
- addr = curthread->td_pcb->pcb_gsbase;
-#endif
- addr += sizeof (void *);
+#else /* !illumos */
+ fasttrap_scrspace_t *scrspace;
+ scrspace = fasttrap_scraddr(curthread, tp->ftt_proc);
+ if (scrspace == NULL) {
+ /*
+ * We failed to allocate scratch space for this thread.
+ * Try to write the original instruction back out and
+ * reset the pc.
+ */
+ if (fasttrap_copyout(tp->ftt_instr, (void *)pc,
+ tp->ftt_size))
+ fasttrap_sigtrap(p, curthread, pc);
+ new_pc = pc;
+ break;
+ }
+ addr = scrspace->ftss_addr;
+#endif /* illumos */
/*
* Generic Instruction Tracing
@@ -1734,10 +1767,10 @@
ASSERT(i <= sizeof (scratch));
-#if defined(sun)
+#ifdef illumos
if (fasttrap_copyout(scratch, (char *)addr, i)) {
#else
- if (uwrite(curproc, scratch, i, addr)) {
+ if (uwrite(p, scratch, i, addr)) {
#endif
fasttrap_sigtrap(p, curthread, pc);
new_pc = pc;
@@ -1796,10 +1829,11 @@
rp->r_rip = new_pc;
+#ifndef illumos
PROC_LOCK(p);
proc_write_regs(curthread, rp);
- _PRELE(p);
PROC_UNLOCK(p);
+#endif
return (0);
}
@@ -1816,7 +1850,7 @@
curthread->t_dtrace_scrpc = 0;
curthread->t_dtrace_astpc = 0;
-#if defined(sun)
+#ifdef illumos
/*
* Treat a child created by a call to vfork(2) as if it were its
* parent. We know that there's only one thread of control in such a
@@ -1889,7 +1923,7 @@
case REG_ERR: return (rp->r_err);
case REG_RIP: return (rp->r_rip);
case REG_CS: return (rp->r_cs);
-#if defined(sun)
+#ifdef illumos
case REG_RFL: return (rp->r_rfl);
#endif
case REG_RSP: return (rp->r_rsp);
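As a userland illustration of what the call-emulation hunk above does: the emulated call stores the return address at r_rsp minus one word, where the word width follows the process's data model (64-bit for native processes, 32-bit otherwise), and the fasttrap_sulword()/fasttrap_suword32() calls now take that value directly rather than a pointer to it. The sketch below reproduces only that width-dependent push into a local buffer; the buffer, the model flag, and the sample addresses are stand-ins, not kernel interfaces.

/*
 * Minimal sketch of the data-model-dependent return-address push.
 * Running it shows the "stack pointer" dropping by 8 in the 64-bit
 * case and by 4 in the 32-bit case.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint64_t
push_return_address(uint8_t *stack_base, uint64_t sp_off, uint64_t ret_pc,
    int is_64bit)
{
    if (is_64bit) {
        uint64_t v = ret_pc;
        sp_off -= sizeof (uint64_t);
        memcpy(stack_base + sp_off, &v, sizeof (v));
    } else {
        uint32_t v = (uint32_t)ret_pc;
        sp_off -= sizeof (uint32_t);
        memcpy(stack_base + sp_off, &v, sizeof (v));
    }
    return (sp_off);            /* new stack pointer offset */
}

int
main(void)
{
    uint8_t stack[64] = { 0 };

    printf("64-bit sp: %llu\n",
        (unsigned long long)push_return_address(stack, 64, 0x400123, 1));
    printf("32-bit sp: %llu\n",
        (unsigned long long)push_return_address(stack, 64, 0x400123, 0));
    return (0);
}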
Added: trunk/sys/cddl/contrib/opensolaris/uts/mips/dtrace/fasttrap_isa.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/mips/dtrace/fasttrap_isa.c (rev 0)
+++ trunk/sys/cddl/contrib/opensolaris/uts/mips/dtrace/fasttrap_isa.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -0,0 +1,31 @@
+/* $MidnightBSD$ */
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+
+/*
+ * XXX: Placeholder for MIPS fasttrap code
+ */
Property changes on: trunk/sys/cddl/contrib/opensolaris/uts/mips/dtrace/fasttrap_isa.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/cddl/contrib/opensolaris/uts/mips/sys/fasttrap_isa.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/mips/sys/fasttrap_isa.h (rev 0)
+++ trunk/sys/cddl/contrib/opensolaris/uts/mips/sys/fasttrap_isa.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -0,0 +1,49 @@
+/* $MidnightBSD$ */
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _FASTTRAP_ISA_H
+#define _FASTTRAP_ISA_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * XXXDTRACE: placeholder for MIPS fasttrap stuff
+ */
+
+typedef uint32_t fasttrap_instr_t;
+#define FASTTRAP_SUNWDTRACE_SIZE 64
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _FASTTRAP_ISA_H */
Property changes on: trunk/sys/cddl/contrib/opensolaris/uts/mips/sys/fasttrap_isa.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/cddl/contrib/opensolaris/uts/powerpc/dtrace/fasttrap_isa.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/powerpc/dtrace/fasttrap_isa.c (rev 0)
+++ trunk/sys/cddl/contrib/opensolaris/uts/powerpc/dtrace/fasttrap_isa.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -0,0 +1,583 @@
+/* $MidnightBSD$ */
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/* Portions Copyright 2013 Justin Hibbits */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/fasttrap_isa.h>
+#include <sys/fasttrap_impl.h>
+#include <sys/dtrace.h>
+#include <sys/dtrace_impl.h>
+#include <cddl/dev/dtrace/dtrace_cddl.h>
+#include <sys/proc.h>
+#include <sys/types.h>
+#include <sys/uio.h>
+#include <sys/ptrace.h>
+#include <sys/rmlock.h>
+#include <sys/sysent.h>
+
+#define OP(x) ((x) >> 26)
+#define OPX(x) (((x) >> 2) & 0x3FF)
+#define OP_BO(x) (((x) & 0x03E00000) >> 21)
+#define OP_BI(x) (((x) & 0x001F0000) >> 16)
+#define OP_RS(x) (((x) & 0x03E00000) >> 21)
+#define OP_RA(x) (((x) & 0x001F0000) >> 16)
+#define OP_RB(x) (((x) & 0x0000F800) >> 11)
+
+
+static int
+proc_ops(int op, proc_t *p, void *kaddr, off_t uaddr, size_t len)
+{
+ struct iovec iov;
+ struct uio uio;
+
+ iov.iov_base = kaddr;
+ iov.iov_len = len;
+ uio.uio_offset = uaddr;
+ uio.uio_iov = &iov;
+ uio.uio_resid = len;
+ uio.uio_iovcnt = 1;
+ uio.uio_segflg = UIO_SYSSPACE;
+ uio.uio_td = curthread;
+ uio.uio_rw = op;
+ PHOLD(p);
+ if (proc_rwmem(p, &uio) != 0) {
+ PRELE(p);
+ return (-1);
+ }
+ PRELE(p);
+
+ return (0);
+}
+
+static int
+uread(proc_t *p, void *kaddr, size_t len, uintptr_t uaddr)
+{
+
+ return (proc_ops(UIO_READ, p, kaddr, uaddr, len));
+}
+
+static int
+uwrite(proc_t *p, void *kaddr, size_t len, uintptr_t uaddr)
+{
+
+ return (proc_ops(UIO_WRITE, p, kaddr, uaddr, len));
+}
+
+int
+fasttrap_tracepoint_install(proc_t *p, fasttrap_tracepoint_t *tp)
+{
+ fasttrap_instr_t instr = FASTTRAP_INSTR;
+
+ if (uwrite(p, &instr, 4, tp->ftt_pc) != 0)
+ return (-1);
+
+ return (0);
+}
+
+int
+fasttrap_tracepoint_remove(proc_t *p, fasttrap_tracepoint_t *tp)
+{
+ uint32_t instr;
+
+ /*
+ * Distinguish between read or write failures and a changed
+ * instruction.
+ */
+ if (uread(p, &instr, 4, tp->ftt_pc) != 0)
+ return (0);
+ if (instr != FASTTRAP_INSTR)
+ return (0);
+ if (uwrite(p, &tp->ftt_instr, 4, tp->ftt_pc) != 0)
+ return (-1);
+
+ return (0);
+}
+
+int
+fasttrap_tracepoint_init(proc_t *p, fasttrap_tracepoint_t *tp, uintptr_t pc,
+ fasttrap_probe_type_t type)
+{
+ uint32_t instr;
+ //int32_t disp;
+
+ /*
+ * Read the instruction at the given address out of the process's
+ * address space. We don't have to worry about a debugger
+ * changing this instruction before we overwrite it with our trap
+ * instruction since P_PR_LOCK is set.
+ */
+ if (uread(p, &instr, 4, pc) != 0)
+ return (-1);
+
+ /*
+ * Decode the instruction to fill in the probe flags. We can have
+ * the process execute most instructions on its own using a pc/npc
+ * trick, but pc-relative control transfers present a problem since
+ * we're relocating the instruction. We emulate these instructions
+ * in the kernel. We assume a default type and over-write that as
+ * needed.
+ *
+ * pc-relative instructions must be emulated for correctness;
+ * other instructions (which represent a large set of commonly traced
+ * instructions) are emulated or otherwise optimized for performance.
+ */
+ tp->ftt_type = FASTTRAP_T_COMMON;
+ tp->ftt_instr = instr;
+
+ switch (OP(instr)) {
+ /* The following are invalid for trapping (invalid opcodes, tw/twi). */
+ case 0:
+ case 1:
+ case 2:
+ case 4:
+ case 5:
+ case 6:
+ case 30:
+ case 39:
+ case 58:
+ case 62:
+ case 3: /* twi */
+ return (-1);
+ case 31: /* tw */
+ if (OPX(instr) == 4)
+ return (-1);
+ else if (OPX(instr) == 444 && OP_RS(instr) == OP_RA(instr) &&
+ OP_RS(instr) == OP_RB(instr))
+ tp->ftt_type = FASTTRAP_T_NOP;
+ break;
+ case 16:
+ tp->ftt_type = FASTTRAP_T_BC;
+ tp->ftt_dest = instr & 0x0000FFFC; /* Extract target address */
+ if (instr & 0x00008000)
+ tp->ftt_dest |= 0xFFFF0000;
+ /* Use as offset if not absolute address. */
+ if (!(instr & 0x02))
+ tp->ftt_dest += pc;
+ tp->ftt_bo = OP_BO(instr);
+ tp->ftt_bi = OP_BI(instr);
+ break;
+ case 18:
+ tp->ftt_type = FASTTRAP_T_B;
+ tp->ftt_dest = instr & 0x03FFFFFC; /* Extract target address */
+ if (instr & 0x02000000)
+ tp->ftt_dest |= 0xFC000000;
+ /* Use as offset if not absolute address. */
+ if (!(instr & 0x02))
+ tp->ftt_dest += pc;
+ break;
+ case 19:
+ switch (OPX(instr)) {
+ case 528: /* bcctr */
+ tp->ftt_type = FASTTRAP_T_BCTR;
+ tp->ftt_bo = OP_BO(instr);
+ tp->ftt_bi = OP_BI(instr);
+ break;
+ case 16: /* bclr */
+ tp->ftt_type = FASTTRAP_T_BLR;
+ tp->ftt_bo = OP_BO(instr);
+ tp->ftt_bi = OP_BI(instr);
+ break;
+ };
+ break;
+ case 24:
+ if (OP_RS(instr) == OP_RA(instr) &&
+ (instr & 0x0000FFFF) == 0)
+ tp->ftt_type = FASTTRAP_T_NOP;
+ break;
+ };
+
+ /*
+ * We don't know how this tracepoint is going to be used, but in case
+ * it's used as part of a function return probe, we need to indicate
+ * whether it's always a return site or only potentially a return
+ * site. If it's part of a return probe, it's always going to be a
+ * return from that function if it's a restore instruction or if
+ * the previous instruction was a return. If we could reliably
+ * distinguish jump tables from return sites, this wouldn't be
+ * necessary.
+ */
+#if 0
+ if (tp->ftt_type != FASTTRAP_T_RESTORE &&
+ (uread(p, &instr, 4, pc - sizeof (instr)) != 0 ||
+ !(OP(instr) == 2 && OP3(instr) == OP3_RETURN)))
+ tp->ftt_flags |= FASTTRAP_F_RETMAYBE;
+#endif
+
+ return (0);
+}
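To make the decode logic in fasttrap_tracepoint_init() above easier to follow, here is a self-contained sketch that applies the same field extraction and sign extension to two hand-assembled encodings (an unconditional "b .+0x20" and a "bne .+0x10"). The OP/OP_BO/OP_BI macros mirror the patch; the sample opcodes and the chosen pc value are illustrative only.

#include <stdint.h>
#include <stdio.h>

#define OP(x)       ((x) >> 26)
#define OP_BO(x)    (((x) & 0x03E00000) >> 21)
#define OP_BI(x)    (((x) & 0x001F0000) >> 16)

/* Compute the branch target the same way the tracepoint decoder does. */
static uint32_t
branch_dest(uint32_t instr, uint32_t pc)
{
    uint32_t dest;

    if (OP(instr) == 18) {              /* b/bl/ba: 26-bit LI field */
        dest = instr & 0x03FFFFFC;
        if (instr & 0x02000000)         /* sign-extend LI */
            dest |= 0xFC000000;
    } else {                            /* bc (OP == 16): 16-bit BD field */
        dest = instr & 0x0000FFFC;
        if (instr & 0x00008000)         /* sign-extend BD */
            dest |= 0xFFFF0000;
    }
    if (!(instr & 0x02))                /* AA clear: pc-relative */
        dest += pc;
    return (dest);
}

int
main(void)
{
    uint32_t pc = 0x10000100;
    uint32_t b_plus_32 = 0x48000020;    /* b   .+0x20 */
    uint32_t bne_plus_16 = 0x40820010;  /* bne .+0x10 (BO=4, BI=2) */

    /* Prints dest=0x10000120, then dest=0x10000110 BO=4 BI=2. */
    printf("b:   dest=0x%08x\n", branch_dest(b_plus_32, pc));
    printf("bne: dest=0x%08x BO=%u BI=%u\n", branch_dest(bne_plus_16, pc),
        OP_BO(bne_plus_16), OP_BI(bne_plus_16));
    return (0);
}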
+
+static uint64_t
+fasttrap_anarg(struct reg *rp, int argno)
+{
+ uint64_t value;
+ proc_t *p = curproc;
+
+ /* The first 8 arguments are in registers. */
+ if (argno < 8)
+ return rp->fixreg[argno + 3];
+
+ /* Arguments on stack start after SP+LR (2 register slots). */
+ if (SV_PROC_FLAG(p, SV_ILP32)) {
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+ value = dtrace_fuword32((void *)(rp->fixreg[1] + 8 +
+ ((argno - 8) * sizeof(uint32_t))));
+ DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT | CPU_DTRACE_BADADDR);
+ } else {
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
+ value = dtrace_fuword64((void *)(rp->fixreg[1] + 16 +
+ ((argno - 8) * sizeof(uint64_t))));
+ DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT | CPU_DTRACE_BADADDR);
+ }
+ return value;
+}
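The register/stack split in fasttrap_anarg() above reduces to simple arithmetic: arguments 0 through 7 live in r3-r10, and argument N (N >= 8) is fetched from the stack at SP + 8 + (N - 8) * 4 for ILP32 processes or SP + 16 + (N - 8) * 8 for LP64 processes (assuming 64-bit arguments are stored one 8-byte slot apart). The sketch below only evaluates those offsets; it does not touch any process memory.

#include <stdint.h>
#include <stdio.h>

/* Offset of stack-passed argument argno (argno >= 8) from the stack pointer. */
static uint64_t
stack_arg_offset(int argno, int lp64)
{
    int skip = 8;               /* args 0..7 are in registers */

    return (lp64 ?
        16 + (uint64_t)(argno - skip) * sizeof (uint64_t) :
        8 + (uint64_t)(argno - skip) * sizeof (uint32_t));
}

int
main(void)
{
    /* Prints SP+8 / SP+16 for arg8 and SP+16 / SP+32 for arg10. */
    printf("arg8  ILP32: SP+%llu  LP64: SP+%llu\n",
        (unsigned long long)stack_arg_offset(8, 0),
        (unsigned long long)stack_arg_offset(8, 1));
    printf("arg10 ILP32: SP+%llu  LP64: SP+%llu\n",
        (unsigned long long)stack_arg_offset(10, 0),
        (unsigned long long)stack_arg_offset(10, 1));
    return (0);
}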
+
+uint64_t
+fasttrap_pid_getarg(void *arg, dtrace_id_t id, void *parg, int argno,
+ int aframes)
+{
+ struct reg r;
+
+ fill_regs(curthread, &r);
+
+ return (fasttrap_anarg(&r, argno));
+}
+
+uint64_t
+fasttrap_usdt_getarg(void *arg, dtrace_id_t id, void *parg, int argno,
+ int aframes)
+{
+ struct reg r;
+
+ fill_regs(curthread, &r);
+
+ return (fasttrap_anarg(&r, argno));
+}
+
+static void
+fasttrap_usdt_args(fasttrap_probe_t *probe, struct reg *rp, int argc,
+ uintptr_t *argv)
+{
+ int i, x, cap = MIN(argc, probe->ftp_nargs);
+
+ for (i = 0; i < cap; i++) {
+ x = probe->ftp_argmap[i];
+
+ if (x < 8)
+ argv[i] = rp->fixreg[x];
+ else
+ if (SV_PROC_FLAG(curproc, SV_ILP32))
+ argv[i] = fuword32((void *)(rp->fixreg[1] + 8 +
+ (x * sizeof(uint32_t))));
+ else
+ argv[i] = fuword32((void *)(rp->fixreg[1] + 16 +
+ (x * sizeof(uint64_t))));
+ }
+
+ for (; i < argc; i++) {
+ argv[i] = 0;
+ }
+}
+
+static void
+fasttrap_return_common(struct reg *rp, uintptr_t pc, pid_t pid,
+ uintptr_t new_pc)
+{
+ struct rm_priotracker tracker;
+ fasttrap_tracepoint_t *tp;
+ fasttrap_bucket_t *bucket;
+ fasttrap_id_t *id;
+
+ rm_rlock(&fasttrap_tp_lock, &tracker);
+ bucket = &fasttrap_tpoints.fth_table[FASTTRAP_TPOINTS_INDEX(pid, pc)];
+
+ for (tp = bucket->ftb_data; tp != NULL; tp = tp->ftt_next) {
+ if (pid == tp->ftt_pid && pc == tp->ftt_pc &&
+ tp->ftt_proc->ftpc_acount != 0)
+ break;
+ }
+
+ /*
+ * Don't sweat it if we can't find the tracepoint again; unlike
+ * when we're in fasttrap_pid_probe(), finding the tracepoint here
+ * is not essential to the correct execution of the process.
+ */
+ if (tp == NULL) {
+ rm_runlock(&fasttrap_tp_lock, &tracker);
+ return;
+ }
+
+ for (id = tp->ftt_retids; id != NULL; id = id->fti_next) {
+ /*
+ * If there's a branch that could act as a return site, we
+ * need to trace it, and check here if the program counter is
+ * external to the function.
+ */
+ /* Skip function-local branches. */
+ if ((new_pc - id->fti_probe->ftp_faddr) < id->fti_probe->ftp_fsize)
+ continue;
+
+ dtrace_probe(id->fti_probe->ftp_id,
+ pc - id->fti_probe->ftp_faddr,
+ rp->fixreg[3], rp->fixreg[4], 0, 0);
+ }
+ rm_runlock(&fasttrap_tp_lock, &tracker);
+}
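The return-site test above, (new_pc - ftp_faddr) < ftp_fsize, is the usual single-compare range check: because the subtraction is unsigned, an address below the function start wraps to a huge value and fails the same comparison that rejects addresses past the end. A small sketch with made-up addresses:

#include <stdint.h>
#include <stdio.h>

static int
in_function(uintptr_t pc, uintptr_t faddr, uintptr_t fsize)
{
    return ((pc - faddr) < fsize);  /* unsigned wrap handles pc < faddr */
}

int
main(void)
{
    uintptr_t faddr = 0x10001000, fsize = 0x80;

    printf("%d %d %d\n",
        in_function(0x10001010, faddr, fsize),  /* 1: inside */
        in_function(0x10001100, faddr, fsize),  /* 0: past the end */
        in_function(0x10000ff0, faddr, fsize)); /* 0: before the start */
    return (0);
}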
+
+
+static int
+fasttrap_branch_taken(int bo, int bi, struct reg *regs)
+{
+ int crzero = 0;
+
+ /* Branch always? */
+ if ((bo & 0x14) == 0x14)
+ return 1;
+
+ /* Handle decrementing ctr */
+ if (!(bo & 0x04)) {
+ --regs->ctr;
+ crzero = (regs->ctr == 0);
+ if (bo & 0x10) {
+ return (!(crzero ^ ((bo >> 1) & 1)));
+ }
+ }
+
+ return (crzero | ((((regs->cr >> (31 - bi)) & 1) ^ ((bo >> 3) & 1)) ^ 1));
+}
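fasttrap_branch_taken() above approximates the PowerPC BO/BI branch predicate. For reference, the sketch below implements the architectural rule directly — BO bit 0x10 means "ignore the CR bit", 0x04 means "do not decrement CTR", 0x08 selects branch-if-true, and 0x02 selects branch-if-CTR==0 — against simulated CTR/CR values. It is a userland illustration of the semantics, not the kernel routine.

#include <stdint.h>
#include <stdio.h>

static int
branch_taken(int bo, int bi, uint32_t *ctr, uint32_t cr)
{
    int ctr_ok = 1, cond_ok = 1;

    if (!(bo & 0x04)) {                 /* decrement and test CTR */
        (*ctr)--;
        ctr_ok = ((*ctr == 0) == ((bo >> 1) & 1));
    }
    if (!(bo & 0x10))                   /* test the selected CR bit */
        cond_ok = (((cr >> (31 - bi)) & 1) == ((bo >> 3) & 1));
    return (ctr_ok && cond_ok);
}

int
main(void)
{
    uint32_t ctr = 2;
    uint32_t cr_eq_set = 1u << 29;      /* CR0[EQ] (bit 2) set */

    printf("bne, EQ set:   %d\n", branch_taken(4, 2, &ctr, cr_eq_set));  /* 0 */
    printf("beq, EQ set:   %d\n", branch_taken(12, 2, &ctr, cr_eq_set)); /* 1 */
    printf("bdnz, CTR=2:   %d\n", branch_taken(16, 0, &ctr, 0));         /* 1 */
    printf("branch always: %d\n", branch_taken(20, 0, &ctr, 0));         /* 1 */
    return (0);
}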
+
+
+int
+fasttrap_pid_probe(struct reg *rp)
+{
+ struct rm_priotracker tracker;
+ proc_t *p = curproc;
+ uintptr_t pc = rp->pc;
+ uintptr_t new_pc = 0;
+ fasttrap_bucket_t *bucket;
+ fasttrap_tracepoint_t *tp, tp_local;
+ pid_t pid;
+ dtrace_icookie_t cookie;
+ uint_t is_enabled = 0;
+
+ /*
+ * It's possible that a user (in a veritable orgy of bad planning)
+ * could redirect this thread's flow of control before it reached the
+ * return probe fasttrap. In this case we need to kill the process
+ * since it's in an unrecoverable state.
+ */
+ if (curthread->t_dtrace_step) {
+ ASSERT(curthread->t_dtrace_on);
+ fasttrap_sigtrap(p, curthread, pc);
+ return (0);
+ }
+
+ /*
+ * Clear all user tracing flags.
+ */
+ curthread->t_dtrace_ft = 0;
+ curthread->t_dtrace_pc = 0;
+ curthread->t_dtrace_npc = 0;
+ curthread->t_dtrace_scrpc = 0;
+ curthread->t_dtrace_astpc = 0;
+
+ rm_rlock(&fasttrap_tp_lock, &tracker);
+ pid = p->p_pid;
+ bucket = &fasttrap_tpoints.fth_table[FASTTRAP_TPOINTS_INDEX(pid, pc)];
+
+ /*
+ * Lookup the tracepoint that the process just hit.
+ */
+ for (tp = bucket->ftb_data; tp != NULL; tp = tp->ftt_next) {
+ if (pid == tp->ftt_pid && pc == tp->ftt_pc &&
+ tp->ftt_proc->ftpc_acount != 0)
+ break;
+ }
+
+ /*
+ * If we couldn't find a matching tracepoint, either a tracepoint has
+ * been inserted without using the pid<pid> ioctl interface (see
+ * fasttrap_ioctl), or somehow we have mislaid this tracepoint.
+ */
+ if (tp == NULL) {
+ rm_runlock(&fasttrap_tp_lock, &tracker);
+ return (-1);
+ }
+
+ if (tp->ftt_ids != NULL) {
+ fasttrap_id_t *id;
+
+ for (id = tp->ftt_ids; id != NULL; id = id->fti_next) {
+ fasttrap_probe_t *probe = id->fti_probe;
+
+ if (id->fti_ptype == DTFTP_ENTRY) {
+ /*
+ * We note that this was an entry
+ * probe to help ustack() find the
+ * first caller.
+ */
+ cookie = dtrace_interrupt_disable();
+ DTRACE_CPUFLAG_SET(CPU_DTRACE_ENTRY);
+ dtrace_probe(probe->ftp_id, rp->fixreg[3],
+ rp->fixreg[4], rp->fixreg[5], rp->fixreg[6],
+ rp->fixreg[7]);
+ DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_ENTRY);
+ dtrace_interrupt_enable(cookie);
+ } else if (id->fti_ptype == DTFTP_IS_ENABLED) {
+ /*
+ * Note that in this case, we don't
+ * call dtrace_probe() since it's only
+ * an artificial probe meant to change
+ * the flow of control so that it
+ * encounters the true probe.
+ */
+ is_enabled = 1;
+ } else if (probe->ftp_argmap == NULL) {
+ dtrace_probe(probe->ftp_id, rp->fixreg[3],
+ rp->fixreg[4], rp->fixreg[5], rp->fixreg[6],
+ rp->fixreg[7]);
+ } else {
+ uintptr_t t[5];
+
+ fasttrap_usdt_args(probe, rp,
+ sizeof (t) / sizeof (t[0]), t);
+
+ dtrace_probe(probe->ftp_id, t[0], t[1],
+ t[2], t[3], t[4]);
+ }
+ }
+ }
+
+ /*
+ * We're about to do a bunch of work so we cache a local copy of
+ * the tracepoint to emulate the instruction, and then find the
+ * tracepoint again later if we need to light up any return probes.
+ */
+ tp_local = *tp;
+ rm_runlock(&fasttrap_tp_lock, &tracker);
+ tp = &tp_local;
+
+ /*
+ * If there's an is-enabled probe connected to this tracepoint it
+ * means that there was a 'xor r3, r3, r3'
+ * instruction that was placed there by DTrace when the binary was
+ * linked. As this probe is, in fact, enabled, we need to stuff 1
+ * into R3. Accordingly, we can bypass all the instruction
+ * emulation logic since we know the inevitable result. It's possible
+ * that a user could construct a scenario where the 'is-enabled'
+ * probe was on some other instruction, but that would be a rather
+ * exotic way to shoot oneself in the foot.
+ */
+ if (is_enabled) {
+ rp->fixreg[3] = 1;
+ new_pc = rp->pc + 4;
+ goto done;
+ }
+
+
+ switch (tp->ftt_type) {
+ case FASTTRAP_T_NOP:
+ new_pc = rp->pc + 4;
+ break;
+ case FASTTRAP_T_BC:
+ if (!fasttrap_branch_taken(tp->ftt_bo, tp->ftt_bi, rp))
+ break;
+ /* FALLTHROUGH */
+ case FASTTRAP_T_B:
+ if (tp->ftt_instr & 0x01)
+ rp->lr = rp->pc + 4;
+ new_pc = tp->ftt_dest;
+ break;
+ case FASTTRAP_T_BLR:
+ case FASTTRAP_T_BCTR:
+ if (!fasttrap_branch_taken(tp->ftt_bo, tp->ftt_bi, rp))
+ break;
+ /* FALLTHROUGH */
+ if (tp->ftt_type == FASTTRAP_T_BCTR)
+ new_pc = rp->ctr;
+ else
+ new_pc = rp->lr;
+ if (tp->ftt_instr & 0x01)
+ rp->lr = rp->pc + 4;
+ break;
+ case FASTTRAP_T_COMMON:
+ break;
+ };
+done:
+ /*
+ * If there were no return probes when we first found the tracepoint,
+ * we should feel no obligation to honor any return probes that were
+ * subsequently enabled -- they'll just have to wait until the next
+ * time around.
+ */
+ if (tp->ftt_retids != NULL) {
+ /*
+ * We need to wait until the results of the instruction are
+ * apparent before invoking any return probes. If this
+ * instruction was emulated we can just call
+ * fasttrap_return_common(); if it needs to be executed, we
+ * need to wait until the user thread returns to the kernel.
+ */
+ if (tp->ftt_type != FASTTRAP_T_COMMON) {
+ fasttrap_return_common(rp, pc, pid, new_pc);
+ } else {
+ ASSERT(curthread->t_dtrace_ret != 0);
+ ASSERT(curthread->t_dtrace_pc == pc);
+ ASSERT(curthread->t_dtrace_scrpc != 0);
+ ASSERT(new_pc == curthread->t_dtrace_astpc);
+ }
+ }
+
+ rp->pc = new_pc;
+ set_regs(curthread, rp);
+
+ return (0);
+}
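The is-enabled handling above supports the application-side pattern where a USDT consumer guards expensive argument preparation behind a PROVIDER_PROBE_ENABLED() test that compiles down to the 'xor r3, r3, r3' site described in the comment. The mock below shows that control flow with the dtrace(1)-generated macros replaced by stand-ins (the provider name, probe name, and helper are hypothetical), so it compiles and runs without DTrace.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Stand-ins for the dtrace -h generated MYPROV_REQUEST*() macros. */
static int myprov_request_enabled;      /* stand-in for the probe being armed */
#define MYPROV_REQUEST_ENABLED()    (myprov_request_enabled)
#define MYPROV_REQUEST(arg)         printf("probe fired: %s\n", (arg))

static void
handle_request(const char *url)
{
    if (MYPROV_REQUEST_ENABLED()) {
        char *decoded = strdup(url);    /* stands in for costly argument prep */

        MYPROV_REQUEST(decoded);
        free(decoded);
    }
}

int
main(void)
{
    handle_request("/index.html");      /* probe disabled: block skipped */
    myprov_request_enabled = 1;         /* what fasttrap's emulation effects */
    handle_request("/index.html");      /* probe enabled: fires */
    return (0);
}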
+
+int
+fasttrap_return_probe(struct reg *rp)
+{
+ proc_t *p = curproc;
+ uintptr_t pc = curthread->t_dtrace_pc;
+ uintptr_t npc = curthread->t_dtrace_npc;
+
+ curthread->t_dtrace_pc = 0;
+ curthread->t_dtrace_npc = 0;
+ curthread->t_dtrace_scrpc = 0;
+ curthread->t_dtrace_astpc = 0;
+
+ /*
+ * We set rp->pc to the address of the traced instruction so
+ * that it appears to dtrace_probe() that we're on the original
+ * instruction, and so that the user can't easily detect our
+ * complex web of lies. dtrace_return_probe() (our caller)
+ * will correctly set %pc after we return.
+ */
+ rp->pc = pc;
+
+ fasttrap_return_common(rp, pc, p->p_pid, npc);
+
+ return (0);
+}
+
Property changes on: trunk/sys/cddl/contrib/opensolaris/uts/powerpc/dtrace/fasttrap_isa.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/cddl/contrib/opensolaris/uts/powerpc/sys/fasttrap_isa.h
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/powerpc/sys/fasttrap_isa.h (rev 0)
+++ trunk/sys/cddl/contrib/opensolaris/uts/powerpc/sys/fasttrap_isa.h 2018-06-01 22:46:41 UTC (rev 10164)
@@ -0,0 +1,77 @@
+/* $MidnightBSD$ */
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/* Portions Copyright 2013 Justin Hibbits */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _FASTTRAP_ISA_H
+#define _FASTTRAP_ISA_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define FASTTRAP_SUNWDTRACE_SIZE 64
+#define FASTTRAP_INSTR 0x0FFFDDDD
+
+typedef uint32_t fasttrap_instr_t;
+
+typedef struct fasttrap_machtp_t {
+ fasttrap_instr_t ftmt_instr; /* original instruction */
+ uintptr_t ftmt_dest; /* branch target */
+ uint8_t ftmt_type; /* emulation type */
+ uint8_t ftmt_flags; /* emulation flags */
+ uint8_t ftmt_bo; /* BO field */
+ uint8_t ftmt_bi; /* BI field (CR bit) */
+} fasttrap_machtp_t;
+
+#define ftt_instr ftt_mtp.ftmt_instr
+#define ftt_dest ftt_mtp.ftmt_dest
+#define ftt_type ftt_mtp.ftmt_type
+#define ftt_flags ftt_mtp.ftmt_flags
+#define ftt_bo ftt_mtp.ftmt_bo
+#define ftt_bi ftt_mtp.ftmt_bi
+
+#define FASTTRAP_T_COMMON 0x00
+#define FASTTRAP_T_B 0x01
+#define FASTTRAP_T_BC 0x02
+#define FASTTRAP_T_BLR 0x03
+#define FASTTRAP_T_BCTR 0x04
+#define FASTTRAP_T_NOP 0x05
+
+#define FASTTRAP_AFRAMES 3
+#define FASTTRAP_RETURN_AFRAMES 4
+#define FASTTRAP_ENTRY_AFRAMES 3
+#define FASTTRAP_OFFSET_AFRAMES 3
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _FASTTRAP_ISA_H */
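The header above follows the usual fasttrap layering: the machine-independent tracepoint embeds a machine-dependent fasttrap_machtp_t, and the ftt_* macros let common code use one spelling for the MD fields on every architecture. A compilable sketch of that pattern, with the surrounding MI fields reduced to a single stand-in (only the embedded struct and the macro aliases mirror the header):

#include <stdint.h>
#include <stdio.h>

typedef uint32_t fasttrap_instr_t;

typedef struct fasttrap_machtp {
    fasttrap_instr_t ftmt_instr;        /* original instruction */
    uintptr_t        ftmt_dest;         /* branch target */
    uint8_t          ftmt_type;         /* emulation type */
} fasttrap_machtp_t;

typedef struct tracepoint {
    uintptr_t          ftt_pc;          /* stand-in for the MI fields */
    fasttrap_machtp_t  ftt_mtp;         /* MD payload */
} tracepoint_t;

#define ftt_instr   ftt_mtp.ftmt_instr
#define ftt_dest    ftt_mtp.ftmt_dest
#define ftt_type    ftt_mtp.ftmt_type

int
main(void)
{
    tracepoint_t tp = { .ftt_pc = 0x10000100 };

    tp.ftt_instr = 0x48000020;          /* expands to tp.ftt_mtp.ftmt_instr */
    tp.ftt_dest = tp.ftt_pc + 0x20;
    printf("instr=0x%08x dest=0x%08lx\n", tp.ftt_instr,
        (unsigned long)tp.ftt_dest);
    return (0);
}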
Property changes on: trunk/sys/cddl/contrib/opensolaris/uts/powerpc/sys/fasttrap_isa.h
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Modified: trunk/sys/cddl/contrib/opensolaris/uts/sparc/dtrace/fasttrap_isa.c
===================================================================
--- trunk/sys/cddl/contrib/opensolaris/uts/sparc/dtrace/fasttrap_isa.c 2018-06-01 22:42:24 UTC (rev 10163)
+++ trunk/sys/cddl/contrib/opensolaris/uts/sparc/dtrace/fasttrap_isa.c 2018-06-01 22:46:41 UTC (rev 10164)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*
* CDDL HEADER START
*
@@ -24,8 +25,6 @@
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/fasttrap_isa.h>
#include <sys/fasttrap_impl.h>
#include <sys/dtrace.h>
@@ -1410,7 +1409,7 @@
value = dtrace_getreg_win(reg, 1);
dtrace_interrupt_enable(cookie);
- atomic_add_64(&fasttrap_getreg_fast_cnt, 1);
+ atomic_inc_64(&fasttrap_getreg_fast_cnt);
return (value);
}
@@ -1435,7 +1434,7 @@
if ((long)mpcb->mpcb_spbuf[i] != rp->r_sp)
continue;
- atomic_add_64(&fasttrap_getreg_mpcb_cnt, 1);
+ atomic_inc_64(&fasttrap_getreg_mpcb_cnt);
return (rwin[i].rw_local[reg - 16]);
} while (i > 0);
}
@@ -1455,7 +1454,7 @@
if ((long)mpcb->mpcb_spbuf[i] != rp->r_sp)
continue;
- atomic_add_64(&fasttrap_getreg_mpcb_cnt, 1);
+ atomic_inc_64(&fasttrap_getreg_mpcb_cnt);
return (rwin[i].rw_local[reg - 16]);
} while (i > 0);
}
@@ -1466,7 +1465,7 @@
v32[0] = 0;
}
- atomic_add_64(&fasttrap_getreg_slow_cnt, 1);
+ atomic_inc_64(&fasttrap_getreg_slow_cnt);
return (value);
err:
@@ -1505,7 +1504,7 @@
if (dtrace_getotherwin() > 0) {
dtrace_putreg_win(reg, value);
dtrace_interrupt_enable(cookie);
- atomic_add_64(&fasttrap_putreg_fast_cnt, 1);
+ atomic_inc_64(&fasttrap_putreg_fast_cnt);
return;
}
dtrace_interrupt_enable(cookie);
@@ -1536,7 +1535,7 @@
continue;
rwin[i].rw_local[reg - 16] = value;
- atomic_add_64(&fasttrap_putreg_mpcb_cnt, 1);
+ atomic_inc_64(&fasttrap_putreg_mpcb_cnt);
return;
} while (i > 0);
}
@@ -1549,7 +1548,7 @@
rwin[mpcb->mpcb_wbcnt].rw_local[reg - 16] = value;
mpcb->mpcb_spbuf[mpcb->mpcb_wbcnt] = (caddr_t)rp->r_sp;
mpcb->mpcb_wbcnt++;
- atomic_add_64(&fasttrap_putreg_mpcb_cnt, 1);
+ atomic_inc_64(&fasttrap_putreg_mpcb_cnt);
return;
}
} else {
@@ -1567,7 +1566,7 @@
continue;
rwin[i].rw_local[reg - 16] = v32;
- atomic_add_64(&fasttrap_putreg_mpcb_cnt, 1);
+ atomic_inc_64(&fasttrap_putreg_mpcb_cnt);
return;
} while (i > 0);
}
@@ -1580,12 +1579,12 @@
rwin[mpcb->mpcb_wbcnt].rw_local[reg - 16] = v32;
mpcb->mpcb_spbuf[mpcb->mpcb_wbcnt] = (caddr_t)rp->r_sp;
mpcb->mpcb_wbcnt++;
- atomic_add_64(&fasttrap_putreg_mpcb_cnt, 1);
+ atomic_inc_64(&fasttrap_putreg_mpcb_cnt);
return;
}
}
- atomic_add_64(&fasttrap_putreg_slow_cnt, 1);
+ atomic_inc_64(&fasttrap_putreg_slow_cnt);
return;
err:
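The sparc hunks above only swap atomic_add_64(&cnt, 1) for atomic_inc_64(&cnt); both perform the same lock-free increment of a 64-bit statistics counter, the latter simply naming the add-one case. A userland analogue using C11 atomics (the counter name is invented):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic uint64_t fast_cnt;

static void
bump_add(void)
{
    atomic_fetch_add(&fast_cnt, 1);     /* like atomic_add_64(&cnt, 1) */
}

static void
bump_inc(void)
{
    atomic_fetch_add(&fast_cnt, 1);     /* like atomic_inc_64(&cnt) */
}

int
main(void)
{
    bump_add();
    bump_inc();
    printf("fast_cnt = %llu\n", (unsigned long long)fast_cnt);  /* 2 */
    return (0);
}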