[Midnightbsd-cvs] src [9952] trunk/sys/kern: sync with freebsd
laffer1 at midnightbsd.org
Sat May 26 10:24:53 EDT 2018
Revision: 9952
http://svnweb.midnightbsd.org/src/?rev=9952
Author: laffer1
Date: 2018-05-26 10:24:52 -0400 (Sat, 26 May 2018)
Log Message:
-----------
sync with freebsd
Modified Paths:
--------------
trunk/sys/kern/vfs_aio.c
trunk/sys/kern/vfs_bio.c
trunk/sys/kern/vfs_cache.c
trunk/sys/kern/vfs_cluster.c
trunk/sys/kern/vfs_default.c
trunk/sys/kern/vfs_export.c
trunk/sys/kern/vfs_extattr.c
trunk/sys/kern/vfs_hash.c
trunk/sys/kern/vfs_init.c
trunk/sys/kern/vfs_lookup.c
trunk/sys/kern/vfs_mount.c
trunk/sys/kern/vfs_mountroot.c
trunk/sys/kern/vfs_subr.c
trunk/sys/kern/vfs_syscalls.c
Modified: trunk/sys/kern/vfs_aio.c
===================================================================
--- trunk/sys/kern/vfs_aio.c 2018-05-25 21:07:58 UTC (rev 9951)
+++ trunk/sys/kern/vfs_aio.c 2018-05-26 14:24:52 UTC (rev 9952)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1997 John S. Dyson. All rights reserved.
*
@@ -19,7 +20,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/vfs_aio.c 304739 2016-08-24 09:20:27Z kib $");
#include "opt_compat.h"
@@ -28,7 +29,7 @@
#include <sys/malloc.h>
#include <sys/bio.h>
#include <sys/buf.h>
-#include <sys/capability.h>
+#include <sys/capsicum.h>
#include <sys/eventhandler.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
@@ -46,6 +47,7 @@
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/protosw.h>
+#include <sys/rwlock.h>
#include <sys/sema.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
@@ -58,10 +60,12 @@
#include <sys/conf.h>
#include <sys/event.h>
#include <sys/mount.h>
+#include <geom/geom.h>
#include <machine/atomic.h>
#include <vm/vm.h>
+#include <vm/vm_page.h>
#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
@@ -231,9 +235,10 @@
int jobstate; /* (b) job state */
int inputcharge; /* (*) input blockes */
int outputcharge; /* (*) output blockes */
- struct buf *bp; /* (*) private to BIO backend,
- * buffer pointer
- */
+ struct bio *bp; /* (*) BIO backend BIO pointer */
+ struct buf *pbuf; /* (*) BIO backend buffer pointer */
+ struct vm_page *pages[btoc(MAXPHYS)+1]; /* BIO backend pages */
+ int npages; /* BIO backend number of pages */
struct proc *userproc; /* (*) user process */
struct ucred *cred; /* (*) active credential when created */
struct file *fd_file; /* (*) pointer to file structure */
@@ -242,7 +247,6 @@
struct knlist klist; /* (a) list of knotes */
struct aiocb uaiocb; /* (*) kernel I/O control block */
ksiginfo_t ksi; /* (a) realtime signal info */
- struct task biotask; /* (*) private to BIO backend */
uint64_t seqno; /* (*) job number */
int pending; /* (a) number of pending I/O, aio_fsync only */
};
@@ -337,15 +341,16 @@
void aio_init_aioinfo(struct proc *p);
static int aio_onceonly(void);
static int aio_free_entry(struct aiocblist *aiocbe);
-static void aio_process(struct aiocblist *aiocbe);
+static void aio_process_rw(struct aiocblist *aiocbe);
+static void aio_process_sync(struct aiocblist *aiocbe);
+static void aio_process_mlock(struct aiocblist *aiocbe);
static int aio_newproc(int *);
int aio_aqueue(struct thread *td, struct aiocb *job,
struct aioliojob *lio, int type, struct aiocb_ops *ops);
-static void aio_physwakeup(struct buf *bp);
+static void aio_physwakeup(struct bio *bp);
static void aio_proc_rundown(void *arg, struct proc *p);
static void aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp);
static int aio_qphysio(struct proc *p, struct aiocblist *iocb);
-static void biohelper(void *, int);
static void aio_daemon(void *param);
static void aio_swake_cb(struct socket *, struct sockbuf *);
static int aio_unload(void);
@@ -424,6 +429,7 @@
SYSCALL_INIT_HELPER(aio_cancel),
SYSCALL_INIT_HELPER(aio_error),
SYSCALL_INIT_HELPER(aio_fsync),
+ SYSCALL_INIT_HELPER(aio_mlock),
SYSCALL_INIT_HELPER(aio_read),
SYSCALL_INIT_HELPER(aio_return),
SYSCALL_INIT_HELPER(aio_suspend),
@@ -451,6 +457,7 @@
SYSCALL32_INIT_HELPER(freebsd32_aio_cancel),
SYSCALL32_INIT_HELPER(freebsd32_aio_error),
SYSCALL32_INIT_HELPER(freebsd32_aio_fsync),
+ SYSCALL32_INIT_HELPER(freebsd32_aio_mlock),
SYSCALL32_INIT_HELPER(freebsd32_aio_read),
SYSCALL32_INIT_HELPER(freebsd32_aio_write),
SYSCALL32_INIT_HELPER(freebsd32_aio_waitcomplete),
@@ -700,7 +707,8 @@
* at open time, but this is already true of file descriptors in
* a multithreaded process.
*/
- fdrop(aiocbe->fd_file, curthread);
+ if (aiocbe->fd_file)
+ fdrop(aiocbe->fd_file, curthread);
crfree(aiocbe->cred);
uma_zfree(aiocb_zone, aiocbe);
AIO_LOCK(ki);
@@ -835,17 +843,15 @@
aio_fsync_vnode(struct thread *td, struct vnode *vp)
{
struct mount *mp;
- int vfslocked;
int error;
- vfslocked = VFS_LOCK_GIANT(vp->v_mount);
if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
goto drop;
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
if (vp->v_object != NULL) {
- VM_OBJECT_LOCK(vp->v_object);
+ VM_OBJECT_WLOCK(vp->v_object);
vm_object_page_clean(vp->v_object, 0, 0, 0);
- VM_OBJECT_UNLOCK(vp->v_object);
+ VM_OBJECT_WUNLOCK(vp->v_object);
}
error = VOP_FSYNC(vp, MNT_WAIT, td);
@@ -852,20 +858,19 @@
VOP_UNLOCK(vp, 0);
vn_finished_write(mp);
drop:
- VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
- * The AIO processing activity. This is the code that does the I/O request for
- * the non-physio version of the operations. The normal vn operations are used,
- * and this code should work in all instances for every type of file, including
- * pipes, sockets, fifos, and regular files.
+ * The AIO processing activity for LIO_READ/LIO_WRITE. This is the code that
+ * does the I/O request for the non-physio version of the operations. The
+ * normal vn operations are used, and this code should work in all instances
+ * for every type of file, including pipes, sockets, fifos, and regular files.
*
* XXX I don't think it works well for socket, pipe, and fifo.
*/
static void
-aio_process(struct aiocblist *aiocbe)
+aio_process_rw(struct aiocblist *aiocbe)
{
struct ucred *td_savedcred;
struct thread *td;
@@ -879,6 +884,10 @@
int oublock_st, oublock_end;
int inblock_st, inblock_end;
+ KASSERT(aiocbe->uaiocb.aio_lio_opcode == LIO_READ ||
+ aiocbe->uaiocb.aio_lio_opcode == LIO_WRITE,
+ ("%s: opcode %d", __func__, aiocbe->uaiocb.aio_lio_opcode));
+
td = curthread;
td_savedcred = td->td_ucred;
td->td_ucred = aiocbe->cred;
@@ -885,17 +894,6 @@
cb = &aiocbe->uaiocb;
fp = aiocbe->fd_file;
- if (cb->aio_lio_opcode == LIO_SYNC) {
- error = 0;
- cnt = 0;
- if (fp->f_vnode != NULL)
- error = aio_fsync_vnode(td, fp->f_vnode);
- cb->_aiocb_private.error = error;
- cb->_aiocb_private.status = 0;
- td->td_ucred = td_savedcred;
- return;
- }
-
aiov.iov_base = (void *)(uintptr_t)cb->aio_buf;
aiov.iov_len = cb->aio_nbytes;
@@ -956,6 +954,41 @@
}
static void
+aio_process_sync(struct aiocblist *aiocbe)
+{
+ struct thread *td = curthread;
+ struct ucred *td_savedcred = td->td_ucred;
+ struct aiocb *cb = &aiocbe->uaiocb;
+ struct file *fp = aiocbe->fd_file;
+ int error = 0;
+
+ KASSERT(aiocbe->uaiocb.aio_lio_opcode == LIO_SYNC,
+ ("%s: opcode %d", __func__, aiocbe->uaiocb.aio_lio_opcode));
+
+ td->td_ucred = aiocbe->cred;
+ if (fp->f_vnode != NULL)
+ error = aio_fsync_vnode(td, fp->f_vnode);
+ cb->_aiocb_private.error = error;
+ cb->_aiocb_private.status = 0;
+ td->td_ucred = td_savedcred;
+}
+
+static void
+aio_process_mlock(struct aiocblist *aiocbe)
+{
+ struct aiocb *cb = &aiocbe->uaiocb;
+ int error;
+
+ KASSERT(aiocbe->uaiocb.aio_lio_opcode == LIO_MLOCK,
+ ("%s: opcode %d", __func__, aiocbe->uaiocb.aio_lio_opcode));
+
+ error = vm_mlock(aiocbe->userproc, aiocbe->cred,
+ __DEVOLATILE(void *, cb->aio_buf), cb->aio_nbytes);
+ cb->_aiocb_private.error = error;
+ cb->_aiocb_private.status = 0;
+}
+
+static void
aio_bio_done_notify(struct proc *userp, struct aiocblist *aiocbe, int type)
{
struct aioliojob *lj;
@@ -1026,7 +1059,7 @@
}
/*
- * The AIO daemon, most of the actual work is done in aio_process,
+ * The AIO daemon, most of the actual work is done in aio_process_*,
* but the setup (and address space mgmt) is done in this routine.
*/
static void
@@ -1123,7 +1156,18 @@
ki = userp->p_aioinfo;
/* Do the I/O function. */
- aio_process(aiocbe);
+ switch(aiocbe->uaiocb.aio_lio_opcode) {
+ case LIO_READ:
+ case LIO_WRITE:
+ aio_process_rw(aiocbe);
+ break;
+ case LIO_SYNC:
+ aio_process_sync(aiocbe);
+ break;
+ case LIO_MLOCK:
+ aio_process_mlock(aiocbe);
+ break;
+ }
mtx_lock(&aio_job_mtx);
/* Decrement the active job count. */
@@ -1252,122 +1296,139 @@
{
struct aiocb *cb;
struct file *fp;
- struct buf *bp;
+ struct bio *bp;
+ struct buf *pbuf;
struct vnode *vp;
struct cdevsw *csw;
struct cdev *dev;
struct kaioinfo *ki;
struct aioliojob *lj;
- int error, ref;
+ int error, ref, unmap, poff;
+ vm_prot_t prot;
cb = &aiocbe->uaiocb;
fp = aiocbe->fd_file;
- if (fp->f_type != DTYPE_VNODE)
+ if (fp == NULL || fp->f_type != DTYPE_VNODE)
return (-1);
vp = fp->f_vnode;
-
- /*
- * If its not a disk, we don't want to return a positive error.
- * It causes the aio code to not fall through to try the thread
- * way when you're talking to a regular file.
- */
- if (!vn_isdisk(vp, &error)) {
- if (error == ENOTBLK)
- return (-1);
- else
- return (error);
- }
-
+ if (vp->v_type != VCHR)
+ return (-1);
if (vp->v_bufobj.bo_bsize == 0)
return (-1);
-
- if (cb->aio_nbytes % vp->v_bufobj.bo_bsize)
+ if (cb->aio_nbytes % vp->v_bufobj.bo_bsize)
return (-1);
- if (cb->aio_nbytes >
- MAXPHYS - (((vm_offset_t) cb->aio_buf) & PAGE_MASK))
- return (-1);
-
- ki = p->p_aioinfo;
- if (ki->kaio_buffer_count >= ki->kaio_ballowed_count)
- return (-1);
-
ref = 0;
csw = devvn_refthread(vp, &dev, &ref);
if (csw == NULL)
return (ENXIO);
+
+ if ((csw->d_flags & D_DISK) == 0) {
+ error = -1;
+ goto unref;
+ }
if (cb->aio_nbytes > dev->si_iosize_max) {
error = -1;
goto unref;
}
- /* Create and build a buffer header for a transfer. */
- bp = (struct buf *)getpbuf(NULL);
- BUF_KERNPROC(bp);
+ ki = p->p_aioinfo;
+ poff = (vm_offset_t)cb->aio_buf & PAGE_MASK;
+ unmap = ((dev->si_flags & SI_UNMAPPED) && unmapped_buf_allowed);
+ if (unmap) {
+ if (cb->aio_nbytes > MAXPHYS) {
+ error = -1;
+ goto unref;
+ }
+ } else {
+ if (cb->aio_nbytes > MAXPHYS - poff) {
+ error = -1;
+ goto unref;
+ }
+ if (ki->kaio_buffer_count >= ki->kaio_ballowed_count) {
+ error = -1;
+ goto unref;
+ }
+ }
+ aiocbe->bp = bp = g_alloc_bio();
+ if (!unmap) {
+ aiocbe->pbuf = pbuf = (struct buf *)getpbuf(NULL);
+ BUF_KERNPROC(pbuf);
+ } else
+ pbuf = NULL;
AIO_LOCK(ki);
ki->kaio_count++;
- ki->kaio_buffer_count++;
+ if (!unmap)
+ ki->kaio_buffer_count++;
lj = aiocbe->lio;
if (lj)
lj->lioj_count++;
+ TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist);
+ TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist);
+ aiocbe->jobstate = JOBST_JOBQBUF;
+ cb->_aiocb_private.status = cb->aio_nbytes;
AIO_UNLOCK(ki);
- /*
- * Get a copy of the kva from the physical buffer.
- */
- error = 0;
+ bp->bio_length = cb->aio_nbytes;
+ bp->bio_bcount = cb->aio_nbytes;
+ bp->bio_done = aio_physwakeup;
+ bp->bio_data = (void *)(uintptr_t)cb->aio_buf;
+ bp->bio_offset = cb->aio_offset;
+ bp->bio_cmd = cb->aio_lio_opcode == LIO_WRITE ? BIO_WRITE : BIO_READ;
+ bp->bio_dev = dev;
+ bp->bio_caller1 = (void *)aiocbe;
- bp->b_bcount = cb->aio_nbytes;
- bp->b_bufsize = cb->aio_nbytes;
- bp->b_iodone = aio_physwakeup;
- bp->b_saveaddr = bp->b_data;
- bp->b_data = (void *)(uintptr_t)cb->aio_buf;
- bp->b_offset = cb->aio_offset;
- bp->b_iooffset = cb->aio_offset;
- bp->b_blkno = btodb(cb->aio_offset);
- bp->b_iocmd = cb->aio_lio_opcode == LIO_WRITE ? BIO_WRITE : BIO_READ;
-
- /*
- * Bring buffer into kernel space.
- */
- if (vmapbuf(bp) < 0) {
+ prot = VM_PROT_READ;
+ if (cb->aio_lio_opcode == LIO_READ)
+ prot |= VM_PROT_WRITE; /* Less backwards than it looks */
+ if ((aiocbe->npages = vm_fault_quick_hold_pages(
+ &curproc->p_vmspace->vm_map,
+ (vm_offset_t)bp->bio_data, bp->bio_length, prot, aiocbe->pages,
+ sizeof(aiocbe->pages)/sizeof(aiocbe->pages[0]))) < 0) {
error = EFAULT;
goto doerror;
}
+ if (!unmap) {
+ pmap_qenter((vm_offset_t)pbuf->b_data,
+ aiocbe->pages, aiocbe->npages);
+ bp->bio_data = pbuf->b_data + poff;
+ } else {
+ bp->bio_ma = aiocbe->pages;
+ bp->bio_ma_n = aiocbe->npages;
+ bp->bio_ma_offset = poff;
+ bp->bio_data = unmapped_buf;
+ bp->bio_flags |= BIO_UNMAPPED;
+ }
- AIO_LOCK(ki);
- aiocbe->bp = bp;
- bp->b_caller1 = (void *)aiocbe;
- TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist);
- TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist);
- aiocbe->jobstate = JOBST_JOBQBUF;
- cb->_aiocb_private.status = cb->aio_nbytes;
- AIO_UNLOCK(ki);
-
atomic_add_int(&num_queue_count, 1);
- atomic_add_int(&num_buf_aio, 1);
+ if (!unmap)
+ atomic_add_int(&num_buf_aio, 1);
- bp->b_error = 0;
-
- TASK_INIT(&aiocbe->biotask, 0, biohelper, aiocbe);
-
/* Perform transfer. */
- dev_strategy_csw(dev, csw, bp);
+ csw->d_strategy(bp);
dev_relthread(dev, ref);
return (0);
doerror:
AIO_LOCK(ki);
+ aiocbe->jobstate = JOBST_NULL;
+ TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
+ TAILQ_REMOVE(&ki->kaio_all, aiocbe, allist);
ki->kaio_count--;
- ki->kaio_buffer_count--;
+ if (!unmap)
+ ki->kaio_buffer_count--;
if (lj)
lj->lioj_count--;
+ AIO_UNLOCK(ki);
+ if (pbuf) {
+ relpbuf(pbuf, NULL);
+ aiocbe->pbuf = NULL;
+ }
+ g_destroy_bio(bp);
aiocbe->bp = NULL;
- AIO_UNLOCK(ki);
- relpbuf(bp, NULL);
unref:
dev_relthread(dev, ref);
return (error);
@@ -1522,9 +1583,10 @@
*/
int
aio_aqueue(struct thread *td, struct aiocb *job, struct aioliojob *lj,
- int type, struct aiocb_ops *ops)
+ int type, struct aiocb_ops *ops)
{
struct proc *p = td->td_proc;
+ cap_rights_t rights;
struct file *fp;
struct socket *so;
struct aiocblist *aiocbe, *cb;
@@ -1553,8 +1615,6 @@
}
aiocbe = uma_zalloc(aiocb_zone, M_WAITOK | M_ZERO);
- aiocbe->inputcharge = 0;
- aiocbe->outputcharge = 0;
knlist_init_mtx(&aiocbe->klist, AIO_MTX(ki));
error = ops->copyin(job, &aiocbe->uaiocb);
@@ -1564,6 +1624,12 @@
return (error);
}
+ /* XXX: aio_nbytes is later casted to signed types. */
+ if (aiocbe->uaiocb.aio_nbytes > INT_MAX) {
+ uma_zfree(aiocb_zone, aiocbe);
+ return (EINVAL);
+ }
+
if (aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT &&
aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_SIGNAL &&
aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_THREAD_ID &&
@@ -1601,16 +1667,21 @@
fd = aiocbe->uaiocb.aio_fildes;
switch (opcode) {
case LIO_WRITE:
- error = fget_write(td, fd, CAP_WRITE | CAP_SEEK, &fp);
+ error = fget_write(td, fd,
+ cap_rights_init(&rights, CAP_PWRITE), &fp);
break;
case LIO_READ:
- error = fget_read(td, fd, CAP_READ | CAP_SEEK, &fp);
+ error = fget_read(td, fd,
+ cap_rights_init(&rights, CAP_PREAD), &fp);
break;
case LIO_SYNC:
- error = fget(td, fd, CAP_FSYNC, &fp);
+ error = fget(td, fd, cap_rights_init(&rights, CAP_FSYNC), &fp);
break;
+ case LIO_MLOCK:
+ fp = NULL;
+ break;
case LIO_NOP:
- error = fget(td, fd, 0, &fp);
+ error = fget(td, fd, cap_rights_init(&rights), &fp);
break;
default:
error = EINVAL;
@@ -1666,7 +1737,8 @@
error = kqfd_register(kqfd, &kev, td, 1);
aqueue_fail:
if (error) {
- fdrop(fp, td);
+ if (fp)
+ fdrop(fp, td);
uma_zfree(aiocb_zone, aiocbe);
ops->store_error(job, error);
goto done;
@@ -1683,7 +1755,7 @@
if (opcode == LIO_SYNC)
goto queueit;
- if (fp->f_type == DTYPE_SOCKET) {
+ if (fp && fp->f_type == DTYPE_SOCKET) {
/*
* Alternate queueing for socket ops: Reach down into the
* descriptor to get the socket data. Then check to see if the
@@ -1734,8 +1806,6 @@
}
#endif
queueit:
- /* No buffer for daemon I/O. */
- aiocbe->bp = NULL;
atomic_add_int(&num_queue_count, 1);
AIO_LOCK(ki);
@@ -1997,7 +2067,7 @@
struct vnode *vp;
/* Lookup file object. */
- error = fget(td, uap->fd, 0, &fp);
+ error = fget(td, uap->fd, NULL, &fp);
if (error)
return (error);
@@ -2161,6 +2231,13 @@
return (aio_aqueue(td, uap->aiocbp, NULL, LIO_WRITE, &aiocb_ops));
}
+int
+sys_aio_mlock(struct thread *td, struct aio_mlock_args *uap)
+{
+
+ return (aio_aqueue(td, uap->aiocbp, NULL, LIO_MLOCK, &aiocb_ops));
+}
+
static int
kern_lio_listio(struct thread *td, int mode, struct aiocb * const *uacb_list,
struct aiocb **acb_list, int nent, struct sigevent *sig,
@@ -2365,54 +2442,43 @@
return (error);
}
-/*
- * Called from interrupt thread for physio, we should return as fast
- * as possible, so we schedule a biohelper task.
- */
static void
-aio_physwakeup(struct buf *bp)
+aio_physwakeup(struct bio *bp)
{
- struct aiocblist *aiocbe;
-
- aiocbe = (struct aiocblist *)bp->b_caller1;
- taskqueue_enqueue(taskqueue_aiod_bio, &aiocbe->biotask);
-}
-
-/*
- * Task routine to perform heavy tasks, process wakeup, and signals.
- */
-static void
-biohelper(void *context, int pending)
-{
- struct aiocblist *aiocbe = context;
- struct buf *bp;
+ struct aiocblist *aiocbe = (struct aiocblist *)bp->bio_caller1;
struct proc *userp;
struct kaioinfo *ki;
int nblks;
+ /* Release mapping into kernel space. */
+ if (aiocbe->pbuf) {
+ pmap_qremove((vm_offset_t)aiocbe->pbuf->b_data, aiocbe->npages);
+ relpbuf(aiocbe->pbuf, NULL);
+ aiocbe->pbuf = NULL;
+ atomic_subtract_int(&num_buf_aio, 1);
+ }
+ vm_page_unhold_pages(aiocbe->pages, aiocbe->npages);
+
bp = aiocbe->bp;
+ aiocbe->bp = NULL;
userp = aiocbe->userproc;
ki = userp->p_aioinfo;
AIO_LOCK(ki);
- aiocbe->uaiocb._aiocb_private.status -= bp->b_resid;
+ aiocbe->uaiocb._aiocb_private.status -= bp->bio_resid;
aiocbe->uaiocb._aiocb_private.error = 0;
- if (bp->b_ioflags & BIO_ERROR)
- aiocbe->uaiocb._aiocb_private.error = bp->b_error;
+ if (bp->bio_flags & BIO_ERROR)
+ aiocbe->uaiocb._aiocb_private.error = bp->bio_error;
nblks = btodb(aiocbe->uaiocb.aio_nbytes);
if (aiocbe->uaiocb.aio_lio_opcode == LIO_WRITE)
aiocbe->outputcharge += nblks;
else
aiocbe->inputcharge += nblks;
- aiocbe->bp = NULL;
TAILQ_REMOVE(&userp->p_aioinfo->kaio_bufqueue, aiocbe, plist);
ki->kaio_buffer_count--;
aio_bio_done_notify(userp, aiocbe, DONE_BUF);
AIO_UNLOCK(ki);
- /* Release mapping into kernel space. */
- vunmapbuf(bp);
- relpbuf(bp, NULL);
- atomic_subtract_int(&num_buf_aio, 1);
+ g_destroy_bio(bp);
}
/* syscall - wait for the next completion of an aio request */
@@ -2503,14 +2569,9 @@
kern_aio_fsync(struct thread *td, int op, struct aiocb *aiocbp,
struct aiocb_ops *ops)
{
- struct proc *p = td->td_proc;
- struct kaioinfo *ki;
if (op != O_SYNC) /* XXX lack of O_DSYNC */
return (EINVAL);
- ki = p->p_aioinfo;
- if (ki == NULL)
- aio_init_aioinfo(p);
return (aio_aqueue(td, aiocbp, NULL, LIO_SYNC, ops));
}
@@ -2696,31 +2757,6 @@
}
static int
-convert_sigevent32(struct sigevent32 *sig32, struct sigevent *sig)
-{
-
- CP(*sig32, *sig, sigev_notify);
- switch (sig->sigev_notify) {
- case SIGEV_NONE:
- break;
- case SIGEV_THREAD_ID:
- CP(*sig32, *sig, sigev_notify_thread_id);
- /* FALLTHROUGH */
- case SIGEV_SIGNAL:
- CP(*sig32, *sig, sigev_signo);
- break;
- case SIGEV_KEVENT:
- CP(*sig32, *sig, sigev_notify_kqueue);
- CP(*sig32, *sig, sigev_notify_kevent_flags);
- PTRIN_CP(*sig32, *sig, sigev_value.sival_ptr);
- break;
- default:
- return (EINVAL);
- }
- return (0);
-}
-
-static int
aiocb32_copyin(struct aiocb *ujob, struct aiocb *kjob)
{
struct aiocb32 job32;
@@ -2903,6 +2939,14 @@
}
int
+freebsd32_aio_mlock(struct thread *td, struct freebsd32_aio_mlock_args *uap)
+{
+
+ return (aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_MLOCK,
+ &aiocb32_ops));
+}
+
+int
freebsd32_aio_waitcomplete(struct thread *td,
struct freebsd32_aio_waitcomplete_args *uap)
{
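For context, the vfs_aio.c hunks above wire up a new LIO_MLOCK opcode, an aio_process_mlock() handler, and the aio_mlock()/freebsd32_aio_mlock() syscall entry points. A minimal userland sketch of how such a request is driven, assuming the matching aio_mlock() libc wrapper and the standard <aio.h> declarations (illustration only, not part of this commit; error handling abbreviated):

/*
 * Sketch: queue an asynchronous mlock request for a buffer and wait
 * for it to complete, using the usual POSIX AIO completion calls.
 */
#include <aio.h>
#include <err.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
        static char region[4 * 4096];           /* pages to be wired */
        struct aiocb iocb;
        const struct aiocb *list[1];
        int error;

        memset(&iocb, 0, sizeof(iocb));
        iocb.aio_buf = region;                  /* start of the range */
        iocb.aio_nbytes = sizeof(region);       /* length of the range */

        if (aio_mlock(&iocb) != 0)              /* queue the wiring request */
                err(1, "aio_mlock");

        list[0] = &iocb;
        if (aio_suspend(list, 1, NULL) != 0)    /* wait for completion */
                err(1, "aio_suspend");

        error = aio_error(&iocb);               /* 0 on success */
        printf("aio_mlock status: %s\n", error == 0 ? "ok" : strerror(error));
        return (aio_return(&iocb) == -1);       /* reap the request */
}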
Modified: trunk/sys/kern/vfs_bio.c
===================================================================
--- trunk/sys/kern/vfs_bio.c 2018-05-25 21:07:58 UTC (rev 9951)
+++ trunk/sys/kern/vfs_bio.c 2018-05-26 14:24:52 UTC (rev 9952)
@@ -1,8 +1,13 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 2004 Poul-Henning Kamp
* Copyright (c) 1994,1997 John S. Dyson
+ * Copyright (c) 2013 The FreeBSD Foundation
* All rights reserved.
*
+ * Portions of this software were developed by Konstantin Belousov
+ * under sponsorship from the FreeBSD Foundation.
+ *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
@@ -39,7 +44,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/vfs_bio.c 307672 2016-10-20 13:12:19Z kib $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -58,7 +63,9 @@
#include <sys/kthread.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
+#include <sys/rwlock.h>
#include <sys/sysctl.h>
+#include <sys/vmem.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>
#include <geom/geom.h>
@@ -71,7 +78,6 @@
#include <vm/vm_extern.h>
#include <vm/vm_map.h>
#include "opt_compat.h"
-#include "opt_directio.h"
#include "opt_swap.h"
static MALLOC_DEFINE(M_BIOBUF, "biobuf", "BIO buffer");
@@ -91,8 +97,10 @@
* carnal knowledge of buffers. This knowledge should be moved to vfs_bio.c.
*/
struct buf *buf; /* buffer header pool */
+caddr_t unmapped_buf;
-static struct proc *bufdaemonproc;
+/* Used below and for softdep flushing threads in ufs/ffs/ffs_softdep.c */
+struct proc *bufdaemonproc;
static int inmem(struct vnode *vp, daddr_t blkno);
static void vm_hold_free_pages(struct buf *bp, int newbsize);
@@ -101,16 +109,16 @@
static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, vm_page_t m);
static void vfs_page_set_validclean(struct buf *bp, vm_ooffset_t off,
vm_page_t m);
-static void vfs_drain_busy_pages(struct buf *bp);
static void vfs_clean_pages_dirty_buf(struct buf *bp);
static void vfs_setdirty_locked_object(struct buf *bp);
static void vfs_vmio_release(struct buf *bp);
static int vfs_bio_clcheck(struct vnode *vp, int size,
daddr_t lblkno, daddr_t blkno);
-static int buf_do_flush(struct vnode *vp);
+static int buf_flush(struct vnode *vp, int);
static int flushbufqueues(struct vnode *, int, int);
static void buf_daemon(void);
static void bremfreel(struct buf *bp);
+static __inline void bd_wakeup(void);
#if defined(COMPAT_FREEBSD4) || defined(COMPAT_FREEBSD5) || \
defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD7)
static int sysctl_bufspace(SYSCTL_HANDLER_ARGS);
@@ -131,6 +139,10 @@
SYSCTL_LONG(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, &bufspace, 0,
"Virtual memory used for buffers");
#endif
+static long unmapped_bufspace;
+SYSCTL_LONG(_vfs, OID_AUTO, unmapped_bufspace, CTLFLAG_RD,
+ &unmapped_bufspace, 0,
+ "Amount of unmapped buffers, inclusive in the bufspace");
static long maxbufspace;
SYSCTL_LONG(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RD, &maxbufspace, 0,
"Maximum allowed value of bufspace (including buf_daemon)");
@@ -199,18 +211,56 @@
"Number of calls to getnewbuf");
static int getnewbufrestarts;
SYSCTL_INT(_vfs, OID_AUTO, getnewbufrestarts, CTLFLAG_RW, &getnewbufrestarts, 0,
- "Number of times getnewbuf has had to restart a buffer aquisition");
+ "Number of times getnewbuf has had to restart a buffer acquisition");
+static int mappingrestarts;
+SYSCTL_INT(_vfs, OID_AUTO, mappingrestarts, CTLFLAG_RW, &mappingrestarts, 0,
+ "Number of times getblk has had to restart a buffer mapping for "
+ "unmapped buffer");
static int flushbufqtarget = 100;
SYSCTL_INT(_vfs, OID_AUTO, flushbufqtarget, CTLFLAG_RW, &flushbufqtarget, 0,
"Amount of work to do in flushbufqueues when helping bufdaemon");
-static long notbufdflashes;
-SYSCTL_LONG(_vfs, OID_AUTO, notbufdflashes, CTLFLAG_RD, &notbufdflashes, 0,
+static long notbufdflushes;
+SYSCTL_LONG(_vfs, OID_AUTO, notbufdflushes, CTLFLAG_RD, &notbufdflushes, 0,
"Number of dirty buffer flushes done by the bufdaemon helpers");
static long barrierwrites;
SYSCTL_LONG(_vfs, OID_AUTO, barrierwrites, CTLFLAG_RW, &barrierwrites, 0,
"Number of barrier writes");
+SYSCTL_INT(_vfs, OID_AUTO, unmapped_buf_allowed, CTLFLAG_RD,
+ &unmapped_buf_allowed, 0,
+ "Permit the use of the unmapped i/o");
/*
+ * Lock for the non-dirty bufqueues
+ */
+static struct mtx_padalign bqclean;
+
+/*
+ * Lock for the dirty queue.
+ */
+static struct mtx_padalign bqdirty;
+
+/*
+ * This lock synchronizes access to bd_request.
+ */
+static struct mtx_padalign bdlock;
+
+/*
+ * This lock protects the runningbufreq and synchronizes runningbufwakeup and
+ * waitrunningbufspace().
+ */
+static struct mtx_padalign rbreqlock;
+
+/*
+ * Lock that protects needsbuffer and the sleeps/wakeups surrounding it.
+ */
+static struct rwlock_padalign nblock;
+
+/*
+ * Lock that protects bdirtywait.
+ */
+static struct mtx_padalign bdirtylock;
+
+/*
* Wakeup point for bufdaemon, as well as indicator of whether it is already
* active. Set to 1 when the bufdaemon is already "on" the queue, 0 when it
* is idling.
@@ -226,11 +276,6 @@
static int bd_speedupreq;
/*
- * This lock synchronizes access to bd_request.
- */
-static struct mtx bdlock;
-
-/*
* bogus page -- for I/O to/from partially complete buffers
* this is a temporary solution to the problem, but it is not
* really that bad. it would be better to split the buffer
@@ -246,45 +291,38 @@
*/
static int runningbufreq;
-/*
- * This lock protects the runningbufreq and synchronizes runningbufwakeup and
- * waitrunningbufspace().
- */
-static struct mtx rbreqlock;
-
/*
* Synchronization (sleep/wakeup) variable for buffer requests.
* Can contain the VFS_BIO_NEED flags defined below; setting/clearing is done
* by and/or.
- * Used in numdirtywakeup(), bufspacewakeup(), bufcountwakeup(), bwillwrite(),
+ * Used in numdirtywakeup(), bufspacewakeup(), bufcountadd(), bwillwrite(),
* getnewbuf(), and getblk().
*/
-static int needsbuffer;
+static volatile int needsbuffer;
/*
- * Lock that protects needsbuffer and the sleeps/wakeups surrounding it.
+ * Synchronization for bwillwrite() waiters.
*/
-static struct mtx nblock;
+static int bdirtywait;
/*
* Definitions for the buffer free lists.
*/
-#define BUFFER_QUEUES 6 /* number of free buffer queues */
+#define BUFFER_QUEUES 5 /* number of free buffer queues */
#define QUEUE_NONE 0 /* on no queue */
#define QUEUE_CLEAN 1 /* non-B_DELWRI buffers */
#define QUEUE_DIRTY 2 /* B_DELWRI buffers */
-#define QUEUE_DIRTY_GIANT 3 /* B_DELWRI buffers that need giant */
-#define QUEUE_EMPTYKVA 4 /* empty buffer headers w/KVA assignment */
-#define QUEUE_EMPTY 5 /* empty buffer headers */
+#define QUEUE_EMPTYKVA 3 /* empty buffer headers w/KVA assignment */
+#define QUEUE_EMPTY 4 /* empty buffer headers */
#define QUEUE_SENTINEL 1024 /* not an queue index, but mark for sentinel */
/* Queues for free buffers with various properties */
static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } };
+#ifdef INVARIANTS
+static int bq_len[BUFFER_QUEUES];
+#endif
-/* Lock for the bufqueues */
-static struct mtx bqlock;
-
/*
* Single global constant for BUF_WMESG, to avoid getting multiple references.
* buf_wmesg is referred from macros.
@@ -292,7 +330,6 @@
const char *buf_wmesg = BUF_WMESG;
#define VFS_BIO_NEED_ANY 0x01 /* any freeable buffer */
-#define VFS_BIO_NEED_DIRTYFLUSH 0x02 /* waiting for dirty buffer flush */
#define VFS_BIO_NEED_FREE 0x04 /* wait for free bufs, hi hysteresis */
#define VFS_BIO_NEED_BUFSPACE 0x08 /* wait for buf space, lo hysteresis */
@@ -315,31 +352,71 @@
}
#endif
-#ifdef DIRECTIO
-extern void ffs_rawread_setup(void);
-#endif /* DIRECTIO */
/*
- * numdirtywakeup:
+ * bqlock:
*
- * If someone is blocked due to there being too many dirty buffers,
- * and numdirtybuffers is now reasonable, wake them up.
+ * Return the appropriate queue lock based on the index.
*/
+static inline struct mtx *
+bqlock(int qindex)
+{
-static __inline void
-numdirtywakeup(int level)
+ if (qindex == QUEUE_DIRTY)
+ return (struct mtx *)(&bqdirty);
+ return (struct mtx *)(&bqclean);
+}
+
+/*
+ * bdirtywakeup:
+ *
+ * Wakeup any bwillwrite() waiters.
+ */
+static void
+bdirtywakeup(void)
{
-
- if (numdirtybuffers <= level) {
- mtx_lock(&nblock);
- if (needsbuffer & VFS_BIO_NEED_DIRTYFLUSH) {
- needsbuffer &= ~VFS_BIO_NEED_DIRTYFLUSH;
- wakeup(&needsbuffer);
- }
- mtx_unlock(&nblock);
+ mtx_lock(&bdirtylock);
+ if (bdirtywait) {
+ bdirtywait = 0;
+ wakeup(&bdirtywait);
}
+ mtx_unlock(&bdirtylock);
}
/*
+ * bdirtysub:
+ *
+ * Decrement the numdirtybuffers count by one and wakeup any
+ * threads blocked in bwillwrite().
+ */
+static void
+bdirtysub(void)
+{
+
+ if (atomic_fetchadd_int(&numdirtybuffers, -1) ==
+ (lodirtybuffers + hidirtybuffers) / 2)
+ bdirtywakeup();
+}
+
+/*
+ * bdirtyadd:
+ *
+ * Increment the numdirtybuffers count by one and wakeup the buf
+ * daemon if needed.
+ */
+static void
+bdirtyadd(void)
+{
+
+ /*
+ * Only do the wakeup once as we cross the boundary. The
+ * buf daemon will keep running until the condition clears.
+ */
+ if (atomic_fetchadd_int(&numdirtybuffers, 1) ==
+ (lodirtybuffers + hidirtybuffers) / 2)
+ bd_wakeup();
+}
+
+/*
* bufspacewakeup:
*
* Called when buffer space is potentially available for recovery.
@@ -351,6 +428,7 @@
static __inline void
bufspacewakeup(void)
{
+ int need_wakeup, on;
/*
* If someone is waiting for BUF space, wake them up. Even
@@ -357,36 +435,70 @@
* though we haven't freed the kva space yet, the waiting
* process will be able to now.
*/
- mtx_lock(&nblock);
- if (needsbuffer & VFS_BIO_NEED_BUFSPACE) {
- needsbuffer &= ~VFS_BIO_NEED_BUFSPACE;
- wakeup(&needsbuffer);
+ rw_rlock(&nblock);
+ for (;;) {
+ need_wakeup = 0;
+ on = needsbuffer;
+ if ((on & VFS_BIO_NEED_BUFSPACE) == 0)
+ break;
+ need_wakeup = 1;
+ if (atomic_cmpset_rel_int(&needsbuffer, on,
+ on & ~VFS_BIO_NEED_BUFSPACE))
+ break;
}
- mtx_unlock(&nblock);
+ if (need_wakeup)
+ wakeup(__DEVOLATILE(void *, &needsbuffer));
+ rw_runlock(&nblock);
}
/*
- * runningbufwakeup() - in-progress I/O accounting.
+ * runningwakeup:
*
+ * Wake up processes that are waiting on asynchronous writes to fall
+ * below lorunningspace.
*/
+static void
+runningwakeup(void)
+{
+
+ mtx_lock(&rbreqlock);
+ if (runningbufreq) {
+ runningbufreq = 0;
+ wakeup(&runningbufreq);
+ }
+ mtx_unlock(&rbreqlock);
+}
+
+/*
+ * runningbufwakeup:
+ *
+ * Decrement the outstanding write count according.
+ */
void
runningbufwakeup(struct buf *bp)
{
+ long space, bspace;
- if (bp->b_runningbufspace) {
- atomic_subtract_long(&runningbufspace, bp->b_runningbufspace);
- bp->b_runningbufspace = 0;
- mtx_lock(&rbreqlock);
- if (runningbufreq && runningbufspace <= lorunningspace) {
- runningbufreq = 0;
- wakeup(&runningbufreq);
- }
- mtx_unlock(&rbreqlock);
- }
+ bspace = bp->b_runningbufspace;
+ if (bspace == 0)
+ return;
+ space = atomic_fetchadd_long(&runningbufspace, -bspace);
+ KASSERT(space >= bspace, ("runningbufspace underflow %ld %ld",
+ space, bspace));
+ bp->b_runningbufspace = 0;
+ /*
+ * Only acquire the lock and wakeup on the transition from exceeding
+ * the threshold to falling below it.
+ */
+ if (space < lorunningspace)
+ return;
+ if (space - bspace > lorunningspace)
+ return;
+ runningwakeup();
}
/*
- * bufcountwakeup:
+ * bufcountadd:
*
* Called when a buffer has been added to one of the free queues to
* account for the buffer and to wakeup anyone waiting for free buffers.
@@ -393,31 +505,60 @@
* This typically occurs when large amounts of metadata are being handled
* by the buffer cache ( else buffer space runs out first, usually ).
*/
-
static __inline void
-bufcountwakeup(struct buf *bp)
+bufcountadd(struct buf *bp)
{
- int old;
+ int mask, need_wakeup, old, on;
- KASSERT((bp->b_vflags & BV_INFREECNT) == 0,
+ KASSERT((bp->b_flags & B_INFREECNT) == 0,
("buf %p already counted as free", bp));
- if (bp->b_bufobj != NULL)
- mtx_assert(BO_MTX(bp->b_bufobj), MA_OWNED);
- bp->b_vflags |= BV_INFREECNT;
+ bp->b_flags |= B_INFREECNT;
old = atomic_fetchadd_int(&numfreebuffers, 1);
KASSERT(old >= 0 && old < nbuf,
("numfreebuffers climbed to %d", old + 1));
- mtx_lock(&nblock);
- if (needsbuffer) {
- needsbuffer &= ~VFS_BIO_NEED_ANY;
- if (numfreebuffers >= hifreebuffers)
- needsbuffer &= ~VFS_BIO_NEED_FREE;
- wakeup(&needsbuffer);
+ mask = VFS_BIO_NEED_ANY;
+ if (numfreebuffers >= hifreebuffers)
+ mask |= VFS_BIO_NEED_FREE;
+ rw_rlock(&nblock);
+ for (;;) {
+ need_wakeup = 0;
+ on = needsbuffer;
+ if (on == 0)
+ break;
+ need_wakeup = 1;
+ if (atomic_cmpset_rel_int(&needsbuffer, on, on & ~mask))
+ break;
}
- mtx_unlock(&nblock);
+ if (need_wakeup)
+ wakeup(__DEVOLATILE(void *, &needsbuffer));
+ rw_runlock(&nblock);
}
/*
+ * bufcountsub:
+ *
+ * Decrement the numfreebuffers count as needed.
+ */
+static void
+bufcountsub(struct buf *bp)
+{
+ int old;
+
+ /*
+ * Fixup numfreebuffers count. If the buffer is invalid or not
+ * delayed-write, the buffer was free and we must decrement
+ * numfreebuffers.
+ */
+ if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) {
+ KASSERT((bp->b_flags & B_INFREECNT) != 0,
+ ("buf %p not counted in numfreebuffers", bp));
+ bp->b_flags &= ~B_INFREECNT;
+ old = atomic_fetchadd_int(&numfreebuffers, -1);
+ KASSERT(old > 0, ("numfreebuffers dropped to %d", old - 1));
+ }
+}
+
+/*
* waitrunningbufspace()
*
* runningbufspace is a measure of the amount of I/O currently
@@ -425,9 +566,6 @@
* prevent creating huge backups of pending writes to a device.
* Only asynchronous writes are governed by this function.
*
- * Reads will adjust runningbufspace, but will not block based on it.
- * The read load has a side effect of reducing the allowed write load.
- *
* This does NOT turn an async write into a sync write. It waits
* for earlier writes to complete and generally returns before the
* caller's write has reached the device.
@@ -438,7 +576,7 @@
mtx_lock(&rbreqlock);
while (runningbufspace > hirunningspace) {
- ++runningbufreq;
+ runningbufreq = 1;
msleep(&runningbufreq, &rbreqlock, PVM, "wdrain", 0);
}
mtx_unlock(&rbreqlock);
@@ -459,7 +597,7 @@
vm_page_t m)
{
- VM_OBJECT_LOCK_ASSERT(m->object, MA_OWNED);
+ VM_OBJECT_ASSERT_LOCKED(m->object);
if (bp->b_flags & B_CACHE) {
int base = (foff + off) & PAGE_MASK;
if (vm_page_is_valid(m, base, size) == 0)
@@ -468,13 +606,12 @@
}
/* Wake up the buffer daemon if necessary */
-static __inline
-void
-bd_wakeup(int dirtybuflevel)
+static __inline void
+bd_wakeup(void)
{
mtx_lock(&bdlock);
- if (bd_request == 0 && numdirtybuffers >= dirtybuflevel) {
+ if (bd_request == 0) {
bd_request = 1;
wakeup(&bd_request);
}
@@ -484,7 +621,6 @@
/*
* bd_speedup - speedup the buffer cache flushing code
*/
-
void
bd_speedup(void)
{
@@ -501,6 +637,16 @@
mtx_unlock(&bdlock);
}
+#ifndef NSWBUF_MIN
+#define NSWBUF_MIN 16
+#endif
+
+#ifdef __i386__
+#define TRANSIENT_DENOM 5
+#else
+#define TRANSIENT_DENOM 10
+#endif
+
/*
* Calculating buffer cache scaling values and reserve space for buffer
* headers. This is called during low level kernel initialization and
@@ -511,7 +657,7 @@
kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est)
{
int tuned_nbuf;
- long maxbuf;
+ long maxbuf, maxbuf_sz, buf_sz, biotmap_sz;
/*
* physmem_est is in pages. Convert it to kilobytes (assumes
@@ -537,7 +683,8 @@
nbuf += min((physmem_est - 4096) / factor,
65536 / factor);
if (physmem_est > 65536)
- nbuf += (physmem_est - 65536) * 2 / (factor * 5);
+ nbuf += min((physmem_est - 65536) * 2 / (factor * 5),
+ 32 * 1024 * 1024 / (factor * 5));
if (maxbcache && nbuf > maxbcache / BKVASIZE)
nbuf = maxbcache / BKVASIZE;
@@ -555,17 +702,60 @@
}
/*
+ * Ideal allocation size for the transient bio submap if 10%
+ * of the maximal space buffer map. This roughly corresponds
+ * to the amount of the buffer mapped for typical UFS load.
+ *
+ * Clip the buffer map to reserve space for the transient
+ * BIOs, if its extent is bigger than 90% (80% on i386) of the
+ * maximum buffer map extent on the platform.
+ *
+ * The fall-back to the maxbuf in case of maxbcache unset,
+ * allows to not trim the buffer KVA for the architectures
+ * with ample KVA space.
+ */
+ if (bio_transient_maxcnt == 0 && unmapped_buf_allowed) {
+ maxbuf_sz = maxbcache != 0 ? maxbcache : maxbuf * BKVASIZE;
+ buf_sz = (long)nbuf * BKVASIZE;
+ if (buf_sz < maxbuf_sz / TRANSIENT_DENOM *
+ (TRANSIENT_DENOM - 1)) {
+ /*
+ * There is more KVA than memory. Do not
+ * adjust buffer map size, and assign the rest
+ * of maxbuf to transient map.
+ */
+ biotmap_sz = maxbuf_sz - buf_sz;
+ } else {
+ /*
+ * Buffer map spans all KVA we could afford on
+ * this platform. Give 10% (20% on i386) of
+ * the buffer map to the transient bio map.
+ */
+ biotmap_sz = buf_sz / TRANSIENT_DENOM;
+ buf_sz -= biotmap_sz;
+ }
+ if (biotmap_sz / INT_MAX > MAXPHYS)
+ bio_transient_maxcnt = INT_MAX;
+ else
+ bio_transient_maxcnt = biotmap_sz / MAXPHYS;
+ /*
+ * Artificially limit to 1024 simultaneous in-flight I/Os
+ * using the transient mapping.
+ */
+ if (bio_transient_maxcnt > 1024)
+ bio_transient_maxcnt = 1024;
+ if (tuned_nbuf)
+ nbuf = buf_sz / BKVASIZE;
+ }
+
+ /*
* swbufs are used as temporary holders for I/O, such as paging I/O.
* We have no less then 16 and no more then 256.
*/
- nswbuf = max(min(nbuf/4, 256), 16);
-#ifdef NSWBUF_MIN
+ nswbuf = min(nbuf / 4, 256);
+ TUNABLE_INT_FETCH("kern.nswbuf", &nswbuf);
if (nswbuf < NSWBUF_MIN)
nswbuf = NSWBUF_MIN;
-#endif
-#ifdef DIRECTIO
- ffs_rawread_setup();
-#endif
/*
* Reserve space for the buffer cache buffers
@@ -585,10 +775,13 @@
struct buf *bp;
int i;
- mtx_init(&bqlock, "buf queue lock", NULL, MTX_DEF);
+ CTASSERT(MAXBCACHEBUF >= MAXBSIZE);
+ mtx_init(&bqclean, "bufq clean lock", NULL, MTX_DEF);
+ mtx_init(&bqdirty, "bufq dirty lock", NULL, MTX_DEF);
mtx_init(&rbreqlock, "runningbufspace lock", NULL, MTX_DEF);
- mtx_init(&nblock, "needsbuffer lock", NULL, MTX_DEF);
+ rw_init(&nblock, "needsbuffer lock");
mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF);
+ mtx_init(&bdirtylock, "dirty buf lock", NULL, MTX_DEF);
/* next, make a null set of free lists */
for (i = 0; i < BUFFER_QUEUES; i++)
@@ -598,15 +791,17 @@
for (i = 0; i < nbuf; i++) {
bp = &buf[i];
bzero(bp, sizeof *bp);
- bp->b_flags = B_INVAL; /* we're just an empty header */
+ bp->b_flags = B_INVAL | B_INFREECNT;
bp->b_rcred = NOCRED;
bp->b_wcred = NOCRED;
bp->b_qindex = QUEUE_EMPTY;
- bp->b_vflags = BV_INFREECNT; /* buf is counted as free */
bp->b_xflags = 0;
LIST_INIT(&bp->b_dep);
BUF_LOCKINIT(bp);
TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
+#ifdef INVARIANTS
+ bq_len[QUEUE_EMPTY]++;
+#endif
}
/*
@@ -622,8 +817,8 @@
* by the system.
*/
maxbufspace = (long)nbuf * BKVASIZE;
- hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - MAXBSIZE * 10);
- lobufspace = hibufspace - MAXBSIZE;
+ hibufspace = lmax(3 * maxbufspace / 4, maxbufspace - MAXBCACHEBUF * 10);
+ lobufspace = hibufspace - MAXBCACHEBUF;
/*
* Note: The 16 MiB upper limit for hirunningspace was chosen
@@ -633,9 +828,9 @@
* The lower 1 MiB limit is the historical upper limit for
* hirunningspace.
*/
- hirunningspace = lmax(lmin(roundup(hibufspace / 64, MAXBSIZE),
+ hirunningspace = lmax(lmin(roundup(hibufspace / 64, MAXBCACHEBUF),
16 * 1024 * 1024), 1024 * 1024);
- lorunningspace = roundup((hirunningspace * 2) / 3, MAXBSIZE);
+ lorunningspace = roundup((hirunningspace * 2) / 3, MAXBCACHEBUF);
/*
* Limit the amount of malloc memory since it is wired permanently into
@@ -675,8 +870,57 @@
bogus_page = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ |
VM_ALLOC_NORMAL | VM_ALLOC_WIRED);
+ unmapped_buf = (caddr_t)kva_alloc(MAXPHYS);
}
+#ifdef INVARIANTS
+static inline void
+vfs_buf_check_mapped(struct buf *bp)
+{
+
+ KASSERT((bp->b_flags & B_UNMAPPED) == 0,
+ ("mapped buf %p %x", bp, bp->b_flags));
+ KASSERT(bp->b_kvabase != unmapped_buf,
+ ("mapped buf: b_kvabase was not updated %p", bp));
+ KASSERT(bp->b_data != unmapped_buf,
+ ("mapped buf: b_data was not updated %p", bp));
+}
+
+static inline void
+vfs_buf_check_unmapped(struct buf *bp)
+{
+
+ KASSERT((bp->b_flags & B_UNMAPPED) == B_UNMAPPED,
+ ("unmapped buf %p %x", bp, bp->b_flags));
+ KASSERT(bp->b_kvabase == unmapped_buf,
+ ("unmapped buf: corrupted b_kvabase %p", bp));
+ KASSERT(bp->b_data == unmapped_buf,
+ ("unmapped buf: corrupted b_data %p", bp));
+}
+
+#define BUF_CHECK_MAPPED(bp) vfs_buf_check_mapped(bp)
+#define BUF_CHECK_UNMAPPED(bp) vfs_buf_check_unmapped(bp)
+#else
+#define BUF_CHECK_MAPPED(bp) do {} while (0)
+#define BUF_CHECK_UNMAPPED(bp) do {} while (0)
+#endif
+
+static void
+bpmap_qenter(struct buf *bp)
+{
+
+ BUF_CHECK_MAPPED(bp);
+
+ /*
+ * bp->b_data is relative to bp->b_offset, but
+ * bp->b_offset may be offset into the first page.
+ */
+ bp->b_data = (caddr_t)trunc_page((vm_offset_t)bp->b_data);
+ pmap_qenter((vm_offset_t)bp->b_data, bp->b_pages, bp->b_npages);
+ bp->b_data = (caddr_t)((vm_offset_t)bp->b_data |
+ (vm_offset_t)(bp->b_offset & PAGE_MASK));
+}
+
/*
* bfreekva() - free the kva allocation for a buffer.
*
@@ -686,26 +930,85 @@
bfreekva(struct buf *bp)
{
- if (bp->b_kvasize) {
- atomic_add_int(&buffreekvacnt, 1);
- atomic_subtract_long(&bufspace, bp->b_kvasize);
- vm_map_remove(buffer_map, (vm_offset_t) bp->b_kvabase,
- (vm_offset_t) bp->b_kvabase + bp->b_kvasize);
- bp->b_kvasize = 0;
- bufspacewakeup();
+ if (bp->b_kvasize == 0)
+ return;
+
+ atomic_add_int(&buffreekvacnt, 1);
+ atomic_subtract_long(&bufspace, bp->b_kvasize);
+ if ((bp->b_flags & B_UNMAPPED) == 0) {
+ BUF_CHECK_MAPPED(bp);
+ vmem_free(buffer_arena, (vm_offset_t)bp->b_kvabase,
+ bp->b_kvasize);
+ } else {
+ BUF_CHECK_UNMAPPED(bp);
+ if ((bp->b_flags & B_KVAALLOC) != 0) {
+ vmem_free(buffer_arena, (vm_offset_t)bp->b_kvaalloc,
+ bp->b_kvasize);
+ }
+ atomic_subtract_long(&unmapped_bufspace, bp->b_kvasize);
+ bp->b_flags &= ~(B_UNMAPPED | B_KVAALLOC);
}
+ bp->b_kvasize = 0;
+ bufspacewakeup();
}
/*
+ * binsfree:
+ *
+ * Insert the buffer into the appropriate free list.
+ */
+static void
+binsfree(struct buf *bp, int qindex)
+{
+ struct mtx *olock, *nlock;
+
+ BUF_ASSERT_XLOCKED(bp);
+
+ nlock = bqlock(qindex);
+ /* Handle delayed bremfree() processing. */
+ if (bp->b_flags & B_REMFREE) {
+ olock = bqlock(bp->b_qindex);
+ mtx_lock(olock);
+ bremfreel(bp);
+ if (olock != nlock) {
+ mtx_unlock(olock);
+ mtx_lock(nlock);
+ }
+ } else
+ mtx_lock(nlock);
+
+ if (bp->b_qindex != QUEUE_NONE)
+ panic("binsfree: free buffer onto another queue???");
+
+ bp->b_qindex = qindex;
+ if (bp->b_flags & B_AGE)
+ TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
+ else
+ TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
+#ifdef INVARIANTS
+ bq_len[bp->b_qindex]++;
+#endif
+ mtx_unlock(nlock);
+
+ /*
+ * Something we can maybe free or reuse.
+ */
+ if (bp->b_bufsize && !(bp->b_flags & B_DELWRI))
+ bufspacewakeup();
+
+ if ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI))
+ bufcountadd(bp);
+}
+
+/*
* bremfree:
*
- * Mark the buffer for removal from the appropriate free list in brelse.
+ * Mark the buffer for removal from the appropriate free list.
*
*/
void
bremfree(struct buf *bp)
{
- int old;
CTR3(KTR_BUF, "bremfree(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
KASSERT((bp->b_flags & B_REMFREE) == 0,
@@ -712,19 +1015,10 @@
("bremfree: buffer %p already marked for delayed removal.", bp));
KASSERT(bp->b_qindex != QUEUE_NONE,
("bremfree: buffer %p not on a queue.", bp));
- BUF_ASSERT_HELD(bp);
+ BUF_ASSERT_XLOCKED(bp);
bp->b_flags |= B_REMFREE;
- /* Fixup numfreebuffers count. */
- if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) {
- KASSERT((bp->b_vflags & BV_INFREECNT) != 0,
- ("buf %p not counted in numfreebuffers", bp));
- if (bp->b_bufobj != NULL)
- mtx_assert(BO_MTX(bp->b_bufobj), MA_OWNED);
- bp->b_vflags &= ~BV_INFREECNT;
- old = atomic_fetchadd_int(&numfreebuffers, -1);
- KASSERT(old > 0, ("numfreebuffers dropped to %d", old - 1));
- }
+ bufcountsub(bp);
}
/*
@@ -736,9 +1030,12 @@
void
bremfreef(struct buf *bp)
{
- mtx_lock(&bqlock);
+ struct mtx *qlock;
+
+ qlock = bqlock(bp->b_qindex);
+ mtx_lock(qlock);
bremfreel(bp);
- mtx_unlock(&bqlock);
+ mtx_unlock(qlock);
}
/*
@@ -745,21 +1042,25 @@
* bremfreel:
*
* Removes a buffer from the free list, must be called with the
- * bqlock held.
+ * correct qlock held.
*/
static void
bremfreel(struct buf *bp)
{
- int old;
CTR3(KTR_BUF, "bremfreel(%p) vp %p flags %X",
bp, bp->b_vp, bp->b_flags);
KASSERT(bp->b_qindex != QUEUE_NONE,
("bremfreel: buffer %p not on a queue.", bp));
- BUF_ASSERT_HELD(bp);
- mtx_assert(&bqlock, MA_OWNED);
+ BUF_ASSERT_XLOCKED(bp);
+ mtx_assert(bqlock(bp->b_qindex), MA_OWNED);
TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
+#ifdef INVARIANTS
+ KASSERT(bq_len[bp->b_qindex] >= 1, ("queue %d underflow",
+ bp->b_qindex));
+ bq_len[bp->b_qindex]--;
+#endif
bp->b_qindex = QUEUE_NONE;
/*
* If this was a delayed bremfree() we only need to remove the buffer
@@ -769,34 +1070,10 @@
bp->b_flags &= ~B_REMFREE;
return;
}
- /*
- * Fixup numfreebuffers count. If the buffer is invalid or not
- * delayed-write, the buffer was free and we must decrement
- * numfreebuffers.
- */
- if ((bp->b_flags & B_INVAL) || (bp->b_flags & B_DELWRI) == 0) {
- KASSERT((bp->b_vflags & BV_INFREECNT) != 0,
- ("buf %p not counted in numfreebuffers", bp));
- if (bp->b_bufobj != NULL)
- mtx_assert(BO_MTX(bp->b_bufobj), MA_OWNED);
- bp->b_vflags &= ~BV_INFREECNT;
- old = atomic_fetchadd_int(&numfreebuffers, -1);
- KASSERT(old > 0, ("numfreebuffers dropped to %d", old - 1));
- }
+ bufcountsub(bp);
}
/*
- * Get a buffer with the specified data.
- */
-int
-bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
- struct buf **bpp)
-{
-
- return (breadn_flags(vp, blkno, size, 0, 0, 0, cred, 0, bpp));
-}
-
-/*
* Attempt to initiate asynchronous I/O on read-ahead blocks. We must
* clear BIO_ERROR and B_INVAL prior to initiating I/O . If B_CACHE is set,
* the buffer is valid and we do not have to do anything.
@@ -833,33 +1110,7 @@
}
/*
- * Operates like bread, but with getblk flags.
- */
-int
-bread_gb(struct vnode * vp, daddr_t blkno, int cnt, struct ucred * cred,
- int gbflags, struct buf **bpp)
-{
-
- return (breadn_flags(vp, blkno, cnt, NULL, NULL, 0,
- cred, gbflags, bpp));
-}
-
-/*
- * Operates like bread, but also starts asynchronous I/O on
- * read-ahead blocks.
- */
-int
-breadn(struct vnode * vp, daddr_t blkno, int size,
- daddr_t * rablkno, int *rabsize,
- int cnt, struct ucred * cred, struct buf **bpp)
-{
-
- return (breadn_flags(vp, blkno, size, rablkno, rabsize, cnt,
- cred, 0, bpp));
-}
-
-/*
- * Entry point for bread() and breadn().
+ * Entry point for bread() and breadn() via #defines in sys/buf.h.
*
* Get a buffer with the specified data. Look in the cache first. We
* must clear BIO_ERROR and B_INVAL prior to initiating I/O. If B_CACHE
@@ -920,9 +1171,16 @@
{
int oldflags;
struct vnode *vp;
+ long space;
int vp_md;
CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
+ if ((bp->b_bufobj->bo_flag & BO_DEAD) != 0) {
+ bp->b_flags |= B_INVAL | B_RELBUF;
+ bp->b_flags &= ~B_CACHE;
+ brelse(bp);
+ return (ENXIO);
+ }
if (bp->b_flags & B_INVAL) {
brelse(bp);
return (0);
@@ -967,7 +1225,7 @@
* Normal bwrites pipeline writes
*/
bp->b_runningbufspace = bp->b_bufsize;
- atomic_add_long(&runningbufspace, bp->b_runningbufspace);
+ space = atomic_fetchadd_long(&runningbufspace, bp->b_runningbufspace);
if (!TD_IS_IDLETHREAD(curthread))
curthread->td_ru.ru_oublock++;
@@ -980,7 +1238,7 @@
int rtval = bufwait(bp);
brelse(bp);
return (rtval);
- } else {
+ } else if (space > hirunningspace) {
/*
* don't allow the async write to saturate the I/O
* system. We will not deadlock here because
@@ -1115,13 +1373,6 @@
bqrelse(bp);
/*
- * Wakeup the buffer flushing daemon if we have a lot of dirty
- * buffers (midpoint between our recovery point and our stall
- * point).
- */
- bd_wakeup((lodirtybuffers + hidirtybuffers) / 2);
-
- /*
* note: we cannot initiate I/O from a bdwrite even if we wanted to,
* due to the softdep code.
*/
@@ -1161,8 +1412,7 @@
if ((bp->b_flags & B_DELWRI) == 0) {
bp->b_flags |= /* XXX B_DONE | */ B_DELWRI;
reassignbuf(bp);
- atomic_add_int(&numdirtybuffers, 1);
- bd_wakeup((lodirtybuffers + hidirtybuffers) / 2);
+ bdirtyadd();
}
}
@@ -1190,8 +1440,7 @@
if (bp->b_flags & B_DELWRI) {
bp->b_flags &= ~B_DELWRI;
reassignbuf(bp);
- atomic_subtract_int(&numdirtybuffers, 1);
- numdirtywakeup(lodirtybuffers);
+ bdirtysub();
}
/*
* Since it is now being written, we can clear its deferred write flag.
@@ -1259,20 +1508,18 @@
* of any vnodes we attempt to avoid the situation where a locked vnode
* prevents the various system daemons from flushing related buffers.
*/
-
void
bwillwrite(void)
{
if (numdirtybuffers >= hidirtybuffers) {
- mtx_lock(&nblock);
+ mtx_lock(&bdirtylock);
while (numdirtybuffers >= hidirtybuffers) {
- bd_wakeup(1);
- needsbuffer |= VFS_BIO_NEED_DIRTYFLUSH;
- msleep(&needsbuffer, &nblock,
- (PRIBIO + 4), "flswai", 0);
+ bdirtywait = 1;
+ msleep(&bdirtywait, &bdirtylock, (PRIBIO + 4),
+ "flswai", 0);
}
- mtx_unlock(&nblock);
+ mtx_unlock(&bdirtylock);
}
}
@@ -1305,6 +1552,8 @@
void
brelse(struct buf *bp)
{
+ int qindex;
+
CTR3(KTR_BUF, "brelse(%p) vp %p flags %X",
bp, bp->b_vp, bp->b_flags);
KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
@@ -1324,6 +1573,12 @@
return;
}
+ if ((bp->b_vflags & (BV_BKGRDINPROG | BV_BKGRDERR)) == BV_BKGRDERR) {
+ BO_LOCK(bp->b_bufobj);
+ bp->b_vflags &= ~BV_BKGRDERR;
+ BO_UNLOCK(bp->b_bufobj);
+ bdirty(bp);
+ }
if (bp->b_iocmd == BIO_WRITE && (bp->b_ioflags & BIO_ERROR) &&
bp->b_error == EIO && !(bp->b_flags & B_INVAL)) {
/*
@@ -1343,10 +1598,8 @@
bp->b_flags |= B_INVAL;
if (!LIST_EMPTY(&bp->b_dep))
buf_deallocate(bp);
- if (bp->b_flags & B_DELWRI) {
- atomic_subtract_int(&numdirtybuffers, 1);
- numdirtywakeup(lodirtybuffers);
- }
+ if (bp->b_flags & B_DELWRI)
+ bdirtysub();
bp->b_flags &= ~(B_DELWRI | B_CACHE);
if ((bp->b_flags & B_VMIO) == 0) {
if (bp->b_bufsize)
@@ -1372,15 +1625,10 @@
bp->b_flags &= ~B_RELBUF;
else if (buf_vm_page_count_severe()) {
/*
- * The locking of the BO_LOCK is not necessary since
- * BKGRDINPROG cannot be set while we hold the buf
- * lock, it can only be cleared if it is already
- * pending.
+ * BKGRDINPROG can only be set with the buf and bufobj
+ * locks both held. We tolerate a race to clear it here.
*/
- if (bp->b_vp) {
- if (!(bp->b_vflags & BV_BKGRDINPROG))
- bp->b_flags |= B_RELBUF;
- } else
+ if (!(bp->b_vflags & BV_BKGRDINPROG))
bp->b_flags |= B_RELBUF;
}
@@ -1430,7 +1678,6 @@
*/
resid = bp->b_bufsize;
foff = bp->b_offset;
- VM_OBJECT_LOCK(obj);
for (i = 0; i < bp->b_npages; i++) {
int had_bogus = 0;
@@ -1444,6 +1691,7 @@
poff = OFF_TO_IDX(bp->b_offset);
had_bogus = 1;
+ VM_OBJECT_RLOCK(obj);
for (j = i; j < bp->b_npages; j++) {
vm_page_t mtmp;
mtmp = bp->b_pages[j];
@@ -1455,8 +1703,10 @@
bp->b_pages[j] = mtmp;
}
}
+ VM_OBJECT_RUNLOCK(obj);
- if ((bp->b_flags & B_INVAL) == 0) {
+ if ((bp->b_flags & (B_INVAL | B_UNMAPPED)) == 0) {
+ BUF_CHECK_MAPPED(bp);
pmap_qenter(
trunc_page((vm_offset_t)bp->b_data),
bp->b_pages, bp->b_npages);
@@ -1471,7 +1721,16 @@
(PAGE_SIZE - poffset) : resid;
KASSERT(presid >= 0, ("brelse: extra page"));
- vm_page_set_invalid(m, poffset, presid);
+ VM_OBJECT_WLOCK(obj);
+ while (vm_page_xbusied(m)) {
+ vm_page_lock(m);
+ VM_OBJECT_WUNLOCK(obj);
+ vm_page_busy_sleep(m, "mbncsh", true);
+ VM_OBJECT_WLOCK(obj);
+ }
+ if (pmap_page_wired_mappings(m) == 0)
+ vm_page_set_invalid(m, poffset, presid);
+ VM_OBJECT_WUNLOCK(obj);
if (had_bogus)
printf("avoided corruption bug in bogus_page/brelse code\n");
}
@@ -1478,7 +1737,6 @@
resid -= PAGE_SIZE - (foff & PAGE_MASK);
foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
}
- VM_OBJECT_UNLOCK(obj);
if (bp->b_flags & (B_INVAL | B_RELBUF))
vfs_vmio_release(bp);
@@ -1495,22 +1753,6 @@
brelvp(bp);
}
- /* enqueue */
- mtx_lock(&bqlock);
- /* Handle delayed bremfree() processing. */
- if (bp->b_flags & B_REMFREE) {
- struct bufobj *bo;
-
- bo = bp->b_bufobj;
- if (bo != NULL)
- BO_LOCK(bo);
- bremfreel(bp);
- if (bo != NULL)
- BO_UNLOCK(bo);
- }
- if (bp->b_qindex != QUEUE_NONE)
- panic("brelse: free buffer onto another queue???");
-
/*
* If the buffer has junk contents signal it and eventually
* clean up B_DELWRI and diassociate the vnode so that gbincore()
@@ -1531,12 +1773,11 @@
bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA);
if (bp->b_vflags & BV_BKGRDINPROG)
panic("losing buffer 1");
- if (bp->b_kvasize) {
- bp->b_qindex = QUEUE_EMPTYKVA;
- } else {
- bp->b_qindex = QUEUE_EMPTY;
- }
- TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
+ if (bp->b_kvasize)
+ qindex = QUEUE_EMPTYKVA;
+ else
+ qindex = QUEUE_EMPTY;
+ bp->b_flags |= B_AGE;
/* buffers with junk contents */
} else if (bp->b_flags & (B_INVAL | B_NOCACHE | B_RELBUF) ||
(bp->b_ioflags & BIO_ERROR)) {
@@ -1543,48 +1784,16 @@
bp->b_xflags &= ~(BX_BKGRDWRITE | BX_ALTDATA);
if (bp->b_vflags & BV_BKGRDINPROG)
panic("losing buffer 2");
- bp->b_qindex = QUEUE_CLEAN;
- TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
+ qindex = QUEUE_CLEAN;
+ bp->b_flags |= B_AGE;
/* remaining buffers */
- } else {
- if ((bp->b_flags & (B_DELWRI|B_NEEDSGIANT)) ==
- (B_DELWRI|B_NEEDSGIANT))
- bp->b_qindex = QUEUE_DIRTY_GIANT;
- else if (bp->b_flags & B_DELWRI)
- bp->b_qindex = QUEUE_DIRTY;
- else
- bp->b_qindex = QUEUE_CLEAN;
- if (bp->b_flags & B_AGE)
- TAILQ_INSERT_HEAD(&bufqueues[bp->b_qindex], bp, b_freelist);
- else
- TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
- }
- mtx_unlock(&bqlock);
+ } else if (bp->b_flags & B_DELWRI)
+ qindex = QUEUE_DIRTY;
+ else
+ qindex = QUEUE_CLEAN;
- /*
- * Fixup numfreebuffers count. The bp is on an appropriate queue
- * unless locked. We then bump numfreebuffers if it is not B_DELWRI.
- * We've already handled the B_INVAL case ( B_DELWRI will be clear
- * if B_INVAL is set ).
- */
+ binsfree(bp, qindex);
- if (!(bp->b_flags & B_DELWRI)) {
- struct bufobj *bo;
-
- bo = bp->b_bufobj;
- if (bo != NULL)
- BO_LOCK(bo);
- bufcountwakeup(bp);
- if (bo != NULL)
- BO_UNLOCK(bo);
- }
-
- /*
- * Something we can maybe free or reuse
- */
- if (bp->b_bufsize || bp->b_kvasize)
- bufspacewakeup();
-
bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF | B_DIRECT);
if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY))
panic("brelse: not dirty");
@@ -1606,7 +1815,7 @@
void
bqrelse(struct buf *bp)
{
- struct bufobj *bo;
+ int qindex;
CTR3(KTR_BUF, "bqrelse(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
@@ -1617,83 +1826,44 @@
BUF_UNLOCK(bp);
return;
}
+ bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
- bo = bp->b_bufobj;
if (bp->b_flags & B_MANAGED) {
- if (bp->b_flags & B_REMFREE) {
- mtx_lock(&bqlock);
- if (bo != NULL)
- BO_LOCK(bo);
- bremfreel(bp);
- if (bo != NULL)
- BO_UNLOCK(bo);
- mtx_unlock(&bqlock);
- }
- bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
- BUF_UNLOCK(bp);
- return;
+ if (bp->b_flags & B_REMFREE)
+ bremfreef(bp);
+ goto out;
}
- mtx_lock(&bqlock);
- /* Handle delayed bremfree() processing. */
- if (bp->b_flags & B_REMFREE) {
- if (bo != NULL)
- BO_LOCK(bo);
- bremfreel(bp);
- if (bo != NULL)
- BO_UNLOCK(bo);
- }
- if (bp->b_qindex != QUEUE_NONE)
- panic("bqrelse: free buffer onto another queue???");
/* buffers with stale but valid contents */
- if (bp->b_flags & B_DELWRI) {
- if (bp->b_flags & B_NEEDSGIANT)
- bp->b_qindex = QUEUE_DIRTY_GIANT;
- else
- bp->b_qindex = QUEUE_DIRTY;
- TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
+ if ((bp->b_flags & B_DELWRI) != 0 || (bp->b_vflags & (BV_BKGRDINPROG |
+ BV_BKGRDERR)) == BV_BKGRDERR) {
+ BO_LOCK(bp->b_bufobj);
+ bp->b_vflags &= ~BV_BKGRDERR;
+ BO_UNLOCK(bp->b_bufobj);
+ qindex = QUEUE_DIRTY;
} else {
+ if ((bp->b_flags & B_DELWRI) == 0 &&
+ (bp->b_xflags & BX_VNDIRTY))
+ panic("bqrelse: not dirty");
/*
- * The locking of the BO_LOCK for checking of the
- * BV_BKGRDINPROG is not necessary since the
- * BV_BKGRDINPROG cannot be set while we hold the buf
- * lock, it can only be cleared if it is already
- * pending.
+ * BKGRDINPROG can only be set with the buf and bufobj
+ * locks both held. We tolerate a race to clear it here.
*/
- if (!buf_vm_page_count_severe() || (bp->b_vflags & BV_BKGRDINPROG)) {
- bp->b_qindex = QUEUE_CLEAN;
- TAILQ_INSERT_TAIL(&bufqueues[QUEUE_CLEAN], bp,
- b_freelist);
- } else {
+ if (buf_vm_page_count_severe() &&
+ (bp->b_vflags & BV_BKGRDINPROG) == 0) {
/*
* We are too low on memory, we have to try to free
* the buffer (most importantly: the wired pages
* making up its backing store) *now*.
*/
- mtx_unlock(&bqlock);
brelse(bp);
return;
}
+ qindex = QUEUE_CLEAN;
}
- mtx_unlock(&bqlock);
+ binsfree(bp, qindex);
- if ((bp->b_flags & B_INVAL) || !(bp->b_flags & B_DELWRI)) {
- if (bo != NULL)
- BO_LOCK(bo);
- bufcountwakeup(bp);
- if (bo != NULL)
- BO_UNLOCK(bo);
- }
-
- /*
- * Something we can maybe free or reuse.
- */
- if (bp->b_bufsize && !(bp->b_flags & B_DELWRI))
- bufspacewakeup();
-
- bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
- if ((bp->b_flags & B_DELWRI) == 0 && (bp->b_xflags & BX_VNDIRTY))
- panic("bqrelse: not dirty");
+out:
/* unlock */
BUF_UNLOCK(bp);
}
@@ -1702,11 +1872,18 @@
static void
vfs_vmio_release(struct buf *bp)
{
+ vm_object_t obj;
+ vm_page_t m;
int i;
- vm_page_t m;
- pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages);
- VM_OBJECT_LOCK(bp->b_bufobj->bo_object);
+ if ((bp->b_flags & B_UNMAPPED) == 0) {
+ BUF_CHECK_MAPPED(bp);
+ pmap_qremove(trunc_page((vm_offset_t)bp->b_data), bp->b_npages);
+ } else
+ BUF_CHECK_UNMAPPED(bp);
+ obj = bp->b_bufobj->bo_object;
+ if (obj != NULL)
+ VM_OBJECT_WLOCK(obj);
for (i = 0; i < bp->b_npages; i++) {
m = bp->b_pages[i];
bp->b_pages[i] = NULL;
@@ -1716,29 +1893,23 @@
*/
vm_page_lock(m);
vm_page_unwire(m, 0);
+
/*
- * We don't mess with busy pages, it is
- * the responsibility of the process that
- * busied the pages to deal with them.
+ * Might as well free the page if we can and it has
+ * no valid data. We also free the page if the
+ * buffer was used for direct I/O
*/
- if ((m->oflags & VPO_BUSY) == 0 && m->busy == 0 &&
- m->wire_count == 0) {
- /*
- * Might as well free the page if we can and it has
- * no valid data. We also free the page if the
- * buffer was used for direct I/O
- */
- if ((bp->b_flags & B_ASYNC) == 0 && !m->valid) {
+ if ((bp->b_flags & B_ASYNC) == 0 && !m->valid) {
+ if (m->wire_count == 0 && !vm_page_busied(m))
vm_page_free(m);
- } else if (bp->b_flags & B_DIRECT) {
- vm_page_try_to_free(m);
- } else if (buf_vm_page_count_severe()) {
- vm_page_try_to_cache(m);
- }
- }
+ } else if (bp->b_flags & B_DIRECT)
+ vm_page_try_to_free(m);
+ else if (buf_vm_page_count_severe())
+ vm_page_try_to_cache(m);
vm_page_unlock(m);
}
- VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object);
+ if (obj != NULL)
+ VM_OBJECT_WUNLOCK(obj);
if (bp->b_bufsize) {
bufspacewakeup();
@@ -1809,8 +1980,10 @@
int nwritten;
int size;
int maxcl;
+ int gbflags;
bo = &vp->v_bufobj;
+ gbflags = (bp->b_flags & B_UNMAPPED) != 0 ? GB_UNMAPPED : 0;
/*
* right now we support clustered writing only to regular files. If
* we find a clusterable block we could be in the middle of a cluster
@@ -1823,7 +1996,7 @@
size = vp->v_mount->mnt_stat.f_iosize;
maxcl = MAXPHYS / size;
- BO_LOCK(bo);
+ BO_RLOCK(bo);
for (i = 1; i < maxcl; i++)
if (vfs_bio_clcheck(vp, size, lblkno + i,
bp->b_blkno + ((i * size) >> DEV_BSHIFT)) == 0)
@@ -1833,7 +2006,7 @@
if (vfs_bio_clcheck(vp, size, lblkno - j,
bp->b_blkno - ((j * size) >> DEV_BSHIFT)) == 0)
break;
- BO_UNLOCK(bo);
+ BO_RUNLOCK(bo);
--j;
ncl = i + j;
/*
@@ -1841,7 +2014,8 @@
*/
if (ncl != 1) {
BUF_UNLOCK(bp);
- nwritten = cluster_wbuild(vp, size, lblkno - j, ncl);
+ nwritten = cluster_wbuild(vp, size, lblkno - j, ncl,
+ gbflags);
return (nwritten);
}
}
@@ -1858,46 +2032,207 @@
return (nwritten);
}
+static void
+setbufkva(struct buf *bp, vm_offset_t addr, int maxsize, int gbflags)
+{
+
+ KASSERT((bp->b_flags & (B_UNMAPPED | B_KVAALLOC)) == 0 &&
+ bp->b_kvasize == 0, ("call bfreekva(%p)", bp));
+ if ((gbflags & GB_UNMAPPED) == 0) {
+ bp->b_kvabase = (caddr_t)addr;
+ } else if ((gbflags & GB_KVAALLOC) != 0) {
+ KASSERT((gbflags & GB_UNMAPPED) != 0,
+ ("GB_KVAALLOC without GB_UNMAPPED"));
+ bp->b_kvaalloc = (caddr_t)addr;
+ bp->b_flags |= B_UNMAPPED | B_KVAALLOC;
+ atomic_add_long(&unmapped_bufspace, bp->b_kvasize);
+ }
+ bp->b_kvasize = maxsize;
+}
+
/*
- * getnewbuf:
- *
- * Find and initialize a new buffer header, freeing up existing buffers
- * in the bufqueues as necessary. The new buffer is returned locked.
- *
- * Important: B_INVAL is not set. If the caller wishes to throw the
- * buffer away, the caller must set B_INVAL prior to calling brelse().
- *
- * We block if:
- * We have insufficient buffer headers
- * We have insufficient buffer space
- * buffer_map is too fragmented ( space reservation fails )
- * If we have to flush dirty buffers ( but we try to avoid this )
- *
- * To avoid VFS layer recursion we do not flush dirty buffers ourselves.
- * Instead we ask the buf daemon to do it for us. We attempt to
- * avoid piecemeal wakeups of the pageout daemon.
+ * Allocate the buffer KVA and set b_kvasize. Also set b_kvabase if
+ * needed.
*/
+static int
+allocbufkva(struct buf *bp, int maxsize, int gbflags)
+{
+ vm_offset_t addr;
-static struct buf *
-getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int size, int maxsize,
- int gbflags)
+ bfreekva(bp);
+ addr = 0;
+
+ if (vmem_alloc(buffer_arena, maxsize, M_BESTFIT | M_NOWAIT, &addr)) {
+ /*
+ * Buffer map is too fragmented. Request the caller
+ * to defragment the map.
+ */
+ atomic_add_int(&bufdefragcnt, 1);
+ return (1);
+ }
+ setbufkva(bp, addr, maxsize, gbflags);
+ atomic_add_long(&bufspace, bp->b_kvasize);
+ return (0);
+}
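
allocbufkva() replaces the old vm_map_findspace()/vm_map_insert() sequence on buffer_map with a vmem(9) allocation from buffer_arena. A minimal sketch of the allocate/release pairing it relies on, using only the vmem calls that appear in this patch (the release half lives in bfreekva(), which is not shown here):

    vm_offset_t addr;

    if (vmem_alloc(buffer_arena, maxsize, M_BESTFIT | M_NOWAIT, &addr) != 0)
            return (1);     /* arena too fragmented; caller must defragment */
    bp->b_kvabase = (caddr_t)addr;
    bp->b_kvasize = maxsize;
    /* ... later, when the KVA is given back ... */
    vmem_free(buffer_arena, (vm_offset_t)bp->b_kvabase, bp->b_kvasize);
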
+
+/*
+ * Ask the bufdaemon for help, or act as bufdaemon itself, when a
+ * locked vnode is supplied.
+ */
+static void
+getnewbuf_bufd_help(struct vnode *vp, int gbflags, int slpflag, int slptimeo,
+ int defrag)
{
struct thread *td;
- struct buf *bp;
- struct buf *nbp;
- int defrag = 0;
- int nqindex;
- static int flushingbufs;
+ char *waitmsg;
+ int error, fl, flags, norunbuf;
+ mtx_assert(&bqclean, MA_OWNED);
+
+ if (defrag) {
+ flags = VFS_BIO_NEED_BUFSPACE;
+ waitmsg = "nbufkv";
+ } else if (bufspace >= hibufspace) {
+ waitmsg = "nbufbs";
+ flags = VFS_BIO_NEED_BUFSPACE;
+ } else {
+ waitmsg = "newbuf";
+ flags = VFS_BIO_NEED_ANY;
+ }
+ atomic_set_int(&needsbuffer, flags);
+ mtx_unlock(&bqclean);
+
+ bd_speedup(); /* heeeelp */
+ if ((gbflags & GB_NOWAIT_BD) != 0)
+ return;
+
td = curthread;
+ rw_wlock(&nblock);
+ while ((needsbuffer & flags) != 0) {
+ if (vp != NULL && vp->v_type != VCHR &&
+ (td->td_pflags & TDP_BUFNEED) == 0) {
+ rw_wunlock(&nblock);
+ /*
+ * getblk() is called with a vnode locked, and
+ * some majority of the dirty buffers may as
+ * well belong to the vnode. Flushing the
+ * buffers there would make a progress that
+ * cannot be achieved by the buf_daemon, that
+ * cannot lock the vnode.
+ */
+ norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) |
+ (td->td_pflags & TDP_NORUNNINGBUF);
+
+ /*
+ * Play bufdaemon. The getnewbuf() function
+ * may be called while the thread owns lock
+ * for another dirty buffer for the same
+ * vnode, which makes it impossible to use
+ * VOP_FSYNC() there, due to the buffer lock
+ * recursion.
+ */
+ td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF;
+ fl = buf_flush(vp, flushbufqtarget);
+ td->td_pflags &= norunbuf;
+ rw_wlock(&nblock);
+ if (fl != 0)
+ continue;
+ if ((needsbuffer & flags) == 0)
+ break;
+ }
+ error = rw_sleep(__DEVOLATILE(void *, &needsbuffer), &nblock,
+ (PRIBIO + 4) | slpflag, waitmsg, slptimeo);
+ if (error != 0)
+ break;
+ }
+ rw_wunlock(&nblock);
+}
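
getnewbuf_bufd_help() lets the allocating thread act as the buf daemon when it already holds a vnode lock, since the real bufdaemon cannot take that lock. The TDP_BUFNEED/TDP_NORUNNINGBUF handling above is a save/set/restore of per-thread flags; a compact sketch of the idiom, assuming the caller may already have TDP_NORUNNINGBUF set:

    int saved;

    saved = td->td_pflags & TDP_NORUNNINGBUF;
    td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF;
    fl = buf_flush(vp, flushbufqtarget);    /* flush on the caller's behalf */
    td->td_pflags &= ~(TDP_BUFNEED | (TDP_NORUNNINGBUF & ~saved));

Clearing only the bits the function itself set keeps a pre-existing TDP_NORUNNINGBUF intact for the caller.
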
+
+static void
+getnewbuf_reuse_bp(struct buf *bp, int qindex)
+{
+
+ CTR6(KTR_BUF, "getnewbuf(%p) vp %p flags %X kvasize %d bufsize %d "
+ "queue %d (recycling)", bp, bp->b_vp, bp->b_flags,
+ bp->b_kvasize, bp->b_bufsize, qindex);
+ mtx_assert(&bqclean, MA_NOTOWNED);
+
/*
- * We can't afford to block since we might be holding a vnode lock,
- * which may prevent system daemons from running. We deal with
- * low-memory situations by proactively returning memory and running
- * async I/O rather then sync I/O.
+ * Note: we no longer distinguish between VMIO and non-VMIO
+ * buffers.
*/
- atomic_add_int(&getnewbufcalls, 1);
- atomic_subtract_int(&getnewbufrestarts, 1);
+ KASSERT((bp->b_flags & B_DELWRI) == 0,
+ ("delwri buffer %p found in queue %d", bp, qindex));
+
+ if (qindex == QUEUE_CLEAN) {
+ if (bp->b_flags & B_VMIO) {
+ bp->b_flags &= ~B_ASYNC;
+ vfs_vmio_release(bp);
+ }
+ if (bp->b_vp != NULL)
+ brelvp(bp);
+ }
+
+ /*
+ * Get the rest of the buffer freed up. b_kva* is still valid
+ * after this operation.
+ */
+
+ if (bp->b_rcred != NOCRED) {
+ crfree(bp->b_rcred);
+ bp->b_rcred = NOCRED;
+ }
+ if (bp->b_wcred != NOCRED) {
+ crfree(bp->b_wcred);
+ bp->b_wcred = NOCRED;
+ }
+ if (!LIST_EMPTY(&bp->b_dep))
+ buf_deallocate(bp);
+ if (bp->b_vflags & BV_BKGRDINPROG)
+ panic("losing buffer 3");
+ KASSERT(bp->b_vp == NULL, ("bp: %p still has vnode %p. qindex: %d",
+ bp, bp->b_vp, qindex));
+ KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0,
+ ("bp: %p still on a buffer list. xflags %X", bp, bp->b_xflags));
+
+ if (bp->b_bufsize)
+ allocbuf(bp, 0);
+
+ bp->b_flags &= B_UNMAPPED | B_KVAALLOC;
+ bp->b_ioflags = 0;
+ bp->b_xflags = 0;
+ KASSERT((bp->b_flags & B_INFREECNT) == 0,
+ ("buf %p still counted as free?", bp));
+ bp->b_vflags = 0;
+ bp->b_vp = NULL;
+ bp->b_blkno = bp->b_lblkno = 0;
+ bp->b_offset = NOOFFSET;
+ bp->b_iodone = 0;
+ bp->b_error = 0;
+ bp->b_resid = 0;
+ bp->b_bcount = 0;
+ bp->b_npages = 0;
+ bp->b_dirtyoff = bp->b_dirtyend = 0;
+ bp->b_bufobj = NULL;
+ bp->b_pin_count = 0;
+ bp->b_fsprivate1 = NULL;
+ bp->b_fsprivate2 = NULL;
+ bp->b_fsprivate3 = NULL;
+
+ LIST_INIT(&bp->b_dep);
+}
+
+static int flushingbufs;
+
+static struct buf *
+getnewbuf_scan(int maxsize, int defrag, int unmapped, int metadata)
+{
+ struct buf *bp, *nbp;
+ int nqindex, qindex, pass;
+
+ KASSERT(!unmapped || !defrag, ("both unmapped and defrag"));
+
+ pass = 1;
restart:
atomic_add_int(&getnewbufrestarts, 1);
@@ -1907,66 +2242,90 @@
* that if we are specially marked process, we are allowed to
* dip into our reserves.
*
- * The scanning sequence is nominally: EMPTY->EMPTYKVA->CLEAN
+ * The scanning sequence is nominally: EMPTY->EMPTYKVA->CLEAN
+ * for the allocation of the mapped buffer. For unmapped, the
+ * easiest is to start with EMPTY outright.
*
* We start with EMPTYKVA. If the list is empty we backup to EMPTY.
* However, there are a number of cases (defragging, reusing, ...)
* where we cannot backup.
*/
- mtx_lock(&bqlock);
- nqindex = QUEUE_EMPTYKVA;
- nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);
-
+ nbp = NULL;
+ mtx_lock(&bqclean);
+ if (!defrag && unmapped) {
+ nqindex = QUEUE_EMPTY;
+ nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
+ }
if (nbp == NULL) {
- /*
- * If no EMPTYKVA buffers and we are either
- * defragging or reusing, locate a CLEAN buffer
- * to free or reuse. If bufspace useage is low
- * skip this step so we can allocate a new buffer.
- */
- if (defrag || bufspace >= lobufspace) {
- nqindex = QUEUE_CLEAN;
- nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
- }
+ nqindex = QUEUE_EMPTYKVA;
+ nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);
+ }
- /*
- * If we could not find or were not allowed to reuse a
- * CLEAN buffer, check to see if it is ok to use an EMPTY
- * buffer. We can only use an EMPTY buffer if allocating
- * its KVA would not otherwise run us out of buffer space.
- */
- if (nbp == NULL && defrag == 0 &&
- bufspace + maxsize < hibufspace) {
- nqindex = QUEUE_EMPTY;
- nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
- }
+ /*
+ * If no EMPTYKVA buffers and we are either defragging or
+ * reusing, locate a CLEAN buffer to free or reuse. If
+ * bufspace useage is low skip this step so we can allocate a
+ * new buffer.
+ */
+ if (nbp == NULL && (defrag || bufspace >= lobufspace)) {
+ nqindex = QUEUE_CLEAN;
+ nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
}
/*
+ * If we could not find or were not allowed to reuse a CLEAN
+ * buffer, check to see if it is ok to use an EMPTY buffer.
+ * We can only use an EMPTY buffer if allocating its KVA would
+ * not otherwise run us out of buffer space. No KVA is needed
+ * for the unmapped allocation.
+ */
+ if (nbp == NULL && defrag == 0 && (bufspace + maxsize < hibufspace ||
+ metadata)) {
+ nqindex = QUEUE_EMPTY;
+ nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]);
+ }
+
+ /*
+ * All available buffers might be clean, retry ignoring the
+ * lobufspace as the last resort.
+ */
+ if (nbp == NULL && !TAILQ_EMPTY(&bufqueues[QUEUE_CLEAN])) {
+ nqindex = QUEUE_CLEAN;
+ nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
+ }
+
+ /*
* Run scan, possibly freeing data and/or kva mappings on the fly
* depending.
*/
-
while ((bp = nbp) != NULL) {
- int qindex = nqindex;
+ qindex = nqindex;
/*
- * Calculate next bp ( we can only use it if we do not block
- * or do other fancy things ).
+ * Calculate next bp (we can only use it if we do not
+ * block or do other fancy things).
*/
if ((nbp = TAILQ_NEXT(bp, b_freelist)) == NULL) {
- switch(qindex) {
+ switch (qindex) {
case QUEUE_EMPTY:
nqindex = QUEUE_EMPTYKVA;
- if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA])))
+ nbp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTYKVA]);
+ if (nbp != NULL)
break;
/* FALLTHROUGH */
case QUEUE_EMPTYKVA:
nqindex = QUEUE_CLEAN;
- if ((nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN])))
+ nbp = TAILQ_FIRST(&bufqueues[QUEUE_CLEAN]);
+ if (nbp != NULL)
break;
/* FALLTHROUGH */
case QUEUE_CLEAN:
+ if (metadata && pass == 1) {
+ pass = 2;
+ nqindex = QUEUE_EMPTY;
+ nbp = TAILQ_FIRST(
+ &bufqueues[QUEUE_EMPTY]);
+ }
/*
* nbp is NULL.
*/
@@ -1990,101 +2349,38 @@
*/
if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
continue;
- if (bp->b_vp) {
- BO_LOCK(bp->b_bufobj);
- if (bp->b_vflags & BV_BKGRDINPROG) {
- BO_UNLOCK(bp->b_bufobj);
- BUF_UNLOCK(bp);
- continue;
- }
- BO_UNLOCK(bp->b_bufobj);
- }
- CTR6(KTR_BUF,
- "getnewbuf(%p) vp %p flags %X kvasize %d bufsize %d "
- "queue %d (recycling)", bp, bp->b_vp, bp->b_flags,
- bp->b_kvasize, bp->b_bufsize, qindex);
-
/*
- * Sanity Checks
+ * BKGRDINPROG can only be set with the buf and bufobj
+ * locks both held. We tolerate a race to clear it here.
*/
- KASSERT(bp->b_qindex == qindex, ("getnewbuf: inconsistant queue %d bp %p", qindex, bp));
+ if (bp->b_vflags & BV_BKGRDINPROG) {
+ BUF_UNLOCK(bp);
+ continue;
+ }
/*
- * Note: we no longer distinguish between VMIO and non-VMIO
- * buffers.
+ * Requeue the background write buffer with error.
*/
+ if ((bp->b_vflags & BV_BKGRDERR) != 0) {
+ bremfreel(bp);
+ mtx_unlock(&bqclean);
+ bqrelse(bp);
+ continue;
+ }
- KASSERT((bp->b_flags & B_DELWRI) == 0, ("delwri buffer %p found in queue %d", bp, qindex));
+ KASSERT(bp->b_qindex == qindex,
+ ("getnewbuf: inconsistent queue %d bp %p", qindex, bp));
- if (bp->b_bufobj != NULL)
- BO_LOCK(bp->b_bufobj);
bremfreel(bp);
- if (bp->b_bufobj != NULL)
- BO_UNLOCK(bp->b_bufobj);
- mtx_unlock(&bqlock);
-
- if (qindex == QUEUE_CLEAN) {
- if (bp->b_flags & B_VMIO) {
- bp->b_flags &= ~B_ASYNC;
- vfs_vmio_release(bp);
- }
- if (bp->b_vp)
- brelvp(bp);
- }
-
+ mtx_unlock(&bqclean);
/*
* NOTE: nbp is now entirely invalid. We can only restart
* the scan from this point on.
- *
- * Get the rest of the buffer freed up. b_kva* is still
- * valid after this operation.
*/
- if (bp->b_rcred != NOCRED) {
- crfree(bp->b_rcred);
- bp->b_rcred = NOCRED;
- }
- if (bp->b_wcred != NOCRED) {
- crfree(bp->b_wcred);
- bp->b_wcred = NOCRED;
- }
- if (!LIST_EMPTY(&bp->b_dep))
- buf_deallocate(bp);
- if (bp->b_vflags & BV_BKGRDINPROG)
- panic("losing buffer 3");
- KASSERT(bp->b_vp == NULL,
- ("bp: %p still has vnode %p. qindex: %d",
- bp, bp->b_vp, qindex));
- KASSERT((bp->b_xflags & (BX_VNCLEAN|BX_VNDIRTY)) == 0,
- ("bp: %p still on a buffer list. xflags %X",
- bp, bp->b_xflags));
+ getnewbuf_reuse_bp(bp, qindex);
+ mtx_assert(&bqclean, MA_NOTOWNED);
- if (bp->b_bufsize)
- allocbuf(bp, 0);
-
- bp->b_flags = 0;
- bp->b_ioflags = 0;
- bp->b_xflags = 0;
- KASSERT((bp->b_vflags & BV_INFREECNT) == 0,
- ("buf %p still counted as free?", bp));
- bp->b_vflags = 0;
- bp->b_vp = NULL;
- bp->b_blkno = bp->b_lblkno = 0;
- bp->b_offset = NOOFFSET;
- bp->b_iodone = 0;
- bp->b_error = 0;
- bp->b_resid = 0;
- bp->b_bcount = 0;
- bp->b_npages = 0;
- bp->b_dirtyoff = bp->b_dirtyend = 0;
- bp->b_bufobj = NULL;
- bp->b_pin_count = 0;
- bp->b_fsprivate1 = NULL;
- bp->b_fsprivate2 = NULL;
- bp->b_fsprivate3 = NULL;
-
- LIST_INIT(&bp->b_dep);
-
/*
* If we are defragging then free the buffer.
*/
@@ -2107,6 +2403,9 @@
goto restart;
}
+ if (metadata)
+ break;
+
/*
* If we are overcomitted then recover the buffer and its
* KVM space. This occurs in rare situations when multiple
@@ -2124,72 +2423,79 @@
flushingbufs = 0;
break;
}
+ return (bp);
+}
+/*
+ * getnewbuf:
+ *
+ * Find and initialize a new buffer header, freeing up existing buffers
+ * in the bufqueues as necessary. The new buffer is returned locked.
+ *
+ * Important: B_INVAL is not set. If the caller wishes to throw the
+ * buffer away, the caller must set B_INVAL prior to calling brelse().
+ *
+ * We block if:
+ * We have insufficient buffer headers
+ * We have insufficient buffer space
+ * buffer_arena is too fragmented ( space reservation fails )
+ * If we have to flush dirty buffers ( but we try to avoid this )
+ */
+static struct buf *
+getnewbuf(struct vnode *vp, int slpflag, int slptimeo, int size, int maxsize,
+ int gbflags)
+{
+ struct buf *bp;
+ int defrag, metadata;
+
+ KASSERT((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC,
+ ("GB_KVAALLOC only makes sense with GB_UNMAPPED"));
+ if (!unmapped_buf_allowed)
+ gbflags &= ~(GB_UNMAPPED | GB_KVAALLOC);
+
+ defrag = 0;
+ if (vp == NULL || (vp->v_vflag & (VV_MD | VV_SYSTEM)) != 0 ||
+ vp->v_type == VCHR)
+ metadata = 1;
+ else
+ metadata = 0;
/*
+ * We can't afford to block since we might be holding a vnode lock,
+ * which may prevent system daemons from running. We deal with
+ * low-memory situations by proactively returning memory and running
+ * async I/O rather then sync I/O.
+ */
+ atomic_add_int(&getnewbufcalls, 1);
+ atomic_subtract_int(&getnewbufrestarts, 1);
+restart:
+ bp = getnewbuf_scan(maxsize, defrag, (gbflags & (GB_UNMAPPED |
+ GB_KVAALLOC)) == GB_UNMAPPED, metadata);
+ if (bp != NULL)
+ defrag = 0;
+
+ /*
* If we exhausted our list, sleep as appropriate. We may have to
* wakeup various daemons and write out some dirty buffers.
*
* Generally we are sleeping due to insufficient buffer space.
*/
-
if (bp == NULL) {
- int flags, norunbuf;
- char *waitmsg;
- int fl;
+ mtx_assert(&bqclean, MA_OWNED);
+ getnewbuf_bufd_help(vp, gbflags, slpflag, slptimeo, defrag);
+ mtx_assert(&bqclean, MA_NOTOWNED);
+ } else if ((gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == GB_UNMAPPED) {
+ mtx_assert(&bqclean, MA_NOTOWNED);
- if (defrag) {
- flags = VFS_BIO_NEED_BUFSPACE;
- waitmsg = "nbufkv";
- } else if (bufspace >= hibufspace) {
- waitmsg = "nbufbs";
- flags = VFS_BIO_NEED_BUFSPACE;
- } else {
- waitmsg = "newbuf";
- flags = VFS_BIO_NEED_ANY;
- }
- mtx_lock(&nblock);
- needsbuffer |= flags;
- mtx_unlock(&nblock);
- mtx_unlock(&bqlock);
+ bfreekva(bp);
+ bp->b_flags |= B_UNMAPPED;
+ bp->b_kvabase = bp->b_data = unmapped_buf;
+ bp->b_kvasize = maxsize;
+ atomic_add_long(&bufspace, bp->b_kvasize);
+ atomic_add_long(&unmapped_bufspace, bp->b_kvasize);
+ atomic_add_int(&bufreusecnt, 1);
+ } else {
+ mtx_assert(&bqclean, MA_NOTOWNED);
- bd_speedup(); /* heeeelp */
- if (gbflags & GB_NOWAIT_BD)
- return (NULL);
-
- mtx_lock(&nblock);
- while (needsbuffer & flags) {
- if (vp != NULL && (td->td_pflags & TDP_BUFNEED) == 0) {
- mtx_unlock(&nblock);
- /*
- * getblk() is called with a vnode
- * locked, and some majority of the
- * dirty buffers may as well belong to
- * the vnode. Flushing the buffers
- * there would make a progress that
- * cannot be achieved by the
- * buf_daemon, that cannot lock the
- * vnode.
- */
- norunbuf = ~(TDP_BUFNEED | TDP_NORUNNINGBUF) |
- (td->td_pflags & TDP_NORUNNINGBUF);
- /* play bufdaemon */
- td->td_pflags |= TDP_BUFNEED | TDP_NORUNNINGBUF;
- fl = buf_do_flush(vp);
- td->td_pflags &= norunbuf;
- mtx_lock(&nblock);
- if (fl != 0)
- continue;
- if ((needsbuffer & flags) == 0)
- break;
- }
- if (msleep(&needsbuffer, &nblock,
- (PRIBIO + 4) | slpflag, waitmsg, slptimeo)) {
- mtx_unlock(&nblock);
- return (NULL);
- }
- }
- mtx_unlock(&nblock);
- } else {
/*
* We finally have a valid bp. We aren't quite out of the
* woods, we still have to reserve kva space. In order
@@ -2198,39 +2504,47 @@
*/
maxsize = (maxsize + BKVAMASK) & ~BKVAMASK;
- if (maxsize != bp->b_kvasize) {
- vm_offset_t addr = 0;
- int rv;
-
- bfreekva(bp);
-
- vm_map_lock(buffer_map);
- if (vm_map_findspace(buffer_map,
- vm_map_min(buffer_map), maxsize, &addr)) {
- /*
- * Buffer map is too fragmented.
- * We must defragment the map.
- */
- atomic_add_int(&bufdefragcnt, 1);
- vm_map_unlock(buffer_map);
+ if (maxsize != bp->b_kvasize || (bp->b_flags & (B_UNMAPPED |
+ B_KVAALLOC)) == B_UNMAPPED) {
+ if (allocbufkva(bp, maxsize, gbflags)) {
defrag = 1;
bp->b_flags |= B_INVAL;
brelse(bp);
goto restart;
}
- rv = vm_map_insert(buffer_map, NULL, 0, addr,
- addr + maxsize, VM_PROT_ALL, VM_PROT_ALL,
- MAP_NOFAULT);
- KASSERT(rv == KERN_SUCCESS,
- ("vm_map_insert(buffer_map) rv %d", rv));
- vm_map_unlock(buffer_map);
- bp->b_kvabase = (caddr_t)addr;
- bp->b_kvasize = maxsize;
- atomic_add_long(&bufspace, bp->b_kvasize);
atomic_add_int(&bufreusecnt, 1);
+ } else if ((bp->b_flags & B_KVAALLOC) != 0 &&
+ (gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == 0) {
+ /*
+ * If the reused buffer has KVA allocated,
+ * reassign b_kvaalloc to b_kvabase.
+ */
+ bp->b_kvabase = bp->b_kvaalloc;
+ bp->b_flags &= ~B_KVAALLOC;
+ atomic_subtract_long(&unmapped_bufspace,
+ bp->b_kvasize);
+ atomic_add_int(&bufreusecnt, 1);
+ } else if ((bp->b_flags & (B_UNMAPPED | B_KVAALLOC)) == 0 &&
+ (gbflags & (GB_UNMAPPED | GB_KVAALLOC)) == (GB_UNMAPPED |
+ GB_KVAALLOC)) {
+ /*
+ * The case of reused buffer already have KVA
+ * mapped, but the request is for unmapped
+ * buffer with KVA allocated.
+ */
+ bp->b_kvaalloc = bp->b_kvabase;
+ bp->b_data = bp->b_kvabase = unmapped_buf;
+ bp->b_flags |= B_UNMAPPED | B_KVAALLOC;
+ atomic_add_long(&unmapped_bufspace,
+ bp->b_kvasize);
+ atomic_add_int(&bufreusecnt, 1);
}
- bp->b_saveaddr = bp->b_kvabase;
- bp->b_data = bp->b_saveaddr;
+ if ((gbflags & GB_UNMAPPED) == 0) {
+ bp->b_saveaddr = bp->b_kvabase;
+ bp->b_data = bp->b_saveaddr;
+ bp->b_flags &= ~B_UNMAPPED;
+ BUF_CHECK_MAPPED(bp);
+ }
}
return (bp);
}
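
For a recycled buffer, getnewbuf() now has to reconcile the buffer's current KVA state (B_UNMAPPED, B_KVAALLOC) with what the caller asked for (GB_UNMAPPED, GB_KVAALLOC). Fully unmapped buffers point b_data at unmapped_buf, so a stray dereference faults instead of scribbling on another buffer's pages. A rough paraphrase of the cases handled above, not the committed code:

    /*
     * request                          action
     * GB_UNMAPPED without GB_KVAALLOC  bfreekva(); b_data = unmapped_buf
     * mapped, KVA missing/wrong size   allocbufkva(); defragment and retry on failure
     * mapped, B_KVAALLOC reservation   promote: b_kvabase = b_kvaalloc
     * GB_UNMAPPED|GB_KVAALLOC, mapped  demote: b_kvaalloc = b_kvabase, b_data = unmapped_buf
     */
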
@@ -2251,17 +2565,11 @@
SYSINIT(bufdaemon, SI_SUB_KTHREAD_BUF, SI_ORDER_FIRST, kproc_start, &buf_kp);
static int
-buf_do_flush(struct vnode *vp)
+buf_flush(struct vnode *vp, int target)
{
int flushed;
- flushed = flushbufqueues(vp, QUEUE_DIRTY, 0);
- /* The list empty check here is slightly racy */
- if (!TAILQ_EMPTY(&bufqueues[QUEUE_DIRTY_GIANT])) {
- mtx_lock(&Giant);
- flushed += flushbufqueues(vp, QUEUE_DIRTY_GIANT, 0);
- mtx_unlock(&Giant);
- }
+ flushed = flushbufqueues(vp, target, 0);
if (flushed == 0) {
/*
* Could not find any buffers without rollback
@@ -2268,13 +2576,9 @@
* dependencies, so just write the first one
* in the hopes of eventually making progress.
*/
- flushbufqueues(vp, QUEUE_DIRTY, 1);
- if (!TAILQ_EMPTY(
- &bufqueues[QUEUE_DIRTY_GIANT])) {
- mtx_lock(&Giant);
- flushbufqueues(vp, QUEUE_DIRTY_GIANT, 1);
- mtx_unlock(&Giant);
- }
+ if (vp != NULL && target > 2)
+ target /= 2;
+ flushbufqueues(vp, target, 1);
}
return (flushed);
}
@@ -2282,7 +2586,7 @@
static void
buf_daemon()
{
- int lodirtysave;
+ int lodirty;
/*
* This process needs to be suspended prior to shutdown sync.
@@ -2300,23 +2604,21 @@
mtx_unlock(&bdlock);
kproc_suspend_check(bufdaemonproc);
- lodirtysave = lodirtybuffers;
+ lodirty = lodirtybuffers;
if (bd_speedupreq) {
- lodirtybuffers = numdirtybuffers / 2;
+ lodirty = numdirtybuffers / 2;
bd_speedupreq = 0;
}
/*
* Do the flush. Limit the amount of in-transit I/O we
* allow to build up, otherwise we would completely saturate
- * the I/O system. Wakeup any waiting processes before we
- * normally would so they can run in parallel with our drain.
+ * the I/O system.
*/
- while (numdirtybuffers > lodirtybuffers) {
- if (buf_do_flush(NULL) == 0)
+ while (numdirtybuffers > lodirty) {
+ if (buf_flush(NULL, numdirtybuffers - lodirty) == 0)
break;
- kern_yield(PRI_UNCHANGED);
+ kern_yield(PRI_USER);
}
- lodirtybuffers = lodirtysave;
/*
* Only clear bd_request if we have reached our low water
@@ -2325,8 +2627,8 @@
* built up, within reason.
*
* If we were unable to hit our low water mark and couldn't
- * find any flushable buffers, we sleep half a second.
- * Otherwise we loop immediately.
+ * find any flushable buffers, we sleep for a short period
+ * to avoid endless loops on unlockable buffers.
*/
mtx_lock(&bdlock);
if (numdirtybuffers <= lodirtybuffers) {
@@ -2336,6 +2638,14 @@
* The sleep is just so the suspend code works.
*/
bd_request = 0;
+ /*
+ * Do an extra wakeup in case dirty threshold
+ * changed via sysctl and the explicit transition
+ * out of shortfall was missed.
+ */
+ bdirtywakeup();
+ if (runningbufspace <= lorunningspace)
+ runningwakeup();
msleep(&bd_request, &bdlock, PVM, "psleep", hz);
} else {
/*
@@ -2360,7 +2670,7 @@
0, "Number of buffers flushed with dependecies that require rollbacks");
static int
-flushbufqueues(struct vnode *lvp, int queue, int flushdeps)
+flushbufqueues(struct vnode *lvp, int target, int flushdeps)
{
struct buf *sentinel;
struct vnode *vp;
@@ -2368,61 +2678,63 @@
struct buf *bp;
int hasdeps;
int flushed;
- int target;
+ int queue;
+ int error;
+ bool unlock;
- if (lvp == NULL) {
- target = numdirtybuffers - lodirtybuffers;
- if (flushdeps && target > 2)
- target /= 2;
- } else
- target = flushbufqtarget;
flushed = 0;
+ queue = QUEUE_DIRTY;
bp = NULL;
sentinel = malloc(sizeof(struct buf), M_TEMP, M_WAITOK | M_ZERO);
sentinel->b_qindex = QUEUE_SENTINEL;
- mtx_lock(&bqlock);
+ mtx_lock(&bqdirty);
TAILQ_INSERT_HEAD(&bufqueues[queue], sentinel, b_freelist);
+ mtx_unlock(&bqdirty);
while (flushed != target) {
+ maybe_yield();
+ mtx_lock(&bqdirty);
bp = TAILQ_NEXT(sentinel, b_freelist);
if (bp != NULL) {
TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist);
TAILQ_INSERT_AFTER(&bufqueues[queue], bp, sentinel,
b_freelist);
- } else
+ } else {
+ mtx_unlock(&bqdirty);
break;
+ }
/*
* Skip sentinels inserted by other invocations of the
* flushbufqueues(), taking care to not reorder them.
- */
- if (bp->b_qindex == QUEUE_SENTINEL)
- continue;
- /*
+ *
* Only flush the buffers that belong to the
* vnode locked by the curthread.
*/
- if (lvp != NULL && bp->b_vp != lvp)
+ if (bp->b_qindex == QUEUE_SENTINEL || (lvp != NULL &&
+ bp->b_vp != lvp)) {
+ mtx_unlock(&bqdirty);
+ continue;
+ }
+ error = BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL);
+ mtx_unlock(&bqdirty);
+ if (error != 0)
continue;
- if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
- continue;
if (bp->b_pin_count > 0) {
BUF_UNLOCK(bp);
continue;
}
- BO_LOCK(bp->b_bufobj);
+ /*
+ * BKGRDINPROG can only be set with the buf and bufobj
+ * locks both held. We tolerate a race to clear it here.
+ */
if ((bp->b_vflags & BV_BKGRDINPROG) != 0 ||
(bp->b_flags & B_DELWRI) == 0) {
- BO_UNLOCK(bp->b_bufobj);
BUF_UNLOCK(bp);
continue;
}
- BO_UNLOCK(bp->b_bufobj);
if (bp->b_flags & B_INVAL) {
- bremfreel(bp);
- mtx_unlock(&bqlock);
+ bremfreef(bp);
brelse(bp);
flushed++;
- numdirtywakeup((lodirtybuffers + hidirtybuffers) / 2);
- mtx_lock(&bqlock);
continue;
}
@@ -2449,19 +2761,28 @@
BUF_UNLOCK(bp);
continue;
}
- if (vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT | LK_CANRECURSE) == 0) {
- mtx_unlock(&bqlock);
+ if (lvp == NULL) {
+ unlock = true;
+ error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT);
+ } else {
+ ASSERT_VOP_LOCKED(vp, "getbuf");
+ unlock = false;
+ error = VOP_ISLOCKED(vp) == LK_EXCLUSIVE ? 0 :
+ vn_lock(vp, LK_TRYUPGRADE);
+ }
+ if (error == 0) {
CTR3(KTR_BUF, "flushbufqueue(%p) vp %p flags %X",
bp, bp->b_vp, bp->b_flags);
- if (curproc == bufdaemonproc)
+ if (curproc == bufdaemonproc) {
vfs_bio_awrite(bp);
- else {
+ } else {
bremfree(bp);
bwrite(bp);
- notbufdflashes++;
+ notbufdflushes++;
}
vn_finished_write(mp);
- VOP_UNLOCK(vp, 0);
+ if (unlock)
+ VOP_UNLOCK(vp, 0);
flushwithdeps += hasdeps;
flushed++;
@@ -2469,17 +2790,17 @@
* Sleeping on runningbufspace while holding
* vnode lock leads to deadlock.
*/
- if (curproc == bufdaemonproc)
+ if (curproc == bufdaemonproc &&
+ runningbufspace > hirunningspace)
waitrunningbufspace();
- numdirtywakeup((lodirtybuffers + hidirtybuffers) / 2);
- mtx_lock(&bqlock);
continue;
}
vn_finished_write(mp);
BUF_UNLOCK(bp);
}
+ mtx_lock(&bqdirty);
TAILQ_REMOVE(&bufqueues[queue], sentinel, b_freelist);
- mtx_unlock(&bqlock);
+ mtx_unlock(&bqdirty);
free(sentinel, M_TEMP);
return (flushed);
}
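
flushbufqueues() now takes an explicit flush target and holds the new bqdirty lock only while stepping the list. The per-call sentinel buffer is what makes that safe: it marks the scan position, so the lock can be dropped while the current buffer is locked, checked and written, and concurrent scans simply skip entries with b_qindex == QUEUE_SENTINEL. A stripped-down sketch of the idiom with generic names (queue and qlock stand in for bufqueues[QUEUE_DIRTY] and bqdirty):

    mtx_lock(&qlock);
    TAILQ_INSERT_HEAD(&queue, sentinel, b_freelist);
    mtx_unlock(&qlock);
    for (;;) {
            mtx_lock(&qlock);
            bp = TAILQ_NEXT(sentinel, b_freelist);
            if (bp == NULL) {
                    mtx_unlock(&qlock);
                    break;
            }
            /* Remember our position by parking the sentinel after bp. */
            TAILQ_REMOVE(&queue, sentinel, b_freelist);
            TAILQ_INSERT_AFTER(&queue, bp, sentinel, b_freelist);
            mtx_unlock(&qlock);
            /* ... try-lock bp and flush it without holding qlock ... */
    }
    mtx_lock(&qlock);
    TAILQ_REMOVE(&queue, sentinel, b_freelist);
    mtx_unlock(&qlock);
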
@@ -2492,9 +2813,9 @@
{
struct buf *bp;
- BO_LOCK(bo);
+ BO_RLOCK(bo);
bp = gbincore(bo, blkno);
- BO_UNLOCK(bo);
+ BO_RUNLOCK(bo);
return (bp);
}
@@ -2527,7 +2848,7 @@
size = vp->v_mount->mnt_stat.f_iosize;
off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize;
- VM_OBJECT_LOCK(obj);
+ VM_OBJECT_RLOCK(obj);
for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
m = vm_page_lookup(obj, OFF_TO_IDX(off + toff));
if (!m)
@@ -2539,11 +2860,11 @@
(vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0)
goto notinmem;
}
- VM_OBJECT_UNLOCK(obj);
+ VM_OBJECT_RUNLOCK(obj);
return 1;
notinmem:
- VM_OBJECT_UNLOCK(obj);
+ VM_OBJECT_RUNLOCK(obj);
return (0);
}
@@ -2573,7 +2894,7 @@
KASSERT(bp->b_offset != NOOFFSET,
("vfs_clean_pages_dirty_buf: no buffer offset"));
- VM_OBJECT_LOCK(bp->b_bufobj->bo_object);
+ VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
vfs_drain_busy_pages(bp);
vfs_setdirty_locked_object(bp);
for (i = 0; i < bp->b_npages; i++) {
@@ -2586,7 +2907,7 @@
/* vm_page_clear_dirty(m, foff & PAGE_MASK, eoff - foff); */
foff = noff;
}
- VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object);
+ VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
}
static void
@@ -2596,7 +2917,7 @@
int i;
object = bp->b_bufobj->bo_object;
- VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
+ VM_OBJECT_ASSERT_WLOCKED(object);
/*
* We qualify the scan for modified pages on whether the
@@ -2653,6 +2974,90 @@
}
/*
+ * Allocate the KVA mapping for an existing buffer. It handles the
+ * cases of both B_UNMAPPED buffer, and buffer with the preallocated
+ * KVA which is not mapped (B_KVAALLOC).
+ */
+static void
+bp_unmapped_get_kva(struct buf *bp, daddr_t blkno, int size, int gbflags)
+{
+ struct buf *scratch_bp;
+ int bsize, maxsize, need_mapping, need_kva;
+ off_t offset;
+
+ need_mapping = (bp->b_flags & B_UNMAPPED) != 0 &&
+ (gbflags & GB_UNMAPPED) == 0;
+ need_kva = (bp->b_flags & (B_KVAALLOC | B_UNMAPPED)) == B_UNMAPPED &&
+ (gbflags & GB_KVAALLOC) != 0;
+ if (!need_mapping && !need_kva)
+ return;
+
+ BUF_CHECK_UNMAPPED(bp);
+
+ if (need_mapping && (bp->b_flags & B_KVAALLOC) != 0) {
+ /*
+ * Buffer is not mapped, but the KVA was already
+ * reserved at the time of the instantiation. Use the
+ * allocated space.
+ */
+ bp->b_flags &= ~B_KVAALLOC;
+ KASSERT(bp->b_kvaalloc != 0, ("kvaalloc == 0"));
+ bp->b_kvabase = bp->b_kvaalloc;
+ atomic_subtract_long(&unmapped_bufspace, bp->b_kvasize);
+ goto has_addr;
+ }
+
+ /*
+ * Calculate the amount of the address space we would reserve
+ * if the buffer was mapped.
+ */
+ bsize = vn_isdisk(bp->b_vp, NULL) ? DEV_BSIZE : bp->b_bufobj->bo_bsize;
+ offset = blkno * bsize;
+ maxsize = size + (offset & PAGE_MASK);
+ maxsize = imax(maxsize, bsize);
+
+mapping_loop:
+ if (allocbufkva(bp, maxsize, gbflags)) {
+ /*
+ * Request defragmentation. getnewbuf() returns us the
+ * allocated space by the scratch buffer KVA.
+ */
+ scratch_bp = getnewbuf(bp->b_vp, 0, 0, size, maxsize, gbflags |
+ (GB_UNMAPPED | GB_KVAALLOC));
+ if (scratch_bp == NULL) {
+ if ((gbflags & GB_NOWAIT_BD) != 0) {
+ /*
+ * XXXKIB: defragmentation cannot
+ * succeed, not sure what else to do.
+ */
+ panic("GB_NOWAIT_BD and B_UNMAPPED %p", bp);
+ }
+ atomic_add_int(&mappingrestarts, 1);
+ goto mapping_loop;
+ }
+ KASSERT((scratch_bp->b_flags & B_KVAALLOC) != 0,
+ ("scratch bp !B_KVAALLOC %p", scratch_bp));
+ setbufkva(bp, (vm_offset_t)scratch_bp->b_kvaalloc,
+ scratch_bp->b_kvasize, gbflags);
+
+ /* Get rid of the scratch buffer. */
+ scratch_bp->b_kvasize = 0;
+ scratch_bp->b_flags |= B_INVAL;
+ scratch_bp->b_flags &= ~(B_UNMAPPED | B_KVAALLOC);
+ brelse(scratch_bp);
+ }
+ if (!need_mapping)
+ return;
+
+has_addr:
+ bp->b_saveaddr = bp->b_kvabase;
+ bp->b_data = bp->b_saveaddr; /* b_offset is handled by bpmap_qenter */
+ bp->b_flags &= ~B_UNMAPPED;
+ BUF_CHECK_MAPPED(bp);
+ bpmap_qenter(bp);
+}
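
bp_unmapped_get_kva() is what lets a buffer created unmapped be mapped on demand: if KVA was pre-reserved (B_KVAALLOC) it is simply wired in with bpmap_qenter(); otherwise the function borrows the address space of a scratch buffer obtained with GB_UNMAPPED | GB_KVAALLOC. From a filesystem's point of view the choice is made at getblk() time; a hedged usage sketch, assuming the standard getblk() argument order:

    /* Conventional, mapped buffer: b_data is immediately usable. */
    bp = getblk(vp, lbn, bsize, 0, 0, 0);

    /*
     * Unmapped buffer: no KVA consumed, data is reached via b_pages[].
     * Pass GB_UNMAPPED only when the caller can work from the pages.
     */
    bp = getblk(vp, lbn, bsize, 0, 0, GB_UNMAPPED);
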
+
+/*
* getblk:
*
* Get a block given a specified block and offset into a file/device.
@@ -2684,7 +3089,7 @@
* to clear B_INVAL. If the caller does this without issuing an I/O,
* the caller should set B_CACHE ( as an optimization ), else the caller
* should issue the I/O and biodone() will set B_CACHE if the I/O was
- * a write attempt or if it was a successfull read. If the caller
+ * a write attempt or if it was a successful read. If the caller
* intends to issue a READ, the caller must clear B_INVAL and BIO_ERROR
* prior to issuing the READ. biodone() will *not* clear B_INVAL.
*/
@@ -2694,38 +3099,28 @@
{
struct buf *bp;
struct bufobj *bo;
- int error;
+ int bsize, error, maxsize, vmio;
+ off_t offset;
CTR3(KTR_BUF, "getblk(%p, %ld, %d)", vp, (long)blkno, size);
+ KASSERT((flags & (GB_UNMAPPED | GB_KVAALLOC)) != GB_KVAALLOC,
+ ("GB_KVAALLOC only makes sense with GB_UNMAPPED"));
ASSERT_VOP_LOCKED(vp, "getblk");
- if (size > MAXBSIZE)
- panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE);
+ if (size > MAXBCACHEBUF)
+ panic("getblk: size(%d) > MAXBCACHEBUF(%d)\n", size,
+ MAXBCACHEBUF);
+ if (!unmapped_buf_allowed)
+ flags &= ~(GB_UNMAPPED | GB_KVAALLOC);
bo = &vp->v_bufobj;
loop:
- /*
- * Block if we are low on buffers. Certain processes are allowed
- * to completely exhaust the buffer cache.
- *
- * If this check ever becomes a bottleneck it may be better to
- * move it into the else, when gbincore() fails. At the moment
- * it isn't a problem.
- */
- if (numfreebuffers == 0) {
- if (TD_IS_IDLETHREAD(curthread))
- return NULL;
- mtx_lock(&nblock);
- needsbuffer |= VFS_BIO_NEED_ANY;
- mtx_unlock(&nblock);
- }
-
- BO_LOCK(bo);
+ BO_RLOCK(bo);
bp = gbincore(bo, blkno);
if (bp != NULL) {
int lockflags;
/*
- * Buffer is in-core. If the buffer is not busy, it must
- * be on a queue.
+ * Buffer is in-core. If the buffer is not busy nor managed,
+ * it must be on a queue.
*/
lockflags = LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK;
@@ -2733,7 +3128,7 @@
lockflags |= LK_NOWAIT;
error = BUF_TIMELOCK(bp, lockflags,
- BO_MTX(bo), "getblk", slpflag, slptimeo);
+ BO_LOCKPTR(bo), "getblk", slpflag, slptimeo);
/*
* If we slept and got the lock we have to restart in case
@@ -2758,9 +3153,10 @@
bp->b_flags &= ~B_CACHE;
else if ((bp->b_flags & (B_VMIO | B_INVAL)) == 0)
bp->b_flags |= B_CACHE;
- BO_LOCK(bo);
- bremfree(bp);
- BO_UNLOCK(bo);
+ if (bp->b_flags & B_MANAGED)
+ MPASS(bp->b_qindex == QUEUE_NONE);
+ else
+ bremfree(bp);
/*
* check for size inconsistencies for non-VMIO case.
@@ -2798,12 +3194,18 @@
}
/*
- * If the size is inconsistant in the VMIO case, we can resize
+ * Handle the case of unmapped buffer which should
+ * become mapped, or the buffer for which KVA
+ * reservation is requested.
+ */
+ bp_unmapped_get_kva(bp, blkno, size, flags);
+
+ /*
+ * If the size is inconsistent in the VMIO case, we can resize
* the buffer. This might lead to B_CACHE getting set or
* cleared. If the size has not changed, B_CACHE remains
* unchanged from its previous state.
*/
-
if (bp->b_bcount != size)
allocbuf(bp, size);
@@ -2844,15 +3246,12 @@
}
bp->b_flags &= ~B_DONE;
} else {
- int bsize, maxsize, vmio;
- off_t offset;
-
/*
* Buffer is not in-core, create new buffer. The buffer
* returned by getnewbuf() is locked. Note that the returned
* buffer is also considered valid (not marked B_INVAL).
*/
- BO_UNLOCK(bo);
+ BO_RUNLOCK(bo);
/*
* If the user does not want us to create the buffer, bail out
* here.
@@ -2859,10 +3258,19 @@
*/
if (flags & GB_NOCREAT)
return NULL;
+ if (numfreebuffers == 0 && TD_IS_IDLETHREAD(curthread))
+ return NULL;
+
bsize = vn_isdisk(vp, NULL) ? DEV_BSIZE : bo->bo_bsize;
offset = blkno * bsize;
vmio = vp->v_object != NULL;
- maxsize = vmio ? size + (offset & PAGE_MASK) : size;
+ if (vmio) {
+ maxsize = size + (offset & PAGE_MASK);
+ } else {
+ maxsize = size;
+ /* Do not allow non-VMIO notmapped buffers. */
+ flags &= ~GB_UNMAPPED;
+ }
maxsize = imax(maxsize, bsize);
bp = getnewbuf(vp, slpflag, slptimeo, size, maxsize, flags);
@@ -2918,6 +3326,7 @@
KASSERT(bp->b_bufobj->bo_object == NULL,
("ARGH! has b_bufobj->bo_object %p %p\n",
bp, bp->b_bufobj->bo_object));
+ BUF_CHECK_MAPPED(bp);
}
allocbuf(bp, size);
@@ -2961,7 +3370,7 @@
* resize a buffer up or down.
*
* Note that this code is tricky, and has many complications to resolve
- * deadlock or inconsistant data situations. Tread lightly!!!
+ * deadlock or inconsistent data situations. Tread lightly!!!
* There are B_CACHE and B_DELWRI interactions that must be dealt with by
* the caller. Calling this code willy nilly can result in the loss of data.
*
@@ -3093,11 +3502,15 @@
if (desiredpages < bp->b_npages) {
vm_page_t m;
- pmap_qremove((vm_offset_t)trunc_page(
- (vm_offset_t)bp->b_data) +
- (desiredpages << PAGE_SHIFT),
- (bp->b_npages - desiredpages));
- VM_OBJECT_LOCK(bp->b_bufobj->bo_object);
+ if ((bp->b_flags & B_UNMAPPED) == 0) {
+ BUF_CHECK_MAPPED(bp);
+ pmap_qremove((vm_offset_t)trunc_page(
+ (vm_offset_t)bp->b_data) +
+ (desiredpages << PAGE_SHIFT),
+ (bp->b_npages - desiredpages));
+ } else
+ BUF_CHECK_UNMAPPED(bp);
+ VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
for (i = desiredpages; i < bp->b_npages; i++) {
/*
* the page is not freed here -- it
@@ -3107,7 +3520,7 @@
m = bp->b_pages[i];
KASSERT(m != bogus_page,
("allocbuf: bogus page found"));
- while (vm_page_sleep_if_busy(m, TRUE,
+ while (vm_page_sleep_if_busy(m,
"biodep"))
continue;
@@ -3116,7 +3529,7 @@
vm_page_unwire(m, 0);
vm_page_unlock(m);
}
- VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object);
+ VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
bp->b_npages = desiredpages;
}
} else if (size > bp->b_bcount) {
@@ -3137,7 +3550,7 @@
obj = bp->b_bufobj->bo_object;
- VM_OBJECT_LOCK(obj);
+ VM_OBJECT_WLOCK(obj);
while (bp->b_npages < desiredpages) {
vm_page_t m;
@@ -3146,15 +3559,15 @@
* here could interfere with paging I/O, no
* matter which process we are.
*
- * We can only test VPO_BUSY here. Blocking on
- * m->busy might lead to a deadlock:
- * vm_fault->getpages->cluster_read->allocbuf
- * Thus, we specify VM_ALLOC_IGN_SBUSY.
+ * Only exclusive busy can be tested here.
+ * Blocking on shared busy might lead to
+ * deadlocks once allocbuf() is called after
+ * pages are vfs_busy_pages().
*/
m = vm_page_grab(obj, OFF_TO_IDX(bp->b_offset) +
bp->b_npages, VM_ALLOC_NOBUSY |
VM_ALLOC_SYSTEM | VM_ALLOC_WIRED |
- VM_ALLOC_RETRY | VM_ALLOC_IGN_SBUSY |
+ VM_ALLOC_IGN_SBUSY |
VM_ALLOC_COUNT(desiredpages - bp->b_npages));
if (m->valid == 0)
bp->b_flags &= ~B_CACHE;
@@ -3199,24 +3612,15 @@
toff += tinc;
tinc = PAGE_SIZE;
}
- VM_OBJECT_UNLOCK(obj);
+ VM_OBJECT_WUNLOCK(obj);
/*
- * Step 3, fixup the KVM pmap. Remember that
- * bp->b_data is relative to bp->b_offset, but
- * bp->b_offset may be offset into the first page.
+ * Step 3, fixup the KVM pmap.
*/
-
- bp->b_data = (caddr_t)
- trunc_page((vm_offset_t)bp->b_data);
- pmap_qenter(
- (vm_offset_t)bp->b_data,
- bp->b_pages,
- bp->b_npages
- );
-
- bp->b_data = (caddr_t)((vm_offset_t)bp->b_data |
- (vm_offset_t)(bp->b_offset & PAGE_MASK));
+ if ((bp->b_flags & B_UNMAPPED) == 0)
+ bpmap_qenter(bp);
+ else
+ BUF_CHECK_UNMAPPED(bp);
}
}
if (newbsize < bp->b_bufsize)
@@ -3226,28 +3630,39 @@
return 1;
}
+extern int inflight_transient_maps;
+
void
biodone(struct bio *bp)
{
struct mtx *mtxp;
void (*done)(struct bio *);
+ vm_offset_t start, end;
- mtxp = mtx_pool_find(mtxpool_sleep, bp);
- mtx_lock(mtxp);
- bp->bio_flags |= BIO_DONE;
+ if ((bp->bio_flags & BIO_TRANSIENT_MAPPING) != 0) {
+ bp->bio_flags &= ~BIO_TRANSIENT_MAPPING;
+ bp->bio_flags |= BIO_UNMAPPED;
+ start = trunc_page((vm_offset_t)bp->bio_data);
+ end = round_page((vm_offset_t)bp->bio_data + bp->bio_length);
+ pmap_qremove(start, OFF_TO_IDX(end - start));
+ vmem_free(transient_arena, start, end - start);
+ atomic_add_int(&inflight_transient_maps, -1);
+ }
done = bp->bio_done;
- if (done == NULL)
+ if (done == NULL) {
+ mtxp = mtx_pool_find(mtxpool_sleep, bp);
+ mtx_lock(mtxp);
+ bp->bio_flags |= BIO_DONE;
wakeup(bp);
- mtx_unlock(mtxp);
- if (done != NULL)
+ mtx_unlock(mtxp);
+ } else {
+ bp->bio_flags |= BIO_DONE;
done(bp);
+ }
}
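
biodone() now tears down transient KVA mappings itself: the bio's data address is rounded out to page boundaries, the pages are removed from the pmap, and the span is returned to transient_arena. The producer side is not part of this diff (it lives in GEOM), but it has to perform roughly the mirror image of the release above, something like:

    vm_offset_t start;
    vm_size_t size;

    size = round_page(bp->bio_ma_offset + bp->bio_length);
    if (vmem_alloc(transient_arena, size, M_BESTFIT | M_NOWAIT, &start) == 0) {
            pmap_qenter(start, bp->bio_ma, OFF_TO_IDX(size));
            bp->bio_data = (caddr_t)start + bp->bio_ma_offset;
            bp->bio_flags |= BIO_TRANSIENT_MAPPING;
            atomic_add_int(&inflight_transient_maps, 1);
    }

The exact GEOM code differs; this only sketches the pairing with the teardown shown here.
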
/*
* Wait for a BIO to finish.
- *
- * XXX: resort to a timeout for now. The optimal locking (if any) for this
- * case is not yet clear.
*/
int
biowait(struct bio *bp, const char *wchan)
@@ -3257,7 +3672,7 @@
mtxp = mtx_pool_find(mtxpool_sleep, bp);
mtx_lock(mtxp);
while ((bp->bio_flags & BIO_DONE) == 0)
- msleep(bp, mtxp, PRIBIO, wchan, hz / 10);
+ msleep(bp, mtxp, PRIBIO, wchan, 0);
mtx_unlock(mtxp);
if (bp->bio_error != 0)
return (bp->bio_error);
@@ -3366,7 +3781,7 @@
bip->bio_offset = bp->b_iooffset;
bip->bio_length = bp->b_bcount;
bip->bio_bcount = bp->b_bcount; /* XXX: remove */
- bip->bio_data = bp->b_data;
+ bdata2bio(bp, bip);
bip->bio_done = bufdonebio;
bip->bio_caller2 = bp;
bip->bio_dev = dev;
@@ -3385,11 +3800,11 @@
* assuming B_INVAL is clear.
*
* For the VMIO case, we set B_CACHE if the op was a read and no
- * read error occured, or if the op was a write. B_CACHE is never
+ * read error occurred, or if the op was a write. B_CACHE is never
* set if the buffer is invalid or otherwise uncacheable.
*
* biodone does not mess with B_INVAL, allowing the I/O routine or the
- * initiator to leave B_INVAL set to brelse the buffer out of existance
+ * initiator to leave B_INVAL set to brelse the buffer out of existence
* in the biodone routine.
*/
void
@@ -3455,7 +3870,7 @@
/*
* Set B_CACHE if the op was a normal read and no error
- * occured. B_CACHE is set for writes in the b*write()
+ * occurred. B_CACHE is set for writes in the b*write()
* routines.
*/
iosize = bp->b_bcount - bp->b_resid;
@@ -3465,7 +3880,7 @@
bp->b_flags |= B_CACHE;
}
bogus = 0;
- VM_OBJECT_LOCK(obj);
+ VM_OBJECT_WLOCK(obj);
for (i = 0; i < bp->b_npages; i++) {
int bogusflag = 0;
int resid;
@@ -3501,16 +3916,18 @@
vfs_page_set_valid(bp, foff, m);
}
- vm_page_io_finish(m);
+ vm_page_sunbusy(m);
vm_object_pip_subtract(obj, 1);
foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
iosize -= resid;
}
vm_object_pip_wakeupn(obj, 0);
- VM_OBJECT_UNLOCK(obj);
- if (bogus)
+ VM_OBJECT_WUNLOCK(obj);
+ if (bogus && (bp->b_flags & B_UNMAPPED) == 0) {
+ BUF_CHECK_MAPPED(bp);
pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
bp->b_pages, bp->b_npages);
+ }
}
/*
@@ -3531,7 +3948,7 @@
/*
* This routine is called in lieu of iodone in the case of
* incomplete I/O. This keeps the busy status for pages
- * consistant.
+ * consistent.
*/
void
vfs_unbusy_pages(struct buf *bp)
@@ -3545,7 +3962,7 @@
return;
obj = bp->b_bufobj->bo_object;
- VM_OBJECT_LOCK(obj);
+ VM_OBJECT_WLOCK(obj);
for (i = 0; i < bp->b_npages; i++) {
m = bp->b_pages[i];
if (m == bogus_page) {
@@ -3553,14 +3970,18 @@
if (!m)
panic("vfs_unbusy_pages: page missing\n");
bp->b_pages[i] = m;
- pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
- bp->b_pages, bp->b_npages);
+ if ((bp->b_flags & B_UNMAPPED) == 0) {
+ BUF_CHECK_MAPPED(bp);
+ pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
+ bp->b_pages, bp->b_npages);
+ } else
+ BUF_CHECK_UNMAPPED(bp);
}
vm_object_pip_subtract(obj, 1);
- vm_page_io_finish(m);
+ vm_page_sunbusy(m);
}
vm_object_pip_wakeupn(obj, 0);
- VM_OBJECT_UNLOCK(obj);
+ VM_OBJECT_WUNLOCK(obj);
}
/*
@@ -3591,7 +4012,7 @@
* entire page.
*/
if (eoff > off)
- vm_page_set_valid(m, off & PAGE_MASK, eoff - off);
+ vm_page_set_valid_range(m, off & PAGE_MASK, eoff - off);
}
/*
@@ -3607,7 +4028,7 @@
/*
* Start and end offsets in buffer. eoff - soff may not cross a
- * page boundry or cross the end of the buffer. The end of the
+ * page boundary or cross the end of the buffer. The end of the
* buffer, in this case, is our file EOF, not the allocation size
* of the buffer.
*/
@@ -3630,28 +4051,32 @@
}
/*
- * Ensure that all buffer pages are not busied by VPO_BUSY flag. If
- * any page is busy, drain the flag.
+ * Ensure that all buffer pages are not exclusive busied. If any page is
+ * exclusive busy, drain it.
*/
-static void
+void
vfs_drain_busy_pages(struct buf *bp)
{
vm_page_t m;
int i, last_busied;
- VM_OBJECT_LOCK_ASSERT(bp->b_bufobj->bo_object, MA_OWNED);
+ VM_OBJECT_ASSERT_WLOCKED(bp->b_bufobj->bo_object);
last_busied = 0;
for (i = 0; i < bp->b_npages; i++) {
m = bp->b_pages[i];
- if ((m->oflags & VPO_BUSY) != 0) {
+ if (vm_page_xbusied(m)) {
for (; last_busied < i; last_busied++)
- vm_page_busy(bp->b_pages[last_busied]);
- while ((m->oflags & VPO_BUSY) != 0)
- vm_page_sleep(m, "vbpage");
+ vm_page_sbusy(bp->b_pages[last_busied]);
+ while (vm_page_xbusied(m)) {
+ vm_page_lock(m);
+ VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
+ vm_page_busy_sleep(m, "vbpage", true);
+ VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
+ }
}
}
for (i = 0; i < last_busied; i++)
- vm_page_wakeup(bp->b_pages[i]);
+ vm_page_sunbusy(bp->b_pages[i]);
}
/*
@@ -3658,12 +4083,12 @@
* This routine is called before a device strategy routine.
* It is used to tell the VM system that paging I/O is in
* progress, and treat the pages associated with the buffer
- * almost as being VPO_BUSY. Also the object paging_in_progress
+ * almost as being exclusive busy. Also the object paging_in_progress
* flag is handled to make sure that the object doesn't become
- * inconsistant.
+ * inconsistent.
*
* Since I/O has not been initiated yet, certain buffer flags
- * such as BIO_ERROR or B_INVAL may be in an inconsistant state
+ * such as BIO_ERROR or B_INVAL may be in an inconsistent state
* and should be ignored.
*/
void
@@ -3681,7 +4106,7 @@
foff = bp->b_offset;
KASSERT(bp->b_offset != NOOFFSET,
("vfs_busy_pages: no buffer offset"));
- VM_OBJECT_LOCK(obj);
+ VM_OBJECT_WLOCK(obj);
vfs_drain_busy_pages(bp);
if (bp->b_bufsize != 0)
vfs_setdirty_locked_object(bp);
@@ -3691,7 +4116,7 @@
if ((bp->b_flags & B_CLUSTER) == 0) {
vm_object_pip_add(obj, 1);
- vm_page_io_start(m);
+ vm_page_sbusy(m);
}
/*
* When readying a buffer for a read ( i.e
@@ -3718,10 +4143,12 @@
}
foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
}
- VM_OBJECT_UNLOCK(obj);
- if (bogus)
+ VM_OBJECT_WUNLOCK(obj);
+ if (bogus && (bp->b_flags & B_UNMAPPED) == 0) {
+ BUF_CHECK_MAPPED(bp);
pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
bp->b_pages, bp->b_npages);
+ }
}
/*
@@ -3749,17 +4176,17 @@
base += (bp->b_offset & PAGE_MASK);
n = PAGE_SIZE - (base & PAGE_MASK);
- VM_OBJECT_LOCK(bp->b_bufobj->bo_object);
+ VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) {
m = bp->b_pages[i];
if (n > size)
n = size;
- vm_page_set_valid(m, base & PAGE_MASK, n);
+ vm_page_set_valid_range(m, base & PAGE_MASK, n);
base += n;
size -= n;
n = PAGE_SIZE;
}
- VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object);
+ VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
}
/*
@@ -3785,13 +4212,13 @@
}
bp->b_flags &= ~B_INVAL;
bp->b_ioflags &= ~BIO_ERROR;
- VM_OBJECT_LOCK(bp->b_bufobj->bo_object);
+ VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) &&
(bp->b_offset & PAGE_MASK) == 0) {
if (bp->b_pages[0] == bogus_page)
goto unlock;
mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1;
- VM_OBJECT_LOCK_ASSERT(bp->b_pages[0]->object, MA_OWNED);
+ VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[0]->object);
if ((bp->b_pages[0]->valid & mask) == mask)
goto unlock;
if ((bp->b_pages[0]->valid & mask) == 0) {
@@ -3811,7 +4238,7 @@
continue;
j = sa / DEV_BSIZE;
mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j;
- VM_OBJECT_LOCK_ASSERT(bp->b_pages[i]->object, MA_OWNED);
+ VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[i]->object);
if ((bp->b_pages[i]->valid & mask) == mask)
continue;
if ((bp->b_pages[i]->valid & mask) == 0)
@@ -3827,10 +4254,34 @@
bp->b_pages[i]->valid |= mask;
}
unlock:
- VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object);
+ VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
bp->b_resid = 0;
}
+void
+vfs_bio_bzero_buf(struct buf *bp, int base, int size)
+{
+ vm_page_t m;
+ int i, n;
+
+ if ((bp->b_flags & B_UNMAPPED) == 0) {
+ BUF_CHECK_MAPPED(bp);
+ bzero(bp->b_data + base, size);
+ } else {
+ BUF_CHECK_UNMAPPED(bp);
+ n = PAGE_SIZE - (base & PAGE_MASK);
+ for (i = base / PAGE_SIZE; size > 0 && i < bp->b_npages; ++i) {
+ m = bp->b_pages[i];
+ if (n > size)
+ n = size;
+ pmap_zero_page_area(m, base & PAGE_MASK, n);
+ base += n;
+ size -= n;
+ n = PAGE_SIZE;
+ }
+ }
+}
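
vfs_bio_bzero_buf() gives callers a way to clear a range of a buffer without assuming it is mapped; for B_UNMAPPED buffers it walks b_pages[] and uses pmap_zero_page_area() instead of touching b_data. Code that used to zero through the mapping needs the helper once unmapped buffers are possible, roughly:

    /* mapped-only idiom: */
    bzero(bp->b_data + boff, len);

    /* works for mapped and unmapped buffers alike: */
    vfs_bio_bzero_buf(bp, boff, len);
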
+
/*
* vm_hold_load_pages and vm_hold_free_pages get pages into
* a buffers address space. The pages are anonymous and are
@@ -3843,6 +4294,8 @@
vm_page_t p;
int index;
+ BUF_CHECK_MAPPED(bp);
+
to = round_page(to);
from = round_page(from);
index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
@@ -3874,6 +4327,8 @@
vm_page_t p;
int index, newnpages;
+ BUF_CHECK_MAPPED(bp);
+
from = round_page((vm_offset_t)bp->b_data + newbsize);
newnpages = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT;
if (bp->b_npages > newnpages)
@@ -3881,7 +4336,7 @@
for (index = newnpages; index < bp->b_npages; index++) {
p = bp->b_pages[index];
bp->b_pages[index] = NULL;
- if (p->busy != 0)
+ if (vm_page_sbusied(p))
printf("vm_hold_free_pages: blkno: %jd, lblkno: %jd\n",
(intmax_t)bp->b_blkno, (intmax_t)bp->b_lblkno);
p->wire_count--;
@@ -3904,7 +4359,7 @@
* check the return value.
*/
int
-vmapbuf(struct buf *bp)
+vmapbuf(struct buf *bp, int mapbuf)
{
caddr_t kva;
vm_prot_t prot;
@@ -3919,12 +4374,19 @@
(vm_offset_t)bp->b_data, bp->b_bufsize, prot, bp->b_pages,
btoc(MAXPHYS))) < 0)
return (-1);
- pmap_qenter((vm_offset_t)bp->b_saveaddr, bp->b_pages, pidx);
-
- kva = bp->b_saveaddr;
bp->b_npages = pidx;
- bp->b_saveaddr = bp->b_data;
- bp->b_data = kva + (((vm_offset_t) bp->b_data) & PAGE_MASK);
+ if (mapbuf || !unmapped_buf_allowed) {
+ pmap_qenter((vm_offset_t)bp->b_saveaddr, bp->b_pages, pidx);
+ kva = bp->b_saveaddr;
+ bp->b_saveaddr = bp->b_data;
+ bp->b_data = kva + (((vm_offset_t)bp->b_data) & PAGE_MASK);
+ bp->b_flags &= ~B_UNMAPPED;
+ } else {
+ bp->b_flags |= B_UNMAPPED;
+ bp->b_offset = ((vm_offset_t)bp->b_data) & PAGE_MASK;
+ bp->b_saveaddr = bp->b_data;
+ bp->b_data = unmapped_buf;
+ }
return(0);
}
@@ -3938,7 +4400,10 @@
int npages;
npages = bp->b_npages;
- pmap_qremove(trunc_page((vm_offset_t)bp->b_data), npages);
+ if (bp->b_flags & B_UNMAPPED)
+ bp->b_flags &= ~B_UNMAPPED;
+ else
+ pmap_qremove(trunc_page((vm_offset_t)bp->b_data), npages);
vm_page_unhold_pages(bp->b_pages, npages);
bp->b_data = bp->b_saveaddr;
@@ -3994,7 +4459,7 @@
{
KASSERT(bo != NULL, ("NULL bo in bufobj_wref"));
- ASSERT_BO_LOCKED(bo);
+ ASSERT_BO_WLOCKED(bo);
bo->bo_numoutput++;
}
@@ -4028,11 +4493,11 @@
int error;
KASSERT(bo != NULL, ("NULL bo in bufobj_wwait"));
- ASSERT_BO_LOCKED(bo);
+ ASSERT_BO_WLOCKED(bo);
error = 0;
while (bo->bo_numoutput) {
bo->bo_flag |= BO_WWAIT;
- error = msleep(&bo->bo_numoutput, BO_MTX(bo),
+ error = msleep(&bo->bo_numoutput, BO_LOCKPTR(bo),
slpflag | (PRIBIO + 1), "bo_wwait", timeo);
if (error)
break;
@@ -4075,6 +4540,30 @@
mtx_unlock(mtxp);
}
+/*
+ * Set bio_data or bio_ma for struct bio from the struct buf.
+ */
+void
+bdata2bio(struct buf *bp, struct bio *bip)
+{
+
+ if ((bp->b_flags & B_UNMAPPED) != 0) {
+ KASSERT(unmapped_buf_allowed, ("unmapped"));
+ bip->bio_ma = bp->b_pages;
+ bip->bio_ma_n = bp->b_npages;
+ bip->bio_data = unmapped_buf;
+ bip->bio_ma_offset = (vm_offset_t)bp->b_offset & PAGE_MASK;
+ bip->bio_flags |= BIO_UNMAPPED;
+ KASSERT(round_page(bip->bio_ma_offset + bip->bio_length) /
+ PAGE_SIZE == bp->b_npages,
+ ("Buffer %p too short: %d %lld %d", bp, bip->bio_ma_offset,
+ (long long)bip->bio_length, bip->bio_ma_n));
+ } else {
+ bip->bio_data = bp->b_data;
+ bip->bio_ma = NULL;
+ }
+}
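
bdata2bio() is the bridge between struct buf and struct bio for unmapped I/O: instead of a kernel virtual address, the bio carries the page array and the offset of the data within the first page. A consumer that needs byte off of such a bio locates it with simple page arithmetic, for example:

    vm_page_t m;
    vm_offset_t poff;

    m = bip->bio_ma[(bip->bio_ma_offset + off) / PAGE_SIZE];
    poff = (bip->bio_ma_offset + off) % PAGE_SIZE;
    /*
     * The byte lives at offset poff within page m; reach it through
     * sf_buf/pmap primitives or a transient mapping, never via bio_data.
     */
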
+
#include "opt_ddb.h"
#ifdef DDB
#include <ddb/ddb.h>
@@ -4166,7 +4655,7 @@
for (i = 0; i < nbuf; i++) {
bp = &buf[i];
- if ((bp->b_vflags & BV_INFREECNT) != 0)
+ if ((bp->b_flags & B_INFREECNT) != 0)
nfree++;
else
used++;
Modified: trunk/sys/kern/vfs_cache.c
===================================================================
--- trunk/sys/kern/vfs_cache.c 2018-05-25 21:07:58 UTC (rev 9951)
+++ trunk/sys/kern/vfs_cache.c 2018-05-26 14:24:52 UTC (rev 9952)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1989, 1993, 1995
* The Regents of the University of California. All rights reserved.
@@ -33,7 +34,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/vfs_cache.c 324611 2017-10-13 21:58:44Z jhb $");
#include "opt_kdtrace.h"
#include "opt_ktrace.h"
@@ -62,28 +63,28 @@
#include <vm/uma.h>
SDT_PROVIDER_DECLARE(vfs);
-SDT_PROBE_DEFINE3(vfs, namecache, enter, done, done, "struct vnode *", "char *",
+SDT_PROBE_DEFINE3(vfs, namecache, enter, done, "struct vnode *", "char *",
"struct vnode *");
-SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, done, "struct vnode *",
+SDT_PROBE_DEFINE2(vfs, namecache, enter_negative, done, "struct vnode *",
"char *");
-SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, entry, "struct vnode *");
-SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, hit, "struct vnode *",
- "struct char *", "struct vnode *");
-SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, miss, "struct vnode *");
-SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, return, "int",
- "struct vnode *", "struct char *");
-SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, hit, "struct vnode *", "char *",
+SDT_PROBE_DEFINE1(vfs, namecache, fullpath, entry, "struct vnode *");
+SDT_PROBE_DEFINE3(vfs, namecache, fullpath, hit, "struct vnode *",
+ "char *", "struct vnode *");
+SDT_PROBE_DEFINE1(vfs, namecache, fullpath, miss, "struct vnode *");
+SDT_PROBE_DEFINE3(vfs, namecache, fullpath, return, "int",
+ "struct vnode *", "char *");
+SDT_PROBE_DEFINE3(vfs, namecache, lookup, hit, "struct vnode *", "char *",
"struct vnode *");
-SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit_negative, hit-negative,
+SDT_PROBE_DEFINE2(vfs, namecache, lookup, hit__negative,
"struct vnode *", "char *");
-SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, miss, "struct vnode *",
+SDT_PROBE_DEFINE2(vfs, namecache, lookup, miss, "struct vnode *",
"char *");
-SDT_PROBE_DEFINE1(vfs, namecache, purge, done, done, "struct vnode *");
-SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, done, "struct vnode *");
-SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, done, "struct mount *");
-SDT_PROBE_DEFINE3(vfs, namecache, zap, done, done, "struct vnode *", "char *",
+SDT_PROBE_DEFINE1(vfs, namecache, purge, done, "struct vnode *");
+SDT_PROBE_DEFINE1(vfs, namecache, purge_negative, done, "struct vnode *");
+SDT_PROBE_DEFINE1(vfs, namecache, purgevfs, done, "struct mount *");
+SDT_PROBE_DEFINE3(vfs, namecache, zap, done, "struct vnode *", "char *",
"struct vnode *");
-SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, done, "struct vnode *",
+SDT_PROBE_DEFINE2(vfs, namecache, zap_negative, done, "struct vnode *",
"char *");
/*
@@ -150,7 +151,7 @@
*/
/*
- * Structures associated with name cacheing.
+ * Structures associated with name caching.
*/
#define NCHHASH(hash) \
(&nchashtbl[(hash) & nchash])
@@ -265,7 +266,7 @@
"VFS namecache enabled");
/* Export size information to userland */
-SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, 0,
+SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, SYSCTL_NULL_INT_PTR,
sizeof(struct namecache), "sizeof(struct namecache)");
/*
@@ -289,7 +290,7 @@
"Number of cache misses");
static u_long nummisszap; STATNODE(CTLFLAG_RD, nummisszap, &nummisszap,
"Number of cache misses we do not want to cache");
-static u_long numposzaps; STATNODE(CTLFLAG_RD, numposzaps, &numposzaps,
+static u_long numposzaps; STATNODE(CTLFLAG_RD, numposzaps, &numposzaps,
"Number of cache hits (positive) we do not want to cache");
static u_long numposhits; STATNODE(CTLFLAG_RD, numposhits, &numposhits,
"Number of cache hits (positive)");
@@ -304,8 +305,6 @@
&nchstats, sizeof(nchstats), "LU",
"VFS cache effectiveness statistics");
-
-
static void cache_zap(struct namecache *ncp);
static int vn_vptocnp_locked(struct vnode **vp, struct ucred *cred, char *buf,
u_int *buflen);
@@ -324,29 +323,31 @@
static int
sysctl_debug_hashstat_rawnchash(SYSCTL_HANDLER_ARGS)
{
- int error;
struct nchashhead *ncpp;
struct namecache *ncp;
- int n_nchash;
- int count;
+ int i, error, n_nchash, *cntbuf;
+retry:
n_nchash = nchash + 1; /* nchash is max index, not count */
if (!req->oldptr)
return SYSCTL_OUT(req, 0, n_nchash * sizeof(int));
-
- /* Scan hash tables for applicable entries */
- for (ncpp = nchashtbl; n_nchash > 0; n_nchash--, ncpp++) {
- CACHE_RLOCK();
- count = 0;
- LIST_FOREACH(ncp, ncpp, nc_hash) {
- count++;
- }
+ cntbuf = malloc(n_nchash * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
+ CACHE_RLOCK();
+ if (n_nchash != nchash + 1) {
CACHE_RUNLOCK();
- error = SYSCTL_OUT(req, &count, sizeof(count));
- if (error)
- return (error);
+ free(cntbuf, M_TEMP);
+ goto retry;
}
- return (0);
+ /* Scan hash tables counting entries */
+ for (ncpp = nchashtbl, i = 0; i < n_nchash; ncpp++, i++)
+ LIST_FOREACH(ncp, ncpp, nc_hash)
+ cntbuf[i]++;
+ CACHE_RUNLOCK();
+ for (error = 0, i = 0; i < n_nchash; i++)
+ if ((error = SYSCTL_OUT(req, &cntbuf[i], sizeof(int))) != 0)
+ break;
+ free(cntbuf, M_TEMP);
+ return (error);
}
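
The rawnchash handler used to take and drop the cache lock once per hash chain and call SYSCTL_OUT() with the lock held. The rewrite allocates the whole count array with M_WAITOK first (which may sleep, so it cannot be done under the cache rwlock), takes the lock once, and re-checks the table size in case nchash was resized while the handler slept; only after the lock is dropped is the data copied out. The general shape of that allocate-then-revalidate idiom:

    retry:
            n = nchash + 1;
            cntbuf = malloc(n * sizeof(int), M_TEMP, M_ZERO | M_WAITOK);
            CACHE_RLOCK();
            if (n != nchash + 1) {          /* table resized while we slept */
                    CACHE_RUNLOCK();
                    free(cntbuf, M_TEMP);
                    goto retry;
            }
            /* count entries into cntbuf[] under the lock */
            CACHE_RUNLOCK();
            /* SYSCTL_OUT() the counts, then free cntbuf */
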
SYSCTL_PROC(_debug_hashstat, OID_AUTO, rawnchash, CTLTYPE_INT|CTLFLAG_RD|
CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_rawnchash, "S,int",
@@ -382,7 +383,7 @@
maxlength = count;
}
n_nchash = nchash + 1;
- pct = (used * 100 * 100) / n_nchash;
+ pct = (used * 100) / (n_nchash / 100);
error = SYSCTL_OUT(req, &n_nchash, sizeof(n_nchash));
if (error)
return (error);
@@ -399,7 +400,7 @@
}
SYSCTL_PROC(_debug_hashstat, OID_AUTO, nchash, CTLTYPE_INT|CTLFLAG_RD|
CTLFLAG_MPSAFE, 0, 0, sysctl_debug_hashstat_nchash, "I",
- "nchash chain lengths");
+ "nchash statistics (number of total/used buckets, maximum chain length, usage percentage)");
#endif
/*
@@ -409,22 +410,19 @@
* pointer to a vnode or if it is just a negative cache entry.
*/
static void
-cache_zap(ncp)
- struct namecache *ncp;
+cache_zap(struct namecache *ncp)
{
struct vnode *vp;
rw_assert(&cache_lock, RA_WLOCKED);
CTR2(KTR_VFS, "cache_zap(%p) vp %p", ncp, ncp->nc_vp);
-#ifdef KDTRACE_HOOKS
if (ncp->nc_vp != NULL) {
- SDT_PROBE(vfs, namecache, zap, done, ncp->nc_dvp,
- nc_get_name(ncp), ncp->nc_vp, 0, 0);
+ SDT_PROBE3(vfs, namecache, zap, done, ncp->nc_dvp,
+ nc_get_name(ncp), ncp->nc_vp);
} else {
- SDT_PROBE(vfs, namecache, zap_negative, done, ncp->nc_dvp,
- nc_get_name(ncp), 0, 0, 0);
+ SDT_PROBE2(vfs, namecache, zap_negative, done, ncp->nc_dvp,
+ nc_get_name(ncp));
}
-#endif
vp = NULL;
LIST_REMOVE(ncp, nc_hash);
if (ncp->nc_flag & NCF_ISDOTDOT) {
@@ -447,7 +445,7 @@
}
numcache--;
cache_free(ncp);
- if (vp)
+ if (vp != NULL)
vdrop(vp);
}
@@ -458,7 +456,7 @@
* cnp pointing to the name of the entry being sought. If the lookup
* succeeds, the vnode is returned in *vpp, and a status of -1 is
* returned. If the lookup determines that the name does not exist
- * (negative cacheing), a status of ENOENT is returned. If the lookup
+ * (negative caching), a status of ENOENT is returned. If the lookup
* fails, a status of zero is returned. If the directory vnode is
* recycled out from under us due to a forced unmount, a status of
* ENOENT is returned.
@@ -469,12 +467,8 @@
*/
int
-cache_lookup_times(dvp, vpp, cnp, tsp, ticksp)
- struct vnode *dvp;
- struct vnode **vpp;
- struct componentname *cnp;
- struct timespec *tsp;
- int *ticksp;
+cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp,
+ struct timespec *tsp, int *ticksp)
{
struct namecache *ncp;
uint32_t hash;
@@ -497,8 +491,7 @@
CTR2(KTR_VFS, "cache_lookup(%p, %s) found via .",
dvp, cnp->cn_nameptr);
dothits++;
- SDT_PROBE(vfs, namecache, lookup, hit, dvp, ".",
- *vpp, 0, 0);
+ SDT_PROBE3(vfs, namecache, lookup, hit, dvp, ".", *vpp);
if (tsp != NULL)
timespecclear(tsp);
if (ticksp != NULL)
@@ -508,8 +501,8 @@
if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') {
dotdothits++;
if (dvp->v_cache_dd == NULL) {
- SDT_PROBE(vfs, namecache, lookup, miss, dvp,
- "..", NULL, 0, 0);
+ SDT_PROBE3(vfs, namecache, lookup, miss, dvp,
+ "..", NULL);
goto unlock;
}
if ((cnp->cn_flags & MAKEENTRY) == 0) {
@@ -531,8 +524,8 @@
goto negative_success;
CTR3(KTR_VFS, "cache_lookup(%p, %s) found %p via ..",
dvp, cnp->cn_nameptr, *vpp);
- SDT_PROBE(vfs, namecache, lookup, hit, dvp, "..",
- *vpp, 0, 0);
+ SDT_PROBE3(vfs, namecache, lookup, hit, dvp, "..",
+ *vpp);
cache_out_ts(ncp, tsp, ticksp);
if ((ncp->nc_flag & (NCF_ISDOTDOT | NCF_DTS)) ==
NCF_DTS && tsp != NULL)
@@ -553,8 +546,8 @@
/* We failed to find an entry */
if (ncp == NULL) {
- SDT_PROBE(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
- NULL, 0, 0);
+ SDT_PROBE3(vfs, namecache, lookup, miss, dvp, cnp->cn_nameptr,
+ NULL);
if ((cnp->cn_flags & MAKEENTRY) == 0) {
nummisszap++;
} else {
@@ -582,8 +575,8 @@
*vpp = ncp->nc_vp;
CTR4(KTR_VFS, "cache_lookup(%p, %s) found %p via ncp %p",
dvp, cnp->cn_nameptr, *vpp, ncp);
- SDT_PROBE(vfs, namecache, lookup, hit, dvp, nc_get_name(ncp),
- *vpp, 0, 0);
+ SDT_PROBE3(vfs, namecache, lookup, hit, dvp, nc_get_name(ncp),
+ *vpp);
cache_out_ts(ncp, tsp, ticksp);
goto success;
}
@@ -614,8 +607,8 @@
nchstats.ncs_neghits++;
if (ncp->nc_flag & NCF_WHITE)
cnp->cn_flags |= ISWHITEOUT;
- SDT_PROBE(vfs, namecache, lookup, hit_negative, dvp, nc_get_name(ncp),
- 0, 0, 0);
+ SDT_PROBE2(vfs, namecache, lookup, hit__negative, dvp,
+ nc_get_name(ncp));
cache_out_ts(ncp, tsp, ticksp);
CACHE_WUNLOCK();
return (ENOENT);
@@ -703,12 +696,8 @@
* Add an entry to the cache.
*/
void
-cache_enter_time(dvp, vp, cnp, tsp, dtsp)
- struct vnode *dvp;
- struct vnode *vp;
- struct componentname *cnp;
- struct timespec *tsp;
- struct timespec *dtsp;
+cache_enter_time(struct vnode *dvp, struct vnode *vp, struct componentname *cnp,
+ struct timespec *tsp, struct timespec *dtsp)
{
struct namecache *ncp, *n2;
struct namecache_ts *n3;
@@ -749,23 +738,26 @@
ncp->nc_flag & NCF_ISDOTDOT) {
KASSERT(ncp->nc_dvp == dvp,
("wrong isdotdot parent"));
- if (ncp->nc_vp != NULL)
+ if (ncp->nc_vp != NULL) {
TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst,
ncp, nc_dst);
- else
+ } else {
TAILQ_REMOVE(&ncneg, ncp, nc_dst);
- if (vp != NULL)
+ numneg--;
+ }
+ if (vp != NULL) {
TAILQ_INSERT_HEAD(&vp->v_cache_dst,
ncp, nc_dst);
- else
+ } else {
TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
+ numneg++;
+ }
ncp->nc_vp = vp;
CACHE_WUNLOCK();
return;
}
dvp->v_cache_dd = NULL;
- SDT_PROBE(vfs, namecache, enter, done, dvp, "..", vp,
- 0, 0);
+ SDT_PROBE3(vfs, namecache, enter, done, dvp, "..", vp);
CACHE_WUNLOCK();
flag = NCF_ISDOTDOT;
}
@@ -835,9 +827,9 @@
* has populated v_cache_dd pointer already.
*/
if (dvp->v_cache_dd != NULL) {
- CACHE_WUNLOCK();
- cache_free(ncp);
- return;
+ CACHE_WUNLOCK();
+ cache_free(ncp);
+ return;
}
KASSERT(vp == NULL || vp->v_type == VDIR,
("wrong vnode type %p", vp));
@@ -845,7 +837,7 @@
}
numcache++;
- if (!vp) {
+ if (vp == NULL) {
numneg++;
if (cnp->cn_flags & ISWHITEOUT)
ncp->nc_flag |= NCF_WHITE;
@@ -883,17 +875,19 @@
* "negative" cache queue, otherwise, we place it into the
* destination vnode's cache entries queue.
*/
- if (vp) {
+ if (vp != NULL) {
TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst);
- SDT_PROBE(vfs, namecache, enter, done, dvp, nc_get_name(ncp),
- vp, 0, 0);
+ SDT_PROBE3(vfs, namecache, enter, done, dvp, nc_get_name(ncp),
+ vp);
} else {
TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst);
- SDT_PROBE(vfs, namecache, enter_negative, done, dvp,
- nc_get_name(ncp), 0, 0, 0);
+ SDT_PROBE2(vfs, namecache, enter_negative, done, dvp,
+ nc_get_name(ncp));
}
if (numneg * ncnegfactor > numcache) {
ncp = TAILQ_FIRST(&ncneg);
+ KASSERT(ncp->nc_vp == NULL, ("ncp %p vp %p on ncneg",
+ ncp, ncp->nc_vp));
zap = 1;
}
if (hold)
@@ -914,32 +908,73 @@
cache_zone_small = uma_zcreate("S VFS Cache",
sizeof(struct namecache) + CACHE_PATH_CUTOFF + 1,
- NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
+ NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache),
+ UMA_ZONE_ZINIT);
cache_zone_small_ts = uma_zcreate("STS VFS Cache",
sizeof(struct namecache_ts) + CACHE_PATH_CUTOFF + 1,
- NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
+ NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache_ts),
+ UMA_ZONE_ZINIT);
cache_zone_large = uma_zcreate("L VFS Cache",
sizeof(struct namecache) + NAME_MAX + 1,
- NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
+ NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache),
+ UMA_ZONE_ZINIT);
cache_zone_large_ts = uma_zcreate("LTS VFS Cache",
sizeof(struct namecache_ts) + NAME_MAX + 1,
- NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
+ NULL, NULL, NULL, NULL, UMA_ALIGNOF(struct namecache_ts),
+ UMA_ZONE_ZINIT);
nchashtbl = hashinit(desiredvnodes * 2, M_VFSCACHE, &nchash);
}
SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nchinit, NULL);
+void
+cache_changesize(int newmaxvnodes)
+{
+ struct nchashhead *new_nchashtbl, *old_nchashtbl;
+ u_long new_nchash, old_nchash;
+ struct namecache *ncp;
+ uint32_t hash;
+ int i;
+ new_nchashtbl = hashinit(newmaxvnodes * 2, M_VFSCACHE, &new_nchash);
+ /* If same hash table size, nothing to do */
+ if (nchash == new_nchash) {
+ free(new_nchashtbl, M_VFSCACHE);
+ return;
+ }
+ /*
+ * Move everything from the old hash table to the new table.
+ * None of the namecache entries in the table can be removed
+ * because to do so, they have to be removed from the hash table.
+ */
+ CACHE_WLOCK();
+ old_nchashtbl = nchashtbl;
+ old_nchash = nchash;
+ nchashtbl = new_nchashtbl;
+ nchash = new_nchash;
+ for (i = 0; i <= old_nchash; i++) {
+ while ((ncp = LIST_FIRST(&old_nchashtbl[i])) != NULL) {
+ hash = fnv_32_buf(nc_get_name(ncp), ncp->nc_nlen,
+ FNV1_32_INIT);
+ hash = fnv_32_buf(&ncp->nc_dvp, sizeof(ncp->nc_dvp),
+ hash);
+ LIST_REMOVE(ncp, nc_hash);
+ LIST_INSERT_HEAD(NCHHASH(hash), ncp, nc_hash);
+ }
+ }
+ CACHE_WUNLOCK();
+ free(old_nchashtbl, M_VFSCACHE);
+}
+
/*
* Invalidate all entries to a particular vnode.
*/
void
-cache_purge(vp)
- struct vnode *vp;
+cache_purge(struct vnode *vp)
{
CTR1(KTR_VFS, "cache_purge(%p)", vp);
- SDT_PROBE(vfs, namecache, purge, done, vp, 0, 0, 0, 0);
+ SDT_PROBE1(vfs, namecache, purge, done, vp);
CACHE_WLOCK();
while (!LIST_EMPTY(&vp->v_cache_src))
cache_zap(LIST_FIRST(&vp->v_cache_src));
@@ -958,13 +993,12 @@
* Invalidate all negative entries for a particular directory vnode.
*/
void
-cache_purge_negative(vp)
- struct vnode *vp;
+cache_purge_negative(struct vnode *vp)
{
struct namecache *cp, *ncp;
CTR1(KTR_VFS, "cache_purge_negative(%p)", vp);
- SDT_PROBE(vfs, namecache, purge_negative, done, vp, 0, 0, 0, 0);
+ SDT_PROBE1(vfs, namecache, purge_negative, done, vp);
CACHE_WLOCK();
LIST_FOREACH_SAFE(cp, &vp->v_cache_src, nc_src, ncp) {
if (cp->nc_vp == NULL)
@@ -977,14 +1011,13 @@
* Flush all entries referencing a particular filesystem.
*/
void
-cache_purgevfs(mp)
- struct mount *mp;
+cache_purgevfs(struct mount *mp)
{
struct nchashhead *ncpp;
struct namecache *ncp, *nnp;
/* Scan hash tables for applicable entries */
- SDT_PROBE(vfs, namecache, purgevfs, done, mp, 0, 0, 0, 0);
+ SDT_PROBE1(vfs, namecache, purgevfs, done, mp);
CACHE_WLOCK();
for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) {
LIST_FOREACH_SAFE(ncp, ncpp, nc_hash, nnp) {
@@ -1001,12 +1034,7 @@
*/
int
-vfs_cache_lookup(ap)
- struct vop_lookup_args /* {
- struct vnode *a_dvp;
- struct vnode **a_vpp;
- struct componentname *a_cnp;
- } */ *ap;
+vfs_cache_lookup(struct vop_lookup_args *ap)
{
struct vnode *dvp;
int error;
@@ -1030,7 +1058,7 @@
if (error)
return (error);
- error = cache_lookup(dvp, vpp, cnp);
+ error = cache_lookup(dvp, vpp, cnp, NULL, NULL);
if (error == 0)
return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));
if (error == -1)
@@ -1038,14 +1066,6 @@
return (error);
}
-
-#ifndef _SYS_SYSPROTO_H_
-struct __getcwd_args {
- u_char *buf;
- u_int buflen;
-};
-#endif
-
/*
* XXX All of these sysctls would probably be more productive dead.
*/
@@ -1055,28 +1075,28 @@
/* Implementation of the getcwd syscall. */
int
-sys___getcwd(td, uap)
- struct thread *td;
- struct __getcwd_args *uap;
+sys___getcwd(struct thread *td, struct __getcwd_args *uap)
{
- return (kern___getcwd(td, uap->buf, UIO_USERSPACE, uap->buflen));
+ return (kern___getcwd(td, uap->buf, UIO_USERSPACE, uap->buflen,
+ MAXPATHLEN));
}
int
-kern___getcwd(struct thread *td, u_char *buf, enum uio_seg bufseg, u_int buflen)
+kern___getcwd(struct thread *td, char *buf, enum uio_seg bufseg, u_int buflen,
+ u_int path_max)
{
char *bp, *tmpbuf;
struct filedesc *fdp;
struct vnode *cdir, *rdir;
- int error, vfslocked;
+ int error;
if (disablecwd)
return (ENODEV);
if (buflen < 2)
return (EINVAL);
- if (buflen > MAXPATHLEN)
- buflen = MAXPATHLEN;
+ if (buflen > path_max)
+ buflen = path_max;
tmpbuf = malloc(buflen, M_TEMP, M_WAITOK);
fdp = td->td_proc->p_fd;
@@ -1087,12 +1107,8 @@
VREF(rdir);
FILEDESC_SUNLOCK(fdp);
error = vn_fullpath1(td, cdir, rdir, tmpbuf, &bp, buflen);
- vfslocked = VFS_LOCK_GIANT(rdir->v_mount);
vrele(rdir);
- VFS_UNLOCK_GIANT(vfslocked);
- vfslocked = VFS_LOCK_GIANT(cdir->v_mount);
vrele(cdir);
- VFS_UNLOCK_GIANT(vfslocked);
if (!error) {
if (bufseg == UIO_SYSSPACE)
@@ -1139,7 +1155,7 @@
char *buf;
struct filedesc *fdp;
struct vnode *rdir;
- int error, vfslocked;
+ int error;
if (disablefullpath)
return (ENODEV);
@@ -1153,9 +1169,7 @@
VREF(rdir);
FILEDESC_SUNLOCK(fdp);
error = vn_fullpath1(td, vn, rdir, buf, retbuf, MAXPATHLEN);
- vfslocked = VFS_LOCK_GIANT(rdir->v_mount);
vrele(rdir);
- VFS_UNLOCK_GIANT(vfslocked);
if (!error)
*freebuf = buf;
@@ -1208,7 +1222,7 @@
{
struct vnode *dvp;
struct namecache *ncp;
- int error, vfslocked;
+ int error;
TAILQ_FOREACH(ncp, &((*vp)->v_cache_dst), nc_dst) {
if ((ncp->nc_flag & NCF_ISDOTDOT) == 0)
@@ -1217,41 +1231,34 @@
if (ncp != NULL) {
if (*buflen < ncp->nc_nlen) {
CACHE_RUNLOCK();
- vfslocked = VFS_LOCK_GIANT((*vp)->v_mount);
vrele(*vp);
- VFS_UNLOCK_GIANT(vfslocked);
numfullpathfail4++;
error = ENOMEM;
- SDT_PROBE(vfs, namecache, fullpath, return, error,
- vp, NULL, 0, 0);
+ SDT_PROBE3(vfs, namecache, fullpath, return, error,
+ vp, NULL);
return (error);
}
*buflen -= ncp->nc_nlen;
memcpy(buf + *buflen, nc_get_name(ncp), ncp->nc_nlen);
- SDT_PROBE(vfs, namecache, fullpath, hit, ncp->nc_dvp,
- nc_get_name(ncp), vp, 0, 0);
+ SDT_PROBE3(vfs, namecache, fullpath, hit, ncp->nc_dvp,
+ nc_get_name(ncp), vp);
dvp = *vp;
*vp = ncp->nc_dvp;
vref(*vp);
CACHE_RUNLOCK();
- vfslocked = VFS_LOCK_GIANT(dvp->v_mount);
vrele(dvp);
- VFS_UNLOCK_GIANT(vfslocked);
CACHE_RLOCK();
return (0);
}
- SDT_PROBE(vfs, namecache, fullpath, miss, vp, 0, 0, 0, 0);
+ SDT_PROBE1(vfs, namecache, fullpath, miss, vp);
CACHE_RUNLOCK();
- vfslocked = VFS_LOCK_GIANT((*vp)->v_mount);
vn_lock(*vp, LK_SHARED | LK_RETRY);
error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen);
vput(*vp);
- VFS_UNLOCK_GIANT(vfslocked);
if (error) {
numfullpathfail2++;
- SDT_PROBE(vfs, namecache, fullpath, return, error, vp,
- NULL, 0, 0);
+ SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
return (error);
}
@@ -1260,12 +1267,9 @@
if (dvp->v_iflag & VI_DOOMED) {
/* forced unmount */
CACHE_RUNLOCK();
- vfslocked = VFS_LOCK_GIANT(dvp->v_mount);
vrele(dvp);
- VFS_UNLOCK_GIANT(vfslocked);
error = ENOENT;
- SDT_PROBE(vfs, namecache, fullpath, return, error, vp,
- NULL, 0, 0);
+ SDT_PROBE3(vfs, namecache, fullpath, return, error, vp, NULL);
return (error);
}
/*
@@ -1282,7 +1286,7 @@
vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
char *buf, char **retbuf, u_int buflen)
{
- int error, slash_prefixed, vfslocked;
+ int error, slash_prefixed;
#ifdef KDTRACE_HOOKS
struct vnode *startvp = vp;
#endif
@@ -1293,7 +1297,7 @@
error = 0;
slash_prefixed = 0;
- SDT_PROBE(vfs, namecache, fullpath, entry, vp, 0, 0, 0, 0);
+ SDT_PROBE1(vfs, namecache, fullpath, entry, vp);
numfullpathcalls++;
vref(vp);
CACHE_RLOCK();
@@ -1303,9 +1307,7 @@
return (error);
if (buflen == 0) {
CACHE_RUNLOCK();
- vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vrele(vp);
- VFS_UNLOCK_GIANT(vfslocked);
return (ENOMEM);
}
buf[--buflen] = '/';
@@ -1315,20 +1317,16 @@
if (vp->v_vflag & VV_ROOT) {
if (vp->v_iflag & VI_DOOMED) { /* forced unmount */
CACHE_RUNLOCK();
- vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vrele(vp);
- VFS_UNLOCK_GIANT(vfslocked);
error = ENOENT;
- SDT_PROBE(vfs, namecache, fullpath, return,
- error, vp, NULL, 0, 0);
+ SDT_PROBE3(vfs, namecache, fullpath, return,
+ error, vp, NULL);
break;
}
vp1 = vp->v_mount->mnt_vnodecovered;
vref(vp1);
CACHE_RUNLOCK();
- vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vrele(vp);
- VFS_UNLOCK_GIANT(vfslocked);
vp = vp1;
CACHE_RLOCK();
continue;
@@ -1335,13 +1333,11 @@
}
if (vp->v_type != VDIR) {
CACHE_RUNLOCK();
- vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vrele(vp);
- VFS_UNLOCK_GIANT(vfslocked);
numfullpathfail1++;
error = ENOTDIR;
- SDT_PROBE(vfs, namecache, fullpath, return,
- error, vp, NULL, 0, 0);
+ SDT_PROBE3(vfs, namecache, fullpath, return,
+ error, vp, NULL);
break;
}
error = vn_vptocnp_locked(&vp, td->td_ucred, buf, &buflen);
@@ -1349,12 +1345,10 @@
break;
if (buflen == 0) {
CACHE_RUNLOCK();
- vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vrele(vp);
- VFS_UNLOCK_GIANT(vfslocked);
error = ENOMEM;
- SDT_PROBE(vfs, namecache, fullpath, return, error,
- startvp, NULL, 0, 0);
+ SDT_PROBE3(vfs, namecache, fullpath, return, error,
+ startvp, NULL);
break;
}
buf[--buflen] = '/';
@@ -1365,12 +1359,10 @@
if (!slash_prefixed) {
if (buflen == 0) {
CACHE_RUNLOCK();
- vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vrele(vp);
- VFS_UNLOCK_GIANT(vfslocked);
numfullpathfail4++;
- SDT_PROBE(vfs, namecache, fullpath, return, ENOMEM,
- startvp, NULL, 0, 0);
+ SDT_PROBE3(vfs, namecache, fullpath, return, ENOMEM,
+ startvp, NULL);
return (ENOMEM);
}
buf[--buflen] = '/';
@@ -1377,12 +1369,9 @@
}
numfullpathfound++;
CACHE_RUNLOCK();
- vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vrele(vp);
- VFS_UNLOCK_GIANT(vfslocked);
- SDT_PROBE(vfs, namecache, fullpath, return, 0, startvp, buf + buflen,
- 0, 0);
+ SDT_PROBE3(vfs, namecache, fullpath, return, 0, startvp, buf + buflen);
*retbuf = buf + buflen;
return (0);
}
@@ -1432,12 +1421,9 @@
/* ABI compat shims for old kernel modules. */
#undef cache_enter
-#undef cache_lookup
void cache_enter(struct vnode *dvp, struct vnode *vp,
struct componentname *cnp);
-int cache_lookup(struct vnode *dvp, struct vnode **vpp,
- struct componentname *cnp);
void
cache_enter(struct vnode *dvp, struct vnode *vp, struct componentname *cnp)
@@ -1446,13 +1432,6 @@
cache_enter_time(dvp, vp, cnp, NULL, NULL);
}
-int
-cache_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp)
-{
-
- return (cache_lookup_times(dvp, vpp, cnp, NULL, NULL));
-}
-
/*
* This function updates path string to vnode's full global path
* and checks the size of the new path string against the pathlen argument.
@@ -1473,9 +1452,8 @@
struct nameidata nd;
struct vnode *vp1;
char *rpath, *fbuf;
- int error, vfslocked;
+ int error;
- VFS_ASSERT_GIANT(vp->v_mount);
ASSERT_VOP_ELOCKED(vp, __func__);
/* Return ENODEV if sysctl debug.disablefullpath==1 */
@@ -1502,7 +1480,7 @@
* As a side effect, the vnode is relocked.
* If vnode was renamed, return ENOENT.
*/
- NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
UIO_SYSSPACE, path, td);
error = namei(&nd);
if (error != 0) {
@@ -1509,7 +1487,6 @@
vrele(vp);
goto out;
}
- vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
vp1 = nd.ni_vp;
vrele(vp);
@@ -1519,7 +1496,6 @@
vput(vp1);
error = ENOENT;
}
- VFS_UNLOCK_GIANT(vfslocked);
out:
free(fbuf, M_TEMP);
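
Most of the vfs_cache.c churn above is a mechanical conversion of the DTrace SDT
probes: the old SDT_PROBE() calls that padded unused arguments with zeroes become
fixed-arity SDT_PROBE1/2/3() calls, the probe definitions drop the duplicated name
argument, and the KDTRACE_HOOKS #ifdef around the cache_zap() probes goes away.
A minimal sketch of the before/after pattern, using a made-up probe name rather
than one from the diff:

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/sdt.h>

SDT_PROVIDER_DECLARE(vfs);
/* New-style definition: the name appears once, followed by one type
 * string per argument. */
SDT_PROBE_DEFINE2(vfs, namecache, example, done, "struct vnode *", "char *");

static void
fire_example(struct vnode *vp, char *name)
{

	/*
	 * Old style (removed by this diff):
	 *	SDT_PROBE(vfs, namecache, example, done, vp, name, 0, 0, 0);
	 * New style: the arity is part of the macro name, so there is no
	 * zero padding and no KDTRACE_HOOKS #ifdef at the call site.
	 */
	SDT_PROBE2(vfs, namecache, example, done, vp, name);
}
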
Modified: trunk/sys/kern/vfs_cluster.c
===================================================================
--- trunk/sys/kern/vfs_cluster.c 2018-05-25 21:07:58 UTC (rev 9951)
+++ trunk/sys/kern/vfs_cluster.c 2018-05-26 14:24:52 UTC (rev 9952)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1993
* The Regents of the University of California. All rights reserved.
@@ -32,7 +33,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/vfs_cluster.c 302234 2016-06-27 21:50:30Z bdrewery $");
#include "opt_debug_cluster.h"
@@ -46,6 +47,7 @@
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/resourcevar.h>
+#include <sys/rwlock.h>
#include <sys/vmmeter.h>
#include <vm/vm.h>
#include <vm/vm_object.h>
@@ -60,11 +62,11 @@
static MALLOC_DEFINE(M_SEGMENT, "cl_savebuf", "cluster_save buffer");
-static struct cluster_save *
- cluster_collectbufs(struct vnode *vp, struct buf *last_bp);
-static struct buf *
- cluster_rbuild(struct vnode *vp, u_quad_t filesize, daddr_t lbn,
- daddr_t blkno, long size, int run, struct buf *fbp);
+static struct cluster_save *cluster_collectbufs(struct vnode *vp,
+ struct buf *last_bp, int gbflags);
+static struct buf *cluster_rbuild(struct vnode *vp, u_quad_t filesize,
+ daddr_t lbn, daddr_t blkno, long size, int run, int gbflags,
+ struct buf *fbp);
static void cluster_callback(struct buf *);
static int write_behind = 1;
@@ -88,15 +90,6 @@
*/
int
cluster_read(struct vnode *vp, u_quad_t filesize, daddr_t lblkno, long size,
- struct ucred *cred, long totread, int seqcount, struct buf **bpp)
-{
-
- return (cluster_read_gb(vp, filesize, lblkno, size, cred, totread,
- seqcount, 0, bpp));
-}
-
-int
-cluster_read_gb(struct vnode *vp, u_quad_t filesize, daddr_t lblkno, long size,
struct ucred *cred, long totread, int seqcount, int gbflags,
struct buf **bpp)
{
@@ -109,6 +102,8 @@
error = 0;
bo = &vp->v_bufobj;
+ if (!unmapped_buf_allowed)
+ gbflags &= ~GB_UNMAPPED;
/*
* Try to limit the amount of read-ahead by a few
@@ -124,7 +119,7 @@
/*
* get the requested block
*/
- *bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0, 0);
+ *bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0, gbflags);
origblkno = lblkno;
/*
@@ -139,7 +134,7 @@
return 0;
} else {
bp->b_flags &= ~B_RAM;
- BO_LOCK(bo);
+ BO_RLOCK(bo);
for (i = 1; i < maxra; i++) {
/*
* Stop if the buffer does not exist or it
@@ -162,7 +157,7 @@
BUF_UNLOCK(rbp);
}
}
- BO_UNLOCK(bo);
+ BO_RUNLOCK(bo);
if (i >= maxra) {
return 0;
}
@@ -223,7 +218,7 @@
if (ncontig < nblks)
nblks = ncontig;
bp = cluster_rbuild(vp, filesize, lblkno,
- blkno, size, nblks, bp);
+ blkno, size, nblks, gbflags, bp);
lblkno += (bp->b_bufsize / size);
} else {
bp->b_flags |= B_RAM;
@@ -267,7 +262,7 @@
if (ncontig) {
ncontig = min(ncontig + 1, racluster);
rbp = cluster_rbuild(vp, filesize, lblkno, blkno,
- size, ncontig, NULL);
+ size, ncontig, gbflags, NULL);
lblkno += (rbp->b_bufsize / size);
if (rbp->b_flags & B_DELWRI) {
bqrelse(rbp);
@@ -274,7 +269,7 @@
continue;
}
} else {
- rbp = getblk(vp, lblkno, size, 0, 0, 0);
+ rbp = getblk(vp, lblkno, size, 0, 0, gbflags);
lblkno += 1;
if (rbp->b_flags & B_DELWRI) {
bqrelse(rbp);
@@ -313,24 +308,17 @@
* and then parcel them up into logical blocks in the buffer hash table.
*/
static struct buf *
-cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp)
- struct vnode *vp;
- u_quad_t filesize;
- daddr_t lbn;
- daddr_t blkno;
- long size;
- int run;
- struct buf *fbp;
+cluster_rbuild(struct vnode *vp, u_quad_t filesize, daddr_t lbn,
+ daddr_t blkno, long size, int run, int gbflags, struct buf *fbp)
{
- struct bufobj *bo;
struct buf *bp, *tbp;
daddr_t bn;
off_t off;
long tinc, tsize;
- int i, inc, j, toff;
+ int i, inc, j, k, toff;
KASSERT(size == vp->v_mount->mnt_stat.f_iosize,
- ("cluster_rbuild: size %ld != filesize %jd\n",
+ ("cluster_rbuild: size %ld != f_iosize %jd\n",
size, (intmax_t)vp->v_mount->mnt_stat.f_iosize));
/*
@@ -344,7 +332,7 @@
tbp = fbp;
tbp->b_iocmd = BIO_READ;
} else {
- tbp = getblk(vp, lbn, size, 0, 0, 0);
+ tbp = getblk(vp, lbn, size, 0, 0, gbflags);
if (tbp->b_flags & B_CACHE)
return tbp;
tbp->b_flags |= B_ASYNC | B_RAM;
@@ -365,9 +353,14 @@
* address may not be either. Inherit the b_data offset
* from the original buffer.
*/
- bp->b_data = (char *)((vm_offset_t)bp->b_data |
- ((vm_offset_t)tbp->b_data & PAGE_MASK));
bp->b_flags = B_ASYNC | B_CLUSTER | B_VMIO;
+ if ((gbflags & GB_UNMAPPED) != 0) {
+ bp->b_flags |= B_UNMAPPED;
+ bp->b_data = unmapped_buf;
+ } else {
+ bp->b_data = (char *)((vm_offset_t)bp->b_data |
+ ((vm_offset_t)tbp->b_data & PAGE_MASK));
+ }
bp->b_iocmd = BIO_READ;
bp->b_iodone = cluster_callback;
bp->b_blkno = blkno;
@@ -383,15 +376,23 @@
bp->b_npages = 0;
inc = btodb(size);
- bo = &vp->v_bufobj;
for (bn = blkno, i = 0; i < run; ++i, bn += inc) {
- if (i != 0) {
+ if (i == 0) {
+ VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object);
+ vfs_drain_busy_pages(tbp);
+ vm_object_pip_add(tbp->b_bufobj->bo_object,
+ tbp->b_npages);
+ for (k = 0; k < tbp->b_npages; k++)
+ vm_page_sbusy(tbp->b_pages[k]);
+ VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object);
+ } else {
if ((bp->b_npages * PAGE_SIZE) +
round_page(size) > vp->v_mount->mnt_iosize_max) {
break;
}
- tbp = getblk(vp, lbn + i, size, 0, 0, GB_LOCK_NOWAIT);
+ tbp = getblk(vp, lbn + i, size, 0, 0, GB_LOCK_NOWAIT |
+ (gbflags & GB_UNMAPPED));
/* Don't wait around for locked bufs. */
if (tbp == NULL)
@@ -402,17 +403,16 @@
* (marked B_CACHE), or locked (may be doing a
* background write), or if the buffer is not
* VMIO backed. The clustering code can only deal
- * with VMIO-backed buffers.
+ * with VMIO-backed buffers. The bo lock is not
+ * required for the BKGRDINPROG check since it
+ * can not be set without the buf lock.
*/
- BO_LOCK(bo);
if ((tbp->b_vflags & BV_BKGRDINPROG) ||
(tbp->b_flags & B_CACHE) ||
(tbp->b_flags & B_VMIO) == 0) {
- BO_UNLOCK(bo);
bqrelse(tbp);
break;
}
- BO_UNLOCK(bo);
/*
* The buffer must be completely invalid in order to
@@ -421,25 +421,33 @@
*/
off = tbp->b_offset;
tsize = size;
- VM_OBJECT_LOCK(tbp->b_bufobj->bo_object);
+ VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object);
for (j = 0; tsize > 0; j++) {
toff = off & PAGE_MASK;
tinc = tsize;
if (toff + tinc > PAGE_SIZE)
tinc = PAGE_SIZE - toff;
- VM_OBJECT_LOCK_ASSERT(tbp->b_pages[j]->object,
- MA_OWNED);
+ VM_OBJECT_ASSERT_WLOCKED(tbp->b_pages[j]->object);
if ((tbp->b_pages[j]->valid &
vm_page_bits(toff, tinc)) != 0)
break;
+ if (vm_page_xbusied(tbp->b_pages[j]))
+ break;
+ vm_object_pip_add(tbp->b_bufobj->bo_object, 1);
+ vm_page_sbusy(tbp->b_pages[j]);
off += tinc;
tsize -= tinc;
}
- VM_OBJECT_UNLOCK(tbp->b_bufobj->bo_object);
if (tsize > 0) {
+clean_sbusy:
+ vm_object_pip_add(tbp->b_bufobj->bo_object, -j);
+ for (k = 0; k < j; k++)
+ vm_page_sunbusy(tbp->b_pages[k]);
+ VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object);
bqrelse(tbp);
break;
}
+ VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object);
/*
* Set a read-ahead mark as appropriate
@@ -459,8 +467,8 @@
if (tbp->b_blkno == tbp->b_lblkno) {
tbp->b_blkno = bn;
} else if (tbp->b_blkno != bn) {
- brelse(tbp);
- break;
+ VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object);
+ goto clean_sbusy;
}
}
/*
@@ -470,14 +478,12 @@
BUF_KERNPROC(tbp);
TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
tbp, b_cluster.cluster_entry);
- VM_OBJECT_LOCK(tbp->b_bufobj->bo_object);
+ VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object);
for (j = 0; j < tbp->b_npages; j += 1) {
vm_page_t m;
m = tbp->b_pages[j];
- vm_page_io_start(m);
- vm_object_pip_add(m->object, 1);
if ((bp->b_npages == 0) ||
- (bp->b_pages[bp->b_npages-1] != m)) {
+ (bp->b_pages[bp->b_npages-1] != m)) {
bp->b_pages[bp->b_npages] = m;
bp->b_npages++;
}
@@ -484,7 +490,7 @@
if (m->valid == VM_PAGE_BITS_ALL)
tbp->b_pages[j] = bogus_page;
}
- VM_OBJECT_UNLOCK(tbp->b_bufobj->bo_object);
+ VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object);
/*
* Don't inherit tbp->b_bufsize as it may be larger due to
* a non-page-aligned size. Instead just aggregate using
@@ -502,20 +508,22 @@
* Fully valid pages in the cluster are already good and do not need
* to be re-read from disk. Replace the page with bogus_page
*/
- VM_OBJECT_LOCK(bp->b_bufobj->bo_object);
+ VM_OBJECT_WLOCK(bp->b_bufobj->bo_object);
for (j = 0; j < bp->b_npages; j++) {
- VM_OBJECT_LOCK_ASSERT(bp->b_pages[j]->object, MA_OWNED);
+ VM_OBJECT_ASSERT_WLOCKED(bp->b_pages[j]->object);
if (bp->b_pages[j]->valid == VM_PAGE_BITS_ALL)
bp->b_pages[j] = bogus_page;
}
- VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object);
+ VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object);
if (bp->b_bufsize > bp->b_kvasize)
panic("cluster_rbuild: b_bufsize(%ld) > b_kvasize(%d)\n",
bp->b_bufsize, bp->b_kvasize);
bp->b_kvasize = bp->b_bufsize;
- pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
- (vm_page_t *)bp->b_pages, bp->b_npages);
+ if ((bp->b_flags & B_UNMAPPED) == 0) {
+ pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
+ (vm_page_t *)bp->b_pages, bp->b_npages);
+ }
return (bp);
}
@@ -533,12 +541,15 @@
int error = 0;
/*
- * Must propogate errors to all the components.
+ * Must propagate errors to all the components.
*/
if (bp->b_ioflags & BIO_ERROR)
error = bp->b_error;
- pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
+ if ((bp->b_flags & B_UNMAPPED) == 0) {
+ pmap_qremove(trunc_page((vm_offset_t) bp->b_data),
+ bp->b_npages);
+ }
/*
* Move memory from the large cluster buffer into the component
* buffers and mark IO as done on these.
@@ -580,7 +591,8 @@
*/
static __inline int
-cluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len)
+cluster_wbuild_wb(struct vnode *vp, long size, daddr_t start_lbn, int len,
+ int gbflags)
{
int r = 0;
@@ -591,7 +603,7 @@
start_lbn -= len;
/* FALLTHROUGH */
case 1:
- r = cluster_wbuild(vp, size, start_lbn, len);
+ r = cluster_wbuild(vp, size, start_lbn, len, gbflags);
/* FALLTHROUGH */
default:
/* FALLTHROUGH */
@@ -611,21 +623,17 @@
* 4. end of a cluster - asynchronously write cluster
*/
void
-cluster_write(struct vnode *vp, struct buf *bp, u_quad_t filesize, int seqcount)
+cluster_write(struct vnode *vp, struct buf *bp, u_quad_t filesize, int seqcount,
+ int gbflags)
{
-
- cluster_write_gb(vp, bp, filesize, seqcount, 0);
-}
-
-void
-cluster_write_gb(struct vnode *vp, struct buf *bp, u_quad_t filesize,
- int seqcount, int gbflags)
-{
daddr_t lbn;
int maxclen, cursize;
int lblocksize;
int async;
+ if (!unmapped_buf_allowed)
+ gbflags &= ~GB_UNMAPPED;
+
if (vp->v_type == VREG) {
async = DOINGASYNC(vp);
lblocksize = vp->v_mount->mnt_stat.f_iosize;
@@ -665,13 +673,13 @@
lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
if (!async && seqcount > 0) {
cluster_wbuild_wb(vp, lblocksize,
- vp->v_cstart, cursize);
+ vp->v_cstart, cursize, gbflags);
}
} else {
struct buf **bpp, **endbp;
struct cluster_save *buflist;
- buflist = cluster_collectbufs(vp, bp);
+ buflist = cluster_collectbufs(vp, bp, gbflags);
endbp = &buflist->bs_children
[buflist->bs_nchildren - 1];
if (VOP_REALLOCBLKS(vp, buflist)) {
@@ -690,7 +698,7 @@
if (seqcount > 1) {
cluster_wbuild_wb(vp,
lblocksize, vp->v_cstart,
- cursize);
+ cursize, gbflags);
}
} else {
/*
@@ -738,8 +746,10 @@
* update daemon handle it.
*/
bdwrite(bp);
- if (seqcount > 1)
- cluster_wbuild_wb(vp, lblocksize, vp->v_cstart, vp->v_clen + 1);
+ if (seqcount > 1) {
+ cluster_wbuild_wb(vp, lblocksize, vp->v_cstart,
+ vp->v_clen + 1, gbflags);
+ }
vp->v_clen = 0;
vp->v_cstart = lbn + 1;
} else if (vm_page_count_severe()) {
@@ -765,14 +775,7 @@
* the current block (if last_bp == NULL).
*/
int
-cluster_wbuild(struct vnode *vp, long size, daddr_t start_lbn, int len)
-{
-
- return (cluster_wbuild_gb(vp, size, start_lbn, len, 0));
-}
-
-int
-cluster_wbuild_gb(struct vnode *vp, long size, daddr_t start_lbn, int len,
+cluster_wbuild(struct vnode *vp, long size, daddr_t start_lbn, int len,
int gbflags)
{
struct buf *bp, *tbp;
@@ -781,6 +784,9 @@
int totalwritten = 0;
int dbsize = btodb(size);
+ if (!unmapped_buf_allowed)
+ gbflags &= ~GB_UNMAPPED;
+
bo = &vp->v_bufobj;
while (len > 0) {
/*
@@ -797,7 +803,7 @@
continue;
}
if (BUF_LOCK(tbp,
- LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, BO_MTX(bo))) {
+ LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK, BO_LOCKPTR(bo))) {
++start_lbn;
--len;
continue;
@@ -830,7 +836,9 @@
(tbp->b_bcount != tbp->b_bufsize) ||
(tbp->b_bcount != size) ||
(len == 1) ||
- ((bp = getpbuf(&cluster_pbuf_freecnt)) == NULL)) {
+ ((bp = (vp->v_vflag & VV_MD) != 0 ?
+ trypbuf(&cluster_pbuf_freecnt) :
+ getpbuf(&cluster_pbuf_freecnt)) == NULL)) {
totalwritten += tbp->b_bufsize;
bawrite(tbp);
++start_lbn;
@@ -859,10 +867,16 @@
* address may not be either. Inherit the b_data offset
* from the original buffer.
*/
- bp->b_data = (char *)((vm_offset_t)bp->b_data |
- ((vm_offset_t)tbp->b_data & PAGE_MASK));
- bp->b_flags |= B_CLUSTER |
- (tbp->b_flags & (B_VMIO | B_NEEDCOMMIT));
+ if ((gbflags & GB_UNMAPPED) == 0 ||
+ (tbp->b_flags & B_VMIO) == 0) {
+ bp->b_data = (char *)((vm_offset_t)bp->b_data |
+ ((vm_offset_t)tbp->b_data & PAGE_MASK));
+ } else {
+ bp->b_flags |= B_UNMAPPED;
+ bp->b_data = unmapped_buf;
+ }
+ bp->b_flags |= B_CLUSTER | (tbp->b_flags & (B_VMIO |
+ B_NEEDCOMMIT));
bp->b_iodone = cluster_callback;
pbgetvp(vp, bp);
/*
@@ -892,7 +906,7 @@
*/
if (BUF_LOCK(tbp,
LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK,
- BO_MTX(bo)))
+ BO_LOCKPTR(bo)))
break;
if ((tbp->b_flags & (B_VMIO | B_CLUSTEROK |
@@ -945,12 +959,14 @@
if (tbp->b_flags & B_VMIO) {
vm_page_t m;
- VM_OBJECT_LOCK(tbp->b_bufobj->bo_object);
- if (i != 0) { /* if not first buffer */
+ VM_OBJECT_WLOCK(tbp->b_bufobj->bo_object);
+ if (i == 0) {
+ vfs_drain_busy_pages(tbp);
+ } else { /* if not first buffer */
for (j = 0; j < tbp->b_npages; j += 1) {
m = tbp->b_pages[j];
- if (m->oflags & VPO_BUSY) {
- VM_OBJECT_UNLOCK(
+ if (vm_page_xbusied(m)) {
+ VM_OBJECT_WUNLOCK(
tbp->b_object);
bqrelse(tbp);
goto finishcluster;
@@ -959,7 +975,7 @@
}
for (j = 0; j < tbp->b_npages; j += 1) {
m = tbp->b_pages[j];
- vm_page_io_start(m);
+ vm_page_sbusy(m);
vm_object_pip_add(m->object, 1);
if ((bp->b_npages == 0) ||
(bp->b_pages[bp->b_npages - 1] != m)) {
@@ -967,15 +983,21 @@
bp->b_npages++;
}
}
- VM_OBJECT_UNLOCK(tbp->b_bufobj->bo_object);
+ VM_OBJECT_WUNLOCK(tbp->b_bufobj->bo_object);
}
bp->b_bcount += size;
bp->b_bufsize += size;
- bundirty(tbp);
- tbp->b_flags &= ~B_DONE;
+ /*
+ * If any of the clustered buffers have their
+ * B_BARRIER flag set, transfer that request to
+ * the cluster.
+ */
+ bp->b_flags |= (tbp->b_flags & B_BARRIER);
+ tbp->b_flags &= ~(B_DONE | B_BARRIER);
+ tbp->b_flags |= B_ASYNC;
tbp->b_ioflags &= ~BIO_ERROR;
- tbp->b_flags |= B_ASYNC;
tbp->b_iocmd = BIO_WRITE;
+ bundirty(tbp);
reassignbuf(tbp); /* put on clean list */
bufobj_wref(tbp->b_bufobj);
BUF_KERNPROC(tbp);
@@ -983,8 +1005,10 @@
tbp, b_cluster.cluster_entry);
}
finishcluster:
- pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
- (vm_page_t *) bp->b_pages, bp->b_npages);
+ if ((bp->b_flags & B_UNMAPPED) == 0) {
+ pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
+ (vm_page_t *)bp->b_pages, bp->b_npages);
+ }
if (bp->b_bufsize > bp->b_kvasize)
panic(
"cluster_wbuild: b_bufsize(%ld) > b_kvasize(%d)\n",
@@ -1005,9 +1029,7 @@
* Plus add one additional buffer.
*/
static struct cluster_save *
-cluster_collectbufs(vp, last_bp)
- struct vnode *vp;
- struct buf *last_bp;
+cluster_collectbufs(struct vnode *vp, struct buf *last_bp, int gbflags)
{
struct cluster_save *buflist;
struct buf *bp;
@@ -1020,7 +1042,8 @@
buflist->bs_nchildren = 0;
buflist->bs_children = (struct buf **) (buflist + 1);
for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++) {
- (void) bread(vp, lbn, last_bp->b_bcount, NOCRED, &bp);
+ (void)bread_gb(vp, lbn, last_bp->b_bcount, NOCRED,
+ gbflags, &bp);
buflist->bs_children[i] = bp;
if (bp->b_blkno == bp->b_lblkno)
VOP_BMAP(vp, bp->b_lblkno, NULL, &bp->b_blkno,
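
In vfs_cluster.c the *_gb() variants disappear: cluster_read(), cluster_write()
and cluster_wbuild() now take the gbflags argument directly, and GB_UNMAPPED is
stripped up front when unmapped buffers are not allowed. Any caller of the old
eight-argument cluster_read() has to be updated; the shape of the shim this diff
deletes (and of a compatibility wrapper an out-of-tree filesystem could keep
locally, hypothetically named here) is simply:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
#include <sys/vnode.h>

/* Hypothetical local wrapper: forward the old calling convention to the
 * merged cluster_read(), passing 0 for gbflags (mapped buffers). */
static int
my_cluster_read(struct vnode *vp, u_quad_t filesize, daddr_t lblkno,
    long size, struct ucred *cred, long totread, int seqcount,
    struct buf **bpp)
{

	return (cluster_read(vp, filesize, lblkno, size, cred, totread,
	    seqcount, 0, bpp));
}
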
Modified: trunk/sys/kern/vfs_default.c
===================================================================
--- trunk/sys/kern/vfs_default.c 2018-05-25 21:07:58 UTC (rev 9951)
+++ trunk/sys/kern/vfs_default.c 2018-05-26 14:24:52 UTC (rev 9952)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
@@ -33,7 +34,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/vfs_default.c 330266 2018-03-02 04:43:07Z mckusick $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -47,8 +48,8 @@
#include <sys/lockf.h>
#include <sys/malloc.h>
#include <sys/mount.h>
-#include <sys/mutex.h>
#include <sys/namei.h>
+#include <sys/rwlock.h>
#include <sys/fcntl.h>
#include <sys/unistd.h>
#include <sys/vnode.h>
@@ -354,8 +355,8 @@
if (error)
goto out;
- if ((dp->d_type != DT_WHT) &&
- !strcmp(dp->d_name, dirname)) {
+ if (dp->d_type != DT_WHT && dp->d_fileno != 0 &&
+ strcmp(dp->d_name, dirname) == 0) {
found = 1;
goto out;
}
@@ -399,17 +400,24 @@
vop_stdadvlock(struct vop_advlock_args *ap)
{
struct vnode *vp;
- struct ucred *cred;
struct vattr vattr;
int error;
vp = ap->a_vp;
- cred = curthread->td_ucred;
- vn_lock(vp, LK_SHARED | LK_RETRY);
- error = VOP_GETATTR(vp, &vattr, cred);
- VOP_UNLOCK(vp, 0);
- if (error)
- return (error);
+ if (ap->a_fl->l_whence == SEEK_END) {
+ /*
+ * The NFSv4 server must avoid doing a vn_lock() here, since it
+ * can deadlock the nfsd threads, due to a LOR. Fortunately
+ * the NFSv4 server always uses SEEK_SET and this code is
+ * only required for the SEEK_END case.
+ */
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+ error = VOP_GETATTR(vp, &vattr, curthread->td_ucred);
+ VOP_UNLOCK(vp, 0);
+ if (error)
+ return (error);
+ } else
+ vattr.va_size = 0;
return (lf_advlock(ap, &(vp->v_lockf), vattr.va_size));
}
@@ -418,17 +426,19 @@
vop_stdadvlockasync(struct vop_advlockasync_args *ap)
{
struct vnode *vp;
- struct ucred *cred;
struct vattr vattr;
int error;
vp = ap->a_vp;
- cred = curthread->td_ucred;
- vn_lock(vp, LK_SHARED | LK_RETRY);
- error = VOP_GETATTR(vp, &vattr, cred);
- VOP_UNLOCK(vp, 0);
- if (error)
- return (error);
+ if (ap->a_fl->l_whence == SEEK_END) {
+ /* The size argument is only needed for SEEK_END. */
+ vn_lock(vp, LK_SHARED | LK_RETRY);
+ error = VOP_GETATTR(vp, &vattr, curthread->td_ucred);
+ VOP_UNLOCK(vp, 0);
+ if (error)
+ return (error);
+ } else
+ vattr.va_size = 0;
return (lf_advlockasync(ap, &(vp->v_lockf), vattr.va_size));
}
@@ -626,18 +636,25 @@
vop_stdfsync(ap)
struct vop_fsync_args /* {
struct vnode *a_vp;
- struct ucred *a_cred;
int a_waitfor;
struct thread *a_td;
} */ *ap;
{
- struct vnode *vp = ap->a_vp;
- struct buf *bp;
+ struct vnode *vp;
+ struct buf *bp, *nbp;
struct bufobj *bo;
- struct buf *nbp;
- int error = 0;
- int maxretry = 1000; /* large, arbitrarily chosen */
+ struct mount *mp;
+ int error, maxretry;
+ error = 0;
+ maxretry = 10000; /* large, arbitrarily chosen */
+ vp = ap->a_vp;
+ mp = NULL;
+ if (vp->v_type == VCHR) {
+ VI_LOCK(vp);
+ mp = vp->v_rdev->si_mountpt;
+ VI_UNLOCK(vp);
+ }
bo = &vp->v_bufobj;
BO_LOCK(bo);
loop1:
@@ -662,7 +679,7 @@
continue;
if (BUF_LOCK(bp,
LK_EXCLUSIVE | LK_INTERLOCK | LK_SLEEPFAIL,
- BO_MTX(bo)) != 0) {
+ BO_LOCKPTR(bo)) != 0) {
BO_LOCK(bo);
goto loop1;
}
@@ -680,6 +697,8 @@
bremfree(bp);
bawrite(bp);
}
+ if (maxretry < 1000)
+ pause("dirty", hz < 1000 ? 1 : hz / 1000);
BO_LOCK(bo);
goto loop2;
}
@@ -701,14 +720,16 @@
TAILQ_FOREACH(bp, &bo->bo_dirty.bv_hd, b_bobufs)
if ((error = bp->b_error) == 0)
continue;
- if (error == 0 && --maxretry >= 0)
+ if ((mp != NULL && mp->mnt_secondary_writes > 0) ||
+ (error == 0 && --maxretry >= 0))
goto loop1;
- error = EAGAIN;
+ if (error == 0)
+ error = EAGAIN;
}
}
BO_UNLOCK(bo);
- if (error == EAGAIN)
- vprint("fsync: giving up on dirty", vp);
+ if (error != 0)
+ vn_printf(vp, "fsync: giving up on dirty (error = %d) ", error);
return (error);
}
@@ -1017,7 +1038,7 @@
{
struct vnode *vp;
off_t start, end;
- int error, vfslocked;
+ int error;
vp = ap->a_vp;
switch (ap->a_advice) {
@@ -1038,11 +1059,9 @@
* requested range.
*/
error = 0;
- vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
if (vp->v_iflag & VI_DOOMED) {
VOP_UNLOCK(vp, 0);
- VFS_UNLOCK_GIANT(vfslocked);
break;
}
vinvalbuf(vp, V_CLEANONLY, 0, 0);
@@ -1049,13 +1068,12 @@
if (vp->v_object != NULL) {
start = trunc_page(ap->a_start);
end = round_page(ap->a_end);
- VM_OBJECT_LOCK(vp->v_object);
+ VM_OBJECT_WLOCK(vp->v_object);
vm_object_page_cache(vp->v_object, OFF_TO_IDX(start),
OFF_TO_IDX(end));
- VM_OBJECT_UNLOCK(vp->v_object);
+ VM_OBJECT_WUNLOCK(vp->v_object);
}
VOP_UNLOCK(vp, 0);
- VFS_UNLOCK_GIANT(vfslocked);
break;
default:
error = EINVAL;
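
The vop_stdadvlock()/vop_stdadvlockasync() change in vfs_default.c above takes
the vnode lock and calls VOP_GETATTR() only when the caller asked for an
end-of-file-relative lock, since lf_advlock() needs the file size solely to
translate SEEK_END offsets; for the other whence values (the NFSv4 server, per
the comment, always uses SEEK_SET) va_size is passed as 0. A small userland
illustration of the one case that still triggers the getattr, using the standard
fcntl(2) advisory-lock interface:

#include <fcntl.h>
#include <string.h>
#include <unistd.h>

/* Write-lock the last 100 bytes of the file.  Because l_whence is
 * SEEK_END, the kernel must learn the current file size, which is the
 * path that now does vn_lock() + VOP_GETATTR() in vop_stdadvlock(). */
static int
lock_tail(int fd)
{
	struct flock fl;

	memset(&fl, 0, sizeof(fl));
	fl.l_type = F_WRLCK;
	fl.l_whence = SEEK_END;
	fl.l_start = -100;	/* 100 bytes before EOF */
	fl.l_len = 100;
	return (fcntl(fd, F_SETLK, &fl));
}
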
Modified: trunk/sys/kern/vfs_export.c
===================================================================
--- trunk/sys/kern/vfs_export.c 2018-05-25 21:07:58 UTC (rev 9951)
+++ trunk/sys/kern/vfs_export.c 2018-05-26 14:24:52 UTC (rev 9952)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
@@ -35,7 +36,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/vfs_export.c 247116 2013-02-21 19:02:50Z jhb $");
#include <sys/param.h>
#include <sys/dirent.h>
Modified: trunk/sys/kern/vfs_extattr.c
===================================================================
--- trunk/sys/kern/vfs_extattr.c 2018-05-25 21:07:58 UTC (rev 9951)
+++ trunk/sys/kern/vfs_extattr.c 2018-05-26 14:24:52 UTC (rev 9952)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1999-2001 Robert N. M. Watson
* All rights reserved.
@@ -27,11 +28,11 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/vfs_extattr.c 280258 2015-03-19 13:37:36Z rwatson $");
#include <sys/param.h>
#include <sys/systm.h>
-#include <sys/capability.h>
+#include <sys/capsicum.h>
#include <sys/lock.h>
#include <sys/mount.h>
#include <sys/mutex.h>
@@ -69,7 +70,7 @@
struct nameidata nd;
struct mount *mp, *mp_writable;
char attrname[EXTATTR_MAXNAMELEN];
- int vfslocked, fnvfslocked, error;
+ int error;
AUDIT_ARG_CMD(uap->cmd);
AUDIT_ARG_VALUE(uap->attrnamespace);
@@ -85,27 +86,24 @@
}
AUDIT_ARG_TEXT(attrname);
- vfslocked = fnvfslocked = 0;
mp = NULL;
filename_vp = NULL;
if (uap->filename != NULL) {
- NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | AUDITVNODE2,
+ NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE2,
UIO_USERSPACE, uap->filename, td);
error = namei(&nd);
if (error)
return (error);
- fnvfslocked = NDHASGIANT(&nd);
filename_vp = nd.ni_vp;
NDFREE(&nd, NDF_NO_VP_RELE);
}
/* uap->path is always defined. */
- NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | LOCKLEAF | AUDITVNODE1,
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
UIO_USERSPACE, uap->path, td);
error = namei(&nd);
if (error)
goto out;
- vfslocked = NDHASGIANT(&nd);
mp = nd.ni_vp->v_mount;
error = vfs_busy(mp, 0);
if (error) {
@@ -145,8 +143,6 @@
*/
if (filename_vp != NULL)
vrele(filename_vp);
- VFS_UNLOCK_GIANT(fnvfslocked);
- VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
@@ -170,7 +166,6 @@
ssize_t cnt;
int error;
- VFS_ASSERT_GIANT(vp->v_mount);
error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
if (error)
return (error);
@@ -222,7 +217,8 @@
{
struct file *fp;
char attrname[EXTATTR_MAXNAMELEN];
- int vfslocked, error;
+ cap_rights_t rights;
+ int error;
AUDIT_ARG_FD(uap->fd);
AUDIT_ARG_VALUE(uap->attrnamespace);
@@ -231,15 +227,14 @@
return (error);
AUDIT_ARG_TEXT(attrname);
- error = getvnode(td->td_proc->p_fd, uap->fd, CAP_EXTATTR_SET, &fp);
+ error = getvnode(td->td_proc->p_fd, uap->fd,
+ cap_rights_init(&rights, CAP_EXTATTR_SET), &fp);
if (error)
return (error);
- vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
error = extattr_set_vp(fp->f_vnode, uap->attrnamespace,
attrname, uap->data, uap->nbytes, td);
fdrop(fp, td);
- VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
@@ -257,7 +252,7 @@
{
struct nameidata nd;
char attrname[EXTATTR_MAXNAMELEN];
- int vfslocked, error;
+ int error;
AUDIT_ARG_VALUE(uap->attrnamespace);
error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
@@ -265,7 +260,7 @@
return (error);
AUDIT_ARG_TEXT(attrname);
- NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | AUDITVNODE1, UIO_USERSPACE,
+ NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, UIO_USERSPACE,
uap->path, td);
error = namei(&nd);
if (error)
@@ -272,12 +267,10 @@
return (error);
NDFREE(&nd, NDF_ONLY_PNBUF);
- vfslocked = NDHASGIANT(&nd);
error = extattr_set_vp(nd.ni_vp, uap->attrnamespace, attrname,
uap->data, uap->nbytes, td);
vrele(nd.ni_vp);
- VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
@@ -294,7 +287,7 @@
{
struct nameidata nd;
char attrname[EXTATTR_MAXNAMELEN];
- int vfslocked, error;
+ int error;
AUDIT_ARG_VALUE(uap->attrnamespace);
error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
@@ -302,7 +295,7 @@
return (error);
AUDIT_ARG_TEXT(attrname);
- NDINIT(&nd, LOOKUP, MPSAFE | NOFOLLOW | AUDITVNODE1, UIO_USERSPACE,
+ NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, UIO_USERSPACE,
uap->path, td);
error = namei(&nd);
if (error)
@@ -309,12 +302,10 @@
return (error);
NDFREE(&nd, NDF_ONLY_PNBUF);
- vfslocked = NDHASGIANT(&nd);
error = extattr_set_vp(nd.ni_vp, uap->attrnamespace, attrname,
uap->data, uap->nbytes, td);
vrele(nd.ni_vp);
- VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
@@ -338,7 +329,6 @@
size_t size, *sizep;
int error;
- VFS_ASSERT_GIANT(vp->v_mount);
vn_lock(vp, LK_SHARED | LK_RETRY);
/*
@@ -402,7 +392,8 @@
{
struct file *fp;
char attrname[EXTATTR_MAXNAMELEN];
- int vfslocked, error;
+ cap_rights_t rights;
+ int error;
AUDIT_ARG_FD(uap->fd);
AUDIT_ARG_VALUE(uap->attrnamespace);
@@ -411,16 +402,15 @@
return (error);
AUDIT_ARG_TEXT(attrname);
- error = getvnode(td->td_proc->p_fd, uap->fd, CAP_EXTATTR_GET, &fp);
+ error = getvnode(td->td_proc->p_fd, uap->fd,
+ cap_rights_init(&rights, CAP_EXTATTR_GET), &fp);
if (error)
return (error);
- vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
error = extattr_get_vp(fp->f_vnode, uap->attrnamespace,
attrname, uap->data, uap->nbytes, td);
fdrop(fp, td);
- VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
@@ -437,7 +427,7 @@
{
struct nameidata nd;
char attrname[EXTATTR_MAXNAMELEN];
- int vfslocked, error;
+ int error;
AUDIT_ARG_VALUE(uap->attrnamespace);
error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
@@ -445,19 +435,16 @@
return (error);
AUDIT_ARG_TEXT(attrname);
- NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | AUDITVNODE1, UIO_USERSPACE,
- uap->path, td);
+ NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path, td);
error = namei(&nd);
if (error)
return (error);
NDFREE(&nd, NDF_ONLY_PNBUF);
- vfslocked = NDHASGIANT(&nd);
error = extattr_get_vp(nd.ni_vp, uap->attrnamespace, attrname,
uap->data, uap->nbytes, td);
vrele(nd.ni_vp);
- VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
@@ -474,7 +461,7 @@
{
struct nameidata nd;
char attrname[EXTATTR_MAXNAMELEN];
- int vfslocked, error;
+ int error;
AUDIT_ARG_VALUE(uap->attrnamespace);
error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
@@ -482,19 +469,17 @@
return (error);
AUDIT_ARG_TEXT(attrname);
- NDINIT(&nd, LOOKUP, MPSAFE | NOFOLLOW | AUDITVNODE1, UIO_USERSPACE,
- uap->path, td);
+ NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path,
+ td);
error = namei(&nd);
if (error)
return (error);
NDFREE(&nd, NDF_ONLY_PNBUF);
- vfslocked = NDHASGIANT(&nd);
error = extattr_get_vp(nd.ni_vp, uap->attrnamespace, attrname,
uap->data, uap->nbytes, td);
vrele(nd.ni_vp);
- VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
@@ -515,7 +500,6 @@
struct mount *mp;
int error;
- VFS_ASSERT_GIANT(vp->v_mount);
error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
if (error)
return (error);
@@ -552,7 +536,8 @@
{
struct file *fp;
char attrname[EXTATTR_MAXNAMELEN];
- int vfslocked, error;
+ cap_rights_t rights;
+ int error;
AUDIT_ARG_FD(uap->fd);
AUDIT_ARG_VALUE(uap->attrnamespace);
@@ -561,16 +546,14 @@
return (error);
AUDIT_ARG_TEXT(attrname);
- error = getvnode(td->td_proc->p_fd, uap->fd, CAP_EXTATTR_DELETE,
- &fp);
+ error = getvnode(td->td_proc->p_fd, uap->fd,
+ cap_rights_init(&rights, CAP_EXTATTR_DELETE), &fp);
if (error)
return (error);
- vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
error = extattr_delete_vp(fp->f_vnode, uap->attrnamespace,
attrname, td);
fdrop(fp, td);
- VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
@@ -585,7 +568,7 @@
{
struct nameidata nd;
char attrname[EXTATTR_MAXNAMELEN];
- int vfslocked, error;
+ int error;
AUDIT_ARG_VALUE(uap->attrnamespace);
error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
@@ -593,17 +576,14 @@
return(error);
AUDIT_ARG_TEXT(attrname);
- NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | AUDITVNODE1, UIO_USERSPACE,
- uap->path, td);
+ NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path, td);
error = namei(&nd);
if (error)
return(error);
NDFREE(&nd, NDF_ONLY_PNBUF);
- vfslocked = NDHASGIANT(&nd);
error = extattr_delete_vp(nd.ni_vp, uap->attrnamespace, attrname, td);
vrele(nd.ni_vp);
- VFS_UNLOCK_GIANT(vfslocked);
return(error);
}
@@ -618,7 +598,7 @@
{
struct nameidata nd;
char attrname[EXTATTR_MAXNAMELEN];
- int vfslocked, error;
+ int error;
AUDIT_ARG_VALUE(uap->attrnamespace);
error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
@@ -626,17 +606,14 @@
return(error);
AUDIT_ARG_TEXT(attrname);
- NDINIT(&nd, LOOKUP, MPSAFE | NOFOLLOW | AUDITVNODE1, UIO_USERSPACE,
- uap->path, td);
+ NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path, td);
error = namei(&nd);
if (error)
return(error);
NDFREE(&nd, NDF_ONLY_PNBUF);
- vfslocked = NDHASGIANT(&nd);
error = extattr_delete_vp(nd.ni_vp, uap->attrnamespace, attrname, td);
vrele(nd.ni_vp);
- VFS_UNLOCK_GIANT(vfslocked);
return(error);
}
@@ -660,7 +637,6 @@
ssize_t cnt;
int error;
- VFS_ASSERT_GIANT(vp->v_mount);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
auiop = NULL;
@@ -717,20 +693,20 @@
} */ *uap;
{
struct file *fp;
- int vfslocked, error;
+ cap_rights_t rights;
+ int error;
AUDIT_ARG_FD(uap->fd);
AUDIT_ARG_VALUE(uap->attrnamespace);
- error = getvnode(td->td_proc->p_fd, uap->fd, CAP_EXTATTR_LIST, &fp);
+ error = getvnode(td->td_proc->p_fd, uap->fd,
+ cap_rights_init(&rights, CAP_EXTATTR_LIST), &fp);
if (error)
return (error);
- vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
error = extattr_list_vp(fp->f_vnode, uap->attrnamespace, uap->data,
uap->nbytes, td);
fdrop(fp, td);
- VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
@@ -745,22 +721,19 @@
} */ *uap;
{
struct nameidata nd;
- int vfslocked, error;
+ int error;
AUDIT_ARG_VALUE(uap->attrnamespace);
- NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | AUDITVNODE1, UIO_USERSPACE,
- uap->path, td);
+ NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path, td);
error = namei(&nd);
if (error)
return (error);
NDFREE(&nd, NDF_ONLY_PNBUF);
- vfslocked = NDHASGIANT(&nd);
error = extattr_list_vp(nd.ni_vp, uap->attrnamespace, uap->data,
uap->nbytes, td);
vrele(nd.ni_vp);
- VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
@@ -775,21 +748,19 @@
} */ *uap;
{
struct nameidata nd;
- int vfslocked, error;
+ int error;
AUDIT_ARG_VALUE(uap->attrnamespace);
- NDINIT(&nd, LOOKUP, MPSAFE | NOFOLLOW | AUDITVNODE1, UIO_USERSPACE,
- uap->path, td);
+ NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, UIO_USERSPACE, uap->path,
+ td);
error = namei(&nd);
if (error)
return (error);
NDFREE(&nd, NDF_ONLY_PNBUF);
- vfslocked = NDHASGIANT(&nd);
error = extattr_list_vp(nd.ni_vp, uap->attrnamespace, uap->data,
uap->nbytes, td);
vrele(nd.ni_vp);
- VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
Modified: trunk/sys/kern/vfs_hash.c
===================================================================
--- trunk/sys/kern/vfs_hash.c 2018-05-25 21:07:58 UTC (rev 9951)
+++ trunk/sys/kern/vfs_hash.c 2018-05-26 14:24:52 UTC (rev 9952)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 2005 Poul-Henning Kamp
* All rights reserved.
@@ -26,7 +27,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/vfs_hash.c 300140 2016-05-18 11:58:16Z kib $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -69,7 +70,8 @@
}
int
-vfs_hash_get(const struct mount *mp, u_int hash, int flags, struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg)
+vfs_hash_get(const struct mount *mp, u_int hash, int flags, struct thread *td,
+ struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg)
{
struct vnode *vp;
int error;
@@ -102,6 +104,36 @@
}
void
+vfs_hash_ref(const struct mount *mp, u_int hash, struct thread *td,
+ struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg)
+{
+ struct vnode *vp;
+
+ while (1) {
+ mtx_lock(&vfs_hash_mtx);
+ LIST_FOREACH(vp, vfs_hash_bucket(mp, hash), v_hashlist) {
+ if (vp->v_hash != hash)
+ continue;
+ if (vp->v_mount != mp)
+ continue;
+ if (fn != NULL && fn(vp, arg))
+ continue;
+ vhold(vp);
+ mtx_unlock(&vfs_hash_mtx);
+ vref(vp);
+ vdrop(vp);
+ *vpp = vp;
+ return;
+ }
+ if (vp == NULL) {
+ mtx_unlock(&vfs_hash_mtx);
+ *vpp = NULL;
+ return;
+ }
+ }
+}
+
+void
vfs_hash_remove(struct vnode *vp)
{
@@ -111,7 +143,8 @@
}
int
-vfs_hash_insert(struct vnode *vp, u_int hash, int flags, struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg)
+vfs_hash_insert(struct vnode *vp, u_int hash, int flags, struct thread *td,
+ struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg)
{
struct vnode *vp2;
int error;
@@ -160,3 +193,40 @@
vp->v_hash = hash;
mtx_unlock(&vfs_hash_mtx);
}
+
+void
+vfs_hash_changesize(int newmaxvnodes)
+{
+ struct vfs_hash_head *vfs_hash_newtbl, *vfs_hash_oldtbl;
+ u_long vfs_hash_newmask, vfs_hash_oldmask;
+ struct vnode *vp;
+ int i;
+
+ vfs_hash_newtbl = hashinit(newmaxvnodes, M_VFS_HASH,
+ &vfs_hash_newmask);
+ /* If same hash table size, nothing to do */
+ if (vfs_hash_mask == vfs_hash_newmask) {
+ free(vfs_hash_newtbl, M_VFS_HASH);
+ return;
+ }
+ /*
+ * Move everything from the old hash table to the new table.
+ * None of the vnodes in the table can be recycled because to
+ * do so, they have to be removed from the hash table.
+ */
+ mtx_lock(&vfs_hash_mtx);
+ vfs_hash_oldtbl = vfs_hash_tbl;
+ vfs_hash_oldmask = vfs_hash_mask;
+ vfs_hash_tbl = vfs_hash_newtbl;
+ vfs_hash_mask = vfs_hash_newmask;
+ for (i = 0; i <= vfs_hash_oldmask; i++) {
+ while ((vp = LIST_FIRST(&vfs_hash_oldtbl[i])) != NULL) {
+ LIST_REMOVE(vp, v_hashlist);
+ LIST_INSERT_HEAD(
+ vfs_hash_bucket(vp->v_mount, vp->v_hash),
+ vp, v_hashlist);
+ }
+ }
+ mtx_unlock(&vfs_hash_mtx);
+ free(vfs_hash_oldtbl, M_VFS_HASH);
+}
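
vfs_hash_changesize() above and cache_changesize() earlier in this commit share
the same shape: allocate the new table before taking the lock, bail out if the
mask did not change, then swap the table pointers and rehash every entry while
the lock is held, so nothing can be inserted into or removed from the old table
mid-move. The generic pattern, with hypothetical names, condenses to:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/queue.h>

static MALLOC_DEFINE(M_EXTBL, "extbl", "example hash table");

struct exobj {
	LIST_ENTRY(exobj) ex_hash;
	u_int		ex_key;
};
LIST_HEAD(exhead, exobj);

static struct exhead *extbl;	/* protected by ex_mtx */
static u_long exmask;		/* protected by ex_mtx */
static struct mtx ex_mtx;	/* initialized elsewhere */

static void
ex_changesize(int newcount)
{
	struct exhead *newtbl, *oldtbl;
	u_long newmask, oldmask;
	struct exobj *o;
	int i;

	newtbl = hashinit(newcount, M_EXTBL, &newmask);
	if (newmask == exmask) {
		/* Same hash table size, nothing to do. */
		free(newtbl, M_EXTBL);
		return;
	}
	mtx_lock(&ex_mtx);
	oldtbl = extbl;
	oldmask = exmask;
	extbl = newtbl;
	exmask = newmask;
	/* Rehash; entries cannot appear or vanish while the lock is held. */
	for (i = 0; i <= oldmask; i++) {
		while ((o = LIST_FIRST(&oldtbl[i])) != NULL) {
			LIST_REMOVE(o, ex_hash);
			LIST_INSERT_HEAD(&extbl[o->ex_key & exmask],
			    o, ex_hash);
		}
	}
	mtx_unlock(&ex_mtx);
	free(oldtbl, M_EXTBL);
}
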
Modified: trunk/sys/kern/vfs_init.c
===================================================================
--- trunk/sys/kern/vfs_init.c 2018-05-25 21:07:58 UTC (rev 9951)
+++ trunk/sys/kern/vfs_init.c 2018-05-26 14:24:52 UTC (rev 9952)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
@@ -35,7 +36,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/vfs_init.c 284021 2015-06-05 08:36:25Z kib $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -44,6 +45,7 @@
#include <sys/linker.h>
#include <sys/mount.h>
#include <sys/proc.h>
+#include <sys/sx.h>
#include <sys/syscallsubr.h>
#include <sys/sysctl.h>
#include <sys/vnode.h>
@@ -64,6 +66,8 @@
* New entries are added/deleted by vfs_register()/vfs_unregister()
*/
struct vfsconfhead vfsconf = TAILQ_HEAD_INITIALIZER(vfsconf);
+struct sx vfsconf_sx;
+SX_SYSINIT(vfsconf, &vfsconf_sx, "vfsconf");
/*
* Loader.conf variable vfs.typenumhash enables setting vfc_typenum using a hash
@@ -105,20 +109,33 @@
* Routines having to do with the management of the vnode table.
*/
-struct vfsconf *
-vfs_byname(const char *name)
+static struct vfsconf *
+vfs_byname_locked(const char *name)
{
struct vfsconf *vfsp;
+ sx_assert(&vfsconf_sx, SA_LOCKED);
if (!strcmp(name, "ffs"))
name = "ufs";
- TAILQ_FOREACH(vfsp, &vfsconf, vfc_list)
+ TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
if (!strcmp(name, vfsp->vfc_name))
return (vfsp);
+ }
return (NULL);
}
struct vfsconf *
+vfs_byname(const char *name)
+{
+ struct vfsconf *vfsp;
+
+ vfsconf_slock();
+ vfsp = vfs_byname_locked(name);
+ vfsconf_sunlock();
+ return (vfsp);
+}
+
+struct vfsconf *
vfs_byname_kld(const char *fstype, struct thread *td, int *error)
{
struct vfsconf *vfsp;
@@ -169,8 +186,11 @@
vfc->vfc_name, vfc->vfc_version);
return (EINVAL);
}
- if (vfs_byname(vfc->vfc_name) != NULL)
- return EEXIST;
+ vfsconf_lock();
+ if (vfs_byname_locked(vfc->vfc_name) != NULL) {
+ vfsconf_unlock();
+ return (EEXIST);
+ }
if (vfs_typenumhash != 0) {
/*
@@ -203,26 +223,6 @@
TAILQ_INSERT_TAIL(&vfsconf, vfc, vfc_list);
/*
- * If this filesystem has a sysctl node under vfs
- * (i.e. vfs.xxfs), then change the oid number of that node to
- * match the filesystem's type number. This allows user code
- * which uses the type number to read sysctl variables defined
- * by the filesystem to continue working. Since the oids are
- * in a sorted list, we need to make sure the order is
- * preserved by re-registering the oid after modifying its
- * number.
- */
- sysctl_lock();
- SLIST_FOREACH(oidp, &sysctl__vfs_children, oid_link)
- if (strcmp(oidp->oid_name, vfc->vfc_name) == 0) {
- sysctl_unregister_oid(oidp);
- oidp->oid_number = vfc->vfc_typenum;
- sysctl_register_oid(oidp);
- break;
- }
- sysctl_unlock();
-
- /*
* Initialise unused ``struct vfsops'' fields, to use
* the vfs_std*() functions. Note, we need the mount
* and unmount operations, at the least. The check
@@ -281,8 +281,30 @@
* Call init function for this VFS...
*/
(*(vfc->vfc_vfsops->vfs_init))(vfc);
+ vfsconf_unlock();
- return 0;
+ /*
+ * If this filesystem has a sysctl node under vfs
+ * (i.e. vfs.xxfs), then change the oid number of that node to
+ * match the filesystem's type number. This allows user code
+ * which uses the type number to read sysctl variables defined
+ * by the filesystem to continue working. Since the oids are
+ * in a sorted list, we need to make sure the order is
+ * preserved by re-registering the oid after modifying its
+ * number.
+ */
+ sysctl_lock();
+ SLIST_FOREACH(oidp, &sysctl__vfs_children, oid_link) {
+ if (strcmp(oidp->oid_name, vfc->vfc_name) == 0) {
+ sysctl_unregister_oid(oidp);
+ oidp->oid_number = vfc->vfc_typenum;
+ sysctl_register_oid(oidp);
+ break;
+ }
+ }
+ sysctl_unlock();
+
+ return (0);
}
@@ -291,19 +313,24 @@
vfs_unregister(struct vfsconf *vfc)
{
struct vfsconf *vfsp;
- int error, i, maxtypenum;
+ int error, maxtypenum;
- i = vfc->vfc_typenum;
-
- vfsp = vfs_byname(vfc->vfc_name);
- if (vfsp == NULL)
- return EINVAL;
- if (vfsp->vfc_refcount)
- return EBUSY;
+ vfsconf_lock();
+ vfsp = vfs_byname_locked(vfc->vfc_name);
+ if (vfsp == NULL) {
+ vfsconf_unlock();
+ return (EINVAL);
+ }
+ if (vfsp->vfc_refcount != 0) {
+ vfsconf_unlock();
+ return (EBUSY);
+ }
if (vfc->vfc_vfsops->vfs_uninit != NULL) {
error = (*vfc->vfc_vfsops->vfs_uninit)(vfsp);
- if (error)
+ if (error != 0) {
+ vfsconf_unlock();
return (error);
+ }
}
TAILQ_REMOVE(&vfsconf, vfsp, vfc_list);
maxtypenum = VFS_GENERIC;
@@ -311,7 +338,8 @@
if (maxtypenum < vfsp->vfc_typenum)
maxtypenum = vfsp->vfc_typenum;
maxvfsconf = maxtypenum + 1;
- return 0;
+ vfsconf_unlock();
+ return (0);
}
/*
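
The vfs_init.c changes serialize filesystem registration behind the new
vfsconf_sx lock: name lookups go through vfs_byname_locked() with the lock
asserted, vfs_register()/vfs_unregister() take it exclusively, and the sysctl
oid renumbering is moved outside the locked region. The vfsconf_lock()/
vfsconf_slock() names used above are not defined in this diff; presumably they
are thin wrappers around vfsconf_sx, roughly:

/* Assumed wrappers (not part of this diff) over the sx lock added here. */
#include <sys/param.h>
#include <sys/lock.h>
#include <sys/sx.h>

extern struct sx vfsconf_sx;

#define	vfsconf_lock()		sx_xlock(&vfsconf_sx)
#define	vfsconf_unlock()	sx_xunlock(&vfsconf_sx)
#define	vfsconf_slock()		sx_slock(&vfsconf_sx)
#define	vfsconf_sunlock()	sx_sunlock(&vfsconf_sx)
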
Modified: trunk/sys/kern/vfs_lookup.c
===================================================================
--- trunk/sys/kern/vfs_lookup.c 2018-05-25 21:07:58 UTC (rev 9951)
+++ trunk/sys/kern/vfs_lookup.c 2018-05-26 14:24:52 UTC (rev 9952)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
@@ -35,7 +36,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/vfs_lookup.c 308469 2016-11-09 17:07:45Z kib $");
#include "opt_capsicum.h"
#include "opt_kdtrace.h"
@@ -44,7 +45,7 @@
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
-#include <sys/capability.h>
+#include <sys/capsicum.h>
#include <sys/fcntl.h>
#include <sys/jail.h>
#include <sys/lock.h>
@@ -70,9 +71,9 @@
#undef NAMEI_DIAGNOSTIC
SDT_PROVIDER_DECLARE(vfs);
-SDT_PROBE_DEFINE3(vfs, namei, lookup, entry, entry, "struct vnode *", "char *",
+SDT_PROBE_DEFINE3(vfs, namei, lookup, entry, "struct vnode *", "char *",
"unsigned long");
-SDT_PROBE_DEFINE2(vfs, namei, lookup, return, return, "int", "struct vnode *");
+SDT_PROBE_DEFINE2(vfs, namei, lookup, return, "int", "struct vnode *");
/*
* Allocation zone for namei
@@ -143,10 +144,7 @@
struct componentname *cnp = &ndp->ni_cnd;
struct thread *td = cnp->cn_thread;
struct proc *p = td->td_proc;
- int vfslocked;
- KASSERT((cnp->cn_flags & MPSAFE) != 0 || mtx_owned(&Giant) != 0,
- ("NOT MPSAFE and Giant not held"));
ndp->ni_cnd.cn_cred = ndp->ni_cnd.cn_thread->td_ucred;
KASSERT(cnp->cn_cred && p, ("namei: bad cred/proc"));
KASSERT((cnp->cn_nameiop & (~OPMASK)) == 0,
@@ -167,11 +165,11 @@
if ((cnp->cn_flags & HASBUF) == 0)
cnp->cn_pnbuf = uma_zalloc(namei_zone, M_WAITOK);
if (ndp->ni_segflg == UIO_SYSSPACE)
- error = copystr(ndp->ni_dirp, cnp->cn_pnbuf,
- MAXPATHLEN, (size_t *)&ndp->ni_pathlen);
+ error = copystr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN,
+ &ndp->ni_pathlen);
else
- error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf,
- MAXPATHLEN, (size_t *)&ndp->ni_pathlen);
+ error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf, MAXPATHLEN,
+ &ndp->ni_pathlen);
/*
* Don't allow empty pathnames.
@@ -185,10 +183,16 @@
* not an absolute path, and not containing '..' components) to
* a real file descriptor, not the pseudo-descriptor AT_FDCWD.
*/
- if (IN_CAPABILITY_MODE(td)) {
+ if (error == 0 && IN_CAPABILITY_MODE(td) &&
+ (cnp->cn_flags & NOCAPCHECK) == 0) {
ndp->ni_strictrelative = 1;
- if (ndp->ni_dirfd == AT_FDCWD)
+ if (ndp->ni_dirfd == AT_FDCWD) {
+#ifdef KTRACE
+ if (KTRPOINT(td, KTR_CAPFAIL))
+ ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL);
+#endif
error = ECAPMODE;
+ }
}
#endif
if (error) {
@@ -225,31 +229,36 @@
dp = ndp->ni_startdir;
error = 0;
} else if (ndp->ni_dirfd != AT_FDCWD) {
+ cap_rights_t rights;
+
+ rights = ndp->ni_rightsneeded;
+ cap_rights_set(&rights, CAP_LOOKUP);
+
if (cnp->cn_flags & AUDITVNODE1)
AUDIT_ARG_ATFD1(ndp->ni_dirfd);
if (cnp->cn_flags & AUDITVNODE2)
AUDIT_ARG_ATFD2(ndp->ni_dirfd);
error = fgetvp_rights(td, ndp->ni_dirfd,
- ndp->ni_rightsneeded | CAP_LOOKUP,
- &(ndp->ni_baserights), &dp);
+ &rights, &ndp->ni_filecaps, &dp);
#ifdef CAPABILITIES
/*
- * Lookups relative to a capability must also be
+ * If file descriptor doesn't have all rights,
+ * all lookups relative to it must also be
* strictly relative.
- *
- * Note that a capability with rights CAP_MASK_VALID
- * is treated exactly like a regular file descriptor.
*/
- if (ndp->ni_baserights != CAP_MASK_VALID)
+ CAP_ALL(&rights);
+ if (!cap_rights_contains(&ndp->ni_filecaps.fc_rights,
+ &rights) ||
+ ndp->ni_filecaps.fc_fcntls != CAP_FCNTL_ALL ||
+ ndp->ni_filecaps.fc_nioctls != -1) {
ndp->ni_strictrelative = 1;
+ }
#endif
}
if (error != 0 || dp != NULL) {
FILEDESC_SUNLOCK(fdp);
if (error == 0 && dp->v_type != VDIR) {
- vfslocked = VFS_LOCK_GIANT(dp->v_mount);
vrele(dp);
- VFS_UNLOCK_GIANT(vfslocked);
error = ENOTDIR;
}
}
@@ -262,15 +271,11 @@
dp = fdp->fd_cdir;
VREF(dp);
FILEDESC_SUNLOCK(fdp);
- if (ndp->ni_startdir != NULL) {
- vfslocked = VFS_LOCK_GIANT(ndp->ni_startdir->v_mount);
+ if (ndp->ni_startdir != NULL)
vrele(ndp->ni_startdir);
- VFS_UNLOCK_GIANT(vfslocked);
- }
}
- SDT_PROBE(vfs, namei, lookup, entry, dp, cnp->cn_pnbuf,
- cnp->cn_flags, 0, 0);
- vfslocked = VFS_LOCK_GIANT(dp->v_mount);
+ SDT_PROBE3(vfs, namei, lookup, entry, dp, cnp->cn_pnbuf,
+ cnp->cn_flags);
for (;;) {
/*
* Check if root directory should replace current directory.
@@ -279,8 +284,11 @@
cnp->cn_nameptr = cnp->cn_pnbuf;
if (*(cnp->cn_nameptr) == '/') {
vrele(dp);
- VFS_UNLOCK_GIANT(vfslocked);
if (ndp->ni_strictrelative != 0) {
+#ifdef KTRACE
+ if (KTRPOINT(curthread, KTR_CAPFAIL))
+ ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL);
+#endif
namei_cleanup_cnp(cnp);
return (ENOTCAPABLE);
}
@@ -289,21 +297,15 @@
ndp->ni_pathlen--;
}
dp = ndp->ni_rootdir;
- vfslocked = VFS_LOCK_GIANT(dp->v_mount);
VREF(dp);
}
- if (vfslocked)
- ndp->ni_cnd.cn_flags |= GIANTHELD;
ndp->ni_startdir = dp;
error = lookup(ndp);
if (error) {
namei_cleanup_cnp(cnp);
- SDT_PROBE(vfs, namei, lookup, return, error, NULL, 0,
- 0, 0);
+ SDT_PROBE2(vfs, namei, lookup, return, error, NULL);
return (error);
}
- vfslocked = (ndp->ni_cnd.cn_flags & GIANTHELD) != 0;
- ndp->ni_cnd.cn_flags &= ~GIANTHELD;
/*
* If not a symbolic link, we're done.
*/
@@ -313,12 +315,7 @@
} else
cnp->cn_flags |= HASBUF;
- if ((cnp->cn_flags & MPSAFE) == 0) {
- VFS_UNLOCK_GIANT(vfslocked);
- } else if (vfslocked)
- ndp->ni_cnd.cn_flags |= GIANTHELD;
- SDT_PROBE(vfs, namei, lookup, return, 0, ndp->ni_vp,
- 0, 0, 0);
+ SDT_PROBE2(vfs, namei, lookup, return, 0, ndp->ni_vp);
return (0);
}
if (ndp->ni_loopcnt++ >= MAXSYMLINKS) {
@@ -379,8 +376,7 @@
vput(ndp->ni_vp);
ndp->ni_vp = NULL;
vrele(ndp->ni_dvp);
- VFS_UNLOCK_GIANT(vfslocked);
- SDT_PROBE(vfs, namei, lookup, return, error, NULL, 0, 0, 0);
+ SDT_PROBE2(vfs, namei, lookup, return, error, NULL);
return (error);
}
@@ -395,6 +391,7 @@
lkflags &= ~LK_SHARED;
lkflags |= LK_EXCLUSIVE;
}
+ lkflags |= LK_NODDLKTREAT;
return (lkflags);
}
@@ -418,13 +415,8 @@
* extended shared operations, then use a shared lock for the
* leaf node, otherwise use an exclusive lock.
*/
- if (flags & ISOPEN) {
- if (mp != NULL &&
- (mp->mnt_kern_flag & MNTK_EXTENDED_SHARED))
- return (0);
- else
- return (1);
- }
+ if ((flags & ISOPEN) != 0)
+ return (!MNT_EXTENDED_SHARED(mp));
/*
* Lookup requests outside of open() that specify LOCKSHARED
@@ -485,9 +477,6 @@
int error = 0;
int dpunlocked = 0; /* dp has already been unlocked */
struct componentname *cnp = &ndp->ni_cnd;
- int vfslocked; /* VFS Giant state for child */
- int dvfslocked; /* VFS Giant state for parent */
- int tvfslocked;
int lkflags_save;
int ni_dvp_unlocked;
@@ -494,10 +483,7 @@
/*
* Setup: break out flag bits into variables.
*/
- dvfslocked = (ndp->ni_cnd.cn_flags & GIANTHELD) != 0;
- vfslocked = 0;
ni_dvp_unlocked = 0;
- ndp->ni_cnd.cn_flags &= ~GIANTHELD;
wantparent = cnp->cn_flags & (LOCKPARENT | WANTPARENT);
KASSERT(cnp->cn_nameiop == LOOKUP || wantparent,
("CREATE, DELETE, RENAME require LOCKPARENT or WANTPARENT."));
@@ -638,6 +624,10 @@
*/
if (cnp->cn_flags & ISDOTDOT) {
if (ndp->ni_strictrelative != 0) {
+#ifdef KTRACE
+ if (KTRPOINT(curthread, KTR_CAPFAIL))
+ ktrcapfail(CAPFAIL_LOOKUP, NULL, NULL);
+#endif
error = ENOTCAPABLE;
goto bad;
}
@@ -659,7 +649,6 @@
(cnp->cn_flags & NOCROSSMOUNT) != 0)) {
ndp->ni_dvp = dp;
ndp->ni_vp = dp;
- vfslocked = VFS_LOCK_GIANT(dp->v_mount);
VREF(dp);
goto nextname;
}
@@ -671,11 +660,8 @@
}
tdp = dp;
dp = dp->v_mount->mnt_vnodecovered;
- tvfslocked = dvfslocked;
- dvfslocked = VFS_LOCK_GIANT(dp->v_mount);
VREF(dp);
vput(tdp);
- VFS_UNLOCK_GIANT(tvfslocked);
vn_lock(dp,
compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags |
LK_RETRY, ISDOTDOT));
@@ -697,7 +683,6 @@
ndp->ni_dvp = dp;
ndp->ni_vp = NULL;
ASSERT_VOP_LOCKED(dp, "lookup");
- VNASSERT(vfslocked == 0, dp, ("lookup: vfslocked %d", vfslocked));
/*
* If we have a shared lock we may need to upgrade the lock for the
* last operation.
@@ -733,11 +718,8 @@
(dp->v_mount->mnt_flag & MNT_UNION)) {
tdp = dp;
dp = dp->v_mount->mnt_vnodecovered;
- tvfslocked = dvfslocked;
- dvfslocked = VFS_LOCK_GIANT(dp->v_mount);
VREF(dp);
vput(tdp);
- VFS_UNLOCK_GIANT(tvfslocked);
vn_lock(dp,
compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags |
LK_RETRY, cnp->cn_flags));
@@ -791,7 +773,6 @@
}
dp = ndp->ni_vp;
- vfslocked = VFS_LOCK_GIANT(dp->v_mount);
/*
* Check to see if the vnode has been mounted on;
@@ -802,14 +783,10 @@
if (vfs_busy(mp, 0))
continue;
vput(dp);
- VFS_UNLOCK_GIANT(vfslocked);
- vfslocked = VFS_LOCK_GIANT(mp);
if (dp != ndp->ni_dvp)
vput(ndp->ni_dvp);
else
vrele(ndp->ni_dvp);
- VFS_UNLOCK_GIANT(dvfslocked);
- dvfslocked = 0;
vref(vp_crossmp);
ndp->ni_dvp = vp_crossmp;
error = VFS_ROOT(mp, compute_cn_lkflags(mp, cnp->cn_lkflags,
@@ -870,9 +847,6 @@
vput(ndp->ni_dvp);
else
vrele(ndp->ni_dvp);
- VFS_UNLOCK_GIANT(dvfslocked);
- dvfslocked = vfslocked; /* dp becomes dvp in dirloop */
- vfslocked = 0;
goto dirloop;
}
/*
@@ -901,8 +875,6 @@
vput(ndp->ni_dvp);
else
vrele(ndp->ni_dvp);
- VFS_UNLOCK_GIANT(dvfslocked);
- dvfslocked = 0;
} else if ((cnp->cn_flags & LOCKPARENT) == 0 && ndp->ni_dvp != dp) {
VOP_UNLOCK(ndp->ni_dvp, 0);
ni_dvp_unlocked = 1;
@@ -928,10 +900,6 @@
goto bad2;
}
}
- if (vfslocked && dvfslocked)
- VFS_UNLOCK_GIANT(dvfslocked); /* Only need one */
- if (vfslocked || dvfslocked)
- ndp->ni_cnd.cn_flags |= GIANTHELD;
return (0);
bad2:
@@ -944,9 +912,6 @@
bad:
if (!dpunlocked)
vput(dp);
- VFS_UNLOCK_GIANT(vfslocked);
- VFS_UNLOCK_GIANT(dvfslocked);
- ndp->ni_cnd.cn_flags &= ~GIANTHELD;
ndp->ni_vp = NULL;
return (error);
}
@@ -1086,6 +1051,27 @@
return (error);
}
+void
+NDINIT_ALL(struct nameidata *ndp, u_long op, u_long flags, enum uio_seg segflg,
+ const char *namep, int dirfd, struct vnode *startdir, cap_rights_t *rightsp,
+ struct thread *td)
+{
+
+ ndp->ni_cnd.cn_nameiop = op;
+ ndp->ni_cnd.cn_flags = flags;
+ ndp->ni_segflg = segflg;
+ ndp->ni_dirp = namep;
+ ndp->ni_dirfd = dirfd;
+ ndp->ni_startdir = startdir;
+ ndp->ni_strictrelative = 0;
+ if (rightsp != NULL)
+ ndp->ni_rightsneeded = *rightsp;
+ else
+ cap_rights_init(&ndp->ni_rightsneeded);
+ filecaps_init(&ndp->ni_filecaps);
+ ndp->ni_cnd.cn_thread = td;
+}
+
/*
* Free data allocated by namei(); see namei(9) for details.
*/
@@ -1142,7 +1128,7 @@
* Determine if there is a suitable alternate filename under the specified
* prefix for the specified path. If the create flag is set, then the
* alternate prefix will be used so long as the parent directory exists.
- * This is used by the various compatiblity ABIs so that Linux binaries prefer
+ * This is used by the various compatibility ABIs so that Linux binaries prefer
* files under /compat/linux for example. The chosen path (whether under
* the prefix or under /) is returned in a kernel malloc'd buffer pointed
* to by pathbuf. The caller is responsible for free'ing the buffer from
@@ -1209,13 +1195,13 @@
for (cp = &ptr[len] - 1; *cp != '/'; cp--);
*cp = '\0';
- NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE, buf, td);
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, buf, td);
error = namei(&nd);
*cp = '/';
if (error != 0)
goto keeporig;
} else {
- NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE, buf, td);
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, buf, td);
error = namei(&nd);
if (error != 0)
@@ -1229,7 +1215,7 @@
* root directory and never finding it, because "/" resolves
* to the emulation root directory. This is expensive :-(
*/
- NDINIT(&ndroot, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE, prefix,
+ NDINIT(&ndroot, LOOKUP, FOLLOW, UIO_SYSSPACE, prefix,
td);
/* We shouldn't ever get an error from this namei(). */
@@ -1240,13 +1226,11 @@
NDFREE(&ndroot, NDF_ONLY_PNBUF);
vrele(ndroot.ni_vp);
- VFS_UNLOCK_GIANT(NDHASGIANT(&ndroot));
}
}
NDFREE(&nd, NDF_ONLY_PNBUF);
vrele(nd.ni_vp);
- VFS_UNLOCK_GIANT(NDHASGIANT(&nd));
keeporig:
/* If there was an error, use the original path name. */
Modified: trunk/sys/kern/vfs_mount.c
===================================================================
--- trunk/sys/kern/vfs_mount.c 2018-05-25 21:07:58 UTC (rev 9951)
+++ trunk/sys/kern/vfs_mount.c 2018-05-26 14:24:52 UTC (rev 9952)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1999-2004 Poul-Henning Kamp
* Copyright (c) 1999 Michael Smith
@@ -35,7 +36,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/vfs_mount.c 332754 2018-04-19 05:52:47Z avg $");
#include <sys/param.h>
#include <sys/conf.h>
@@ -78,6 +79,10 @@
SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0,
"Unprivileged users may mount and unmount file systems");
+static int default_autoro = false;
+SYSCTL_INT(_vfs, OID_AUTO, default_autoro, CTLFLAG_RW, &default_autoro, 0,
+ "Retry failed r/w mount as r/o if no explicit ro/rw option is specified");
+
MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount structure");
static uma_zone_t mount_zone;
@@ -232,7 +237,7 @@
/*
* If a mount option is specified several times,
* (with or without the "no" prefix) only keep
- * the last occurence of it.
+ * the last occurrence of it.
*/
static void
vfs_sanitizeopts(struct vfsoptlist *opts)
@@ -463,9 +468,9 @@
mp->mnt_activevnodelistsize = 0;
mp->mnt_ref = 0;
(void) vfs_busy(mp, MBF_NOWAIT);
+ atomic_add_acq_int(&vfsp->vfc_refcount, 1);
mp->mnt_op = vfsp->vfc_vfsops;
mp->mnt_vfc = vfsp;
- vfsp->vfc_refcount++; /* XXX Unlocked */
mp->mnt_stat.f_type = vfsp->vfc_typenum;
mp->mnt_gen++;
strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
@@ -505,7 +510,7 @@
panic("vfs_mount_destroy: nonzero writeopcount");
if (mp->mnt_secondary_writes != 0)
panic("vfs_mount_destroy: nonzero secondary_writes");
- mp->mnt_vfc->vfc_refcount--;
+ atomic_subtract_rel_int(&mp->mnt_vfc->vfc_refcount, 1);
if (!TAILQ_EMPTY(&mp->mnt_nvnodelist)) {
struct vnode *vp;
@@ -521,6 +526,8 @@
if (mp->mnt_lockref != 0)
panic("vfs_mount_destroy: nonzero lock refcount");
MNT_IUNLOCK(mp);
+ if (mp->mnt_vnodecovered != NULL)
+ vrele(mp->mnt_vnodecovered);
#ifdef MAC
mac_mount_destroy(mp);
#endif
@@ -530,6 +537,31 @@
uma_zfree(mount_zone, mp);
}
+static bool
+vfs_should_downgrade_to_ro_mount(uint64_t fsflags, int error)
+{
+ /* This is an upgrade of an existing mount. */
+ if ((fsflags & MNT_UPDATE) != 0)
+ return (false);
+ /* This is already an R/O mount. */
+ if ((fsflags & MNT_RDONLY) != 0)
+ return (false);
+
+ switch (error) {
+ case ENODEV: /* generic, geom, ... */
+ case EACCES: /* cam/scsi, ... */
+ case EROFS: /* md, mmcsd, ... */
+ /*
+ * These errors can be returned by the storage layer to signal
+ * that the media is read-only. No harm in the R/O mount
+ * attempt if the error was returned for some other reason.
+ */
+ return (true);
+ default:
+ return (false);
+ }
+}
+
int
vfs_donmount(struct thread *td, uint64_t fsflags, struct uio *fsoptions)
{
@@ -537,10 +569,12 @@
struct vfsopt *opt, *tmp_opt;
char *fstype, *fspath, *errmsg;
int error, fstypelen, fspathlen, errmsg_len, errmsg_pos;
+ bool autoro;
errmsg = fspath = NULL;
errmsg_len = fspathlen = 0;
errmsg_pos = -1;
+ autoro = default_autoro;
error = vfs_buildopts(fsoptions, &optlist);
if (error)
@@ -632,17 +666,28 @@
free(opt->name, M_MOUNT);
opt->name = strdup("nonosymfollow", M_MOUNT);
}
- else if (strcmp(opt->name, "noro") == 0)
+ else if (strcmp(opt->name, "noro") == 0) {
fsflags &= ~MNT_RDONLY;
- else if (strcmp(opt->name, "rw") == 0)
+ autoro = false;
+ }
+ else if (strcmp(opt->name, "rw") == 0) {
fsflags &= ~MNT_RDONLY;
- else if (strcmp(opt->name, "ro") == 0)
+ autoro = false;
+ }
+ else if (strcmp(opt->name, "ro") == 0) {
fsflags |= MNT_RDONLY;
+ autoro = false;
+ }
else if (strcmp(opt->name, "rdonly") == 0) {
free(opt->name, M_MOUNT);
opt->name = strdup("ro", M_MOUNT);
fsflags |= MNT_RDONLY;
+ autoro = false;
}
+ else if (strcmp(opt->name, "autoro") == 0) {
+ vfs_freeopt(optlist, opt);
+ autoro = true;
+ }
else if (strcmp(opt->name, "suiddir") == 0)
fsflags |= MNT_SUIDDIR;
else if (strcmp(opt->name, "sync") == 0)
@@ -649,6 +694,10 @@
fsflags |= MNT_SYNCHRONOUS;
else if (strcmp(opt->name, "union") == 0)
fsflags |= MNT_UNION;
+ else if (strcmp(opt->name, "automounted") == 0) {
+ fsflags |= MNT_AUTOMOUNTED;
+ vfs_freeopt(optlist, opt);
+ }
}
/*
@@ -656,12 +705,25 @@
* variables will fit in our mp buffers, including the
* terminating NUL.
*/
- if (fstypelen >= MFSNAMELEN - 1 || fspathlen >= MNAMELEN - 1) {
+ if (fstypelen > MFSNAMELEN || fspathlen > MNAMELEN) {
error = ENAMETOOLONG;
goto bail;
}
error = vfs_domount(td, fstype, fspath, fsflags, &optlist);
+
+ /*
+ * See if we can mount in the read-only mode if the error code suggests
+ * that it could be possible and the mount options allow for that.
+ * Never try it if "[no]{ro|rw}" has been explicitly requested and not
+ * overridden by "autoro".
+ */
+ if (autoro && vfs_should_downgrade_to_ro_mount(fsflags, error)) {
+ printf("%s: R/W mount failed, possibly R/O media,"
+ " trying R/O mount\n", __func__);
+ fsflags |= MNT_RDONLY;
+ error = vfs_domount(td, fstype, fspath, fsflags, &optlist);
+ }
bail:
/* copyout the errmsg */
if (errmsg_pos != -1 && ((2 * errmsg_pos + 1) < fsoptions->uio_iovcnt)
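
With the change above, an explicit "ro", "rw", "noro", or "rdonly" option always wins, while "autoro" (or setting the new vfs.default_autoro sysctl) only arms the read-only fallback; the retry fires solely for errors that plausibly mean read-only media and never for MNT_UPDATE. A small userland sketch of that precedence and retry shape, where mount_once() is a made-up stand-in for vfs_domount():

    #include <errno.h>
    #include <stdbool.h>
    #include <stdio.h>

    #define MNT_RDONLY 0x1

    /* Stand-in for vfs_domount(); pretend the media is read-only. */
    static int
    mount_once(unsigned flags)
    {
        return ((flags & MNT_RDONLY) != 0 ? 0 : EROFS);
    }

    /* Mirrors the error classification used by the kernel helper. */
    static bool
    should_downgrade(unsigned flags, int error)
    {
        if ((flags & MNT_RDONLY) != 0)
            return (false);
        return (error == ENODEV || error == EACCES || error == EROFS);
    }

    int
    main(void)
    {
        unsigned flags = 0;
        bool autoro = true;    /* "autoro" option or vfs.default_autoro=1 */
        int error;

        error = mount_once(flags);
        if (error != 0 && autoro && should_downgrade(flags, error)) {
            printf("R/W mount failed (%d), retrying R/O\n", error);
            flags |= MNT_RDONLY;
            error = mount_once(flags);
        }
        printf("final error: %d\n", error);
        return (0);
    }
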
@@ -736,19 +798,14 @@
}
AUDIT_ARG_TEXT(fstype);
- mtx_lock(&Giant);
vfsp = vfs_byname_kld(fstype, td, &error);
free(fstype, M_TEMP);
- if (vfsp == NULL) {
- mtx_unlock(&Giant);
+ if (vfsp == NULL)
return (ENOENT);
- }
- if (vfsp->vfc_vfsops->vfs_cmount == NULL) {
- mtx_unlock(&Giant);
+ if (vfsp->vfc_vfsops->vfs_cmount == NULL)
return (EOPNOTSUPP);
- }
- ma = mount_argsu(ma, "fstype", uap->type, MNAMELEN);
+ ma = mount_argsu(ma, "fstype", uap->type, MFSNAMELEN);
ma = mount_argsu(ma, "fspath", uap->path, MNAMELEN);
ma = mount_argb(ma, flags & MNT_RDONLY, "noro");
ma = mount_argb(ma, !(flags & MNT_NOSUID), "nosuid");
@@ -755,7 +812,6 @@
ma = mount_argb(ma, !(flags & MNT_NOEXEC), "noexec");
error = vfsp->vfc_vfsops->vfs_cmount(ma, uap->data, flags);
- mtx_unlock(&Giant);
return (error);
}
@@ -777,7 +833,6 @@
struct vnode *newdp;
int error;
- mtx_assert(&Giant, MA_OWNED);
ASSERT_VOP_ELOCKED(vp, __func__);
KASSERT((fsflags & MNT_UPDATE) == 0, ("MNT_UPDATE shouldn't be here"));
@@ -821,6 +876,7 @@
error = VFS_MOUNT(mp);
if (error != 0) {
vfs_unbusy(mp);
+ mp->mnt_vnodecovered = NULL;
vfs_mount_destroy(mp);
VI_LOCK(vp);
vp->v_iflag &= ~VI_MOUNT;
@@ -861,8 +917,9 @@
vfs_event_signal(NULL, VQ_MOUNT, 0);
if (VFS_ROOT(mp, LK_EXCLUSIVE, &newdp))
panic("mount: lost mount");
+ VOP_UNLOCK(vp, 0);
+ EVENTHANDLER_INVOKE(vfs_mounted, mp, newdp, td);
VOP_UNLOCK(newdp, 0);
- VOP_UNLOCK(vp, 0);
mountcheckdirs(vp, newdp);
vrele(newdp);
if ((mp->mnt_flag & MNT_RDONLY) == 0)
@@ -888,15 +945,20 @@
int error, export_error;
uint64_t flag;
- mtx_assert(&Giant, MA_OWNED);
ASSERT_VOP_ELOCKED(vp, __func__);
KASSERT((fsflags & MNT_UPDATE) != 0, ("MNT_UPDATE should be here"));
+ mp = vp->v_mount;
if ((vp->v_vflag & VV_ROOT) == 0) {
+ if (vfs_copyopt(*optlist, "export", &export, sizeof(export))
+ == 0)
+ error = EXDEV;
+ else
+ error = EINVAL;
vput(vp);
- return (EINVAL);
+ return (error);
}
- mp = vp->v_mount;
+
/*
* We only allow the filesystem to be reloaded if it
* is currently mounted read-only.
@@ -931,6 +993,11 @@
VOP_UNLOCK(vp, 0);
MNT_ILOCK(mp);
+ if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) {
+ MNT_IUNLOCK(mp);
+ error = EBUSY;
+ goto end;
+ }
mp->mnt_flag &= ~MNT_UPDATEMASK;
mp->mnt_flag |= fsflags & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE |
MNT_SNAPSHOT | MNT_ROOTFS | MNT_UPDATEMASK | MNT_RDONLY);
@@ -1085,13 +1152,11 @@
/*
* Get vnode to be covered or mount point's vnode in case of MNT_UPDATE.
*/
- NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
UIO_SYSSPACE, fspath, td);
error = namei(&nd);
if (error != 0)
return (error);
- if (!NDHASGIANT(&nd))
- mtx_lock(&Giant);
NDFREE(&nd, NDF_ONLY_PNBUF);
vp = nd.ni_vp;
if ((fsflags & MNT_UPDATE) == 0) {
@@ -1106,11 +1171,7 @@
free(pathbuf, M_TEMP);
} else
error = vfs_domount_update(td, vp, fsflags, optlist);
- mtx_unlock(&Giant);
- ASSERT_VI_UNLOCKED(vp, __func__);
- ASSERT_VOP_UNLOCKED(vp, __func__);
-
return (error);
}
@@ -1128,17 +1189,12 @@
#endif
/* ARGSUSED */
int
-sys_unmount(td, uap)
- struct thread *td;
- register struct unmount_args /* {
- char *path;
- int flags;
- } */ *uap;
+sys_unmount(struct thread *td, struct unmount_args *uap)
{
struct nameidata nd;
struct mount *mp;
char *pathbuf;
- int error, id0, id1, vfslocked;
+ int error, id0, id1;
AUDIT_ARG_VALUE(uap->flags);
if (jailed(td->td_ucred) || usermount == 0) {
@@ -1153,12 +1209,10 @@
free(pathbuf, M_TEMP);
return (error);
}
- mtx_lock(&Giant);
if (uap->flags & MNT_BYFSID) {
AUDIT_ARG_TEXT(pathbuf);
/* Decode the filesystem ID. */
if (sscanf(pathbuf, "FSID:%d:%d", &id0, &id1) != 2) {
- mtx_unlock(&Giant);
free(pathbuf, M_TEMP);
return (EINVAL);
}
@@ -1166,8 +1220,10 @@
mtx_lock(&mountlist_mtx);
TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) {
if (mp->mnt_stat.f_fsid.val[0] == id0 &&
- mp->mnt_stat.f_fsid.val[1] == id1)
+ mp->mnt_stat.f_fsid.val[1] == id1) {
+ vfs_ref(mp);
break;
+ }
}
mtx_unlock(&mountlist_mtx);
} else {
@@ -1174,22 +1230,21 @@
/*
* Try to find global path for path argument.
*/
- NDINIT(&nd, LOOKUP,
- FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1,
UIO_SYSSPACE, pathbuf, td);
if (namei(&nd) == 0) {
- vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
error = vn_path_to_global_path(td, nd.ni_vp, pathbuf,
MNAMELEN);
if (error == 0 || error == ENODEV)
vput(nd.ni_vp);
- VFS_UNLOCK_GIANT(vfslocked);
}
mtx_lock(&mountlist_mtx);
TAILQ_FOREACH_REVERSE(mp, &mountlist, mntlist, mnt_list) {
- if (strcmp(mp->mnt_stat.f_mntonname, pathbuf) == 0)
+ if (strcmp(mp->mnt_stat.f_mntonname, pathbuf) == 0) {
+ vfs_ref(mp);
break;
+ }
}
mtx_unlock(&mountlist_mtx);
}
@@ -1201,7 +1256,6 @@
* now, so in the !MNT_BYFSID case return the more likely
* EINVAL for compatibility.
*/
- mtx_unlock(&Giant);
return ((uap->flags & MNT_BYFSID) ? ENOENT : EINVAL);
}
@@ -1209,11 +1263,10 @@
* Don't allow unmounting the root filesystem.
*/
if (mp->mnt_flag & MNT_ROOTFS) {
- mtx_unlock(&Giant);
+ vfs_rel(mp);
return (EINVAL);
}
error = dounmount(mp, uap->flags, td);
- mtx_unlock(&Giant);
return (error);
}
@@ -1221,10 +1274,7 @@
* Do the actual filesystem unmount.
*/
int
-dounmount(mp, flags, td)
- struct mount *mp;
- int flags;
- struct thread *td;
+dounmount(struct mount *mp, int flags, struct thread *td)
{
struct vnode *coveredvp, *fsrootvp;
int error;
@@ -1231,14 +1281,11 @@
uint64_t async_flag;
int mnt_gen_r;
- mtx_assert(&Giant, MA_OWNED);
-
if ((coveredvp = mp->mnt_vnodecovered) != NULL) {
mnt_gen_r = mp->mnt_gen;
VI_LOCK(coveredvp);
vholdl(coveredvp);
vn_lock(coveredvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_RETRY);
- vdrop(coveredvp);
/*
* Check for mp being unmounted while waiting for the
* covered vnode lock.
@@ -1246,34 +1293,51 @@
if (coveredvp->v_mountedhere != mp ||
coveredvp->v_mountedhere->mnt_gen != mnt_gen_r) {
VOP_UNLOCK(coveredvp, 0);
+ vdrop(coveredvp);
+ vfs_rel(mp);
return (EBUSY);
}
}
+
/*
* Only privileged root, or (if MNT_USER is set) the user that did the
* original mount is permitted to unmount this filesystem.
*/
error = vfs_suser(mp, td);
- if (error) {
- if (coveredvp)
+ if (error != 0) {
+ if (coveredvp != NULL) {
VOP_UNLOCK(coveredvp, 0);
+ vdrop(coveredvp);
+ }
+ vfs_rel(mp);
return (error);
}
- vn_start_write(NULL, &mp, V_WAIT);
+ vn_start_write(NULL, &mp, V_WAIT | V_MNTREF);
MNT_ILOCK(mp);
if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 ||
+ (mp->mnt_flag & MNT_UPDATE) != 0 ||
!TAILQ_EMPTY(&mp->mnt_uppers)) {
MNT_IUNLOCK(mp);
- if (coveredvp)
+ if (coveredvp != NULL) {
VOP_UNLOCK(coveredvp, 0);
+ vdrop(coveredvp);
+ }
vn_finished_write(mp);
return (EBUSY);
}
mp->mnt_kern_flag |= MNTK_UNMOUNT | MNTK_NOINSMNTQ;
/* Allow filesystems to detect that a forced unmount is in progress. */
- if (flags & MNT_FORCE)
+ if (flags & MNT_FORCE) {
mp->mnt_kern_flag |= MNTK_UNMOUNTF;
+ MNT_IUNLOCK(mp);
+ /*
+ * Must be done after setting MNTK_UNMOUNTF and before
+ * waiting for mnt_lockref to become 0.
+ */
+ VFS_PURGE(mp);
+ MNT_ILOCK(mp);
+ }
error = 0;
if (mp->mnt_lockref) {
mp->mnt_kern_flag |= MNTK_DRAINING;
@@ -1291,6 +1355,16 @@
if (mp->mnt_flag & MNT_EXPUBLIC)
vfs_setpublicfs(NULL, NULL, NULL);
+ /*
+ * From now on, we can claim that the use reference on the
+ * coveredvp is ours, and the ref can be released only by a
+ * successful unmount by us, or left for a later unmount
+ * attempt. The previously acquired hold reference is no
+ * longer needed to protect the vnode from reuse.
+ */
+ if (coveredvp != NULL)
+ vdrop(coveredvp);
+
vfs_msync(mp, MNT_WAIT);
MNT_ILOCK(mp);
async_flag = mp->mnt_flag & MNT_ASYNC;
@@ -1306,7 +1380,8 @@
*/
if ((flags & MNT_FORCE) &&
VFS_ROOT(mp, LK_EXCLUSIVE, &fsrootvp) == 0) {
- if (mp->mnt_vnodecovered != NULL)
+ if (mp->mnt_vnodecovered != NULL &&
+ (mp->mnt_flag & MNT_IGNORE) == 0)
mountcheckdirs(fsrootvp, mp->mnt_vnodecovered);
if (fsrootvp == rootvnode) {
vrele(rootvnode);
@@ -1314,8 +1389,8 @@
}
vput(fsrootvp);
}
- if (((mp->mnt_flag & MNT_RDONLY) ||
- (error = VFS_SYNC(mp, MNT_WAIT)) == 0) || (flags & MNT_FORCE) != 0)
+ if ((mp->mnt_flag & MNT_RDONLY) != 0 || (flags & MNT_FORCE) != 0 ||
+ (error = VFS_SYNC(mp, MNT_WAIT)) == 0)
error = VFS_UNMOUNT(mp, flags);
vn_finished_write(mp);
/*
@@ -1327,7 +1402,8 @@
if (error && error != ENXIO) {
if ((flags & MNT_FORCE) &&
VFS_ROOT(mp, LK_EXCLUSIVE, &fsrootvp) == 0) {
- if (mp->mnt_vnodecovered != NULL)
+ if (mp->mnt_vnodecovered != NULL &&
+ (mp->mnt_flag & MNT_IGNORE) == 0)
mountcheckdirs(mp->mnt_vnodecovered, fsrootvp);
if (rootvnode == NULL) {
rootvnode = fsrootvp;
@@ -1359,11 +1435,14 @@
mtx_lock(&mountlist_mtx);
TAILQ_REMOVE(&mountlist, mp, mnt_list);
mtx_unlock(&mountlist_mtx);
+ EVENTHANDLER_INVOKE(vfs_unmounted, mp, td);
if (coveredvp != NULL) {
coveredvp->v_mountedhere = NULL;
- vput(coveredvp);
+ VOP_UNLOCK(coveredvp, 0);
}
vfs_event_signal(NULL, VQ_UNMOUNT, 0);
+ if (mp == rootdevmp)
+ rootdevmp = NULL;
vfs_mount_destroy(mp);
return (0);
}
@@ -1511,6 +1590,48 @@
return (-1);
}
+int
+vfs_getopt_size(struct vfsoptlist *opts, const char *name, off_t *value)
+{
+ char *opt_value, *vtp;
+ quad_t iv;
+ int error, opt_len;
+
+ error = vfs_getopt(opts, name, (void **)&opt_value, &opt_len);
+ if (error != 0)
+ return (error);
+ if (opt_len == 0 || opt_value == NULL)
+ return (EINVAL);
+ if (opt_value[0] == '\0' || opt_value[opt_len - 1] != '\0')
+ return (EINVAL);
+ iv = strtoq(opt_value, &vtp, 0);
+ if (vtp == opt_value || (vtp[0] != '\0' && vtp[1] != '\0'))
+ return (EINVAL);
+ if (iv < 0)
+ return (EINVAL);
+ switch (vtp[0]) {
+ case 't':
+ case 'T':
+ iv *= 1024;
+ case 'g':
+ case 'G':
+ iv *= 1024;
+ case 'm':
+ case 'M':
+ iv *= 1024;
+ case 'k':
+ case 'K':
+ iv *= 1024;
+ case '\0':
+ break;
+ default:
+ return (EINVAL);
+ }
+ *value = iv;
+
+ return (0);
+}
+
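vfs_getopt_size() accepts a number in any strtoq() base with at most one trailing k/m/g/t suffix; the case labels intentionally fall through, so each larger suffix picks up one more factor of 1024. The same parsing can be checked in userland with strtoll() standing in for the kernel strtoq() (a sketch, not the kernel code):

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>

    static int
    parse_size(const char *s, long long *out)
    {
        char *end;
        long long v;

        v = strtoll(s, &end, 0);
        if (end == s || v < 0 || (end[0] != '\0' && end[1] != '\0'))
            return (EINVAL);
        switch (end[0]) {
        case 't': case 'T':
            v *= 1024;            /* FALLTHROUGH */
        case 'g': case 'G':
            v *= 1024;            /* FALLTHROUGH */
        case 'm': case 'M':
            v *= 1024;            /* FALLTHROUGH */
        case 'k': case 'K':
            v *= 1024;            /* FALLTHROUGH */
        case '\0':
            break;
        default:
            return (EINVAL);
        }
        *out = v;
        return (0);
    }

    int
    main(void)
    {
        long long v;

        if (parse_size("16m", &v) == 0)
            printf("16m -> %lld\n", v);    /* 16777216 */
        if (parse_size("2G", &v) == 0)
            printf("2G  -> %lld\n", v);    /* 2147483648 */
        return (0);
    }
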
char *
vfs_getopts(struct vfsoptlist *opts, const char *name, int *error)
{
@@ -1668,103 +1789,6 @@
return (ENOENT);
}
-/*
- * These are helper functions for filesystems to traverse all
- * their vnodes. See MNT_VNODE_FOREACH() in sys/mount.h.
- *
- * This interface has been deprecated in favor of MNT_VNODE_FOREACH_ALL.
- */
-
-MALLOC_DECLARE(M_VNODE_MARKER);
-
-struct vnode *
-__mnt_vnode_next(struct vnode **mvp, struct mount *mp)
-{
- struct vnode *vp;
-
- mtx_assert(MNT_MTX(mp), MA_OWNED);
-
- KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
- if (should_yield()) {
- MNT_IUNLOCK(mp);
- kern_yield(PRI_UNCHANGED);
- MNT_ILOCK(mp);
- }
- vp = TAILQ_NEXT(*mvp, v_nmntvnodes);
- while (vp != NULL && vp->v_type == VMARKER)
- vp = TAILQ_NEXT(vp, v_nmntvnodes);
-
- /* Check if we are done */
- if (vp == NULL) {
- __mnt_vnode_markerfree(mvp, mp);
- return (NULL);
- }
- TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
- TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
- return (vp);
-}
-
-struct vnode *
-__mnt_vnode_first(struct vnode **mvp, struct mount *mp)
-{
- struct vnode *vp;
-
- mtx_assert(MNT_MTX(mp), MA_OWNED);
-
- vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
- while (vp != NULL && vp->v_type == VMARKER)
- vp = TAILQ_NEXT(vp, v_nmntvnodes);
-
- /* Check if we are done */
- if (vp == NULL) {
- *mvp = NULL;
- return (NULL);
- }
- MNT_REF(mp);
- MNT_IUNLOCK(mp);
- *mvp = (struct vnode *) malloc(sizeof(struct vnode),
- M_VNODE_MARKER,
- M_WAITOK | M_ZERO);
- MNT_ILOCK(mp);
- (*mvp)->v_type = VMARKER;
-
- vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
- while (vp != NULL && vp->v_type == VMARKER)
- vp = TAILQ_NEXT(vp, v_nmntvnodes);
-
- /* Check if we are done */
- if (vp == NULL) {
- MNT_IUNLOCK(mp);
- free(*mvp, M_VNODE_MARKER);
- MNT_ILOCK(mp);
- *mvp = NULL;
- MNT_REL(mp);
- return (NULL);
- }
- (*mvp)->v_mount = mp;
- TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
- return (vp);
-}
-
-
-void
-__mnt_vnode_markerfree(struct vnode **mvp, struct mount *mp)
-{
-
- if (*mvp == NULL)
- return;
-
- mtx_assert(MNT_MTX(mp), MA_OWNED);
-
- KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
- TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
- MNT_IUNLOCK(mp);
- free(*mvp, M_VNODE_MARKER);
- MNT_ILOCK(mp);
- *mvp = NULL;
- MNT_REL(mp);
-}
-
int
__vfs_statfs(struct mount *mp, struct statfs *sbp)
{
Modified: trunk/sys/kern/vfs_mountroot.c
===================================================================
--- trunk/sys/kern/vfs_mountroot.c 2018-05-25 21:07:58 UTC (rev 9951)
+++ trunk/sys/kern/vfs_mountroot.c 2018-05-26 14:24:52 UTC (rev 9952)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 2010 Marcel Moolenaar
* Copyright (c) 1999-2004 Poul-Henning Kamp
@@ -38,7 +39,7 @@
#include "opt_rootdevname.h"
#include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/vfs_mountroot.c 331276 2018-03-20 22:57:14Z ian $");
#include <sys/param.h>
#include <sys/conf.h>
@@ -79,7 +80,7 @@
*
* If the environment variable vfs.root.mountfrom is a space separated list,
* each list element is tried in turn and the root filesystem will be mounted
- * from the first one that suceeds.
+ * from the first one that succeeds.
*
* The environment variable vfs.root.mountfrom.options is a comma delimited
* set of string mount options. These mount options must be parseable
@@ -95,8 +96,16 @@
*/
struct vnode *rootvnode;
+/*
+ * Mount of the system's /dev.
+ */
+struct mount *rootdevmp;
+
char *rootdevnames[2] = {NULL, NULL};
+struct mtx root_holds_mtx;
+MTX_SYSINIT(root_holds, &root_holds_mtx, "root_holds", MTX_DEF);
+
struct root_hold_token {
const char *who;
LIST_ENTRY(root_hold_token) list;
@@ -119,6 +128,7 @@
/* By default wait up to 3 seconds for devices to appear. */
static int root_mount_timeout = 3;
+TUNABLE_INT("vfs.mountroot.timeout", &root_mount_timeout);
struct root_hold_token *
root_mount_hold(const char *identifier)
@@ -130,9 +140,9 @@
h = malloc(sizeof *h, M_DEVBUF, M_ZERO | M_WAITOK);
h->who = identifier;
- mtx_lock(&mountlist_mtx);
+ mtx_lock(&root_holds_mtx);
LIST_INSERT_HEAD(&root_holds, h, list);
- mtx_unlock(&mountlist_mtx);
+ mtx_unlock(&root_holds_mtx);
return (h);
}
@@ -142,10 +152,10 @@
if (h == NULL)
return;
- mtx_lock(&mountlist_mtx);
+ mtx_lock(&root_holds_mtx);
LIST_REMOVE(h, list);
wakeup(&root_holds);
- mtx_unlock(&mountlist_mtx);
+ mtx_unlock(&root_holds_mtx);
free(h, M_DEVBUF);
}
@@ -167,12 +177,12 @@
*/
KASSERT(curthread->td_proc->p_pid != 0,
("root_mount_wait: cannot be called from the swapper thread"));
- mtx_lock(&mountlist_mtx);
+ mtx_lock(&root_holds_mtx);
while (!root_mount_complete) {
- msleep(&root_mount_complete, &mountlist_mtx, PZERO, "rootwait",
+ msleep(&root_mount_complete, &root_holds_mtx, PZERO, "rootwait",
hz);
}
- mtx_unlock(&mountlist_mtx);
+ mtx_unlock(&root_holds_mtx);
}
static void
@@ -199,8 +209,6 @@
VREF(rootvnode);
FILEDESC_XUNLOCK(p->p_fd);
-
- EVENTHANDLER_INVOKE(mountroot);
}
static int
@@ -213,27 +221,39 @@
*mpp = NULL;
- vfsp = vfs_byname("devfs");
- KASSERT(vfsp != NULL, ("Could not find devfs by name"));
- if (vfsp == NULL)
- return (ENOENT);
+ if (rootdevmp != NULL) {
+ /*
+ * Already have /dev; this happens during rerooting.
+ */
+ error = vfs_busy(rootdevmp, 0);
+ if (error != 0)
+ return (error);
+ *mpp = rootdevmp;
+ } else {
+ vfsp = vfs_byname("devfs");
+ KASSERT(vfsp != NULL, ("Could not find devfs by name"));
+ if (vfsp == NULL)
+ return (ENOENT);
- mp = vfs_mount_alloc(NULLVP, vfsp, "/dev", td->td_ucred);
+ mp = vfs_mount_alloc(NULLVP, vfsp, "/dev", td->td_ucred);
- error = VFS_MOUNT(mp);
- KASSERT(error == 0, ("VFS_MOUNT(devfs) failed %d", error));
- if (error)
- return (error);
+ error = VFS_MOUNT(mp);
+ KASSERT(error == 0, ("VFS_MOUNT(devfs) failed %d", error));
+ if (error)
+ return (error);
- opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK);
- TAILQ_INIT(opts);
- mp->mnt_opt = opts;
+ opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK);
+ TAILQ_INIT(opts);
+ mp->mnt_opt = opts;
- mtx_lock(&mountlist_mtx);
- TAILQ_INSERT_HEAD(&mountlist, mp, mnt_list);
- mtx_unlock(&mountlist_mtx);
+ mtx_lock(&mountlist_mtx);
+ TAILQ_INSERT_HEAD(&mountlist, mp, mnt_list);
+ mtx_unlock(&mountlist_mtx);
- *mpp = mp;
+ *mpp = mp;
+ rootdevmp = mp;
+ }
+
set_rootvnode();
error = kern_symlink(td, "/", "dev", UIO_SYSSPACE);
@@ -243,7 +263,7 @@
return (error);
}
-static int
+static void
vfs_mountroot_shuffle(struct thread *td, struct mount *mpdevfs)
{
struct nameidata nd;
@@ -353,8 +373,6 @@
printf("mountroot: unable to unlink /dev/dev "
"(error %d)\n", error);
}
-
- return (0);
}
/*
@@ -390,13 +408,6 @@
(*conf)++;
}
-static __inline int
-parse_isspace(int c)
-{
-
- return ((c == ' ' || c == '\t' || c == '\n') ? 1 : 0);
-}
-
static int
parse_skipto(char **conf, int mc)
{
@@ -711,13 +722,13 @@
errmsg = malloc(ERRMSGL, M_TEMP, M_WAITOK | M_ZERO);
if (vfs_byname(fs) == NULL) {
- strlcpy(errmsg, "unknown file system", sizeof(errmsg));
+ strlcpy(errmsg, "unknown file system", ERRMSGL);
error = ENOENT;
goto out;
}
- if (strcmp(fs, "zfs") != 0 && dev[0] != '\0' &&
- !parse_mount_dev_present(dev)) {
+ if (strcmp(fs, "zfs") != 0 && strstr(fs, "nfs") == NULL &&
+ dev[0] != '\0' && !parse_mount_dev_present(dev)) {
printf("mountroot: waiting for device %s ...\n", dev);
delay = hz / 10;
timeout = root_mount_timeout * hz;
@@ -731,15 +742,31 @@
}
}
- ma = NULL;
- ma = mount_arg(ma, "fstype", fs, -1);
- ma = mount_arg(ma, "fspath", "/", -1);
- ma = mount_arg(ma, "from", dev, -1);
- ma = mount_arg(ma, "errmsg", errmsg, ERRMSGL);
- ma = mount_arg(ma, "ro", NULL, 0);
- ma = parse_mountroot_options(ma, opts);
- error = kernel_mount(ma, MNT_ROOTFS);
+ delay = hz / 10;
+ timeout = root_mount_timeout * hz;
+ for (;;) {
+ ma = NULL;
+ ma = mount_arg(ma, "fstype", fs, -1);
+ ma = mount_arg(ma, "fspath", "/", -1);
+ ma = mount_arg(ma, "from", dev, -1);
+ ma = mount_arg(ma, "errmsg", errmsg, ERRMSGL);
+ ma = mount_arg(ma, "ro", NULL, 0);
+ ma = parse_mountroot_options(ma, opts);
+
+ error = kernel_mount(ma, MNT_ROOTFS);
+ if (error == 0 || timeout <= 0)
+ break;
+
+ if (root_mount_timeout * hz == timeout ||
+ (bootverbose && timeout % hz == 0)) {
+ printf("Mounting from %s:%s failed with error %d; "
+ "retrying for %d more second%s\n", fs, dev, error,
+ timeout / hz, (timeout / hz > 1) ? "s" : "");
+ }
+ pause("rmretry", delay);
+ timeout -= delay;
+ }
out:
if (error) {
printf("Mounting from %s:%s failed with error %d",
@@ -875,16 +902,14 @@
struct nameidata nd;
off_t ofs;
ssize_t resid;
- int error, flags, len, vfslocked;
+ int error, flags, len;
- NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE,
- "/.mount.conf", td);
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, "/.mount.conf", td);
flags = FREAD;
error = vn_open(&nd, &flags, 0, NULL);
if (error)
return (error);
- vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
ofs = 0;
len = sizeof(buf) - 1;
@@ -903,7 +928,6 @@
VOP_UNLOCK(nd.ni_vp, 0);
vn_close(nd.ni_vp, FREAD, td->td_ucred, td);
- VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
@@ -919,9 +943,9 @@
DROP_GIANT();
g_waitidle();
PICKUP_GIANT();
- mtx_lock(&mountlist_mtx);
+ mtx_lock(&root_holds_mtx);
if (LIST_EMPTY(&root_holds)) {
- mtx_unlock(&mountlist_mtx);
+ mtx_unlock(&root_holds_mtx);
break;
}
if (ppsratecheck(&lastfail, &curfail, 1)) {
@@ -930,7 +954,7 @@
printf(" %s", h->who);
printf("\n");
}
- msleep(&root_holds, &mountlist_mtx, PZERO | PDROP, "roothold",
+ msleep(&root_holds, &root_holds_mtx, PZERO | PDROP, "roothold",
hz);
}
}
@@ -956,12 +980,10 @@
while (!error) {
error = vfs_mountroot_parse(sb, mp);
if (!error) {
- error = vfs_mountroot_shuffle(td, mp);
- if (!error) {
- sbuf_clear(sb);
- error = vfs_mountroot_readconf(td, sb);
- sbuf_finish(sb);
- }
+ vfs_mountroot_shuffle(td, mp);
+ sbuf_clear(sb);
+ error = vfs_mountroot_readconf(td, sb);
+ sbuf_finish(sb);
}
}
@@ -990,10 +1012,12 @@
vref(prison0.pr_root);
mtx_unlock(&prison0.pr_mtx);
- mtx_lock(&mountlist_mtx);
+ mtx_lock(&root_holds_mtx);
atomic_store_rel_int(&root_mount_complete, 1);
wakeup(&root_mount_complete);
- mtx_unlock(&mountlist_mtx);
+ mtx_unlock(&root_holds_mtx);
+
+ EVENTHANDLER_INVOKE(mountroot);
}
static struct mntarg *
Modified: trunk/sys/kern/vfs_subr.c
===================================================================
--- trunk/sys/kern/vfs_subr.c 2018-05-25 21:07:58 UTC (rev 9951)
+++ trunk/sys/kern/vfs_subr.c 2018-05-26 14:24:52 UTC (rev 9952)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
@@ -39,7 +40,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/vfs_subr.c 328997 2018-02-07 22:50:10Z mckusick $");
#include "opt_compat.h"
#include "opt_ddb.h"
@@ -65,8 +66,10 @@
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/namei.h>
+#include <sys/pctrie.h>
#include <sys/priv.h>
#include <sys/reboot.h>
+#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/sleepqueue.h>
#include <sys/smp.h>
@@ -94,9 +97,6 @@
#include <ddb/ddb.h>
#endif
-#define WI_MPSAFEQ 0
-#define WI_GIANTQ 1
-
static void delmntque(struct vnode *vp);
static int flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
int slpflag, int slptimeo);
@@ -123,6 +123,10 @@
SYSCTL_ULONG(_vfs, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0,
"Number of vnodes in existence");
+static u_long vnodes_created;
+SYSCTL_ULONG(_vfs, OID_AUTO, vnodes_created, CTLFLAG_RD, &vnodes_created,
+ 0, "Number of vnodes created by getnewvnode");
+
/*
* Conversion tables for conversion from vnode types to inode formats
* and back.
@@ -157,6 +161,10 @@
SYSCTL_INT(_vfs, OID_AUTO, vlru_allow_cache_src, CTLFLAG_RW,
&vlru_allow_cache_src, 0, "Allow vlru to reclaim source vnode");
+static u_long recycles_count;
+SYSCTL_ULONG(_vfs, OID_AUTO, recycles, CTLFLAG_RD, &recycles_count, 0,
+ "Number of vnodes recycled to avoid exceding kern.maxvnodes");
+
/*
* Various variables used for debugging the new implementation of
* reassignbuf().
@@ -166,6 +174,11 @@
SYSCTL_INT(_vfs, OID_AUTO, reassignbufcalls, CTLFLAG_RW, &reassignbufcalls, 0,
"Number of calls to reassignbuf");
+static u_long free_owe_inact;
+SYSCTL_ULONG(_vfs, OID_AUTO, free_owe_inact, CTLFLAG_RD, &free_owe_inact, 0,
+ "Number of times free vnodes kept on active list due to VFS "
+ "owing inactivation");
+
/*
* Cache for the mount type id assigned to NFS. This is used for
* special checks in nfs/nfs_nqlease.c and vm/vnode_pager.c.
@@ -186,6 +199,8 @@
/* Publicly exported FS */
struct nfs_public nfs_pub;
+static uma_zone_t buf_trie_zone;
+
/* Zone for allocation of new vnodes - used exclusively by getnewvnode() */
static uma_zone_t vnode_zone;
static uma_zone_t vnodepoll_zone;
@@ -218,7 +233,7 @@
static int syncer_delayno;
static long syncer_mask;
LIST_HEAD(synclist, bufobj);
-static struct synclist *syncer_workitem_pending[2];
+static struct synclist *syncer_workitem_pending;
/*
* The sync_mtx protects:
* bo->bo_synclist
@@ -266,8 +281,25 @@
* XXX desiredvnodes is historical cruft and should not exist.
*/
int desiredvnodes;
-SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW,
- &desiredvnodes, 0, "Maximum number of vnodes");
+
+static int
+sysctl_update_desiredvnodes(SYSCTL_HANDLER_ARGS)
+{
+ int error, old_desiredvnodes;
+
+ old_desiredvnodes = desiredvnodes;
+ if ((error = sysctl_handle_int(oidp, arg1, arg2, req)) != 0)
+ return (error);
+ if (old_desiredvnodes != desiredvnodes) {
+ vfs_hash_changesize(desiredvnodes);
+ cache_changesize(desiredvnodes);
+ }
+ return (0);
+}
+
+SYSCTL_PROC(_kern, KERN_MAXVNODES, maxvnodes,
+ CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, &desiredvnodes, 0,
+ sysctl_update_desiredvnodes, "I", "Maximum number of vnodes");
SYSCTL_ULONG(_kern, OID_AUTO, minvnodes, CTLFLAG_RW,
&wantfreevnodes, 0, "Minimum number of vnodes (legacy)");
static int vnlru_nowhere;
@@ -274,17 +306,27 @@
SYSCTL_INT(_debug, OID_AUTO, vnlru_nowhere, CTLFLAG_RW,
&vnlru_nowhere, 0, "Number of times the vnlru process ran without success");
+/* Shift count for (uintptr_t)vp to initialize vp->v_hash. */
+static int vnsz2log;
+
/*
- * Macros to control when a vnode is freed and recycled. All require
- * the vnode interlock.
+ * Support for the bufobj clean & dirty pctrie.
*/
-#define VCANRECYCLE(vp) (((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt)
-#define VSHOULDFREE(vp) (!((vp)->v_iflag & VI_FREE) && !(vp)->v_holdcnt)
-#define VSHOULDBUSY(vp) (((vp)->v_iflag & VI_FREE) && (vp)->v_holdcnt)
+static void *
+buf_trie_alloc(struct pctrie *ptree)
+{
-/* Shift count for (uintptr_t)vp to initialize vp->v_hash. */
-static int vnsz2log;
+ return uma_zalloc(buf_trie_zone, M_NOWAIT);
+}
+static void
+buf_trie_free(struct pctrie *ptree, void *node)
+{
+
+ uma_zfree(buf_trie_zone, node);
+}
+PCTRIE_DEFINE(BUF, buf, b_lblkno, buf_trie_alloc, buf_trie_free);
+
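The buf_trie zone backs the per-bufobj clean/dirty lookup structure that replaces the splay tree removed further down in this file. buf_trie_alloc() deliberately uses M_NOWAIT and leans on the one-node-per-buf uma_prealloc() done in vntblinit(), so a trie insertion on the reassignbuf() path can neither sleep nor fail. PCTRIE_DEFINE(BUF, buf, b_lblkno, ...) is assumed to generate the usual pctrie(9) wrappers; only BUF_PCTRIE_REMOVE is visible in this patch, but the lookup side would be along these lines (a sketch, not taken from the diff):

    /* The old code splayed bv_root to the key and unlinked by hand; now: */
    BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno);

    /* Assumed generated lookup wrapper, keyed on the logical block number: */
    bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno);
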
/*
* Initialize the vnode management data structures.
*
@@ -295,7 +337,67 @@
#ifndef MAXVNODES_MAX
#define MAXVNODES_MAX (512 * (1024 * 1024 * 1024 / (int)PAGE_SIZE / 16))
#endif
+
+/*
+ * Initialize a vnode as it first enters the zone.
+ */
+static int
+vnode_init(void *mem, int size, int flags)
+{
+ struct vnode *vp;
+ struct bufobj *bo;
+
+ vp = mem;
+ bzero(vp, size);
+ /*
+ * Setup locks.
+ */
+ vp->v_vnlock = &vp->v_lock;
+ mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
+ /*
+ * By default, don't allow shared locks unless filesystems opt-in.
+ */
+ lockinit(vp->v_vnlock, PVFS, "vnode", VLKTIMEOUT,
+ LK_NOSHARE | LK_IS_VNODE);
+ /*
+ * Initialize bufobj.
+ */
+ bo = &vp->v_bufobj;
+ bo->__bo_vnode = vp;
+ rw_init(BO_LOCKPTR(bo), "bufobj interlock");
+ bo->bo_private = vp;
+ TAILQ_INIT(&bo->bo_clean.bv_hd);
+ TAILQ_INIT(&bo->bo_dirty.bv_hd);
+ /*
+ * Initialize namecache.
+ */
+ LIST_INIT(&vp->v_cache_src);
+ TAILQ_INIT(&vp->v_cache_dst);
+ /*
+ * Initialize rangelocks.
+ */
+ rangelock_init(&vp->v_rl);
+ return (0);
+}
+
+/*
+ * Free a vnode when it is cleared from the zone.
+ */
static void
+vnode_fini(void *mem, int size)
+{
+ struct vnode *vp;
+ struct bufobj *bo;
+
+ vp = mem;
+ rangelock_destroy(&vp->v_rl);
+ lockdestroy(vp->v_vnlock);
+ mtx_destroy(&vp->v_interlock);
+ bo = &vp->v_bufobj;
+ rw_destroy(BO_LOCKPTR(bo));
+}
+
+static void
vntblinit(void *dummy __unused)
{
u_int i;
@@ -327,16 +429,23 @@
TAILQ_INIT(&vnode_free_list);
mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
- NULL, NULL, UMA_ALIGN_PTR, 0);
+ vnode_init, vnode_fini, UMA_ALIGN_PTR, 0);
vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
/*
+ * Preallocate enough nodes to support one-per buf so that
+ * we can not fail an insert. reassignbuf() callers can not
+ * tolerate the insertion failure.
+ */
+ buf_trie_zone = uma_zcreate("BUF TRIE", pctrie_node_size(),
+ NULL, NULL, pctrie_zone_init, NULL, UMA_ALIGN_PTR,
+ UMA_ZONE_NOFREE | UMA_ZONE_VM);
+ uma_prealloc(buf_trie_zone, nbuf);
+ /*
* Initialize the filesystem syncer.
*/
- syncer_workitem_pending[WI_MPSAFEQ] = hashinit(syncer_maxdelay, M_VNODE,
+ syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE,
&syncer_mask);
- syncer_workitem_pending[WI_GIANTQ] = hashinit(syncer_maxdelay, M_VNODE,
- &syncer_mask);
syncer_maxdelay = syncer_mask + 1;
mtx_init(&sync_mtx, "Syncer mtx", NULL, MTX_DEF);
cv_init(&sync_wakeup, "syncer");
@@ -393,7 +502,7 @@
MNT_ILOCK(mp);
MNT_REF(mp);
/*
- * If mount point is currenly being unmounted, sleep until the
+ * If mount point is currently being unmounted, sleep until the
* mount point fate is decided. If thread doing the unmounting fails,
* it will clear MNTK_UNMOUNT flag before waking us up, indicating
* that this mount point has survived the unmount attempt and vfs_busy
@@ -474,14 +583,42 @@
/*
* Lookup a mount point by filesystem identifier, busying it before
* returning.
+ *
+ * To avoid congestion on mountlist_mtx, implement a simple direct-mapped
+ * cache of popular filesystem identifiers. The cache is lockless, relying
+ * on the fact that struct mounts are never freed. In the worst case we
+ * may get a pointer to an unmounted or even a different filesystem, so we
+ * have to check what we got and take the slow path if so.
*/
struct mount *
vfs_busyfs(fsid_t *fsid)
{
+#define FSID_CACHE_SIZE 256
+ typedef struct mount * volatile vmp_t;
+ static vmp_t cache[FSID_CACHE_SIZE];
struct mount *mp;
int error;
+ uint32_t hash;
CTR2(KTR_VFS, "%s: fsid %p", __func__, fsid);
+ hash = fsid->val[0] ^ fsid->val[1];
+ hash = (hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1);
+ mp = cache[hash];
+ if (mp == NULL ||
+ mp->mnt_stat.f_fsid.val[0] != fsid->val[0] ||
+ mp->mnt_stat.f_fsid.val[1] != fsid->val[1])
+ goto slow;
+ if (vfs_busy(mp, 0) != 0) {
+ cache[hash] = NULL;
+ goto slow;
+ }
+ if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
+ mp->mnt_stat.f_fsid.val[1] == fsid->val[1])
+ return (mp);
+ else
+ vfs_unbusy(mp);
+
+slow:
mtx_lock(&mountlist_mtx);
TAILQ_FOREACH(mp, &mountlist, mnt_list) {
if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
@@ -488,9 +625,11 @@
mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
error = vfs_busy(mp, MBF_MNTLSTLOCK);
if (error) {
+ cache[hash] = NULL;
mtx_unlock(&mountlist_mtx);
return (NULL);
}
+ cache[hash] = mp;
return (mp);
}
}
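
vfs_busyfs() now fronts the mountlist walk with a 256-entry direct-mapped, lock-free cache indexed by folding the two fsid words into eight bits; because struct mount memory is never freed, a stale entry is harmless as long as the fsid is re-checked after vfs_busy() and the slot is cleared on a miss. A userland sketch of the hash and the validate-after-fetch pattern (the structures are simplified stand-ins, not the kernel ones):

    #include <stdint.h>
    #include <stdio.h>

    #define FSID_CACHE_SIZE 256

    struct fsid { uint32_t val[2]; };
    struct mnt  { struct fsid f; };

    static struct mnt *cache[FSID_CACHE_SIZE];

    /* Same fold as the kernel: xor the words, then fold the high half in. */
    static unsigned
    fsid_hash(const struct fsid *fsid)
    {
        uint32_t hash;

        hash = fsid->val[0] ^ fsid->val[1];
        return ((hash >> 16 ^ hash) & (FSID_CACHE_SIZE - 1));
    }

    static struct mnt *
    lookup(const struct fsid *fsid)
    {
        struct mnt *mp;

        mp = cache[fsid_hash(fsid)];        /* may be stale or NULL */
        if (mp != NULL && mp->f.val[0] == fsid->val[0] &&
            mp->f.val[1] == fsid->val[1])
            return (mp);                    /* kernel re-checks after vfs_busy() */
        return (NULL);                      /* fall back to the slow list walk */
    }

    int
    main(void)
    {
        struct mnt m = { { { 0x1234, 0x5678 } } };
        struct fsid id = m.f;

        cache[fsid_hash(&id)] = &m;
        printf("hit: %d\n", lookup(&id) == &m);
        return (0);
    }
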
@@ -584,7 +723,7 @@
*/
enum { TSP_SEC, TSP_HZ, TSP_USEC, TSP_NSEC };
-static int timestamp_precision = TSP_SEC;
+static int timestamp_precision = TSP_USEC;
SYSCTL_INT(_vfs, OID_AUTO, timestamp_precision, CTLFLAG_RW,
&timestamp_precision, 0, "File timestamp precision (0: seconds, "
"1: sec + ns accurate to 1/HZ, 2: sec + ns truncated to ms, "
@@ -659,7 +798,7 @@
* the buffer cache may have references on the vnode, a directory
* vnode may still have references due to the namei cache representing
* underlying files, or the vnode may be in active use. It is not
- * desireable to reuse such vnodes. These conditions may cause the
+ * desirable to reuse such vnodes. These conditions may cause the
* number of vnodes to reach some minimum value regardless of what
* you set kern.maxvnodes to. Do not set kern.maxvnodes too low.
*/
@@ -735,10 +874,12 @@
(vp->v_object != NULL &&
vp->v_object->resident_page_count > trigger)) {
VOP_UNLOCK(vp, LK_INTERLOCK);
+ vdrop(vp);
goto next_iter_mntunlocked;
}
KASSERT((vp->v_iflag & VI_DOOMED) == 0,
("VI_DOOMED unexpectedly detected in vlrureclaim()"));
+ atomic_add_long(&recycles_count, 1);
vgonel(vp);
VOP_UNLOCK(vp, 0);
vdropl(vp);
@@ -752,7 +893,7 @@
continue;
MNT_IUNLOCK(mp);
yield:
- kern_yield(PRI_UNCHANGED);
+ kern_yield(PRI_USER);
relock_mnt:
MNT_ILOCK(mp);
}
@@ -768,7 +909,6 @@
vnlru_free(int count)
{
struct vnode *vp;
- int vfslocked;
mtx_assert(&vnode_free_list_mtx, MA_OWNED);
for (; count > 0; count--) {
@@ -793,16 +933,24 @@
TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_actfreelist);
continue;
}
- VNASSERT(VCANRECYCLE(vp), vp,
- ("vp inconsistent on freelist"));
+ VNASSERT((vp->v_iflag & VI_FREE) != 0 && vp->v_holdcnt == 0,
+ vp, ("vp inconsistent on freelist"));
+
+ /*
+ * The clear of VI_FREE prevents activation of the
+ * vnode. There is no sense in putting the vnode on
+ * the mount point active list, only to remove it
+ * later during recycling. Inline the relevant part
+ * of vholdl(), to avoid triggering assertions or
+ * activating.
+ */
freevnodes--;
vp->v_iflag &= ~VI_FREE;
- vholdl(vp);
+ vp->v_holdcnt++;
+
mtx_unlock(&vnode_free_list_mtx);
VI_UNLOCK(vp);
- vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vtryrecycle(vp);
- VFS_UNLOCK_GIANT(vfslocked);
/*
* If the recycled succeeded this vdrop will actually free
* the vnode. If not it will simply place it back on
@@ -824,7 +972,7 @@
vnlru_proc(void)
{
struct mount *mp, *nmp;
- int done, vfslocked;
+ int done;
struct proc *p = vnlruproc;
EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
@@ -850,9 +998,7 @@
nmp = TAILQ_NEXT(mp, mnt_list);
continue;
}
- vfslocked = VFS_LOCK_GIANT(mp);
done += vlrureclaim(mp);
- VFS_UNLOCK_GIANT(vfslocked);
mtx_lock(&mountlist_mtx);
nmp = TAILQ_NEXT(mp, mnt_list);
vfs_unbusy(mp);
@@ -869,7 +1015,7 @@
vnlru_nowhere++;
tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
} else
- kern_yield(PRI_UNCHANGED);
+ kern_yield(PRI_USER);
}
}
@@ -934,8 +1080,10 @@
__func__, vp);
return (EBUSY);
}
- if ((vp->v_iflag & VI_DOOMED) == 0)
+ if ((vp->v_iflag & VI_DOOMED) == 0) {
+ atomic_add_long(&recycles_count, 1);
vgonel(vp);
+ }
VOP_UNLOCK(vp, LK_INTERLOCK);
vn_finished_write(vnmp);
return (0);
@@ -975,12 +1123,19 @@
struct thread *td;
td = curthread;
+ /* First try to be quick and racy. */
+ if (atomic_fetchadd_long(&numvnodes, count) + count <= desiredvnodes) {
+ td->td_vp_reserv += count;
+ return;
+ } else
+ atomic_subtract_long(&numvnodes, count);
+
mtx_lock(&vnode_free_list_mtx);
while (count > 0) {
if (getnewvnode_wait(0) == 0) {
count--;
td->td_vp_reserv++;
- numvnodes++;
+ atomic_add_long(&numvnodes, 1);
}
}
mtx_unlock(&vnode_free_list_mtx);
@@ -992,10 +1147,7 @@
struct thread *td;
td = curthread;
- mtx_lock(&vnode_free_list_mtx);
- KASSERT(numvnodes >= td->td_vp_reserv, ("reserve too large"));
- numvnodes -= td->td_vp_reserv;
- mtx_unlock(&vnode_free_list_mtx);
+ atomic_subtract_long(&numvnodes, td->td_vp_reserv);
td->td_vp_reserv = 0;
}
@@ -1007,8 +1159,8 @@
struct vnode **vpp)
{
struct vnode *vp;
- struct bufobj *bo;
struct thread *td;
+ struct lock_object *lo;
int error;
CTR3(KTR_VFS, "%s: mp %p with tag %s", __func__, mp, tag);
@@ -1032,43 +1184,46 @@
return (error);
}
#endif
- numvnodes++;
+ atomic_add_long(&numvnodes, 1);
mtx_unlock(&vnode_free_list_mtx);
alloc:
- vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO);
+ atomic_add_long(&vnodes_created, 1);
+ vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK);
/*
- * Setup locks.
+ * Locks are given the generic name "vnode" when created.
+ * Follow the historic practice of using the filesystem
+ * name when they are allocated, e.g., "zfs", "ufs", "nfs", etc.
+ *
+ * Locks live in a witness group keyed on their name. Thus,
+ * when a lock is renamed, it must also move from the witness
+ * group of its old name to the witness group of its new name.
+ *
+ * The change only needs to be made when the vnode moves
+ * from one filesystem type to another. We ensure that each
+ * filesystem use a single static name pointer for its tag so
+ * that we can compare pointers rather than doing a strcmp().
*/
- vp->v_vnlock = &vp->v_lock;
- mtx_init(&vp->v_interlock, "vnode interlock", NULL, MTX_DEF);
+ lo = &vp->v_vnlock->lock_object;
+ if (lo->lo_name != tag) {
+ lo->lo_name = tag;
+ WITNESS_DESTROY(lo);
+ WITNESS_INIT(lo, tag);
+ }
/*
- * By default, don't allow shared locks unless filesystems
- * opt-in.
+ * By default, don't allow shared locks unless filesystems opt-in.
*/
- lockinit(vp->v_vnlock, PVFS, tag, VLKTIMEOUT, LK_NOSHARE);
+ vp->v_vnlock->lock_object.lo_flags |= LK_NOSHARE;
/*
- * Initialize bufobj.
- */
- bo = &vp->v_bufobj;
- bo->__bo_vnode = vp;
- mtx_init(BO_MTX(bo), "bufobj interlock", NULL, MTX_DEF);
- bo->bo_ops = &buf_ops_bio;
- bo->bo_private = vp;
- TAILQ_INIT(&bo->bo_clean.bv_hd);
- TAILQ_INIT(&bo->bo_dirty.bv_hd);
- /*
- * Initialize namecache.
- */
- LIST_INIT(&vp->v_cache_src);
- TAILQ_INIT(&vp->v_cache_dst);
- /*
* Finalize various vnode identity bits.
*/
+ KASSERT(vp->v_object == NULL, ("stale v_object %p", vp));
+ KASSERT(vp->v_lockf == NULL, ("stale v_lockf %p", vp));
+ KASSERT(vp->v_pollinfo == NULL, ("stale v_pollinfo %p", vp));
vp->v_type = VNON;
vp->v_tag = tag;
vp->v_op = vops;
v_incr_usecount(vp);
- vp->v_data = 0;
+ vp->v_bufobj.bo_ops = &buf_ops_bio;
#ifdef MAC
mac_vnode_init(vp);
if (mp != NULL && (mp->mnt_flag & MNT_MULTILABEL) == 0)
@@ -1077,11 +1232,10 @@
printf("NULL mp in getnewvnode()\n");
#endif
if (mp != NULL) {
- bo->bo_bsize = mp->mnt_stat.f_iosize;
+ vp->v_bufobj.bo_bsize = mp->mnt_stat.f_iosize;
if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0)
vp->v_vflag |= VV_NOKNOTE;
}
- rangelock_init(&vp->v_rl);
/*
* For the filesystems which do not use vfs_hash_insert(),
@@ -1136,10 +1290,6 @@
vp->v_data = NULL;
vp->v_op = &dead_vnodeops;
- /* XXX non mp-safe fs may still call insmntque with vnode
- unlocked */
- if (!VOP_ISLOCKED(vp))
- vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
vgone(vp);
vput(vp);
}
@@ -1151,16 +1301,12 @@
insmntque1(struct vnode *vp, struct mount *mp,
void (*dtr)(struct vnode *, void *), void *dtr_arg)
{
- int locked;
KASSERT(vp->v_mount == NULL,
("insmntque: vnode already on per mount vnode list"));
VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)"));
-#ifdef DEBUG_VFS_LOCKS
- if (!VFS_NEEDSGIANT(mp))
- ASSERT_VOP_ELOCKED(vp,
- "insmntque: mp-safe fs and non-locked vp");
-#endif
+ ASSERT_VOP_ELOCKED(vp, "insmntque: non-locked vp");
+
/*
* We acquire the vnode interlock early to ensure that the
* vnode cannot be recycled by another process releasing a
@@ -1172,18 +1318,15 @@
*/
MNT_ILOCK(mp);
VI_LOCK(vp);
- if ((mp->mnt_kern_flag & MNTK_NOINSMNTQ) != 0 &&
+ if (((mp->mnt_kern_flag & MNTK_NOINSMNTQ) != 0 &&
((mp->mnt_kern_flag & MNTK_UNMOUNTF) != 0 ||
- mp->mnt_nvnodelistsize == 0)) {
- locked = VOP_ISLOCKED(vp);
- if (!locked || (locked == LK_EXCLUSIVE &&
- (vp->v_vflag & VV_FORCEINSMQ) == 0)) {
- VI_UNLOCK(vp);
- MNT_IUNLOCK(mp);
- if (dtr != NULL)
- dtr(vp, dtr_arg);
- return (EBUSY);
- }
+ mp->mnt_nvnodelistsize == 0)) &&
+ (vp->v_vflag & VV_FORCEINSMQ) == 0) {
+ VI_UNLOCK(vp);
+ MNT_IUNLOCK(mp);
+ if (dtr != NULL)
+ dtr(vp, dtr_arg);
+ return (EBUSY);
}
vp->v_mount = mp;
MNT_REF(mp);
@@ -1265,9 +1408,9 @@
bufobj_wwait(bo, 0, 0);
BO_UNLOCK(bo);
if (bo->bo_object != NULL) {
- VM_OBJECT_LOCK(bo->bo_object);
+ VM_OBJECT_WLOCK(bo->bo_object);
vm_object_pip_wait(bo->bo_object, "bovlbx");
- VM_OBJECT_UNLOCK(bo->bo_object);
+ VM_OBJECT_WUNLOCK(bo->bo_object);
}
BO_LOCK(bo);
} while (bo->bo_numoutput > 0);
@@ -1278,10 +1421,10 @@
*/
if (bo->bo_object != NULL &&
(flags & (V_ALT | V_NORMAL | V_CLEANONLY)) == 0) {
- VM_OBJECT_LOCK(bo->bo_object);
+ VM_OBJECT_WLOCK(bo->bo_object);
vm_object_page_remove(bo->bo_object, 0, 0, (flags & V_SAVE) ?
OBJPR_CLEANONLY : 0);
- VM_OBJECT_UNLOCK(bo->bo_object);
+ VM_OBJECT_WUNLOCK(bo->bo_object);
}
#ifdef INVARIANTS
@@ -1304,6 +1447,8 @@
CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
ASSERT_VOP_LOCKED(vp, "vinvalbuf");
+ if (vp->v_object != NULL && vp->v_object->handle != vp)
+ return (0);
return (bufobj_invalbuf(&vp->v_bufobj, flags, slpflag, slptimeo));
}
@@ -1312,7 +1457,7 @@
*
*/
static int
-flushbuflist( struct bufv *bufv, int flags, struct bufobj *bo, int slpflag,
+flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag,
int slptimeo)
{
struct buf *bp, *nbp;
@@ -1320,7 +1465,7 @@
daddr_t lblkno;
b_xflags_t xflags;
- ASSERT_BO_LOCKED(bo);
+ ASSERT_BO_WLOCKED(bo);
retval = 0;
TAILQ_FOREACH_SAFE(bp, &bufv->bv_hd, b_bobufs, nbp) {
@@ -1332,12 +1477,11 @@
xflags = 0;
if (nbp != NULL) {
lblkno = nbp->b_lblkno;
- xflags = nbp->b_xflags &
- (BX_BKGRDMARKER | BX_VNDIRTY | BX_VNCLEAN);
+ xflags = nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN);
}
retval = EAGAIN;
error = BUF_TIMELOCK(bp,
- LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_MTX(bo),
+ LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK, BO_LOCKPTR(bo),
"flushbuf", slpflag, slptimeo);
if (error) {
BO_LOCK(bo);
@@ -1359,17 +1503,13 @@
*/
if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) &&
(flags & V_SAVE)) {
- BO_LOCK(bo);
bremfree(bp);
- BO_UNLOCK(bo);
bp->b_flags |= B_ASYNC;
bwrite(bp);
BO_LOCK(bo);
return (EAGAIN); /* XXX: why not loop ? */
}
- BO_LOCK(bo);
bremfree(bp);
- BO_UNLOCK(bo);
bp->b_flags |= (B_INVAL | B_RELBUF);
bp->b_flags &= ~B_ASYNC;
brelse(bp);
@@ -1377,8 +1517,7 @@
if (nbp != NULL &&
(nbp->b_bufobj != bo ||
nbp->b_lblkno != lblkno ||
- (nbp->b_xflags &
- (BX_BKGRDMARKER | BX_VNDIRTY | BX_VNCLEAN)) != xflags))
+ (nbp->b_xflags & (BX_VNDIRTY | BX_VNCLEAN)) != xflags))
break; /* nbp invalid */
}
return (retval);
@@ -1390,8 +1529,7 @@
* sync activity.
*/
int
-vtruncbuf(struct vnode *vp, struct ucred *cred, struct thread *td,
- off_t length, int blksize)
+vtruncbuf(struct vnode *vp, struct ucred *cred, off_t length, int blksize)
{
struct buf *bp, *nbp;
int anyfreed;
@@ -1418,12 +1556,10 @@
continue;
if (BUF_LOCK(bp,
LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
- BO_MTX(bo)) == ENOLCK)
+ BO_LOCKPTR(bo)) == ENOLCK)
goto restart;
- BO_LOCK(bo);
bremfree(bp);
- BO_UNLOCK(bo);
bp->b_flags |= (B_INVAL | B_RELBUF);
bp->b_flags &= ~B_ASYNC;
brelse(bp);
@@ -1444,11 +1580,9 @@
continue;
if (BUF_LOCK(bp,
LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
- BO_MTX(bo)) == ENOLCK)
+ BO_LOCKPTR(bo)) == ENOLCK)
goto restart;
- BO_LOCK(bo);
bremfree(bp);
- BO_UNLOCK(bo);
bp->b_flags |= (B_INVAL | B_RELBUF);
bp->b_flags &= ~B_ASYNC;
brelse(bp);
@@ -1476,15 +1610,13 @@
*/
if (BUF_LOCK(bp,
LK_EXCLUSIVE | LK_SLEEPFAIL | LK_INTERLOCK,
- BO_MTX(bo)) == ENOLCK) {
+ BO_LOCKPTR(bo)) == ENOLCK) {
goto restart;
}
VNASSERT((bp->b_flags & B_DELWRI), vp,
("buf(%p) on dirty queue without DELWRI", bp));
- BO_LOCK(bo);
bremfree(bp);
- BO_UNLOCK(bo);
bawrite(bp);
BO_LOCK(bo);
goto restartsync;
@@ -1498,83 +1630,13 @@
return (0);
}
-/*
- * buf_splay() - splay tree core for the clean/dirty list of buffers in
- * a vnode.
- *
- * NOTE: We have to deal with the special case of a background bitmap
- * buffer, a situation where two buffers will have the same logical
- * block offset. We want (1) only the foreground buffer to be accessed
- * in a lookup and (2) must differentiate between the foreground and
- * background buffer in the splay tree algorithm because the splay
- * tree cannot normally handle multiple entities with the same 'index'.
- * We accomplish this by adding differentiating flags to the splay tree's
- * numerical domain.
- */
-static
-struct buf *
-buf_splay(daddr_t lblkno, b_xflags_t xflags, struct buf *root)
-{
- struct buf dummy;
- struct buf *lefttreemax, *righttreemin, *y;
-
- if (root == NULL)
- return (NULL);
- lefttreemax = righttreemin = &dummy;
- for (;;) {
- if (lblkno < root->b_lblkno ||
- (lblkno == root->b_lblkno &&
- (xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
- if ((y = root->b_left) == NULL)
- break;
- if (lblkno < y->b_lblkno) {
- /* Rotate right. */
- root->b_left = y->b_right;
- y->b_right = root;
- root = y;
- if ((y = root->b_left) == NULL)
- break;
- }
- /* Link into the new root's right tree. */
- righttreemin->b_left = root;
- righttreemin = root;
- } else if (lblkno > root->b_lblkno ||
- (lblkno == root->b_lblkno &&
- (xflags & BX_BKGRDMARKER) > (root->b_xflags & BX_BKGRDMARKER))) {
- if ((y = root->b_right) == NULL)
- break;
- if (lblkno > y->b_lblkno) {
- /* Rotate left. */
- root->b_right = y->b_left;
- y->b_left = root;
- root = y;
- if ((y = root->b_right) == NULL)
- break;
- }
- /* Link into the new root's left tree. */
- lefttreemax->b_right = root;
- lefttreemax = root;
- } else {
- break;
- }
- root = y;
- }
- /* Assemble the new root. */
- lefttreemax->b_right = root->b_left;
- righttreemin->b_left = root->b_right;
- root->b_left = dummy.b_right;
- root->b_right = dummy.b_left;
- return (root);
-}
-
static void
buf_vlist_remove(struct buf *bp)
{
- struct buf *root;
struct bufv *bv;
KASSERT(bp->b_bufobj != NULL, ("No b_bufobj %p", bp));
- ASSERT_BO_LOCKED(bp->b_bufobj);
+ ASSERT_BO_WLOCKED(bp->b_bufobj);
KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) !=
(BX_VNDIRTY|BX_VNCLEAN),
("buf_vlist_remove: Buf %p is on two lists", bp));
@@ -1582,17 +1644,7 @@
bv = &bp->b_bufobj->bo_dirty;
else
bv = &bp->b_bufobj->bo_clean;
- if (bp != bv->bv_root) {
- root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root);
- KASSERT(root == bp, ("splay lookup failed in remove"));
- }
- if (bp->b_left == NULL) {
- root = bp->b_right;
- } else {
- root = buf_splay(bp->b_lblkno, bp->b_xflags, bp->b_left);
- root->b_right = bp->b_right;
- }
- bv->bv_root = root;
+ BUF_PCTRIE_REMOVE(&bv->bv_root, bp->b_lblkno);
TAILQ_REMOVE(&bv->bv_hd, bp, b_bobufs);
bv->bv_cnt--;
bp->b_xflags &= ~(BX_VNDIRTY | BX_VNCLEAN);
@@ -1599,8 +1651,7 @@
}
/*
- * Add the buffer to the sorted clean or dirty block list using a
- * splay tree algorithm.
+ * Add the buffer to the sorted clean or dirty block list.
*
* NOTE: xflags is passed as a constant, optimizing this inline function!
*/
@@ -1607,10 +1658,13 @@
static void
buf_vlist_add(struct buf *bp, struct bufobj *bo, b_xflags_t xflags)
{
- struct buf *root;
struct bufv *bv;
+ struct buf *n;
+ int error;
- ASSERT_BO_LOCKED(bo);
+ ASSERT_BO_WLOCKED(bo);
+ KASSERT((xflags & BX_VNDIRTY) == 0 || (bo->bo_flag & BO_DEAD) == 0,
+ ("dead bo %p", bo));
KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) == 0,
("buf_vlist_add: Buf %p has existing xflags %d", bp, bp->b_xflags));
bp->b_xflags |= xflags;
@@ -1619,26 +1673,22 @@
else
bv = &bo->bo_clean;
- root = buf_splay(bp->b_lblkno, bp->b_xflags, bv->bv_root);
- if (root == NULL) {
- bp->b_left = NULL;
- bp->b_right = NULL;
+ /*
+ * Keep the list ordered. Optimize empty list insertion. Assume
+ * we tend to grow at the tail so lookup_le should usually be cheaper
+ * than _ge.
+ */
+ if (bv->bv_cnt == 0 ||
+ bp->b_lblkno > TAILQ_LAST(&bv->bv_hd, buflists)->b_lblkno)
TAILQ_INSERT_TAIL(&bv->bv_hd, bp, b_bobufs);
- } else if (bp->b_lblkno < root->b_lblkno ||
- (bp->b_lblkno == root->b_lblkno &&
- (bp->b_xflags & BX_BKGRDMARKER) < (root->b_xflags & BX_BKGRDMARKER))) {
- bp->b_left = root->b_left;
- bp->b_right = root;
- root->b_left = NULL;
- TAILQ_INSERT_BEFORE(root, bp, b_bobufs);
- } else {
- bp->b_right = root->b_right;
- bp->b_left = root;
- root->b_right = NULL;
- TAILQ_INSERT_AFTER(&bv->bv_hd, root, bp, b_bobufs);
- }
+ else if ((n = BUF_PCTRIE_LOOKUP_LE(&bv->bv_root, bp->b_lblkno)) == NULL)
+ TAILQ_INSERT_HEAD(&bv->bv_hd, bp, b_bobufs);
+ else
+ TAILQ_INSERT_AFTER(&bv->bv_hd, n, bp, b_bobufs);
+ error = BUF_PCTRIE_INSERT(&bv->bv_root, bp);
+ if (error)
+ panic("buf_vlist_add: Preallocated nodes insufficient.");
bv->bv_cnt++;
- bv->bv_root = bp;
}
/*
@@ -1659,23 +1709,10 @@
struct buf *bp;
ASSERT_BO_LOCKED(bo);
- if ((bp = bo->bo_clean.bv_root) != NULL &&
- bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
+ bp = BUF_PCTRIE_LOOKUP(&bo->bo_clean.bv_root, lblkno);
+ if (bp != NULL)
return (bp);
- if ((bp = bo->bo_dirty.bv_root) != NULL &&
- bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
- return (bp);
- if ((bp = bo->bo_clean.bv_root) != NULL) {
- bo->bo_clean.bv_root = bp = buf_splay(lblkno, 0, bp);
- if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
- return (bp);
- }
- if ((bp = bo->bo_dirty.bv_root) != NULL) {
- bo->bo_dirty.bv_root = bp = buf_splay(lblkno, 0, bp);
- if (bp->b_lblkno == lblkno && !(bp->b_xflags & BX_BKGRDMARKER))
- return (bp);
- }
- return (NULL);
+ return BUF_PCTRIE_LOOKUP(&bo->bo_dirty.bv_root, lblkno);
}
/*
@@ -1687,7 +1724,7 @@
struct bufobj *bo;
bo = &vp->v_bufobj;
- ASSERT_BO_LOCKED(bo);
+ ASSERT_BO_WLOCKED(bo);
VNASSERT(bp->b_vp == NULL, bp->b_vp, ("bgetvp: not free"));
CTR3(KTR_BUF, "bgetvp(%p) vp %p flags %X", bp, vp, bp->b_flags);
@@ -1695,8 +1732,6 @@
("bgetvp: bp already attached! %p", bp));
vhold(vp);
- if (VFS_NEEDSGIANT(vp->v_mount) || bo->bo_flag & BO_NEEDSGIANT)
- bp->b_flags |= B_NEEDSGIANT;
bp->b_vp = vp;
bp->b_bufobj = bo;
/*
@@ -1734,7 +1769,6 @@
syncer_worklist_len--;
mtx_unlock(&sync_mtx);
}
- bp->b_flags &= ~B_NEEDSGIANT;
bp->b_vp = NULL;
bp->b_bufobj = NULL;
BO_UNLOCK(bo);
@@ -1747,9 +1781,9 @@
static void
vn_syncer_add_to_worklist(struct bufobj *bo, int delay)
{
- int queue, slot;
+ int slot;
- ASSERT_BO_LOCKED(bo);
+ ASSERT_BO_WLOCKED(bo);
mtx_lock(&sync_mtx);
if (bo->bo_flag & BO_ONWORKLST)
@@ -1763,10 +1797,7 @@
delay = syncer_maxdelay - 2;
slot = (syncer_delayno + delay) & syncer_mask;
- queue = VFS_NEEDSGIANT(bo->__bo_vnode->v_mount) ? WI_GIANTQ :
- WI_MPSAFEQ;
- LIST_INSERT_HEAD(&syncer_workitem_pending[queue][slot], bo,
- bo_synclist);
+ LIST_INSERT_HEAD(&syncer_workitem_pending[slot], bo, bo_synclist);
mtx_unlock(&sync_mtx);
}
@@ -1840,6 +1871,8 @@
return (0);
}
+static int first_printf = 1;
+
/*
* System filesystem synchronizer daemon.
*/
@@ -1846,8 +1879,7 @@
static void
sched_sync(void)
{
- struct synclist *gnext, *next;
- struct synclist *gslp, *slp;
+ struct synclist *next, *slp;
struct bufobj *bo;
long starttime;
struct thread *td = curthread;
@@ -1854,12 +1886,10 @@
int last_work_seen;
int net_worklist_len;
int syncer_final_iter;
- int first_printf;
int error;
last_work_seen = 0;
syncer_final_iter = 0;
- first_printf = 1;
syncer_state = SYNCER_RUNNING;
starttime = time_uptime;
td->td_pflags |= TDP_NORUNNINGBUF;
@@ -1893,13 +1923,11 @@
* Skip over empty worklist slots when shutting down.
*/
do {
- slp = &syncer_workitem_pending[WI_MPSAFEQ][syncer_delayno];
- gslp = &syncer_workitem_pending[WI_GIANTQ][syncer_delayno];
+ slp = &syncer_workitem_pending[syncer_delayno];
syncer_delayno += 1;
if (syncer_delayno == syncer_maxdelay)
syncer_delayno = 0;
- next = &syncer_workitem_pending[WI_MPSAFEQ][syncer_delayno];
- gnext = &syncer_workitem_pending[WI_GIANTQ][syncer_delayno];
+ next = &syncer_workitem_pending[syncer_delayno];
/*
* If the worklist has wrapped since the
* it was emptied of all but syncer vnodes,
@@ -1913,7 +1941,7 @@
syncer_final_iter = SYNCER_SHUTDOWN_SPEEDUP;
}
} while (syncer_state != SYNCER_RUNNING && LIST_EMPTY(slp) &&
- LIST_EMPTY(gslp) && syncer_worklist_len > 0);
+ syncer_worklist_len > 0);
/*
* Keep track of the last time there was anything
@@ -1937,21 +1965,6 @@
wdog_kern_pat(WD_LASTVAL);
}
- if (!LIST_EMPTY(gslp)) {
- mtx_unlock(&sync_mtx);
- mtx_lock(&Giant);
- mtx_lock(&sync_mtx);
- while (!LIST_EMPTY(gslp)) {
- error = sync_vnode(gslp, &bo, td);
- if (error == 1) {
- LIST_REMOVE(bo, bo_synclist);
- LIST_INSERT_HEAD(gnext, bo,
- bo_synclist);
- continue;
- }
- }
- mtx_unlock(&Giant);
- }
if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0)
syncer_final_iter--;
/*
@@ -2033,6 +2046,25 @@
kproc_shutdown(arg, howto);
}
+void
+syncer_suspend(void)
+{
+
+ syncer_shutdown(updateproc, 0);
+}
+
+void
+syncer_resume(void)
+{
+
+ mtx_lock(&sync_mtx);
+ first_printf = 1;
+ syncer_state = SYNCER_RUNNING;
+ mtx_unlock(&sync_mtx);
+ cv_broadcast(&sync_wakeup);
+ kproc_resume(updateproc);
+}
+
/*
* Reassign a buffer from one vnode to another.
* Used to assign file specific control information
@@ -2129,6 +2161,7 @@
{
CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
+ vholdl(vp);
vp->v_usecount++;
if (vp->v_type == VCHR && vp->v_rdev != NULL) {
dev_lock();
@@ -2135,7 +2168,6 @@
vp->v_rdev->si_usecount++;
dev_unlock();
}
- vholdl(vp);
}
/*
@@ -2212,7 +2244,6 @@
int error;
error = 0;
- VFS_ASSERT_GIANT(vp->v_mount);
VNASSERT((flags & LK_TYPE_MASK) != 0, vp,
("vget: invalid lock operation"));
CTR3(KTR_VFS, "%s: vp %p with flags %d", __func__, vp, flags);
@@ -2297,7 +2328,6 @@
ASSERT_VOP_LOCKED(vp, "vput");
else
KASSERT(func == VPUTX_VRELE, ("vputx: wrong func"));
- VFS_ASSERT_GIANT(vp->v_mount);
CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
VI_LOCK(vp);
@@ -2343,8 +2373,10 @@
}
break;
case VPUTX_VUNREF:
- if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE)
- error = EBUSY;
+ if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE) {
+ error = VOP_LOCK(vp, LK_TRYUPGRADE | LK_INTERLOCK);
+ VI_LOCK(vp);
+ }
break;
}
if (vp->v_usecount > 0)
@@ -2412,11 +2444,15 @@
struct mount *mp;
CTR2(KTR_VFS, "%s: vp %p", __func__, vp);
+#ifdef INVARIANTS
+ /* getnewvnode() calls v_incr_usecount() without holding interlock. */
+ if (vp->v_type != VNON || vp->v_data != NULL)
+ ASSERT_VI_LOCKED(vp, "vholdl");
+#endif
vp->v_holdcnt++;
- if (!VSHOULDBUSY(vp))
+ if ((vp->v_iflag & VI_FREE) == 0)
return;
- ASSERT_VI_LOCKED(vp, "vholdl");
- VNASSERT((vp->v_iflag & VI_FREE) != 0, vp, ("vnode not free"));
+ VNASSERT(vp->v_holdcnt == 1, vp, ("vholdl: wrong hold count"));
VNASSERT(vp->v_op != NULL, vp, ("vholdl: vnode already reclaimed."));
/*
* Remove a vnode from the free list, mark it as in use,
@@ -2425,7 +2461,7 @@
mtx_lock(&vnode_free_list_mtx);
TAILQ_REMOVE(&vnode_free_list, vp, v_actfreelist);
freevnodes--;
- vp->v_iflag &= ~(VI_FREE|VI_AGE);
+ vp->v_iflag &= ~VI_FREE;
KASSERT((vp->v_iflag & VI_ACTIVE) == 0,
("Activating already active vnode"));
vp->v_iflag |= VI_ACTIVE;
@@ -2451,6 +2487,10 @@
* Drop the hold count of the vnode. If this is the last reference to
* the vnode we place it on the free list unless it has been vgone'd
* (marked VI_DOOMED) in which case we will free it.
+ *
+ * Because the vnode vm object keeps a hold reference on the vnode if
+ * there is at least one resident non-cached page, the vnode cannot
+ * leave the active list without the page cleanup done.
*/
void
vdropl(struct vnode *vp)
@@ -2477,36 +2517,40 @@
("vdropl: vnode already reclaimed."));
VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
("vnode already free"));
- VNASSERT(VSHOULDFREE(vp), vp,
+ VNASSERT(vp->v_holdcnt == 0, vp,
("vdropl: freeing when we shouldn't"));
active = vp->v_iflag & VI_ACTIVE;
- vp->v_iflag &= ~VI_ACTIVE;
- mp = vp->v_mount;
- mtx_lock(&vnode_free_list_mtx);
- if (active) {
- TAILQ_REMOVE(&mp->mnt_activevnodelist, vp,
+ if ((vp->v_iflag & VI_OWEINACT) == 0) {
+ vp->v_iflag &= ~VI_ACTIVE;
+ mp = vp->v_mount;
+ mtx_lock(&vnode_free_list_mtx);
+ if (active) {
+ TAILQ_REMOVE(&mp->mnt_activevnodelist, vp,
+ v_actfreelist);
+ mp->mnt_activevnodelistsize--;
+ }
+ TAILQ_INSERT_TAIL(&vnode_free_list, vp,
v_actfreelist);
- mp->mnt_activevnodelistsize--;
- }
- if (vp->v_iflag & VI_AGE) {
- TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_actfreelist);
+ freevnodes++;
+ vp->v_iflag |= VI_FREE;
+ mtx_unlock(&vnode_free_list_mtx);
} else {
- TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_actfreelist);
+ atomic_add_long(&free_owe_inact, 1);
}
- freevnodes++;
- vp->v_iflag &= ~VI_AGE;
- vp->v_iflag |= VI_FREE;
- mtx_unlock(&vnode_free_list_mtx);
VI_UNLOCK(vp);
return;
}
/*
* The vnode has been marked for destruction, so free it.
+ *
+ * The vnode will be returned to the zone where it will
+ * normally remain until it is needed for another vnode. We
+ * need to cleanup (or verify that the cleanup has already
+ * been done) any residual data left from its current use
+ * so as not to contaminate the freshly allocated vnode.
*/
CTR2(KTR_VFS, "%s: destroying the vnode %p", __func__, vp);
- mtx_lock(&vnode_free_list_mtx);
- numvnodes--;
- mtx_unlock(&vnode_free_list_mtx);
+ atomic_subtract_long(&numvnodes, 1);
bo = &vp->v_bufobj;
VNASSERT((vp->v_iflag & VI_FREE) == 0, vp,
("cleaned vnode still on the free list."));
@@ -2516,26 +2560,33 @@
VNASSERT(vp->v_writecount == 0, vp, ("Non-zero write count"));
VNASSERT(bo->bo_numoutput == 0, vp, ("Clean vnode has pending I/O's"));
VNASSERT(bo->bo_clean.bv_cnt == 0, vp, ("cleanbufcnt not 0"));
- VNASSERT(bo->bo_clean.bv_root == NULL, vp, ("cleanblkroot not NULL"));
+ VNASSERT(pctrie_is_empty(&bo->bo_clean.bv_root), vp,
+ ("clean blk trie not empty"));
VNASSERT(bo->bo_dirty.bv_cnt == 0, vp, ("dirtybufcnt not 0"));
- VNASSERT(bo->bo_dirty.bv_root == NULL, vp, ("dirtyblkroot not NULL"));
+ VNASSERT(pctrie_is_empty(&bo->bo_dirty.bv_root), vp,
+ ("dirty blk trie not empty"));
VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst"));
VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src"));
VNASSERT(vp->v_cache_dd == NULL, vp, ("vp has namecache for .."));
+ VNASSERT(TAILQ_EMPTY(&vp->v_rl.rl_waiters), vp,
+ ("Dangling rangelock waiters"));
VI_UNLOCK(vp);
#ifdef MAC
mac_vnode_destroy(vp);
#endif
- if (vp->v_pollinfo != NULL)
+ if (vp->v_pollinfo != NULL) {
destroy_vpollinfo(vp->v_pollinfo);
+ vp->v_pollinfo = NULL;
+ }
#ifdef INVARIANTS
/* XXX Elsewhere we detect an already freed vnode via NULL v_op. */
vp->v_op = NULL;
#endif
- rangelock_destroy(&vp->v_rl);
- lockdestroy(vp->v_vnlock);
- mtx_destroy(&vp->v_interlock);
- mtx_destroy(BO_MTX(bo));
+ bzero(&vp->v_un, sizeof(vp->v_un));
+ vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
+ vp->v_iflag = 0;
+ vp->v_vflag = 0;
+ bo->bo_flag = 0;
uma_zfree(vnode_zone, vp);
}
@@ -2560,17 +2611,19 @@
VI_UNLOCK(vp);
/*
* Before moving off the active list, we must be sure that any
- * modified pages are on the vnode's dirty list since these will
- * no longer be checked once the vnode is on the inactive list.
- * Because the vnode vm object keeps a hold reference on the vnode
- * if there is at least one resident non-cached page, the vnode
- * cannot leave the active list without the page cleanup done.
+ * modified pages are converted into the vnode's dirty
+ * buffers, since these will no longer be checked once the
+ * vnode is on the inactive list.
+ *
+ * The write-out of the dirty pages is asynchronous. At the
+ * point that VOP_INACTIVE() is called, there could still be
+ * pending I/O and dirty pages in the object.
*/
obj = vp->v_object;
if (obj != NULL && (obj->flags & OBJ_MIGHTBEDIRTY) != 0) {
- VM_OBJECT_LOCK(obj);
+ VM_OBJECT_WLOCK(obj);
vm_object_page_clean(obj, 0, 0, OBJPC_NOSYNC);
- VM_OBJECT_UNLOCK(obj);
+ VM_OBJECT_WUNLOCK(obj);
}
VOP_INACTIVE(vp, td);
VI_LOCK(vp);
@@ -2651,9 +2704,9 @@
*/
if (flags & WRITECLOSE) {
if (vp->v_object != NULL) {
- VM_OBJECT_LOCK(vp->v_object);
+ VM_OBJECT_WLOCK(vp->v_object);
vm_object_page_clean(vp->v_object, 0, 0, 0);
- VM_OBJECT_UNLOCK(vp->v_object);
+ VM_OBJECT_WUNLOCK(vp->v_object);
}
error = VOP_FSYNC(vp, MNT_WAIT, td);
if (error != 0) {
@@ -2681,9 +2734,6 @@
* If FORCECLOSE is set, forcibly close the vnode.
*/
if (vp->v_usecount == 0 || (flags & FORCECLOSE)) {
- VNASSERT(vp->v_usecount == 0 ||
- (vp->v_type != VCHR && vp->v_type != VBLK), vp,
- ("device VNODE %p is FORCECLOSED", vp));
vgonel(vp);
} else {
busy++;
@@ -2727,7 +2777,7 @@
* Recycle an unused vnode to the front of the free list.
*/
int
-vrecycle(struct vnode *vp, struct thread *td)
+vrecycle(struct vnode *vp)
{
int recycled;
@@ -2821,7 +2871,7 @@
/*
* vgone, with the vp interlock held.
*/
-void
+static void
vgonel(struct vnode *vp)
{
struct thread *td;
@@ -2853,16 +2903,6 @@
vfs_notify_upper(vp, VFS_NOTIFY_UPPER_RECLAIM);
/*
- * Clean out any buffers associated with the vnode.
- * If the flush fails, just toss the buffers.
- */
- mp = NULL;
- if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd))
- (void) vn_start_secondary_write(vp, &mp, V_WAIT);
- if (vinvalbuf(vp, V_SAVE, 0, 0) != 0)
- vinvalbuf(vp, 0, 0, 0);
-
- /*
* If purging an active vnode, it must be closed and
* deactivated before being reclaimed.
*/
@@ -2876,7 +2916,35 @@
}
if (vp->v_type == VSOCK)
vfs_unp_reclaim(vp);
+
/*
+ * Clean out any buffers associated with the vnode.
+ * If the flush fails, just toss the buffers.
+ */
+ mp = NULL;
+ if (!TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd))
+ (void) vn_start_secondary_write(vp, &mp, V_WAIT);
+ if (vinvalbuf(vp, V_SAVE, 0, 0) != 0) {
+ while (vinvalbuf(vp, 0, 0, 0) != 0)
+ ;
+ }
+
+ BO_LOCK(&vp->v_bufobj);
+ KASSERT(TAILQ_EMPTY(&vp->v_bufobj.bo_dirty.bv_hd) &&
+ vp->v_bufobj.bo_dirty.bv_cnt == 0 &&
+ TAILQ_EMPTY(&vp->v_bufobj.bo_clean.bv_hd) &&
+ vp->v_bufobj.bo_clean.bv_cnt == 0,
+ ("vp %p bufobj not invalidated", vp));
+
+ /*
+ * For VMIO bufobj, BO_DEAD is set in vm_object_terminate()
+ * after the object's page queue is flushed.
+ */
+ if (vp->v_bufobj.bo_object == NULL)
+ vp->v_bufobj.bo_flag |= BO_DEAD;
+ BO_UNLOCK(&vp->v_bufobj);
+
+ /*
* Reclaim the vnode.
*/
if (VOP_RECLAIM(vp, td))
@@ -2889,6 +2957,7 @@
* Clear the advisory locks and wake up waiting threads.
*/
(void)VOP_ADVLOCKPURGE(vp);
+ vp->v_lockf = NULL;
/*
* Delete from old mount point vnode list.
*/
@@ -2952,8 +3021,25 @@
va_end(ap);
printf("%p: ", (void *)vp);
printf("tag %s, type %s\n", vp->v_tag, typename[vp->v_type]);
- printf(" usecount %d, writecount %d, refcount %d mountedhere %p\n",
- vp->v_usecount, vp->v_writecount, vp->v_holdcnt, vp->v_mountedhere);
+ printf(" usecount %d, writecount %d, refcount %d",
+ vp->v_usecount, vp->v_writecount, vp->v_holdcnt);
+ switch (vp->v_type) {
+ case VDIR:
+ printf(" mountedhere %p\n", vp->v_mountedhere);
+ break;
+ case VCHR:
+ printf(" rdev %p\n", vp->v_rdev);
+ break;
+ case VSOCK:
+ printf(" socket %p\n", vp->v_socket);
+ break;
+ case VFIFO:
+ printf(" fifoinfo %p\n", vp->v_fifoinfo);
+ break;
+ default:
+ printf("\n");
+ break;
+ }
buf[0] = '\0';
buf[1] = '\0';
if (vp->v_vflag & VV_ROOT)
@@ -2991,8 +3077,6 @@
}
if (vp->v_iflag & VI_MOUNT)
strlcat(buf, "|VI_MOUNT", sizeof(buf));
- if (vp->v_iflag & VI_AGE)
- strlcat(buf, "|VI_AGE", sizeof(buf));
if (vp->v_iflag & VI_DOOMED)
strlcat(buf, "|VI_DOOMED", sizeof(buf));
if (vp->v_iflag & VI_FREE)
@@ -3003,7 +3087,7 @@
strlcat(buf, "|VI_DOINGINACT", sizeof(buf));
if (vp->v_iflag & VI_OWEINACT)
strlcat(buf, "|VI_OWEINACT", sizeof(buf));
- flags = vp->v_iflag & ~(VI_MOUNT | VI_AGE | VI_DOOMED | VI_FREE |
+ flags = vp->v_iflag & ~(VI_MOUNT | VI_DOOMED | VI_FREE |
VI_ACTIVE | VI_DOINGINACT | VI_OWEINACT);
if (flags != 0) {
snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags);
@@ -3013,9 +3097,12 @@
if (mtx_owned(VI_MTX(vp)))
printf(" VI_LOCKed");
if (vp->v_object != NULL)
- printf(" v_object %p ref %d pages %d\n",
+ printf(" v_object %p ref %d pages %d "
+ "cleanbuf %d dirtybuf %d\n",
vp->v_object, vp->v_object->ref_count,
- vp->v_object->resident_page_count);
+ vp->v_object->resident_page_count,
+ vp->v_bufobj.bo_clean.bv_cnt,
+ vp->v_bufobj.bo_dirty.bv_cnt);
printf(" ");
lockmgr_printinfo(vp->v_vnlock);
if (vp->v_data != NULL)
@@ -3029,7 +3116,7 @@
*/
DB_SHOW_COMMAND(lockedvnods, lockedvnodes)
{
- struct mount *mp, *nmp;
+ struct mount *mp;
struct vnode *vp;
/*
@@ -3039,14 +3126,11 @@
* about that.
*/
db_printf("Locked vnodes\n");
- for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
- nmp = TAILQ_NEXT(mp, mnt_list);
+ TAILQ_FOREACH(mp, &mountlist, mnt_list) {
TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
- if (vp->v_type != VMARKER &&
- VOP_ISLOCKED(vp))
+ if (vp->v_type != VMARKER && VOP_ISLOCKED(vp))
vprint("", vp);
}
- nmp = TAILQ_NEXT(mp, mnt_list);
}
}
@@ -3170,6 +3254,7 @@
MNT_KERN_FLAG(MNTK_VGONE_WAITER);
MNT_KERN_FLAG(MNTK_LOOKUP_EXCL_DOTDOT);
MNT_KERN_FLAG(MNTK_MARKER);
+ MNT_KERN_FLAG(MNTK_USES_BCACHE);
MNT_KERN_FLAG(MNTK_NOASYNC);
MNT_KERN_FLAG(MNTK_UNMOUNT);
MNT_KERN_FLAG(MNTK_MWAIT);
@@ -3176,7 +3261,6 @@
MNT_KERN_FLAG(MNTK_SUSPEND);
MNT_KERN_FLAG(MNTK_SUSPEND2);
MNT_KERN_FLAG(MNTK_SUSPENDED);
- MNT_KERN_FLAG(MNTK_MPSAFE);
MNT_KERN_FLAG(MNTK_LOOKUP_SHARED);
MNT_KERN_FLAG(MNTK_NOKNOTE);
#undef MNT_KERN_FLAG
@@ -3228,6 +3312,7 @@
db_printf(" mnt_maxsymlinklen = %d\n", mp->mnt_maxsymlinklen);
db_printf(" mnt_iosize_max = %d\n", mp->mnt_iosize_max);
db_printf(" mnt_hashseed = %u\n", mp->mnt_hashseed);
+ db_printf(" mnt_lockref = %d\n", mp->mnt_lockref);
db_printf(" mnt_secondary_writes = %d\n", mp->mnt_secondary_writes);
db_printf(" mnt_secondary_accwrites = %d\n",
mp->mnt_secondary_accwrites);
@@ -3290,12 +3375,11 @@
{
struct xvfsconf32 xvfsp;
+ bzero(&xvfsp, sizeof(xvfsp));
strcpy(xvfsp.vfc_name, vfsp->vfc_name);
xvfsp.vfc_typenum = vfsp->vfc_typenum;
xvfsp.vfc_refcount = vfsp->vfc_refcount;
xvfsp.vfc_flags = vfsp->vfc_flags;
- xvfsp.vfc_vfsops = 0;
- xvfsp.vfc_next = 0;
return (SYSCTL_OUT(req, &xvfsp, sizeof(xvfsp)));
}
#endif
@@ -3310,6 +3394,7 @@
int error;
error = 0;
+ vfsconf_slock();
TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
#ifdef COMPAT_FREEBSD32
if (req->flags & SCTL_MASK32)
@@ -3320,11 +3405,12 @@
if (error)
break;
}
+ vfsconf_sunlock();
return (error);
}
-SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD,
- NULL, 0, sysctl_vfs_conflist,
+SYSCTL_PROC(_vfs, OID_AUTO, conflist, CTLTYPE_OPAQUE | CTLFLAG_RD |
+ CTLFLAG_MPSAFE, NULL, 0, sysctl_vfs_conflist,
"S,xvfsconf", "List of all configured filesystems");
#ifndef BURN_BRIDGES
@@ -3337,7 +3423,7 @@
u_int namelen = arg2 + 1; /* XXX */
struct vfsconf *vfsp;
- printf("WARNING: userland calling deprecated sysctl, "
+ log(LOG_WARNING, "userland calling deprecated sysctl, "
"please rebuild world\n");
#if 1 || defined(COMPAT_PRELITE2)
@@ -3354,9 +3440,12 @@
case VFS_CONF:
if (namelen != 3)
return (ENOTDIR); /* overloaded */
- TAILQ_FOREACH(vfsp, &vfsconf, vfc_list)
+ vfsconf_slock();
+ TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
if (vfsp->vfc_typenum == name[2])
break;
+ }
+ vfsconf_sunlock();
if (vfsp == NULL)
return (EOPNOTSUPP);
#ifdef COMPAT_FREEBSD32
@@ -3369,8 +3458,9 @@
return (EOPNOTSUPP);
}
-static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP,
- vfs_sysctl, "Generic filesystem");
+static SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD | CTLFLAG_SKIP |
+ CTLFLAG_MPSAFE, vfs_sysctl,
+ "Generic filesystem");
#if 1 || defined(COMPAT_PRELITE2)
@@ -3381,6 +3471,7 @@
struct vfsconf *vfsp;
struct ovfsconf ovfs;
+ vfsconf_slock();
TAILQ_FOREACH(vfsp, &vfsconf, vfc_list) {
bzero(&ovfs, sizeof(ovfs));
ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */
@@ -3389,10 +3480,13 @@
ovfs.vfc_refcount = vfsp->vfc_refcount;
ovfs.vfc_flags = vfsp->vfc_flags;
error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
- if (error)
- return error;
+ if (error != 0) {
+ vfsconf_sunlock();
+ return (error);
+ }
}
- return 0;
+ vfsconf_sunlock();
+ return (0);
}
#endif /* 1 || COMPAT_PRELITE2 */
@@ -3490,10 +3584,26 @@
return (error);
}
-SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
- 0, 0, sysctl_vnode, "S,xvnode", "");
+SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE | CTLFLAG_RD |
+ CTLFLAG_MPSAFE, 0, 0, sysctl_vnode, "S,xvnode",
+ "");
#endif
+static void
+unmount_or_warn(struct mount *mp)
+{
+ int error;
+
+ error = dounmount(mp, MNT_FORCE, curthread);
+ if (error != 0 && strcmp(mp->mnt_vfc->vfc_name, "devfs") != 0) {
+ printf("unmount of %s failed (", mp->mnt_stat.f_mntonname);
+ if (error == EBUSY)
+ printf("BUSY)\n");
+ else
+ printf("%d)\n", error);
+ }
+}
+
/*
* Unmount all filesystems. The list is traversed in reverse order
* of mounting to avoid dependencies.
@@ -3501,41 +3611,28 @@
void
vfs_unmountall(void)
{
- struct mount *mp;
- struct thread *td;
- int error;
+ struct mount *mp, *tmp;
CTR1(KTR_VFS, "%s: unmounting all filesystems", __func__);
- td = curthread;
/*
* Since this only runs when rebooting, it is not interlocked.
*/
- while(!TAILQ_EMPTY(&mountlist)) {
- mp = TAILQ_LAST(&mountlist, mntlist);
- error = dounmount(mp, MNT_FORCE, td);
- if (error) {
- TAILQ_REMOVE(&mountlist, mp, mnt_list);
- /*
- * XXX: Due to the way in which we mount the root
- * file system off of devfs, devfs will generate a
- * "busy" warning when we try to unmount it before
- * the root. Don't print a warning as a result in
- * order to avoid false positive errors that may
- * cause needless upset.
- */
- if (strcmp(mp->mnt_vfc->vfc_name, "devfs") != 0) {
- printf("unmount of %s failed (",
- mp->mnt_stat.f_mntonname);
- if (error == EBUSY)
- printf("BUSY)\n");
- else
- printf("%d)\n", error);
- }
- } else {
- /* The unmount has removed mp from the mountlist */
- }
+ TAILQ_FOREACH_REVERSE_SAFE(mp, &mountlist, mntlist, mnt_list, tmp) {
+ vfs_ref(mp);
+
+ /*
+ * Forcibly unmounting "/dev" before "/" would prevent clean
+ * unmount of the latter.
+ */
+ if (mp == rootdevmp)
+ continue;
+
+ unmount_or_warn(mp);
}
+
+ if (rootdevmp != NULL)
+ unmount_or_warn(rootdevmp);
}
/*
@@ -3563,11 +3660,11 @@
obj = vp->v_object;
if (obj != NULL) {
- VM_OBJECT_LOCK(obj);
+ VM_OBJECT_WLOCK(obj);
vm_object_page_clean(obj, 0, 0,
flags == MNT_WAIT ?
OBJPC_SYNC : OBJPC_NOSYNC);
- VM_OBJECT_UNLOCK(obj);
+ VM_OBJECT_WUNLOCK(obj);
}
vput(vp);
}
@@ -3577,16 +3674,25 @@
}
static void
-destroy_vpollinfo(struct vpollinfo *vi)
+destroy_vpollinfo_free(struct vpollinfo *vi)
{
- seldrain(&vi->vpi_selinfo);
+
knlist_destroy(&vi->vpi_selinfo.si_note);
mtx_destroy(&vi->vpi_lock);
uma_zfree(vnodepoll_zone, vi);
}
+static void
+destroy_vpollinfo(struct vpollinfo *vi)
+{
+
+ knlist_clear(&vi->vpi_selinfo.si_note, 1);
+ seldrain(&vi->vpi_selinfo);
+ destroy_vpollinfo_free(vi);
+}
+
/*
- * Initalize per-vnode helper structure to hold poll-related state.
+ * Initialize per-vnode helper structure to hold poll-related state.
*/
void
v_addpollinfo(struct vnode *vp)
@@ -3602,7 +3708,7 @@
VI_LOCK(vp);
if (vp->v_pollinfo != NULL) {
VI_UNLOCK(vp);
- destroy_vpollinfo(vi);
+ destroy_vpollinfo_free(vi);
return;
}
vp->v_pollinfo = vi;
@@ -3763,11 +3869,8 @@
* Walk the list of vnodes pushing all that are dirty and
* not already on the sync list.
*/
- mtx_lock(&mountlist_mtx);
- if (vfs_busy(mp, MBF_NOWAIT | MBF_MNTLSTLOCK) != 0) {
- mtx_unlock(&mountlist_mtx);
+ if (vfs_busy(mp, MBF_NOWAIT) != 0)
return (0);
- }
if (vn_start_write(NULL, &mp, V_NOWAIT) != 0) {
vfs_unbusy(mp);
return (0);
@@ -3828,11 +3931,13 @@
{
int error;
+ if (vp->v_type != VCHR) {
+ error = ENOTBLK;
+ goto out;
+ }
error = 0;
dev_lock();
- if (vp->v_type != VCHR)
- error = ENOTBLK;
- else if (vp->v_rdev == NULL)
+ if (vp->v_rdev == NULL)
error = ENXIO;
else if (vp->v_rdev->si_devsw == NULL)
error = ENXIO;
@@ -3839,6 +3944,7 @@
else if (!(vp->v_rdev->si_devsw->d_flags & D_DISK))
error = ENOTBLK;
dev_unlock();
+out:
if (errp != NULL)
*errp = error;
return (error == 0);
@@ -3997,7 +4103,7 @@
#ifdef DEBUG_VFS_LOCKS
/*
- * This only exists to supress warnings from unlocked specfs accesses. It is
+ * This only exists to suppress warnings from unlocked specfs accesses. It is
* no longer ok to have an unlocked VFS.
*/
#define IGNORE_LOCK(vp) (panicstr != NULL || (vp) == NULL || \
@@ -4268,6 +4374,15 @@
}
void
+vop_reclaim_post(void *ap, int rc)
+{
+ struct vop_reclaim_args *a = ap;
+
+ if (!rc)
+ VFS_KNOTE_LOCKED(a->a_vp, NOTE_REVOKE);
+}
+
+void
vop_remove_post(void *ap, int rc)
{
struct vop_remove_args *a = ap;
@@ -4282,10 +4397,27 @@
vop_rename_post(void *ap, int rc)
{
struct vop_rename_args *a = ap;
+ long hint;
if (!rc) {
- VFS_KNOTE_UNLOCKED(a->a_fdvp, NOTE_WRITE);
- VFS_KNOTE_UNLOCKED(a->a_tdvp, NOTE_WRITE);
+ hint = NOTE_WRITE;
+ if (a->a_fdvp == a->a_tdvp) {
+ if (a->a_tvp != NULL && a->a_tvp->v_type == VDIR)
+ hint |= NOTE_LINK;
+ VFS_KNOTE_UNLOCKED(a->a_fdvp, hint);
+ VFS_KNOTE_UNLOCKED(a->a_tdvp, hint);
+ } else {
+ hint |= NOTE_EXTEND;
+ if (a->a_fvp->v_type == VDIR)
+ hint |= NOTE_LINK;
+ VFS_KNOTE_UNLOCKED(a->a_fdvp, hint);
+
+ if (a->a_fvp->v_type == VDIR && a->a_tvp != NULL &&
+ a->a_tvp->v_type == VDIR)
+ hint &= ~NOTE_LINK;
+ VFS_KNOTE_UNLOCKED(a->a_tdvp, hint);
+ }
+
VFS_KNOTE_UNLOCKED(a->a_fvp, NOTE_RENAME);
if (a->a_tvp)
VFS_KNOTE_UNLOCKED(a->a_tvp, NOTE_DELETE);
@@ -4337,6 +4469,45 @@
VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
}
+void
+vop_open_post(void *ap, int rc)
+{
+ struct vop_open_args *a = ap;
+
+ if (!rc)
+ VFS_KNOTE_LOCKED(a->a_vp, NOTE_OPEN);
+}
+
+void
+vop_close_post(void *ap, int rc)
+{
+ struct vop_close_args *a = ap;
+
+ if (!rc && (a->a_cred != NOCRED || /* filter out revokes */
+ (a->a_vp->v_iflag & VI_DOOMED) == 0)) {
+ VFS_KNOTE_LOCKED(a->a_vp, (a->a_fflag & FWRITE) != 0 ?
+ NOTE_CLOSE_WRITE : NOTE_CLOSE);
+ }
+}
+
+void
+vop_read_post(void *ap, int rc)
+{
+ struct vop_read_args *a = ap;
+
+ if (!rc)
+ VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ);
+}
+
+void
+vop_readdir_post(void *ap, int rc)
+{
+ struct vop_readdir_args *a = ap;
+
+ if (!rc)
+ VFS_KNOTE_LOCKED(a->a_vp, NOTE_READ);
+}
+
static struct knlist fs_knlist;
static void
@@ -4516,6 +4687,7 @@
if (vp->v_pollinfo == NULL)
return (ENOMEM);
knl = &vp->v_pollinfo->vpi_selinfo.si_note;
+ vhold(vp);
knlist_add(knl, kn, 0);
return (0);
@@ -4531,6 +4703,7 @@
KASSERT(vp->v_pollinfo != NULL, ("Missing v_pollinfo"));
knlist_remove(&vp->v_pollinfo->vpi_selinfo.si_note, kn, 0);
+ vdrop(vp);
}
/*ARGSUSED*/
@@ -4545,7 +4718,7 @@
* filesystem is gone, so set the EOF flag and schedule
* the knote for deletion.
*/
- if (hint == NOTE_REVOKE) {
+ if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) {
VI_LOCK(vp);
kn->kn_flags |= (EV_EOF | EV_ONESHOT);
VI_UNLOCK(vp);
@@ -4574,7 +4747,7 @@
* filesystem is gone, so set the EOF flag and schedule
* the knote for deletion.
*/
- if (hint == NOTE_REVOKE)
+ if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD))
kn->kn_flags |= (EV_EOF | EV_ONESHOT);
kn->kn_data = 0;
@@ -4591,7 +4764,7 @@
VI_LOCK(vp);
if (kn->kn_sfflags & hint)
kn->kn_fflags |= hint;
- if (hint == NOTE_REVOKE) {
+ if (hint == NOTE_REVOKE || (hint == 0 && vp->v_type == VBAD)) {
kn->kn_flags |= EV_EOF;
VI_UNLOCK(vp);
return (1);
@@ -4627,6 +4800,7 @@
*ap->a_cookies = realloc(*ap->a_cookies,
(*ap->a_ncookies + 1) * sizeof(u_long), M_TEMP, M_WAITOK | M_ZERO);
(*ap->a_cookies)[*ap->a_ncookies] = off;
+ *ap->a_ncookies += 1;
return (0);
}
@@ -4642,7 +4816,6 @@
struct mount *mp;
mp = vp->v_mount;
- VFS_ASSERT_GIANT(mp);
ASSERT_VOP_LOCKED(vp, "vfs_mark_atime");
if (mp != NULL && (mp->mnt_flag & (MNT_NOATIME | MNT_RDONLY)) == 0)
(void)VOP_MARKATIME(vp);
@@ -4710,15 +4883,21 @@
struct vnode *vp;
if (should_yield())
- kern_yield(PRI_UNCHANGED);
+ kern_yield(PRI_USER);
MNT_ILOCK(mp);
KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
- vp = TAILQ_NEXT(*mvp, v_nmntvnodes);
- while (vp != NULL && (vp->v_type == VMARKER ||
- (vp->v_iflag & VI_DOOMED) != 0))
- vp = TAILQ_NEXT(vp, v_nmntvnodes);
-
- /* Check if we are done */
+ for (vp = TAILQ_NEXT(*mvp, v_nmntvnodes); vp != NULL;
+ vp = TAILQ_NEXT(vp, v_nmntvnodes)) {
+ /* Allow a racy peek at VI_DOOMED to save a lock acquisition. */
+ if (vp->v_type == VMARKER || (vp->v_iflag & VI_DOOMED) != 0)
+ continue;
+ VI_LOCK(vp);
+ if ((vp->v_iflag & VI_DOOMED) != 0) {
+ VI_UNLOCK(vp);
+ continue;
+ }
+ break;
+ }
if (vp == NULL) {
__mnt_vnode_markerfree_all(mvp, mp);
/* MNT_IUNLOCK(mp); -- done in above function */
@@ -4727,7 +4906,6 @@
}
TAILQ_REMOVE(&mp->mnt_nvnodelist, *mvp, v_nmntvnodes);
TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
- VI_LOCK(vp);
MNT_IUNLOCK(mp);
return (vp);
}
@@ -4740,14 +4918,20 @@
*mvp = malloc(sizeof(struct vnode), M_VNODE_MARKER, M_WAITOK | M_ZERO);
MNT_ILOCK(mp);
MNT_REF(mp);
+ (*mvp)->v_mount = mp;
(*mvp)->v_type = VMARKER;
- vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
- while (vp != NULL && (vp->v_type == VMARKER ||
- (vp->v_iflag & VI_DOOMED) != 0))
- vp = TAILQ_NEXT(vp, v_nmntvnodes);
-
- /* Check if we are done */
+ TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes) {
+ /* Allow a racy peek at VI_DOOMED to save a lock acquisition. */
+ if (vp->v_type == VMARKER || (vp->v_iflag & VI_DOOMED) != 0)
+ continue;
+ VI_LOCK(vp);
+ if ((vp->v_iflag & VI_DOOMED) != 0) {
+ VI_UNLOCK(vp);
+ continue;
+ }
+ break;
+ }
if (vp == NULL) {
MNT_REL(mp);
MNT_IUNLOCK(mp);
@@ -4755,14 +4939,11 @@
*mvp = NULL;
return (NULL);
}
- (*mvp)->v_mount = mp;
TAILQ_INSERT_AFTER(&mp->mnt_nvnodelist, vp, *mvp, v_nmntvnodes);
- VI_LOCK(vp);
MNT_IUNLOCK(mp);
return (vp);
}
-
void
__mnt_vnode_markerfree_all(struct vnode **mvp, struct mount *mp)
{
@@ -4799,12 +4980,6 @@
*mvp = NULL;
}
-#ifdef SMP
-#define ALWAYS_YIELD (mp_ncpus == 1)
-#else
-#define ALWAYS_YIELD 1
-#endif
-
static struct vnode *
mnt_vnode_next_active(struct vnode **mvp, struct mount *mp)
{
@@ -4821,10 +4996,10 @@
continue;
}
if (!VI_TRYLOCK(vp)) {
- if (ALWAYS_YIELD || should_yield()) {
+ if (mp_ncpus == 1 || should_yield()) {
TAILQ_INSERT_BEFORE(vp, *mvp, v_actfreelist);
mtx_unlock(&vnode_free_list_mtx);
- kern_yield(PRI_USER);
+ pause("vnacti", 1);
mtx_lock(&vnode_free_list_mtx);
goto restart;
}
@@ -4852,7 +5027,6 @@
KASSERT((vp->v_iflag & VI_ACTIVE) != 0, ("Non-active vp %p", vp));
return (vp);
}
-#undef ALWAYS_YIELD
struct vnode *
__mnt_vnode_next_active(struct vnode **mvp, struct mount *mp)
@@ -4859,7 +5033,7 @@
{
if (should_yield())
- kern_yield(PRI_UNCHANGED);
+ kern_yield(PRI_USER);
mtx_lock(&vnode_free_list_mtx);
return (mnt_vnode_next_active(mvp, mp));
}
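
The largest behavioral change in the vfs_subr.c hunks above is the removal of the per-bufobj
splay tree: buf_vlist_add()/buf_vlist_remove()/gbincore() now keep the clean and dirty buffer
lists sorted by b_lblkno and index them with a pctrie (BUF_PCTRIE_*). Below is a minimal
userspace sketch of that insertion order, offered only as an illustration: the struct xbuf,
xbuflist, and lookup_le() names are invented for the example, and the linear-scan lookup_le()
merely stands in for the kernel's BUF_PCTRIE_LOOKUP_LE(), which is sub-linear because it walks
a radix trie keyed by b_lblkno. It should build with the 4.4BSD <sys/queue.h> macros on a BSD
or Linux host.

/*
 * Sketch of the ordered-insert logic used by the new buf_vlist_add().
 * Not kernel code; types and helpers are illustrative stand-ins.
 */
#include <sys/queue.h>
#include <stdio.h>
#include <stdlib.h>

struct xbuf {
	long			lblkno;		/* logical block number */
	TAILQ_ENTRY(xbuf)	bobufs;		/* list linkage */
};
TAILQ_HEAD(xbuflist, xbuf);

/* Stand-in for BUF_PCTRIE_LOOKUP_LE(): greatest lblkno <= key, or NULL. */
static struct xbuf *
lookup_le(struct xbuflist *hd, long key)
{
	struct xbuf *bp, *best = NULL;

	TAILQ_FOREACH(bp, hd, bobufs) {
		if (bp->lblkno > key)
			break;
		best = bp;
	}
	return (best);
}

/* Mirrors the tail-first insertion order of the new buf_vlist_add(). */
static void
vlist_add(struct xbuflist *hd, struct xbuf *bp)
{
	struct xbuf *n;

	if (TAILQ_EMPTY(hd) ||
	    bp->lblkno > TAILQ_LAST(hd, xbuflist)->lblkno)
		TAILQ_INSERT_TAIL(hd, bp, bobufs);
	else if ((n = lookup_le(hd, bp->lblkno)) == NULL)
		TAILQ_INSERT_HEAD(hd, bp, bobufs);
	else
		TAILQ_INSERT_AFTER(hd, n, bp, bobufs);
}

int
main(void)
{
	struct xbuflist hd = TAILQ_HEAD_INITIALIZER(hd);
	long keys[] = { 4, 1, 9, 6, 2 };
	struct xbuf *bp;
	size_t i;

	for (i = 0; i < sizeof(keys) / sizeof(keys[0]); i++) {
		bp = calloc(1, sizeof(*bp));
		bp->lblkno = keys[i];
		vlist_add(&hd, bp);
	}
	TAILQ_FOREACH(bp, &hd, bobufs)
		printf("%ld ", bp->lblkno);	/* prints: 1 2 4 6 9 */
	printf("\n");
	return (0);
}

In the kernel the common case is appending at the tail (files tend to grow), which is why the
comment in the diff notes that LOOKUP_LE is usually cheaper than a _GE lookup; gbincore() then
becomes a pair of trie lookups instead of two splay operations that mutated the tree on every
miss.
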
Modified: trunk/sys/kern/vfs_syscalls.c
===================================================================
--- trunk/sys/kern/vfs_syscalls.c 2018-05-25 21:07:58 UTC (rev 9951)
+++ trunk/sys/kern/vfs_syscalls.c 2018-05-26 14:24:52 UTC (rev 9952)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1989, 1993
* The Regents of the University of California. All rights reserved.
@@ -35,7 +36,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/vfs_syscalls.c 325099 2017-10-29 09:48:28Z kib $");
#include "opt_capsicum.h"
#include "opt_compat.h"
@@ -46,7 +47,7 @@
#include <sys/systm.h>
#include <sys/bio.h>
#include <sys/buf.h>
-#include <sys/capability.h>
+#include <sys/capsicum.h>
#include <sys/disk.h>
#include <sys/sysent.h>
#include <sys/malloc.h>
@@ -61,6 +62,7 @@
#include <sys/filio.h>
#include <sys/limits.h>
#include <sys/linker.h>
+#include <sys/rwlock.h>
#include <sys/sdt.h>
#include <sys/stat.h>
#include <sys/sx.h>
@@ -91,16 +93,18 @@
MALLOC_DEFINE(M_FADVISE, "fadvise", "posix_fadvise(2) information");
SDT_PROVIDER_DEFINE(vfs);
-SDT_PROBE_DEFINE(vfs, , stat, mode, mode);
-SDT_PROBE_ARGTYPE(vfs, , stat, mode, 0, "char *");
-SDT_PROBE_ARGTYPE(vfs, , stat, mode, 1, "int");
-SDT_PROBE_DEFINE(vfs, , stat, reg, reg);
-SDT_PROBE_ARGTYPE(vfs, , stat, reg, 0, "char *");
-SDT_PROBE_ARGTYPE(vfs, , stat, reg, 1, "int");
+SDT_PROBE_DEFINE2(vfs, , stat, mode, "char *", "int");
+SDT_PROBE_DEFINE2(vfs, , stat, reg, "char *", "int");
static int chroot_refuse_vdir_fds(struct filedesc *fdp);
+static int kern_chflags(struct thread *td, const char *path,
+ enum uio_seg pathseg, u_long flags);
+static int kern_chflagsat(struct thread *td, int fd, const char *path,
+ enum uio_seg pathseg, u_long flags, int atflag);
+static int setfflags(struct thread *td, struct vnode *, u_long);
static int getutimes(const struct timeval *, enum uio_seg, struct timespec *);
-static int setfflags(struct thread *td, struct vnode *, int);
+static int getutimens(const struct timespec *, enum uio_seg,
+ struct timespec *, int *);
static int setutimes(struct thread *td, struct vnode *,
const struct timespec *, int, int);
static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred,
@@ -114,11 +118,6 @@
*/
int async_io_version;
-#ifdef DEBUG
-static int syncprt = 0;
-SYSCTL_INT(_debug, OID_AUTO, syncprt, CTLFLAG_RW, &syncprt, 0, "");
-#endif
-
/*
* Sync each mounted filesystem.
*/
@@ -134,7 +133,7 @@
struct sync_args *uap;
{
struct mount *mp, *nmp;
- int save, vfslocked;
+ int save;
mtx_lock(&mountlist_mtx);
for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
@@ -142,7 +141,6 @@
nmp = TAILQ_NEXT(mp, mnt_list);
continue;
}
- vfslocked = VFS_LOCK_GIANT(mp);
if ((mp->mnt_flag & MNT_RDONLY) == 0 &&
vn_start_write(NULL, &mp, V_NOWAIT) == 0) {
save = curthread_pflags_set(TDP_SYNCIO);
@@ -151,7 +149,6 @@
curthread_pflags_restore(save);
vn_finished_write(mp);
}
- VFS_UNLOCK_GIANT(vfslocked);
mtx_lock(&mountlist_mtx);
nmp = TAILQ_NEXT(mp, mnt_list);
vfs_unbusy(mp);
@@ -182,19 +179,17 @@
} */ *uap;
{
struct mount *mp;
- int vfslocked;
+ struct nameidata nd;
int error;
- struct nameidata nd;
AUDIT_ARG_CMD(uap->cmd);
AUDIT_ARG_UID(uap->uid);
if (!prison_allow(td->td_ucred, PR_ALLOW_QUOTAS))
return (EPERM);
- NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
- UIO_USERSPACE, uap->path, td);
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
+ uap->path, td);
if ((error = namei(&nd)) != 0)
return (error);
- vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
mp = nd.ni_vp->v_mount;
vfs_ref(mp);
@@ -201,10 +196,8 @@
vput(nd.ni_vp);
error = vfs_busy(mp, 0);
vfs_rel(mp);
- if (error) {
- VFS_UNLOCK_GIANT(vfslocked);
+ if (error != 0)
return (error);
- }
error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, uap->arg);
/*
@@ -220,7 +213,6 @@
*/
if ((uap->cmd >> SUBCMDSHIFT) != Q_QUOTAON)
vfs_unbusy(mp);
- VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
@@ -265,6 +257,43 @@
sf->f_bavail >>= shift;
}
+static int
+kern_do_statfs(struct thread *td, struct mount *mp, struct statfs *buf)
+{
+ struct statfs *sp;
+ int error;
+
+ if (mp == NULL)
+ return (EBADF);
+ error = vfs_busy(mp, 0);
+ vfs_rel(mp);
+ if (error != 0)
+ return (error);
+#ifdef MAC
+ error = mac_mount_check_stat(td->td_ucred, mp);
+ if (error != 0)
+ goto out;
+#endif
+ /*
+ * Set these in case the underlying filesystem fails to do so.
+ */
+ sp = &mp->mnt_stat;
+ sp->f_version = STATFS_VERSION;
+ sp->f_namemax = NAME_MAX;
+ sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
+ error = VFS_STATFS(mp, sp);
+ if (error != 0)
+ goto out;
+ *buf = *sp;
+ if (priv_check(td, PRIV_VFS_GENERATION)) {
+ buf->f_fsid.val[0] = buf->f_fsid.val[1] = 0;
+ prison_enforce_statfs(td->td_ucred, mp, buf);
+ }
+out:
+ vfs_unbusy(mp);
+ return (error);
+}
+
/*
* Get filesystem statistics.
*/
@@ -296,53 +325,19 @@
struct statfs *buf)
{
struct mount *mp;
- struct statfs *sp, sb;
- int vfslocked;
+ struct nameidata nd;
int error;
- struct nameidata nd;
- NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | MPSAFE |
- AUDITVNODE1, pathseg, path, td);
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
+ pathseg, path, td);
error = namei(&nd);
- if (error)
+ if (error != 0)
return (error);
- vfslocked = NDHASGIANT(&nd);
mp = nd.ni_vp->v_mount;
vfs_ref(mp);
NDFREE(&nd, NDF_ONLY_PNBUF);
vput(nd.ni_vp);
- error = vfs_busy(mp, 0);
- vfs_rel(mp);
- if (error) {
- VFS_UNLOCK_GIANT(vfslocked);
- return (error);
- }
-#ifdef MAC
- error = mac_mount_check_stat(td->td_ucred, mp);
- if (error)
- goto out;
-#endif
- /*
- * Set these in case the underlying filesystem fails to do so.
- */
- sp = &mp->mnt_stat;
- sp->f_version = STATFS_VERSION;
- sp->f_namemax = NAME_MAX;
- sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
- error = VFS_STATFS(mp, sp);
- if (error)
- goto out;
- if (priv_check(td, PRIV_VFS_GENERATION)) {
- bcopy(sp, &sb, sizeof(sb));
- sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
- prison_enforce_statfs(td->td_ucred, mp, &sb);
- sp = &sb;
- }
- *buf = *sp;
-out:
- vfs_unbusy(mp);
- VFS_UNLOCK_GIANT(vfslocked);
- return (error);
+ return (kern_do_statfs(td, mp, buf));
}
/*
@@ -376,63 +371,26 @@
{
struct file *fp;
struct mount *mp;
- struct statfs *sp, sb;
- int vfslocked;
struct vnode *vp;
+ cap_rights_t rights;
int error;
AUDIT_ARG_FD(fd);
- error = getvnode(td->td_proc->p_fd, fd, CAP_FSTATFS, &fp);
- if (error)
+ error = getvnode(td->td_proc->p_fd, fd,
+ cap_rights_init(&rights, CAP_FSTATFS), &fp);
+ if (error != 0)
return (error);
vp = fp->f_vnode;
- vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vn_lock(vp, LK_SHARED | LK_RETRY);
#ifdef AUDIT
AUDIT_ARG_VNODE1(vp);
#endif
mp = vp->v_mount;
- if (mp)
+ if (mp != NULL)
vfs_ref(mp);
VOP_UNLOCK(vp, 0);
fdrop(fp, td);
- if (mp == NULL) {
- error = EBADF;
- goto out;
- }
- error = vfs_busy(mp, 0);
- vfs_rel(mp);
- if (error) {
- VFS_UNLOCK_GIANT(vfslocked);
- return (error);
- }
-#ifdef MAC
- error = mac_mount_check_stat(td->td_ucred, mp);
- if (error)
- goto out;
-#endif
- /*
- * Set these in case the underlying filesystem fails to do so.
- */
- sp = &mp->mnt_stat;
- sp->f_version = STATFS_VERSION;
- sp->f_namemax = NAME_MAX;
- sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
- error = VFS_STATFS(mp, sp);
- if (error)
- goto out;
- if (priv_check(td, PRIV_VFS_GENERATION)) {
- bcopy(sp, &sb, sizeof(sb));
- sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
- prison_enforce_statfs(td->td_ucred, mp, &sb);
- sp = &sb;
- }
- *buf = *sp;
-out:
- if (mp)
- vfs_unbusy(mp);
- VFS_UNLOCK_GIANT(vfslocked);
- return (error);
+ return (kern_do_statfs(td, mp, buf));
}
/*
@@ -461,7 +419,7 @@
/*
* If (bufsize > 0 && bufseg == UIO_SYSSPACE)
- * The caller is responsible for freeing memory which will be allocated
+ * The caller is responsible for freeing memory which will be allocated
* in '*buf'.
*/
int
@@ -471,7 +429,6 @@
struct mount *mp, *nmp;
struct statfs *sfsp, *sp, sb;
size_t count, maxcount;
- int vfslocked;
int error;
maxcount = bufsize / sizeof(struct statfs);
@@ -508,8 +465,7 @@
nmp = TAILQ_NEXT(mp, mnt_list);
continue;
}
- vfslocked = VFS_LOCK_GIANT(mp);
- if (sfsp && count < maxcount) {
+ if (sfsp != NULL && count < maxcount) {
sp = &mp->mnt_stat;
/*
* Set these in case the underlying filesystem
@@ -526,7 +482,6 @@
if (((flags & (MNT_LAZY|MNT_NOWAIT)) == 0 ||
(flags & MNT_WAIT)) &&
(error = VFS_STATFS(mp, sp))) {
- VFS_UNLOCK_GIANT(vfslocked);
mtx_lock(&mountlist_mtx);
nmp = TAILQ_NEXT(mp, mnt_list);
vfs_unbusy(mp);
@@ -542,15 +497,13 @@
bcopy(sp, sfsp, sizeof(*sp));
else /* if (bufseg == UIO_USERSPACE) */ {
error = copyout(sp, sfsp, sizeof(*sp));
- if (error) {
+ if (error != 0) {
vfs_unbusy(mp);
- VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
}
sfsp++;
}
- VFS_UNLOCK_GIANT(vfslocked);
count++;
mtx_lock(&mountlist_mtx);
nmp = TAILQ_NEXT(mp, mnt_list);
@@ -557,7 +510,7 @@
vfs_unbusy(mp);
}
mtx_unlock(&mountlist_mtx);
- if (sfsp && count > maxcount)
+ if (sfsp != NULL && count > maxcount)
td->td_retval[0] = maxcount;
else
td->td_retval[0] = count;
@@ -589,7 +542,7 @@
int error;
error = kern_statfs(td, uap->path, UIO_USERSPACE, &sf);
- if (error)
+ if (error != 0)
return (error);
cvtstatfs(&sf, &osb);
return (copyout(&osb, uap->buf, sizeof(osb)));
@@ -617,7 +570,7 @@
int error;
error = kern_fstatfs(td, uap->fd, &sf);
- if (error)
+ if (error != 0)
return (error);
cvtstatfs(&sf, &osb);
return (copyout(&osb, uap->buf, sizeof(osb)));
@@ -688,10 +641,10 @@
int error;
error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
- if (error)
+ if (error != 0)
return (error);
error = kern_fhstatfs(td, fh, &sf);
- if (error)
+ if (error != 0)
return (error);
cvtstatfs(&sf, &osb);
return (copyout(&osb, uap->buf, sizeof(osb)));
@@ -751,49 +704,40 @@
struct vnode *vp, *tdp, *vpold;
struct mount *mp;
struct file *fp;
- int vfslocked;
+ cap_rights_t rights;
int error;
AUDIT_ARG_FD(uap->fd);
- if ((error = getvnode(fdp, uap->fd, CAP_FCHDIR, &fp)) != 0)
+ error = getvnode(fdp, uap->fd, cap_rights_init(&rights, CAP_FCHDIR),
+ &fp);
+ if (error != 0)
return (error);
vp = fp->f_vnode;
VREF(vp);
fdrop(fp, td);
- vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vn_lock(vp, LK_SHARED | LK_RETRY);
AUDIT_ARG_VNODE1(vp);
error = change_dir(vp, td);
while (!error && (mp = vp->v_mountedhere) != NULL) {
- int tvfslocked;
if (vfs_busy(mp, 0))
continue;
- tvfslocked = VFS_LOCK_GIANT(mp);
error = VFS_ROOT(mp, LK_SHARED, &tdp);
vfs_unbusy(mp);
- if (error) {
- VFS_UNLOCK_GIANT(tvfslocked);
+ if (error != 0)
break;
- }
vput(vp);
- VFS_UNLOCK_GIANT(vfslocked);
vp = tdp;
- vfslocked = tvfslocked;
}
- if (error) {
+ if (error != 0) {
vput(vp);
- VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
VOP_UNLOCK(vp, 0);
- VFS_UNLOCK_GIANT(vfslocked);
FILEDESC_XLOCK(fdp);
vpold = fdp->fd_cdir;
fdp->fd_cdir = vp;
FILEDESC_XUNLOCK(fdp);
- vfslocked = VFS_LOCK_GIANT(vpold->v_mount);
vrele(vpold);
- VFS_UNLOCK_GIANT(vfslocked);
return (0);
}
@@ -820,32 +764,26 @@
kern_chdir(struct thread *td, char *path, enum uio_seg pathseg)
{
register struct filedesc *fdp = td->td_proc->p_fd;
- int error;
struct nameidata nd;
struct vnode *vp;
- int vfslocked;
+ int error;
- NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1 |
- MPSAFE, pathseg, path, td);
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
+ pathseg, path, td);
if ((error = namei(&nd)) != 0)
return (error);
- vfslocked = NDHASGIANT(&nd);
if ((error = change_dir(nd.ni_vp, td)) != 0) {
vput(nd.ni_vp);
- VFS_UNLOCK_GIANT(vfslocked);
NDFREE(&nd, NDF_ONLY_PNBUF);
return (error);
}
VOP_UNLOCK(nd.ni_vp, 0);
- VFS_UNLOCK_GIANT(vfslocked);
NDFREE(&nd, NDF_ONLY_PNBUF);
FILEDESC_XLOCK(fdp);
vp = fdp->fd_cdir;
fdp->fd_cdir = nd.ni_vp;
FILEDESC_XUNLOCK(fdp);
- vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vrele(vp);
- VFS_UNLOCK_GIANT(vfslocked);
return (0);
}
@@ -863,7 +801,7 @@
FILEDESC_LOCK_ASSERT(fdp);
- for (fd = 0; fd < fdp->fd_nfiles ; fd++) {
+ for (fd = 0; fd <= fdp->fd_lastfile; fd++) {
fp = fget_locked(fdp, fd);
if (fp == NULL)
continue;
@@ -887,7 +825,8 @@
static int chroot_allow_open_directories = 1;
SYSCTL_INT(_kern, OID_AUTO, chroot_allow_open_directories, CTLFLAG_RW,
- &chroot_allow_open_directories, 0, "");
+ &chroot_allow_open_directories, 0,
+ "Allow a process to chroot(2) if it has a directory open");
/*
* Change notion of root (``/'') directory.
@@ -904,34 +843,32 @@
char *path;
} */ *uap;
{
+ struct nameidata nd;
int error;
- struct nameidata nd;
- int vfslocked;
error = priv_check(td, PRIV_VFS_CHROOT);
- if (error)
+ if (error != 0)
return (error);
- NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | MPSAFE |
- AUDITVNODE1, UIO_USERSPACE, uap->path, td);
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
+ UIO_USERSPACE, uap->path, td);
error = namei(&nd);
- if (error)
+ if (error != 0)
goto error;
- vfslocked = NDHASGIANT(&nd);
- if ((error = change_dir(nd.ni_vp, td)) != 0)
+ error = change_dir(nd.ni_vp, td);
+ if (error != 0)
goto e_vunlock;
#ifdef MAC
- if ((error = mac_vnode_check_chroot(td->td_ucred, nd.ni_vp)))
+ error = mac_vnode_check_chroot(td->td_ucred, nd.ni_vp);
+ if (error != 0)
goto e_vunlock;
#endif
VOP_UNLOCK(nd.ni_vp, 0);
error = change_root(nd.ni_vp, td);
vrele(nd.ni_vp);
- VFS_UNLOCK_GIANT(vfslocked);
NDFREE(&nd, NDF_ONLY_PNBUF);
return (error);
e_vunlock:
vput(nd.ni_vp);
- VFS_UNLOCK_GIANT(vfslocked);
error:
NDFREE(&nd, NDF_ONLY_PNBUF);
return (error);
@@ -946,7 +883,9 @@
struct vnode *vp;
struct thread *td;
{
+#ifdef MAC
int error;
+#endif
ASSERT_VOP_LOCKED(vp, "change_dir(): vp not locked");
if (vp->v_type != VDIR)
@@ -953,11 +892,10 @@
return (ENOTDIR);
#ifdef MAC
error = mac_vnode_check_chdir(td->td_ucred, vp);
- if (error)
+ if (error != 0)
return (error);
#endif
- error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td);
- return (error);
+ return (VOP_ACCESS(vp, VEXEC, td->td_ucred, td));
}
/*
@@ -972,16 +910,14 @@
{
struct filedesc *fdp;
struct vnode *oldvp;
- int vfslocked;
int error;
- VFS_ASSERT_GIANT(vp->v_mount);
fdp = td->td_proc->p_fd;
FILEDESC_XLOCK(fdp);
if (chroot_allow_open_directories == 0 ||
(chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) {
error = chroot_refuse_vdir_fds(fdp);
- if (error) {
+ if (error != 0) {
FILEDESC_XUNLOCK(fdp);
return (error);
}
@@ -994,45 +930,43 @@
VREF(fdp->fd_jdir);
}
FILEDESC_XUNLOCK(fdp);
- vfslocked = VFS_LOCK_GIANT(oldvp->v_mount);
vrele(oldvp);
- VFS_UNLOCK_GIANT(vfslocked);
return (0);
}
-static __inline cap_rights_t
-flags_to_rights(int flags)
+static __inline void
+flags_to_rights(int flags, cap_rights_t *rightsp)
{
- cap_rights_t rights = 0;
- switch ((flags & O_ACCMODE)) {
- case O_RDONLY:
- rights |= CAP_READ;
- break;
-
- case O_RDWR:
- rights |= CAP_READ;
- /* fall through */
-
- case O_WRONLY:
- rights |= CAP_WRITE;
- break;
-
- case O_EXEC:
- rights |= CAP_FEXECVE;
- break;
+ if (flags & O_EXEC) {
+ cap_rights_set(rightsp, CAP_FEXECVE);
+ } else {
+ switch ((flags & O_ACCMODE)) {
+ case O_RDONLY:
+ cap_rights_set(rightsp, CAP_READ);
+ break;
+ case O_RDWR:
+ cap_rights_set(rightsp, CAP_READ);
+ /* FALLTHROUGH */
+ case O_WRONLY:
+ cap_rights_set(rightsp, CAP_WRITE);
+ if (!(flags & (O_APPEND | O_TRUNC)))
+ cap_rights_set(rightsp, CAP_SEEK);
+ break;
+ }
}
if (flags & O_CREAT)
- rights |= CAP_CREATE;
+ cap_rights_set(rightsp, CAP_CREATE);
if (flags & O_TRUNC)
- rights |= CAP_FTRUNCATE;
+ cap_rights_set(rightsp, CAP_FTRUNCATE);
- if ((flags & O_EXLOCK) || (flags & O_SHLOCK))
- rights |= CAP_FLOCK;
+ if (flags & (O_SYNC | O_FSYNC))
+ cap_rights_set(rightsp, CAP_FSYNC);
- return (rights);
+ if (flags & (O_EXLOCK | O_SHLOCK))
+ cap_rights_set(rightsp, CAP_FLOCK);
}
/*
@@ -1091,18 +1025,17 @@
struct filedesc *fdp = p->p_fd;
struct file *fp;
struct vnode *vp;
- int cmode;
- struct file *nfp;
- int type, indx = -1, error, error_open;
- struct flock lf;
struct nameidata nd;
- int vfslocked;
- cap_rights_t rights_needed = CAP_LOOKUP;
+ cap_rights_t rights;
+ int cmode, error, indx;
+ indx = -1;
+
AUDIT_ARG_FFLAGS(flags);
AUDIT_ARG_MODE(mode);
/* XXX: audit dirfd */
- rights_needed |= flags_to_rights(flags);
+ cap_rights_init(&rights, CAP_LOOKUP);
+ flags_to_rights(flags, &rights);
/*
* Only one of the O_EXEC, O_RDONLY, O_WRONLY and O_RDWR flags
* may be specified.
@@ -1110,27 +1043,30 @@
if (flags & O_EXEC) {
if (flags & O_ACCMODE)
return (EINVAL);
- } else if ((flags & O_ACCMODE) == O_ACCMODE)
+ } else if ((flags & O_ACCMODE) == O_ACCMODE) {
return (EINVAL);
- else
+ } else {
flags = FFLAGS(flags);
+ }
/*
- * allocate the file descriptor, but don't install a descriptor yet
+ * Allocate the file descriptor, but don't install a descriptor yet.
*/
- error = falloc_noinstall(td, &nfp);
- if (error)
+ error = falloc_noinstall(td, &fp);
+ if (error != 0)
return (error);
- /* An extra reference on `nfp' has been held for us by falloc_noinstall(). */
- fp = nfp;
+ /*
+ * An extra reference on `fp' has been held for us by
+ * falloc_noinstall().
+ */
/* Set the flags early so the finit in devfs can pick them up. */
fp->f_flag = flags & FMASK;
- cmode = ((mode &~ fdp->fd_cmask) & ALLPERMS) &~ S_ISTXT;
- NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1 | MPSAFE, pathseg,
- path, fd, rights_needed, td);
+ cmode = ((mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT;
+ NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, fd,
+ &rights, td);
td->td_dupfd = -1; /* XXX check for fdopen */
error = vn_open(&nd, &flags, cmode, fp);
- if (error) {
+ if (error != 0) {
/*
* If the vn_open replaced the method vector, something
* wonderous happened deep below and we just pass it up
@@ -1140,37 +1076,24 @@
goto success;
/*
- * handle special fdopen() case. bleh. dupfdopen() is
- * responsible for dropping the old contents of ofiles[indx]
- * if it succeeds.
+ * Handle special fdopen() case. bleh.
*
* Don't do this for relative (capability) lookups; we don't
* understand exactly what would happen, and we don't think
* that it ever should.
*/
- if ((nd.ni_strictrelative == 0) &&
+ if (nd.ni_strictrelative == 0 &&
(error == ENODEV || error == ENXIO) &&
- (td->td_dupfd >= 0)) {
- /* XXX from fdopen */
- error_open = error;
- if ((error = finstall(td, fp, &indx, flags)) != 0)
- goto bad_unlocked;
- if ((error = dupfdopen(td, fdp, indx, td->td_dupfd,
- flags, error_open)) == 0)
+ td->td_dupfd >= 0) {
+ error = dupfdopen(td, fdp, td->td_dupfd, flags, error,
+ &indx);
+ if (error == 0)
goto success;
}
- /*
- * Clean up the descriptor, but only if another thread hadn't
- * replaced or closed it.
- */
- if (indx != -1)
- fdclose(fdp, fp, indx, td);
- fdrop(fp, td);
- return (error);
+ goto bad;
}
td->td_dupfd = 0;
- vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
vp = nd.ni_vp;
@@ -1187,51 +1110,37 @@
if (fp->f_ops == &badfileops) {
KASSERT(vp->v_type != VFIFO, ("Unexpected fifo."));
fp->f_seqcount = 1;
- finit(fp, flags & FMASK, DTYPE_VNODE, vp, &vnops);
+ finit(fp, (flags & FMASK) | (fp->f_flag & FHASLOCK),
+ DTYPE_VNODE, vp, &vnops);
}
VOP_UNLOCK(vp, 0);
- if (fp->f_type == DTYPE_VNODE && (flags & (O_EXLOCK | O_SHLOCK)) != 0) {
- lf.l_whence = SEEK_SET;
- lf.l_start = 0;
- lf.l_len = 0;
- if (flags & O_EXLOCK)
- lf.l_type = F_WRLCK;
- else
- lf.l_type = F_RDLCK;
- type = F_FLOCK;
- if ((flags & FNONBLOCK) == 0)
- type |= F_WAIT;
- if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
- type)) != 0)
- goto bad;
- atomic_set_int(&fp->f_flag, FHASLOCK);
- }
if (flags & O_TRUNC) {
error = fo_truncate(fp, 0, td->td_ucred, td);
- if (error)
+ if (error != 0)
goto bad;
}
- VFS_UNLOCK_GIANT(vfslocked);
success:
/*
* If we haven't already installed the FD (for dupfdopen), do so now.
*/
if (indx == -1) {
+ struct filecaps *fcaps;
+
#ifdef CAPABILITIES
- if (nd.ni_strictrelative == 1) {
- /*
- * We are doing a strict relative lookup; wrap the
- * result in a capability.
- */
- if ((error = kern_capwrap(td, fp, nd.ni_baserights,
- &indx)) != 0)
- goto bad_unlocked;
- } else
+ if (nd.ni_strictrelative == 1)
+ fcaps = &nd.ni_filecaps;
+ else
#endif
- if ((error = finstall(td, fp, &indx, flags)) != 0)
- goto bad_unlocked;
-
+ fcaps = NULL;
+ error = finstall(td, fp, &indx, flags, fcaps);
+ /* On success finstall() consumes fcaps. */
+ if (error != 0) {
+ filecaps_free(&nd.ni_filecaps);
+ goto bad;
+ }
+ } else {
+ filecaps_free(&nd.ni_filecaps);
}
/*
@@ -1242,12 +1151,8 @@
td->td_retval[0] = indx;
return (0);
bad:
- VFS_UNLOCK_GIANT(vfslocked);
-bad_unlocked:
- if (indx != -1)
- fdclose(fdp, fp, indx, td);
+ KASSERT(indx == -1, ("indx=%d, should be -1", indx));
fdrop(fp, td);
- td->td_retval[0] = -1;
return (error);
}
@@ -1329,10 +1234,9 @@
struct vnode *vp;
struct mount *mp;
struct vattr vattr;
- int error;
- int whiteout = 0;
struct nameidata nd;
- int vfslocked;
+ cap_rights_t rights;
+ int error, whiteout = 0;
AUDIT_ARG_MODE(mode);
AUDIT_ARG_DEV(dev);
@@ -1340,10 +1244,9 @@
case S_IFCHR:
case S_IFBLK:
error = priv_check(td, PRIV_VFS_MKNOD_DEV);
+ if (error == 0 && dev == VNOVAL)
+ error = EINVAL;
break;
- case S_IFMT:
- error = priv_check(td, PRIV_VFS_MKNOD_BAD);
- break;
case S_IFWHT:
error = priv_check(td, PRIV_VFS_MKNOD_WHT);
break;
@@ -1355,16 +1258,15 @@
error = EINVAL;
break;
}
- if (error)
+ if (error != 0)
return (error);
restart:
bwillwrite();
- NDINIT_ATRIGHTS(&nd, CREATE,
- LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1, pathseg, path, fd,
- CAP_MKFIFO, td);
+ NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
+ NOCACHE, pathseg, path, fd, cap_rights_init(&rights, CAP_MKNODAT),
+ td);
if ((error = namei(&nd)) != 0)
return (error);
- vfslocked = NDHASGIANT(&nd);
vp = nd.ni_vp;
if (vp != NULL) {
NDFREE(&nd, NDF_ONLY_PNBUF);
@@ -1373,7 +1275,6 @@
else
vput(nd.ni_dvp);
vrele(vp);
- VFS_UNLOCK_GIANT(vfslocked);
return (EEXIST);
} else {
VATTR_NULL(&vattr);
@@ -1383,9 +1284,6 @@
whiteout = 0;
switch (mode & S_IFMT) {
- case S_IFMT: /* used by badsect to flag bad sectors */
- vattr.va_type = VBAD;
- break;
case S_IFCHR:
vattr.va_type = VCHR;
break;
@@ -1402,7 +1300,6 @@
if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
NDFREE(&nd, NDF_ONLY_PNBUF);
vput(nd.ni_dvp);
- VFS_UNLOCK_GIANT(vfslocked);
if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
return (error);
goto restart;
@@ -1412,7 +1309,7 @@
error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp,
&nd.ni_cnd, &vattr);
#endif
- if (!error) {
+ if (error == 0) {
if (whiteout)
error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE);
else {
@@ -1425,7 +1322,6 @@
NDFREE(&nd, NDF_ONLY_PNBUF);
vput(nd.ni_dvp);
vn_finished_write(mp);
- VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
@@ -1478,18 +1374,18 @@
{
struct mount *mp;
struct vattr vattr;
+ struct nameidata nd;
+ cap_rights_t rights;
int error;
- struct nameidata nd;
- int vfslocked;
AUDIT_ARG_MODE(mode);
restart:
bwillwrite();
- NDINIT_AT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1,
- pathseg, path, fd, td);
+ NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
+ NOCACHE, pathseg, path, fd, cap_rights_init(&rights, CAP_MKFIFOAT),
+ td);
if ((error = namei(&nd)) != 0)
return (error);
- vfslocked = NDHASGIANT(&nd);
if (nd.ni_vp != NULL) {
NDFREE(&nd, NDF_ONLY_PNBUF);
if (nd.ni_vp == nd.ni_dvp)
@@ -1497,13 +1393,11 @@
else
vput(nd.ni_dvp);
vrele(nd.ni_vp);
- VFS_UNLOCK_GIANT(vfslocked);
return (EEXIST);
}
if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
NDFREE(&nd, NDF_ONLY_PNBUF);
vput(nd.ni_dvp);
- VFS_UNLOCK_GIANT(vfslocked);
if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
return (error);
goto restart;
@@ -1514,7 +1408,7 @@
#ifdef MAC
error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
&vattr);
- if (error)
+ if (error != 0)
goto out;
#endif
error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
@@ -1525,7 +1419,6 @@
#endif
vput(nd.ni_dvp);
vn_finished_write(mp);
- VFS_UNLOCK_GIANT(vfslocked);
NDFREE(&nd, NDF_ONLY_PNBUF);
return (error);
}
@@ -1599,13 +1492,13 @@
if (hardlink_check_uid && cred->cr_uid != va.va_uid) {
error = priv_check_cred(cred, PRIV_VFS_LINK, 0);
- if (error)
+ if (error != 0)
return (error);
}
if (hardlink_check_gid && !groupmember(va.va_gid, cred)) {
error = priv_check_cred(cred, PRIV_VFS_LINK, 0);
- if (error)
+ if (error != 0)
return (error);
}
@@ -1626,59 +1519,81 @@
struct vnode *vp;
struct mount *mp;
struct nameidata nd;
- int vfslocked;
- int lvfslocked;
+ cap_rights_t rights;
int error;
+again:
bwillwrite();
- NDINIT_AT(&nd, LOOKUP, follow | MPSAFE | AUDITVNODE1, segflg, path1,
- fd1, td);
+ NDINIT_AT(&nd, LOOKUP, follow | AUDITVNODE1, segflg, path1, fd1, td);
if ((error = namei(&nd)) != 0)
return (error);
- vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
vp = nd.ni_vp;
if (vp->v_type == VDIR) {
vrele(vp);
- VFS_UNLOCK_GIANT(vfslocked);
return (EPERM); /* POSIX */
}
- if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
- vrele(vp);
- VFS_UNLOCK_GIANT(vfslocked);
- return (error);
- }
- NDINIT_AT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE2,
- segflg, path2, fd2, td);
+ NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE2 |
+ NOCACHE, segflg, path2, fd2, cap_rights_init(&rights, CAP_LINKAT),
+ td);
if ((error = namei(&nd)) == 0) {
- lvfslocked = NDHASGIANT(&nd);
if (nd.ni_vp != NULL) {
+ NDFREE(&nd, NDF_ONLY_PNBUF);
if (nd.ni_dvp == nd.ni_vp)
vrele(nd.ni_dvp);
else
vput(nd.ni_dvp);
vrele(nd.ni_vp);
- error = EEXIST;
- } else if ((error = vn_lock(vp, LK_EXCLUSIVE | LK_RETRY))
- == 0) {
+ vrele(vp);
+ return (EEXIST);
+ } else if (nd.ni_dvp->v_mount != vp->v_mount) {
+ /*
+ * Cross-device link. No need to recheck
+ * vp->v_type, since it cannot change, except
+ * to VBAD.
+ */
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vput(nd.ni_dvp);
+ vrele(vp);
+ return (EXDEV);
+ } else if ((error = vn_lock(vp, LK_EXCLUSIVE)) == 0) {
error = can_hardlink(vp, td->td_ucred);
+#ifdef MAC
if (error == 0)
-#ifdef MAC
error = mac_vnode_check_link(td->td_ucred,
nd.ni_dvp, vp, &nd.ni_cnd);
- if (error == 0)
#endif
- error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
+ if (error != 0) {
+ vput(vp);
+ vput(nd.ni_dvp);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ return (error);
+ }
+ error = vn_start_write(vp, &mp, V_NOWAIT);
+ if (error != 0) {
+ vput(vp);
+ vput(nd.ni_dvp);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ error = vn_start_write(NULL, &mp,
+ V_XSLEEP | PCATCH);
+ if (error != 0)
+ return (error);
+ goto again;
+ }
+ error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
VOP_UNLOCK(vp, 0);
vput(nd.ni_dvp);
+ vn_finished_write(mp);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ } else {
+ vput(nd.ni_dvp);
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ vrele(vp);
+ goto again;
}
- NDFREE(&nd, NDF_ONLY_PNBUF);
- VFS_UNLOCK_GIANT(lvfslocked);
}
vrele(vp);
- vn_finished_write(mp);
- VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
@@ -1732,9 +1647,9 @@
struct mount *mp;
struct vattr vattr;
char *syspath;
+ struct nameidata nd;
int error;
- struct nameidata nd;
- int vfslocked;
+ cap_rights_t rights;
if (segflg == UIO_SYSSPACE) {
syspath = path1;
@@ -1746,11 +1661,11 @@
AUDIT_ARG_TEXT(syspath);
restart:
bwillwrite();
- NDINIT_AT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1,
- segflg, path2, fd, td);
+ NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
+ NOCACHE, segflg, path2, fd, cap_rights_init(&rights, CAP_SYMLINKAT),
+ td);
if ((error = namei(&nd)) != 0)
goto out;
- vfslocked = NDHASGIANT(&nd);
if (nd.ni_vp) {
NDFREE(&nd, NDF_ONLY_PNBUF);
if (nd.ni_vp == nd.ni_dvp)
@@ -1758,7 +1673,6 @@
else
vput(nd.ni_dvp);
vrele(nd.ni_vp);
- VFS_UNLOCK_GIANT(vfslocked);
error = EEXIST;
goto out;
}
@@ -1765,7 +1679,6 @@
if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
NDFREE(&nd, NDF_ONLY_PNBUF);
vput(nd.ni_dvp);
- VFS_UNLOCK_GIANT(vfslocked);
if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
goto out;
goto restart;
@@ -1776,7 +1689,7 @@
vattr.va_type = VLNK;
error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
&vattr);
- if (error)
+ if (error != 0)
goto out2;
#endif
error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, syspath);
@@ -1788,7 +1701,6 @@
NDFREE(&nd, NDF_ONLY_PNBUF);
vput(nd.ni_dvp);
vn_finished_write(mp);
- VFS_UNLOCK_GIANT(vfslocked);
out:
if (segflg != UIO_SYSSPACE)
uma_zfree(namei_zone, syspath);
@@ -1805,19 +1717,17 @@
char *path;
} */ *uap;
{
- int error;
struct mount *mp;
struct nameidata nd;
- int vfslocked;
+ int error;
restart:
bwillwrite();
- NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | MPSAFE | AUDITVNODE1,
+ NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | AUDITVNODE1,
UIO_USERSPACE, uap->path, td);
error = namei(&nd);
- if (error)
+ if (error != 0)
return (error);
- vfslocked = NDHASGIANT(&nd);
if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
NDFREE(&nd, NDF_ONLY_PNBUF);
@@ -1827,13 +1737,11 @@
vput(nd.ni_dvp);
if (nd.ni_vp)
vrele(nd.ni_vp);
- VFS_UNLOCK_GIANT(vfslocked);
return (EEXIST);
}
if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
NDFREE(&nd, NDF_ONLY_PNBUF);
vput(nd.ni_dvp);
- VFS_UNLOCK_GIANT(vfslocked);
if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
return (error);
goto restart;
@@ -1842,7 +1750,6 @@
NDFREE(&nd, NDF_ONLY_PNBUF);
vput(nd.ni_dvp);
vn_finished_write(mp);
- VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
@@ -1901,18 +1808,17 @@
{
struct mount *mp;
struct vnode *vp;
- int error;
struct nameidata nd;
struct stat sb;
- int vfslocked;
+ cap_rights_t rights;
+ int error;
restart:
bwillwrite();
- NDINIT_AT(&nd, DELETE, LOCKPARENT | LOCKLEAF | MPSAFE | AUDITVNODE1,
- pathseg, path, fd, td);
+ NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1,
+ pathseg, path, fd, cap_rights_init(&rights, CAP_UNLINKAT), td);
if ((error = namei(&nd)) != 0)
return (error == EINVAL ? EPERM : error);
- vfslocked = NDHASGIANT(&nd);
vp = nd.ni_vp;
if (vp->v_type == VDIR && oldinum == 0) {
error = EPERM; /* POSIX */
@@ -1937,7 +1843,6 @@
vrele(vp);
else
vput(vp);
- VFS_UNLOCK_GIANT(vfslocked);
if ((error = vn_start_write(NULL, &mp,
V_XSLEEP | PCATCH)) != 0)
return (error);
@@ -1946,7 +1851,7 @@
#ifdef MAC
error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
&nd.ni_cnd);
- if (error)
+ if (error != 0)
goto out;
#endif
vfs_notify_upper(vp, VFS_NOTIFY_UPPER_UNLINK);
@@ -1962,7 +1867,6 @@
vrele(vp);
else
vput(vp);
- VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
@@ -1987,80 +1891,17 @@
int whence;
} */ *uap;
{
- struct ucred *cred = td->td_ucred;
struct file *fp;
- struct vnode *vp;
- struct vattr vattr;
- off_t foffset, offset, size;
- int error, noneg;
- int vfslocked;
+ cap_rights_t rights;
+ int error;
AUDIT_ARG_FD(uap->fd);
- if ((error = fget(td, uap->fd, CAP_SEEK, &fp)) != 0)
+ error = fget(td, uap->fd, cap_rights_init(&rights, CAP_SEEK), &fp);
+ if (error != 0)
return (error);
- if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE)) {
- fdrop(fp, td);
- return (ESPIPE);
- }
- vp = fp->f_vnode;
- foffset = foffset_lock(fp, 0);
- vfslocked = VFS_LOCK_GIANT(vp->v_mount);
- noneg = (vp->v_type != VCHR);
- offset = uap->offset;
- switch (uap->whence) {
- case L_INCR:
- if (noneg &&
- (foffset < 0 ||
- (offset > 0 && foffset > OFF_MAX - offset))) {
- error = EOVERFLOW;
- break;
- }
- offset += foffset;
- break;
- case L_XTND:
- vn_lock(vp, LK_SHARED | LK_RETRY);
- error = VOP_GETATTR(vp, &vattr, cred);
- VOP_UNLOCK(vp, 0);
- if (error)
- break;
-
- /*
- * If the file references a disk device, then fetch
- * the media size and use that to determine the ending
- * offset.
- */
- if (vattr.va_size == 0 && vp->v_type == VCHR &&
- fo_ioctl(fp, DIOCGMEDIASIZE, &size, cred, td) == 0)
- vattr.va_size = size;
- if (noneg &&
- (vattr.va_size > OFF_MAX ||
- (offset > 0 && vattr.va_size > OFF_MAX - offset))) {
- error = EOVERFLOW;
- break;
- }
- offset += vattr.va_size;
- break;
- case L_SET:
- break;
- case SEEK_DATA:
- error = fo_ioctl(fp, FIOSEEKDATA, &offset, cred, td);
- break;
- case SEEK_HOLE:
- error = fo_ioctl(fp, FIOSEEKHOLE, &offset, cred, td);
- break;
- default:
- error = EINVAL;
- }
- if (error == 0 && noneg && offset < 0)
- error = EINVAL;
- if (error != 0)
- goto drop;
- VFS_KNOTE_UNLOCKED(vp, 0);
- *(off_t *)(td->td_retval) = offset;
-drop:
+ error = (fp->f_ops->fo_flags & DFLAG_SEEKABLE) != 0 ?
+ fo_seek(fp, uap->offset, uap->whence, td) : ESPIPE;
fdrop(fp, td);
- VFS_UNLOCK_GIANT(vfslocked);
- foffset_unlock(fp, offset, error != 0 ? FOF_NOUPDATE : 0);
return (error);
}
@@ -2122,8 +1963,8 @@
struct ucred *cred;
struct thread *td;
{
+ accmode_t accmode;
int error;
- accmode_t accmode;
/* Flags == 0 means only check for existence. */
error = 0;
@@ -2137,7 +1978,7 @@
accmode |= VEXEC;
#ifdef MAC
error = mac_vnode_check_access(cred, vp, accmode);
- if (error)
+ if (error != 0)
return (error);
#endif
if ((accmode & VWRITE) == 0 || (error = vn_writechk(vp)) == 0)
@@ -2199,7 +2040,7 @@
struct ucred *cred, *tmpcred;
struct vnode *vp;
struct nameidata nd;
- int vfslocked;
+ cap_rights_t rights;
int error;
/*
@@ -2215,17 +2056,16 @@
} else
cred = tmpcred = td->td_ucred;
AUDIT_ARG_VALUE(amode);
- NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | MPSAFE |
- AUDITVNODE1, pathseg, path, fd, CAP_FSTAT, td);
+ NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF |
+ AUDITVNODE1, pathseg, path, fd, cap_rights_init(&rights, CAP_FSTAT),
+ td);
if ((error = namei(&nd)) != 0)
goto out1;
- vfslocked = NDHASGIANT(&nd);
vp = nd.ni_vp;
error = vn_access(vp, amode, tmpcred, td);
NDFREE(&nd, NDF_ONLY_PNBUF);
vput(vp);
- VFS_UNLOCK_GIANT(vfslocked);
out1:
if (!(flag & AT_EACCESS)) {
td->td_ucred = cred;
@@ -2285,11 +2125,10 @@
int error;
error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
- if (error)
+ if (error != 0)
return (error);
cvtstat(&sb, &osb);
- error = copyout(&osb, uap->ub, sizeof (osb));
- return (error);
+ return (copyout(&osb, uap->ub, sizeof (osb)));
}
/*
@@ -2314,11 +2153,10 @@
int error;
error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
- if (error)
+ if (error != 0)
return (error);
cvtstat(&sb, &osb);
- error = copyout(&osb, uap->ub, sizeof (osb));
- return (error);
+ return (copyout(&osb, uap->ub, sizeof (osb)));
}
/*
@@ -2421,30 +2259,29 @@
{
struct nameidata nd;
struct stat sb;
- int error, vfslocked;
+ cap_rights_t rights;
+ int error;
if (flag & ~AT_SYMLINK_NOFOLLOW)
return (EINVAL);
NDINIT_ATRIGHTS(&nd, LOOKUP, ((flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW :
- FOLLOW) | LOCKSHARED | LOCKLEAF | AUDITVNODE1 | MPSAFE, pathseg,
- path, fd, CAP_FSTAT, td);
+ FOLLOW) | LOCKSHARED | LOCKLEAF | AUDITVNODE1, pathseg, path, fd,
+ cap_rights_init(&rights, CAP_FSTAT), td);
if ((error = namei(&nd)) != 0)
return (error);
- vfslocked = NDHASGIANT(&nd);
error = vn_stat(nd.ni_vp, &sb, td->td_ucred, NOCRED, td);
- if (!error) {
- SDT_PROBE(vfs, , stat, mode, path, sb.st_mode, 0, 0, 0);
+ if (error == 0) {
+ SDT_PROBE2(vfs, , stat, mode, path, sb.st_mode);
if (S_ISREG(sb.st_mode))
- SDT_PROBE(vfs, , stat, reg, path, pathseg, 0, 0, 0);
+ SDT_PROBE2(vfs, , stat, reg, path, pathseg);
if (__predict_false(hook != NULL))
hook(nd.ni_vp, &sb);
}
NDFREE(&nd, NDF_ONLY_PNBUF);
vput(nd.ni_vp);
- VFS_UNLOCK_GIANT(vfslocked);
- if (error)
+ if (error != 0)
return (error);
*sbp = sb;
#ifdef KTRACE
@@ -2496,6 +2333,7 @@
struct stat *sb;
struct nstat *nsb;
{
+
bzero(nsb, sizeof *nsb);
nsb->st_dev = sb->st_dev;
nsb->st_ino = sb->st_ino;
@@ -2534,11 +2372,10 @@
int error;
error = kern_stat(td, uap->path, UIO_USERSPACE, &sb);
- if (error)
+ if (error != 0)
return (error);
cvtnstat(&sb, &nsb);
- error = copyout(&nsb, uap->ub, sizeof (nsb));
- return (error);
+ return (copyout(&nsb, uap->ub, sizeof (nsb)));
}
/*
@@ -2563,11 +2400,10 @@
int error;
error = kern_lstat(td, uap->path, UIO_USERSPACE, &sb);
- if (error)
+ if (error != 0)
return (error);
cvtnstat(&sb, &nsb);
- error = copyout(&nsb, uap->ub, sizeof (nsb));
- return (error);
+ return (copyout(&nsb, uap->ub, sizeof (nsb)));
}
/*
@@ -2606,7 +2442,8 @@
} */ *uap;
{
- return (kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name, NOFOLLOW));
+ return (kern_pathconf(td, uap->path, UIO_USERSPACE, uap->name,
+ NOFOLLOW));
}
int
@@ -2614,13 +2451,12 @@
u_long flags)
{
struct nameidata nd;
- int error, vfslocked;
+ int error;
- NDINIT(&nd, LOOKUP, LOCKSHARED | LOCKLEAF | MPSAFE | AUDITVNODE1 |
- flags, pathseg, path, td);
+ NDINIT(&nd, LOOKUP, LOCKSHARED | LOCKLEAF | AUDITVNODE1 | flags,
+ pathseg, path, td);
if ((error = namei(&nd)) != 0)
return (error);
- vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
/* If asynchronous I/O is available, it works for all files. */
@@ -2629,7 +2465,6 @@
else
error = VOP_PATHCONF(nd.ni_vp, name, td->td_retval);
vput(nd.ni_vp);
- VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
@@ -2688,26 +2523,23 @@
struct vnode *vp;
struct iovec aiov;
struct uio auio;
+ struct nameidata nd;
int error;
- struct nameidata nd;
- int vfslocked;
if (count > IOSIZE_MAX)
return (EINVAL);
- NDINIT_AT(&nd, LOOKUP, NOFOLLOW | LOCKSHARED | LOCKLEAF | MPSAFE |
- AUDITVNODE1, pathseg, path, fd, td);
+ NDINIT_AT(&nd, LOOKUP, NOFOLLOW | LOCKSHARED | LOCKLEAF | AUDITVNODE1,
+ pathseg, path, fd, td);
if ((error = namei(&nd)) != 0)
return (error);
NDFREE(&nd, NDF_ONLY_PNBUF);
- vfslocked = NDHASGIANT(&nd);
vp = nd.ni_vp;
#ifdef MAC
error = mac_vnode_check_readlink(td->td_ucred, vp);
- if (error) {
+ if (error != 0) {
vput(vp);
- VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
#endif
@@ -2724,10 +2556,9 @@
auio.uio_td = td;
auio.uio_resid = count;
error = VOP_READLINK(vp, &auio, td->td_ucred);
+ td->td_retval[0] = count - auio.uio_resid;
}
vput(vp);
- VFS_UNLOCK_GIANT(vfslocked);
- td->td_retval[0] = count - auio.uio_resid;
return (error);
}
@@ -2738,11 +2569,11 @@
setfflags(td, vp, flags)
struct thread *td;
struct vnode *vp;
- int flags;
+ u_long flags;
{
- int error;
struct mount *mp;
struct vattr vattr;
+ int error;
/* We can't support the value matching VNOVAL. */
if (flags == VNOVAL)
@@ -2756,15 +2587,15 @@
*/
if (vp->v_type == VCHR || vp->v_type == VBLK) {
error = priv_check(td, PRIV_VFS_CHFLAGS_DEV);
- if (error)
+ if (error != 0)
return (error);
}
if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
return (error);
- vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
VATTR_NULL(&vattr);
vattr.va_flags = flags;
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
#ifdef MAC
error = mac_vnode_check_setflags(td->td_ucred, vp, vattr.va_flags);
if (error == 0)
@@ -2780,8 +2611,8 @@
*/
#ifndef _SYS_SYSPROTO_H_
struct chflags_args {
- char *path;
- int flags;
+ const char *path;
+ u_long flags;
};
#endif
int
@@ -2788,27 +2619,44 @@
sys_chflags(td, uap)
struct thread *td;
register struct chflags_args /* {
- char *path;
- int flags;
+ const char *path;
+ u_long flags;
} */ *uap;
{
- int error;
- struct nameidata nd;
- int vfslocked;
- AUDIT_ARG_FFLAGS(uap->flags);
- NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, UIO_USERSPACE,
- uap->path, td);
- if ((error = namei(&nd)) != 0)
- return (error);
- NDFREE(&nd, NDF_ONLY_PNBUF);
- vfslocked = NDHASGIANT(&nd);
- error = setfflags(td, nd.ni_vp, uap->flags);
- vrele(nd.ni_vp);
- VFS_UNLOCK_GIANT(vfslocked);
- return (error);
+ return (kern_chflags(td, uap->path, UIO_USERSPACE, uap->flags));
}
+#ifndef _SYS_SYSPROTO_H_
+struct chflagsat_args {
+ int fd;
+ const char *path;
+ u_long flags;
+ int atflag;
+};
+#endif
+int
+sys_chflagsat(struct thread *td, struct chflagsat_args *uap)
+{
+ int fd = uap->fd;
+ const char *path = uap->path;
+ u_long flags = uap->flags;
+ int atflag = uap->atflag;
+
+ if (atflag & ~AT_SYMLINK_NOFOLLOW)
+ return (EINVAL);
+
+ return (kern_chflagsat(td, fd, path, UIO_USERSPACE, flags, atflag));
+}
+
+static int
+kern_chflags(struct thread *td, const char *path, enum uio_seg pathseg,
+ u_long flags)
+{
+
+ return (kern_chflagsat(td, AT_FDCWD, path, pathseg, flags, 0));
+}
+
/*
* Same as chflags() but doesn't follow symlinks.
*/
@@ -2816,24 +2664,32 @@
sys_lchflags(td, uap)
struct thread *td;
register struct lchflags_args /* {
- char *path;
- int flags;
+ const char *path;
+ u_long flags;
} */ *uap;
{
- int error;
+
+ return (kern_chflagsat(td, AT_FDCWD, uap->path, UIO_USERSPACE,
+ uap->flags, AT_SYMLINK_NOFOLLOW));
+}
+
+static int
+kern_chflagsat(struct thread *td, int fd, const char *path,
+ enum uio_seg pathseg, u_long flags, int atflag)
+{
struct nameidata nd;
- int vfslocked;
+ cap_rights_t rights;
+ int error, follow;
- AUDIT_ARG_FFLAGS(uap->flags);
- NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE | AUDITVNODE1, UIO_USERSPACE,
- uap->path, td);
+ AUDIT_ARG_FFLAGS(flags);
+ follow = (atflag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
+ NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd,
+ cap_rights_init(&rights, CAP_FCHFLAGS), td);
if ((error = namei(&nd)) != 0)
return (error);
- vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
- error = setfflags(td, nd.ni_vp, uap->flags);
+ error = setfflags(td, nd.ni_vp, flags);
vrele(nd.ni_vp);
- VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
@@ -2843,7 +2699,7 @@
#ifndef _SYS_SYSPROTO_H_
struct fchflags_args {
int fd;
- int flags;
+ u_long flags;
};
#endif
int
@@ -2851,19 +2707,19 @@
struct thread *td;
register struct fchflags_args /* {
int fd;
- int flags;
+ u_long flags;
} */ *uap;
{
struct file *fp;
- int vfslocked;
+ cap_rights_t rights;
int error;
AUDIT_ARG_FD(uap->fd);
AUDIT_ARG_FFLAGS(uap->flags);
- if ((error = getvnode(td->td_proc->p_fd, uap->fd, CAP_FCHFLAGS,
- &fp)) != 0)
+ error = getvnode(td->td_proc->p_fd, uap->fd,
+ cap_rights_init(&rights, CAP_FCHFLAGS), &fp);
+ if (error != 0)
return (error);
- vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
#ifdef AUDIT
vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
AUDIT_ARG_VNODE1(fp->f_vnode);
@@ -2870,7 +2726,6 @@
VOP_UNLOCK(fp->f_vnode, 0);
#endif
error = setfflags(td, fp->f_vnode, uap->flags);
- VFS_UNLOCK_GIANT(vfslocked);
fdrop(fp, td);
return (error);
}
@@ -2885,9 +2740,9 @@
struct vnode *vp;
int mode;
{
- int error;
struct mount *mp;
struct vattr vattr;
+ int error;
if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
return (error);
@@ -2976,27 +2831,23 @@
uap->mode, AT_SYMLINK_NOFOLLOW));
}
-
int
kern_fchmodat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
mode_t mode, int flag)
{
- int error;
struct nameidata nd;
- int vfslocked;
- int follow;
+ cap_rights_t rights;
+ int error, follow;
AUDIT_ARG_MODE(mode);
follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
- NDINIT_ATRIGHTS(&nd, LOOKUP, follow | MPSAFE | AUDITVNODE1, pathseg,
- path, fd, CAP_FCHMOD, td);
+ NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd,
+ cap_rights_init(&rights, CAP_FCHMOD), td);
if ((error = namei(&nd)) != 0)
return (error);
- vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
error = setfmode(td, td->td_ucred, nd.ni_vp, mode);
vrele(nd.ni_vp);
- VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
@@ -3013,12 +2864,13 @@
sys_fchmod(struct thread *td, struct fchmod_args *uap)
{
struct file *fp;
+ cap_rights_t rights;
int error;
AUDIT_ARG_FD(uap->fd);
AUDIT_ARG_MODE(uap->mode);
- error = fget(td, uap->fd, CAP_FCHMOD, &fp);
+ error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FCHMOD), &fp);
if (error != 0)
return (error);
error = fo_chmod(fp, uap->mode, td->td_ucred, td);
@@ -3037,9 +2889,9 @@
uid_t uid;
gid_t gid;
{
- int error;
struct mount *mp;
struct vattr vattr;
+ int error;
if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
return (error);
@@ -3116,20 +2968,19 @@
int uid, int gid, int flag)
{
struct nameidata nd;
- int error, vfslocked, follow;
+ cap_rights_t rights;
+ int error, follow;
AUDIT_ARG_OWNER(uid, gid);
follow = (flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW : FOLLOW;
- NDINIT_ATRIGHTS(&nd, LOOKUP, follow | MPSAFE | AUDITVNODE1, pathseg,
- path, fd, CAP_FCHOWN, td);
+ NDINIT_ATRIGHTS(&nd, LOOKUP, follow | AUDITVNODE1, pathseg, path, fd,
+ cap_rights_init(&rights, CAP_FCHOWN), td);
if ((error = namei(&nd)) != 0)
return (error);
- vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
error = setfown(td, td->td_ucred, nd.ni_vp, uid, gid);
vrele(nd.ni_vp);
- VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
@@ -3185,11 +3036,12 @@
} */ *uap;
{
struct file *fp;
+ cap_rights_t rights;
int error;
AUDIT_ARG_FD(uap->fd);
AUDIT_ARG_OWNER(uap->uid, uap->gid);
- error = fget(td, uap->fd, CAP_FCHOWN, &fp);
+ error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FCHOWN), &fp);
if (error != 0)
return (error);
error = fo_chown(fp, uap->uid, uap->gid, td->td_ucred, td);
@@ -3232,9 +3084,55 @@
}
/*
- * Common implementation code for utimes(), lutimes(), and futimes().
+ * Common implementation code for futimens(), utimensat().
*/
+#define UTIMENS_NULL 0x1
+#define UTIMENS_EXIT 0x2
static int
+getutimens(const struct timespec *usrtsp, enum uio_seg tspseg,
+ struct timespec *tsp, int *retflags)
+{
+ struct timespec tsnow;
+ int error;
+
+ vfs_timestamp(&tsnow);
+ *retflags = 0;
+ if (usrtsp == NULL) {
+ tsp[0] = tsnow;
+ tsp[1] = tsnow;
+ *retflags |= UTIMENS_NULL;
+ return (0);
+ }
+ if (tspseg == UIO_SYSSPACE) {
+ tsp[0] = usrtsp[0];
+ tsp[1] = usrtsp[1];
+ } else if ((error = copyin(usrtsp, tsp, sizeof(*tsp) * 2)) != 0)
+ return (error);
+ if (tsp[0].tv_nsec == UTIME_OMIT && tsp[1].tv_nsec == UTIME_OMIT)
+ *retflags |= UTIMENS_EXIT;
+ if (tsp[0].tv_nsec == UTIME_NOW && tsp[1].tv_nsec == UTIME_NOW)
+ *retflags |= UTIMENS_NULL;
+ if (tsp[0].tv_nsec == UTIME_OMIT)
+ tsp[0].tv_sec = VNOVAL;
+ else if (tsp[0].tv_nsec == UTIME_NOW)
+ tsp[0] = tsnow;
+ else if (tsp[0].tv_nsec < 0 || tsp[0].tv_nsec >= 1000000000L)
+ return (EINVAL);
+ if (tsp[1].tv_nsec == UTIME_OMIT)
+ tsp[1].tv_sec = VNOVAL;
+ else if (tsp[1].tv_nsec == UTIME_NOW)
+ tsp[1] = tsnow;
+ else if (tsp[1].tv_nsec < 0 || tsp[1].tv_nsec >= 1000000000L)
+ return (EINVAL);
+
+ return (0);
+}
+
+/*
+ * Common implementation code for utimes(), lutimes(), futimes(), futimens(),
+ * and utimensat().
+ */
+static int
setutimes(td, vp, ts, numtimes, nullflag)
struct thread *td;
struct vnode *vp;
@@ -3242,9 +3140,9 @@
int numtimes;
int nullflag;
{
- int error, setbirthtime;
struct mount *mp;
struct vattr vattr;
+ int error, setbirthtime;
if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
return (error);
@@ -3324,20 +3222,19 @@
{
struct nameidata nd;
struct timespec ts[2];
- int error, vfslocked;
+ cap_rights_t rights;
+ int error;
if ((error = getutimes(tptr, tptrseg, ts)) != 0)
return (error);
- NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, pathseg,
- path, fd, CAP_FUTIMES, td);
+ NDINIT_ATRIGHTS(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, fd,
+ cap_rights_init(&rights, CAP_FUTIMES), td);
if ((error = namei(&nd)) != 0)
return (error);
- vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
vrele(nd.ni_vp);
- VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
@@ -3368,20 +3265,17 @@
struct timeval *tptr, enum uio_seg tptrseg)
{
struct timespec ts[2];
+ struct nameidata nd;
int error;
- struct nameidata nd;
- int vfslocked;
if ((error = getutimes(tptr, tptrseg, ts)) != 0)
return (error);
- NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td);
+ NDINIT(&nd, LOOKUP, NOFOLLOW | AUDITVNODE1, pathseg, path, td);
if ((error = namei(&nd)) != 0)
return (error);
- vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
error = setutimes(td, nd.ni_vp, ts, 2, tptr == NULL);
vrele(nd.ni_vp);
- VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
@@ -3412,16 +3306,17 @@
{
struct timespec ts[2];
struct file *fp;
- int vfslocked;
+ cap_rights_t rights;
int error;
AUDIT_ARG_FD(fd);
- if ((error = getutimes(tptr, tptrseg, ts)) != 0)
+ error = getutimes(tptr, tptrseg, ts);
+ if (error != 0)
return (error);
- if ((error = getvnode(td->td_proc->p_fd, fd, CAP_FUTIMES, &fp))
- != 0)
+ error = getvnode(td->td_proc->p_fd, fd,
+ cap_rights_init(&rights, CAP_FUTIMES), &fp);
+ if (error != 0)
return (error);
- vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
#ifdef AUDIT
vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
AUDIT_ARG_VNODE1(fp->f_vnode);
@@ -3428,11 +3323,86 @@
VOP_UNLOCK(fp->f_vnode, 0);
#endif
error = setutimes(td, fp->f_vnode, ts, 2, tptr == NULL);
- VFS_UNLOCK_GIANT(vfslocked);
fdrop(fp, td);
return (error);
}
+int
+sys_futimens(struct thread *td, struct futimens_args *uap)
+{
+
+ return (kern_futimens(td, uap->fd, uap->times, UIO_USERSPACE));
+}
+
+int
+kern_futimens(struct thread *td, int fd, struct timespec *tptr,
+ enum uio_seg tptrseg)
+{
+ struct timespec ts[2];
+ struct file *fp;
+ cap_rights_t rights;
+ int error, flags;
+
+ AUDIT_ARG_FD(fd);
+ error = getutimens(tptr, tptrseg, ts, &flags);
+ if (error != 0)
+ return (error);
+ if (flags & UTIMENS_EXIT)
+ return (0);
+ error = getvnode(td->td_proc->p_fd, fd,
+ cap_rights_init(&rights, CAP_FUTIMES), &fp);
+ if (error != 0)
+ return (error);
+#ifdef AUDIT
+ vn_lock(fp->f_vnode, LK_SHARED | LK_RETRY);
+ AUDIT_ARG_VNODE1(fp->f_vnode);
+ VOP_UNLOCK(fp->f_vnode, 0);
+#endif
+ error = setutimes(td, fp->f_vnode, ts, 2, flags & UTIMENS_NULL);
+ fdrop(fp, td);
+ return (error);
+}
+
+int
+sys_utimensat(struct thread *td, struct utimensat_args *uap)
+{
+
+ return (kern_utimensat(td, uap->fd, uap->path, UIO_USERSPACE,
+ uap->times, UIO_USERSPACE, uap->flag));
+}
+
+int
+kern_utimensat(struct thread *td, int fd, char *path, enum uio_seg pathseg,
+ struct timespec *tptr, enum uio_seg tptrseg, int flag)
+{
+ struct nameidata nd;
+ struct timespec ts[2];
+ cap_rights_t rights;
+ int error, flags;
+
+ if (flag & ~AT_SYMLINK_NOFOLLOW)
+ return (EINVAL);
+
+ if ((error = getutimens(tptr, tptrseg, ts, &flags)) != 0)
+ return (error);
+ NDINIT_ATRIGHTS(&nd, LOOKUP, ((flag & AT_SYMLINK_NOFOLLOW) ? NOFOLLOW :
+ FOLLOW) | AUDITVNODE1, pathseg, path, fd,
+ cap_rights_init(&rights, CAP_FUTIMES), td);
+ if ((error = namei(&nd)) != 0)
+ return (error);
+ /*
+ * We are allowed to call namei() regardless of 2xUTIME_OMIT.
+ * POSIX states:
+ * "If both tv_nsec fields are UTIME_OMIT... EACCESS may be detected."
+ * "Search permission is denied by a component of the path prefix."
+ */
+ NDFREE(&nd, NDF_ONLY_PNBUF);
+ if ((flags & UTIMENS_EXIT) == 0)
+ error = setutimes(td, nd.ni_vp, ts, 2, flags & UTIMENS_NULL);
+ vrele(nd.ni_vp);
+ return (error);
+}
+
/*
* Truncate a file given its path name.
*/
@@ -3464,20 +3434,18 @@
void *rl_cookie;
struct vattr vattr;
struct nameidata nd;
- int error, vfslocked;
+ int error;
if (length < 0)
return(EINVAL);
- NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td);
+ NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, td);
if ((error = namei(&nd)) != 0)
return (error);
- vfslocked = NDHASGIANT(&nd);
vp = nd.ni_vp;
rl_cookie = vn_rangelock_wlock(vp, 0, OFF_MAX);
if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) {
vn_rangelock_unlock(vp, rl_cookie);
vrele(vp);
- VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
NDFREE(&nd, NDF_ONLY_PNBUF);
@@ -3498,7 +3466,6 @@
vn_finished_write(mp);
vn_rangelock_unlock(vp, rl_cookie);
vrele(vp);
- VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
@@ -3571,16 +3538,17 @@
struct vnode *vp;
struct mount *mp;
struct file *fp;
- int vfslocked;
+ cap_rights_t rights;
int error, lock_flags;
AUDIT_ARG_FD(uap->fd);
- if ((error = getvnode(td->td_proc->p_fd, uap->fd, CAP_FSYNC,
- &fp)) != 0)
+ error = getvnode(td->td_proc->p_fd, uap->fd,
+ cap_rights_init(&rights, CAP_FSYNC), &fp);
+ if (error != 0)
return (error);
vp = fp->f_vnode;
- vfslocked = VFS_LOCK_GIANT(vp->v_mount);
- if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
+ error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
+ if (error != 0)
goto drop;
if (MNT_SHARED_WRITES(mp) ||
((mp == NULL) && MNT_SHARED_WRITES(vp->v_mount))) {
@@ -3591,9 +3559,9 @@
vn_lock(vp, lock_flags | LK_RETRY);
AUDIT_ARG_VNODE1(vp);
if (vp->v_object != NULL) {
- VM_OBJECT_LOCK(vp->v_object);
+ VM_OBJECT_WLOCK(vp->v_object);
vm_object_page_clean(vp->v_object, 0, 0, 0);
- VM_OBJECT_UNLOCK(vp->v_object);
+ VM_OBJECT_WUNLOCK(vp->v_object);
}
error = VOP_FSYNC(vp, MNT_WAIT, td);
@@ -3600,7 +3568,6 @@
VOP_UNLOCK(vp, 0);
vn_finished_write(mp);
drop:
- VFS_UNLOCK_GIANT(vfslocked);
fdrop(fp, td);
return (error);
}
@@ -3657,23 +3624,22 @@
struct mount *mp = NULL;
struct vnode *tvp, *fvp, *tdvp;
struct nameidata fromnd, tond;
- int tvfslocked;
- int fvfslocked;
+ cap_rights_t rights;
int error;
+again:
bwillwrite();
#ifdef MAC
NDINIT_ATRIGHTS(&fromnd, DELETE, LOCKPARENT | LOCKLEAF | SAVESTART |
- MPSAFE | AUDITVNODE1, pathseg, old, oldfd, CAP_DELETE, td);
+ AUDITVNODE1, pathseg, old, oldfd,
+ cap_rights_init(&rights, CAP_RENAMEAT), td);
#else
- NDINIT_ATRIGHTS(&fromnd, DELETE, WANTPARENT | SAVESTART | MPSAFE |
- AUDITVNODE1, pathseg, old, oldfd, CAP_DELETE, td);
+ NDINIT_ATRIGHTS(&fromnd, DELETE, WANTPARENT | SAVESTART | AUDITVNODE1,
+ pathseg, old, oldfd, cap_rights_init(&rights, CAP_RENAMEAT), td);
#endif
if ((error = namei(&fromnd)) != 0)
return (error);
- fvfslocked = NDHASGIANT(&fromnd);
- tvfslocked = 0;
#ifdef MAC
error = mac_vnode_check_rename_from(td->td_ucred, fromnd.ni_dvp,
fromnd.ni_vp, &fromnd.ni_cnd);
@@ -3682,17 +3648,9 @@
VOP_UNLOCK(fromnd.ni_vp, 0);
#endif
fvp = fromnd.ni_vp;
- if (error == 0)
- error = vn_start_write(fvp, &mp, V_WAIT | PCATCH);
- if (error != 0) {
- NDFREE(&fromnd, NDF_ONLY_PNBUF);
- vrele(fromnd.ni_dvp);
- vrele(fvp);
- goto out1;
- }
NDINIT_ATRIGHTS(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE |
- SAVESTART | MPSAFE | AUDITVNODE2, pathseg, new, newfd, CAP_CREATE,
- td);
+ SAVESTART | AUDITVNODE2, pathseg, new, newfd,
+ cap_rights_init(&rights, CAP_LINKAT), td);
if (fromnd.ni_vp->v_type == VDIR)
tond.ni_cnd.cn_flags |= WILLBEDIR;
if ((error = namei(&tond)) != 0) {
@@ -3702,12 +3660,30 @@
NDFREE(&fromnd, NDF_ONLY_PNBUF);
vrele(fromnd.ni_dvp);
vrele(fvp);
- vn_finished_write(mp);
goto out1;
}
- tvfslocked = NDHASGIANT(&tond);
tdvp = tond.ni_dvp;
tvp = tond.ni_vp;
+ error = vn_start_write(fvp, &mp, V_NOWAIT);
+ if (error != 0) {
+ NDFREE(&fromnd, NDF_ONLY_PNBUF);
+ NDFREE(&tond, NDF_ONLY_PNBUF);
+ if (tvp != NULL)
+ vput(tvp);
+ if (tdvp == tvp)
+ vrele(tdvp);
+ else
+ vput(tdvp);
+ vrele(fromnd.ni_dvp);
+ vrele(fvp);
+ vrele(tond.ni_startdir);
+ if (fromnd.ni_startdir != NULL)
+ vrele(fromnd.ni_startdir);
+ error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH);
+ if (error != 0)
+ return (error);
+ goto again;
+ }
if (tvp != NULL) {
if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
error = ENOTDIR;
@@ -3716,6 +3692,18 @@
error = EISDIR;
goto out;
}
+#ifdef CAPABILITIES
+ if (newfd != AT_FDCWD) {
+ /*
+ * If the target already exists we require CAP_UNLINKAT
+ * from 'newfd'.
+ */
+ error = cap_check(&tond.ni_filecaps.fc_rights,
+ cap_rights_init(&rights, CAP_UNLINKAT));
+ if (error != 0)
+ goto out;
+ }
+#endif
}
if (fvp == tdvp) {
error = EINVAL;
@@ -3733,15 +3721,15 @@
tond.ni_vp, fromnd.ni_dvp == tdvp, &tond.ni_cnd);
#endif
out:
- if (!error) {
+ if (error == 0) {
error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd,
- tond.ni_dvp, tond.ni_vp, &tond.ni_cnd);
+ tond.ni_dvp, tond.ni_vp, &tond.ni_cnd);
NDFREE(&fromnd, NDF_ONLY_PNBUF);
NDFREE(&tond, NDF_ONLY_PNBUF);
} else {
NDFREE(&fromnd, NDF_ONLY_PNBUF);
NDFREE(&tond, NDF_ONLY_PNBUF);
- if (tvp)
+ if (tvp != NULL)
vput(tvp);
if (tdvp == tvp)
vrele(tdvp);
@@ -3755,8 +3743,6 @@
out1:
if (fromnd.ni_startdir)
vrele(fromnd.ni_startdir);
- VFS_UNLOCK_GIANT(fvfslocked);
- VFS_UNLOCK_GIANT(tvfslocked);
if (error == -1)
return (0);
return (error);
@@ -3811,19 +3797,19 @@
struct mount *mp;
struct vnode *vp;
struct vattr vattr;
+ struct nameidata nd;
+ cap_rights_t rights;
int error;
- struct nameidata nd;
- int vfslocked;
AUDIT_ARG_MODE(mode);
restart:
bwillwrite();
- NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE |
- AUDITVNODE1, segflg, path, fd, CAP_MKDIR, td);
+ NDINIT_ATRIGHTS(&nd, CREATE, LOCKPARENT | SAVENAME | AUDITVNODE1 |
+ NOCACHE, segflg, path, fd, cap_rights_init(&rights, CAP_MKDIRAT),
+ td);
nd.ni_cnd.cn_flags |= WILLBEDIR;
if ((error = namei(&nd)) != 0)
return (error);
- vfslocked = NDHASGIANT(&nd);
vp = nd.ni_vp;
if (vp != NULL) {
NDFREE(&nd, NDF_ONLY_PNBUF);
@@ -3837,13 +3823,11 @@
else
vput(nd.ni_dvp);
vrele(vp);
- VFS_UNLOCK_GIANT(vfslocked);
return (EEXIST);
}
if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
NDFREE(&nd, NDF_ONLY_PNBUF);
vput(nd.ni_dvp);
- VFS_UNLOCK_GIANT(vfslocked);
if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
return (error);
goto restart;
@@ -3854,7 +3838,7 @@
#ifdef MAC
error = mac_vnode_check_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
&vattr);
- if (error)
+ if (error != 0)
goto out;
#endif
error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
@@ -3863,10 +3847,9 @@
#endif
NDFREE(&nd, NDF_ONLY_PNBUF);
vput(nd.ni_dvp);
- if (!error)
+ if (error == 0)
vput(nd.ni_vp);
vn_finished_write(mp);
- VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
@@ -3901,17 +3884,16 @@
{
struct mount *mp;
struct vnode *vp;
+ struct nameidata nd;
+ cap_rights_t rights;
int error;
- struct nameidata nd;
- int vfslocked;
restart:
bwillwrite();
- NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | MPSAFE |
- AUDITVNODE1, pathseg, path, fd, CAP_RMDIR, td);
+ NDINIT_ATRIGHTS(&nd, DELETE, LOCKPARENT | LOCKLEAF | AUDITVNODE1,
+ pathseg, path, fd, cap_rights_init(&rights, CAP_UNLINKAT), td);
if ((error = namei(&nd)) != 0)
return (error);
- vfslocked = NDHASGIANT(&nd);
vp = nd.ni_vp;
if (vp->v_type != VDIR) {
error = ENOTDIR;
@@ -3934,7 +3916,7 @@
#ifdef MAC
error = mac_vnode_check_unlink(td->td_ucred, nd.ni_dvp, vp,
&nd.ni_cnd);
- if (error)
+ if (error != 0)
goto out;
#endif
if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
@@ -3944,7 +3926,6 @@
vrele(nd.ni_dvp);
else
vput(nd.ni_dvp);
- VFS_UNLOCK_GIANT(vfslocked);
if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
return (error);
goto restart;
@@ -3959,7 +3940,6 @@
vrele(nd.ni_dvp);
else
vput(nd.ni_dvp);
- VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
@@ -3996,8 +3976,9 @@
struct uio auio, kuio;
struct iovec aiov, kiov;
struct dirent *dp, *edp;
+ cap_rights_t rights;
caddr_t dirbuf;
- int error, eofflag, readcnt, vfslocked;
+ int error, eofflag, readcnt;
long loff;
off_t foffset;
@@ -4004,8 +3985,9 @@
/* XXX arbitrary sanity limit on `count'. */
if (uap->count > 64 * 1024)
return (EINVAL);
- if ((error = getvnode(td->td_proc->p_fd, uap->fd, CAP_READ,
- &fp)) != 0)
+ error = getvnode(td->td_proc->p_fd, uap->fd,
+ cap_rights_init(&rights, CAP_READ), &fp);
+ if (error != 0)
return (error);
if ((fp->f_flag & FREAD) == 0) {
fdrop(fp, td);
@@ -4014,9 +3996,7 @@
vp = fp->f_vnode;
foffset = foffset_lock(fp, 0);
unionread:
- vfslocked = VFS_LOCK_GIANT(vp->v_mount);
if (vp->v_type != VDIR) {
- VFS_UNLOCK_GIANT(vfslocked);
foffset_unlock(fp, foffset, 0);
fdrop(fp, td);
return (EINVAL);
@@ -4033,9 +4013,8 @@
loff = auio.uio_offset = foffset;
#ifdef MAC
error = mac_vnode_check_readdir(td->td_ucred, vp);
- if (error) {
+ if (error != 0) {
VOP_UNLOCK(vp, 0);
- VFS_UNLOCK_GIANT(vfslocked);
foffset_unlock(fp, foffset, FOF_NOUPDATE);
fdrop(fp, td);
return (error);
@@ -4092,9 +4071,8 @@
}
free(dirbuf, M_TEMP);
}
- if (error) {
+ if (error != 0) {
VOP_UNLOCK(vp, 0);
- VFS_UNLOCK_GIANT(vfslocked);
foffset_unlock(fp, foffset, 0);
fdrop(fp, td);
return (error);
@@ -4109,11 +4087,9 @@
fp->f_data = vp;
foffset = 0;
vput(tvp);
- VFS_UNLOCK_GIANT(vfslocked);
goto unionread;
}
VOP_UNLOCK(vp, 0);
- VFS_UNLOCK_GIANT(vfslocked);
foffset_unlock(fp, foffset, 0);
fdrop(fp, td);
td->td_retval[0] = uap->count - auio.uio_resid;
@@ -4147,8 +4123,9 @@
long base;
int error;
- error = kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base);
- if (error)
+ error = kern_getdirentries(td, uap->fd, uap->buf, uap->count, &base,
+ NULL, UIO_USERSPACE);
+ if (error != 0)
return (error);
if (uap->basep != NULL)
error = copyout(&base, uap->basep, sizeof(long));
@@ -4157,23 +4134,24 @@
int
kern_getdirentries(struct thread *td, int fd, char *buf, u_int count,
- long *basep)
+ long *basep, ssize_t *residp, enum uio_seg bufseg)
{
struct vnode *vp;
struct file *fp;
struct uio auio;
struct iovec aiov;
- int vfslocked;
+ cap_rights_t rights;
long loff;
int error, eofflag;
off_t foffset;
AUDIT_ARG_FD(fd);
+ if (count > IOSIZE_MAX)
+ return (EINVAL);
auio.uio_resid = count;
- if (auio.uio_resid > IOSIZE_MAX)
- return (EINVAL);
- if ((error = getvnode(td->td_proc->p_fd, fd, CAP_READ | CAP_SEEK,
- &fp)) != 0)
+ error = getvnode(td->td_proc->p_fd, fd,
+ cap_rights_init(&rights, CAP_READ), &fp);
+ if (error != 0)
return (error);
if ((fp->f_flag & FREAD) == 0) {
fdrop(fp, td);
@@ -4182,9 +4160,7 @@
vp = fp->f_vnode;
foffset = foffset_lock(fp, 0);
unionread:
- vfslocked = VFS_LOCK_GIANT(vp->v_mount);
if (vp->v_type != VDIR) {
- VFS_UNLOCK_GIANT(vfslocked);
error = EINVAL;
goto fail;
}
@@ -4193,7 +4169,7 @@
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
auio.uio_rw = UIO_READ;
- auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_segflg = bufseg;
auio.uio_td = td;
vn_lock(vp, LK_SHARED | LK_RETRY);
AUDIT_ARG_VNODE1(vp);
@@ -4205,9 +4181,8 @@
error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL,
NULL);
foffset = auio.uio_offset;
- if (error) {
+ if (error != 0) {
VOP_UNLOCK(vp, 0);
- VFS_UNLOCK_GIANT(vfslocked);
goto fail;
}
if (count == auio.uio_resid &&
@@ -4214,6 +4189,7 @@
(vp->v_vflag & VV_ROOT) &&
(vp->v_mount->mnt_flag & MNT_UNION)) {
struct vnode *tvp = vp;
+
vp = vp->v_mount->mnt_vnodecovered;
VREF(vp);
fp->f_vnode = vp;
@@ -4220,12 +4196,12 @@
fp->f_data = vp;
foffset = 0;
vput(tvp);
- VFS_UNLOCK_GIANT(vfslocked);
goto unionread;
}
VOP_UNLOCK(vp, 0);
- VFS_UNLOCK_GIANT(vfslocked);
*basep = loff;
+ if (residp != NULL)
+ *residp = auio.uio_resid;
td->td_retval[0] = count - auio.uio_resid;
fail:
foffset_unlock(fp, foffset, 0);
@@ -4250,6 +4226,7 @@
} */ *uap;
{
struct getdirentries_args ap;
+
ap.fd = uap->fd;
ap.buf = uap->buf;
ap.count = uap->count;
@@ -4300,15 +4277,13 @@
{
struct vnode *vp;
struct vattr vattr;
+ struct nameidata nd;
int error;
- struct nameidata nd;
- int vfslocked;
- NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
- UIO_USERSPACE, uap->path, td);
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
+ uap->path, td);
if ((error = namei(&nd)) != 0)
return (error);
- vfslocked = NDHASGIANT(&nd);
vp = nd.ni_vp;
NDFREE(&nd, NDF_ONLY_PNBUF);
if (vp->v_type != VCHR || vp->v_rdev == NULL) {
@@ -4317,15 +4292,15 @@
}
#ifdef MAC
error = mac_vnode_check_revoke(td->td_ucred, vp);
- if (error)
+ if (error != 0)
goto out;
#endif
error = VOP_GETATTR(vp, &vattr, td->td_ucred);
- if (error)
+ if (error != 0)
goto out;
if (td->td_ucred->cr_uid != vattr.va_uid) {
error = priv_check(td, PRIV_VFS_ADMIN);
- if (error)
+ if (error != 0)
goto out;
}
if (vcount(vp) > 1)
@@ -4332,7 +4307,6 @@
VOP_REVOKE(vp, REVOKEALL);
out:
vput(vp);
- VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
@@ -4342,35 +4316,14 @@
* entry is held upon returning.
*/
int
-getvnode(struct filedesc *fdp, int fd, cap_rights_t rights,
- struct file **fpp)
+getvnode(struct filedesc *fdp, int fd, cap_rights_t *rightsp, struct file **fpp)
{
struct file *fp;
-#ifdef CAPABILITIES
- struct file *fp_fromcap;
-#endif
int error;
- error = 0;
- fp = NULL;
- if ((fdp == NULL) || (fp = fget_unlocked(fdp, fd)) == NULL)
- return (EBADF);
-#ifdef CAPABILITIES
- /*
- * If the file descriptor is for a capability, test rights and use the
- * file descriptor referenced by the capability.
- */
- error = cap_funwrap(fp, rights, &fp_fromcap);
- if (error) {
- fdrop(fp, curthread);
+ error = fget_unlocked(fdp, fd, rightsp, 0, &fp, NULL);
+ if (error != 0)
return (error);
- }
- if (fp != fp_fromcap) {
- fhold(fp_fromcap);
- fdrop(fp, curthread);
- fp = fp_fromcap;
- }
-#endif /* CAPABILITIES */
/*
* The file could be not of the vnode type, or it may be not
@@ -4410,18 +4363,16 @@
struct nameidata nd;
fhandle_t fh;
register struct vnode *vp;
- int vfslocked;
int error;
error = priv_check(td, PRIV_VFS_GETFH);
- if (error)
+ if (error != 0)
return (error);
- NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
- UIO_USERSPACE, uap->fname, td);
+ NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
+ uap->fname, td);
error = namei(&nd);
- if (error)
+ if (error != 0)
return (error);
- vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
vp = nd.ni_vp;
bzero(&fh, sizeof(fh));
@@ -4428,10 +4379,8 @@
fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
error = VOP_VPTOFH(vp, &fh.fh_fid);
vput(vp);
- VFS_UNLOCK_GIANT(vfslocked);
- if (error)
- return (error);
- error = copyout(&fh, uap->fhp, sizeof (fh));
+ if (error == 0)
+ error = copyout(&fh, uap->fhp, sizeof (fh));
return (error);
}
@@ -4449,18 +4398,16 @@
struct nameidata nd;
fhandle_t fh;
register struct vnode *vp;
- int vfslocked;
int error;
error = priv_check(td, PRIV_VFS_GETFH);
- if (error)
+ if (error != 0)
return (error);
- NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
- UIO_USERSPACE, uap->fname, td);
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_USERSPACE,
+ uap->fname, td);
error = namei(&nd);
- if (error)
+ if (error != 0)
return (error);
- vfslocked = NDHASGIANT(&nd);
NDFREE(&nd, NDF_ONLY_PNBUF);
vp = nd.ni_vp;
bzero(&fh, sizeof(fh));
@@ -4467,10 +4414,8 @@
fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
error = VOP_VPTOFH(vp, &fh.fh_fid);
vput(vp);
- VFS_UNLOCK_GIANT(vfslocked);
- if (error)
- return (error);
- error = copyout(&fh, uap->fhp, sizeof (fh));
+ if (error == 0)
+ error = copyout(&fh, uap->fhp, sizeof (fh));
return (error);
}
@@ -4495,177 +4440,75 @@
int flags;
} */ *uap;
{
- struct proc *p = td->td_proc;
struct mount *mp;
struct vnode *vp;
struct fhandle fhp;
- struct vattr vat;
- struct vattr *vap = &vat;
- struct flock lf;
struct file *fp;
- register struct filedesc *fdp = p->p_fd;
- int fmode, error, type;
- accmode_t accmode;
- struct file *nfp;
- int vfslocked;
+ int fmode, error;
int indx;
error = priv_check(td, PRIV_VFS_FHOPEN);
- if (error)
+ if (error != 0)
return (error);
+ indx = -1;
fmode = FFLAGS(uap->flags);
/* why not allow a non-read/write open for our lockd? */
if (((fmode & (FREAD | FWRITE)) == 0) || (fmode & O_CREAT))
return (EINVAL);
error = copyin(uap->u_fhp, &fhp, sizeof(fhp));
- if (error)
+ if (error != 0)
return(error);
/* find the mount point */
mp = vfs_busyfs(&fhp.fh_fsid);
if (mp == NULL)
return (ESTALE);
- vfslocked = VFS_LOCK_GIANT(mp);
/* now give me my vnode, it gets returned to me locked */
error = VFS_FHTOVP(mp, &fhp.fh_fid, LK_EXCLUSIVE, &vp);
vfs_unbusy(mp);
- if (error)
- goto out;
+ if (error != 0)
+ return (error);
+
+ error = falloc_noinstall(td, &fp);
+ if (error != 0) {
+ vput(vp);
+ return (error);
+ }
/*
- * from now on we have to make sure not
- * to forget about the vnode
- * any error that causes an abort must vput(vp)
- * just set error = err and 'goto bad;'.
+ * An extra reference on `fp' has been held for us by
+ * falloc_noinstall().
*/
- /*
- * from vn_open
- */
- if (vp->v_type == VLNK) {
- error = EMLINK;
+#ifdef INVARIANTS
+ td->td_dupfd = -1;
+#endif
+ error = vn_open_vnode(vp, fmode, td->td_ucred, td, fp);
+ if (error != 0) {
+ KASSERT(fp->f_ops == &badfileops,
+ ("VOP_OPEN in fhopen() set f_ops"));
+ KASSERT(td->td_dupfd < 0,
+ ("fhopen() encountered fdopen()"));
+
+ vput(vp);
goto bad;
}
- if (vp->v_type == VSOCK) {
- error = EOPNOTSUPP;
- goto bad;
- }
- if (vp->v_type != VDIR && fmode & O_DIRECTORY) {
- error = ENOTDIR;
- goto bad;
- }
- accmode = 0;
- if (fmode & (FWRITE | O_TRUNC)) {
- if (vp->v_type == VDIR) {
- error = EISDIR;
- goto bad;
- }
- error = vn_writechk(vp);
- if (error)
- goto bad;
- accmode |= VWRITE;
- }
- if (fmode & FREAD)
- accmode |= VREAD;
- if ((fmode & O_APPEND) && (fmode & FWRITE))
- accmode |= VAPPEND;
-#ifdef MAC
- error = mac_vnode_check_open(td->td_ucred, vp, accmode);
- if (error)
- goto bad;
+#ifdef INVARIANTS
+ td->td_dupfd = 0;
#endif
- if (accmode) {
- error = VOP_ACCESS(vp, accmode, td->td_ucred, td);
- if (error)
+ fp->f_vnode = vp;
+ fp->f_seqcount = 1;
+ finit(fp, (fmode & FMASK) | (fp->f_flag & FHASLOCK), DTYPE_VNODE, vp,
+ &vnops);
+ VOP_UNLOCK(vp, 0);
+ if ((fmode & O_TRUNC) != 0) {
+ error = fo_truncate(fp, 0, td->td_ucred, td);
+ if (error != 0)
goto bad;
}
- if (fmode & O_TRUNC) {
- vfs_ref(mp);
- VOP_UNLOCK(vp, 0); /* XXX */
- if ((error = vn_start_write(NULL, &mp, V_WAIT | PCATCH)) != 0) {
- vrele(vp);
- vfs_rel(mp);
- goto out;
- }
- vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); /* XXX */
- vfs_rel(mp);
-#ifdef MAC
- /*
- * We don't yet have fp->f_cred, so use td->td_ucred, which
- * should be right.
- */
- error = mac_vnode_check_write(td->td_ucred, td->td_ucred, vp);
- if (error == 0) {
-#endif
- VATTR_NULL(vap);
- vap->va_size = 0;
- error = VOP_SETATTR(vp, vap, td->td_ucred);
-#ifdef MAC
- }
-#endif
- vn_finished_write(mp);
- if (error)
- goto bad;
- }
- error = VOP_OPEN(vp, fmode, td->td_ucred, td, NULL);
- if (error)
- goto bad;
- if (fmode & FWRITE)
- vp->v_writecount++;
-
- /*
- * end of vn_open code
- */
-
- if ((error = falloc(td, &nfp, &indx, fmode)) != 0) {
- if (fmode & FWRITE)
- vp->v_writecount--;
- goto bad;
- }
- /* An extra reference on `nfp' has been held for us by falloc(). */
- fp = nfp;
- nfp->f_vnode = vp;
- finit(nfp, fmode & FMASK, DTYPE_VNODE, vp, &vnops);
- if (fmode & (O_EXLOCK | O_SHLOCK)) {
- lf.l_whence = SEEK_SET;
- lf.l_start = 0;
- lf.l_len = 0;
- if (fmode & O_EXLOCK)
- lf.l_type = F_WRLCK;
- else
- lf.l_type = F_RDLCK;
- type = F_FLOCK;
- if ((fmode & FNONBLOCK) == 0)
- type |= F_WAIT;
- VOP_UNLOCK(vp, 0);
- if ((error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf,
- type)) != 0) {
- /*
- * The lock request failed. Normally close the
- * descriptor but handle the case where someone might
- * have dup()d or close()d it when we weren't looking.
- */
- fdclose(fdp, fp, indx, td);
-
- /*
- * release our private reference
- */
- fdrop(fp, td);
- goto out;
- }
- vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
- atomic_set_int(&fp->f_flag, FHASLOCK);
- }
-
- VOP_UNLOCK(vp, 0);
+ error = finstall(td, fp, &indx, fmode, NULL);
+bad:
fdrop(fp, td);
- VFS_UNLOCK_GIANT(vfslocked);
td->td_retval[0] = indx;
- return (0);
-
-bad:
- vput(vp);
-out:
- VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
@@ -4687,33 +4530,36 @@
} */ *uap;
{
struct stat sb;
- fhandle_t fh;
+ struct fhandle fh;
+ int error;
+
+ error = copyin(uap->u_fhp, &fh, sizeof(fh));
+ if (error != 0)
+ return (error);
+ error = kern_fhstat(td, fh, &sb);
+ if (error == 0)
+ error = copyout(&sb, uap->sb, sizeof(sb));
+ return (error);
+}
+
+int
+kern_fhstat(struct thread *td, struct fhandle fh, struct stat *sb)
+{
struct mount *mp;
struct vnode *vp;
- int vfslocked;
int error;
error = priv_check(td, PRIV_VFS_FHSTAT);
- if (error)
+ if (error != 0)
return (error);
- error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
- if (error)
- return (error);
if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
return (ESTALE);
- vfslocked = VFS_LOCK_GIANT(mp);
error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
vfs_unbusy(mp);
- if (error) {
- VFS_UNLOCK_GIANT(vfslocked);
+ if (error != 0)
return (error);
- }
- error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td);
+ error = vn_stat(vp, sb, td->td_ucred, NOCRED, td);
vput(vp);
- VFS_UNLOCK_GIANT(vfslocked);
- if (error)
- return (error);
- error = copyout(&sb, uap->sb, sizeof(sb));
return (error);
}
@@ -4739,10 +4585,10 @@
int error;
error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
- if (error)
+ if (error != 0)
return (error);
error = kern_fhstatfs(td, fh, &sf);
- if (error)
+ if (error != 0)
return (error);
return (copyout(&sf, uap->buf, sizeof(sf)));
}
@@ -4753,28 +4599,25 @@
struct statfs *sp;
struct mount *mp;
struct vnode *vp;
- int vfslocked;
int error;
error = priv_check(td, PRIV_VFS_FHSTATFS);
- if (error)
+ if (error != 0)
return (error);
if ((mp = vfs_busyfs(&fh.fh_fsid)) == NULL)
return (ESTALE);
- vfslocked = VFS_LOCK_GIANT(mp);
error = VFS_FHTOVP(mp, &fh.fh_fid, LK_EXCLUSIVE, &vp);
- if (error) {
+ if (error != 0) {
vfs_unbusy(mp);
- VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
vput(vp);
error = prison_canseemount(td->td_ucred, mp);
- if (error)
+ if (error != 0)
goto out;
#ifdef MAC
error = mac_mount_check_stat(td->td_ucred, mp);
- if (error)
+ if (error != 0)
goto out;
#endif
/*
@@ -4789,7 +4632,6 @@
*buf = *sp;
out:
vfs_unbusy(mp);
- VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
@@ -4799,44 +4641,35 @@
struct file *fp;
struct mount *mp;
struct vnode *vp;
+ cap_rights_t rights;
off_t olen, ooffset;
- int error, vfslocked;
+ int error;
- fp = NULL;
- vfslocked = 0;
- error = fget(td, fd, CAP_WRITE, &fp);
+ if (offset < 0 || len <= 0)
+ return (EINVAL);
+ /* Check for wrap. */
+ if (offset > OFF_MAX - len)
+ return (EFBIG);
+ error = fget(td, fd, cap_rights_init(&rights, CAP_WRITE), &fp);
if (error != 0)
- goto out;
-
- switch (fp->f_type) {
- case DTYPE_VNODE:
- break;
- case DTYPE_PIPE:
- case DTYPE_FIFO:
+ return (error);
+ if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) {
error = ESPIPE;
goto out;
- default:
- error = ENODEV;
- goto out;
}
if ((fp->f_flag & FWRITE) == 0) {
error = EBADF;
goto out;
}
+ if (fp->f_type != DTYPE_VNODE) {
+ error = ENODEV;
+ goto out;
+ }
vp = fp->f_vnode;
if (vp->v_type != VREG) {
error = ENODEV;
goto out;
}
- if (offset < 0 || len <= 0) {
- error = EINVAL;
- goto out;
- }
- /* Check for wrap. */
- if (offset > OFF_MAX - len) {
- error = EFBIG;
- goto out;
- }
/* Allocating blocks may take a long time, so iterate. */
for (;;) {
@@ -4844,17 +4677,13 @@
ooffset = offset;
bwillwrite();
- vfslocked = VFS_LOCK_GIANT(vp->v_mount);
mp = NULL;
error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
- if (error != 0) {
- VFS_UNLOCK_GIANT(vfslocked);
+ if (error != 0)
break;
- }
error = vn_lock(vp, LK_EXCLUSIVE);
if (error != 0) {
vn_finished_write(mp);
- VFS_UNLOCK_GIANT(vfslocked);
break;
}
#ifdef MAC
@@ -4864,7 +4693,6 @@
error = VOP_ALLOCATE(vp, &offset, &len);
VOP_UNLOCK(vp, 0);
vn_finished_write(mp);
- VFS_UNLOCK_GIANT(vfslocked);
if (olen + ooffset != offset + len) {
panic("offset + len changed from %jx/%jx to %jx/%jx",
@@ -4876,8 +4704,7 @@
maybe_yield();
}
out:
- if (fp != NULL)
- fdrop(fp, td);
+ fdrop(fp, td);
return (error);
}
@@ -4885,7 +4712,9 @@
sys_posix_fallocate(struct thread *td, struct posix_fallocate_args *uap)
{
- return (kern_posix_fallocate(td, uap->fd, uap->offset, uap->len));
+ td->td_retval[0] = kern_posix_fallocate(td, uap->fd, uap->offset,
+ uap->len);
+ return (0);
}
/*
@@ -4901,6 +4730,7 @@
struct fadvise_info *fa, *new;
struct file *fp;
struct vnode *vp;
+ cap_rights_t rights;
off_t end;
int error;
@@ -4921,18 +4751,14 @@
return (EINVAL);
}
/* XXX: CAP_POSIX_FADVISE? */
- error = fget(td, fd, 0, &fp);
+ error = fget(td, fd, cap_rights_init(&rights), &fp);
if (error != 0)
goto out;
-
- switch (fp->f_type) {
- case DTYPE_VNODE:
- break;
- case DTYPE_PIPE:
- case DTYPE_FIFO:
+ if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0) {
error = ESPIPE;
goto out;
- default:
+ }
+ if (fp->f_type != DTYPE_VNODE) {
error = ENODEV;
goto out;
}
@@ -4988,7 +4814,7 @@
new = fa;
fp->f_advice = NULL;
} else if (offset <= fa->fa_start &&
- end >= fa->fa_start)
+ end >= fa->fa_start)
fa->fa_start = end + 1;
else if (offset <= fa->fa_end && end >= fa->fa_end)
fa->fa_end = offset - 1;
@@ -5023,6 +4849,7 @@
sys_posix_fadvise(struct thread *td, struct posix_fadvise_args *uap)
{
- return (kern_posix_fadvise(td, uap->fd, uap->offset, uap->len,
- uap->advice));
+ td->td_retval[0] = kern_posix_fadvise(td, uap->fd, uap->offset,
+ uap->len, uap->advice);
+ return (0);
}