[Midnightbsd-cvs] src [9944] trunk/sys/kern: sync with freebsd 10-stable
laffer1 at midnightbsd.org
Fri May 25 16:46:52 EDT 2018
Revision: 9944
http://svnweb.midnightbsd.org/src/?rev=9944
Author: laffer1
Date: 2018-05-25 16:46:51 -0400 (Fri, 25 May 2018)
Log Message:
-----------
sync with freebsd 10-stable
Modified Paths:
--------------
trunk/sys/kern/kern_cons.c
trunk/sys/kern/kern_cpu.c
trunk/sys/kern/kern_cpuset.c
trunk/sys/kern/kern_descrip.c
trunk/sys/kern/kern_environment.c
trunk/sys/kern/kern_et.c
trunk/sys/kern/kern_event.c
trunk/sys/kern/kern_exec.c
trunk/sys/kern/kern_exit.c
trunk/sys/kern/kern_fail.c
trunk/sys/kern/kern_fork.c
trunk/sys/kern/kern_gzio.c
trunk/sys/kern/kern_hhook.c
trunk/sys/kern/kern_intr.c
trunk/sys/kern/kern_jail.c
trunk/sys/kern/kern_khelp.c
trunk/sys/kern/kern_kthread.c
trunk/sys/kern/kern_ktr.c
Modified: trunk/sys/kern/kern_cons.c
===================================================================
--- trunk/sys/kern/kern_cons.c 2018-05-25 20:39:59 UTC (rev 9943)
+++ trunk/sys/kern/kern_cons.c 2018-05-25 20:46:51 UTC (rev 9944)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1988 University of Utah.
* Copyright (c) 1991 The Regents of the University of California.
@@ -38,9 +39,10 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/kern_cons.c 283333 2015-05-23 22:34:25Z ian $");
#include "opt_ddb.h"
+#include "opt_syscons.h"
#include <sys/param.h>
#include <sys/systm.h>
@@ -155,6 +157,13 @@
* Make the best console the preferred console.
*/
cnselect(best_cn);
+
+#ifdef EARLY_PRINTF
+ /*
+ * Release early console.
+ */
+ early_putc = NULL;
+#endif
}
void
@@ -432,10 +441,8 @@
case '\b':
case '\177':
if (lp > cp) {
- if (visible) {
- cnputc(c);
- cnputs(" \b");
- }
+ if (visible)
+ cnputs("\b \b");
lp--;
}
continue;
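A note on the erase-character hunk above: the old code echoed the erase as cnputc(c) followed by cnputs(" \b"); the new single "\b \b" sequence backs the cursor up, overwrites the echoed character with a space, and backs up again, erasing it in one cnputs() call without echoing the control character itself.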
@@ -466,6 +473,15 @@
struct consdev *cn;
char *cp;
+#ifdef EARLY_PRINTF
+ if (early_putc != NULL) {
+ if (c == '\n')
+ early_putc('\r');
+ early_putc(c);
+ return;
+ }
+#endif
+
if (cn_mute || c == '\0')
return;
STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) {
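The EARLY_PRINTF hunks (here and in cninit() above) route console output through the early_putc function pointer until a real console has been selected, at which point cninit() clears the hook. A minimal sketch of how platform code might install it, assuming options EARLY_PRINTF is configured; the MMIO address and helper name below are hypothetical, not from this commit:

	#include <sys/types.h>
	#include <sys/systm.h>

	#define	MY_EARLY_UART_THR	0x10000000UL	/* hypothetical UART data register */

	static void
	my_early_putc(int c)
	{
		/* no flow control in this sketch; just store the byte */
		*(volatile uint8_t *)MY_EARLY_UART_THR = (uint8_t)c;
	}

	/* done very early in platform startup, before cninit() runs */
	early_putc = my_early_putc;

Until cninit() clears the pointer, cnputc() above short-circuits to it, adding the usual '\r' before '\n'.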
@@ -497,6 +513,13 @@
int unlock_reqd = 0;
if (use_cnputs_mtx) {
+ /*
+ * NOTE: Debug prints and/or witness printouts in
+ * console driver clients can cause the "cnputs_mtx"
+ * mutex to recurse. Simply return if that happens.
+ */
+ if (mtx_owned(&cnputs_mtx))
+ return;
mtx_lock_spin(&cnputs_mtx);
unlock_reqd = 1;
}
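The mtx_owned() guard added above covers a real recursion path: cnputs() takes the cnputs_mtx spin lock and then calls into a console driver, and if that driver (or WITNESS on its behalf) prints anything, the output funnels straight back into cnputs() on the same thread. Roughly:

	cnputs() -> mtx_lock_spin(&cnputs_mtx)
	    -> console driver putc routine
	        -> debug/witness printf() -> cnputs()	/* would recurse on the spin lock */

Returning early silently drops the nested output instead of tripping over a non-recursive spin mutex.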
@@ -641,3 +664,63 @@
#endif
+/*
+ * Temporary support for sc(4) to vt(4) transition.
+ */
+static unsigned vty_prefer;
+static char vty_name[16];
+SYSCTL_STRING(_kern, OID_AUTO, vty, CTLFLAG_RDTUN, vty_name, 0,
+ "Console vty driver");
+
+int
+vty_enabled(unsigned vty)
+{
+ static unsigned vty_selected = 0;
+
+ if (vty_selected == 0) {
+ TUNABLE_STR_FETCH("kern.vty", vty_name, sizeof(vty_name));
+ do {
+#if defined(DEV_SC)
+ if (strcmp(vty_name, "sc") == 0) {
+ vty_selected = VTY_SC;
+ break;
+ }
+#endif
+#if defined(DEV_VT)
+ if (strcmp(vty_name, "vt") == 0) {
+ vty_selected = VTY_VT;
+ break;
+ }
+#endif
+ if (vty_prefer != 0) {
+ vty_selected = vty_prefer;
+ break;
+ }
+#if defined(DEV_SC)
+ vty_selected = VTY_SC;
+#elif defined(DEV_VT)
+ vty_selected = VTY_VT;
+#endif
+ } while (0);
+
+ if (vty_selected == VTY_VT)
+ strcpy(vty_name, "vt");
+ else if (vty_selected == VTY_SC)
+ strcpy(vty_name, "sc");
+ }
+ return ((vty_selected & vty) != 0);
+}
+
+void
+vty_set_preferred(unsigned vty)
+{
+
+ vty_prefer = vty;
+#if !defined(DEV_SC)
+ vty_prefer &= ~VTY_SC;
+#endif
+#if !defined(DEV_VT)
+ vty_prefer &= ~VTY_VT;
+#endif
+}
+
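The sc(4)/vt(4) transition shim above is driven by the kern.vty loader tunable (note the CTLFLAG_RDTUN), so the choice is made once, at boot, e.g. in /boot/loader.conf:

	kern.vty="vt"

A console driver compiled alongside its rival would then typically gate its attach path on something like:

	if (!vty_enabled(VTY_VT))
		return;		/* the other terminal emulator was selected */

If the tunable is unset, a preference registered via vty_set_preferred() breaks the tie, and failing that the #ifdef fallbacks prefer sc(4) when both drivers are present.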
Modified: trunk/sys/kern/kern_cpu.c
===================================================================
--- trunk/sys/kern/kern_cpu.c 2018-05-25 20:39:59 UTC (rev 9943)
+++ trunk/sys/kern/kern_cpu.c 2018-05-25 20:46:51 UTC (rev 9944)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 2004-2007 Nate Lawson (SDG)
* All rights reserved.
@@ -25,7 +26,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/kern_cpu.c 266165 2014-05-15 18:07:35Z cperciva $");
#include <sys/param.h>
#include <sys/bus.h>
@@ -135,7 +136,8 @@
static int cf_verbose;
TUNABLE_INT("debug.cpufreq.lowest", &cf_lowest_freq);
TUNABLE_INT("debug.cpufreq.verbose", &cf_verbose);
-static SYSCTL_NODE(_debug, OID_AUTO, cpufreq, CTLFLAG_RD, NULL, "cpufreq debugging");
+static SYSCTL_NODE(_debug, OID_AUTO, cpufreq, CTLFLAG_RD, NULL,
+ "cpufreq debugging");
SYSCTL_INT(_debug_cpufreq, OID_AUTO, lowest, CTLFLAG_RW, &cf_lowest_freq, 1,
"Don't provide levels below this frequency.");
SYSCTL_INT(_debug_cpufreq, OID_AUTO, verbose, CTLFLAG_RW, &cf_verbose, 1,
@@ -267,7 +269,7 @@
* switching the main CPU. XXXTODO: Need to think more about how to
* handle having different CPUs at different frequencies.
*/
- if (mp_ncpus > 1 && !smp_active) {
+ if (mp_ncpus > 1 && !smp_started) {
device_printf(dev, "rejecting change, SMP not started yet\n");
error = ENXIO;
goto out;
@@ -1037,6 +1039,7 @@
if (cf_dev == NULL) {
device_printf(dev,
"warning: cpufreq_unregister called with no cpufreq device active\n");
+ free(devs, M_TEMP);
return (0);
}
cfcount = 0;
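Besides the ident change and the smp_active -> smp_started correction (smp_started is the flag indicating the secondary CPUs have actually been launched), the last kern_cpu.c hunk plugs a small leak: cpufreq_unregister() obtains its device list as an array malloc'ed with M_TEMP (by devclass_get_devices()), so the cf_dev == NULL early return must release it. A rough sketch of the fixed path:

	error = devclass_get_devices(cpufreq_dc, &devs, &devcount);
	...
	if (cf_dev == NULL) {
		device_printf(dev, "warning: ...\n");
		free(devs, M_TEMP);	/* new: don't leak the array on early return */
		return (0);
	}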
Modified: trunk/sys/kern/kern_cpuset.c
===================================================================
--- trunk/sys/kern/kern_cpuset.c 2018-05-25 20:39:59 UTC (rev 9943)
+++ trunk/sys/kern/kern_cpuset.c 2018-05-25 20:46:51 UTC (rev 9944)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 2008, Jeffrey Roberson <jeff at freebsd.org>
* All rights reserved.
@@ -29,7 +30,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/kern_cpuset.c 273736 2014-10-27 14:38:00Z hselasky $");
#include "opt_ddb.h"
@@ -110,7 +111,7 @@
/* Return the size of cpuset_t at the kernel level */
SYSCTL_INT(_kern_sched, OID_AUTO, cpusetsize, CTLFLAG_RD,
- 0, sizeof(cpuset_t), "sizeof(cpuset_t)");
+ SYSCTL_NULL_INT_PTR, sizeof(cpuset_t), "sizeof(cpuset_t)");
cpuset_t *cpuset_root;
@@ -303,7 +304,7 @@
* empty as well as RDONLY flags.
*/
static int
-cpuset_testupdate(struct cpuset *set, cpuset_t *mask)
+cpuset_testupdate(struct cpuset *set, cpuset_t *mask, int check_mask)
{
struct cpuset *nset;
cpuset_t newmask;
@@ -312,13 +313,16 @@
mtx_assert(&cpuset_lock, MA_OWNED);
if (set->cs_flags & CPU_SET_RDONLY)
return (EPERM);
- if (!CPU_OVERLAP(&set->cs_mask, mask))
- return (EDEADLK);
- CPU_COPY(&set->cs_mask, &newmask);
- CPU_AND(&newmask, mask);
+ if (check_mask) {
+ if (!CPU_OVERLAP(&set->cs_mask, mask))
+ return (EDEADLK);
+ CPU_COPY(&set->cs_mask, &newmask);
+ CPU_AND(&newmask, mask);
+ } else
+ CPU_COPY(mask, &newmask);
error = 0;
LIST_FOREACH(nset, &set->cs_children, cs_siblings)
- if ((error = cpuset_testupdate(nset, &newmask)) != 0)
+ if ((error = cpuset_testupdate(nset, &newmask, 1)) != 0)
break;
return (error);
}
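The new check_mask argument splits two cases: for the set whose mask is being replaced (the root of the recursion, called with check_mask == 0 from cpuset_modify() below) the old and new masks need not overlap, since the new mask simply supersedes the old one; children are still validated against the intersection (check_mask == 1). For a hypothetical move of a set from CPUs 0-1 to CPUs 2-3:

	old cs_mask = 0x3, new mask = 0xc	/* disjoint */
	old code:  CPU_OVERLAP() fails -> EDEADLK, update rejected
	new code:  the root skips the overlap test; children are
	           checked and updated against 0xc

cpuset_modify() is reordered to match: the new mask is installed before cpuset_update() recurses over the children.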
@@ -370,11 +374,11 @@
if (root && !CPU_SUBSET(&root->cs_mask, mask))
return (EINVAL);
mtx_lock_spin(&cpuset_lock);
- error = cpuset_testupdate(set, mask);
+ error = cpuset_testupdate(set, mask, 0);
if (error)
goto out;
+ CPU_COPY(mask, &set->cs_mask);
cpuset_update(set, mask);
- CPU_COPY(mask, &set->cs_mask);
out:
mtx_unlock_spin(&cpuset_lock);
@@ -618,26 +622,6 @@
}
/*
- * Calculate the ffs() of the cpuset.
- */
-int
-cpusetobj_ffs(const cpuset_t *set)
-{
- size_t i;
- int cbit;
-
- cbit = 0;
- for (i = 0; i < _NCPUWORDS; i++) {
- if (set->__bits[i] != 0) {
- cbit = ffsl(set->__bits[i]);
- cbit += i * _NCPUBITS;
- break;
- }
- }
- return (cbit);
-}
-
-/*
* Return a string representing a valid layout for a cpuset_t object.
* It expects an incoming buffer at least sized as CPUSETBUFSIZ.
*/
@@ -651,12 +635,12 @@
bytesp = 0;
bufsiz = CPUSETBUFSIZ;
- for (i = _NCPUWORDS - 1; i > 0; i--) {
- bytesp = snprintf(tbuf, bufsiz, "%lx, ", set->__bits[i]);
+ for (i = 0; i < (_NCPUWORDS - 1); i++) {
+ bytesp = snprintf(tbuf, bufsiz, "%lx,", set->__bits[i]);
bufsiz -= bytesp;
tbuf += bytesp;
}
- snprintf(tbuf, bufsiz, "%lx", set->__bits[0]);
+ snprintf(tbuf, bufsiz, "%lx", set->__bits[_NCPUWORDS - 1]);
return (buf);
}
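This changes the canonical string form of a cpuset_t: words are now printed lowest-index first, separated by a bare comma (the old code walked from the highest word down and used ", "). For a hypothetical two-word set with __bits[0] == 0xff and __bits[1] == 0x1, the old form was "1, ff" and the new form is "ff,1". The scanner in the next hunk is adjusted to the same order and separator, so the two remain round-trip compatible.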
@@ -682,16 +666,16 @@
return (-1);
CPU_ZERO(set);
- for (i = nwords - 1; i > 0; i--) {
- ret = sscanf(buf, "%lx, ", &set->__bits[i]);
+ for (i = 0; i < (nwords - 1); i++) {
+ ret = sscanf(buf, "%lx,", &set->__bits[i]);
if (ret == 0 || ret == -1)
return (-1);
- buf = strstr(buf, " ");
+ buf = strstr(buf, ",");
if (buf == NULL)
return (-1);
buf++;
}
- ret = sscanf(buf, "%lx", &set->__bits[0]);
+ ret = sscanf(buf, "%lx", &set->__bits[nwords - 1]);
if (ret == 0 || ret == -1)
return (-1);
return (0);
@@ -900,6 +884,7 @@
cpuwhich_t which;
id_t id;
cpusetid_t *setid;
+};
#endif
int
sys_cpuset_getid(struct thread *td, struct cpuset_getid_args *uap)
@@ -1146,10 +1131,27 @@
}
#ifdef DDB
+void
+ddb_display_cpuset(const cpuset_t *set)
+{
+ int cpu, once;
+
+ for (once = 0, cpu = 0; cpu < CPU_SETSIZE; cpu++) {
+ if (CPU_ISSET(cpu, set)) {
+ if (once == 0) {
+ db_printf("%d", cpu);
+ once = 1;
+ } else
+ db_printf(",%d", cpu);
+ }
+ }
+ if (once == 0)
+ db_printf("<none>");
+}
+
DB_SHOW_COMMAND(cpusets, db_show_cpusets)
{
struct cpuset *set;
- int cpu, once;
LIST_FOREACH(set, &cpuset_ids, cs_link) {
db_printf("set=%p id=%-6u ref=%-6d flags=0x%04x parent id=%d\n",
@@ -1156,15 +1158,7 @@
set, set->cs_id, set->cs_ref, set->cs_flags,
(set->cs_parent != NULL) ? set->cs_parent->cs_id : 0);
db_printf(" mask=");
- for (once = 0, cpu = 0; cpu < CPU_SETSIZE; cpu++) {
- if (CPU_ISSET(cpu, &set->cs_mask)) {
- if (once == 0) {
- db_printf("%d", cpu);
- once = 1;
- } else
- db_printf(",%d", cpu);
- }
- }
+ ddb_display_cpuset(&set->cs_mask);
db_printf("\n");
if (db_pager_quit)
break;
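The DDB changes factor the mask-printing loop out of the "show cpusets" command into ddb_display_cpuset(), so other DDB commands can render a cpuset_t identically; the output is a comma-separated CPU list, e.g.:

	mask=0,1,5

or "<none>" for an empty set. (The stray "};" added in the cpuset_getid_args hunk above closes the syscall argument struct, which the previous merge had left unterminated.)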
Modified: trunk/sys/kern/kern_descrip.c
===================================================================
--- trunk/sys/kern/kern_descrip.c 2018-05-25 20:39:59 UTC (rev 9943)
+++ trunk/sys/kern/kern_descrip.c 2018-05-25 20:46:51 UTC (rev 9944)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1982, 1986, 1989, 1991, 1993
* The Regents of the University of California. All rights reserved.
@@ -35,7 +36,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/kern_descrip.c 321020 2017-07-15 17:25:40Z dchagin $");
#include "opt_capsicum.h"
#include "opt_compat.h"
@@ -46,9 +47,10 @@
#include <sys/param.h>
#include <sys/systm.h>
-#include <sys/capability.h>
+#include <sys/capsicum.h>
#include <sys/conf.h>
#include <sys/domain.h>
+#include <sys/fail.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/filedesc.h>
@@ -55,6 +57,7 @@
#include <sys/filio.h>
#include <sys/jail.h>
#include <sys/kernel.h>
+#include <sys/ksem.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
@@ -103,41 +106,55 @@
static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table");
static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader",
- "file desc to leader structures");
+ "file desc to leader structures");
static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures");
+MALLOC_DEFINE(M_FILECAPS, "filecaps", "descriptor capabilities");
MALLOC_DECLARE(M_FADVISE);
static uma_zone_t file_zone;
+void (*ksem_info)(struct ksem *ks, char *path, size_t size, uint32_t *value);
-/* Flags for do_dup() */
-#define DUP_FIXED 0x1 /* Force fixed allocation */
-#define DUP_FCNTL 0x2 /* fcntl()-style errors */
-#define DUP_CLOEXEC 0x4 /* Atomically set FD_CLOEXEC. */
-
-static int do_dup(struct thread *td, int flags, int old, int new,
- register_t *retval);
-static int fd_first_free(struct filedesc *, int, int);
-static int fd_last_used(struct filedesc *, int, int);
-static void fdgrowtable(struct filedesc *, int);
+static int closefp(struct filedesc *fdp, int fd, struct file *fp,
+ struct thread *td, int holdleaders);
+static int fd_first_free(struct filedesc *fdp, int low, int size);
+static int fd_last_used(struct filedesc *fdp, int size);
+static void fdgrowtable(struct filedesc *fdp, int nfd);
+static void fdgrowtable_exp(struct filedesc *fdp, int nfd);
static void fdunused(struct filedesc *fdp, int fd);
static void fdused(struct filedesc *fdp, int fd);
-static int fill_vnode_info(struct vnode *vp, struct kinfo_file *kif);
-static int fill_socket_info(struct socket *so, struct kinfo_file *kif);
-static int fill_pts_info(struct tty *tp, struct kinfo_file *kif);
static int fill_pipe_info(struct pipe *pi, struct kinfo_file *kif);
static int fill_procdesc_info(struct procdesc *pdp,
- struct kinfo_file *kif);
+ struct kinfo_file *kif);
+static int fill_pts_info(struct tty *tp, struct kinfo_file *kif);
+static int fill_sem_info(struct file *fp, struct kinfo_file *kif);
static int fill_shm_info(struct file *fp, struct kinfo_file *kif);
+static int fill_socket_info(struct socket *so, struct kinfo_file *kif);
+static int fill_vnode_info(struct vnode *vp, struct kinfo_file *kif);
+static int getmaxfd(struct proc *p);
/*
- * A process is initially started out with NDFILE descriptors stored within
- * this structure, selected to be enough for typical applications based on
- * the historical limit of 20 open files (and the usage of descriptors by
- * shells). If these descriptors are exhausted, a larger descriptor table
- * may be allocated, up to a process' resource limit; the internal arrays
- * are then unused.
+ * Each process has:
+ *
+ * - An array of open file descriptors (fd_ofiles)
+ * - An array of file flags (fd_ofileflags)
+ * - A bitmap recording which descriptors are in use (fd_map)
+ *
+ * A process starts out with NDFILE descriptors. The value of NDFILE has
+ * been selected based the historical limit of 20 open files, and an
+ * assumption that the majority of processes, especially short-lived
+ * processes like shells, will never need more.
+ *
+ * If this initial allocation is exhausted, a larger descriptor table and
+ * map are allocated dynamically, and the pointers in the process's struct
+ * filedesc are updated to point to those. This is repeated every time
+ * the process runs out of file descriptors (provided it hasn't hit its
+ * resource limit).
+ *
+ * Since threads may hold references to individual descriptor table
+ * entries, the tables are never freed. Instead, they are placed on a
+ * linked list and freed only when the struct filedesc is released.
*/
#define NDFILE 20
#define NDSLOTSIZE sizeof(NDSLOTTYPE)
@@ -147,34 +164,22 @@
#define NDSLOTS(x) (((x) + NDENTRIES - 1) / NDENTRIES)
/*
- * Storage required per open file descriptor.
+ * SLIST entry used to keep track of ofiles which must be reclaimed when
+ * the process exits.
*/
-#define OFILESIZE (sizeof(struct file *) + sizeof(char))
-
-/*
- * Storage to hold unused ofiles that need to be reclaimed.
- */
struct freetable {
- struct file **ft_table;
+ struct filedescent *ft_table;
SLIST_ENTRY(freetable) ft_next;
};
/*
- * Basic allocation of descriptors:
- * one of the above, plus arrays for NDFILE descriptors.
+ * Initial allocation: a filedesc structure + the head of SLIST used to
+ * keep track of old ofiles + enough space for NDFILE descriptors.
*/
struct filedesc0 {
- struct filedesc fd_fd;
- /*
- * ofiles which need to be reclaimed on free.
- */
- SLIST_HEAD(,freetable) fd_free;
- /*
- * These arrays are used when the number of open files is
- * <= NDFILE, and are then pointed to by the pointers above.
- */
- struct file *fd_dfiles[NDFILE];
- char fd_dfileflags[NDFILE];
+ struct filedesc fd_fd;
+ SLIST_HEAD(, freetable) fd_free;
+ struct filedescent fd_dfiles[NDFILE];
NDSLOTTYPE fd_dmap[NDSLOTS(NDFILE)];
};
@@ -183,14 +188,15 @@
*/
volatile int openfiles; /* actual number of open files */
struct mtx sigio_lock; /* mtx to protect pointers to sigio */
-void (*mq_fdclose)(struct thread *td, int fd, struct file *fp);
+void (*mq_fdclose)(struct thread *td, int fd, struct file *fp);
/* A mutex to protect the association between a proc and filedesc. */
-static struct mtx fdesc_mtx;
+static struct mtx fdesc_mtx;
/*
- * Find the first zero bit in the given bitmap, starting at low and not
- * exceeding size - 1.
+ * If low >= size, just return low. Otherwise find the first zero bit in the
+ * given bitmap, starting at low and not exceeding size - 1. Return size if
+ * not found.
*/
static int
fd_first_free(struct filedesc *fdp, int low, int size)
@@ -216,19 +222,16 @@
}
/*
- * Find the highest non-zero bit in the given bitmap, starting at low and
- * not exceeding size - 1.
+ * Find the highest non-zero bit in the given bitmap, starting at 0 and
+ * not exceeding size - 1. Return -1 if not found.
*/
static int
-fd_last_used(struct filedesc *fdp, int low, int size)
+fd_last_used(struct filedesc *fdp, int size)
{
NDSLOTTYPE *map = fdp->fd_map;
NDSLOTTYPE mask;
int off, minoff;
- if (low >= size)
- return (-1);
-
off = NDSLOT(size);
if (size % NDENTRIES) {
mask = ~(~(NDSLOTTYPE)0 << (size % NDENTRIES));
@@ -236,17 +239,21 @@
return (off * NDENTRIES + flsl(mask) - 1);
--off;
}
- for (minoff = NDSLOT(low); off >= minoff; --off)
+ for (minoff = NDSLOT(0); off >= minoff; --off)
if (map[off] != 0)
return (off * NDENTRIES + flsl(map[off]) - 1);
- return (low - 1);
+ return (-1);
}
static int
fdisused(struct filedesc *fdp, int fd)
{
- KASSERT(fd >= 0 && fd < fdp->fd_nfiles,
- ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles));
+
+ FILEDESC_LOCK_ASSERT(fdp);
+
+ KASSERT(fd >= 0 && fd < fdp->fd_nfiles,
+ ("file descriptor %d out of range (0, %d)", fd, fdp->fd_nfiles));
+
return ((fdp->fd_map[NDSLOT(fd)] & NDBIT(fd)) != 0);
}
@@ -258,9 +265,9 @@
{
FILEDESC_XLOCK_ASSERT(fdp);
- KASSERT(!fdisused(fdp, fd),
- ("fd already used"));
+ KASSERT(!fdisused(fdp, fd), ("fd=%d is already used", fd));
+
fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd);
if (fd > fdp->fd_lastfile)
fdp->fd_lastfile = fd;
@@ -276,19 +283,58 @@
{
FILEDESC_XLOCK_ASSERT(fdp);
- KASSERT(fdisused(fdp, fd),
- ("fd is already unused"));
- KASSERT(fdp->fd_ofiles[fd] == NULL,
- ("fd is still in use"));
+ KASSERT(fdisused(fdp, fd), ("fd=%d is already unused", fd));
+ KASSERT(fdp->fd_ofiles[fd].fde_file == NULL,
+ ("fd=%d is still in use", fd));
+
fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd);
if (fd < fdp->fd_freefile)
fdp->fd_freefile = fd;
if (fd == fdp->fd_lastfile)
- fdp->fd_lastfile = fd_last_used(fdp, 0, fd);
+ fdp->fd_lastfile = fd_last_used(fdp, fd);
}
/*
+ * Free a file descriptor.
+ *
+ * Avoid some work if fdp is about to be destroyed.
+ */
+static inline void
+_fdfree(struct filedesc *fdp, int fd, int last)
+{
+ struct filedescent *fde;
+
+ fde = &fdp->fd_ofiles[fd];
+#ifdef CAPABILITIES
+ if (!last)
+ seq_write_begin(&fde->fde_seq);
+#endif
+ filecaps_free(&fde->fde_caps);
+ if (last)
+ return;
+ bzero(fde, fde_change_size);
+ fdunused(fdp, fd);
+#ifdef CAPABILITIES
+ seq_write_end(&fde->fde_seq);
+#endif
+}
+
+static inline void
+fdfree(struct filedesc *fdp, int fd)
+{
+
+ _fdfree(fdp, fd, 0);
+}
+
+static inline void
+fdfree_last(struct filedesc *fdp, int fd)
+{
+
+ _fdfree(fdp, fd, 1);
+}
+
+/*
* System calls on descriptors.
*/
#ifndef _SYS_SYSPROTO_H_
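The _fdfree()/fdfree()/fdfree_last() trio above is the writer side of a sequence-counter protocol under CAPABILITIES: writers bump fde_seq around any mutation of a filedescent, and lockless readers such as fget_unlocked() snapshot the counter, copy the entry, and retry if the counter moved. A minimal reader-side sketch under that assumption (not the literal fget_unlocked() body):

	seq_t seq;
	struct file *fp;

	for (;;) {
		seq = seq_read(&fde->fde_seq);	/* waits out in-progress writes */
		fp = fde->fde_file;
		/* ... validate fp and copy fde_caps ... */
		if (seq_consistent(&fde->fde_seq, seq))
			break;		/* snapshot was stable; use it */
		/* a writer raced us; retry */
	}

fdfree_last() skips the bookkeeping (and the seq bracketing) because it runs only when the whole table is being torn down and no other thread can observe the entry.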
@@ -364,15 +410,21 @@
int
sys_fcntl(struct thread *td, struct fcntl_args *uap)
{
+
+ return (kern_fcntl_freebsd(td, uap->fd, uap->cmd, uap->arg));
+}
+
+int
+kern_fcntl_freebsd(struct thread *td, int fd, int cmd, long arg)
+{
struct flock fl;
struct __oflock ofl;
- intptr_t arg;
- int error;
- int cmd;
+ intptr_t arg1;
+ int error, newcmd;
error = 0;
- cmd = uap->cmd;
- switch (uap->cmd) {
+ newcmd = cmd;
+ switch (cmd) {
case F_OGETLK:
case F_OSETLK:
case F_OSETLKW:
@@ -379,7 +431,7 @@
/*
* Convert old flock structure to new.
*/
- error = copyin((void *)(intptr_t)uap->arg, &ofl, sizeof(ofl));
+ error = copyin((void *)(intptr_t)arg, &ofl, sizeof(ofl));
fl.l_start = ofl.l_start;
fl.l_len = ofl.l_len;
fl.l_pid = ofl.l_pid;
@@ -387,96 +439,62 @@
fl.l_whence = ofl.l_whence;
fl.l_sysid = 0;
- switch (uap->cmd) {
+ switch (cmd) {
case F_OGETLK:
- cmd = F_GETLK;
- break;
+ newcmd = F_GETLK;
+ break;
case F_OSETLK:
- cmd = F_SETLK;
- break;
+ newcmd = F_SETLK;
+ break;
case F_OSETLKW:
- cmd = F_SETLKW;
- break;
+ newcmd = F_SETLKW;
+ break;
}
- arg = (intptr_t)&fl;
+ arg1 = (intptr_t)&fl;
break;
- case F_GETLK:
- case F_SETLK:
- case F_SETLKW:
+ case F_GETLK:
+ case F_SETLK:
+ case F_SETLKW:
case F_SETLK_REMOTE:
- error = copyin((void *)(intptr_t)uap->arg, &fl, sizeof(fl));
- arg = (intptr_t)&fl;
- break;
+ error = copyin((void *)(intptr_t)arg, &fl, sizeof(fl));
+ arg1 = (intptr_t)&fl;
+ break;
default:
- arg = uap->arg;
+ arg1 = arg;
break;
}
if (error)
return (error);
- error = kern_fcntl(td, uap->fd, cmd, arg);
+ error = kern_fcntl(td, fd, newcmd, arg1);
if (error)
return (error);
- if (uap->cmd == F_OGETLK) {
+ if (cmd == F_OGETLK) {
ofl.l_start = fl.l_start;
ofl.l_len = fl.l_len;
ofl.l_pid = fl.l_pid;
ofl.l_type = fl.l_type;
ofl.l_whence = fl.l_whence;
- error = copyout(&ofl, (void *)(intptr_t)uap->arg, sizeof(ofl));
- } else if (uap->cmd == F_GETLK) {
- error = copyout(&fl, (void *)(intptr_t)uap->arg, sizeof(fl));
+ error = copyout(&ofl, (void *)(intptr_t)arg, sizeof(ofl));
+ } else if (cmd == F_GETLK) {
+ error = copyout(&fl, (void *)(intptr_t)arg, sizeof(fl));
}
return (error);
}
-static inline struct file *
-fdtofp(int fd, struct filedesc *fdp)
-{
- struct file *fp;
-
- FILEDESC_LOCK_ASSERT(fdp);
- if ((unsigned)fd >= fdp->fd_nfiles ||
- (fp = fdp->fd_ofiles[fd]) == NULL)
- return (NULL);
- return (fp);
-}
-
-static inline int
-fdunwrap(int fd, cap_rights_t rights, struct filedesc *fdp, struct file **fpp)
-{
-
- *fpp = fdtofp(fd, fdp);
- if (*fpp == NULL)
- return (EBADF);
-
-#ifdef CAPABILITIES
- if ((*fpp)->f_type == DTYPE_CAPABILITY) {
- int err = cap_funwrap(*fpp, rights, fpp);
- if (err != 0) {
- *fpp = NULL;
- return (err);
- }
- }
-#endif /* CAPABILITIES */
- return (0);
-}
-
int
kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
{
struct filedesc *fdp;
struct flock *flp;
- struct file *fp;
+ struct file *fp, *fp2;
+ struct filedescent *fde;
struct proc *p;
- char *pop;
struct vnode *vp;
+ cap_rights_t rights;
int error, flg, tmp;
- int vfslocked;
- u_int old, new;
uint64_t bsize;
off_t foffset;
- vfslocked = 0;
error = 0;
flg = F_POSIX;
p = td->td_proc;
@@ -507,49 +525,44 @@
case F_GETFD:
FILEDESC_SLOCK(fdp);
- if ((fp = fdtofp(fd, fdp)) == NULL) {
+ if ((fp = fget_locked(fdp, fd)) == NULL) {
FILEDESC_SUNLOCK(fdp);
error = EBADF;
break;
}
- pop = &fdp->fd_ofileflags[fd];
- td->td_retval[0] = (*pop & UF_EXCLOSE) ? FD_CLOEXEC : 0;
+ fde = &fdp->fd_ofiles[fd];
+ td->td_retval[0] =
+ (fde->fde_flags & UF_EXCLOSE) ? FD_CLOEXEC : 0;
FILEDESC_SUNLOCK(fdp);
break;
case F_SETFD:
FILEDESC_XLOCK(fdp);
- if ((fp = fdtofp(fd, fdp)) == NULL) {
+ if ((fp = fget_locked(fdp, fd)) == NULL) {
FILEDESC_XUNLOCK(fdp);
error = EBADF;
break;
}
- pop = &fdp->fd_ofileflags[fd];
- *pop = (*pop &~ UF_EXCLOSE) |
+ fde = &fdp->fd_ofiles[fd];
+ fde->fde_flags = (fde->fde_flags & ~UF_EXCLOSE) |
(arg & FD_CLOEXEC ? UF_EXCLOSE : 0);
FILEDESC_XUNLOCK(fdp);
break;
case F_GETFL:
- FILEDESC_SLOCK(fdp);
- error = fdunwrap(fd, CAP_FCNTL, fdp, &fp);
- if (error != 0) {
- FILEDESC_SUNLOCK(fdp);
+ error = fget_unlocked(fdp, fd,
+ cap_rights_init(&rights, CAP_FCNTL), F_GETFL, &fp, NULL);
+ if (error != 0)
break;
- }
td->td_retval[0] = OFLAGS(fp->f_flag);
- FILEDESC_SUNLOCK(fdp);
+ fdrop(fp, td);
break;
case F_SETFL:
- FILEDESC_SLOCK(fdp);
- error = fdunwrap(fd, CAP_FCNTL, fdp, &fp);
- if (error != 0) {
- FILEDESC_SUNLOCK(fdp);
+ error = fget_unlocked(fdp, fd,
+ cap_rights_init(&rights, CAP_FCNTL), F_SETFL, &fp, NULL);
+ if (error != 0)
break;
- }
- fhold(fp);
- FILEDESC_SUNLOCK(fdp);
do {
tmp = flg = fp->f_flag;
tmp &= ~FCNTLFLAGS;
@@ -557,7 +570,7 @@
} while(atomic_cmpset_int(&fp->f_flag, flg, tmp) == 0);
tmp = fp->f_flag & FNONBLOCK;
error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
- if (error) {
+ if (error != 0) {
fdrop(fp, td);
break;
}
@@ -574,14 +587,10 @@
break;
case F_GETOWN:
- FILEDESC_SLOCK(fdp);
- error = fdunwrap(fd, CAP_FCNTL, fdp, &fp);
- if (error != 0) {
- FILEDESC_SUNLOCK(fdp);
+ error = fget_unlocked(fdp, fd,
+ cap_rights_init(&rights, CAP_FCNTL), F_GETOWN, &fp, NULL);
+ if (error != 0)
break;
- }
- fhold(fp);
- FILEDESC_SUNLOCK(fdp);
error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td);
if (error == 0)
td->td_retval[0] = tmp;
@@ -589,14 +598,10 @@
break;
case F_SETOWN:
- FILEDESC_SLOCK(fdp);
- error = fdunwrap(fd, CAP_FCNTL, fdp, &fp);
- if (error != 0) {
- FILEDESC_SUNLOCK(fdp);
+ error = fget_unlocked(fdp, fd,
+ cap_rights_init(&rights, CAP_FCNTL), F_SETOWN, &fp, NULL);
+ if (error != 0)
break;
- }
- fhold(fp);
- FILEDESC_SUNLOCK(fdp);
tmp = arg;
error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td);
fdrop(fp, td);
@@ -615,17 +620,16 @@
case F_SETLK:
do_setlk:
- FILEDESC_SLOCK(fdp);
- error = fdunwrap(fd, CAP_FLOCK, fdp, &fp);
- if (error != 0) {
- FILEDESC_SUNLOCK(fdp);
+ cap_rights_init(&rights, CAP_FLOCK);
+ error = fget_unlocked(fdp, fd, &rights, 0, &fp, NULL);
+ if (error != 0)
break;
- }
if (fp->f_type != DTYPE_VNODE) {
- FILEDESC_SUNLOCK(fdp);
error = EBADF;
+ fdrop(fp, td);
break;
}
+
flp = (struct flock *)arg;
if (flp->l_whence == SEEK_CUR) {
foffset = foffset_get(fp);
@@ -634,18 +638,13 @@
foffset > OFF_MAX - flp->l_start)) {
FILEDESC_SUNLOCK(fdp);
error = EOVERFLOW;
+ fdrop(fp, td);
break;
}
flp->l_start += foffset;
}
- /*
- * VOP_ADVLOCK() may block.
- */
- fhold(fp);
- FILEDESC_SUNLOCK(fdp);
vp = fp->f_vnode;
- vfslocked = VFS_LOCK_GIANT(vp->v_mount);
switch (flp->l_type) {
case F_RDLCK:
if ((fp->f_flag & FREAD) == 0) {
@@ -689,44 +688,60 @@
error = EINVAL;
break;
}
- VFS_UNLOCK_GIANT(vfslocked);
- vfslocked = 0;
- /* Check for race with close */
- FILEDESC_SLOCK(fdp);
- if ((unsigned) fd >= fdp->fd_nfiles ||
- fp != fdp->fd_ofiles[fd]) {
- FILEDESC_SUNLOCK(fdp);
+ if (error != 0 || flp->l_type == F_UNLCK ||
+ flp->l_type == F_UNLCKSYS) {
+ fdrop(fp, td);
+ break;
+ }
+
+ /*
+ * Check for a race with close.
+ *
+ * The vnode is now advisory locked (or unlocked, but this case
+ * is not really important) as the caller requested.
+ * We had to drop the filedesc lock, so we need to recheck if
+ * the descriptor is still valid, because if it was closed
+ * in the meantime we need to remove advisory lock from the
+ * vnode - close on any descriptor leading to an advisory
+ * locked vnode, removes that lock.
+ * We will return 0 on purpose in that case, as the result of
+ * successful advisory lock might have been externally visible
+ * already. This is fine - effectively we pretend to the caller
+ * that the closing thread was a bit slower and that the
+ * advisory lock succeeded before the close.
+ */
+ error = fget_unlocked(fdp, fd, &rights, 0, &fp2, NULL);
+ if (error != 0) {
+ fdrop(fp, td);
+ break;
+ }
+ if (fp != fp2) {
flp->l_whence = SEEK_SET;
flp->l_start = 0;
flp->l_len = 0;
flp->l_type = F_UNLCK;
- vfslocked = VFS_LOCK_GIANT(vp->v_mount);
(void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
- F_UNLCK, flp, F_POSIX);
- VFS_UNLOCK_GIANT(vfslocked);
- vfslocked = 0;
- } else
- FILEDESC_SUNLOCK(fdp);
+ F_UNLCK, flp, F_POSIX);
+ }
fdrop(fp, td);
+ fdrop(fp2, td);
break;
case F_GETLK:
- FILEDESC_SLOCK(fdp);
- error = fdunwrap(fd, CAP_FLOCK, fdp, &fp);
- if (error != 0) {
- FILEDESC_SUNLOCK(fdp);
+ error = fget_unlocked(fdp, fd,
+ cap_rights_init(&rights, CAP_FLOCK), 0, &fp, NULL);
+ if (error != 0)
break;
- }
if (fp->f_type != DTYPE_VNODE) {
- FILEDESC_SUNLOCK(fdp);
error = EBADF;
+ fdrop(fp, td);
break;
}
flp = (struct flock *)arg;
if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK &&
flp->l_type != F_UNLCK) {
- FILEDESC_SUNLOCK(fdp);
error = EINVAL;
+ fdrop(fp, td);
break;
}
if (flp->l_whence == SEEK_CUR) {
@@ -734,24 +749,17 @@
if ((flp->l_start > 0 &&
foffset > OFF_MAX - flp->l_start) ||
(flp->l_start < 0 &&
- foffset < OFF_MIN - flp->l_start)) {
+ foffset < OFF_MIN - flp->l_start)) {
FILEDESC_SUNLOCK(fdp);
error = EOVERFLOW;
+ fdrop(fp, td);
break;
}
flp->l_start += foffset;
}
- /*
- * VOP_ADVLOCK() may block.
- */
- fhold(fp);
- FILEDESC_SUNLOCK(fdp);
vp = fp->f_vnode;
- vfslocked = VFS_LOCK_GIANT(vp->v_mount);
error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp,
F_POSIX);
- VFS_UNLOCK_GIANT(vfslocked);
- vfslocked = 0;
fdrop(fp, td);
break;
@@ -759,41 +767,32 @@
arg = arg ? 128 * 1024: 0;
/* FALLTHROUGH */
case F_READAHEAD:
- FILEDESC_SLOCK(fdp);
- if ((fp = fdtofp(fd, fdp)) == NULL) {
- FILEDESC_SUNLOCK(fdp);
- error = EBADF;
+ error = fget_unlocked(fdp, fd, NULL, 0, &fp, NULL);
+ if (error != 0)
break;
- }
if (fp->f_type != DTYPE_VNODE) {
- FILEDESC_SUNLOCK(fdp);
+ fdrop(fp, td);
error = EBADF;
break;
}
- fhold(fp);
- FILEDESC_SUNLOCK(fdp);
- if (arg != 0) {
- vp = fp->f_vnode;
- vfslocked = VFS_LOCK_GIANT(vp->v_mount);
- error = vn_lock(vp, LK_SHARED);
- if (error != 0)
- goto readahead_vnlock_fail;
+ vp = fp->f_vnode;
+ /*
+ * Exclusive lock synchronizes against f_seqcount reads and
+ * writes in sequential_heuristic().
+ */
+ error = vn_lock(vp, LK_EXCLUSIVE);
+ if (error != 0) {
+ fdrop(fp, td);
+ break;
+ }
+ if (arg >= 0) {
bsize = fp->f_vnode->v_mount->mnt_stat.f_iosize;
- VOP_UNLOCK(vp, 0);
fp->f_seqcount = (arg + bsize - 1) / bsize;
- do {
- new = old = fp->f_flag;
- new |= FRDAHEAD;
- } while (!atomic_cmpset_rel_int(&fp->f_flag, old, new));
-readahead_vnlock_fail:
- VFS_UNLOCK_GIANT(vfslocked);
- vfslocked = 0;
+ atomic_set_int(&fp->f_flag, FRDAHEAD);
} else {
- do {
- new = old = fp->f_flag;
- new &= ~FRDAHEAD;
- } while (!atomic_cmpset_rel_int(&fp->f_flag, old, new));
+ atomic_clear_int(&fp->f_flag, FRDAHEAD);
}
+ VOP_UNLOCK(vp, 0);
fdrop(fp, td);
break;
@@ -801,22 +800,34 @@
error = EINVAL;
break;
}
- VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
+static int
+getmaxfd(struct proc *p)
+{
+ int maxfd;
+
+ PROC_LOCK(p);
+ maxfd = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
+ PROC_UNLOCK(p);
+
+ return (maxfd);
+}
+
/*
* Common code for dup, dup2, fcntl(F_DUPFD) and fcntl(F_DUP2FD).
*/
-static int
+int
do_dup(struct thread *td, int flags, int old, int new,
register_t *retval)
{
struct filedesc *fdp;
+ struct filedescent *oldfde, *newfde;
struct proc *p;
struct file *fp;
struct file *delfp;
- int error, holdleaders, maxfd;
+ int error, maxfd;
p = td->td_proc;
fdp = p->p_fd;
@@ -830,33 +841,30 @@
return (EBADF);
if (new < 0)
return (flags & DUP_FCNTL ? EINVAL : EBADF);
- PROC_LOCK(p);
- maxfd = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
- PROC_UNLOCK(p);
+ maxfd = getmaxfd(p);
if (new >= maxfd)
return (flags & DUP_FCNTL ? EINVAL : EBADF);
FILEDESC_XLOCK(fdp);
- if (old >= fdp->fd_nfiles || fdp->fd_ofiles[old] == NULL) {
+ if (fget_locked(fdp, old) == NULL) {
FILEDESC_XUNLOCK(fdp);
return (EBADF);
}
+ oldfde = &fdp->fd_ofiles[old];
if (flags & DUP_FIXED && old == new) {
*retval = new;
if (flags & DUP_CLOEXEC)
- fdp->fd_ofileflags[new] |= UF_EXCLOSE;
+ fdp->fd_ofiles[new].fde_flags |= UF_EXCLOSE;
FILEDESC_XUNLOCK(fdp);
return (0);
}
- fp = fdp->fd_ofiles[old];
+ fp = oldfde->fde_file;
fhold(fp);
/*
* If the caller specified a file descriptor, make sure the file
* table is large enough to hold it, and grab it. Otherwise, just
- * allocate a new descriptor the usual way. Since the filedesc
- * lock may be temporarily dropped in the process, we have to look
- * out for a race.
+ * allocate a new descriptor the usual way.
*/
if (flags & DUP_FIXED) {
if (new >= fdp->fd_nfiles) {
@@ -869,18 +877,22 @@
* the limit on the size of the file descriptor table.
*/
#ifdef RACCT
- PROC_LOCK(p);
- error = racct_set(p, RACCT_NOFILE, new + 1);
- PROC_UNLOCK(p);
- if (error != 0) {
- FILEDESC_XUNLOCK(fdp);
- fdrop(fp, td);
- return (EMFILE);
+ if (racct_enable) {
+ PROC_LOCK(p);
+ error = racct_set(p, RACCT_NOFILE, new + 1);
+ PROC_UNLOCK(p);
+ if (error != 0) {
+ FILEDESC_XUNLOCK(fdp);
+ fdrop(fp, td);
+ return (EMFILE);
+ }
}
#endif
- fdgrowtable(fdp, new + 1);
+ fdgrowtable_exp(fdp, new + 1);
+ oldfde = &fdp->fd_ofiles[old];
}
- if (fdp->fd_ofiles[new] == NULL)
+ newfde = &fdp->fd_ofiles[new];
+ if (newfde->fde_file == NULL)
fdused(fdp, new);
} else {
if ((error = fdalloc(td, new, &new)) != 0) {
@@ -888,82 +900,39 @@
fdrop(fp, td);
return (error);
}
+ newfde = &fdp->fd_ofiles[new];
}
- /*
- * If the old file changed out from under us then treat it as a
- * bad file descriptor. Userland should do its own locking to
- * avoid this case.
- */
- if (fdp->fd_ofiles[old] != fp) {
- /* we've allocated a descriptor which we won't use */
- if (fdp->fd_ofiles[new] == NULL)
- fdunused(fdp, new);
- FILEDESC_XUNLOCK(fdp);
- fdrop(fp, td);
- return (EBADF);
- }
- KASSERT(old != new,
- ("new fd is same as old"));
+ KASSERT(fp == oldfde->fde_file, ("old fd has been modified"));
+ KASSERT(old != new, ("new fd is same as old"));
- /*
- * Save info on the descriptor being overwritten. We cannot close
- * it without introducing an ownership race for the slot, since we
- * need to drop the filedesc lock to call closef().
- *
- * XXX this duplicates parts of close().
- */
- delfp = fdp->fd_ofiles[new];
- holdleaders = 0;
- if (delfp != NULL) {
- if (td->td_proc->p_fdtol != NULL) {
- /*
- * Ask fdfree() to sleep to ensure that all relevant
- * process leaders can be traversed in closef().
- */
- fdp->fd_holdleaderscount++;
- holdleaders = 1;
- }
- }
+ delfp = newfde->fde_file;
/*
- * Duplicate the source descriptor
+ * Duplicate the source descriptor.
*/
- fdp->fd_ofiles[new] = fp;
+#ifdef CAPABILITIES
+ seq_write_begin(&newfde->fde_seq);
+#endif
+ filecaps_free(&newfde->fde_caps);
+ memcpy(newfde, oldfde, fde_change_size);
+ filecaps_copy(&oldfde->fde_caps, &newfde->fde_caps);
if ((flags & DUP_CLOEXEC) != 0)
- fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] | UF_EXCLOSE;
+ newfde->fde_flags = oldfde->fde_flags | UF_EXCLOSE;
else
- fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] & ~UF_EXCLOSE;
- if (new > fdp->fd_lastfile)
- fdp->fd_lastfile = new;
+ newfde->fde_flags = oldfde->fde_flags & ~UF_EXCLOSE;
+#ifdef CAPABILITIES
+ seq_write_end(&newfde->fde_seq);
+#endif
*retval = new;
- /*
- * If we dup'd over a valid file, we now own the reference to it
- * and must dispose of it using closef() semantics (as if a
- * close() were performed on it).
- *
- * XXX this duplicates parts of close().
- */
if (delfp != NULL) {
- knote_fdclose(td, new);
- if (delfp->f_type == DTYPE_MQUEUE)
- mq_fdclose(td, new, delfp);
- FILEDESC_XUNLOCK(fdp);
- (void) closef(delfp, td);
- if (holdleaders) {
- FILEDESC_XLOCK(fdp);
- fdp->fd_holdleaderscount--;
- if (fdp->fd_holdleaderscount == 0 &&
- fdp->fd_holdleaderswakeup != 0) {
- fdp->fd_holdleaderswakeup = 0;
- wakeup(&fdp->fd_holdleaderscount);
- }
- FILEDESC_XUNLOCK(fdp);
- }
+ (void) closefp(fdp, new, delfp, td, 1);
+ /* closefp() drops the FILEDESC lock for us. */
} else {
FILEDESC_XUNLOCK(fdp);
}
+
return (0);
}
@@ -988,13 +957,13 @@
struct pgrp *pg = (sigio)->sio_pgrp;
PGRP_LOCK(pg);
SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio,
- sigio, sio_pgsigio);
+ sigio, sio_pgsigio);
PGRP_UNLOCK(pg);
} else {
struct proc *p = (sigio)->sio_proc;
PROC_LOCK(p);
SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio,
- sigio, sio_pgsigio);
+ sigio, sio_pgsigio);
PROC_UNLOCK(p);
}
SIGIO_UNLOCK();
@@ -1187,56 +1156,27 @@
}
/*
- * Close a file descriptor.
+ * Function drops the filedesc lock on return.
*/
-#ifndef _SYS_SYSPROTO_H_
-struct close_args {
- int fd;
-};
-#endif
-/* ARGSUSED */
-int
-sys_close(td, uap)
- struct thread *td;
- struct close_args *uap;
+static int
+closefp(struct filedesc *fdp, int fd, struct file *fp, struct thread *td,
+ int holdleaders)
{
-
- return (kern_close(td, uap->fd));
-}
-
-int
-kern_close(td, fd)
- struct thread *td;
- int fd;
-{
- struct filedesc *fdp;
- struct file *fp, *fp_object;
int error;
- int holdleaders;
- error = 0;
- holdleaders = 0;
- fdp = td->td_proc->p_fd;
+ FILEDESC_XLOCK_ASSERT(fdp);
- AUDIT_SYSCLOSE(td, fd);
-
- FILEDESC_XLOCK(fdp);
- if ((unsigned)fd >= fdp->fd_nfiles ||
- (fp = fdp->fd_ofiles[fd]) == NULL) {
- FILEDESC_XUNLOCK(fdp);
- return (EBADF);
+ if (holdleaders) {
+ if (td->td_proc->p_fdtol != NULL) {
+ /*
+ * Ask fdfree() to sleep to ensure that all relevant
+ * process leaders can be traversed in closef().
+ */
+ fdp->fd_holdleaderscount++;
+ } else {
+ holdleaders = 0;
+ }
}
- fdp->fd_ofiles[fd] = NULL;
- fdp->fd_ofileflags[fd] = 0;
- fdunused(fdp, fd);
- if (td->td_proc->p_fdtol != NULL) {
- /*
- * Ask fdfree() to sleep to ensure that all relevant
- * process leaders can be traversed in closef().
- */
- fdp->fd_holdleaderscount++;
- holdleaders = 1;
- }
/*
* We now hold the fp reference that used to be owned by the
@@ -1247,12 +1187,10 @@
knote_fdclose(td, fd);
/*
- * When we're closing an fd with a capability, we need to notify
- * mqueue if the underlying object is of type mqueue.
+ * We need to notify mqueue if the object is of type mqueue.
*/
- (void)cap_funwrap(fp, 0, &fp_object);
- if (fp_object->f_type == DTYPE_MQUEUE)
- mq_fdclose(td, fd, fp_object);
+ if (fp->f_type == DTYPE_MQUEUE)
+ mq_fdclose(td, fd, fp);
FILEDESC_XUNLOCK(fdp);
error = closef(fp, td);
@@ -1270,6 +1208,43 @@
}
/*
+ * Close a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct close_args {
+ int fd;
+};
+#endif
+/* ARGSUSED */
+int
+sys_close(struct thread *td, struct close_args *uap)
+{
+
+ return (kern_close(td, uap->fd));
+}
+
+int
+kern_close(struct thread *td, int fd)
+{
+ struct filedesc *fdp;
+ struct file *fp;
+
+ fdp = td->td_proc->p_fd;
+
+ AUDIT_SYSCLOSE(td, fd);
+
+ FILEDESC_XLOCK(fdp);
+ if ((fp = fget_locked(fdp, fd)) == NULL) {
+ FILEDESC_XUNLOCK(fdp);
+ return (EBADF);
+ }
+ fdfree(fdp, fd);
+
+ /* closefp() drops the FILEDESC lock for us. */
+ return (closefp(fdp, fd, fp, td, 1));
+}
+
+/*
* Close open file descriptors.
*/
#ifndef _SYS_SYSPROTO_H_
@@ -1294,8 +1269,8 @@
if (uap->lowfd < 0)
uap->lowfd = 0;
FILEDESC_SLOCK(fdp);
- for (fd = uap->lowfd; fd < fdp->fd_nfiles; fd++) {
- if (fdp->fd_ofiles[fd] != NULL) {
+ for (fd = uap->lowfd; fd <= fdp->fd_lastfile; fd++) {
+ if (fdp->fd_ofiles[fd].fde_file != NULL) {
FILEDESC_SUNLOCK(fdp);
(void)kern_close(td, fd);
FILEDESC_SLOCK(fdp);
@@ -1358,11 +1333,13 @@
kern_fstat(struct thread *td, int fd, struct stat *sbp)
{
struct file *fp;
+ cap_rights_t rights;
int error;
AUDIT_ARG_FD(fd);
- if ((error = fget(td, fd, CAP_FSTAT, &fp)) != 0)
+ error = fget(td, fd, cap_rights_init(&rights, CAP_FSTAT), &fp);
+ if (error != 0)
return (error);
AUDIT_ARG_FILE(td->td_proc, fp);
@@ -1416,9 +1393,11 @@
{
struct file *fp;
struct vnode *vp;
+ cap_rights_t rights;
int error;
- if ((error = fget(td, uap->fd, CAP_FPATHCONF, &fp)) != 0)
+ error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FPATHCONF), &fp);
+ if (error != 0)
return (error);
/* If asynchronous I/O is available, it works for all descriptors. */
@@ -1428,18 +1407,15 @@
}
vp = fp->f_vnode;
if (vp != NULL) {
- int vfslocked;
- vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vn_lock(vp, LK_SHARED | LK_RETRY);
error = VOP_PATHCONF(vp, uap->name, td->td_retval);
VOP_UNLOCK(vp, 0);
- VFS_UNLOCK_GIANT(vfslocked);
} else if (fp->f_type == DTYPE_PIPE || fp->f_type == DTYPE_SOCKET) {
if (uap->name != _PC_PIPE_BUF) {
error = EINVAL;
} else {
td->td_retval[0] = PIPE_BUF;
- error = 0;
+ error = 0;
}
} else {
error = EOPNOTSUPP;
@@ -1450,80 +1426,190 @@
}
/*
- * Grow the file table to accomodate (at least) nfd descriptors. This may
- * block and drop the filedesc lock, but it will reacquire it before
- * returning.
+ * Initialize filecaps structure.
*/
+void
+filecaps_init(struct filecaps *fcaps)
+{
+
+ bzero(fcaps, sizeof(*fcaps));
+ fcaps->fc_nioctls = -1;
+}
+
+/*
+ * Copy filecaps structure allocating memory for ioctls array if needed.
+ */
+void
+filecaps_copy(const struct filecaps *src, struct filecaps *dst)
+{
+ size_t size;
+
+ *dst = *src;
+ if (src->fc_ioctls != NULL) {
+ KASSERT(src->fc_nioctls > 0,
+ ("fc_ioctls != NULL, but fc_nioctls=%hd", src->fc_nioctls));
+
+ size = sizeof(src->fc_ioctls[0]) * src->fc_nioctls;
+ dst->fc_ioctls = malloc(size, M_FILECAPS, M_WAITOK);
+ bcopy(src->fc_ioctls, dst->fc_ioctls, size);
+ }
+}
+
+/*
+ * Move filecaps structure to the new place and clear the old place.
+ */
+void
+filecaps_move(struct filecaps *src, struct filecaps *dst)
+{
+
+ *dst = *src;
+ bzero(src, sizeof(*src));
+}
+
+/*
+ * Fill the given filecaps structure with full rights.
+ */
static void
+filecaps_fill(struct filecaps *fcaps)
+{
+
+ CAP_ALL(&fcaps->fc_rights);
+ fcaps->fc_ioctls = NULL;
+ fcaps->fc_nioctls = -1;
+ fcaps->fc_fcntls = CAP_FCNTL_ALL;
+}
+
+/*
+ * Free memory allocated within filecaps structure.
+ */
+void
+filecaps_free(struct filecaps *fcaps)
+{
+
+ free(fcaps->fc_ioctls, M_FILECAPS);
+ bzero(fcaps, sizeof(*fcaps));
+}
+
+/*
+ * Validate the given filecaps structure.
+ */
+static void
+filecaps_validate(const struct filecaps *fcaps, const char *func)
+{
+
+ KASSERT(cap_rights_is_valid(&fcaps->fc_rights),
+ ("%s: invalid rights", func));
+ KASSERT((fcaps->fc_fcntls & ~CAP_FCNTL_ALL) == 0,
+ ("%s: invalid fcntls", func));
+ KASSERT(fcaps->fc_fcntls == 0 ||
+ cap_rights_is_set(&fcaps->fc_rights, CAP_FCNTL),
+ ("%s: fcntls without CAP_FCNTL", func));
+ KASSERT(fcaps->fc_ioctls != NULL ? fcaps->fc_nioctls > 0 :
+ (fcaps->fc_nioctls == -1 || fcaps->fc_nioctls == 0),
+ ("%s: invalid ioctls", func));
+ KASSERT(fcaps->fc_nioctls == 0 ||
+ cap_rights_is_set(&fcaps->fc_rights, CAP_IOCTL),
+ ("%s: ioctls without CAP_IOCTL", func));
+}
+
+static void
+fdgrowtable_exp(struct filedesc *fdp, int nfd)
+{
+ int nfd1;
+
+ FILEDESC_XLOCK_ASSERT(fdp);
+
+ nfd1 = fdp->fd_nfiles * 2;
+ if (nfd1 < nfd)
+ nfd1 = nfd;
+ fdgrowtable(fdp, nfd1);
+}
+
+/*
+ * Grow the file table to accommodate (at least) nfd descriptors.
+ */
+static void
fdgrowtable(struct filedesc *fdp, int nfd)
{
struct filedesc0 *fdp0;
- struct freetable *fo;
- struct file **ntable;
- struct file **otable;
- char *nfileflags;
+ struct freetable *ft;
+ struct filedescent *ntable;
+ struct filedescent *otable;
int nnfiles, onfiles;
- NDSLOTTYPE *nmap;
+ NDSLOTTYPE *nmap, *omap;
FILEDESC_XLOCK_ASSERT(fdp);
- KASSERT(fdp->fd_nfiles > 0,
- ("zero-length file table"));
+ KASSERT(fdp->fd_nfiles > 0, ("zero-length file table"));
+ /* save old values */
+ onfiles = fdp->fd_nfiles;
+ otable = fdp->fd_ofiles;
+ omap = fdp->fd_map;
+
/* compute the size of the new table */
- onfiles = fdp->fd_nfiles;
nnfiles = NDSLOTS(nfd) * NDENTRIES; /* round up */
if (nnfiles <= onfiles)
/* the table is already large enough */
return;
- /* allocate a new table and (if required) new bitmaps */
- FILEDESC_XUNLOCK(fdp);
- ntable = malloc((nnfiles * OFILESIZE) + sizeof(struct freetable),
+ /*
+ * Allocate a new table. We need enough space for the
+ * file entries themselves and the struct freetable we will use
+ * when we decommission the table and place it on the freelist.
+ * We place the struct freetable in the middle so we don't have
+ * to worry about padding.
+ */
+ ntable = malloc(nnfiles * sizeof(ntable[0]) + sizeof(struct freetable),
M_FILEDESC, M_ZERO | M_WAITOK);
- nfileflags = (char *)&ntable[nnfiles];
- if (NDSLOTS(nnfiles) > NDSLOTS(onfiles))
- nmap = malloc(NDSLOTS(nnfiles) * NDSLOTSIZE,
- M_FILEDESC, M_ZERO | M_WAITOK);
- else
- nmap = NULL;
- FILEDESC_XLOCK(fdp);
+ /* copy the old data over and point at the new tables */
+ memcpy(ntable, otable, onfiles * sizeof(*otable));
+ fdp->fd_ofiles = ntable;
/*
- * We now have new tables ready to go. Since we dropped the
- * filedesc lock to call malloc(), watch out for a race.
+ * Allocate a new map only if the old is not large enough. It will
+ * grow at a slower rate than the table as it can map more
+ * entries than the table can hold.
*/
- onfiles = fdp->fd_nfiles;
- if (onfiles >= nnfiles) {
- /* we lost the race, but that's OK */
- free(ntable, M_FILEDESC);
- if (nmap != NULL)
- free(nmap, M_FILEDESC);
- return;
+ if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) {
+ nmap = malloc(NDSLOTS(nnfiles) * NDSLOTSIZE, M_FILEDESC,
+ M_ZERO | M_WAITOK);
+ /* copy over the old data and update the pointer */
+ memcpy(nmap, omap, NDSLOTS(onfiles) * sizeof(*omap));
+ fdp->fd_map = nmap;
}
- bcopy(fdp->fd_ofiles, ntable, onfiles * sizeof(*ntable));
- bcopy(fdp->fd_ofileflags, nfileflags, onfiles);
- otable = fdp->fd_ofiles;
- fdp->fd_ofileflags = nfileflags;
- fdp->fd_ofiles = ntable;
+
/*
- * We must preserve ofiles until the process exits because we can't
- * be certain that no threads have references to the old table via
- * _fget().
+ * In order to have a valid pattern for fget_unlocked()
+ * fdp->fd_nfiles must be the last member to be updated, otherwise
+ * fget_unlocked() consumers may reference a new, higher value for
+ * fdp->fd_nfiles before to access the fdp->fd_ofiles array,
+ * resulting in OOB accesses.
*/
+ atomic_store_rel_int(&fdp->fd_nfiles, nnfiles);
+
+ /*
+ * Do not free the old file table, as some threads may still
+ * reference entries within it. Instead, place it on a freelist
+ * which will be processed when the struct filedesc is released.
+ *
+ * Note that if onfiles == NDFILE, we're dealing with the original
+ * static allocation contained within (struct filedesc0 *)fdp,
+ * which must not be freed.
+ */
if (onfiles > NDFILE) {
- fo = (struct freetable *)&otable[onfiles];
+ ft = (struct freetable *)&otable[onfiles];
fdp0 = (struct filedesc0 *)fdp;
- fo->ft_table = otable;
- SLIST_INSERT_HEAD(&fdp0->fd_free, fo, ft_next);
+ ft->ft_table = otable;
+ SLIST_INSERT_HEAD(&fdp0->fd_free, ft, ft_next);
}
- if (NDSLOTS(nnfiles) > NDSLOTS(onfiles)) {
- bcopy(fdp->fd_map, nmap, NDSLOTS(onfiles) * sizeof(*nmap));
- if (NDSLOTS(onfiles) > NDSLOTS(NDFILE))
- free(fdp->fd_map, M_FILEDESC);
- fdp->fd_map = nmap;
- }
- fdp->fd_nfiles = nnfiles;
+ /*
+ * The map does not have the same possibility of threads still
+ * holding references to it. So always free it as long as it
+ * does not reference the original static allocation.
+ */
+ if (NDSLOTS(onfiles) > NDSLOTS(NDFILE))
+ free(omap, M_FILEDESC);
}
/*
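Two details of the rewritten grow path are worth spelling out. First, fdgrowtable_exp() doubles the table rather than growing it to exactly the requested size, so repeated dup2() calls to ever-higher descriptors cost O(log n) reallocations; e.g. with fd_nfiles == 20 (NDFILE), a request for 25 slots doubles to 40, which fdgrowtable() then rounds up to a whole bitmap word (64 entries with a 64-bit NDSLOTTYPE). Second, the atomic_store_rel_int() publishing fd_nfiles is what keeps the lock-free readers safe: the new fd_ofiles (and fd_map) pointers are installed before the enlarged count becomes visible, so a reader that sees the new count cannot index the old, smaller table. The reader side is expected to look roughly like this (a sketch, not the literal code):

	if (fd < 0 || fd >= fdp->fd_nfiles)	/* count read first ... */
		return (EBADF);
	fde = &fdp->fd_ofiles[fd];		/* ... then the (new) table */

which is also why the old table cannot be freed immediately and goes on the fd_free list instead.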
@@ -1534,7 +1620,7 @@
{
struct proc *p = td->td_proc;
struct filedesc *fdp = p->p_fd;
- int fd = -1, maxfd;
+ int fd = -1, maxfd, allocfd;
#ifdef RACCT
int error;
#endif
@@ -1544,30 +1630,31 @@
if (fdp->fd_freefile > minfd)
minfd = fdp->fd_freefile;
- PROC_LOCK(p);
- maxfd = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
- PROC_UNLOCK(p);
+ maxfd = getmaxfd(p);
/*
- * Search the bitmap for a free descriptor. If none is found, try
- * to grow the file table. Keep at it until we either get a file
- * descriptor or run into process or system limits; fdgrowtable()
- * may drop the filedesc lock, so we're in a race.
+ * Search the bitmap for a free descriptor starting at minfd.
+ * If none is found, grow the file table.
*/
- for (;;) {
- fd = fd_first_free(fdp, minfd, fdp->fd_nfiles);
- if (fd >= maxfd)
- return (EMFILE);
- if (fd < fdp->fd_nfiles)
- break;
+ fd = fd_first_free(fdp, minfd, fdp->fd_nfiles);
+ if (fd >= maxfd)
+ return (EMFILE);
+ if (fd >= fdp->fd_nfiles) {
+ allocfd = min(fd * 2, maxfd);
#ifdef RACCT
- PROC_LOCK(p);
- error = racct_set(p, RACCT_NOFILE, min(fdp->fd_nfiles * 2, maxfd));
- PROC_UNLOCK(p);
- if (error != 0)
- return (EMFILE);
+ if (racct_enable) {
+ PROC_LOCK(p);
+ error = racct_set(p, RACCT_NOFILE, allocfd);
+ PROC_UNLOCK(p);
+ if (error != 0)
+ return (EMFILE);
+ }
#endif
- fdgrowtable(fdp, min(fdp->fd_nfiles * 2, maxfd));
+ /*
+ * fd is already equal to first free descriptor >= minfd, so
+ * we only need to grow the table and we are done.
+ */
+ fdgrowtable_exp(fdp, allocfd);
}
/*
@@ -1574,11 +1661,13 @@
* Perform some sanity checks, then mark the file descriptor as
* used and return it to the caller.
*/
+ KASSERT(fd >= 0 && fd < min(maxfd, fdp->fd_nfiles),
+ ("invalid descriptor %d", fd));
KASSERT(!fdisused(fdp, fd),
("fd_first_free() returned non-free descriptor"));
- KASSERT(fdp->fd_ofiles[fd] == NULL,
- ("free descriptor isn't"));
- fdp->fd_ofileflags[fd] = 0; /* XXX needed? */
+ KASSERT(fdp->fd_ofiles[fd].fde_file == NULL,
+ ("file descriptor isn't free"));
+ KASSERT(fdp->fd_ofiles[fd].fde_flags == 0, ("file flags are set"));
fdused(fdp, fd);
*result = fd;
return (0);
@@ -1585,6 +1674,34 @@
}
/*
+ * Allocate n file descriptors for the process.
+ */
+int
+fdallocn(struct thread *td, int minfd, int *fds, int n)
+{
+ struct proc *p = td->td_proc;
+ struct filedesc *fdp = p->p_fd;
+ int i;
+
+ FILEDESC_XLOCK_ASSERT(fdp);
+
+ if (!fdavail(td, n))
+ return (EMFILE);
+
+ for (i = 0; i < n; i++)
+ if (fdalloc(td, 0, &fds[i]) != 0)
+ break;
+
+ if (i < n) {
+ for (i--; i >= 0; i--)
+ fdunused(fdp, fds[i]);
+ return (EMFILE);
+ }
+
+ return (0);
+}
+
+/*
* Check to see whether n user file descriptors are available to the process
* p.
*/
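fdallocn() gives callers that need several descriptors all-or-nothing semantics: either all n slots are reserved or EMFILE is returned with nothing allocated, so a multi-fd syscall (a pipe(2)-style pair, say) never leaks one end on failure. A hedged usage sketch:

	int fds[2], error;

	FILEDESC_XLOCK(fdp);
	error = fdallocn(td, 0, fds, 2);	/* both descriptors or EMFILE */
	FILEDESC_XUNLOCK(fdp);
	if (error != 0)
		return (error);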
@@ -1593,7 +1710,6 @@
{
struct proc *p = td->td_proc;
struct filedesc *fdp = td->td_proc->p_fd;
- struct file **fpp;
int i, lim, last;
FILEDESC_LOCK_ASSERT(fdp);
@@ -1603,15 +1719,12 @@
* call racct_add() from there instead of dealing with containers
* here.
*/
- PROC_LOCK(p);
- lim = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
- PROC_UNLOCK(p);
+ lim = getmaxfd(p);
if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0)
return (1);
last = min(fdp->fd_nfiles, lim);
- fpp = &fdp->fd_ofiles[fdp->fd_freefile];
- for (i = last - fdp->fd_freefile; --i >= 0; fpp++) {
- if (*fpp == NULL && --n <= 0)
+ for (i = fdp->fd_freefile; i < last; i++) {
+ if (fdp->fd_ofiles[i].fde_file == NULL && --n <= 0)
return (1);
}
return (0);
@@ -1618,7 +1731,7 @@
}
/*
- * Create a new open file structure and allocate a file decriptor for the
+ * Create a new open file structure and allocate a file descriptor for the
* process that refers to it. We add one reference to the file for the
* descriptor table and one reference for resultfp. This is to prevent us
* being preempted and the entry in the descriptor table closed after we
@@ -1634,7 +1747,7 @@
if (error)
return (error); /* no reference held on error */
- error = finstall(td, fp, &fd, flags);
+ error = finstall(td, fp, &fd, flags, NULL);
if (error) {
fdrop(fp, td); /* one reference (fp only) */
return (error);
@@ -1668,8 +1781,8 @@
priv_check(td, PRIV_MAXFILES) != 0) ||
openfiles >= maxfiles) {
if (ppsratecheck(&lastfail, &curfail, 1)) {
- printf("kern.maxfiles limit exceeded by uid %i, "
- "please see tuning(7).\n", td->td_ucred->cr_ruid);
+ printf("kern.maxfiles limit exceeded by uid %i, (%s) "
+ "please see tuning(7).\n", td->td_ucred->cr_ruid, td->td_proc->p_comm);
}
return (ENFILE);
}
@@ -1688,13 +1801,17 @@
* Install a file in a file descriptor table.
*/
int
-finstall(struct thread *td, struct file *fp, int *fd, int flags)
+finstall(struct thread *td, struct file *fp, int *fd, int flags,
+ struct filecaps *fcaps)
{
struct filedesc *fdp = td->td_proc->p_fd;
+ struct filedescent *fde;
int error;
KASSERT(fd != NULL, ("%s: fd == NULL", __func__));
KASSERT(fp != NULL, ("%s: fp == NULL", __func__));
+ if (fcaps != NULL)
+ filecaps_validate(fcaps, __func__);
FILEDESC_XLOCK(fdp);
if ((error = fdalloc(td, 0, fd))) {
@@ -1702,9 +1819,20 @@
return (error);
}
fhold(fp);
- fdp->fd_ofiles[*fd] = fp;
+ fde = &fdp->fd_ofiles[*fd];
+#ifdef CAPABILITIES
+ seq_write_begin(&fde->fde_seq);
+#endif
+ fde->fde_file = fp;
if ((flags & O_CLOEXEC) != 0)
- fdp->fd_ofileflags[*fd] |= UF_EXCLOSE;
+ fde->fde_flags |= UF_EXCLOSE;
+ if (fcaps != NULL)
+ filecaps_move(fcaps, &fde->fde_caps);
+ else
+ filecaps_fill(&fde->fde_caps);
+#ifdef CAPABILITIES
+ seq_write_end(&fde->fde_seq);
+#endif
FILEDESC_XUNLOCK(fdp);
return (0);
}
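finstall() now threads an optional struct filecaps through to the descriptor entry; passing NULL grants full rights via filecaps_fill(), which keeps every pre-capsicum caller behaving as before. A hedged caller sketch:

	struct file *fp;
	int fd, error;

	error = falloc_noinstall(td, &fp);	/* file object, no fd yet */
	if (error != 0)
		return (error);
	error = finstall(td, fp, &fd, flags, NULL);	/* NULL = full rights */

Note the seq_write_begin()/seq_write_end() bracketing here too, for the same lockless-reader reasons as in _fdfree() and do_dup().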
@@ -1721,7 +1849,7 @@
newfdp = malloc(sizeof *newfdp, M_FILEDESC, M_WAITOK | M_ZERO);
FILEDESC_LOCK_INIT(&newfdp->fd_fd);
if (fdp != NULL) {
- FILEDESC_XLOCK(fdp);
+ FILEDESC_SLOCK(fdp);
newfdp->fd_fd.fd_cdir = fdp->fd_cdir;
if (newfdp->fd_fd.fd_cdir)
VREF(newfdp->fd_fd.fd_cdir);
@@ -1731,7 +1859,7 @@
newfdp->fd_fd.fd_jdir = fdp->fd_jdir;
if (newfdp->fd_fd.fd_jdir)
VREF(newfdp->fd_fd.fd_jdir);
- FILEDESC_XUNLOCK(fdp);
+ FILEDESC_SUNLOCK(fdp);
}
/* Create the file descriptor table. */
@@ -1739,7 +1867,6 @@
newfdp->fd_fd.fd_holdcnt = 1;
newfdp->fd_fd.fd_cmask = CMASK;
newfdp->fd_fd.fd_ofiles = newfdp->fd_dfiles;
- newfdp->fd_fd.fd_ofileflags = newfdp->fd_dfileflags;
newfdp->fd_fd.fd_nfiles = NDFILE;
newfdp->fd_fd.fd_map = newfdp->fd_dmap;
newfdp->fd_fd.fd_lastfile = -1;
@@ -1798,19 +1925,17 @@
* Unshare a filedesc structure, if necessary by making a copy
*/
void
-fdunshare(struct proc *p, struct thread *td)
+fdunshare(struct thread *td)
{
+ struct filedesc *tmp;
+ struct proc *p = td->td_proc;
- FILEDESC_XLOCK(p->p_fd);
- if (p->p_fd->fd_refcnt > 1) {
- struct filedesc *tmp;
+ if (p->p_fd->fd_refcnt == 1)
+ return;
- FILEDESC_XUNLOCK(p->p_fd);
- tmp = fdcopy(p->p_fd);
- fdfree(td);
- p->p_fd = tmp;
- } else
- FILEDESC_XUNLOCK(p->p_fd);
+ tmp = fdcopy(p->p_fd);
+ fdescfree(td);
+ p->p_fd = tmp;
}
/*
@@ -1821,6 +1946,7 @@
fdcopy(struct filedesc *fdp)
{
struct filedesc *newfdp;
+ struct filedescent *nfde, *ofde;
int i;
/* Certain daemons might not have file descriptors. */
@@ -1839,12 +1965,14 @@
/* copy all passable descriptors (i.e. not kqueue) */
newfdp->fd_freefile = -1;
for (i = 0; i <= fdp->fd_lastfile; ++i) {
+ ofde = &fdp->fd_ofiles[i];
if (fdisused(fdp, i) &&
- (fdp->fd_ofiles[i]->f_ops->fo_flags & DFLAG_PASSABLE) &&
- fdp->fd_ofiles[i]->f_ops != &badfileops) {
- newfdp->fd_ofiles[i] = fdp->fd_ofiles[i];
- newfdp->fd_ofileflags[i] = fdp->fd_ofileflags[i];
- fhold(newfdp->fd_ofiles[i]);
+ (ofde->fde_file->f_ops->fo_flags & DFLAG_PASSABLE) &&
+ ofde->fde_file->f_ops != &badfileops) {
+ nfde = &newfdp->fd_ofiles[i];
+ *nfde = *ofde;
+ filecaps_copy(&ofde->fde_caps, &nfde->fde_caps);
+ fhold(nfde->fde_file);
newfdp->fd_lastfile = i;
} else {
if (newfdp->fd_freefile == -1)
@@ -1854,9 +1982,10 @@
newfdp->fd_cmask = fdp->fd_cmask;
FILEDESC_SUNLOCK(fdp);
FILEDESC_XLOCK(newfdp);
- for (i = 0; i <= newfdp->fd_lastfile; ++i)
- if (newfdp->fd_ofiles[i] != NULL)
+ for (i = 0; i <= newfdp->fd_lastfile; ++i) {
+ if (newfdp->fd_ofiles[i].fde_file != NULL)
fdused(newfdp, i);
+ }
if (newfdp->fd_freefile == -1)
newfdp->fd_freefile = i;
FILEDESC_XUNLOCK(newfdp);
@@ -1867,11 +1996,10 @@
* Release a filedesc structure.
*/
void
-fdfree(struct thread *td)
+fdescfree(struct thread *td)
{
struct filedesc *fdp;
- struct file **fpp;
- int i, locked;
+ int i;
struct filedesc_to_leader *fdtol;
struct file *fp;
struct vnode *cdir, *jdir, *rdir, *vp;
@@ -1883,9 +2011,11 @@
return;
#ifdef RACCT
- PROC_LOCK(td->td_proc);
- racct_set(td->td_proc, RACCT_NOFILE, 0);
- PROC_UNLOCK(td->td_proc);
+ if (racct_enable) {
+ PROC_LOCK(td->td_proc);
+ racct_set(td->td_proc, RACCT_NOFILE, 0);
+ PROC_UNLOCK(td->td_proc);
+ }
#endif
/* Check for special need to clear POSIX style locks */
@@ -1893,17 +2023,14 @@
if (fdtol != NULL) {
FILEDESC_XLOCK(fdp);
KASSERT(fdtol->fdl_refcount > 0,
- ("filedesc_to_refcount botch: fdl_refcount=%d",
- fdtol->fdl_refcount));
+ ("filedesc_to_refcount botch: fdl_refcount=%d",
+ fdtol->fdl_refcount));
if (fdtol->fdl_refcount == 1 &&
(td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
- for (i = 0, fpp = fdp->fd_ofiles;
- i <= fdp->fd_lastfile;
- i++, fpp++) {
- if (*fpp == NULL ||
- (*fpp)->f_type != DTYPE_VNODE)
+ for (i = 0; i <= fdp->fd_lastfile; i++) {
+ fp = fdp->fd_ofiles[i].fde_file;
+ if (fp == NULL || fp->f_type != DTYPE_VNODE)
continue;
- fp = *fpp;
fhold(fp);
FILEDESC_XUNLOCK(fdp);
lf.l_whence = SEEK_SET;
@@ -1911,17 +2038,11 @@
lf.l_len = 0;
lf.l_type = F_UNLCK;
vp = fp->f_vnode;
- locked = VFS_LOCK_GIANT(vp->v_mount);
(void) VOP_ADVLOCK(vp,
- (caddr_t)td->td_proc->
- p_leader,
- F_UNLCK,
- &lf,
- F_POSIX);
- VFS_UNLOCK_GIANT(locked);
+ (caddr_t)td->td_proc->p_leader, F_UNLCK,
+ &lf, F_POSIX);
FILEDESC_XLOCK(fdp);
fdrop(fp, td);
- fpp = fdp->fd_ofiles + i;
}
}
retry:
@@ -1960,36 +2081,18 @@
if (fdtol != NULL)
free(fdtol, M_FILEDESC_TO_LEADER);
}
- FILEDESC_XLOCK(fdp);
- i = --fdp->fd_refcnt;
- FILEDESC_XUNLOCK(fdp);
- if (i > 0)
- return;
- fpp = fdp->fd_ofiles;
- for (i = fdp->fd_lastfile; i-- >= 0; fpp++) {
- if (*fpp) {
- FILEDESC_XLOCK(fdp);
- fp = *fpp;
- *fpp = NULL;
- FILEDESC_XUNLOCK(fdp);
- (void) closef(fp, td);
- }
- }
- FILEDESC_XLOCK(fdp);
-
- /* XXX This should happen earlier. */
mtx_lock(&fdesc_mtx);
td->td_proc->p_fd = NULL;
mtx_unlock(&fdesc_mtx);
- if (fdp->fd_nfiles > NDFILE)
- free(fdp->fd_ofiles, M_FILEDESC);
- if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE))
- free(fdp->fd_map, M_FILEDESC);
+ FILEDESC_XLOCK(fdp);
+ i = --fdp->fd_refcnt;
+ if (i > 0) {
+ FILEDESC_XUNLOCK(fdp);
+ return;
+ }
- fdp->fd_nfiles = 0;
-
cdir = fdp->fd_cdir;
fdp->fd_cdir = NULL;
rdir = fdp->fd_rdir;
@@ -1998,21 +2101,25 @@
fdp->fd_jdir = NULL;
FILEDESC_XUNLOCK(fdp);
- if (cdir) {
- locked = VFS_LOCK_GIANT(cdir->v_mount);
+ for (i = 0; i <= fdp->fd_lastfile; i++) {
+ fp = fdp->fd_ofiles[i].fde_file;
+ if (fp != NULL) {
+ fdfree_last(fdp, i);
+ (void) closef(fp, td);
+ }
+ }
+
+ if (fdp->fd_nfiles > NDFILE)
+ free(fdp->fd_ofiles, M_FILEDESC);
+ if (NDSLOTS(fdp->fd_nfiles) > NDSLOTS(NDFILE))
+ free(fdp->fd_map, M_FILEDESC);
+
+ if (cdir != NULL)
vrele(cdir);
- VFS_UNLOCK_GIANT(locked);
- }
- if (rdir) {
- locked = VFS_LOCK_GIANT(rdir->v_mount);
+ if (rdir != NULL)
vrele(rdir);
- VFS_UNLOCK_GIANT(locked);
- }
- if (jdir) {
- locked = VFS_LOCK_GIANT(jdir->v_mount);
+ if (jdir != NULL)
vrele(jdir);
- VFS_UNLOCK_GIANT(locked);
- }
fddrop(fdp);
}
@@ -2045,33 +2152,23 @@
setugidsafety(struct thread *td)
{
struct filedesc *fdp;
+ struct file *fp;
int i;
- /* Certain daemons might not have file descriptors. */
fdp = td->td_proc->p_fd;
- if (fdp == NULL)
- return;
-
- /*
- * Note: fdp->fd_ofiles may be reallocated out from under us while
- * we are blocked in a close. Be careful!
- */
+ KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
FILEDESC_XLOCK(fdp);
for (i = 0; i <= fdp->fd_lastfile; i++) {
if (i > 2)
break;
- if (fdp->fd_ofiles[i] && is_unsafe(fdp->fd_ofiles[i])) {
- struct file *fp;
-
+ fp = fdp->fd_ofiles[i].fde_file;
+ if (fp != NULL && is_unsafe(fp)) {
knote_fdclose(td, i);
/*
* NULL-out descriptor prior to close to avoid
* a race while close blocks.
*/
- fp = fdp->fd_ofiles[i];
- fdp->fd_ofiles[i] = NULL;
- fdp->fd_ofileflags[i] = 0;
- fdunused(fdp, i);
+ fdfree(fdp, i);
FILEDESC_XUNLOCK(fdp);
(void) closef(fp, td);
FILEDESC_XLOCK(fdp);
@@ -2088,13 +2185,13 @@
* file descriptor out from under the thread creating the file object.
*/
void
-fdclose(struct filedesc *fdp, struct file *fp, int idx, struct thread *td)
+fdclose(struct thread *td, struct file *fp, int idx)
{
+ struct filedesc *fdp = td->td_proc->p_fd;
FILEDESC_XLOCK(fdp);
- if (fdp->fd_ofiles[idx] == fp) {
- fdp->fd_ofiles[idx] = NULL;
- fdunused(fdp, idx);
+ if (fdp->fd_ofiles[idx].fde_file == fp) {
+ fdfree(fdp, idx);
FILEDESC_XUNLOCK(fdp);
fdrop(fp, td);
} else
@@ -2108,38 +2205,21 @@
fdcloseexec(struct thread *td)
{
struct filedesc *fdp;
+ struct filedescent *fde;
+ struct file *fp;
int i;
- /* Certain daemons might not have file descriptors. */
fdp = td->td_proc->p_fd;
- if (fdp == NULL)
- return;
-
+ KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
FILEDESC_XLOCK(fdp);
-
- /*
- * We cannot cache fd_ofiles or fd_ofileflags since operations
- * may block and rip them out from under us.
- */
for (i = 0; i <= fdp->fd_lastfile; i++) {
- if (fdp->fd_ofiles[i] != NULL &&
- (fdp->fd_ofiles[i]->f_type == DTYPE_MQUEUE ||
- (fdp->fd_ofileflags[i] & UF_EXCLOSE))) {
- struct file *fp;
-
- knote_fdclose(td, i);
- /*
- * NULL-out descriptor prior to close to avoid
- * a race while close blocks.
- */
- fp = fdp->fd_ofiles[i];
- fdp->fd_ofiles[i] = NULL;
- fdp->fd_ofileflags[i] = 0;
- fdunused(fdp, i);
- if (fp->f_type == DTYPE_MQUEUE)
- mq_fdclose(td, i, fp);
- FILEDESC_XUNLOCK(fdp);
- (void) closef(fp, td);
+ fde = &fdp->fd_ofiles[i];
+ fp = fde->fde_file;
+ if (fp != NULL && (fp->f_type == DTYPE_MQUEUE ||
+ (fde->fde_flags & UF_EXCLOSE))) {
+ fdfree(fdp, i);
+ (void) closefp(fdp, i, fp, td, 0);
+ /* closefp() drops the FILEDESC lock. */
FILEDESC_XLOCK(fdp);
}
}
@@ -2161,13 +2241,11 @@
int i, error, devnull;
fdp = td->td_proc->p_fd;
- if (fdp == NULL)
- return (0);
KASSERT(fdp->fd_refcnt == 1, ("the fdtable should not be shared"));
devnull = -1;
error = 0;
for (i = 0; i < 3; i++) {
- if (fdp->fd_ofiles[i] != NULL)
+ if (fdp->fd_ofiles[i].fde_file != NULL)
continue;
if (devnull < 0) {
save = td->td_retval[0];
@@ -2202,7 +2280,6 @@
struct flock lf;
struct filedesc_to_leader *fdtol;
struct filedesc *fdp;
- struct file *fp_object;
/*
* POSIX record locking dictates that any close releases ALL
@@ -2215,16 +2292,9 @@
* NULL thread pointer when there really is no owning
* context that might have locks, or the locks will be
* leaked.
- *
- * If this is a capability, we do lock processing under the underlying
- * node, not the capability itself.
*/
- (void)cap_funwrap(fp, 0, &fp_object);
- if ((fp_object->f_type == DTYPE_VNODE) && (td != NULL)) {
- int vfslocked;
-
- vp = fp_object->f_vnode;
- vfslocked = VFS_LOCK_GIANT(vp->v_mount);
+ if (fp->f_type == DTYPE_VNODE && td != NULL) {
+ vp = fp->f_vnode;
if ((td->td_proc->p_leader->p_flag & P_ADVLOCK) != 0) {
lf.l_whence = SEEK_SET;
lf.l_start = 0;
@@ -2231,7 +2301,7 @@
lf.l_len = 0;
lf.l_type = F_UNLCK;
(void) VOP_ADVLOCK(vp, (caddr_t)td->td_proc->p_leader,
- F_UNLCK, &lf, F_POSIX);
+ F_UNLCK, &lf, F_POSIX);
}
fdtol = td->td_proc->p_fdtol;
if (fdtol != NULL) {
@@ -2242,10 +2312,10 @@
fdp = td->td_proc->p_fd;
FILEDESC_XLOCK(fdp);
for (fdtol = fdtol->fdl_next;
- fdtol != td->td_proc->p_fdtol;
- fdtol = fdtol->fdl_next) {
+ fdtol != td->td_proc->p_fdtol;
+ fdtol = fdtol->fdl_next) {
if ((fdtol->fdl_leader->p_flag &
- P_ADVLOCK) == 0)
+ P_ADVLOCK) == 0)
continue;
fdtol->fdl_holdcount++;
FILEDESC_XUNLOCK(fdp);
@@ -2253,10 +2323,10 @@
lf.l_start = 0;
lf.l_len = 0;
lf.l_type = F_UNLCK;
- vp = fp_object->f_vnode;
+ vp = fp->f_vnode;
(void) VOP_ADVLOCK(vp,
- (caddr_t)fdtol->fdl_leader,
- F_UNLCK, &lf, F_POSIX);
+ (caddr_t)fdtol->fdl_leader, F_UNLCK, &lf,
+ F_POSIX);
FILEDESC_XLOCK(fdp);
fdtol->fdl_holdcount--;
if (fdtol->fdl_holdcount == 0 &&
@@ -2267,7 +2337,6 @@
}
FILEDESC_XUNLOCK(fdp);
}
- VFS_UNLOCK_GIANT(vfslocked);
}
return (fdrop(fp, td));
}
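
The rule closef() implements here is observable from userspace: POSIX (fcntl) record locks belong to the process, so closing any descriptor referencing the file drops them, even a descriptor other than the one used to take the lock. A small runnable illustration (the path is arbitrary):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
    struct flock lf = { .l_type = F_WRLCK, .l_whence = SEEK_SET };
    int fd1, fd2;

    fd1 = open("/tmp/lockdemo", O_RDWR | O_CREAT, 0600);
    fd2 = open("/tmp/lockdemo", O_RDWR);
    if (fd1 == -1 || fd2 == -1)
        return (1);
    if (fcntl(fd1, F_SETLK, &lf) == -1)    /* lock taken through fd1 */
        return (1);
    close(fd2);    /* releases the lock even though fd1 is still open */
    printf("lock taken via fd1 was dropped by close(fd2)\n");
    return (0);
}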
@@ -2288,15 +2357,28 @@
atomic_store_rel_ptr((volatile uintptr_t *)&fp->f_ops, (uintptr_t)ops);
}
-struct file *
-fget_unlocked(struct filedesc *fdp, int fd)
+int
+fget_unlocked(struct filedesc *fdp, int fd, cap_rights_t *needrightsp,
+ int needfcntl, struct file **fpp, cap_rights_t *haverightsp)
{
+#ifdef CAPABILITIES
+ struct filedescent fde;
+#endif
struct file *fp;
u_int count;
+#ifdef CAPABILITIES
+ seq_t seq;
+ cap_rights_t haverights;
+ int error;
+#endif
- if (fd < 0 || fd >= fdp->fd_nfiles)
- return (NULL);
/*
+ * Avoid read reordering that could touch the fdp->fd_ofiles
+ * table before the bounds check below, which could result in an
+ * out-of-bounds (OOB) access.
+ */
+ if (fd < 0 || fd >= atomic_load_acq_int(&fdp->fd_nfiles))
+ return (EBADF);
+ /*
* Fetch the descriptor locklessly. We avoid fdrop() races by
* never raising a refcount above 0. To accomplish this we have
* to use a cmpset loop rather than an atomic_add. The descriptor
@@ -2305,9 +2387,32 @@
* due to preemption.
*/
for (;;) {
- fp = fdp->fd_ofiles[fd];
+#ifdef CAPABILITIES
+ seq = seq_read(fd_seq(fdp, fd));
+ fde = fdp->fd_ofiles[fd];
+ if (!seq_consistent(fd_seq(fdp, fd), seq)) {
+ cpu_spinwait();
+ continue;
+ }
+ fp = fde.fde_file;
+#else
+ fp = fdp->fd_ofiles[fd].fde_file;
+#endif
if (fp == NULL)
- break;
+ return (EBADF);
+#ifdef CAPABILITIES
+ haverights = *cap_rights_fde(&fde);
+ if (needrightsp != NULL) {
+ error = cap_check(&haverights, needrightsp);
+ if (error != 0)
+ return (error);
+ if (cap_rights_is_set(needrightsp, CAP_FCNTL)) {
+ error = cap_fcntl_check_fde(&fde, needfcntl);
+ if (error != 0)
+ return (error);
+ }
+ }
+#endif
count = fp->f_count;
if (count == 0)
continue;
@@ -2317,12 +2422,23 @@
*/
if (atomic_cmpset_acq_int(&fp->f_count, count, count + 1) != 1)
continue;
- if (fp == fdp->fd_ofiles[fd])
+#ifdef CAPABILITIES
+ if (seq_consistent_nomb(fd_seq(fdp, fd), seq))
+#else
+ if (fp == fdp->fd_ofiles[fd].fde_file)
+#endif
break;
fdrop(fp, curthread);
}
-
- return (fp);
+ *fpp = fp;
+ if (haverightsp != NULL) {
+#ifdef CAPABILITIES
+ *haverightsp = haverights;
+#else
+ CAP_ALL(haverightsp);
+#endif
+ }
+ return (0);
}
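
The loop above takes a reference without holding the filedesc lock by refusing to resurrect a file whose count has already dropped to zero. A standalone C11 model of the same acquire-or-retry discipline (names invented for illustration, not kernel API):

#include <stdatomic.h>
#include <stdbool.h>

struct obj {
    atomic_uint refs;    /* 0 means the object is being destroyed */
};

/* Take a reference only if at least one is already held. */
static bool
obj_try_hold(struct obj *o)
{
    unsigned int count;

    for (;;) {
        count = atomic_load(&o->refs);
        if (count == 0)
            return (false);    /* never raise a refcount above 0 */
        if (atomic_compare_exchange_weak(&o->refs, &count, count + 1))
            return (true);
        /* a racing thread changed the count; retry */
    }
}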
/*
@@ -2332,33 +2448,33 @@
* If the descriptor doesn't exist or doesn't match 'flags', EBADF is
* returned.
*
- * If the FGET_GETCAP flag is set, the capability itself will be returned.
- * Calling _fget() with FGET_GETCAP on a non-capability will return EINVAL.
- * Otherwise, if the file is a capability, its rights will be checked against
- * the capability rights mask, and if successful, the object will be unwrapped.
+ * File's rights will be checked against the capability rights mask.
*
- * If an error occured the non-zero error is returned and *fpp is set to
+ * If an error occurred, the non-zero error is returned and *fpp is set to
* NULL. Otherwise *fpp is held and set and zero is returned. Caller is
* responsible for fdrop().
*/
-#define FGET_GETCAP 0x00000001
static __inline int
_fget(struct thread *td, int fd, struct file **fpp, int flags,
- cap_rights_t needrights, cap_rights_t *haverightsp, u_char *maxprotp,
- int fget_flags)
+ cap_rights_t *needrightsp, u_char *maxprotp)
{
struct filedesc *fdp;
struct file *fp;
-#ifdef CAPABILITIES
- struct file *fp_fromcap;
-#endif
+ cap_rights_t haverights, needrights;
int error;
*fpp = NULL;
if (td == NULL || (fdp = td->td_proc->p_fd) == NULL)
return (EBADF);
- if ((fp = fget_unlocked(fdp, fd)) == NULL)
- return (EBADF);
+ if (needrightsp != NULL)
+ needrights = *needrightsp;
+ else
+ cap_rights_init(&needrights);
+ if (maxprotp != NULL)
+ cap_rights_set(&needrights, CAP_MMAP);
+ error = fget_unlocked(fdp, fd, &needrights, 0, &fp, &haverights);
+ if (error != 0)
+ return (error);
if (fp->f_ops == &badfileops) {
fdrop(fp, td);
return (EBADF);
@@ -2366,50 +2482,11 @@
#ifdef CAPABILITIES
/*
- * If this is a capability, what rights does it have?
+ * If requested, convert capability rights to access flags.
*/
- if (haverightsp != NULL) {
- if (fp->f_type == DTYPE_CAPABILITY)
- *haverightsp = cap_rights(fp);
- else
- *haverightsp = CAP_MASK_VALID;
- }
-
- /*
- * If a capability has been requested, return the capability directly.
- * Otherwise, check capability rights, extract the underlying object,
- * and check its access flags.
- */
- if (fget_flags & FGET_GETCAP) {
- if (fp->f_type != DTYPE_CAPABILITY) {
- fdrop(fp, td);
- return (EINVAL);
- }
- } else {
- if (maxprotp == NULL)
- error = cap_funwrap(fp, needrights, &fp_fromcap);
- else
- error = cap_funwrap_mmap(fp, needrights, maxprotp,
- &fp_fromcap);
- if (error != 0) {
- fdrop(fp, td);
- return (error);
- }
-
- /*
- * If we've unwrapped a file, drop the original capability
- * and hold the new descriptor. fp after this point refers to
- * the actual (unwrapped) object, not the capability.
- */
- if (fp != fp_fromcap) {
- fhold(fp_fromcap);
- fdrop(fp, td);
- fp = fp_fromcap;
- }
- }
+ if (maxprotp != NULL)
+ *maxprotp = cap_rights_to_vmprot(&haverights);
#else /* !CAPABILITIES */
- KASSERT(fp->f_type != DTYPE_CAPABILITY,
- ("%s: saw capability", __func__));
if (maxprotp != NULL)
*maxprotp = VM_PROT_ALL;
#endif /* CAPABILITIES */
@@ -2445,48 +2522,35 @@
}
int
-fget(struct thread *td, int fd, cap_rights_t rights, struct file **fpp)
+fget(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
{
- return(_fget(td, fd, fpp, 0, rights, NULL, NULL, 0));
+ return(_fget(td, fd, fpp, 0, rightsp, NULL));
}
int
-fget_mmap(struct thread *td, int fd, cap_rights_t rights, u_char *maxprotp,
+fget_mmap(struct thread *td, int fd, cap_rights_t *rightsp, u_char *maxprotp,
struct file **fpp)
{
- return (_fget(td, fd, fpp, 0, rights, NULL, maxprotp, 0));
+ return (_fget(td, fd, fpp, 0, rightsp, maxprotp));
}
int
-fget_read(struct thread *td, int fd, cap_rights_t rights, struct file **fpp)
+fget_read(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
{
- return(_fget(td, fd, fpp, FREAD, rights, NULL, NULL, 0));
+ return(_fget(td, fd, fpp, FREAD, rightsp, NULL));
}
int
-fget_write(struct thread *td, int fd, cap_rights_t rights, struct file **fpp)
+fget_write(struct thread *td, int fd, cap_rights_t *rightsp, struct file **fpp)
{
- return (_fget(td, fd, fpp, FWRITE, rights, NULL, NULL, 0));
+ return (_fget(td, fd, fpp, FWRITE, rightsp, NULL));
}
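
For callers the conversion is mechanical: the rights now live on the caller's stack and are passed by pointer, which allows cap_rights_t to grow beyond a single machine word. A hedged fragment of the new calling convention (CAP_READ chosen arbitrarily for illustration):

    cap_rights_t rights;
    struct file *fp;
    int error;

    error = fget(td, fd, cap_rights_init(&rights, CAP_READ), &fp);
    if (error != 0)
        return (error);
    /* ... use fp ... */
    fdrop(fp, td);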
/*
- * Unlike the other fget() calls, which accept and check capability rights
- * but never return capabilities, fgetcap() returns the capability but doesn't
- * check capability rights.
- */
-int
-fgetcap(struct thread *td, int fd, struct file **fpp)
-{
-
- return (_fget(td, fd, fpp, 0, 0, NULL, NULL, FGET_GETCAP));
-}
-
-
-/*
* Like fget() but loads the underlying vnode, or returns an error if the
* descriptor does not represent a vnode. Note that pipes use vnodes but
* never have VM objects. The returned vnode will be vref()'d.
@@ -2494,15 +2558,15 @@
* XXX: what about the unused flags ?
*/
static __inline int
-_fgetvp(struct thread *td, int fd, int flags, cap_rights_t needrights,
- cap_rights_t *haverightsp, struct vnode **vpp)
+_fgetvp(struct thread *td, int fd, int flags, cap_rights_t *needrightsp,
+ struct vnode **vpp)
{
struct file *fp;
int error;
*vpp = NULL;
- if ((error = _fget(td, fd, &fp, flags, needrights, haverightsp,
- NULL, 0)) != 0)
+ error = _fget(td, fd, &fp, flags, needrightsp, NULL);
+ if (error != 0)
return (error);
if (fp->f_vnode == NULL) {
error = EINVAL;
@@ -2516,40 +2580,68 @@
}
int
-fgetvp(struct thread *td, int fd, cap_rights_t rights, struct vnode **vpp)
+fgetvp(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
{
- return (_fgetvp(td, fd, 0, rights, NULL, vpp));
+ return (_fgetvp(td, fd, 0, rightsp, vpp));
}
int
-fgetvp_rights(struct thread *td, int fd, cap_rights_t need, cap_rights_t *have,
- struct vnode **vpp)
+fgetvp_rights(struct thread *td, int fd, cap_rights_t *needrightsp,
+ struct filecaps *havecaps, struct vnode **vpp)
{
- return (_fgetvp(td, fd, 0, need, have, vpp));
+ struct filedesc *fdp;
+ struct file *fp;
+#ifdef CAPABILITIES
+ int error;
+#endif
+
+ if (td == NULL || (fdp = td->td_proc->p_fd) == NULL)
+ return (EBADF);
+
+ fp = fget_locked(fdp, fd);
+ if (fp == NULL || fp->f_ops == &badfileops)
+ return (EBADF);
+
+#ifdef CAPABILITIES
+ if (needrightsp != NULL) {
+ error = cap_check(cap_rights(fdp, fd), needrightsp);
+ if (error != 0)
+ return (error);
+ }
+#endif
+
+ if (fp->f_vnode == NULL)
+ return (EINVAL);
+
+ *vpp = fp->f_vnode;
+ vref(*vpp);
+ filecaps_copy(&fdp->fd_ofiles[fd].fde_caps, havecaps);
+
+ return (0);
}
int
-fgetvp_read(struct thread *td, int fd, cap_rights_t rights, struct vnode **vpp)
+fgetvp_read(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
{
- return (_fgetvp(td, fd, FREAD, rights, NULL, vpp));
+ return (_fgetvp(td, fd, FREAD, rightsp, vpp));
}
int
-fgetvp_exec(struct thread *td, int fd, cap_rights_t rights, struct vnode **vpp)
+fgetvp_exec(struct thread *td, int fd, cap_rights_t *rightsp, struct vnode **vpp)
{
- return (_fgetvp(td, fd, FEXEC, rights, NULL, vpp));
+ return (_fgetvp(td, fd, FEXEC, rightsp, vpp));
}
#ifdef notyet
int
-fgetvp_write(struct thread *td, int fd, cap_rights_t rights,
+fgetvp_write(struct thread *td, int fd, cap_rights_t *rightsp,
struct vnode **vpp)
{
- return (_fgetvp(td, fd, FWRITE, rights, NULL, vpp));
+ return (_fgetvp(td, fd, FWRITE, rightsp, vpp));
}
#endif
@@ -2565,7 +2657,7 @@
* during use.
*/
int
-fgetsock(struct thread *td, int fd, cap_rights_t rights, struct socket **spp,
+fgetsock(struct thread *td, int fd, cap_rights_t *rightsp, struct socket **spp,
u_int *fflagp)
{
struct file *fp;
@@ -2574,7 +2666,7 @@
*spp = NULL;
if (fflagp != NULL)
*fflagp = 0;
- if ((error = _fget(td, fd, &fp, 0, rights, NULL, NULL, 0)) != 0)
+ if ((error = _fget(td, fd, &fp, 0, rightsp, NULL)) != 0)
return (error);
if (fp->f_type != DTYPE_SOCKET) {
error = ENOTSOCK;
@@ -2610,9 +2702,6 @@
/*
* Handle the last reference to a file being closed.
- *
- * No special capability handling here, as the capability's fo_close will run
- * instead of the object here, and perform any necessary drop on the object.
*/
int
_fdrop(struct file *fp, struct thread *td)
@@ -2651,10 +2740,11 @@
struct file *fp;
struct vnode *vp;
struct flock lf;
- int vfslocked;
+ cap_rights_t rights;
int error;
- if ((error = fget(td, uap->fd, CAP_FLOCK, &fp)) != 0)
+ error = fget(td, uap->fd, cap_rights_init(&rights, CAP_FLOCK), &fp);
+ if (error != 0)
return (error);
if (fp->f_type != DTYPE_VNODE) {
fdrop(fp, td);
@@ -2662,7 +2752,6 @@
}
vp = fp->f_vnode;
- vfslocked = VFS_LOCK_GIANT(vp->v_mount);
lf.l_whence = SEEK_SET;
lf.l_start = 0;
lf.l_len = 0;
@@ -2685,7 +2774,6 @@
(uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT);
done2:
fdrop(fp, td);
- VFS_UNLOCK_GIANT(vfslocked);
return (error);
}
/*
@@ -2692,11 +2780,16 @@
* Duplicate the specified descriptor to a free descriptor.
*/
int
-dupfdopen(struct thread *td, struct filedesc *fdp, int indx, int dfd, int mode, int error)
+dupfdopen(struct thread *td, struct filedesc *fdp, int dfd, int mode,
+ int openerror, int *indxp)
{
- struct file *wfp;
+ struct filedescent *newfde, *oldfde;
struct file *fp;
+ int error, indx;
+ KASSERT(openerror == ENODEV || openerror == ENXIO,
+ ("unexpected error %d in %s", openerror, __func__));
+
/*
* If the to-be-dup'd fd number is greater than the allowed number
* of file descriptors, or the fd to be dup'd has already been
@@ -2703,12 +2796,17 @@
* closed, then reject.
*/
FILEDESC_XLOCK(fdp);
- if (dfd < 0 || dfd >= fdp->fd_nfiles ||
- (wfp = fdp->fd_ofiles[dfd]) == NULL) {
+ if ((fp = fget_locked(fdp, dfd)) == NULL) {
FILEDESC_XUNLOCK(fdp);
return (EBADF);
}
+ error = fdalloc(td, 0, &indx);
+ if (error != 0) {
+ FILEDESC_XUNLOCK(fdp);
+ return (error);
+ }
+
/*
* There are two cases of interest here.
*
@@ -2716,61 +2814,50 @@
*
* For ENXIO steal away the file structure from (dfd) and store it in
* (indx). (dfd) is effectively closed by this operation.
- *
- * Any other error code is just returned.
*/
- switch (error) {
+ switch (openerror) {
case ENODEV:
/*
* Check that the mode the file is being opened for is a
* subset of the mode of the existing descriptor.
*/
- if (((mode & (FREAD|FWRITE)) | wfp->f_flag) != wfp->f_flag) {
+ if (((mode & (FREAD|FWRITE)) | fp->f_flag) != fp->f_flag) {
+ fdunused(fdp, indx);
FILEDESC_XUNLOCK(fdp);
return (EACCES);
}
- fp = fdp->fd_ofiles[indx];
- fdp->fd_ofiles[indx] = wfp;
- fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd];
- if (fp == NULL)
- fdused(fdp, indx);
- fhold(wfp);
- FILEDESC_XUNLOCK(fdp);
- if (fp != NULL)
- /*
- * We now own the reference to fp that the ofiles[]
- * array used to own. Release it.
- */
- fdrop(fp, td);
- return (0);
-
+ fhold(fp);
+ newfde = &fdp->fd_ofiles[indx];
+ oldfde = &fdp->fd_ofiles[dfd];
+#ifdef CAPABILITIES
+ seq_write_begin(&newfde->fde_seq);
+#endif
+ memcpy(newfde, oldfde, fde_change_size);
+ filecaps_copy(&oldfde->fde_caps, &newfde->fde_caps);
+#ifdef CAPABILITIES
+ seq_write_end(&newfde->fde_seq);
+#endif
+ break;
case ENXIO:
/*
* Steal away the file pointer from dfd and stuff it into indx.
*/
- fp = fdp->fd_ofiles[indx];
- fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd];
- fdp->fd_ofiles[dfd] = NULL;
- fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd];
- fdp->fd_ofileflags[dfd] = 0;
+ newfde = &fdp->fd_ofiles[indx];
+ oldfde = &fdp->fd_ofiles[dfd];
+#ifdef CAPABILITIES
+ seq_write_begin(&newfde->fde_seq);
+#endif
+ memcpy(newfde, oldfde, fde_change_size);
+ bzero(oldfde, fde_change_size);
fdunused(fdp, dfd);
- if (fp == NULL)
- fdused(fdp, indx);
- FILEDESC_XUNLOCK(fdp);
-
- /*
- * We now own the reference to fp that the ofiles[] array
- * used to own. Release it.
- */
- if (fp != NULL)
- fdrop(fp, td);
- return (0);
-
- default:
- FILEDESC_XUNLOCK(fdp);
- return (error);
+#ifdef CAPABILITIES
+ seq_write_end(&newfde->fde_seq);
+#endif
+ break;
}
- /* NOTREACHED */
+ FILEDESC_XUNLOCK(fdp);
+ *indxp = indx;
+ return (0);
}
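
The seq_write_begin()/seq_write_end() bracket above pairs with the seq_read()/seq_consistent() loop in fget_unlocked(): lockless readers retry whenever they observe an odd or changed sequence value. A simplified single-writer model of the idea (exact memory fencing elided; the kernel seq(9) primitives carry the real barriers):

#include <stdatomic.h>
#include <string.h>

struct entry {
    atomic_uint seq;    /* odd while a write is in progress */
    int payload[4];
};

static void
entry_write(struct entry *e, const int src[4])
{
    atomic_fetch_add(&e->seq, 1);    /* now odd: writer active */
    memcpy(e->payload, src, sizeof(e->payload));
    atomic_fetch_add(&e->seq, 1);    /* even again: entry stable */
}

static void
entry_read(struct entry *e, int dst[4])
{
    unsigned int s1, s2;

    do {
        s1 = atomic_load(&e->seq);
        memcpy(dst, e->payload, sizeof(e->payload));
        s2 = atomic_load(&e->seq);
    } while ((s1 & 1) != 0 || s1 != s2);
}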
/*
@@ -2846,8 +2933,7 @@
struct filedesc_to_leader *fdtol;
fdtol = malloc(sizeof(struct filedesc_to_leader),
- M_FILEDESC_TO_LEADER,
- M_WAITOK);
+ M_FILEDESC_TO_LEADER, M_WAITOK);
fdtol->fdl_refcount = 1;
fdtol->fdl_holdcount = 0;
fdtol->fdl_wakeup = 0;
@@ -2919,8 +3005,8 @@
if (fdp == NULL)
continue;
FILEDESC_SLOCK(fdp);
- for (n = 0; fdp->fd_refcnt > 0 && n < fdp->fd_nfiles; ++n) {
- if ((fp = fdp->fd_ofiles[n]) == NULL)
+ for (n = 0; fdp->fd_refcnt > 0 && n <= fdp->fd_lastfile; ++n) {
+ if ((fp = fdp->fd_ofiles[n].fde_file) == NULL)
continue;
xf.xf_fd = n;
xf.xf_file = fp;
@@ -2944,7 +3030,7 @@
return (error);
}
-SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD,
+SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD|CTLFLAG_MPSAFE,
0, 0, sysctl_kern_file, "S,xfile", "Entire file table");
#ifdef KINFO_OFILE_SIZE
@@ -2958,7 +3044,6 @@
{
int error;
char *fullpath, *freepath;
- int vfslocked;
bzero(kif, sizeof(*kif));
kif->kf_structsize = sizeof(*kif);
@@ -2984,9 +3069,7 @@
fullpath = "-";
FILEDESC_SUNLOCK(fdp);
vn_fullpath(curthread, vp, &fullpath, &freepath);
- vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vrele(vp);
- VFS_UNLOCK_GIANT(vfslocked);
strlcpy(kif->kf_path, fullpath, sizeof(kif->kf_path));
if (freepath != NULL)
free(freepath, M_TEMP);
@@ -3008,13 +3091,13 @@
struct shmfd *shmfd;
struct socket *so;
struct vnode *vp;
+ struct ksem *ks;
struct file *fp;
struct proc *p;
struct tty *tp;
- int vfslocked;
name = (int *)arg1;
- error = pget((pid_t)name[0], PGET_CANDEBUG, &p);
+ error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
if (error != 0)
return (error);
fdp = fdhold(p);
@@ -3032,11 +3115,12 @@
if (fdp->fd_jdir != NULL)
export_vnode_for_osysctl(fdp->fd_jdir, KF_FD_TYPE_JAIL, kif,
fdp, req);
- for (i = 0; i < fdp->fd_nfiles; i++) {
- if ((fp = fdp->fd_ofiles[i]) == NULL)
+ for (i = 0; fdp->fd_refcnt > 0 && i <= fdp->fd_lastfile; i++) {
+ if ((fp = fdp->fd_ofiles[i].fde_file) == NULL)
continue;
bzero(kif, sizeof(*kif));
kif->kf_structsize = sizeof(*kif);
+ ks = NULL;
vp = NULL;
so = NULL;
tp = NULL;
@@ -3043,21 +3127,6 @@
shmfd = NULL;
kif->kf_fd = i;
-#ifdef CAPABILITIES
- /*
- * When reporting a capability, most fields will be from the
- * underlying object, but do mark as a capability. With
- * ofiledesc, we don't have a field to export the cap_rights_t,
- * but we do with the new filedesc.
- */
- if (fp->f_type == DTYPE_CAPABILITY) {
- kif->kf_flags |= KF_FLAG_CAPABILITY;
- (void)cap_funwrap(fp, 0, &fp);
- }
-#else
- KASSERT(fp->f_type != DTYPE_CAPABILITY,
- ("sysctl_kern_proc_ofiledesc: saw capability"));
-#endif
switch (fp->f_type) {
case DTYPE_VNODE:
kif->kf_type = KF_TYPE_VNODE;
@@ -3097,6 +3166,7 @@
case DTYPE_SEM:
kif->kf_type = KF_TYPE_SEM;
+ ks = fp->f_data;
break;
case DTYPE_PTS:
@@ -3175,9 +3245,7 @@
fullpath = "-";
FILEDESC_SUNLOCK(fdp);
vn_fullpath(curthread, vp, &fullpath, &freepath);
- vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vrele(vp);
- VFS_UNLOCK_GIANT(vfslocked);
strlcpy(kif->kf_path, fullpath,
sizeof(kif->kf_path));
if (freepath != NULL)
@@ -3208,6 +3276,8 @@
}
if (shmfd != NULL)
shm_path(shmfd, kif->kf_path, sizeof(kif->kf_path));
+ if (ks != NULL && ksem_info != NULL)
+ ksem_info(ks, kif->kf_path, sizeof(kif->kf_path), NULL);
error = SYSCTL_OUT(req, kif, sizeof(*kif));
if (error)
break;
@@ -3218,8 +3288,9 @@
return (0);
}
-static SYSCTL_NODE(_kern_proc, KERN_PROC_OFILEDESC, ofiledesc, CTLFLAG_RD,
- sysctl_kern_proc_ofiledesc, "Process ofiledesc entries");
+static SYSCTL_NODE(_kern_proc, KERN_PROC_OFILEDESC, ofiledesc,
+ CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_ofiledesc,
+ "Process ofiledesc entries");
#endif /* COMPAT_FREEBSD7 */
#ifdef KINFO_FILE_SIZE
@@ -3226,10 +3297,17 @@
CTASSERT(sizeof(struct kinfo_file) == KINFO_FILE_SIZE);
#endif
+struct export_fd_buf {
+ struct filedesc *fdp;
+ struct sbuf *sb;
+ ssize_t remainder;
+ struct kinfo_file kif;
+ int flags;
+};
+
static int
export_fd_to_sb(void *data, int type, int fd, int fflags, int refcnt,
- int64_t offset, int fd_is_cap, cap_rights_t fd_cap_rights,
- struct kinfo_file *kif, struct sbuf *sb, ssize_t *remainder)
+ int64_t offset, cap_rights_t *rightsp, struct export_fd_buf *efbuf)
{
struct {
int fflag;
@@ -3252,21 +3330,26 @@
{ O_TRUNC, KF_FLAG_TRUNC }
};
#define NFFLAGS (sizeof(fflags_table) / sizeof(*fflags_table))
+ struct kinfo_file *kif;
struct vnode *vp;
- int error, vfslocked;
+ int error, locked;
unsigned int i;
- if (*remainder == 0)
+ if (efbuf->remainder == 0)
return (0);
+ kif = &efbuf->kif;
bzero(kif, sizeof(*kif));
+ locked = efbuf->fdp != NULL;
switch (type) {
case KF_TYPE_FIFO:
case KF_TYPE_VNODE:
+ if (locked) {
+ FILEDESC_SUNLOCK(efbuf->fdp);
+ locked = 0;
+ }
vp = (struct vnode *)data;
error = fill_vnode_info(vp, kif);
- vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vrele(vp);
- VFS_UNLOCK_GIANT(vfslocked);
break;
case KF_TYPE_SOCKET:
error = fill_socket_info((struct socket *)data, kif);
@@ -3280,6 +3363,9 @@
case KF_TYPE_PROCDESC:
error = fill_procdesc_info((struct procdesc *)data, kif);
break;
+ case KF_TYPE_SEM:
+ error = fill_sem_info((struct file *)data, kif);
+ break;
case KF_TYPE_SHM:
error = fill_shm_info((struct file *)data, kif);
break;
@@ -3295,27 +3381,36 @@
for (i = 0; i < NFFLAGS; i++)
if (fflags & fflags_table[i].fflag)
kif->kf_flags |= fflags_table[i].kf_fflag;
- if (fd_is_cap)
- kif->kf_flags |= KF_FLAG_CAPABILITY;
- if (fd_is_cap)
- kif->kf_cap_rights = fd_cap_rights;
+ if (rightsp != NULL)
+ kif->kf_cap_rights = *rightsp;
+ else
+ cap_rights_init(&kif->kf_cap_rights);
kif->kf_fd = fd;
kif->kf_type = type;
kif->kf_ref_count = refcnt;
kif->kf_offset = offset;
- /* Pack record size down */
- kif->kf_structsize = offsetof(struct kinfo_file, kf_path) +
- strlen(kif->kf_path) + 1;
+ if ((efbuf->flags & KERN_FILEDESC_PACK_KINFO) != 0)
+ /* Pack record size down */
+ kif->kf_structsize = offsetof(struct kinfo_file, kf_path) +
+ strlen(kif->kf_path) + 1;
+ else
+ kif->kf_structsize = sizeof(*kif);
kif->kf_structsize = roundup(kif->kf_structsize, sizeof(uint64_t));
- if (*remainder != -1) {
- if (*remainder < kif->kf_structsize) {
+ if (efbuf->remainder != -1) {
+ if (efbuf->remainder < kif->kf_structsize) {
/* Terminate export. */
- *remainder = 0;
+ efbuf->remainder = 0;
+ if (efbuf->fdp != NULL && !locked)
+ FILEDESC_SLOCK(efbuf->fdp);
return (0);
}
- *remainder -= kif->kf_structsize;
+ efbuf->remainder -= kif->kf_structsize;
}
- error = sbuf_bcat(sb, kif, kif->kf_structsize);
+ if (locked)
+ FILEDESC_SUNLOCK(efbuf->fdp);
+ error = sbuf_bcat(efbuf->sb, kif, kif->kf_structsize) == 0 ? 0 : ENOMEM;
+ if (efbuf->fdp != NULL)
+ FILEDESC_SLOCK(efbuf->fdp);
return (error);
}
@@ -3325,22 +3420,21 @@
* Takes a locked proc as argument, and returns with the proc unlocked.
*/
int
-kern_proc_filedesc_out(struct proc *p, struct sbuf *sb, ssize_t maxlen)
+kern_proc_filedesc_out(struct proc *p, struct sbuf *sb, ssize_t maxlen,
+ int flags)
{
struct file *fp;
struct filedesc *fdp;
- struct kinfo_file *kif;
+ struct export_fd_buf *efbuf;
struct vnode *cttyvp, *textvp, *tracevp;
int64_t offset;
void *data;
- ssize_t remainder;
int error, i;
- int fd_is_cap, type, refcnt, fflags;
- cap_rights_t fd_cap_rights;
+ int type, refcnt, fflags;
+ cap_rights_t rights;
PROC_LOCK_ASSERT(p, MA_OWNED);
- remainder = maxlen;
/* ktrace vnode */
tracevp = p->p_tracevp;
if (tracevp != NULL)
@@ -3358,68 +3452,54 @@
}
fdp = fdhold(p);
PROC_UNLOCK(p);
- kif = malloc(sizeof(*kif), M_TEMP, M_WAITOK);
+ efbuf = malloc(sizeof(*efbuf), M_TEMP, M_WAITOK);
+ efbuf->fdp = NULL;
+ efbuf->sb = sb;
+ efbuf->remainder = maxlen;
+ efbuf->flags = flags;
if (tracevp != NULL)
export_fd_to_sb(tracevp, KF_TYPE_VNODE, KF_FD_TYPE_TRACE,
- FREAD | FWRITE, -1, -1, 0, 0, kif, sb, &remainder);
+ FREAD | FWRITE, -1, -1, NULL, efbuf);
if (textvp != NULL)
export_fd_to_sb(textvp, KF_TYPE_VNODE, KF_FD_TYPE_TEXT,
- FREAD, -1, -1, 0, 0, kif, sb, &remainder);
+ FREAD, -1, -1, NULL, efbuf);
if (cttyvp != NULL)
export_fd_to_sb(cttyvp, KF_TYPE_VNODE, KF_FD_TYPE_CTTY,
- FREAD | FWRITE, -1, -1, 0, 0, kif, sb, &remainder);
+ FREAD | FWRITE, -1, -1, NULL, efbuf);
error = 0;
if (fdp == NULL)
goto fail;
+ efbuf->fdp = fdp;
FILEDESC_SLOCK(fdp);
/* working directory */
if (fdp->fd_cdir != NULL) {
vref(fdp->fd_cdir);
data = fdp->fd_cdir;
- FILEDESC_SUNLOCK(fdp);
export_fd_to_sb(data, KF_TYPE_VNODE, KF_FD_TYPE_CWD,
- FREAD, -1, -1, 0, 0, kif, sb, &remainder);
- FILEDESC_SLOCK(fdp);
+ FREAD, -1, -1, NULL, efbuf);
}
/* root directory */
if (fdp->fd_rdir != NULL) {
vref(fdp->fd_rdir);
data = fdp->fd_rdir;
- FILEDESC_SUNLOCK(fdp);
export_fd_to_sb(data, KF_TYPE_VNODE, KF_FD_TYPE_ROOT,
- FREAD, -1, -1, 0, 0, kif, sb, &remainder);
- FILEDESC_SLOCK(fdp);
+ FREAD, -1, -1, NULL, efbuf);
}
/* jail directory */
if (fdp->fd_jdir != NULL) {
vref(fdp->fd_jdir);
data = fdp->fd_jdir;
- FILEDESC_SUNLOCK(fdp);
export_fd_to_sb(data, KF_TYPE_VNODE, KF_FD_TYPE_JAIL,
- FREAD, -1, -1, 0, 0, kif, sb, &remainder);
- FILEDESC_SLOCK(fdp);
+ FREAD, -1, -1, NULL, efbuf);
}
- for (i = 0; i < fdp->fd_nfiles; i++) {
- if ((fp = fdp->fd_ofiles[i]) == NULL)
+ for (i = 0; fdp->fd_refcnt > 0 && i <= fdp->fd_lastfile; i++) {
+ if ((fp = fdp->fd_ofiles[i].fde_file) == NULL)
continue;
data = NULL;
- fd_is_cap = 0;
- fd_cap_rights = 0;
-
#ifdef CAPABILITIES
- /*
- * When reporting a capability, most fields will be from the
- * underlying object, but do mark as a capability and export
- * the capability rights mask.
- */
- if (fp->f_type == DTYPE_CAPABILITY) {
- fd_is_cap = 1;
- fd_cap_rights = cap_rights(fp);
- (void)cap_funwrap(fp, 0, &fp);
- }
+ rights = *cap_rights(fdp, i);
#else /* !CAPABILITIES */
- KASSERT(fp->f_type != DTYPE_CAPABILITY,
- ("sysctl_kern_proc_filedesc: saw capability"));
+ cap_rights_init(&rights);
#endif
switch (fp->f_type) {
case DTYPE_VNODE:
@@ -3463,6 +3543,7 @@
case DTYPE_SEM:
type = KF_TYPE_SEM;
+ data = fp;
break;
case DTYPE_PTS:
@@ -3491,20 +3572,15 @@
* re-validate and re-evaluate its properties when
* the loop continues.
*/
- if (type == KF_TYPE_VNODE || type == KF_TYPE_FIFO)
- FILEDESC_SUNLOCK(fdp);
error = export_fd_to_sb(data, type, i, fflags, refcnt,
- offset, fd_is_cap, fd_cap_rights, kif, sb, &remainder);
- if (type == KF_TYPE_VNODE || type == KF_TYPE_FIFO)
- FILEDESC_SLOCK(fdp);
- if (error)
+ offset, &rights, efbuf);
+ if (error != 0)
break;
}
FILEDESC_SUNLOCK(fdp);
+ fddrop(fdp);
fail:
- if (fdp != NULL)
- fddrop(fdp);
- free(kif, M_TEMP);
+ free(efbuf, M_TEMP);
return (error);
}
@@ -3524,13 +3600,14 @@
name = (int *)arg1;
sbuf_new_for_sysctl(&sb, NULL, FILEDESC_SBUF_SIZE, req);
- error = pget((pid_t)name[0], PGET_CANDEBUG, &p);
+ error = pget((pid_t)name[0], PGET_CANDEBUG | PGET_NOTWEXIT, &p);
if (error != 0) {
sbuf_delete(&sb);
return (error);
}
maxlen = req->oldptr != NULL ? req->oldlen : -1;
- error = kern_proc_filedesc_out(p, &sb, maxlen);
+ error = kern_proc_filedesc_out(p, &sb, maxlen,
+ KERN_FILEDESC_PACK_KINFO);
error2 = sbuf_finish(&sb);
sbuf_delete(&sb);
return (error != 0 ? error : error2);
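
The handler above follows the standard sbuf pattern for variable-length sysctl output: create the sbuf against the request, emit, then report the first failure from either the fill or the drain. A minimal handler of the same shape (hypothetical node, for illustration only):

static int
sysctl_example(SYSCTL_HANDLER_ARGS)
{
    struct sbuf sb;
    int error, error2;

    sbuf_new_for_sysctl(&sb, NULL, 128, req);
    sbuf_printf(&sb, "pid %d", req->td->td_proc->p_pid);
    error = 0;        /* the fill step cannot fail here */
    error2 = sbuf_finish(&sb);
    sbuf_delete(&sb);
    return (error != 0 ? error : error2);
}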
@@ -3568,12 +3645,30 @@
return (KF_VTYPE_UNKNOWN);
}
+static inline void
+vn_fill_junk(struct kinfo_file *kif)
+{
+ size_t len, olen;
+
+ /*
+ * Simulate vn_fullpath returning changing values for a given
+ * vp during e.g. coredump.
+ */
+ len = (arc4random() % (sizeof(kif->kf_path) - 2)) + 1;
+ olen = strlen(kif->kf_path);
+ if (len < olen)
+ strcpy(&kif->kf_path[len - 1], "$");
+ else
+ for (; olen < len; olen++)
+ strcpy(&kif->kf_path[olen], "A");
+}
+
static int
fill_vnode_info(struct vnode *vp, struct kinfo_file *kif)
{
struct vattr va;
char *fullpath, *freepath;
- int error, vfslocked;
+ int error;
if (vp == NULL)
return (1);
@@ -3587,16 +3682,18 @@
if (freepath != NULL)
free(freepath, M_TEMP);
+ KFAIL_POINT_CODE(DEBUG_FP, fill_kinfo_vnode__random_path,
+ vn_fill_junk(kif);
+ );
+
/*
* Retrieve vnode attributes.
*/
va.va_fsid = VNOVAL;
va.va_rdev = NODEV;
- vfslocked = VFS_LOCK_GIANT(vp->v_mount);
vn_lock(vp, LK_SHARED | LK_RETRY);
error = VOP_GETATTR(vp, &va, curthread->td_ucred);
VOP_UNLOCK(vp, 0);
- VFS_UNLOCK_GIANT(vfslocked);
if (error != 0)
return (error);
if (va.va_fsid != VNOVAL)
@@ -3699,6 +3796,25 @@
}
static int
+fill_sem_info(struct file *fp, struct kinfo_file *kif)
+{
+ struct thread *td;
+ struct stat sb;
+
+ td = curthread;
+ if (fp->f_data == NULL)
+ return (1);
+ if (fo_stat(fp, &sb, td->td_ucred, td) != 0)
+ return (1);
+ if (ksem_info == NULL)
+ return (1);
+ ksem_info(fp->f_data, kif->kf_path, sizeof(kif->kf_path),
+ &kif->kf_un.kf_sem.kf_sem_value);
+ kif->kf_un.kf_sem.kf_sem_mode = sb.st_mode;
+ return (0);
+}
+
+static int
fill_shm_info(struct file *fp, struct kinfo_file *kif)
{
struct thread *td;
@@ -3715,8 +3831,9 @@
return (0);
}
-static SYSCTL_NODE(_kern_proc, KERN_PROC_FILEDESC, filedesc, CTLFLAG_RD,
- sysctl_kern_proc_filedesc, "Process filedesc entries");
+static SYSCTL_NODE(_kern_proc, KERN_PROC_FILEDESC, filedesc,
+ CTLFLAG_RD|CTLFLAG_MPSAFE, sysctl_kern_proc_filedesc,
+ "Process filedesc entries");
#ifdef DDB
/*
@@ -3771,8 +3888,8 @@
fdp = p->p_fd;
if (fdp == NULL)
continue;
- for (n = 0; n < fdp->fd_nfiles; n++) {
- if (fp == fdp->fd_ofiles[n])
+ for (n = 0; n <= fdp->fd_lastfile; n++) {
+ if (fp == fdp->fd_ofiles[n].fde_file)
return (p);
}
}
@@ -3821,8 +3938,8 @@
continue;
if ((fdp = p->p_fd) == NULL)
continue;
- for (n = 0; n < fdp->fd_nfiles; ++n) {
- if ((fp = fdp->fd_ofiles[n]) == NULL)
+ for (n = 0; n <= fdp->fd_lastfile; ++n) {
+ if ((fp = fdp->fd_ofiles[n].fde_file) == NULL)
continue;
db_print_file(fp, header);
header = 0;
@@ -3924,6 +4041,15 @@
return (EBADF);
}
+static int
+badfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
+ struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
+ int kflags, struct thread *td)
+{
+
+ return (EBADF);
+}
+
struct fileops badfileops = {
.fo_read = badfo_readwrite,
.fo_write = badfo_readwrite,
@@ -3935,6 +4061,7 @@
.fo_close = badfo_close,
.fo_chmod = badfo_chmod,
.fo_chown = badfo_chown,
+ .fo_sendfile = badfo_sendfile,
};
int
@@ -3953,6 +4080,15 @@
return (EINVAL);
}
+int
+invfo_sendfile(struct file *fp, int sockfd, struct uio *hdr_uio,
+ struct uio *trl_uio, off_t offset, size_t nbytes, off_t *sent, int flags,
+ int kflags, struct thread *td)
+{
+
+ return (EINVAL);
+}
+
/*-------------------------------------------------------------------*/
/*
Modified: trunk/sys/kern/kern_environment.c
===================================================================
--- trunk/sys/kern/kern_environment.c 2018-05-25 20:39:59 UTC (rev 9943)
+++ trunk/sys/kern/kern_environment.c 2018-05-25 20:46:51 UTC (rev 9944)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1998 Michael Smith
* All rights reserved.
@@ -35,7 +36,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/kern_environment.c 294683 2016-01-24 21:04:06Z ian $");
#include <sys/types.h>
#include <sys/param.h>
@@ -210,12 +211,44 @@
return (error);
}
+/*
+ * Populate the initial kernel environment.
+ *
+ * This is called very early in MD startup, either to provide a copy of the
+ * environment obtained from a boot loader, or to provide an empty buffer into
+ * which MD code can store an initial environment using kern_setenv() calls.
+ *
+ * If the global envmode is 1, the environment is initialized from the global
+ * static_env[], regardless of the arguments passed. This implements the env
+ * keyword described in config(5). In this case env_pos is set to env_len,
+ * causing kern_setenv() to return -1 (if len > 0) or panic (if len == 0) until
+ * the dynamic environment is available. The envmode and static_env variables
+ * are defined in env.c which is generated by config(8).
+ *
+ * If len is non-zero, the caller is providing an empty buffer. The caller will
+ * subsequently use kern_setenv() to add up to len bytes of initial environment
+ * before the dynamic environment is available.
+ *
+ * If len is zero, the caller is providing a pre-loaded buffer containing
+ * environment strings. Additional strings cannot be added until the dynamic
+ * environment is available. The memory pointed to must remain stable at least
+ * until sysinit runs init_dynamic_kenv(). If no initial environment is
+ * available from the boot loader, passing a NULL pointer allows the static_env
+ * to be installed if it is configured.
+ */
void
init_static_kenv(char *buf, size_t len)
{
- kern_envp = buf;
- env_len = len;
- env_pos = 0;
+
+ if (envmode == 1) {
+ kern_envp = static_env;
+ env_len = len;
+ env_pos = len;
+ } else {
+ kern_envp = buf;
+ env_len = len;
+ env_pos = 0;
+ }
}
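
The two call patterns the comment describes look roughly like this in MD startup code (buffer and tunable names below are invented for illustration):

    /* Case 1: a boot loader handed us a finished environment. */
    init_static_kenv(loader_envp, 0);

    /*
     * Case 2: no loader environment; provide an empty buffer that MD
     * code may fill via kern_setenv() until the dynamic kenv is up.
     */
    static char md_env[256];

    init_static_kenv(md_env, sizeof(md_env));
    kern_setenv("hw.example.tunable", "1");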
/*
@@ -231,20 +264,23 @@
kenvp = malloc((KENV_SIZE + 1) * sizeof(char *), M_KENV,
M_WAITOK | M_ZERO);
i = 0;
- for (cp = kern_envp; cp != NULL; cp = kernenv_next(cp)) {
- len = strlen(cp) + 1;
- if (len > KENV_MNAMELEN + 1 + KENV_MVALLEN + 1) {
- printf("WARNING: too long kenv string, ignoring %s\n",
- cp);
- continue;
+ if (kern_envp && *kern_envp != '\0') {
+ for (cp = kern_envp; cp != NULL; cp = kernenv_next(cp)) {
+ len = strlen(cp) + 1;
+ if (len > KENV_MNAMELEN + 1 + KENV_MVALLEN + 1) {
+ printf(
+ "WARNING: too long kenv string, ignoring %s\n",
+ cp);
+ continue;
+ }
+ if (i < KENV_SIZE) {
+ kenvp[i] = malloc(len, M_KENV, M_WAITOK);
+ strcpy(kenvp[i++], cp);
+ } else
+ printf(
+ "WARNING: too many kenv strings, ignoring %s\n",
+ cp);
}
- if (i < KENV_SIZE) {
- kenvp[i] = malloc(len, M_KENV, M_WAITOK);
- strcpy(kenvp[i++], cp);
- } else
- printf(
- "WARNING: too many kenv strings, ignoring %s\n",
- cp);
}
kenvp[i] = NULL;
@@ -312,20 +348,12 @@
getenv(const char *name)
{
char buf[KENV_MNAMELEN + 1 + KENV_MVALLEN + 1];
- char *ret, *cp;
- int len;
+ char *ret;
if (dynamic_kenv) {
- mtx_lock(&kenv_lock);
- cp = _getenv_dynamic(name, NULL);
- if (cp != NULL) {
- strcpy(buf, cp);
- mtx_unlock(&kenv_lock);
- len = strlen(buf) + 1;
- ret = malloc(len, M_KENV, M_WAITOK);
- strcpy(ret, buf);
+ if (getenv_string(name, buf, sizeof(buf))) {
+ ret = strdup(buf, M_KENV);
} else {
- mtx_unlock(&kenv_lock);
ret = NULL;
WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
"getenv");
@@ -455,15 +483,20 @@
int
getenv_string(const char *name, char *data, int size)
{
- char *tmp;
+ char *cp;
- tmp = getenv(name);
- if (tmp != NULL) {
- strlcpy(data, tmp, size);
- freeenv(tmp);
- return (1);
- } else
- return (0);
+ if (dynamic_kenv) {
+ mtx_lock(&kenv_lock);
+ cp = _getenv_dynamic(name, NULL);
+ if (cp != NULL)
+ strlcpy(data, cp, size);
+ mtx_unlock(&kenv_lock);
+ } else {
+ cp = _getenv_static(name);
+ if (cp != NULL)
+ strlcpy(data, cp, size);
+ }
+ return (cp != NULL);
}
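
Because getenv_string() now consults the static environment directly before the dynamic kenv exists, early boot code can fetch a tunable into a caller-supplied buffer without the getenv()/freeenv() allocation dance. A sketch (tunable name invented):

    char buf[32];

    if (getenv_string("hw.example.mode", buf, sizeof(buf)))
        printf("hw.example.mode=%s\n", buf);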
/*
@@ -532,18 +565,15 @@
int
getenv_quad(const char *name, quad_t *data)
{
- char *value;
+ char value[KENV_MNAMELEN + 1 + KENV_MVALLEN + 1];
char *vtp;
quad_t iv;
- value = getenv(name);
- if (value == NULL)
+ if (!getenv_string(name, value, sizeof(value)))
return (0);
iv = strtoq(value, &vtp, 0);
- if (vtp == value || (vtp[0] != '\0' && vtp[1] != '\0')) {
- freeenv(value);
+ if (vtp == value || (vtp[0] != '\0' && vtp[1] != '\0'))
return (0);
- }
switch (vtp[0]) {
case 't': case 'T':
iv *= 1024;
@@ -556,11 +586,9 @@
case '\0':
break;
default:
- freeenv(value);
return (0);
}
*data = iv;
- freeenv(value);
return (1);
}
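
The size-suffix cases elided from the hunk above fall through in the full function, so 'k' multiplies by 1024 once, 'm' twice, 'g' three times, and 't' four. With a hypothetical loader.conf entry kern.example.size="16m":

    quad_t sz;

    if (getenv_quad("kern.example.size", &sz))
        /* sz == 16 * 1024 * 1024 */ ;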
Modified: trunk/sys/kern/kern_et.c
===================================================================
--- trunk/sys/kern/kern_et.c 2018-05-25 20:39:59 UTC (rev 9943)
+++ trunk/sys/kern/kern_et.c 2018-05-25 20:46:51 UTC (rev 9944)
@@ -1,5 +1,6 @@
+/* $MidnightBSD$ */
/*-
- * Copyright (c) 2010 Alexander Motin <mav at FreeBSD.org>
+ * Copyright (c) 2010-2013 Alexander Motin <mav at FreeBSD.org>
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -25,7 +26,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/kern_et.c 266347 2014-05-17 20:10:12Z ian $");
#include <sys/param.h>
#include <sys/kernel.h>
@@ -34,6 +35,8 @@
#include <sys/queue.h>
#include <sys/timeet.h>
+#include "opt_timer.h"
+
SLIST_HEAD(et_eventtimers_list, eventtimer);
static struct et_eventtimers_list eventtimers = SLIST_HEAD_INITIALIZER(et_eventtimers);
@@ -62,6 +65,7 @@
et->et_quality);
}
}
+ KASSERT(et->et_start, ("et_register: timer has no start function"));
et->et_sysctl = SYSCTL_ADD_NODE(NULL,
SYSCTL_STATIC_CHILDREN(_kern_eventtimer_et), OID_AUTO, et->et_name,
CTLFLAG_RW, 0, "event timer description");
@@ -112,6 +116,20 @@
}
/*
+ * Change the frequency of the given timer. If it is the active timer,
+ * reconfigure it on all CPUs (reschedules all current events based on the new
+ * timer frequency).
+ */
+void
+et_change_frequency(struct eventtimer *et, uint64_t newfreq)
+{
+
+#ifndef NO_EVENTTIMERS
+ cpu_et_frequency(et, newfreq);
+#endif
+}
+
+/*
* Find free event timer hardware with specified parameters.
*/
struct eventtimer *
@@ -159,43 +177,29 @@
* period - period of subsequent periodic ticks.
*/
int
-et_start(struct eventtimer *et,
- struct bintime *first, struct bintime *period)
+et_start(struct eventtimer *et, sbintime_t first, sbintime_t period)
{
if (!et->et_active)
return (ENXIO);
- if (first == NULL && period == NULL)
- return (EINVAL);
- if ((et->et_flags & ET_FLAGS_PERIODIC) == 0 &&
- period != NULL)
- return (ENODEV);
- if ((et->et_flags & ET_FLAGS_ONESHOT) == 0 &&
- period == NULL)
- return (ENODEV);
- if (first != NULL) {
- if (first->sec < et->et_min_period.sec ||
- (first->sec == et->et_min_period.sec &&
- first->frac < et->et_min_period.frac))
- first = &et->et_min_period;
- if (first->sec > et->et_max_period.sec ||
- (first->sec == et->et_max_period.sec &&
- first->frac > et->et_max_period.frac))
- first = &et->et_max_period;
+ KASSERT(period >= 0, ("et_start: negative period"));
+ KASSERT((et->et_flags & ET_FLAGS_PERIODIC) || period == 0,
+ ("et_start: period specified for oneshot-only timer"));
+ KASSERT((et->et_flags & ET_FLAGS_ONESHOT) || period != 0,
+ ("et_start: period not specified for periodic-only timer"));
+ if (period != 0) {
+ if (period < et->et_min_period)
+ period = et->et_min_period;
+ else if (period > et->et_max_period)
+ period = et->et_max_period;
}
- if (period != NULL) {
- if (period->sec < et->et_min_period.sec ||
- (period->sec == et->et_min_period.sec &&
- period->frac < et->et_min_period.frac))
- period = &et->et_min_period;
- if (period->sec > et->et_max_period.sec ||
- (period->sec == et->et_max_period.sec &&
- period->frac > et->et_max_period.frac))
- period = &et->et_max_period;
+ if (period == 0 || first != 0) {
+ if (first < et->et_min_period)
+ first = et->et_min_period;
+ else if (first > et->et_max_period)
+ first = et->et_max_period;
}
- if (et->et_start)
- return (et->et_start(et, first, period));
- return (0);
+ return (et->et_start(et, first, period));
}
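
sbintime_t packs 32 bits of whole seconds above 32 fraction bits, so the clamping above reduces to plain 64-bit integer comparison. A standalone demonstration (kernel constants re-derived locally; not kernel code):

#include <stdint.h>
#include <stdio.h>

typedef int64_t sbintime_t;
#define SBT_1S  ((sbintime_t)1 << 32)
#define SBT_1MS (SBT_1S / 1000)

int
main(void)
{
    sbintime_t period = 500 * SBT_1MS;    /* requested: 500 ms */
    sbintime_t min = SBT_1MS, max = 60 * SBT_1S;

    if (period < min)
        period = min;
    else if (period > max)
        period = max;
    printf("period = %jd.%03jd s\n", (intmax_t)(period >> 32),
        (intmax_t)(((period & 0xffffffff) * 1000) >> 32));
    return (0);
}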
/* Stop event timer hardware. */
Modified: trunk/sys/kern/kern_event.c
===================================================================
--- trunk/sys/kern/kern_event.c 2018-05-25 20:39:59 UTC (rev 9943)
+++ trunk/sys/kern/kern_event.c 2018-05-25 20:46:51 UTC (rev 9944)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1999,2000,2001 Jonathan Lemon <jlemon at FreeBSD.org>
* Copyright 2004 John-Mark Gurney <jmg at FreeBSD.org>
@@ -27,16 +28,17 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/kern_event.c 320293 2017-06-23 19:04:40Z kib $");
#include "opt_ktrace.h"
#include <sys/param.h>
#include <sys/systm.h>
-#include <sys/capability.h>
+#include <sys/capsicum.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
+#include <sys/rwlock.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/unistd.h>
@@ -51,7 +53,6 @@
#include <sys/eventvar.h>
#include <sys/poll.h>
#include <sys/protosw.h>
-#include <sys/resourcevar.h>
#include <sys/sigio.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
@@ -65,6 +66,7 @@
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
+#include <machine/atomic.h>
#include <vm/uma.h>
@@ -126,6 +128,7 @@
.fo_close = kqueue_close,
.fo_chmod = invfo_chmod,
.fo_chown = invfo_chown,
+ .fo_sendfile = invfo_sendfile,
};
static int knote_attach(struct knote *kn, struct kqueue *kq);
@@ -182,9 +185,9 @@
};
static uma_zone_t knote_zone;
-static int kq_ncallouts = 0;
-static int kq_calloutmax = (4 * 1024);
-SYSCTL_INT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW,
+static unsigned int kq_ncallouts = 0;
+static unsigned int kq_calloutmax = 4 * 1024;
+SYSCTL_UINT(_kern, OID_AUTO, kq_calloutmax, CTLFLAG_RW,
&kq_calloutmax, 0, "Maximum number of callouts allocated for kqueue");
/* XXX - ensure not KN_INFLUX?? */
@@ -362,11 +365,21 @@
kn->kn_flags |= EV_CLEAR; /* automatically set */
/*
- * internal flag indicating registration done by kernel
+ * Internal flag indicating registration done by kernel for the
+ * purposes of getting a NOTE_CHILD notification.
*/
- if (kn->kn_flags & EV_FLAG1) {
+ if (kn->kn_flags & EV_FLAG2) {
+ kn->kn_flags &= ~EV_FLAG2;
kn->kn_data = kn->kn_sdata; /* ppid */
kn->kn_fflags = NOTE_CHILD;
+ kn->kn_sfflags &= ~NOTE_EXIT;
+ immediate = 1; /* Force immediate activation of child note. */
+ }
+ /*
+ * Internal flag indicating registration done by kernel (for other than
+ * NOTE_CHILD).
+ */
+ if (kn->kn_flags & EV_FLAG1) {
kn->kn_flags &= ~EV_FLAG1;
}
@@ -374,9 +387,10 @@
knlist_add(&p->p_klist, kn, 1);
/*
- * Immediately activate any exit notes if the target process is a
- * zombie. This is necessary to handle the case where the target
- * process, e.g. a child, dies before the kevent is registered.
+ * Immediately activate any child notes or, in the case of a zombie
+ * target process, exit notes. The latter is necessary to handle the
+ * case where the target process, e.g. a child, dies before the kevent
+ * is registered.
*/
if (immediate && filt_proc(kn, NOTE_EXIT))
KNOTE_ACTIVATE(kn, 0);
@@ -430,8 +444,11 @@
if (!(kn->kn_status & KN_DETACHED))
knlist_remove_inevent(&p->p_klist, kn);
kn->kn_flags |= (EV_EOF | EV_ONESHOT);
- kn->kn_data = p->p_xstat;
kn->kn_ptr.p_proc = NULL;
+ if (kn->kn_fflags & NOTE_EXIT)
+ kn->kn_data = p->p_xstat;
+ if (kn->kn_fflags == 0)
+ kn->kn_flags |= EV_DROP;
return (1);
}
@@ -463,7 +480,7 @@
continue;
kq = kn->kn_kq;
KQ_LOCK(kq);
- if ((kn->kn_status & KN_INFLUX) == KN_INFLUX) {
+ if ((kn->kn_status & (KN_INFLUX | KN_SCAN)) == KN_INFLUX) {
KQ_UNLOCK(kq);
continue;
}
@@ -473,7 +490,7 @@
*/
if ((kn->kn_sfflags & NOTE_TRACK) == 0) {
kn->kn_status |= KN_HASKQLOCK;
- if (kn->kn_fop->f_event(kn, NOTE_FORK | pid))
+ if (kn->kn_fop->f_event(kn, NOTE_FORK))
KNOTE_ACTIVATE(kn, 1);
kn->kn_status &= ~KN_HASKQLOCK;
KQ_UNLOCK(kq);
@@ -482,7 +499,7 @@
/*
* The NOTE_TRACK case. In addition to the activation
- * of the event, we need to register new event to
+ * of the event, we need to register new events to
* track the child. Drop the locks in preparation for
* the call to kqueue_register().
*/
@@ -491,20 +508,39 @@
list->kl_unlock(list->kl_lockarg);
/*
- * Activate existing knote and register a knote with
+ * Activate existing knote and register tracking knotes with
* new process.
+ *
+ * First register a knote to get just the child notice. This
+ * must be a separate note from a potential NOTE_EXIT
+ * notification since both NOTE_CHILD and NOTE_EXIT are defined
+ * to use the data field (in conflicting ways).
*/
kev.ident = pid;
kev.filter = kn->kn_filter;
+ kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_ONESHOT | EV_FLAG2;
+ kev.fflags = kn->kn_sfflags;
+ kev.data = kn->kn_id; /* parent */
+ kev.udata = kn->kn_kevent.udata;/* preserve udata */
+ error = kqueue_register(kq, &kev, NULL, 0);
+ if (error)
+ kn->kn_fflags |= NOTE_TRACKERR;
+
+ /*
+ * Then register another knote to track other potential events
+ * from the new process.
+ */
+ kev.ident = pid;
+ kev.filter = kn->kn_filter;
kev.flags = kn->kn_flags | EV_ADD | EV_ENABLE | EV_FLAG1;
kev.fflags = kn->kn_sfflags;
kev.data = kn->kn_id; /* parent */
kev.udata = kn->kn_kevent.udata;/* preserve udata */
error = kqueue_register(kq, &kev, NULL, 0);
- if (kn->kn_fop->f_event(kn, NOTE_FORK | pid))
- KNOTE_ACTIVATE(kn, 0);
if (error)
kn->kn_fflags |= NOTE_TRACKERR;
+ if (kn->kn_fop->f_event(kn, NOTE_FORK))
+ KNOTE_ACTIVATE(kn, 0);
KQ_LOCK(kq);
kn->kn_status &= ~KN_INFLUX;
KQ_UNLOCK_FLUX(kq);
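
From userspace, a single NOTE_TRACK registration yields both kinds of knote the code above installs: the watched process's own events plus a one-shot NOTE_CHILD event whose data field carries the parent pid. A minimal runnable sketch:

#include <sys/types.h>
#include <sys/event.h>
#include <err.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
    struct kevent kev;
    int kq;
    pid_t pid;

    if ((kq = kqueue()) == -1)
        err(1, "kqueue");
    EV_SET(&kev, getpid(), EVFILT_PROC, EV_ADD | EV_ENABLE,
        NOTE_TRACK | NOTE_FORK, 0, NULL);
    if (kevent(kq, &kev, 1, NULL, 0, NULL) == -1)
        err(1, "kevent");
    if ((pid = fork()) == 0)
        _exit(0);
    /* Expect NOTE_FORK on the parent, then NOTE_CHILD for the child. */
    if (kevent(kq, NULL, 0, &kev, 1, NULL) == 1)
        printf("filter=%d fflags=%#x ident=%ju\n",
            kev.filter, kev.fflags, (uintmax_t)kev.ident);
    return (0);
}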
@@ -517,64 +553,127 @@
* XXX: EVFILT_TIMER should perhaps live in kern_time.c beside the
* interval timer support code.
*/
-static int
-timertoticks(intptr_t data)
+
+#define NOTE_TIMER_PRECMASK \
+ (NOTE_SECONDS | NOTE_MSECONDS | NOTE_USECONDS | NOTE_NSECONDS)
+
+static sbintime_t
+timer2sbintime(intptr_t data, int flags)
{
- struct timeval tv;
- int tticks;
+ int64_t secs;
- tv.tv_sec = data / 1000;
- tv.tv_usec = (data % 1000) * 1000;
- tticks = tvtohz(&tv);
-
- return tticks;
+ /*
+ * Macros for converting to the fractional second portion of an
+ * sbintime_t using 64bit multiplication to improve precision.
+ */
+#define NS_TO_SBT(ns) (((ns) * (((uint64_t)1 << 63) / 500000000)) >> 32)
+#define US_TO_SBT(us) (((us) * (((uint64_t)1 << 63) / 500000)) >> 32)
+#define MS_TO_SBT(ms) (((ms) * (((uint64_t)1 << 63) / 500)) >> 32)
+ switch (flags & NOTE_TIMER_PRECMASK) {
+ case NOTE_SECONDS:
+#ifdef __LP64__
+ if (data > (SBT_MAX / SBT_1S))
+ return (SBT_MAX);
+#endif
+ return ((sbintime_t)data << 32);
+ case NOTE_MSECONDS: /* FALLTHROUGH */
+ case 0:
+ if (data >= 1000) {
+ secs = data / 1000;
+#ifdef __LP64__
+ if (secs > (SBT_MAX / SBT_1S))
+ return (SBT_MAX);
+#endif
+ return (secs << 32 | MS_TO_SBT(data % 1000));
+ }
+ return (MS_TO_SBT(data));
+ case NOTE_USECONDS:
+ if (data >= 1000000) {
+ secs = data / 1000000;
+#ifdef __LP64__
+ if (secs > (SBT_MAX / SBT_1S))
+ return (SBT_MAX);
+#endif
+ return (secs << 32 | US_TO_SBT(data % 1000000));
+ }
+ return (US_TO_SBT(data));
+ case NOTE_NSECONDS:
+ if (data >= 1000000000) {
+ secs = data / 1000000000;
+#ifdef __LP64__
+ if (secs > (SBT_MAX / SBT_1S))
+ return (SBT_MAX);
+#endif
+ return (secs << 32 | NS_TO_SBT(data % 1000000000));
+ }
+ return (NS_TO_SBT(data));
+ default:
+ break;
+ }
+ return (-1);
}
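
The conversion macros multiply by a 2^63-scaled reciprocal and then shift, rather than dividing first, so sub-unit precision is not thrown away. A standalone check of the nanosecond path (same arithmetic, ordinary C):

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    uint64_t ns = 1500;    /* 1.5 us */
    /* multiply-first, as in NS_TO_SBT(): keeps the fractional bits */
    uint64_t sbt_mul = (ns * (((uint64_t)1 << 63) / 500000000)) >> 32;
    /* naive divide-first: the sub-second part vanishes entirely */
    uint64_t sbt_div = (ns / 1000000000) << 32;

    printf("multiply-first: %ju  divide-first: %ju\n",
        (uintmax_t)sbt_mul, (uintmax_t)sbt_div);
    return (0);
}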
+struct kq_timer_cb_data {
+ struct callout c;
+ sbintime_t next; /* next timer event fires at */
+ sbintime_t to; /* precalculated timer period */
+};
+
static void
filt_timerexpire(void *knx)
{
- struct knote *kn = knx;
- struct callout *calloutp;
+ struct knote *kn;
+ struct kq_timer_cb_data *kc;
+ kn = knx;
kn->kn_data++;
KNOTE_ACTIVATE(kn, 0); /* XXX - handle locking */
- /*
- * timertoticks() uses tvtohz() which always adds 1 to allow
- * for the time until the next clock interrupt being strictly
- * less than 1 clock tick. We don't want that here since we
- * want to appear to be in sync with the clock interrupt even
- * when we're delayed.
- */
- if ((kn->kn_flags & EV_ONESHOT) != EV_ONESHOT) {
- calloutp = (struct callout *)kn->kn_hook;
- callout_reset_curcpu(calloutp, timertoticks(kn->kn_sdata) - 1,
- filt_timerexpire, kn);
- }
+ if ((kn->kn_flags & EV_ONESHOT) != 0)
+ return;
+
+ kc = kn->kn_ptr.p_v;
+ kc->next += kc->to;
+ callout_reset_sbt_on(&kc->c, kc->next, 0, filt_timerexpire, kn,
+ PCPU_GET(cpuid), C_ABSOLUTE);
}
/*
- * data contains amount of time to sleep, in milliseconds
+ * data contains amount of time to sleep
*/
static int
filt_timerattach(struct knote *kn)
{
- struct callout *calloutp;
+ struct kq_timer_cb_data *kc;
+ sbintime_t to;
+ unsigned int ncallouts;
- atomic_add_int(&kq_ncallouts, 1);
+ if (kn->kn_sdata < 0)
+ return (EINVAL);
+ if (kn->kn_sdata == 0 && (kn->kn_flags & EV_ONESHOT) == 0)
+ kn->kn_sdata = 1;
+ /* Only precision units are supported in flags so far. */
+ if ((kn->kn_sfflags & ~NOTE_TIMER_PRECMASK) != 0)
+ return (EINVAL);
- if (kq_ncallouts >= kq_calloutmax) {
- atomic_add_int(&kq_ncallouts, -1);
- return (ENOMEM);
- }
+ to = timer2sbintime(kn->kn_sdata, kn->kn_sfflags);
+ if (to < 0)
+ return (EINVAL);
+ do {
+ ncallouts = kq_ncallouts;
+ if (ncallouts >= kq_calloutmax)
+ return (ENOMEM);
+ } while (!atomic_cmpset_int(&kq_ncallouts, ncallouts, ncallouts + 1));
+
kn->kn_flags |= EV_CLEAR; /* automatically set */
- kn->kn_status &= ~KN_DETACHED; /* knlist_add usually sets it */
- calloutp = malloc(sizeof(*calloutp), M_KQUEUE, M_WAITOK);
- callout_init(calloutp, CALLOUT_MPSAFE);
- kn->kn_hook = calloutp;
- callout_reset_curcpu(calloutp, timertoticks(kn->kn_sdata),
- filt_timerexpire, kn);
+ kn->kn_status &= ~KN_DETACHED; /* knlist_add clears it */
+ kn->kn_ptr.p_v = kc = malloc(sizeof(*kc), M_KQUEUE, M_WAITOK);
+ callout_init(&kc->c, 1);
+ kc->next = to + sbinuptime();
+ kc->to = to;
+ callout_reset_sbt_on(&kc->c, kc->next, 0, filt_timerexpire, kn,
+ PCPU_GET(cpuid), C_ABSOLUTE);
return (0);
}
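
Rescheduling from the previous target time (kc->next += kc->to, armed with C_ABSOLUTE) keeps periodic expirations on a fixed grid, so a callout that fires late does not push every subsequent tick back the way re-arming relative to "now" would. A toy comparison of the two policies:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    int64_t period = 100, late = 7;    /* every expiry runs 7 units late */
    int64_t next_abs = 0, next_rel = 0;
    int i;

    for (i = 1; i <= 3; i++) {
        next_abs += period;                   /* absolute: stays on grid */
        next_rel = next_rel + late + period;  /* relative: drift adds up */
        printf("tick %d: absolute=%jd relative=%jd\n", i,
            (intmax_t)next_abs, (intmax_t)next_rel);
    }
    return (0);
}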
@@ -582,13 +681,15 @@
static void
filt_timerdetach(struct knote *kn)
{
- struct callout *calloutp;
+ struct kq_timer_cb_data *kc;
+ unsigned int old;
- calloutp = (struct callout *)kn->kn_hook;
- callout_drain(calloutp);
- free(calloutp, M_KQUEUE);
- atomic_add_int(&kq_ncallouts, -1);
- kn->kn_status |= KN_DETACHED; /* knlist_remove usually clears it */
+ kc = kn->kn_ptr.p_v;
+ callout_drain(&kc->c);
+ free(kc, M_KQUEUE);
+ old = atomic_fetchadd_int(&kq_ncallouts, -1);
+ KASSERT(old > 0, ("Number of callouts cannot become negative"));
+ kn->kn_status |= KN_DETACHED; /* knlist_remove sets it */
}
static int
@@ -689,41 +790,33 @@
int
sys_kqueue(struct thread *td, struct kqueue_args *uap)
{
+
+ return (kern_kqueue(td, 0));
+}
+
+int
+kern_kqueue(struct thread *td, int flags)
+{
struct filedesc *fdp;
struct kqueue *kq;
struct file *fp;
- struct proc *p;
- struct ucred *cred;
int fd, error;
- p = td->td_proc;
- cred = td->td_ucred;
- crhold(cred);
- PROC_LOCK(p);
- if (!chgkqcnt(cred->cr_ruidinfo, 1, lim_cur(td->td_proc,
- RLIMIT_KQUEUES))) {
- PROC_UNLOCK(p);
- crfree(cred);
- return (EMFILE);
- }
- PROC_UNLOCK(p);
-
- fdp = p->p_fd;
- error = falloc(td, &fp, &fd, 0);
+ fdp = td->td_proc->p_fd;
+ error = falloc(td, &fp, &fd, flags);
if (error)
goto done2;
- /* An extra reference on `nfp' has been held for us by falloc(). */
+ /* An extra reference on `fp' has been held for us by falloc(). */
kq = malloc(sizeof *kq, M_KQUEUE, M_WAITOK | M_ZERO);
mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF|MTX_DUPOK);
TAILQ_INIT(&kq->kq_head);
kq->kq_fdp = fdp;
- kq->kq_cred = cred;
knlist_init_mtx(&kq->kq_sel.si_note, &kq->kq_lock);
TASK_INIT(&kq->kq_task, 0, kqueue_task, kq);
FILEDESC_XLOCK(fdp);
- SLIST_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list);
+ TAILQ_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list);
FILEDESC_XUNLOCK(fdp);
finit(fp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops);
@@ -731,13 +824,20 @@
td->td_retval[0] = fd;
done2:
- if (error != 0) {
- chgkqcnt(cred->cr_ruidinfo, -1, 0);
- crfree(cred);
- }
return (error);
}
+#ifdef KTRACE
+static size_t
+kev_iovlen(int n, u_int kgio)
+{
+
+ if (n < 0 || n >= kgio / sizeof(struct kevent))
+ return (kgio);
+ return (n * sizeof(struct kevent));
+}
+#endif
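
kev_iovlen() clamps what ktrace records to ktr_geniosize, so a huge or negative event count cannot make ktrgenio() log unbounded data. A standalone model with a stand-in kevent size (32 bytes here is an assumption for illustration):

#include <stddef.h>
#include <stdio.h>

struct kevent_model { char pad[32]; };    /* stand-in for the real size */

static size_t
kev_iovlen_model(int n, unsigned int kgio)
{
    if (n < 0 || n >= kgio / sizeof(struct kevent_model))
        return (kgio);
    return (n * sizeof(struct kevent_model));
}

int
main(void)
{
    printf("%zu %zu %zu\n", kev_iovlen_model(8, 4096),
        kev_iovlen_model(200, 4096), kev_iovlen_model(-1, 4096));
    /* prints: 256 4096 4096 */
    return (0);
}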
+
#ifndef _SYS_SYSPROTO_H_
struct kevent_args {
int fd;
@@ -761,6 +861,7 @@
struct iovec ktriov;
struct uio *ktruioin = NULL;
struct uio *ktruioout = NULL;
+ u_int kgio;
#endif
if (uap->timeout != NULL) {
@@ -773,13 +874,15 @@
#ifdef KTRACE
if (KTRPOINT(td, KTR_GENIO)) {
+ kgio = ktr_geniosize;
ktriov.iov_base = uap->changelist;
- ktriov.iov_len = uap->nchanges * sizeof(struct kevent);
+ ktriov.iov_len = kev_iovlen(uap->nchanges, kgio);
ktruio = (struct uio){ .uio_iov = &ktriov, .uio_iovcnt = 1,
.uio_segflg = UIO_USERSPACE, .uio_rw = UIO_READ,
.uio_td = td };
ktruioin = cloneuio(&ktruio);
ktriov.iov_base = uap->eventlist;
+ ktriov.iov_len = kev_iovlen(uap->nevents, kgio);
- ktriov.iov_len = uap->nevents * sizeof(struct kevent);
ktruioout = cloneuio(&ktruio);
}
@@ -790,9 +893,9 @@
#ifdef KTRACE
if (ktruioin != NULL) {
- ktruioin->uio_resid = uap->nchanges * sizeof(struct kevent);
+ ktruioin->uio_resid = kev_iovlen(uap->nchanges, kgio);
ktrgenio(uap->fd, UIO_WRITE, ktruioin, 0);
- ktruioout->uio_resid = td->td_retval[0] * sizeof(struct kevent);
+ ktruioout->uio_resid = kev_iovlen(td->td_retval[0], kgio);
ktrgenio(uap->fd, UIO_READ, ktruioout, error);
}
#endif
@@ -840,16 +943,37 @@
kern_kevent(struct thread *td, int fd, int nchanges, int nevents,
struct kevent_copyops *k_ops, const struct timespec *timeout)
{
+ cap_rights_t rights;
+ struct file *fp;
+ int error;
+
+ cap_rights_init(&rights);
+ if (nchanges > 0)
+ cap_rights_set(&rights, CAP_KQUEUE_CHANGE);
+ if (nevents > 0)
+ cap_rights_set(&rights, CAP_KQUEUE_EVENT);
+ error = fget(td, fd, &rights, &fp);
+ if (error != 0)
+ return (error);
+
+ error = kern_kevent_fp(td, fp, nchanges, nevents, k_ops, timeout);
+ fdrop(fp, td);
+
+ return (error);
+}
+
+int
+kern_kevent_fp(struct thread *td, struct file *fp, int nchanges, int nevents,
+ struct kevent_copyops *k_ops, const struct timespec *timeout)
+{
struct kevent keva[KQ_NEVENTS];
struct kevent *kevp, *changes;
struct kqueue *kq;
- struct file *fp;
int i, n, nerrors, error;
- if ((error = fget(td, fd, CAP_POST_EVENT, &fp)) != 0)
+ error = kqueue_acquire(fp, &kq);
+ if (error != 0)
return (error);
- if ((error = kqueue_acquire(fp, &kq)) != 0)
- goto done_norel;
nerrors = 0;
@@ -889,8 +1013,6 @@
error = kqueue_scan(kq, nevents, k_ops, timeout, keva, td);
done:
kqueue_release(kq, 0);
-done_norel:
- fdrop(fp, td);
return (error);
}
@@ -984,13 +1106,15 @@
struct filterops *fops;
struct file *fp;
struct knote *kn, *tkn;
+ cap_rights_t rights;
int error, filt, event;
- int haskqglobal;
+ int haskqglobal, filedesc_unlock;
fp = NULL;
kn = NULL;
error = 0;
haskqglobal = 0;
+ filedesc_unlock = 0;
filt = kev->filter;
fops = kqueue_fo_find(filt);
@@ -1002,7 +1126,11 @@
findkn:
if (fops->f_isfd) {
KASSERT(td != NULL, ("td is NULL"));
- error = fget(td, kev->ident, CAP_POLL_EVENT, &fp);
+ if (kev->ident > INT_MAX)
+ error = EBADF;
+ else
+ error = fget(td, kev->ident,
+ cap_rights_init(&rights, CAP_EVENT), &fp);
if (error)
goto done;
@@ -1019,7 +1147,7 @@
if (fp->f_type == DTYPE_KQUEUE) {
/*
- * if we add some inteligence about what we are doing,
+ * If we add some intelligence about what we are doing,
* we should be able to support events on ourselves.
* We need to know when we are doing this to prevent
* getting both the knlist lock and the kq lock since
@@ -1030,6 +1158,13 @@
goto done;
}
+ /*
+ * Pre-lock the filedesc before the global
+ * lock mutex, see the comment in
+ * kqueue_close().
+ */
+ FILEDESC_XLOCK(td->td_proc->p_fd);
+ filedesc_unlock = 1;
KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
}
@@ -1044,7 +1179,18 @@
kqueue_expand(kq, fops, kev->ident, waitok);
KQ_LOCK(kq);
- if (kq->kq_knhashmask != 0) {
+
+ /*
+ * If possible, find an existing knote to use for this kevent.
+ */
+ if (kev->filter == EVFILT_PROC &&
+ (kev->flags & (EV_FLAG1 | EV_FLAG2)) != 0) {
+ /* This is an internal creation of a process tracking
+ * note. Don't attempt to coalesce this with an
+ * existing note.
+ */
+ ;
+ } else if (kq->kq_knhashmask != 0) {
struct klist *list;
list = &kq->kq_knhash[
@@ -1056,9 +1202,13 @@
}
}
- /* knote is in the process of changing, wait for it to stablize. */
+ /* knote is in the process of changing, wait for it to stabilize. */
if (kn != NULL && (kn->kn_status & KN_INFLUX) == KN_INFLUX) {
KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
+ if (filedesc_unlock) {
+ FILEDESC_XUNLOCK(td->td_proc->p_fd);
+ filedesc_unlock = 0;
+ }
kq->kq_state |= KQ_FLUXWAIT;
msleep(kq, &kq->kq_lock, PSOCK | PDROP, "kqflxwt", 0);
if (fp != NULL) {
@@ -1134,7 +1284,7 @@
* but doing so will not reset any filter which has already been
* triggered.
*/
- kn->kn_status |= KN_INFLUX;
+ kn->kn_status |= KN_INFLUX | KN_SCAN;
KQ_UNLOCK(kq);
KN_LIST_LOCK(kn);
kn->kn_kevent.udata = kev->udata;
@@ -1157,7 +1307,7 @@
KQ_LOCK(kq);
if (event)
KNOTE_ACTIVATE(kn, 1);
- kn->kn_status &= ~KN_INFLUX;
+ kn->kn_status &= ~(KN_INFLUX | KN_SCAN);
KN_LIST_UNLOCK(kn);
if ((kev->flags & EV_DISABLE) &&
@@ -1175,6 +1325,8 @@
done:
KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
+ if (filedesc_unlock)
+ FILEDESC_XUNLOCK(td->td_proc->p_fd);
if (fp != NULL)
fdrop(fp, td);
if (tkn != NULL)
@@ -1338,10 +1490,9 @@
const struct timespec *tsp, struct kevent *keva, struct thread *td)
{
struct kevent *kevp;
- struct timeval atv, rtv, ttv;
struct knote *kn, *marker;
- int count, timeout, nkev, error, influx;
- int haskqglobal, touch;
+ sbintime_t asbt, rsbt;
+ int count, error, haskqglobal, influx, nkev, touch;
count = maxevents;
nkev = 0;
@@ -1351,24 +1502,29 @@
if (maxevents == 0)
goto done_nl;
+ rsbt = 0;
if (tsp != NULL) {
- TIMESPEC_TO_TIMEVAL(&atv, tsp);
- if (itimerfix(&atv)) {
+ if (tsp->tv_sec < 0 || tsp->tv_nsec < 0 ||
+ tsp->tv_nsec >= 1000000000) {
error = EINVAL;
goto done_nl;
}
- if (tsp->tv_sec == 0 && tsp->tv_nsec == 0)
- timeout = -1;
- else
- timeout = atv.tv_sec > 24 * 60 * 60 ?
- 24 * 60 * 60 * hz : tvtohz(&atv);
- getmicrouptime(&rtv);
- timevaladd(&atv, &rtv);
- } else {
- atv.tv_sec = 0;
- atv.tv_usec = 0;
- timeout = 0;
- }
+ if (timespecisset(tsp)) {
+ if (tsp->tv_sec <= INT32_MAX) {
+ rsbt = tstosbt(*tsp);
+ if (TIMESEL(&asbt, rsbt))
+ asbt += tc_tick_sbt;
+ if (asbt <= SBT_MAX - rsbt)
+ asbt += rsbt;
+ else
+ asbt = 0;
+ rsbt >>= tc_precexp;
+ } else
+ asbt = 0;
+ } else
+ asbt = -1;
+ } else
+ asbt = 0;
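
The timeout handling switches from hz ticks to sbintime_t: a NULL tsp yields asbt = 0 (sleep until an event arrives), a zero timespec yields asbt = -1 (poll; EWOULDBLOCK if the queue is empty), and anything else becomes an absolute deadline with a precision window of rsbt >> tc_precexp. A rough userspace sketch of the Q32.32 fixed-point conversion, with stand-ins for tstosbt()/SBT_1S rather than the kernel code:

    #include <stdint.h>
    #include <stdio.h>
    #include <time.h>

    typedef int64_t sbintime_t;         /* Q32.32: high 32 bits = seconds */
    #define MY_SBT_1S   ((sbintime_t)1 << 32)

    static sbintime_t
    my_tstosbt(struct timespec ts)
    {

        /* Round the nanosecond part up so we never sleep too little. */
        return (((sbintime_t)ts.tv_sec << 32) +
            (ts.tv_nsec * MY_SBT_1S + 999999999) / 1000000000);
    }

    int
    main(void)
    {
        struct timespec ts = { .tv_sec = 1, .tv_nsec = 500000000 };

        /* 1.5 s => 1.5 * 2^32 = 6442450944 */
        printf("%jd\n", (intmax_t)my_tstosbt(ts));
        return (0);
    }
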
marker = knote_alloc(1);
if (marker == NULL) {
error = ENOMEM;
@@ -1376,28 +1532,16 @@
}
marker->kn_status = KN_MARKER;
KQ_LOCK(kq);
- goto start;
retry:
- if (atv.tv_sec || atv.tv_usec) {
- getmicrouptime(&rtv);
- if (timevalcmp(&rtv, &atv, >=))
- goto done;
- ttv = atv;
- timevalsub(&ttv, &rtv);
- timeout = ttv.tv_sec > 24 * 60 * 60 ?
- 24 * 60 * 60 * hz : tvtohz(&ttv);
- }
-
-start:
kevp = keva;
if (kq->kq_count == 0) {
- if (timeout < 0) {
+ if (asbt == -1) {
error = EWOULDBLOCK;
} else {
kq->kq_state |= KQ_SLEEP;
- error = msleep(kq, &kq->kq_lock, PSOCK | PCATCH,
- "kqread", timeout);
+ error = msleep_sbt(kq, &kq->kq_lock, PSOCK | PCATCH,
+ "kqread", asbt, rsbt, C_ABSOLUTE);
}
if (error == 0)
goto retry;
@@ -1442,7 +1586,7 @@
KASSERT((kn->kn_status & KN_INFLUX) == 0,
("KN_INFLUX set when not suppose to be"));
- if ((kn->kn_flags & EV_ONESHOT) == EV_ONESHOT) {
+ if ((kn->kn_flags & EV_DROP) == EV_DROP) {
kn->kn_status &= ~KN_QUEUED;
kn->kn_status |= KN_INFLUX;
kq->kq_count--;
@@ -1451,6 +1595,20 @@
* We don't need to lock the list since we've marked
* it _INFLUX.
*/
+ if (!(kn->kn_status & KN_DETACHED))
+ kn->kn_fop->f_detach(kn);
+ knote_drop(kn, td);
+ KQ_LOCK(kq);
+ continue;
+ } else if ((kn->kn_flags & EV_ONESHOT) == EV_ONESHOT) {
+ kn->kn_status &= ~KN_QUEUED;
+ kn->kn_status |= KN_INFLUX;
+ kq->kq_count--;
+ KQ_UNLOCK(kq);
+ /*
+ * We don't need to lock the list since we've marked
+ * it _INFLUX.
+ */
*kevp = kn->kn_kevent;
if (!(kn->kn_status & KN_DETACHED))
kn->kn_fop->f_detach(kn);
@@ -1458,7 +1616,7 @@
KQ_LOCK(kq);
kn = NULL;
} else {
- kn->kn_status |= KN_INFLUX;
+ kn->kn_status |= KN_INFLUX | KN_SCAN;
KQ_UNLOCK(kq);
if ((kn->kn_status & KN_KQUEUE) == KN_KQUEUE)
KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
@@ -1467,7 +1625,8 @@
KQ_LOCK(kq);
KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
kn->kn_status &=
- ~(KN_QUEUED | KN_ACTIVE | KN_INFLUX);
+ ~(KN_QUEUED | KN_ACTIVE | KN_INFLUX |
+ KN_SCAN);
kq->kq_count--;
KN_LIST_UNLOCK(kn);
influx = 1;
@@ -1481,7 +1640,7 @@
*kevp = kn->kn_kevent;
KQ_LOCK(kq);
KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
- if (kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) {
+ if (kn->kn_flags & (EV_CLEAR | EV_DISPATCH)) {
/*
* Manually clear knotes who weren't
* 'touch'ed.
@@ -1497,7 +1656,7 @@
} else
TAILQ_INSERT_TAIL(&kq->kq_head, kn, kn_tqe);
- kn->kn_status &= ~(KN_INFLUX);
+ kn->kn_status &= ~(KN_INFLUX | KN_SCAN);
KN_LIST_UNLOCK(kn);
influx = 1;
}
@@ -1662,10 +1821,12 @@
struct knote *kn;
int i;
int error;
+ int filedesc_unlock;
if ((error = kqueue_acquire(fp, &kq)))
return error;
+ filedesc_unlock = 0;
KQ_LOCK(kq);
KASSERT((kq->kq_state & KQ_CLOSING) != KQ_CLOSING,
@@ -1727,9 +1888,20 @@
KQ_UNLOCK(kq);
- FILEDESC_XLOCK(fdp);
- SLIST_REMOVE(&fdp->fd_kqlist, kq, kqueue, kq_list);
- FILEDESC_XUNLOCK(fdp);
+ /*
+ * We could be called due to the knote_drop() doing fdrop(),
+ * called from kqueue_register(). In this case the global
+ * lock is owned, and the filedesc sx is locked beforehand, so
+ * that a sleepable lock is never taken after a non-sleepable one.
+ */
+ if (!sx_xlocked(FILEDESC_LOCK(fdp))) {
+ FILEDESC_XLOCK(fdp);
+ filedesc_unlock = 1;
+ } else
+ filedesc_unlock = 0;
+ TAILQ_REMOVE(&fdp->fd_kqlist, kq, kq_list);
+ if (filedesc_unlock)
+ FILEDESC_XUNLOCK(fdp);
seldrain(&kq->kq_sel);
knlist_destroy(&kq->kq_sel.si_note);
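
kqueue_close() may now run with the filedesc sx already held (via knote_drop() doing fdrop() from kqueue_register()), so it takes and releases the lock only when the current thread does not already own it. The idiom in isolation, as a kernel-context sketch rather than the actual function:

    static void
    remove_from_fd_list(struct filedesc *fdp, struct kqueue *kq)
    {
        int unlock;

        unlock = 0;
        if (!sx_xlocked(FILEDESC_LOCK(fdp))) {
            FILEDESC_XLOCK(fdp);
            unlock = 1;
        }
        TAILQ_REMOVE(&fdp->fd_kqlist, kq, kq_list);
        if (unlock)
            FILEDESC_XUNLOCK(fdp);
    }
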
@@ -1742,8 +1914,6 @@
free(kq->kq_knlist, M_KQUEUE);
funsetown(&kq->kq_sigio);
- chgkqcnt(kq->kq_cred->cr_ruidinfo, -1, 0);
- crfree(kq->kq_cred);
free(kq, M_KQUEUE);
fp->f_data = NULL;
@@ -1783,8 +1953,9 @@
knote(struct knlist *list, long hint, int lockflags)
{
struct kqueue *kq;
- struct knote *kn;
+ struct knote *kn, *tkn;
int error;
+ bool own_influx;
if (list == NULL)
return;
@@ -1795,37 +1966,44 @@
list->kl_lock(list->kl_lockarg);
/*
- * If we unlock the list lock (and set KN_INFLUX), we can eliminate
- * the kqueue scheduling, but this will introduce four
- * lock/unlock's for each knote to test. If we do, continue to use
- * SLIST_FOREACH, SLIST_FOREACH_SAFE is not safe in our case, it is
- * only safe if you want to remove the current item, which we are
- * not doing.
+ * If we unlock the list lock (and set KN_INFLUX), we can
+ * eliminate the kqueue scheduling, but this will introduce
+ * four lock/unlock's for each knote to test. Also, marker
+ * would be needed to keep iteration position, since filters
+ * or other threads could remove events.
*/
- SLIST_FOREACH(kn, &list->kl_list, kn_selnext) {
+ SLIST_FOREACH_SAFE(kn, &list->kl_list, kn_selnext, tkn) {
kq = kn->kn_kq;
- if ((kn->kn_status & KN_INFLUX) != KN_INFLUX) {
+ KQ_LOCK(kq);
+ if ((kn->kn_status & (KN_INFLUX | KN_SCAN)) == KN_INFLUX) {
+ /*
+ * Do not process the influx notes, except for
+ * the influx coming from the kq unlock in the
+ * kqueue_scan(). In the latter case, we do
+ * not interfere with the scan, since the code
+ * fragment in kqueue_scan() locks the knlist,
+ * and cannot proceed until we have finished.
+ */
+ KQ_UNLOCK(kq);
+ } else if ((lockflags & KNF_NOKQLOCK) != 0) {
+ own_influx = (kn->kn_status & KN_INFLUX) == 0;
+ if (own_influx)
+ kn->kn_status |= KN_INFLUX;
+ KQ_UNLOCK(kq);
+ error = kn->kn_fop->f_event(kn, hint);
KQ_LOCK(kq);
- if ((kn->kn_status & KN_INFLUX) == KN_INFLUX) {
- KQ_UNLOCK(kq);
- } else if ((lockflags & KNF_NOKQLOCK) != 0) {
- kn->kn_status |= KN_INFLUX;
- KQ_UNLOCK(kq);
- error = kn->kn_fop->f_event(kn, hint);
- KQ_LOCK(kq);
+ if (own_influx)
kn->kn_status &= ~KN_INFLUX;
- if (error)
- KNOTE_ACTIVATE(kn, 1);
- KQ_UNLOCK_FLUX(kq);
- } else {
- kn->kn_status |= KN_HASKQLOCK;
- if (kn->kn_fop->f_event(kn, hint))
- KNOTE_ACTIVATE(kn, 1);
- kn->kn_status &= ~KN_HASKQLOCK;
- KQ_UNLOCK(kq);
- }
+ if (error)
+ KNOTE_ACTIVATE(kn, 1);
+ KQ_UNLOCK_FLUX(kq);
+ } else {
+ kn->kn_status |= KN_HASKQLOCK;
+ if (kn->kn_fop->f_event(kn, hint))
+ KNOTE_ACTIVATE(kn, 1);
+ kn->kn_status &= ~KN_HASKQLOCK;
+ KQ_UNLOCK(kq);
}
- kq = NULL;
}
if ((lockflags & KNF_LISTLOCKED) == 0)
list->kl_unlock(list->kl_lockarg);
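
knote() switches to SLIST_FOREACH_SAFE because the loop body can now drop the kq lock (KNF_NOKQLOCK) while filters or other threads remove entries; the _SAFE form caches the successor before the body runs. A standalone illustration, assuming a BSD-style <sys/queue.h> (the _SAFE variants are not present in every libc copy of that header):

    #include <sys/queue.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct node {
        int v;
        SLIST_ENTRY(node) link;
    };

    int
    main(void)
    {
        SLIST_HEAD(, node) head = SLIST_HEAD_INITIALIZER(head);
        struct node *n, *tn;
        int i;

        for (i = 0; i < 4; i++) {
            n = malloc(sizeof(*n));
            n->v = i;
            SLIST_INSERT_HEAD(&head, n, link);
        }
        /* Safe even though the body frees the current element. */
        SLIST_FOREACH_SAFE(n, &head, link, tn) {
            printf("%d\n", n->v);
            SLIST_REMOVE(&head, n, node, link);
            free(n);
        }
        return (0);
    }
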
@@ -1875,7 +2053,7 @@
}
/*
- * remove all knotes from a specified klist
+ * remove knote from the specified knlist
*/
void
knlist_remove(struct knlist *knl, struct knote *kn, int islocked)
@@ -1885,7 +2063,7 @@
}
/*
- * remove knote from a specified klist while in f_event handler.
+ * remove knote from the specified knlist while in f_event handler.
*/
void
knlist_remove_inevent(struct knlist *knl, struct knote *kn)
@@ -1898,13 +2076,14 @@
int
knlist_empty(struct knlist *knl)
{
+
KNL_ASSERT_LOCKED(knl);
- return SLIST_EMPTY(&knl->kl_list);
+ return (SLIST_EMPTY(&knl->kl_list));
}
-static struct mtx knlist_lock;
+static struct mtx knlist_lock;
MTX_SYSINIT(knlist_lock, &knlist_lock, "knlist lock for lockless objects",
- MTX_DEF);
+ MTX_DEF);
static void knlist_mtx_lock(void *arg);
static void knlist_mtx_unlock(void *arg);
@@ -1911,6 +2090,7 @@
static void
knlist_mtx_lock(void *arg)
{
+
mtx_lock((struct mtx *)arg);
}
@@ -1917,6 +2097,7 @@
static void
knlist_mtx_unlock(void *arg)
{
+
mtx_unlock((struct mtx *)arg);
}
@@ -1923,6 +2104,7 @@
static void
knlist_mtx_assert_locked(void *arg)
{
+
mtx_assert((struct mtx *)arg, MA_OWNED);
}
@@ -1929,9 +2111,38 @@
static void
knlist_mtx_assert_unlocked(void *arg)
{
+
mtx_assert((struct mtx *)arg, MA_NOTOWNED);
}
+static void
+knlist_rw_rlock(void *arg)
+{
+
+ rw_rlock((struct rwlock *)arg);
+}
+
+static void
+knlist_rw_runlock(void *arg)
+{
+
+ rw_runlock((struct rwlock *)arg);
+}
+
+static void
+knlist_rw_assert_locked(void *arg)
+{
+
+ rw_assert((struct rwlock *)arg, RA_LOCKED);
+}
+
+static void
+knlist_rw_assert_unlocked(void *arg)
+{
+
+ rw_assert((struct rwlock *)arg, RA_UNLOCKED);
+}
+
void
knlist_init(struct knlist *knl, void *lock, void (*kl_lock)(void *),
void (*kl_unlock)(void *),
@@ -1971,20 +2182,19 @@
}
void
+knlist_init_rw_reader(struct knlist *knl, struct rwlock *lock)
+{
+
+ knlist_init(knl, lock, knlist_rw_rlock, knlist_rw_runlock,
+ knlist_rw_assert_locked, knlist_rw_assert_unlocked);
+}
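
knlist_init_rw_reader() lets a subsystem that already serializes with an rwlock hand the read side to the knote machinery. A hypothetical consumer, with invented names and kernel context assumed:

    #include <sys/param.h>
    #include <sys/lock.h>
    #include <sys/rwlock.h>
    #include <sys/event.h>

    struct foo_softc {
        struct rwlock   sc_lock;
        struct knlist   sc_note;
    };

    static void
    foo_attach(struct foo_softc *sc)
    {

        rw_init(&sc->sc_lock, "foosc");
        /* f_event handlers will run under rw_rlock(&sc->sc_lock). */
        knlist_init_rw_reader(&sc->sc_note, &sc->sc_lock);
    }
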
+
+void
knlist_destroy(struct knlist *knl)
{
-#ifdef INVARIANTS
- /*
- * if we run across this error, we need to find the offending
- * driver and have it call knlist_clear.
- */
- if (!SLIST_EMPTY(&knl->kl_list))
- printf("WARNING: destroying knlist w/ knotes on it!\n");
-#endif
-
- knl->kl_lockarg = knl->kl_lock = knl->kl_unlock = NULL;
- SLIST_INIT(&knl->kl_list);
+ KASSERT(KNLIST_EMPTY(knl),
+ ("destroying knlist %p with knotes on it", knl));
}
/*
@@ -2066,7 +2276,7 @@
* We shouldn't have to worry about new kevents appearing on fd
* since filedesc is locked.
*/
- SLIST_FOREACH(kq, &fdp->fd_kqlist, kq_list) {
+ TAILQ_FOREACH(kq, &fdp->fd_kqlist, kq_list) {
KQ_LOCK(kq);
again:
@@ -2103,17 +2313,15 @@
if (kn->kn_fop->f_isfd) {
if (kn->kn_id >= kq->kq_knlistsize)
- return ENOMEM;
+ return (ENOMEM);
list = &kq->kq_knlist[kn->kn_id];
} else {
if (kq->kq_knhash == NULL)
- return ENOMEM;
+ return (ENOMEM);
list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
}
-
SLIST_INSERT_HEAD(list, kn, kn_link);
-
- return 0;
+ return (0);
}
/*
@@ -2212,19 +2420,19 @@
{
struct kqueue *kq;
struct file *fp;
+ cap_rights_t rights;
int error;
- if ((error = fget(td, fd, CAP_POST_EVENT, &fp)) != 0)
+ error = fget(td, fd, cap_rights_init(&rights, CAP_KQUEUE_CHANGE), &fp);
+ if (error != 0)
return (error);
if ((error = kqueue_acquire(fp, &kq)) != 0)
goto noacquire;
error = kqueue_register(kq, kev, td, waitok);
-
kqueue_release(kq, 0);
noacquire:
fdrop(fp, td);
-
- return error;
+ return (error);
}
Modified: trunk/sys/kern/kern_exec.c
===================================================================
--- trunk/sys/kern/kern_exec.c 2018-05-25 20:39:59 UTC (rev 9943)
+++ trunk/sys/kern/kern_exec.c 2018-05-25 20:46:51 UTC (rev 9944)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1993, David Greenman
* All rights reserved.
@@ -25,7 +26,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/kern_exec.c 330067 2018-02-27 14:45:55Z avg $");
#include "opt_capsicum.h"
#include "opt_hwpmc_hooks.h"
@@ -34,9 +35,9 @@
#include "opt_vm.h"
#include <sys/param.h>
-#include <sys/capability.h>
+#include <sys/capsicum.h>
#include <sys/systm.h>
-#include <sys/capability.h>
+#include <sys/capsicum.h>
#include <sys/eventhandler.h>
#include <sys/lock.h>
#include <sys/mutex.h>
@@ -55,8 +56,10 @@
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/pioctl.h>
+#include <sys/ptrace.h>
#include <sys/namei.h>
#include <sys/resourcevar.h>
+#include <sys/rwlock.h>
#include <sys/sched.h>
#include <sys/sdt.h>
#include <sys/sf_buf.h>
@@ -95,15 +98,22 @@
#endif
SDT_PROVIDER_DECLARE(proc);
-SDT_PROBE_DEFINE(proc, kernel, , exec, exec);
-SDT_PROBE_ARGTYPE(proc, kernel, , exec, 0, "char *");
-SDT_PROBE_DEFINE(proc, kernel, , exec_failure, exec-failure);
-SDT_PROBE_ARGTYPE(proc, kernel, , exec_failure, 0, "int");
-SDT_PROBE_DEFINE(proc, kernel, , exec_success, exec-success);
-SDT_PROBE_ARGTYPE(proc, kernel, , exec_success, 0, "char *");
+SDT_PROBE_DEFINE1(proc, , , exec, "char *");
+SDT_PROBE_DEFINE1(proc, , , exec__failure, "int");
+SDT_PROBE_DEFINE1(proc, , , exec__success, "char *");
MALLOC_DEFINE(M_PARGS, "proc-args", "Process arguments");
+int coredump_pack_fileinfo = 1;
+SYSCTL_INT(_kern, OID_AUTO, coredump_pack_fileinfo, CTLFLAG_RWTUN,
+ &coredump_pack_fileinfo, 0,
+ "Enable file path packing in 'procstat -f' coredump notes");
+
+int coredump_pack_vmmapinfo = 1;
+SYSCTL_INT(_kern, OID_AUTO, coredump_pack_vmmapinfo, CTLFLAG_RWTUN,
+ &coredump_pack_vmmapinfo, 0,
+ "Enable file path packing in 'procstat -v' coredump notes");
+
static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS);
static int sysctl_kern_usrstack(SYSCTL_HANDLER_ARGS);
static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS);
@@ -125,6 +135,11 @@
SYSCTL_ULONG(_kern, OID_AUTO, ps_arg_cache_limit, CTLFLAG_RW,
&ps_arg_cache_limit, 0, "");
+static int disallow_high_osrel;
+SYSCTL_INT(_kern, OID_AUTO, disallow_high_osrel, CTLFLAG_RW,
+ &disallow_high_osrel, 0,
+ "Disallow execution of binaries built for higher version of the world");
+
static int map_at_zero = 0;
TUNABLE_INT("security.bsd.map_at_zero", &map_at_zero);
SYSCTL_INT(_security_bsd, OID_AUTO, map_at_zero, CTLFLAG_RW, &map_at_zero, 0,
@@ -193,21 +208,20 @@
#endif
int
-sys_execve(td, uap)
- struct thread *td;
- struct execve_args /* {
- char *fname;
- char **argv;
- char **envv;
- } */ *uap;
+sys_execve(struct thread *td, struct execve_args *uap)
{
+ struct image_args args;
+ struct vmspace *oldvmspace;
int error;
- struct image_args args;
+ error = pre_execve(td, &oldvmspace);
+ if (error != 0)
+ return (error);
error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE,
uap->argv, uap->envv);
if (error == 0)
error = kern_execve(td, &args, NULL);
+ post_execve(td, error, oldvmspace);
return (error);
}
@@ -221,9 +235,13 @@
int
sys_fexecve(struct thread *td, struct fexecve_args *uap)
{
+ struct image_args args;
+ struct vmspace *oldvmspace;
int error;
- struct image_args args;
+ error = pre_execve(td, &oldvmspace);
+ if (error != 0)
+ return (error);
error = exec_copyin_args(&args, NULL, UIO_SYSSPACE,
uap->argv, uap->envv);
if (error == 0) {
@@ -230,6 +248,7 @@
args.fd = uap->fd;
error = kern_execve(td, &args, NULL);
}
+ post_execve(td, error, oldvmspace);
return (error);
}
@@ -243,23 +262,21 @@
#endif
int
-sys___mac_execve(td, uap)
- struct thread *td;
- struct __mac_execve_args /* {
- char *fname;
- char **argv;
- char **envv;
- struct mac *mac_p;
- } */ *uap;
+sys___mac_execve(struct thread *td, struct __mac_execve_args *uap)
{
#ifdef MAC
+ struct image_args args;
+ struct vmspace *oldvmspace;
int error;
- struct image_args args;
+ error = pre_execve(td, &oldvmspace);
+ if (error != 0)
+ return (error);
error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE,
uap->argv, uap->envv);
if (error == 0)
error = kern_execve(td, &args, uap->mac_p);
+ post_execve(td, error, oldvmspace);
return (error);
#else
return (ENOSYS);
@@ -266,39 +283,35 @@
#endif
}
-/*
- * XXX: kern_execve has the astonishing property of not always returning to
- * the caller. If sufficiently bad things happen during the call to
- * do_execve(), it can end up calling exit1(); as a result, callers must
- * avoid doing anything which they might need to undo (e.g., allocating
- * memory).
- */
int
-kern_execve(td, args, mac_p)
- struct thread *td;
- struct image_args *args;
- struct mac *mac_p;
+pre_execve(struct thread *td, struct vmspace **oldvmspace)
{
- struct proc *p = td->td_proc;
+ struct proc *p;
int error;
- AUDIT_ARG_ARGV(args->begin_argv, args->argc,
- args->begin_envv - args->begin_argv);
- AUDIT_ARG_ENVV(args->begin_envv, args->envc,
- args->endp - args->begin_envv);
- if (p->p_flag & P_HADTHREADS) {
+ KASSERT(td == curthread, ("non-current thread %p", td));
+ error = 0;
+ p = td->td_proc;
+ if ((p->p_flag & P_HADTHREADS) != 0) {
PROC_LOCK(p);
- if (thread_single(SINGLE_BOUNDARY)) {
- PROC_UNLOCK(p);
- exec_free_args(args);
- return (ERESTART); /* Try again later. */
- }
+ if (thread_single(p, SINGLE_BOUNDARY) != 0)
+ error = ERESTART;
PROC_UNLOCK(p);
}
+ KASSERT(error != 0 || (td->td_pflags & TDP_EXECVMSPC) == 0,
+ ("nested execve"));
+ *oldvmspace = p->p_vmspace;
+ return (error);
+}
- error = do_execve(td, args, mac_p);
+void
+post_execve(struct thread *td, int error, struct vmspace *oldvmspace)
+{
+ struct proc *p;
- if (p->p_flag & P_HADTHREADS) {
+ KASSERT(td == curthread, ("non-current thread %p", td));
+ p = td->td_proc;
+ if ((p->p_flag & P_HADTHREADS) != 0) {
PROC_LOCK(p);
/*
* If success, we upgrade to SINGLE_EXIT state to
@@ -305,13 +318,35 @@
* force other threads to suicide.
*/
if (error == 0)
- thread_single(SINGLE_EXIT);
+ thread_single(p, SINGLE_EXIT);
else
- thread_single_end();
+ thread_single_end(p, SINGLE_BOUNDARY);
PROC_UNLOCK(p);
}
+ if ((td->td_pflags & TDP_EXECVMSPC) != 0) {
+ KASSERT(p->p_vmspace != oldvmspace,
+ ("oldvmspace still used"));
+ vmspace_free(oldvmspace);
+ td->td_pflags &= ~TDP_EXECVMSPC;
+ }
+}
- return (error);
+/*
+ * XXX: kern_execve has the astonishing property of not always returning to
+ * the caller. If sufficiently bad things happen during the call to
+ * do_execve(), it can end up calling exit1(); as a result, callers must
+ * avoid doing anything which they might need to undo (e.g., allocating
+ * memory).
+ */
+int
+kern_execve(struct thread *td, struct image_args *args, struct mac *mac_p)
+{
+
+ AUDIT_ARG_ARGV(args->begin_argv, args->argc,
+ args->begin_envv - args->begin_argv);
+ AUDIT_ARG_ENVV(args->begin_envv, args->envc,
+ args->endp - args->begin_envv);
+ return (do_execve(td, args, mac_p));
}
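
All three entry points (sys_execve, sys_fexecve, sys___mac_execve) now follow the same bracket, so single-threading and disposal of the old vmspace are handled in exactly one place. The shape of a hypothetical fourth entry point, for illustration only:

    static int
    my_execve_flavor(struct thread *td, struct image_args *args)
    {
        struct vmspace *oldvmspace;
        int error;

        error = pre_execve(td, &oldvmspace);
        if (error != 0)
            return (error);
        error = kern_execve(td, args, NULL);
        post_execve(td, error, oldvmspace); /* frees oldvmspace if replaced */
        return (error);
    }
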
/*
@@ -326,8 +361,8 @@
{
struct proc *p = td->td_proc;
struct nameidata nd;
- struct ucred *newcred = NULL, *oldcred;
- struct uidinfo *euip;
+ struct ucred *oldcred;
+ struct uidinfo *euip = NULL;
register_t *stack_base;
int error, i;
struct image_params image_params, *imgp;
@@ -334,14 +369,14 @@
struct vattr attr;
int (*img_first)(struct image_params *);
struct pargs *oldargs = NULL, *newargs = NULL;
- struct sigacts *oldsigacts, *newsigacts;
+ struct sigacts *oldsigacts = NULL, *newsigacts = NULL;
#ifdef KTRACE
struct vnode *tracevp = NULL;
struct ucred *tracecred = NULL;
#endif
- struct vnode *textvp = NULL, *binvp = NULL;
+ struct vnode *oldtextvp = NULL, *newtextvp;
+ cap_rights_t rights;
int credential_changing;
- int vfslocked;
int textset;
#ifdef MAC
struct label *interpvplabel = NULL;
@@ -352,7 +387,6 @@
#endif
static const char fexecv_proc_title[] = "(fexecv)";
- vfslocked = 0;
imgp = &image_params;
/*
@@ -371,29 +405,11 @@
/*
* Initialize part of the common data
*/
+ bzero(imgp, sizeof(*imgp));
imgp->proc = p;
- imgp->execlabel = NULL;
imgp->attr = &attr;
- imgp->entry_addr = 0;
- imgp->reloc_base = 0;
- imgp->vmspace_destroyed = 0;
- imgp->interpreted = 0;
- imgp->opened = 0;
- imgp->interpreter_name = NULL;
- imgp->auxargs = NULL;
- imgp->vp = NULL;
- imgp->object = NULL;
- imgp->firstpage = NULL;
- imgp->ps_strings = 0;
- imgp->auxarg_size = 0;
imgp->args = args;
- imgp->execpath = imgp->freepath = NULL;
- imgp->execpathp = 0;
- imgp->canary = 0;
- imgp->canarylen = 0;
- imgp->pagesizes = 0;
- imgp->pagesizeslen = 0;
- imgp->stack_prot = 0;
+ oldcred = p->p_ucred;
#ifdef MAC
error = mac_execve_enter(imgp, mac_p);
@@ -401,11 +417,9 @@
goto exec_fail;
#endif
- imgp->image_header = NULL;
-
/*
* Translate the file name. namei() returns a vnode pointer
- * in ni_vp amoung other things.
+ * in ni_vp among other things.
*
* XXXAUDIT: It would be desirable to also audit the name of the
* interpreter if this is an interpreted binary.
@@ -412,10 +426,10 @@
*/
if (args->fname != NULL) {
NDINIT(&nd, LOOKUP, ISOPEN | LOCKLEAF | FOLLOW | SAVENAME
- | MPSAFE | AUDITVNODE1, UIO_SYSSPACE, args->fname, td);
+ | AUDITVNODE1, UIO_SYSSPACE, args->fname, td);
}
- SDT_PROBE(proc, kernel, , exec, args->fname, 0, 0, 0, 0 );
+ SDT_PROBE1(proc, , , exec, args->fname);
interpret:
if (args->fname != NULL) {
@@ -435,24 +449,20 @@
if (error)
goto exec_fail;
- vfslocked = NDHASGIANT(&nd);
- binvp = nd.ni_vp;
- imgp->vp = binvp;
+ newtextvp = nd.ni_vp;
+ imgp->vp = newtextvp;
} else {
AUDIT_ARG_FD(args->fd);
/*
- * Some might argue that CAP_READ and/or CAP_MMAP should also
- * be required here; such arguments will be entertained.
- *
* Descriptors opened only with O_EXEC or O_RDONLY are allowed.
*/
- error = fgetvp_exec(td, args->fd, CAP_FEXECVE, &binvp);
+ error = fgetvp_exec(td, args->fd,
+ cap_rights_init(&rights, CAP_FEXECVE), &newtextvp);
if (error)
goto exec_fail;
- vfslocked = VFS_LOCK_GIANT(binvp->v_mount);
- vn_lock(binvp, LK_EXCLUSIVE | LK_RETRY);
- AUDIT_ARG_VNODE1(binvp);
- imgp->vp = binvp;
+ vn_lock(newtextvp, LK_EXCLUSIVE | LK_RETRY);
+ AUDIT_ARG_VNODE1(newtextvp);
+ imgp->vp = newtextvp;
}
/*
@@ -481,7 +491,101 @@
goto exec_fail_dealloc;
imgp->proc->p_osrel = 0;
+
/*
+ * Implement image setuid/setgid.
+ *
+ * Determine new credentials before attempting image activators
+ * so that it can be used by process_exec handlers to determine
+ * credential/setid changes.
+ *
+ * Don't honor setuid/setgid if the filesystem prohibits it or if
+ * the process is being traced.
+ *
+ * We disable setuid/setgid/etc in capability mode on the basis
+ * that most setugid applications are not written with that
+ * environment in mind, and will therefore almost certainly operate
+ * incorrectly. In principle there's no reason that setugid
+ * applications might not be useful in capability mode, so we may want
+ * to reconsider this conservative design choice in the future.
+ *
+ * XXXMAC: For the time being, use NOSUID to also prohibit
+ * transitions on the file system.
+ */
+ credential_changing = 0;
+ credential_changing |= (attr.va_mode & S_ISUID) &&
+ oldcred->cr_uid != attr.va_uid;
+ credential_changing |= (attr.va_mode & S_ISGID) &&
+ oldcred->cr_gid != attr.va_gid;
+#ifdef MAC
+ will_transition = mac_vnode_execve_will_transition(oldcred, imgp->vp,
+ interpvplabel, imgp);
+ credential_changing |= will_transition;
+#endif
+
+ if (credential_changing &&
+#ifdef CAPABILITY_MODE
+ ((oldcred->cr_flags & CRED_FLAG_CAPMODE) == 0) &&
+#endif
+ (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 &&
+ (p->p_flag & P_TRACED) == 0) {
+ imgp->credential_setid = true;
+ VOP_UNLOCK(imgp->vp, 0);
+ imgp->newcred = crdup(oldcred);
+ if (attr.va_mode & S_ISUID) {
+ euip = uifind(attr.va_uid);
+ change_euid(imgp->newcred, euip);
+ }
+ vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
+ if (attr.va_mode & S_ISGID)
+ change_egid(imgp->newcred, attr.va_gid);
+ /*
+ * Implement correct POSIX saved-id behavior.
+ *
+ * XXXMAC: Note that the current logic will save the
+ * uid and gid if a MAC domain transition occurs, even
+ * though maybe it shouldn't.
+ */
+ change_svuid(imgp->newcred, imgp->newcred->cr_uid);
+ change_svgid(imgp->newcred, imgp->newcred->cr_gid);
+ } else {
+ /*
+ * Implement correct POSIX saved-id behavior.
+ *
+ * XXX: It's not clear that the existing behavior is
+ * POSIX-compliant. A number of sources indicate that the
+ * saved uid/gid should only be updated if the new ruid is
+ * not equal to the old ruid, or the new euid is not equal
+ * to the old euid and the new euid is not equal to the old
+ * ruid. The FreeBSD code always updates the saved uid/gid.
+ * Also, this code uses the new (replaced) euid and egid as
+ * the source, which may or may not be the right ones to use.
+ */
+ if (oldcred->cr_svuid != oldcred->cr_uid ||
+ oldcred->cr_svgid != oldcred->cr_gid) {
+ VOP_UNLOCK(imgp->vp, 0);
+ imgp->newcred = crdup(oldcred);
+ vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
+ change_svuid(imgp->newcred, imgp->newcred->cr_uid);
+ change_svgid(imgp->newcred, imgp->newcred->cr_gid);
+ }
+ }
+ /* The new credentials are installed into the process later. */
+
+ /*
+ * Do the best to calculate the full path to the image file.
+ */
+ if (args->fname != NULL && args->fname[0] == '/')
+ imgp->execpath = args->fname;
+ else {
+ VOP_UNLOCK(imgp->vp, 0);
+ if (vn_fullpath(td, imgp->vp, &imgp->execpath,
+ &imgp->freepath) != 0)
+ imgp->execpath = args->fname;
+ vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY);
+ }
+
+ /*
* If the current process has a special image activator it
* wants to try first, call it. For example, emulating shell
* scripts differently.
@@ -529,19 +633,25 @@
if (args->fname != NULL)
NDFREE(&nd, NDF_ONLY_PNBUF);
#ifdef MAC
- mac_execve_interpreter_enter(binvp, &interpvplabel);
+ mac_execve_interpreter_enter(newtextvp, &interpvplabel);
#endif
if (imgp->opened) {
- VOP_CLOSE(binvp, FREAD, td->td_ucred, td);
+ VOP_CLOSE(newtextvp, FREAD, td->td_ucred, td);
imgp->opened = 0;
}
- vput(binvp);
+ vput(newtextvp);
vm_object_deallocate(imgp->object);
imgp->object = NULL;
- VFS_UNLOCK_GIANT(vfslocked);
- vfslocked = 0;
+ imgp->credential_setid = false;
+ if (imgp->newcred != NULL) {
+ crfree(imgp->newcred);
+ imgp->newcred = NULL;
+ }
+ imgp->execpath = NULL;
+ free(imgp->freepath, M_TEMP);
+ imgp->freepath = NULL;
/* set new name to that of the interpreter */
- NDINIT(&nd, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME | MPSAFE,
+ NDINIT(&nd, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME,
UIO_SYSSPACE, imgp->interpreter_name, td);
args->fname = imgp->interpreter_name;
goto interpret;
@@ -553,13 +663,14 @@
*/
VOP_UNLOCK(imgp->vp, 0);
- /*
- * Do the best to calculate the full path to the image file.
- */
- if (imgp->auxargs != NULL &&
- ((args->fname != NULL && args->fname[0] == '/') ||
- vn_fullpath(td, imgp->vp, &imgp->execpath, &imgp->freepath) != 0))
- imgp->execpath = args->fname;
+ if (disallow_high_osrel &&
+ P_OSREL_MAJOR(p->p_osrel) > P_OSREL_MAJOR(__FreeBSD_version)) {
+ error = ENOEXEC;
+ uprintf("Osrel %d for image %s too high\n", p->p_osrel,
+ imgp->execpath != NULL ? imgp->execpath : "<unresolved>");
+ vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
+ goto exec_fail_dealloc;
+ }
/*
* Copy out strings (args and env) and initialize stack base
@@ -583,13 +694,13 @@
* For security and other reasons, the file descriptor table cannot
* be shared after an exec.
*/
- fdunshare(p, td);
+ fdunshare(td);
+ /* close files on exec */
+ fdcloseexec(td);
/*
* Malloc things before we need locks.
*/
- newcred = crget();
- euip = uifind(attr.va_uid);
i = imgp->args->begin_envv - imgp->args->begin_argv;
/* Cache arguments if they fit inside our allowance */
if (ps_arg_cache_limit >= i + sizeof(struct pargs)) {
@@ -597,13 +708,6 @@
bcopy(imgp->args->begin_argv, newargs->ar_args, i);
}
- /* close files on exec */
- fdcloseexec(td);
- vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
-
- /* Get a reference to the vnode prior to locking the proc */
- VREF(binvp);
-
/*
* For security and other reasons, signal handlers cannot
* be shared after an exec. The new process gets a copy of the old
@@ -610,18 +714,17 @@
* handlers. In execsigs(), the new process will have its signals
* reset.
*/
- PROC_LOCK(p);
- oldcred = crcopysafe(p, newcred);
if (sigacts_shared(p->p_sigacts)) {
oldsigacts = p->p_sigacts;
- PROC_UNLOCK(p);
newsigacts = sigacts_alloc();
sigacts_copy(newsigacts, oldsigacts);
- PROC_LOCK(p);
+ }
+
+ vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
+
+ PROC_LOCK(p);
+ if (oldsigacts)
p->p_sigacts = newsigacts;
- } else
- oldsigacts = NULL;
-
/* Stop profiling */
stopprofclock(p);
@@ -633,7 +736,7 @@
if (args->fname)
bcopy(nd.ni_cnd.cn_nameptr, p->p_comm,
min(nd.ni_cnd.cn_namelen, MAXCOMLEN));
- else if (vn_commname(binvp, p->p_comm, sizeof(p->p_comm)) != 0)
+ else if (vn_commname(newtextvp, p->p_comm, sizeof(p->p_comm)) != 0)
bcopy(fexecv_proc_title, p->p_comm, sizeof(fexecv_proc_title));
bcopy(p->p_comm, td->td_name, sizeof(td->td_name));
#ifdef KTR
@@ -645,44 +748,19 @@
* it that it now has its own resources back
*/
p->p_flag |= P_EXEC;
- if (p->p_pptr && (p->p_flag & P_PPWAIT)) {
+ if ((p->p_flag2 & P2_NOTRACE_EXEC) == 0)
+ p->p_flag2 &= ~P2_NOTRACE;
+ if (p->p_flag & P_PPWAIT) {
p->p_flag &= ~(P_PPWAIT | P_PPTRACE);
cv_broadcast(&p->p_pwait);
+ /* STOPs are no longer ignored, arrange for AST */
+ signotify(td);
}
/*
- * Implement image setuid/setgid.
- *
- * Don't honor setuid/setgid if the filesystem prohibits it or if
- * the process is being traced.
- *
- * We disable setuid/setgid/etc in capability mode on the basis
- * that most setugid applications are not written with that
- * environment in mind, and will therefore almost certainly operate
- * incorrectly. In principle there's no reason that setugid
- * applications might not be useful in capability mode, so we may want
- * to reconsider this conservative design choice in the future.
- *
- * XXXMAC: For the time being, use NOSUID to also prohibit
- * transitions on the file system.
+ * Implement image setuid/setgid installation.
*/
- credential_changing = 0;
- credential_changing |= (attr.va_mode & S_ISUID) && oldcred->cr_uid !=
- attr.va_uid;
- credential_changing |= (attr.va_mode & S_ISGID) && oldcred->cr_gid !=
- attr.va_gid;
-#ifdef MAC
- will_transition = mac_vnode_execve_will_transition(oldcred, imgp->vp,
- interpvplabel, imgp);
- credential_changing |= will_transition;
-#endif
-
- if (credential_changing &&
-#ifdef CAPABILITY_MODE
- ((oldcred->cr_flags & CRED_FLAG_CAPMODE) == 0) &&
-#endif
- (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 &&
- (p->p_flag & P_TRACED) == 0) {
+ if (imgp->credential_setid) {
/*
* Turn off syscall tracing for set-id programs, except for
* root. Record any set-id flags first to make sure that
@@ -710,63 +788,34 @@
error = fdcheckstd(td);
vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
if (error != 0)
- goto done1;
+ goto exec_fail_dealloc;
PROC_LOCK(p);
- /*
- * Set the new credentials.
- */
- if (attr.va_mode & S_ISUID)
- change_euid(newcred, euip);
- if (attr.va_mode & S_ISGID)
- change_egid(newcred, attr.va_gid);
#ifdef MAC
if (will_transition) {
- mac_vnode_execve_transition(oldcred, newcred, imgp->vp,
- interpvplabel, imgp);
+ mac_vnode_execve_transition(oldcred, imgp->newcred,
+ imgp->vp, interpvplabel, imgp);
}
#endif
- /*
- * Implement correct POSIX saved-id behavior.
- *
- * XXXMAC: Note that the current logic will save the
- * uid and gid if a MAC domain transition occurs, even
- * though maybe it shouldn't.
- */
- change_svuid(newcred, newcred->cr_uid);
- change_svgid(newcred, newcred->cr_gid);
- p->p_ucred = newcred;
- newcred = NULL;
} else {
if (oldcred->cr_uid == oldcred->cr_ruid &&
oldcred->cr_gid == oldcred->cr_rgid)
p->p_flag &= ~P_SUGID;
- /*
- * Implement correct POSIX saved-id behavior.
- *
- * XXX: It's not clear that the existing behavior is
- * POSIX-compliant. A number of sources indicate that the
- * saved uid/gid should only be updated if the new ruid is
- * not equal to the old ruid, or the new euid is not equal
- * to the old euid and the new euid is not equal to the old
- * ruid. The FreeBSD code always updates the saved uid/gid.
- * Also, this code uses the new (replaced) euid and egid as
- * the source, which may or may not be the right ones to use.
- */
- if (oldcred->cr_svuid != oldcred->cr_uid ||
- oldcred->cr_svgid != oldcred->cr_gid) {
- change_svuid(newcred, newcred->cr_uid);
- change_svgid(newcred, newcred->cr_gid);
- p->p_ucred = newcred;
- newcred = NULL;
- }
}
+ /*
+ * Set the new credentials.
+ */
+ if (imgp->newcred != NULL) {
+ proc_set_cred(p, imgp->newcred);
+ crfree(oldcred);
+ oldcred = NULL;
+ }
/*
- * Store the vp for use in procfs. This vnode was referenced prior
- * to locking the proc lock.
+ * Store the vp for use in procfs. This vnode was referenced by namei
+ * or fgetvp_exec.
*/
- textvp = p->p_textvp;
- p->p_textvp = binvp;
+ oldtextvp = p->p_textvp;
+ p->p_textvp = newtextvp;
#ifdef KDTRACE_HOOKS
/*
@@ -828,53 +877,9 @@
vfs_mark_atime(imgp->vp, td->td_ucred);
- SDT_PROBE(proc, kernel, , exec_success, args->fname, 0, 0, 0, 0);
+ SDT_PROBE1(proc, , , exec__success, args->fname);
-done1:
- /*
- * Free any resources malloc'd earlier that we didn't use.
- */
- uifree(euip);
- if (newcred == NULL)
- crfree(oldcred);
- else
- crfree(newcred);
- VOP_UNLOCK(imgp->vp, 0);
-
- /*
- * Handle deferred decrement of ref counts.
- */
- if (textvp != NULL) {
- int tvfslocked;
-
- tvfslocked = VFS_LOCK_GIANT(textvp->v_mount);
- vrele(textvp);
- VFS_UNLOCK_GIANT(tvfslocked);
- }
- if (binvp && error != 0)
- vrele(binvp);
-#ifdef KTRACE
- if (tracevp != NULL) {
- int tvfslocked;
-
- tvfslocked = VFS_LOCK_GIANT(tracevp->v_mount);
- vrele(tracevp);
- VFS_UNLOCK_GIANT(tvfslocked);
- }
- if (tracecred != NULL)
- crfree(tracecred);
-#endif
- vn_lock(imgp->vp, LK_SHARED | LK_RETRY);
- pargs_drop(oldargs);
- pargs_drop(newargs);
- if (oldsigacts != NULL)
- sigacts_free(oldsigacts);
-
exec_fail_dealloc:
-
- /*
- * free various allocated resources
- */
if (imgp->firstpage != NULL)
exec_unmap_first_page(imgp);
@@ -883,7 +888,10 @@
NDFREE(&nd, NDF_ONLY_PNBUF);
if (imgp->opened)
VOP_CLOSE(imgp->vp, FREAD, td->td_ucred, td);
- vput(imgp->vp);
+ if (error != 0)
+ vput(imgp->vp);
+ else
+ VOP_UNLOCK(imgp->vp, 0);
}
if (imgp->object != NULL)
@@ -893,7 +901,8 @@
if (error == 0) {
PROC_LOCK(p);
- td->td_dbgflags |= TDB_EXEC;
+ if (p->p_ptevents & PTRACE_EXEC)
+ td->td_dbgflags |= TDB_EXEC;
PROC_UNLOCK(p);
/*
@@ -901,25 +910,43 @@
* the S_EXEC bit set.
*/
STOPEVENT(p, S_EXEC, 0);
- goto done2;
+ } else {
+exec_fail:
+ /* we're done here, clear P_INEXEC */
+ PROC_LOCK(p);
+ p->p_flag &= ~P_INEXEC;
+ PROC_UNLOCK(p);
+
+ SDT_PROBE1(proc, , , exec__failure, error);
}
-exec_fail:
- /* we're done here, clear P_INEXEC */
- PROC_LOCK(p);
- p->p_flag &= ~P_INEXEC;
- PROC_UNLOCK(p);
+ if (imgp->newcred != NULL && oldcred != NULL)
+ crfree(imgp->newcred);
- SDT_PROBE(proc, kernel, , exec_failure, error, 0, 0, 0, 0);
-
-done2:
#ifdef MAC
mac_execve_exit(imgp);
mac_execve_interpreter_exit(interpvplabel);
#endif
- VFS_UNLOCK_GIANT(vfslocked);
exec_free_args(args);
+ /*
+ * Handle deferred decrement of ref counts.
+ */
+ if (oldtextvp != NULL)
+ vrele(oldtextvp);
+#ifdef KTRACE
+ if (tracevp != NULL)
+ vrele(tracevp);
+ if (tracecred != NULL)
+ crfree(tracecred);
+#endif
+ pargs_drop(oldargs);
+ pargs_drop(newargs);
+ if (oldsigacts != NULL)
+ sigacts_free(oldsigacts);
+ if (euip != NULL)
+ uifree(euip);
+
if (error && imgp->vmspace_destroyed) {
/* sorry, no more process anymore. exit gracefully */
exit1(td, W_EXITCODE(0, SIGABRT));
@@ -949,7 +976,7 @@
object = imgp->vp->v_object;
if (object == NULL)
return (EACCES);
- VM_OBJECT_LOCK(object);
+ VM_OBJECT_WLOCK(object);
#if VM_NRESERVLEVEL > 0
if ((object->flags & OBJ_COLORED) == 0) {
object->flags |= OBJ_COLORED;
@@ -956,7 +983,7 @@
object->pg_color = 0;
}
#endif
- ma[0] = vm_page_grab(object, 0, VM_ALLOC_NORMAL | VM_ALLOC_RETRY);
+ ma[0] = vm_page_grab(object, 0, VM_ALLOC_NORMAL);
if (ma[0]->valid != VM_PAGE_BITS_ALL) {
initial_pagein = VM_INITIAL_PAGEIN;
if (initial_pagein > object->size)
@@ -965,9 +992,8 @@
if ((ma[i] = vm_page_next(ma[i - 1])) != NULL) {
if (ma[i]->valid)
break;
- if ((ma[i]->oflags & VPO_BUSY) || ma[i]->busy)
+ if (!vm_page_tryxbusy(ma[i]))
break;
- vm_page_busy(ma[i]);
} else {
ma[i] = vm_page_alloc(object, i,
VM_ALLOC_NORMAL | VM_ALLOC_IFNOTCACHED);
@@ -984,15 +1010,16 @@
vm_page_free(ma[0]);
vm_page_unlock(ma[0]);
}
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
return (EIO);
}
}
+ vm_page_xunbusy(ma[0]);
vm_page_lock(ma[0]);
vm_page_hold(ma[0]);
+ vm_page_activate(ma[0]);
vm_page_unlock(ma[0]);
- vm_page_wakeup(ma[0]);
- VM_OBJECT_UNLOCK(object);
+ VM_OBJECT_WUNLOCK(object);
imgp->firstpage = sf_buf_alloc(ma[0], 0);
imgp->image_header = (char *)sf_buf_kva(imgp->firstpage);
@@ -1017,9 +1044,9 @@
}
/*
- * Destroy old address space, and allocate a new stack
- * The new stack is only SGROWSIZ large because it is grown
- * automatically in trap.c.
+ * Destroy old address space, and allocate a new stack.
+ * The new stack is only sgrowsiz large because it is grown
+ * automatically on a page fault.
*/
int
exec_new_vmspace(imgp, sv)
@@ -1030,6 +1057,7 @@
struct proc *p = imgp->proc;
struct vmspace *vmspace = p->p_vmspace;
vm_object_t obj;
+ struct rlimit rlim_stack;
vm_offset_t sv_minuser, stack_addr;
vm_map_t map;
u_long ssiz;
@@ -1055,6 +1083,10 @@
shmexit(vmspace);
pmap_remove_pages(vmspace_pmap(vmspace));
vm_map_remove(map, vm_map_min(map), vm_map_max(map));
+ /* An exec terminates mlockall(MCL_FUTURE). */
+ vm_map_lock(map);
+ vm_map_modflags(map, 0, MAP_WIREFUTURE);
+ vm_map_unlock(map);
} else {
error = vmspace_exec(p, sv_minuser, sv->sv_maxuser);
if (error)
@@ -1069,31 +1101,42 @@
vm_object_reference(obj);
error = vm_map_fixed(map, obj, 0,
sv->sv_shared_page_base, sv->sv_shared_page_len,
- VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_ALL,
- MAP_COPY_ON_WRITE | MAP_ACC_NO_CHARGE);
- if (error) {
+ VM_PROT_READ | VM_PROT_EXECUTE,
+ VM_PROT_READ | VM_PROT_EXECUTE,
+ MAP_INHERIT_SHARE | MAP_ACC_NO_CHARGE);
+ if (error != KERN_SUCCESS) {
vm_object_deallocate(obj);
- return (error);
+ return (vm_mmap_to_errno(error));
}
}
/* Allocate a new stack */
- if (sv->sv_maxssiz != NULL)
+ if (imgp->stack_sz != 0) {
+ ssiz = trunc_page(imgp->stack_sz);
+ PROC_LOCK(p);
+ lim_rlimit(p, RLIMIT_STACK, &rlim_stack);
+ PROC_UNLOCK(p);
+ if (ssiz > rlim_stack.rlim_max)
+ ssiz = rlim_stack.rlim_max;
+ if (ssiz > rlim_stack.rlim_cur) {
+ rlim_stack.rlim_cur = ssiz;
+ kern_setrlimit(curthread, RLIMIT_STACK, &rlim_stack);
+ }
+ } else if (sv->sv_maxssiz != NULL) {
ssiz = *sv->sv_maxssiz;
- else
+ } else {
ssiz = maxssiz;
+ }
stack_addr = sv->sv_usrstack - ssiz;
error = vm_map_stack(map, stack_addr, (vm_size_t)ssiz,
obj != NULL && imgp->stack_prot != 0 ? imgp->stack_prot :
- sv->sv_stackprot,
- VM_PROT_ALL, MAP_STACK_GROWS_DOWN);
- if (error)
- return (error);
+ sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_DOWN);
+ if (error != KERN_SUCCESS)
+ return (vm_mmap_to_errno(error));
#ifdef __ia64__
/* Allocate a new register stack */
- stack_addr = IA64_BACKINGSTORE;
- error = vm_map_stack(map, stack_addr, (vm_size_t)ssiz,
+ error = vm_map_stack(map, IA64_BACKINGSTORE, (vm_size_t)ssiz,
sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_UP);
if (error)
return (error);
@@ -1104,7 +1147,7 @@
* process stack so we can check the stack rlimit.
*/
vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT;
- vmspace->vm_maxsaddr = (char *)sv->sv_usrstack - ssiz;
+ vmspace->vm_maxsaddr = (char *)stack_addr;
return (0);
}
@@ -1117,7 +1160,7 @@
exec_copyin_args(struct image_args *args, char *fname,
enum uio_seg segflg, char **argv, char **envv)
{
- char *argp, *envp;
+ u_long argp, envp;
int error;
size_t length;
@@ -1153,13 +1196,17 @@
/*
* extract arguments first
*/
- while ((argp = (caddr_t) (intptr_t) fuword(argv++))) {
- if (argp == (caddr_t) -1) {
+ for (;;) {
+ error = fueword(argv++, &argp);
+ if (error == -1) {
error = EFAULT;
goto err_exit;
}
- if ((error = copyinstr(argp, args->endp,
- args->stringspace, &length))) {
+ if (argp == 0)
+ break;
+ error = copyinstr((void *)(uintptr_t)argp, args->endp,
+ args->stringspace, &length);
+ if (error != 0) {
if (error == ENAMETOOLONG)
error = E2BIG;
goto err_exit;
@@ -1175,13 +1222,17 @@
* extract environment strings
*/
if (envv) {
- while ((envp = (caddr_t)(intptr_t)fuword(envv++))) {
- if (envp == (caddr_t)-1) {
+ for (;;) {
+ error = fueword(envv++, &envp);
+ if (error == -1) {
error = EFAULT;
goto err_exit;
}
- if ((error = copyinstr(envp, args->endp,
- args->stringspace, &length))) {
+ if (envp == 0)
+ break;
+ error = copyinstr((void *)(uintptr_t)envp,
+ args->endp, args->stringspace, &length);
+ if (error != 0) {
if (error == ENAMETOOLONG)
error = E2BIG;
goto err_exit;
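
The argv/envv loops move from fuword() to fueword() because fuword() overloads its return value: -1 means either a fault or a fetched word that happens to be -1, so a legal (char *)-1 entry was indistinguishable from EFAULT. fueword() reports the status separately from the value. A kernel-context sketch of the distinction, using a hypothetical helper:

    static int
    fetch_user_ptr(const void *uaddr, u_long *valp)
    {
        long val;

        if (fueword(uaddr, &val) == -1)
            return (EFAULT);    /* unambiguous: the copy faulted */
        *valp = (u_long)val;    /* 0 is the NULL terminator; -1 is
                                 * now a perfectly legal user value */
        return (0);
    }
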
@@ -1208,7 +1259,7 @@
exec_alloc_args(struct image_args *args)
{
- args->buf = (char *)kmem_alloc_wait(exec_map, PATH_MAX + ARG_MAX);
+ args->buf = (char *)kmap_alloc_wait(exec_map, PATH_MAX + ARG_MAX);
return (args->buf != NULL ? 0 : ENOMEM);
}
@@ -1217,7 +1268,7 @@
{
if (args->buf != NULL) {
- kmem_free_wakeup(exec_map, (vm_offset_t)args->buf,
+ kmap_free_wakeup(exec_map, (vm_offset_t)args->buf,
PATH_MAX + ARG_MAX);
args->buf = NULL;
}
@@ -1238,7 +1289,8 @@
{
int argc, envc;
char **vectp;
- char *stringp, *destp;
+ char *stringp;
+ uintptr_t destp;
register_t *stack_base;
struct ps_strings *arginfo;
struct proc *p;
@@ -1262,26 +1314,24 @@
if (p->p_sysent->sv_szsigcode != NULL)
szsigcode = *(p->p_sysent->sv_szsigcode);
}
- destp = (caddr_t)arginfo - szsigcode - SPARE_USRSPACE -
- roundup(execpath_len, sizeof(char *)) -
- roundup(sizeof(canary), sizeof(char *)) -
- roundup(szps, sizeof(char *)) -
- roundup((ARG_MAX - imgp->args->stringspace), sizeof(char *));
+ destp = (uintptr_t)arginfo;
/*
* install sigcode
*/
- if (szsigcode != 0)
- copyout(p->p_sysent->sv_sigcode, ((caddr_t)arginfo -
- szsigcode), szsigcode);
+ if (szsigcode != 0) {
+ destp -= szsigcode;
+ destp = rounddown2(destp, sizeof(void *));
+ copyout(p->p_sysent->sv_sigcode, (void *)destp, szsigcode);
+ }
/*
* Copy the image path for the rtld.
*/
if (execpath_len != 0) {
- imgp->execpathp = (uintptr_t)arginfo - szsigcode - execpath_len;
- copyout(imgp->execpath, (void *)imgp->execpathp,
- execpath_len);
+ destp -= execpath_len;
+ imgp->execpathp = destp;
+ copyout(imgp->execpath, (void *)destp, execpath_len);
}
/*
@@ -1288,19 +1338,23 @@
* Prepare the canary for SSP.
*/
arc4rand(canary, sizeof(canary), 0);
- imgp->canary = (uintptr_t)arginfo - szsigcode - execpath_len -
- sizeof(canary);
- copyout(canary, (void *)imgp->canary, sizeof(canary));
+ destp -= sizeof(canary);
+ imgp->canary = destp;
+ copyout(canary, (void *)destp, sizeof(canary));
imgp->canarylen = sizeof(canary);
/*
* Prepare the pagesizes array.
*/
- imgp->pagesizes = (uintptr_t)arginfo - szsigcode - execpath_len -
- roundup(sizeof(canary), sizeof(char *)) - szps;
- copyout(pagesizes, (void *)imgp->pagesizes, szps);
+ destp -= szps;
+ destp = rounddown2(destp, sizeof(void *));
+ imgp->pagesizes = destp;
+ copyout(pagesizes, (void *)destp, szps);
imgp->pagesizeslen = szps;
+ destp -= ARG_MAX - imgp->args->stringspace;
+ destp = rounddown2(destp, sizeof(void *));
+
/*
* If we have a valid auxargs ptr, prepare some room
* on the stack.
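
The strings-copyout path above now builds the top of the new stack by walking a single cursor (destp) downward from the ps_strings area: subtract each object's size, then re-align with rounddown2() before the copyout. A small userspace sketch of the pattern, with invented addresses and a local macro:

    #include <stdint.h>
    #include <stdio.h>

    #define my_rounddown2(x, y) ((x) & ~((uintptr_t)(y) - 1))

    int
    main(void)
    {
        uintptr_t destp = 0x7fffffffe000;   /* pretend ps_strings base */
        size_t szsigcode = 41, execpath_len = 13;

        destp -= szsigcode;                 /* make room */
        destp = my_rounddown2(destp, sizeof(void *)); /* keep aligned */
        printf("sigcode  at %#jx\n", (uintmax_t)destp);

        destp -= execpath_len;
        printf("execpath at %#jx\n", (uintmax_t)destp);
        return (0);
    }
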
@@ -1325,8 +1379,8 @@
* The '+ 2' is for the null pointers at the end of each of
* the arg and env vector sets
*/
- vectp = (char **)(destp - (imgp->args->argc + imgp->args->envc + 2) *
- sizeof(char *));
+ vectp = (char **)(destp - (imgp->args->argc + imgp->args->envc
+ + 2) * sizeof(char *));
}
/*
@@ -1341,7 +1395,7 @@
/*
* Copy out strings - arguments and environment.
*/
- copyout(stringp, destp, ARG_MAX - imgp->args->stringspace);
+ copyout(stringp, (void *)destp, ARG_MAX - imgp->args->stringspace);
/*
* Fill in "ps_strings" struct for ps, w, etc.
@@ -1468,8 +1522,6 @@
for (es = execsw; *es; es++)
count++;
newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK);
- if (newexecsw == NULL)
- return (ENOMEM);
xs = newexecsw;
if (execsw)
for (es = execsw; *es; es++)
@@ -1502,8 +1554,6 @@
if (*es != execsw_arg)
count++;
newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK);
- if (newexecsw == NULL)
- return (ENOMEM);
xs = newexecsw;
for (es = execsw; *es; es++)
if (*es != execsw_arg)
Modified: trunk/sys/kern/kern_exit.c
===================================================================
--- trunk/sys/kern/kern_exit.c 2018-05-25 20:39:59 UTC (rev 9943)
+++ trunk/sys/kern/kern_exit.c 2018-05-25 20:46:51 UTC (rev 9944)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1982, 1986, 1989, 1991, 1993
* The Regents of the University of California. All rights reserved.
@@ -35,7 +36,7 @@
*/
#include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/kern_exit.c 310585 2016-12-26 10:16:05Z kib $");
#include "opt_compat.h"
#include "opt_kdtrace.h"
@@ -45,7 +46,7 @@
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
-#include <sys/capability.h>
+#include <sys/capsicum.h>
#include <sys/eventhandler.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
@@ -73,6 +74,7 @@
#include <sys/sdt.h>
#include <sys/shm.h>
#include <sys/sem.h>
+#include <sys/umtx.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
@@ -94,22 +96,78 @@
#endif
SDT_PROVIDER_DECLARE(proc);
-SDT_PROBE_DEFINE(proc, kernel, , exit, exit);
-SDT_PROBE_ARGTYPE(proc, kernel, , exit, 0, "int");
+SDT_PROBE_DEFINE1(proc, , , exit, "int");
/* Hook for NFS teardown procedure. */
void (*nlminfo_release_p)(struct proc *p);
+struct proc *
+proc_realparent(struct proc *child)
+{
+ struct proc *p, *parent;
+
+ sx_assert(&proctree_lock, SX_LOCKED);
+ if ((child->p_treeflag & P_TREE_ORPHANED) == 0) {
+ if (child->p_oppid == 0 ||
+ child->p_pptr->p_pid == child->p_oppid)
+ parent = child->p_pptr;
+ else
+ parent = initproc;
+ return (parent);
+ }
+ for (p = child; (p->p_treeflag & P_TREE_FIRST_ORPHAN) == 0;) {
+ /* Cannot use LIST_PREV(), since the list head is not known. */
+ p = __containerof(p->p_orphan.le_prev, struct proc,
+ p_orphan.le_next);
+ KASSERT((p->p_treeflag & P_TREE_ORPHANED) != 0,
+ ("missing P_ORPHAN %p", p));
+ }
+ parent = __containerof(p->p_orphan.le_prev, struct proc,
+ p_orphans.lh_first);
+ return (parent);
+}
+
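
proc_realparent() walks the orphan list backwards through le_prev and recovers each enclosing proc with __containerof(). The core trick, reduced to a userspace example with an invented structure:

    #include <stddef.h>
    #include <stdio.h>

    #define my_containerof(p, type, field)  \
        ((type *)((char *)(p) - offsetof(type, field)))

    struct entry {
        int id;
        void *link;     /* stand-in for the list linkage member */
    };

    int
    main(void)
    {
        struct entry e = { .id = 42 };
        struct entry *back;

        /* Given only &e.link, recover the enclosing struct entry. */
        back = my_containerof(&e.link, struct entry, link);
        printf("id = %d\n", back->id);      /* prints 42 */
        return (0);
    }
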
+void
+reaper_abandon_children(struct proc *p, bool exiting)
+{
+ struct proc *p1, *p2, *ptmp;
+
+ sx_assert(&proctree_lock, SX_LOCKED);
+ KASSERT(p != initproc, ("reaper_abandon_children for initproc"));
+ if ((p->p_treeflag & P_TREE_REAPER) == 0)
+ return;
+ p1 = p->p_reaper;
+ LIST_FOREACH_SAFE(p2, &p->p_reaplist, p_reapsibling, ptmp) {
+ LIST_REMOVE(p2, p_reapsibling);
+ p2->p_reaper = p1;
+ p2->p_reapsubtree = p->p_reapsubtree;
+ LIST_INSERT_HEAD(&p1->p_reaplist, p2, p_reapsibling);
+ if (exiting && p2->p_pptr == p) {
+ PROC_LOCK(p2);
+ proc_reparent(p2, p1);
+ PROC_UNLOCK(p2);
+ }
+ }
+ KASSERT(LIST_EMPTY(&p->p_reaplist), ("p_reaplist not empty"));
+ p->p_treeflag &= ~P_TREE_REAPER;
+}
+
static void
clear_orphan(struct proc *p)
{
+ struct proc *p1;
- PROC_LOCK_ASSERT(p, MA_OWNED);
-
- if (p->p_flag & P_ORPHAN) {
- LIST_REMOVE(p, p_orphan);
- p->p_flag &= ~P_ORPHAN;
+ sx_assert(&proctree_lock, SA_XLOCKED);
+ if ((p->p_treeflag & P_TREE_ORPHANED) == 0)
+ return;
+ if ((p->p_treeflag & P_TREE_FIRST_ORPHAN) != 0) {
+ p1 = LIST_NEXT(p, p_orphan);
+ if (p1 != NULL)
+ p1->p_treeflag |= P_TREE_FIRST_ORPHAN;
+ p->p_treeflag &= ~P_TREE_FIRST_ORPHAN;
}
+ LIST_REMOVE(p, p_orphan);
+ p->p_treeflag &= ~P_TREE_ORPHANED;
}
/*
@@ -131,11 +189,9 @@
void
exit1(struct thread *td, int rv)
{
- struct proc *p, *nq, *q;
- struct vnode *vtmp;
+ struct proc *p, *nq, *q, *t;
+ struct thread *tdt;
struct vnode *ttyvp = NULL;
- struct plimit *plim;
- int locked;
mtx_assert(&Giant, MA_NOTOWNED);
@@ -152,17 +208,24 @@
}
/*
+ * Deref SU mp, since the thread does not return to userspace.
+ */
+ if (softdep_ast_cleanup != NULL)
+ softdep_ast_cleanup();
+
+ /*
* MUST abort all other threads before proceeding past here.
*/
PROC_LOCK(p);
+ /*
+ * First check if some other thread or external request got
+ * here before us. If so, act appropriately: exit or suspend.
+ * We must ensure that stop requests are handled before we set
+ * P_WEXIT.
+ */
+ thread_suspend_check(0);
while (p->p_flag & P_HADTHREADS) {
/*
- * First check if some other thread got here before us..
- * if so, act apropriatly, (exit or suspend);
- */
- thread_suspend_check(0);
-
- /*
* Kill off the other threads. This requires
* some co-operation from other parts of the kernel
* so it may not be instantaneous. With this state set
@@ -179,13 +242,19 @@
* re-check all suspension request, the thread should
* either be suspended there or exit.
*/
- if (! thread_single(SINGLE_EXIT))
+ if (!thread_single(p, SINGLE_EXIT))
+ /*
+ * All other activity in this process is now
+ * stopped. Threading support has been turned
+ * off.
+ */
break;
-
/*
- * All other activity in this process is now stopped.
- * Threading support has been turned off.
+ * Recheck for new stop or suspend requests which
+ * might appear while process lock was dropped in
+ * thread_single().
*/
+ thread_suspend_check(0);
}
KASSERT(p->p_numthreads == 1,
("exit1: proc %p exiting with %d threads", p, p->p_numthreads));
@@ -268,6 +337,7 @@
rv = p->p_xstat; /* Event handler could change exit status */
stopprofclock(p);
p->p_flag &= ~(P_TRACED | P_PPWAIT | P_PPTRACE);
+ p->p_ptevents = 0;
/*
* Stop the real interval timer. If the handler is currently
@@ -298,7 +368,7 @@
* Close open files and release open-file table.
* This may block!
*/
- fdfree(td);
+ fdescfree(td);
/*
* If this thread tickled GEOM, we need to wait for the giggling to
@@ -381,21 +451,16 @@
/*
* Release reference to text vnode
*/
- if ((vtmp = p->p_textvp) != NULL) {
+ if (p->p_textvp != NULL) {
+ vrele(p->p_textvp);
p->p_textvp = NULL;
- locked = VFS_LOCK_GIANT(vtmp->v_mount);
- vrele(vtmp);
- VFS_UNLOCK_GIANT(locked);
}
/*
* Release our limits structure.
*/
- PROC_LOCK(p);
- plim = p->p_limit;
+ lim_free(p->p_limit);
p->p_limit = NULL;
- PROC_UNLOCK(p);
- lim_free(plim);
tidhash_remove(td);
@@ -420,25 +485,40 @@
WITNESS_WARN(WARN_PANIC, NULL, "process (pid %d) exiting", p->p_pid);
/*
- * Reparent all of our children to init.
+ * Reparent all children processes:
+ * - traced ones to the original parent (or init if we are that parent)
+ * - the rest to init
*/
sx_xlock(&proctree_lock);
q = LIST_FIRST(&p->p_children);
if (q != NULL) /* only need this if any child is S_ZOMB */
- wakeup(initproc);
+ wakeup(q->p_reaper);
for (; q != NULL; q = nq) {
nq = LIST_NEXT(q, p_sibling);
PROC_LOCK(q);
- proc_reparent(q, initproc);
q->p_sigparent = SIGCHLD;
- /*
- * Traced processes are killed
- * since their existence means someone is screwing up.
- */
- if (q->p_flag & P_TRACED) {
- struct thread *temp;
+ if (!(q->p_flag & P_TRACED)) {
+ proc_reparent(q, q->p_reaper);
+ if (q->p_state == PRS_ZOMBIE) {
+ PROC_LOCK(q->p_reaper);
+ pksignal(q->p_reaper, SIGCHLD, q->p_ksi);
+ PROC_UNLOCK(q->p_reaper);
+ }
+ } else {
/*
+ * Traced processes are killed since their existence
+ * means someone is screwing up.
+ */
+ t = proc_realparent(q);
+ if (t == p) {
+ proc_reparent(q, q->p_reaper);
+ } else {
+ PROC_LOCK(t);
+ proc_reparent(q, t);
+ PROC_UNLOCK(t);
+ }
+ /*
* Since q was found on our children list, the
* proc_reparent() call moved q to the orphan
* list due to present P_TRACED flag. Clear
@@ -446,8 +526,12 @@
*/
clear_orphan(q);
q->p_flag &= ~(P_TRACED | P_STOPPED_TRACE);
- FOREACH_THREAD_IN_PROC(q, temp)
- temp->td_dbgflags &= ~TDB_SUSPEND;
+ q->p_flag2 &= ~P2_PTRACE_FSTP;
+ q->p_ptevents = 0;
+ FOREACH_THREAD_IN_PROC(q, tdt) {
+ tdt->td_dbgflags &= ~(TDB_SUSPEND | TDB_XSIG |
+ TDB_FSTP);
+ }
kern_psignal(q, SIGKILL);
}
PROC_UNLOCK(q);
@@ -458,6 +542,8 @@
*/
while ((q = LIST_FIRST(&p->p_orphans)) != NULL) {
PROC_LOCK(q);
+ CTR2(KTR_PTRACE, "exit: pid %d, clearing orphan %d", p->p_pid,
+ q->p_pid);
clear_orphan(q);
PROC_UNLOCK(q);
}
@@ -489,7 +575,7 @@
reason = CLD_DUMPED;
else if (WIFSIGNALED(rv))
reason = CLD_KILLED;
- SDT_PROBE(proc, kernel, , exit, reason, 0, 0, 0, 0);
+ SDT_PROBE1(proc, , , exit, reason);
#endif
/*
@@ -523,7 +609,7 @@
mtx_unlock(&p->p_pptr->p_sigacts->ps_mtx);
pp = p->p_pptr;
PROC_UNLOCK(pp);
- proc_reparent(p, initproc);
+ proc_reparent(p, p->p_reaper);
p->p_sigparent = SIGCHLD;
PROC_LOCK(p->p_pptr);
@@ -536,8 +622,8 @@
} else
mtx_unlock(&p->p_pptr->p_sigacts->ps_mtx);
- if (p->p_pptr == initproc)
- kern_psignal(p->p_pptr, SIGCHLD);
+ if (p->p_pptr == p->p_reaper || p->p_pptr == initproc)
+ childproc_exited(p);
else if (p->p_sigparent != 0) {
if (p->p_sigparent == SIGCHLD)
childproc_exited(p);
@@ -571,6 +657,7 @@
wakeup(p->p_pptr);
cv_broadcast(&p->p_pwait);
sched_exit(p->p_pptr, td);
+ umtx_thread_exit(td);
PROC_SLOCK(p);
p->p_state = PRS_ZOMBIE;
PROC_UNLOCK(p->p_pptr);
@@ -584,7 +671,9 @@
/*
* Save our children's rusage information in our exit rusage.
*/
+ PROC_STATLOCK(p);
ruadd(&p->p_ru, &p->p_rux, &p->p_stats->p_cru, &p->p_crux);
+ PROC_STATUNLOCK(p);
/*
* Make sure the scheduler takes this thread out of its tables etc.
@@ -702,9 +791,9 @@
else
rup = NULL;
error = kern_wait(td, uap->pid, &status, uap->options, rup);
- if (uap->status != NULL && error == 0)
+ if (uap->status != NULL && error == 0 && td->td_retval[0] != 0)
error = copyout(&status, uap->status, sizeof(status));
- if (uap->rusage != NULL && error == 0)
+ if (uap->rusage != NULL && error == 0 && td->td_retval[0] != 0)
error = copyout(&ru, uap->rusage, sizeof(struct rusage));
return (error);
}
@@ -713,10 +802,10 @@
sys_wait6(struct thread *td, struct wait6_args *uap)
{
struct __wrusage wru, *wrup;
- siginfo_t si, *sip;
- int error, status;
+ siginfo_t si, *sip;
idtype_t idtype;
id_t id;
+ int error, status;
idtype = uap->idtype;
id = uap->id;
@@ -738,9 +827,9 @@
*/
error = kern_wait6(td, idtype, id, &status, uap->options, wrup, sip);
- if (uap->status != NULL && error == 0)
+ if (uap->status != NULL && error == 0 && td->td_retval[0] != 0)
error = copyout(&status, uap->status, sizeof(status));
- if (uap->wrusage != NULL && error == 0)
+ if (uap->wrusage != NULL && error == 0 && td->td_retval[0] != 0)
error = copyout(&wru, uap->wrusage, sizeof(wru));
if (uap->info != NULL && error == 0)
error = copyout(&si, uap->info, sizeof(si));
@@ -781,14 +870,19 @@
PROC_LOCK(q);
sigqueue_take(p->p_ksi);
PROC_UNLOCK(q);
- PROC_UNLOCK(p);
/*
* If we got the child via a ptrace 'attach', we need to give it back
* to the old parent.
*/
- if (p->p_oppid && (t = pfind(p->p_oppid)) != NULL) {
+ if (p->p_oppid != 0 && p->p_oppid != p->p_pptr->p_pid) {
+ PROC_UNLOCK(p);
+ t = proc_realparent(p);
+ PROC_LOCK(t);
PROC_LOCK(p);
+ CTR2(KTR_PTRACE,
+ "wait: traced child %d moved back to parent %d", p->p_pid,
+ t->p_pid);
proc_reparent(p, t);
p->p_oppid = 0;
PROC_UNLOCK(p);
@@ -799,6 +893,8 @@
sx_xunlock(&proctree_lock);
return;
}
+ p->p_oppid = 0;
+ PROC_UNLOCK(p);
/*
* Remove other references to this process to ensure we have an
@@ -808,6 +904,8 @@
LIST_REMOVE(p, p_list); /* off zombproc */
sx_xunlock(&allproc_lock);
LIST_REMOVE(p, p_sibling);
+ reaper_abandon_children(p, true);
+ LIST_REMOVE(p, p_reapsibling);
PROC_LOCK(p);
clear_orphan(p);
PROC_UNLOCK(p);
@@ -838,9 +936,11 @@
* Destroy resource accounting information associated with the process.
*/
#ifdef RACCT
- PROC_LOCK(p);
- racct_sub(p, RACCT_NPROC, 1);
- PROC_UNLOCK(p);
+ if (racct_enable) {
+ PROC_LOCK(p);
+ racct_sub(p, RACCT_NPROC, 1);
+ PROC_UNLOCK(p);
+ }
#endif
racct_proc_exit(p);
@@ -848,7 +948,7 @@
* Free credentials, arguments, and sigacts.
*/
crfree(p->p_ucred);
- p->p_ucred = NULL;
+ proc_set_cred(p, NULL);
pargs_drop(p->p_args);
p->p_args = NULL;
sigacts_free(p->p_sigacts);
@@ -870,21 +970,18 @@
KASSERT(FIRST_THREAD_IN_PROC(p),
("proc_reap: no residual thread!"));
uma_zfree(proc_zone, p);
- sx_xlock(&allproc_lock);
- nprocs--;
- sx_xunlock(&allproc_lock);
+ atomic_add_int(&nprocs, -1);
}
static int
proc_to_reap(struct thread *td, struct proc *p, idtype_t idtype, id_t id,
- int *status, int options, struct __wrusage *wrusage, siginfo_t *siginfo)
+ int *status, int options, struct __wrusage *wrusage, siginfo_t *siginfo,
+ int check_only)
{
- struct proc *q;
struct rusage *rup;
sx_assert(&proctree_lock, SA_XLOCKED);
- q = td->td_proc;
PROC_LOCK(p);
switch (idtype) {
@@ -921,8 +1018,7 @@
}
break;
case P_JAILID:
- if (p->p_ucred->cr_prison == NULL ||
- (p->p_ucred->cr_prison->pr_id != (int)id)) {
+ if (p->p_ucred->cr_prison->pr_id != (int)id) {
PROC_UNLOCK(p);
return (0);
}
@@ -935,7 +1031,6 @@
default:
PROC_UNLOCK(p);
return (0);
- break;
}
if (p_canwait(td, p)) {
@@ -962,10 +1057,8 @@
return (0);
}
- PROC_SLOCK(p);
-
if (siginfo != NULL) {
- bzero (siginfo, sizeof (*siginfo));
+ bzero(siginfo, sizeof(*siginfo));
siginfo->si_errno = 0;
/*
@@ -980,16 +1073,19 @@
* This is still a rough estimate. We will fix the
* cases TRAPPED, STOPPED, and CONTINUED later.
*/
- if (WCOREDUMP(p->p_xstat))
+ if (WCOREDUMP(p->p_xstat)) {
siginfo->si_code = CLD_DUMPED;
- else if (WIFSIGNALED(p->p_xstat))
+ siginfo->si_status = WTERMSIG(p->p_xstat);
+ } else if (WIFSIGNALED(p->p_xstat)) {
siginfo->si_code = CLD_KILLED;
- else
+ siginfo->si_status = WTERMSIG(p->p_xstat);
+ } else {
siginfo->si_code = CLD_EXITED;
+ siginfo->si_status = WEXITSTATUS(p->p_xstat);
+ }
siginfo->si_pid = p->p_pid;
siginfo->si_uid = p->p_ucred->cr_uid;
- siginfo->si_status = p->p_xstat;
/*
* The si_addr field would be useful additional
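
The siginfo hunk replaces the raw p_xstat assignment with a properly decoded si_status: the terminating signal for killed or core-dumping children, the exit code for normal exits. The same mapping restated as a standalone helper built on the standard wait-status macros:

#include <sys/wait.h>
#include <signal.h>

struct cld_info {
	int	code;		/* CLD_DUMPED, CLD_KILLED or CLD_EXITED */
	int	status;		/* signal number or exit code */
};

static struct cld_info
decode_wait_status(int xstat)
{
	struct cld_info ci;

	if (WCOREDUMP(xstat)) {
		ci.code = CLD_DUMPED;
		ci.status = WTERMSIG(xstat);
	} else if (WIFSIGNALED(xstat)) {
		ci.code = CLD_KILLED;
		ci.status = WTERMSIG(xstat);
	} else {
		ci.code = CLD_EXITED;
		ci.status = WEXITSTATUS(xstat);
	}
	return (ci);
}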
@@ -1007,7 +1103,9 @@
if (wrusage != NULL) {
rup = &wrusage->wru_self;
*rup = p->p_ru;
+ PROC_STATLOCK(p);
calcru(p, &rup->ru_utime, &rup->ru_stime);
+ PROC_STATUNLOCK(p);
rup = &wrusage->wru_children;
*rup = p->p_stats->p_cru;
@@ -1014,11 +1112,11 @@
calccru(p, &rup->ru_utime, &rup->ru_stime);
}
- if (p->p_state == PRS_ZOMBIE) {
+ if (p->p_state == PRS_ZOMBIE && !check_only) {
+ PROC_SLOCK(p);
proc_reap(td, p, status, options);
return (-1);
}
- PROC_SUNLOCK(p);
PROC_UNLOCK(p);
return (1);
}
@@ -1054,8 +1152,8 @@
wrup = NULL;
/*
- * For backward compatibility we implicitly add flags WEXITED
- * and WTRAPPED here.
+ * For backward compatibility we implicitly add flags WEXITED
+ * and WTRAPPED here.
*/
options |= WEXITED | WTRAPPED;
ret = kern_wait6(td, idtype, id, status, options, wrup, NULL);
@@ -1108,7 +1206,7 @@
sx_xlock(&proctree_lock);
LIST_FOREACH(p, &q->p_children, p_sibling) {
ret = proc_to_reap(td, p, idtype, id, status, options,
- wrusage, siginfo);
+ wrusage, siginfo, 0);
if (ret == 0)
continue;
else if (ret == 1)
@@ -1142,6 +1240,10 @@
PROC_UNLOCK(q);
}
+ CTR4(KTR_PTRACE,
+ "wait: returning trapped pid %d status %#x (xstat %d) xthread %d",
+ p->p_pid, W_STOPCODE(p->p_xstat), p->p_xstat,
+ p->p_xthread != NULL ? p->p_xthread->td_tid : -1);
PROC_UNLOCK(p);
return (0);
}
@@ -1206,15 +1308,17 @@
* for. By maintaining a list of orphans we allow the parent
* to successfully wait until the child becomes a zombie.
*/
- LIST_FOREACH(p, &q->p_orphans, p_orphan) {
- ret = proc_to_reap(td, p, idtype, id, status, options,
- wrusage, siginfo);
- if (ret == 0)
- continue;
- else if (ret == 1)
- nfound++;
- else
- return (0);
+ if (nfound == 0) {
+ LIST_FOREACH(p, &q->p_orphans, p_orphan) {
+ ret = proc_to_reap(td, p, idtype, id, NULL, options,
+ NULL, NULL, 1);
+ if (ret != 0) {
+ KASSERT(ret != -1, ("reaped an orphan (pid %d)",
+ (int)td->td_retval[0]));
+ nfound++;
+ break;
+ }
+ }
}
if (nfound == 0) {
sx_xunlock(&proctree_lock);
@@ -1259,8 +1363,15 @@
clear_orphan(child);
if (child->p_flag & P_TRACED) {
- LIST_INSERT_HEAD(&child->p_pptr->p_orphans, child, p_orphan);
- child->p_flag |= P_ORPHAN;
+ if (LIST_EMPTY(&child->p_pptr->p_orphans)) {
+ child->p_treeflag |= P_TREE_FIRST_ORPHAN;
+ LIST_INSERT_HEAD(&child->p_pptr->p_orphans, child,
+ p_orphan);
+ } else {
+ LIST_INSERT_AFTER(LIST_FIRST(&child->p_pptr->p_orphans),
+ child, p_orphan);
+ }
+ child->p_treeflag |= P_TREE_ORPHANED;
}
child->p_pptr = parent;
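
The proc_reparent() change keeps the P_TREE_FIRST_ORPHAN entry pinned at the front of the orphan list: only the very first orphan is inserted at the head and flagged; later orphans are inserted right after it. A self-contained sketch of that insertion discipline using sys/queue.h (struct xproc and the flag values are illustrative stand-ins, not the kernel's definitions):

#include <sys/queue.h>

#define	P_TREE_FIRST_ORPHAN	0x1	/* illustrative flag values */
#define	P_TREE_ORPHANED		0x2

struct xproc {
	LIST_ENTRY(xproc)	p_orphan;
	unsigned		p_treeflag;
};
LIST_HEAD(orphanlist, xproc);

/*
 * Only the first orphan is inserted at the head (and flagged); later
 * orphans go right after it, so the P_TREE_FIRST_ORPHAN entry stays
 * at the front of the list for the list's lifetime.
 */
static void
orphan_insert(struct orphanlist *head, struct xproc *child)
{
	if (LIST_EMPTY(head)) {
		child->p_treeflag |= P_TREE_FIRST_ORPHAN;
		LIST_INSERT_HEAD(head, child, p_orphan);
	} else {
		LIST_INSERT_AFTER(LIST_FIRST(head), child, p_orphan);
	}
	child->p_treeflag |= P_TREE_ORPHANED;
}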
Modified: trunk/sys/kern/kern_fail.c
===================================================================
--- trunk/sys/kern/kern_fail.c 2018-05-25 20:39:59 UTC (rev 9943)
+++ trunk/sys/kern/kern_fail.c 2018-05-25 20:46:51 UTC (rev 9944)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 2009 Isilon Inc http://www.isilon.com/
*
@@ -50,7 +51,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/kern_fail.c 227293 2011-11-07 06:44:47Z ed $");
#include <sys/ctype.h>
#include <sys/errno.h>
Modified: trunk/sys/kern/kern_fork.c
===================================================================
--- trunk/sys/kern/kern_fork.c 2018-05-25 20:39:59 UTC (rev 9943)
+++ trunk/sys/kern/kern_fork.c 2018-05-25 20:46:51 UTC (rev 9944)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1982, 1986, 1989, 1991, 1993
* The Regents of the University of California. All rights reserved.
@@ -35,7 +36,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/kern_fork.c 321020 2017-07-15 17:25:40Z dchagin $");
#include "opt_kdtrace.h"
#include "opt_ktrace.h"
@@ -59,6 +60,7 @@
#include <sys/proc.h>
#include <sys/procdesc.h>
#include <sys/pioctl.h>
+#include <sys/ptrace.h>
#include <sys/racct.h>
#include <sys/resourcevar.h>
#include <sys/sched.h>
@@ -89,10 +91,7 @@
#endif
SDT_PROVIDER_DECLARE(proc);
-SDT_PROBE_DEFINE(proc, kernel, , create, create);
-SDT_PROBE_ARGTYPE(proc, kernel, , create, 0, "struct proc *");
-SDT_PROBE_ARGTYPE(proc, kernel, , create, 1, "struct proc *");
-SDT_PROBE_ARGTYPE(proc, kernel, , create, 2, "int");
+SDT_PROBE_DEFINE3(proc, , , create, "struct proc *", "struct proc *", "int");
#ifndef _SYS_SYSPROTO_H_
struct fork_args {
@@ -269,11 +268,21 @@
* Scan the active and zombie procs to check whether this pid
* is in use. Remember the lowest pid that's greater
* than trypid, so we can avoid checking for a while.
+ *
+ * Avoid reuse of the process group id, session id or
+ * the reaper subtree id. Note that for process groups
+ * and sessions, the number of reserved pids is limited
+ * by the process limit. For the subtree ids, an id is
+ * kept reserved only while there is a non-reaped
+ * process in the subtree, so the number of reserved
+ * pids is limited by twice the process limit.
*/
p = LIST_FIRST(&allproc);
again:
for (; p != NULL; p = LIST_NEXT(p, p_list)) {
while (p->p_pid == trypid ||
+ p->p_reapsubtree == trypid ||
(p->p_pgrp != NULL &&
(p->p_pgrp->pg_id == trypid ||
(p->p_session != NULL &&
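
Per the new comment, the scan now also rejects a candidate pid that matches some process's reaper subtree id, alongside the existing pid, process group and session checks (the condition is cut off at the hunk boundary above). A flattened model of the test, with a plain array standing in for the allproc list and illustrative field names:

#include <stdbool.h>
#include <stddef.h>

struct pentry {			/* stand-in for the relevant proc fields */
	int	pid;
	int	pgid;
	int	sid;
	int	reapsubtree;
};

static bool
pid_in_use(const struct pentry *tab, size_t n, int trypid)
{
	size_t i;

	for (i = 0; i < n; i++) {
		if (tab[i].pid == trypid || tab[i].pgid == trypid ||
		    tab[i].sid == trypid || tab[i].reapsubtree == trypid)
			return (true);
	}
	return (false);
}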
@@ -325,7 +334,7 @@
if (((p1->p_flag & (P_HADTHREADS|P_SYSTEM)) == P_HADTHREADS) &&
(flags & (RFCFDG | RFFDG))) {
PROC_LOCK(p1);
- if (thread_single(SINGLE_BOUNDARY)) {
+ if (thread_single(p1, SINGLE_BOUNDARY)) {
PROC_UNLOCK(p1);
return (ERESTART);
}
@@ -342,7 +351,7 @@
if (flags & RFCFDG) {
struct filedesc *fdtmp;
fdtmp = fdinit(td->td_proc->p_fd);
- fdfree(td);
+ fdescfree(td);
p1->p_fd = fdtmp;
}
@@ -349,14 +358,14 @@
/*
* Unshare file descriptors (from parent).
*/
- if (flags & RFFDG)
- fdunshare(p1, td);
+ if (flags & RFFDG)
+ fdunshare(td);
fail:
if (((p1->p_flag & (P_HADTHREADS|P_SYSTEM)) == P_HADTHREADS) &&
(flags & (RFCFDG | RFFDG))) {
PROC_LOCK(p1);
- thread_single_end();
+ thread_single_end(p1, SINGLE_BOUNDARY);
PROC_UNLOCK(p1);
}
return (error);
@@ -378,12 +387,6 @@
p2_held = 0;
p1 = td->td_proc;
- /*
- * Increment the nprocs resource before blocking can occur. There
- * are hard-limits as to the number of processes that can run.
- */
- nprocs++;
-
trypid = fork_findpid(flags);
sx_sunlock(&proctree_lock);
@@ -392,6 +395,7 @@
p2->p_pid = trypid;
AUDIT_ARG_PID(p2->p_pid);
LIST_INSERT_HEAD(&allproc, p2, p_list);
+ allproc_gen++;
LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash);
tidhash_add(td2);
PROC_LOCK(p2);
@@ -401,14 +405,17 @@
bcopy(&p1->p_startcopy, &p2->p_startcopy,
__rangeof(struct proc, p_startcopy, p_endcopy));
+ p2->p_elf_machine = p1->p_elf_machine;
+ p2->p_elf_flags = p1->p_elf_flags;
pargs_hold(p2->p_args);
PROC_UNLOCK(p1);
bzero(&p2->p_startzero,
__rangeof(struct proc, p_startzero, p_endzero));
+ p2->p_treeflag = 0;
+ p2->p_filemon = NULL;
+ p2->p_ptevents = 0;
- p2->p_ucred = crhold(td->td_ucred);
-
/* Tell the prison that we exist. */
prison_proc_hold(p2->p_ucred->cr_prison);
@@ -465,6 +472,8 @@
bzero(&td2->td_startzero,
__rangeof(struct thread, td_startzero, td_endzero));
+ td2->td_su = NULL;
+ td2->td_sleeptimo = 0;
bcopy(&td->td_startcopy, &td2->td_startcopy,
__rangeof(struct thread, td_startcopy, td_endcopy));
@@ -471,9 +480,10 @@
bcopy(&p2->p_comm, &td2->td_name, sizeof(td2->td_name));
td2->td_sigstk = td->td_sigstk;
- td2->td_sigmask = td->td_sigmask;
td2->td_flags = TDF_INMEM;
td2->td_lend_user_pri = PRI_MAX;
+ td2->td_dbg_sc_code = td->td_dbg_sc_code;
+ td2->td_dbg_sc_narg = td->td_dbg_sc_narg;
#ifdef VIMAGE
td2->td_vnet = NULL;
@@ -492,6 +502,7 @@
* Increase reference counts on shared objects.
*/
p2->p_flag = P_INMEM;
+ p2->p_flag2 = p1->p_flag2 & (P2_NOTRACE | P2_NOTRACE_EXEC);
p2->p_swtick = ticks;
if (p1->p_flag & P_PROFIL)
startprofclock(p2);
@@ -515,6 +526,11 @@
p2->p_fd = fd;
p2->p_fdtol = fdtol;
+ if (p1->p_flag2 & P2_INHERIT_PROTECTED) {
+ p2->p_flag |= P_PROTECTED;
+ p2->p_flag2 |= P2_INHERIT_PROTECTED;
+ }
+
/*
* p_limit is copy-on-write. Bump its refcount.
*/
@@ -613,12 +629,22 @@
* of init. This effectively disassociates the child from the
* parent.
*/
- if (flags & RFNOWAIT)
- pptr = initproc;
- else
+ if ((flags & RFNOWAIT) != 0) {
+ pptr = p1->p_reaper;
+ p2->p_reaper = pptr;
+ } else {
+ p2->p_reaper = (p1->p_treeflag & P_TREE_REAPER) != 0 ?
+ p1 : p1->p_reaper;
pptr = p1;
+ }
p2->p_pptr = pptr;
LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling);
+ LIST_INIT(&p2->p_reaplist);
+ LIST_INSERT_HEAD(&p2->p_reaper->p_reaplist, p2, p_reapsibling);
+ if (p2->p_reaper == p1)
+ p2->p_reapsubtree = p2->p_pid;
+ else
+ p2->p_reapsubtree = p1->p_reapsubtree;
sx_xunlock(&proctree_lock);
/* Inform accounting that we have forked. */
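
The reaper wiring above replaces the old rule that RFNOWAIT children are reparented to init: an RFNOWAIT child now goes to the parent's reaper, a parent flagged P_TREE_REAPER reaps its own children, and a reaper's direct children start a new subtree keyed by their own pid. A sketch of the selection logic under hypothetical field names mirroring the hunk:

#include <stdbool.h>

#define	P_TREE_REAPER	0x4	/* illustrative flag value */

struct rproc {
	int		pid;
	unsigned	treeflag;
	struct rproc	*reaper;
	int		reapsubtree;
};

static void
assign_reaper(struct rproc *parent, struct rproc *child, bool rfnowait)
{
	if (rfnowait)
		child->reaper = parent->reaper;
	else
		child->reaper = (parent->treeflag & P_TREE_REAPER) != 0 ?
		    parent : parent->reaper;
	/* A reaper's own children start a new subtree keyed by their pid. */
	child->reapsubtree = (child->reaper == parent) ?
	    child->pid : parent->reapsubtree;
}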
@@ -681,16 +707,15 @@
#ifdef KDTRACE_HOOKS
/*
- * Tell the DTrace fasttrap provider about the new process
- * if it has registered an interest. We have to do this only after
- * p_state is PRS_NORMAL since the fasttrap module will use pfind()
- * later on.
+ * Tell the DTrace fasttrap provider about the new process so that any
+ * tracepoints inherited from the parent can be removed. We have to do
+ * this only after p_state is PRS_NORMAL since the fasttrap module will
+ * use pfind() later on.
*/
- if (dtrace_fasttrap_fork)
+ if ((flags & RFMEM) == 0 && dtrace_fasttrap_fork)
dtrace_fasttrap_fork(p1, p2);
#endif
- if ((p1->p_flag & (P_TRACED | P_FOLLOWFORK)) == (P_TRACED |
- P_FOLLOWFORK)) {
+ if (p1->p_ptevents & PTRACE_FORK) {
/*
* Arrange for debugger to receive the fork event.
*
@@ -707,6 +732,7 @@
if (flags & RFPPWAIT) {
td->td_pflags |= TDP_RFPPWAIT;
td->td_rfppwait_p = p2;
+ td->td_dbgflags |= TDB_VFORK;
}
PROC_UNLOCK(p2);
if ((flags & RFSTOPPED) == 0) {
@@ -730,7 +756,7 @@
* Tell any interested parties about the new process.
*/
knote_fork(&p1->p_klist, p2->p_pid);
- SDT_PROBE(proc, kernel, , create, p2, p1, flags, 0, 0);
+ SDT_PROBE3(proc, , , create, p2, p1, flags);
/*
* Wait until debugger is attached to child.
@@ -747,18 +773,16 @@
fork1(struct thread *td, int flags, int pages, struct proc **procp,
int *procdescp, int pdflags)
{
- struct proc *p1;
- struct proc *newproc;
- int ok;
+ struct proc *p1, *newproc;
struct thread *td2;
struct vmspace *vm2;
+#ifdef PROCDESC
+ struct file *fp_procdesc;
+#endif
vm_ooffset_t mem_charged;
- int error;
+ int error, nprocs_new, ok;
static int curfail;
static struct timeval lastfail;
-#ifdef PROCDESC
- struct file *fp_procdesc = NULL;
-#endif
/* Check for the undefined or unimplemented flags. */
if ((flags & ~(RFFLAGS | RFTSIGFLAGS(RFTSIGMASK))) != 0)
@@ -800,7 +824,38 @@
}
#ifdef PROCDESC
+ fp_procdesc = NULL;
+#endif
+ newproc = NULL;
+ vm2 = NULL;
+
/*
+ * Increment the nprocs resource before allocations occur.
+ * Although process entries are dynamically created, we still
+ * keep a global limit on the maximum number we will
+ * create. There are hard-limits as to the number of processes
+ * that can run, established by the KVA and memory usage for
+ * the process data.
+ *
+ * Don't allow a nonprivileged user to use the last ten
+ * processes; don't let root exceed the limit.
+ */
+ nprocs_new = atomic_fetchadd_int(&nprocs, 1) + 1;
+ if ((nprocs_new >= maxproc - 10 && priv_check_cred(td->td_ucred,
+ PRIV_MAXPROC, 0) != 0) || nprocs_new >= maxproc) {
+ sx_xlock(&allproc_lock);
+ if (ppsratecheck(&lastfail, &curfail, 1)) {
+ printf("maxproc limit exceeded by uid %u (pid %d); "
+ "see tuning(7) and login.conf(5)\n",
+ td->td_ucred->cr_ruid, p1->p_pid);
+ }
+ sx_xunlock(&allproc_lock);
+ error = EAGAIN;
+ goto fail1;
+ }
+
+#ifdef PROCDESC
+ /*
* If required, create a process descriptor in the parent first; we
* will abandon it if something goes wrong. We don't finit() until
* later.
@@ -808,12 +863,11 @@
if (flags & RFPROCDESC) {
error = falloc(td, &fp_procdesc, procdescp, 0);
if (error != 0)
- return (error);
+ goto fail1;
}
#endif
mem_charged = 0;
- vm2 = NULL;
if (pages == 0)
pages = KSTACK_PAGES;
/* Allocate new proc. */
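
fork1() now reserves its nprocs slot atomically before any allocation happens and rolls the reservation back on every failure path (the atomic_add_int(&nprocs, -1) in the failure path further down). The same reserve/rollback shape in portable C11 atomics, with MAXPROC standing in for the kernel's maxproc tunable:

#include <errno.h>
#include <stdatomic.h>
#include <stdbool.h>

#define	MAXPROC	10000		/* stand-in for the maxproc tunable */

static atomic_int nprocs;

static int
reserve_proc_slot(bool privileged)
{
	int nprocs_new;

	nprocs_new = atomic_fetch_add(&nprocs, 1) + 1;
	if ((nprocs_new >= MAXPROC - 10 && !privileged) ||
	    nprocs_new >= MAXPROC) {
		atomic_fetch_add(&nprocs, -1);	/* roll the slot back */
		return (EAGAIN);
	}
	return (0);
}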
@@ -823,7 +877,7 @@
td2 = thread_alloc(pages);
if (td2 == NULL) {
error = ENOMEM;
- goto fail1;
+ goto fail2;
}
proc_linkup(newproc, td2);
} else {
@@ -832,7 +886,7 @@
vm_thread_dispose(td2);
if (!thread_alloc_stack(td2, pages)) {
error = ENOMEM;
- goto fail1;
+ goto fail2;
}
}
}
@@ -841,18 +895,18 @@
vm2 = vmspace_fork(p1->p_vmspace, &mem_charged);
if (vm2 == NULL) {
error = ENOMEM;
- goto fail1;
+ goto fail2;
}
if (!swap_reserve(mem_charged)) {
/*
* The swap reservation failed. The accounting
* from the entries of the copied vm2 will be
- * substracted in vmspace_free(), so force the
+ * subtracted in vmspace_free(), so force the
* reservation there.
*/
swap_reserve_force(mem_charged);
error = ENOMEM;
- goto fail1;
+ goto fail2;
}
} else
vm2 = NULL;
@@ -861,7 +915,7 @@
* XXX: This is ugly; when we copy resource usage, we need to bump
* per-cred resource counters.
*/
- newproc->p_ucred = p1->p_ucred;
+ proc_set_cred_init(newproc, crhold(td->td_ucred));
/*
* Initialize resource accounting for the child process.
@@ -880,20 +934,7 @@
/* We have to lock the process tree while we look for a pid. */
sx_slock(&proctree_lock);
-
- /*
- * Although process entries are dynamically created, we still keep
- * a global limit on the maximum number we will create. Don't allow
- * a nonprivileged user to use the last ten processes; don't let root
- * exceed the limit. The variable nprocs is the current number of
- * processes, maxproc is the limit.
- */
sx_xlock(&allproc_lock);
- if ((nprocs >= maxproc - 10 && priv_check_cred(td->td_ucred,
- PRIV_MAXPROC, 0) != 0) || nprocs >= maxproc) {
- error = EAGAIN;
- goto fail;
- }
/*
* Increment the count of procs running with this uid. Don't allow
@@ -918,8 +959,10 @@
*/
*procp = newproc;
#ifdef PROCDESC
- if (flags & RFPROCDESC)
+ if (flags & RFPROCDESC) {
procdesc_finit(newproc->p_procdesc, fp_procdesc);
+ fdrop(fp_procdesc, td);
+ }
#endif
racct_proc_fork_done(newproc);
return (0);
@@ -926,11 +969,7 @@
}
error = EAGAIN;
-fail:
sx_sunlock(&proctree_lock);
- if (ppsratecheck(&lastfail, &curfail, 1))
- printf("maxproc limit exceeded by uid %u (pid %d); see tuning(7) and login.conf(5)\n",
- td->td_ucred->cr_ruid, p1->p_pid);
sx_xunlock(&allproc_lock);
#ifdef MAC
mac_proc_destroy(newproc);
@@ -937,13 +976,19 @@
#endif
racct_proc_exit(newproc);
fail1:
+ crfree(newproc->p_ucred);
+ newproc->p_ucred = NULL;
+fail2:
if (vm2 != NULL)
vmspace_free(vm2);
uma_zfree(proc_zone, newproc);
#ifdef PROCDESC
- if (((flags & RFPROCDESC) != 0) && (fp_procdesc != NULL))
+ if ((flags & RFPROCDESC) != 0 && fp_procdesc != NULL) {
+ fdclose(td, fp_procdesc, *procdescp);
fdrop(fp_procdesc, td);
+ }
#endif
+ atomic_add_int(&nprocs, -1);
pause("fork", hz / 2);
return (error);
}
@@ -994,7 +1039,7 @@
if (p->p_flag & P_KTHREAD) {
printf("Kernel thread \"%s\" (pid %d) exited prematurely.\n",
td->td_name, p->p_pid);
- kproc_exit(0);
+ kthread_exit();
}
mtx_assert(&Giant, MA_NOTOWNED);
@@ -1004,9 +1049,9 @@
/*
* Simplified back end of syscall(), used when returning from fork()
- * directly into user mode. Giant is not held on entry, and must not
- * be held on return. This function is passed in to fork_exit() as the
- * first parameter and is called when returning to a new userland process.
+ * directly into user mode. This function is passed in to fork_exit()
+ * as the first parameter and is called when returning to a new
+ * userland process.
*/
void
fork_return(struct thread *td, struct trapframe *frame)
@@ -1013,24 +1058,25 @@
{
struct proc *p, *dbg;
+ p = td->td_proc;
if (td->td_dbgflags & TDB_STOPATFORK) {
- p = td->td_proc;
sx_xlock(&proctree_lock);
PROC_LOCK(p);
- if ((p->p_pptr->p_flag & (P_TRACED | P_FOLLOWFORK)) ==
- (P_TRACED | P_FOLLOWFORK)) {
+ if (p->p_pptr->p_ptevents & PTRACE_FORK) {
/*
* If debugger still wants auto-attach for the
* parent's children, do it now.
*/
dbg = p->p_pptr->p_pptr;
- p->p_flag |= P_TRACED;
- p->p_oppid = p->p_pptr->p_pid;
+ proc_set_traced(p, true);
+ CTR2(KTR_PTRACE,
+ "fork_return: attaching to new child pid %d: oppid %d",
+ p->p_pid, p->p_oppid);
proc_reparent(p, dbg);
sx_xunlock(&proctree_lock);
- td->td_dbgflags |= TDB_CHILD;
- ptracestop(td, SIGSTOP);
- td->td_dbgflags &= ~TDB_CHILD;
+ td->td_dbgflags |= TDB_CHILD | TDB_SCX | TDB_FSTP;
+ ptracestop(td, SIGSTOP, NULL);
+ td->td_dbgflags &= ~(TDB_CHILD | TDB_SCX);
} else {
/*
* ... otherwise clear the request.
@@ -1040,6 +1086,19 @@
cv_broadcast(&p->p_dbgwait);
}
PROC_UNLOCK(p);
+ } else if (p->p_flag & P_TRACED || td->td_dbgflags & TDB_BORN) {
+ /*
+ * This is the start of a new thread in a traced
+ * process. Report a system call exit event.
+ */
+ PROC_LOCK(p);
+ td->td_dbgflags |= TDB_SCX;
+ _STOPEVENT(p, S_SCX, td->td_dbg_sc_code);
+ if ((p->p_ptevents & PTRACE_SCX) != 0 ||
+ (td->td_dbgflags & TDB_BORN) != 0)
+ ptracestop(td, SIGTRAP, NULL);
+ td->td_dbgflags &= ~(TDB_SCX | TDB_BORN);
+ PROC_UNLOCK(p);
}
userret(td, frame);
@@ -1048,5 +1107,4 @@
if (KTRPOINT(td, KTR_SYSRET))
ktrsysret(SYS_fork, 0, 0);
#endif
- mtx_assert(&Giant, MA_NOTOWNED);
}
Modified: trunk/sys/kern/kern_gzio.c
===================================================================
--- trunk/sys/kern/kern_gzio.c 2018-05-25 20:39:59 UTC (rev 9943)
+++ trunk/sys/kern/kern_gzio.c 2018-05-25 20:46:51 UTC (rev 9944)
@@ -1,5 +1,6 @@
+/* $MidnightBSD$ */
/*
- * $Id: kern_gzio.c,v 1.2 2012-10-09 04:08:16 laffer1 Exp $
+ * $Id: kern_gzio.c,v 1.6 2008-10-18 22:54:45 lbazinet Exp $
*
* core_gzip.c -- gzip routines used in compressing user process cores
*
@@ -12,7 +13,7 @@
*
*/
-/* @(#) $MidnightBSD$ */
+/* @(#) $FreeBSD: stable/10/sys/kern/kern_gzio.c 241896 2012-10-22 17:50:54Z kib $ */
#include <sys/param.h>
#include <sys/proc.h>
@@ -219,7 +220,6 @@
off_t curoff;
size_t resid;
int error;
- int vfslocked;
if (s == NULL || s->mode != 'w') return Z_STREAM_ERROR;
@@ -232,11 +232,9 @@
if (s->stream.avail_out == 0) {
s->stream.next_out = s->outbuf;
- vfslocked = VFS_LOCK_GIANT(s->file->v_mount);
error = vn_rdwr_inchunks(UIO_WRITE, s->file, s->outbuf, Z_BUFSIZE,
curoff, UIO_SYSSPACE, IO_UNIT,
curproc->p_ucred, NOCRED, &resid, curthread);
- VFS_UNLOCK_GIANT(vfslocked);
if (error) {
log(LOG_ERR, "gzwrite: vn_rdwr return %d\n", error);
curoff += Z_BUFSIZE - resid;
@@ -274,7 +272,6 @@
gz_stream *s = (gz_stream*)file;
off_t curoff = s->outoff;
size_t resid;
- int vfslocked = 0;
int error;
if (s == NULL || s->mode != 'w') return Z_STREAM_ERROR;
@@ -289,11 +286,9 @@
len = Z_BUFSIZE - s->stream.avail_out;
if (len != 0) {
- vfslocked = VFS_LOCK_GIANT(s->file->v_mount);
error = vn_rdwr_inchunks(UIO_WRITE, s->file, s->outbuf, len, curoff,
UIO_SYSSPACE, IO_UNIT, curproc->p_ucred,
NOCRED, &resid, curthread);
- VFS_UNLOCK_GIANT(vfslocked);
if (error) {
s->z_err = Z_ERRNO;
s->outoff = curoff + len - resid;
Modified: trunk/sys/kern/kern_hhook.c
===================================================================
--- trunk/sys/kern/kern_hhook.c 2018-05-25 20:39:59 UTC (rev 9943)
+++ trunk/sys/kern/kern_hhook.c 2018-05-25 20:46:51 UTC (rev 9944)
@@ -1,5 +1,6 @@
+/* $MidnightBSD$ */
/*-
- * Copyright (c) 2010 Lawrence Stewart <lstewart at freebsd.org>
+ * Copyright (c) 2010,2013 Lawrence Stewart <lstewart at freebsd.org>
* Copyright (c) 2010 The FreeBSD Foundation
* All rights reserved.
*
@@ -35,7 +36,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/kern_hhook.c 251787 2013-06-15 10:08:34Z lstewart $");
#include <sys/param.h>
#include <sys/kernel.h>
@@ -61,15 +62,20 @@
static MALLOC_DEFINE(M_HHOOK, "hhook", "Helper hooks are linked off hhook_head lists");
LIST_HEAD(hhookheadhead, hhook_head);
-VNET_DEFINE(struct hhookheadhead, hhook_head_list);
-#define V_hhook_head_list VNET(hhook_head_list)
+struct hhookheadhead hhook_head_list;
+VNET_DEFINE(struct hhookheadhead, hhook_vhead_list);
+#define V_hhook_vhead_list VNET(hhook_vhead_list)
static struct mtx hhook_head_list_lock;
MTX_SYSINIT(hhookheadlistlock, &hhook_head_list_lock, "hhook_head list lock",
MTX_DEF);
+/* Protected by hhook_head_list_lock. */
+static uint32_t n_hhookheads;
+
/* Private function prototypes. */
static void hhook_head_destroy(struct hhook_head *hhh);
+void khelp_new_hhook_registered(struct hhook_head *hhh, uint32_t flags);
#define HHHLIST_LOCK() mtx_lock(&hhook_head_list_lock)
#define HHHLIST_UNLOCK() mtx_unlock(&hhook_head_list_lock)
@@ -164,22 +170,72 @@
}
/*
- * Lookup a helper hook point and register a new helper hook function with it.
+ * Register a helper hook function with a helper hook point (including all
+ * virtual instances of the hook point if it is virtualised).
+ *
+ * The logic is unfortunately far more complex than for
+ * hhook_remove_hook_lookup() because hhook_add_hook() can call malloc() with
+ * M_WAITOK and thus we cannot call hhook_add_hook() with the
+ * hhook_head_list_lock held.
+ *
+ * The logic assembles an array of hhook_head structs that correspond to the
+ * helper hook point being hooked and bumps the refcount on each (all done with
+ * the hhook_head_list_lock held). The hhook_head_list_lock is then dropped, and
+ * hhook_add_hook() is called and the refcount dropped for each hhook_head
+ * struct in the array.
*/
int
hhook_add_hook_lookup(struct hookinfo *hki, uint32_t flags)
{
- struct hhook_head *hhh;
- int error;
+ struct hhook_head **heads_to_hook, *hhh;
+ int error, i, n_heads_to_hook;
- hhh = hhook_head_get(hki->hook_type, hki->hook_id);
+tryagain:
+ error = i = 0;
+ /*
+ * Accessing n_hhookheads without hhook_head_list_lock held opens up a
+ * race with hhook_head_register() which we are unlikely to lose, but
+ * nonetheless have to cope with - hence the complex goto logic.
+ */
+ n_heads_to_hook = n_hhookheads;
+ heads_to_hook = malloc(n_heads_to_hook * sizeof(struct hhook_head *),
+ M_HHOOK, flags & HHOOK_WAITOK ? M_WAITOK : M_NOWAIT);
+ if (heads_to_hook == NULL)
+ return (ENOMEM);
- if (hhh == NULL)
- return (ENOENT);
+ HHHLIST_LOCK();
+ LIST_FOREACH(hhh, &hhook_head_list, hhh_next) {
+ if (hhh->hhh_type == hki->hook_type &&
+ hhh->hhh_id == hki->hook_id) {
+ if (i < n_heads_to_hook) {
+ heads_to_hook[i] = hhh;
+ refcount_acquire(&heads_to_hook[i]->hhh_refcount);
+ i++;
+ } else {
+ /*
+ * We raced with hhook_head_register() which
+ * inserted a hhook_head that we need to hook
+ * but did not malloc space for. Abort this run
+ * and try again.
+ */
+ for (i--; i >= 0; i--)
+ refcount_release(&heads_to_hook[i]->hhh_refcount);
+ free(heads_to_hook, M_HHOOK);
+ HHHLIST_UNLOCK();
+ goto tryagain;
+ }
+ }
+ }
+ HHHLIST_UNLOCK();
- error = hhook_add_hook(hhh, hki, flags);
- hhook_head_release(hhh);
+ for (i--; i >= 0; i--) {
+ if (!error)
+ error = hhook_add_hook(heads_to_hook[i], hki, flags);
+ refcount_release(&heads_to_hook[i]->hhh_refcount);
+ }
+ free(heads_to_hook, M_HHOOK);
+
return (error);
}
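
The rewritten hhook_add_hook_lookup() above is an instance of a common pattern: snapshot the matching heads under the list lock while taking a reference on each, drop the lock, run the sleepable hhook_add_hook() against the snapshot, then release the references. If the list grew between the unlocked n_hhookheads read and the locked walk, the references are undone and the whole run retried. A userspace rendering with pthreads and C11 refcounts (the types and the add callback are stand-ins):

#include <errno.h>
#include <pthread.h>
#include <stdatomic.h>
#include <stdlib.h>

struct head {
	int		type, id;
	atomic_int	ref;
	struct head	*next;
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct head *head_list;
static int n_heads;		/* protected by list_lock */

static int
hook_all(int type, int id, int (*add)(struct head *))
{
	struct head **snap, *h;
	int error, i, want;

again:
	error = i = 0;
	want = n_heads;		/* unlocked read; verified below */
	snap = malloc(sizeof(*snap) * (size_t)(want > 0 ? want : 1));
	if (snap == NULL)
		return (ENOMEM);
	pthread_mutex_lock(&list_lock);
	for (h = head_list; h != NULL; h = h->next) {
		if (h->type != type || h->id != id)
			continue;
		if (i == want) {	/* raced with an insert: retry */
			while (i-- > 0)
				atomic_fetch_sub(&snap[i]->ref, 1);
			pthread_mutex_unlock(&list_lock);
			free(snap);
			goto again;
		}
		atomic_fetch_add(&h->ref, 1);
		snap[i++] = h;
	}
	pthread_mutex_unlock(&list_lock);
	while (i-- > 0) {	/* lock dropped: add() may sleep */
		if (error == 0)
			error = add(snap[i]);
		atomic_fetch_sub(&snap[i]->ref, 1);
	}
	free(snap);
	return (error);
}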
@@ -210,7 +266,8 @@
}
/*
- * Lookup a helper hook point and remove a helper hook function from it.
+ * Remove a helper hook function from a helper hook point (including all
+ * virtual instances of the hook point if it is virtualised).
*/
int
hhook_remove_hook_lookup(struct hookinfo *hki)
@@ -217,14 +274,14 @@
{
struct hhook_head *hhh;
- hhh = hhook_head_get(hki->hook_type, hki->hook_id);
+ HHHLIST_LOCK();
+ LIST_FOREACH(hhh, &hhook_head_list, hhh_next) {
+ if (hhh->hhh_type == hki->hook_type &&
+ hhh->hhh_id == hki->hook_id)
+ hhook_remove_hook(hhh, hki);
+ }
+ HHHLIST_UNLOCK();
- if (hhh == NULL)
- return (ENOENT);
-
- hhook_remove_hook(hhh, hki);
- hhook_head_release(hhh);
-
return (0);
}
@@ -245,13 +302,6 @@
return (EEXIST);
}
- /* XXXLAS: Need to implement support for non-virtualised hooks. */
- if ((flags & HHOOK_HEADISINVNET) == 0) {
- printf("%s: only vnet-style virtualised hooks can be used\n",
- __func__);
- return (EINVAL);
- }
-
tmphhh = malloc(sizeof(struct hhook_head), M_HHOOK,
M_ZERO | ((flags & HHOOK_WAITOK) ? M_WAITOK : M_NOWAIT));
@@ -263,23 +313,28 @@
tmphhh->hhh_nhooks = 0;
STAILQ_INIT(&tmphhh->hhh_hooks);
HHH_LOCK_INIT(tmphhh);
+ refcount_init(&tmphhh->hhh_refcount, 1);
- if (hhh != NULL)
- refcount_init(&tmphhh->hhh_refcount, 1);
- else
- refcount_init(&tmphhh->hhh_refcount, 0);
-
+ HHHLIST_LOCK();
if (flags & HHOOK_HEADISINVNET) {
tmphhh->hhh_flags |= HHH_ISINVNET;
- HHHLIST_LOCK();
- LIST_INSERT_HEAD(&V_hhook_head_list, tmphhh, hhh_next);
- HHHLIST_UNLOCK();
- } else {
- /* XXXLAS: Add tmphhh to the non-virtualised list. */
+#ifdef VIMAGE
+ KASSERT(curvnet != NULL, ("curvnet is NULL"));
+ tmphhh->hhh_vid = (uintptr_t)curvnet;
+ LIST_INSERT_HEAD(&V_hhook_vhead_list, tmphhh, hhh_vnext);
+#endif
}
+ LIST_INSERT_HEAD(&hhook_head_list, tmphhh, hhh_next);
+ n_hhookheads++;
+ HHHLIST_UNLOCK();
- *hhh = tmphhh;
+ khelp_new_hhook_registered(tmphhh, flags);
+ if (hhh != NULL)
+ *hhh = tmphhh;
+ else
+ refcount_release(&tmphhh->hhh_refcount);
+
return (0);
}
@@ -289,8 +344,13 @@
struct hhook *tmp, *tmp2;
HHHLIST_LOCK_ASSERT();
+ KASSERT(n_hhookheads > 0, ("n_hhookheads should be > 0"));
LIST_REMOVE(hhh, hhh_next);
+#ifdef VIMAGE
+ if (hhook_head_is_virtualised(hhh) == HHOOK_HEADISINVNET)
+ LIST_REMOVE(hhh, hhh_vnext);
+#endif
HHH_WLOCK(hhh);
STAILQ_FOREACH_SAFE(tmp, &hhh->hhh_hooks, hhk_next, tmp2)
free(tmp, M_HHOOK);
@@ -297,6 +357,7 @@
HHH_WUNLOCK(hhh);
HHH_LOCK_DESTROY(hhh);
free(hhh, M_HHOOK);
+ n_hhookheads--;
}
/*
@@ -348,10 +409,17 @@
{
struct hhook_head *hhh;
- /* XXXLAS: Pick hhook_head_list based on hhook_head flags. */
HHHLIST_LOCK();
- LIST_FOREACH(hhh, &V_hhook_head_list, hhh_next) {
+ LIST_FOREACH(hhh, &hhook_head_list, hhh_next) {
if (hhh->hhh_type == hhook_type && hhh->hhh_id == hhook_id) {
+#ifdef VIMAGE
+ if (hhook_head_is_virtualised(hhh) ==
+ HHOOK_HEADISINVNET) {
+ KASSERT(curvnet != NULL, ("curvnet is NULL"));
+ if (hhh->hhh_vid != (uintptr_t)curvnet)
+ continue;
+ }
+#endif
refcount_acquire(&hhh->hhh_refcount);
break;
}
@@ -413,7 +481,7 @@
hhook_vnet_init(const void *unused __unused)
{
- LIST_INIT(&V_hhook_head_list);
+ LIST_INIT(&V_hhook_vhead_list);
}
/*
@@ -430,7 +498,7 @@
* subsystem should have already called hhook_head_deregister().
*/
HHHLIST_LOCK();
- LIST_FOREACH_SAFE(hhh, &V_hhook_head_list, hhh_next, tmphhh) {
+ LIST_FOREACH_SAFE(hhh, &V_hhook_vhead_list, hhh_vnext, tmphhh) {
printf("%s: hhook_head type=%d, id=%d cleanup required\n",
__func__, hhh->hhh_type, hhh->hhh_id);
hhook_head_destroy(hhh);
@@ -440,9 +508,9 @@
/*
- * When a vnet is created and being initialised, init the V_hhook_head_list.
+ * When a vnet is created and being initialised, init the V_hhook_vhead_list.
*/
-VNET_SYSINIT(hhook_vnet_init, SI_SUB_PROTO_BEGIN, SI_ORDER_FIRST,
+VNET_SYSINIT(hhook_vnet_init, SI_SUB_MBUF, SI_ORDER_FIRST,
hhook_vnet_init, NULL);
/*
@@ -450,5 +518,5 @@
* points to clean up on vnet tear down, but in case the KPI is misused,
* provide a function to clean up and free memory for a vnet being destroyed.
*/
-VNET_SYSUNINIT(hhook_vnet_uninit, SI_SUB_PROTO_BEGIN, SI_ORDER_FIRST,
+VNET_SYSUNINIT(hhook_vnet_uninit, SI_SUB_MBUF, SI_ORDER_ANY,
hhook_vnet_uninit, NULL);
Modified: trunk/sys/kern/kern_intr.c
===================================================================
--- trunk/sys/kern/kern_intr.c 2018-05-25 20:39:59 UTC (rev 9943)
+++ trunk/sys/kern/kern_intr.c 2018-05-25 20:46:51 UTC (rev 9944)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1997, Stefan Esser <se at freebsd.org>
* All rights reserved.
@@ -25,9 +26,10 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/kern_intr.c 272946 2014-10-11 17:49:51Z kib $");
#include "opt_ddb.h"
+#include "opt_kstack_usage_prof.h"
#include <sys/param.h>
#include <sys/bus.h>
@@ -336,7 +338,7 @@
if (ie->ie_cpu == NOCPU)
CPU_COPY(cpuset_root, &mask);
else
- CPU_SET(cpu, &mask);
+ CPU_SET(ie->ie_cpu, &mask);
id = ie->ie_thread->it_thread->td_tid;
mtx_unlock(&ie->ie_lock);
(void)cpuset_setthread(id, &mask);
@@ -626,7 +628,7 @@
mtx_lock(&ie->ie_lock);
it->it_event = ie;
ih->ih_thread = it;
- ithread_update(it); // XXX - do we really need this?!?!?
+ ithread_update(it); /* XXX - do we really need this?!?!? */
} else { /* Create the global per-event thread if we need one. */
while (ie->ie_thread == NULL && handler != NULL) {
if (ie->ie_flags & IE_ADDING_THREAD)
@@ -698,9 +700,9 @@
* description at that point. If one is not found, find the
* end of the name to use as the insertion point.
*/
- start = index(ih->ih_name, ':');
+ start = strchr(ih->ih_name, ':');
if (start == NULL)
- start = index(ih->ih_name, 0);
+ start = strchr(ih->ih_name, 0);
/*
* See if there is enough remaining room in the string for the
@@ -841,7 +843,7 @@
* again and remove this handler if it has already passed
* it on the list.
*/
- ie->ie_thread->it_need = 1;
+ atomic_store_rel_int(&ie->ie_thread->it_need, 1);
} else
TAILQ_REMOVE(&ie->ie_handlers, handler, ih_next);
thread_unlock(ie->ie_thread->it_thread);
@@ -901,7 +903,7 @@
p->p_pid, td->td_name);
entropy.event = (uintptr_t)ie;
entropy.td = ctd;
- random_harvest(&entropy, sizeof(entropy), 2, 0,
+ random_harvest(&entropy, sizeof(entropy), 2,
RANDOM_INTERRUPT);
}
@@ -912,7 +914,7 @@
* running. Then, lock the thread and see if we actually need to
* put it on the runqueue.
*/
- it->it_need = 1;
+ atomic_store_rel_int(&it->it_need, 1);
thread_lock(td);
if (TD_AWAITING_INTR(td)) {
CTR3(KTR_INTR, "%s: schedule pid %d (%s)", __func__, p->p_pid,
@@ -990,7 +992,7 @@
* again and remove this handler if it has already passed
* it on the list.
*/
- it->it_need = 1;
+ atomic_store_rel_int(&it->it_need, 1);
} else
TAILQ_REMOVE(&ie->ie_handlers, handler, ih_next);
thread_unlock(it->it_thread);
@@ -1055,7 +1057,7 @@
p->p_pid, td->td_name);
entropy.event = (uintptr_t)ie;
entropy.td = ctd;
- random_harvest(&entropy, sizeof(entropy), 2, 0,
+ random_harvest(&entropy, sizeof(entropy), 2,
RANDOM_INTERRUPT);
}
@@ -1066,7 +1068,7 @@
* running. Then, lock the thread and see if we actually need to
* put it on the runqueue.
*/
- it->it_need = 1;
+ atomic_store_rel_int(&it->it_need, 1);
thread_lock(td);
if (TD_AWAITING_INTR(td)) {
CTR3(KTR_INTR, "%s: schedule pid %d (%s)", __func__, p->p_pid,
@@ -1103,7 +1105,6 @@
swi_add(struct intr_event **eventp, const char *name, driver_intr_t handler,
void *arg, int pri, enum intr_type flags, void **cookiep)
{
- struct thread *td;
struct intr_event *ie;
int error;
@@ -1125,15 +1126,7 @@
}
error = intr_event_add_handler(ie, name, NULL, handler, arg,
PI_SWI(pri), flags, cookiep);
- if (error)
- return (error);
- if (pri == SWI_CLOCK) {
- td = ie->ie_thread->it_thread;
- thread_lock(td);
- td->td_flags |= TDF_NOLOAD;
- thread_unlock(td);
- }
- return (0);
+ return (error);
}
/*
@@ -1155,8 +1148,8 @@
curproc->p_pid, curthread->td_name);
entropy.event = (uintptr_t)ih;
entropy.td = curthread;
- random_harvest(&entropy, sizeof(entropy), 1, 0,
- RANDOM_INTERRUPT);
+ random_harvest(&entropy, sizeof(entropy), 1,
+ RANDOM_SWI);
}
/*
@@ -1256,7 +1249,7 @@
* interrupt threads always invoke all of their handlers.
*/
if (ie->ie_flags & IE_SOFT) {
- if (!ih->ih_need)
+ if (atomic_load_acq_int(&ih->ih_need) == 0)
continue;
else
atomic_store_rel_int(&ih->ih_need, 0);
@@ -1358,7 +1351,7 @@
* we are running, it will set it_need to note that we
* should make another pass.
*/
- while (ithd->it_need) {
+ while (atomic_load_acq_int(&ithd->it_need) != 0) {
/*
* This might need a full read and write barrier
* to make sure that this write posts before any
@@ -1377,7 +1370,8 @@
* set again, so we have to check it again.
*/
thread_lock(td);
- if (!ithd->it_need && !(ithd->it_flags & (IT_DEAD | IT_WAIT))) {
+ if ((atomic_load_acq_int(&ithd->it_need) == 0) &&
+ !(ithd->it_flags & (IT_DEAD | IT_WAIT))) {
TD_SET_IWAIT(td);
ie->ie_count = 0;
mi_switch(SW_VOL | SWT_IWAIT, NULL);
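
The it_need accesses in this file are upgraded from plain loads and stores to release stores (atomic_store_rel_int) paired with acquire loads (atomic_load_acq_int), so work queued before the flag is raised is visible to the interrupt thread once it observes the flag. The pairing expressed in C11 atomics:

#include <stdatomic.h>

static atomic_int it_need;

/* Scheduling side: publish the work, then raise the flag with release. */
static void
ithread_schedule(void)
{
	/* ... queue handler work ... */
	atomic_store_explicit(&it_need, 1, memory_order_release);
	/* ... wake the interrupt thread ... */
}

/* Thread side: the acquire load pairs with the release store above. */
static void
ithread_loop_once(void)
{
	while (atomic_load_explicit(&it_need, memory_order_acquire) != 0) {
		/* Clear first: a schedule during the run forces another pass. */
		atomic_store_explicit(&it_need, 0, memory_order_release);
		/* ... run the handlers ... */
	}
}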
@@ -1415,6 +1409,10 @@
td = curthread;
+#ifdef KSTACK_USAGE_PROF
+ intr_prof_stack_use(td, frame);
+#endif
+
/* An interrupt with no event or handlers is a stray interrupt. */
if (ie == NULL || TAILQ_EMPTY(&ie->ie_handlers))
return (EINVAL);
@@ -1538,7 +1536,7 @@
* we are running, it will set it_need to note that we
* should make another pass.
*/
- while (ithd->it_need) {
+ while (atomic_load_acq_int(&ithd->it_need) != 0) {
/*
* This might need a full read and write barrier
* to make sure that this write posts before any
@@ -1560,7 +1558,8 @@
* set again, so we have to check it again.
*/
thread_lock(td);
- if (!ithd->it_need && !(ithd->it_flags & (IT_DEAD | IT_WAIT))) {
+ if ((atomic_load_acq_int(&ithd->it_need) == 0) &&
+ !(ithd->it_flags & (IT_DEAD | IT_WAIT))) {
TD_SET_IWAIT(td);
ie->ie_count = 0;
mi_switch(SW_VOL | SWT_IWAIT, NULL);
@@ -1856,8 +1855,8 @@
struct intr_event *ie;
int all, verbose;
- verbose = index(modif, 'v') != NULL;
- all = index(modif, 'a') != NULL;
+ verbose = strchr(modif, 'v') != NULL;
+ all = strchr(modif, 'a') != NULL;
TAILQ_FOREACH(ie, &event_list, ie_list) {
if (!all && TAILQ_EMPTY(&ie->ie_handlers))
continue;
@@ -1902,6 +1901,24 @@
static int
sysctl_intrcnt(SYSCTL_HANDLER_ARGS)
{
+#ifdef SCTL_MASK32
+ uint32_t *intrcnt32;
+ unsigned i;
+ int error;
+
+ if (req->flags & SCTL_MASK32) {
+ if (!req->oldptr)
+ return (sysctl_handle_opaque(oidp, NULL, sintrcnt / 2, req));
+ intrcnt32 = malloc(sintrcnt / 2, M_TEMP, M_NOWAIT);
+ if (intrcnt32 == NULL)
+ return (ENOMEM);
+ for (i = 0; i < sintrcnt / sizeof (u_long); i++)
+ intrcnt32[i] = intrcnt[i];
+ error = sysctl_handle_opaque(oidp, intrcnt32, sintrcnt / 2, req);
+ free(intrcnt32, M_TEMP);
+ return (error);
+ }
+#endif
return (sysctl_handle_opaque(oidp, intrcnt, sintrcnt, req));
}
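
The new SCTL_MASK32 branch serves 32-bit consumers a truncating copy of the u_long counter array; sintrcnt / 2 encodes the 8-to-4-byte halving on a 64-bit kernel. The copy loop as a plain helper (a sketch, not the sysctl machinery):

#include <stdint.h>
#include <stdlib.h>

/*
 * Narrow an array of u_long counters to uint32_t for 32-bit readers.
 * Returns a malloc'd buffer (caller frees) or NULL on failure.
 */
static uint32_t *
narrow_counters(const unsigned long *cnt, size_t nbytes, size_t *outbytes)
{
	size_t i, n;
	uint32_t *out;

	n = nbytes / sizeof(unsigned long);
	out = malloc((n > 0 ? n : 1) * sizeof(uint32_t));
	if (out == NULL)
		return (NULL);
	for (i = 0; i < n; i++)
		out[i] = (uint32_t)cnt[i];	/* truncating, as in the hunk */
	*outbytes = n * sizeof(uint32_t);
	return (out);
}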
Modified: trunk/sys/kern/kern_jail.c
===================================================================
--- trunk/sys/kern/kern_jail.c 2018-05-25 20:39:59 UTC (rev 9943)
+++ trunk/sys/kern/kern_jail.c 2018-05-25 20:46:51 UTC (rev 9944)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1999 Poul-Henning Kamp.
* Copyright (c) 2008 Bjoern A. Zeeb.
@@ -27,7 +28,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/kern_jail.c 302234 2016-06-27 21:50:30Z bdrewery $");
#include "opt_compat.h"
#include "opt_ddb.h"
@@ -206,6 +207,10 @@
"allow.mount.nullfs",
"allow.mount.zfs",
"allow.mount.procfs",
+ "allow.mount.tmpfs",
+ "allow.mount.fdescfs",
+ "allow.mount.linprocfs",
+ "allow.mount.linsysfs",
};
const size_t pr_allow_names_size = sizeof(pr_allow_names);
@@ -221,6 +226,10 @@
"allow.mount.nonullfs",
"allow.mount.nozfs",
"allow.mount.noprocfs",
+ "allow.mount.notmpfs",
+ "allow.mount.nofdescfs",
+ "allow.mount.nolinprocfs",
+ "allow.mount.nolinsysfs",
};
const size_t pr_allow_nonames_size = sizeof(pr_allow_nonames);
@@ -234,6 +243,19 @@
static unsigned jail_max_af_ips = 255;
#endif
+/*
+ * Initialize the parts of prison0 that can't be static-initialized with
+ * constants. This is called from proc0_init() after creating thread0 cpuset.
+ */
+void
+prison0_init(void)
+{
+
+ prison0.pr_cpuset = cpuset_ref(thread0.td_cpuset);
+ prison0.pr_osreldate = osreldate;
+ strlcpy(prison0.pr_osrelease, osrelease, sizeof(prison0.pr_osrelease));
+}
+
#ifdef INET
static int
qcmp_v4(const void *ip1, const void *ip2)
@@ -311,7 +333,7 @@
j.version = j0.version;
j.path = j0.path;
j.hostname = j0.hostname;
- j.ip4s = j0.ip_number;
+ j.ip4s = htonl(j0.ip_number); /* jail_v0 is host order */
break;
}
@@ -533,17 +555,18 @@
struct prison *pr, *deadpr, *mypr, *ppr, *tpr;
struct vnode *root;
char *domain, *errmsg, *host, *name, *namelc, *p, *path, *uuid;
- char *g_path;
+ char *g_path, *osrelstr;
#if defined(INET) || defined(INET6)
struct prison *tppr;
void *op;
#endif
unsigned long hid;
- size_t namelen, onamelen;
- int created, cuflags, descend, enforce, error, errmsg_len, errmsg_pos;
+ size_t namelen, onamelen, pnamelen;
+ int born, created, cuflags, descend, enforce;
+ int error, errmsg_len, errmsg_pos;
int gotchildmax, gotenforce, gothid, gotrsnum, gotslevel;
int fi, jid, jsys, len, level;
- int childmax, rsnum, slevel, vfslocked;
+ int childmax, osreldt, rsnum, slevel;
int fullpath_disabled;
#if defined(INET) || defined(INET6)
int ii, ij;
@@ -563,7 +586,7 @@
error = priv_check(td, PRIV_JAIL_ATTACH);
if (error)
return (error);
- mypr = ppr = td->td_ucred->cr_prison;
+ mypr = td->td_ucred->cr_prison;
if ((flags & JAIL_CREATE) && mypr->pr_childmax == 0)
return (EPERM);
if (flags & ~JAIL_SET_MASK)
@@ -590,6 +613,13 @@
#endif
g_path = NULL;
+ cuflags = flags & (JAIL_CREATE | JAIL_UPDATE);
+ if (!cuflags) {
+ error = EINVAL;
+ vfs_opterror(opts, "no valid operation (create or update)");
+ goto done_errmsg;
+ }
+
error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
if (error == ENOENT)
jid = 0;
@@ -797,7 +827,7 @@
#ifdef INET
error = vfs_getopt(opts, "ip4.addr", &op, &ip4s);
if (error == ENOENT)
- ip4s = (pr_flags & PR_IP4_DISABLE) ? 0 : -1;
+ ip4s = 0;
else if (error != 0)
goto done_free;
else if (ip4s & (sizeof(*ip4) - 1)) {
@@ -855,7 +885,7 @@
#ifdef INET6
error = vfs_getopt(opts, "ip6.addr", &op, &ip6s);
if (error == ENOENT)
- ip6s = (pr_flags & PR_IP6_DISABLE) ? 0 : -1;
+ ip6s = 0;
else if (error != 0)
goto done_free;
else if (ip6s & (sizeof(*ip6) - 1)) {
@@ -903,6 +933,46 @@
}
#endif
+ error = vfs_getopt(opts, "osrelease", (void **)&osrelstr, &len);
+ if (error == ENOENT)
+ osrelstr = NULL;
+ else if (error != 0)
+ goto done_free;
+ else {
+ if (flags & JAIL_UPDATE) {
+ error = EINVAL;
+ vfs_opterror(opts,
+ "osrelease cannot be changed after creation");
+ goto done_errmsg;
+ }
+ if (len == 0 || len >= OSRELEASELEN) {
+ error = EINVAL;
+ vfs_opterror(opts,
+ "osrelease string must be 1-%d bytes long",
+ OSRELEASELEN - 1);
+ goto done_errmsg;
+ }
+ }
+
+ error = vfs_copyopt(opts, "osreldate", &osreldt, sizeof(osreldt));
+ if (error == ENOENT)
+ osreldt = 0;
+ else if (error != 0)
+ goto done_free;
+ else {
+ if (flags & JAIL_UPDATE) {
+ error = EINVAL;
+ vfs_opterror(opts,
+ "osreldate cannot be changed after creation");
+ goto done_errmsg;
+ }
+ if (osreldt == 0) {
+ error = EINVAL;
+ vfs_opterror(opts, "osreldate cannot be 0");
+ goto done_errmsg;
+ }
+ }
+
fullpath_disabled = 0;
root = NULL;
error = vfs_getopt(opts, "path", (void **)&path, &len);
@@ -921,12 +991,11 @@
error = EINVAL;
goto done_free;
}
- NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE, UIO_SYSSPACE,
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE,
path, td);
error = namei(&nd);
if (error)
goto done_free;
- vfslocked = NDHASGIANT(&nd);
root = nd.ni_vp;
NDFREE(&nd, NDF_ONLY_PNBUF);
g_path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
@@ -941,22 +1010,20 @@
path = NULL;
} else {
/* exit on other errors */
- VFS_UNLOCK_GIANT(vfslocked);
goto done_free;
}
if (root->v_type != VDIR) {
error = ENOTDIR;
vput(root);
- VFS_UNLOCK_GIANT(vfslocked);
goto done_free;
}
VOP_UNLOCK(root, 0);
- VFS_UNLOCK_GIANT(vfslocked);
if (fullpath_disabled) {
/* Leave room for a real-root full pathname. */
if (len + (path[0] == '/' && strcmp(mypr->pr_path, "/")
? strlen(mypr->pr_path) : 0) > MAXPATHLEN) {
error = ENAMETOOLONG;
+ vrele(root);
goto done_free;
}
}
@@ -963,36 +1030,11 @@
}
/*
- * Grab the allprison lock before letting modules check their
- * parameters. Once we have it, do not let go so we'll have a
- * consistent view of the OSD list.
- */
- sx_xlock(&allprison_lock);
- error = osd_jail_call(NULL, PR_METHOD_CHECK, opts);
- if (error)
- goto done_unlock_list;
-
- /* By now, all parameters should have been noted. */
- TAILQ_FOREACH(opt, opts, link) {
- if (!opt->seen && strcmp(opt->name, "errmsg")) {
- error = EINVAL;
- vfs_opterror(opts, "unknown parameter: %s", opt->name);
- goto done_unlock_list;
- }
- }
-
- /*
- * See if we are creating a new record or updating an existing one.
+ * Find the specified jail, or at least its parent.
* This abuses the file error codes ENOENT and EEXIST.
*/
- cuflags = flags & (JAIL_CREATE | JAIL_UPDATE);
- if (!cuflags) {
- error = EINVAL;
- vfs_opterror(opts, "no valid operation (create or update)");
- goto done_unlock_list;
- }
pr = NULL;
- namelc = NULL;
+ ppr = mypr;
if (cuflags == JAIL_CREATE && jid == 0 && name != NULL) {
namelc = strrchr(name, '.');
jid = strtoul(namelc != NULL ? namelc + 1 : name, &p, 10);
@@ -999,6 +1041,7 @@
if (*p != '\0')
jid = 0;
}
+ sx_xlock(&allprison_lock);
if (jid != 0) {
/*
* See if a requested jid already exists. There is an
@@ -1064,6 +1107,7 @@
* and updates keyed by the name itself (where the name must exist
* because that is the jail being updated).
*/
+ namelc = NULL;
if (name != NULL) {
namelc = strrchr(name, '.');
if (namelc == NULL)
@@ -1074,7 +1118,6 @@
* parent and child names, and make sure the parent
* exists or matches an already found jail.
*/
- *namelc = '\0';
if (pr != NULL) {
if (strncmp(name, ppr->pr_name, namelc - name)
|| ppr->pr_name[namelc - name] != '\0') {
@@ -1085,6 +1128,7 @@
goto done_unlock_list;
}
} else {
+ *namelc = '\0';
ppr = prison_find_name(mypr, name);
if (ppr == NULL) {
error = ENOENT;
@@ -1093,17 +1137,18 @@
goto done_unlock_list;
}
mtx_unlock(&ppr->pr_mtx);
+ *namelc = '.';
}
- name = ++namelc;
+ namelc++;
}
- if (name[0] != '\0') {
- namelen =
+ if (namelc[0] != '\0') {
+ pnamelen =
(ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1;
name_again:
deadpr = NULL;
FOREACH_PRISON_CHILD(ppr, tpr) {
if (tpr != pr && tpr->pr_ref > 0 &&
- !strcmp(tpr->pr_name + namelen, name)) {
+ !strcmp(tpr->pr_name + pnamelen, namelc)) {
if (pr == NULL &&
cuflags != JAIL_CREATE) {
mtx_lock(&tpr->pr_mtx);
@@ -1177,10 +1222,11 @@
}
created = 1;
mtx_lock(&ppr->pr_mtx);
- if (ppr->pr_ref == 0 || (ppr->pr_flags & PR_REMOVE)) {
+ if (ppr->pr_ref == 0) {
mtx_unlock(&ppr->pr_mtx);
error = ENOENT;
- vfs_opterror(opts, "parent jail went away!");
+ vfs_opterror(opts, "jail \"%s\" not found",
+ prison_name(mypr, ppr));
goto done_unlock_list;
}
ppr->pr_ref++;
@@ -1234,8 +1280,8 @@
pr->pr_id = jid;
/* Set some default values, and inherit some from the parent. */
- if (name == NULL)
- name = "";
+ if (namelc == NULL)
+ namelc = "";
if (path == NULL) {
path = "/";
root = mypr->pr_root;
@@ -1290,8 +1336,15 @@
pr->pr_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS;
pr->pr_devfs_rsnum = ppr->pr_devfs_rsnum;
+ pr->pr_osreldate = osreldt ? osreldt : ppr->pr_osreldate;
+ if (osrelstr == NULL)
+ strcpy(pr->pr_osrelease, ppr->pr_osrelease);
+ else
+ strcpy(pr->pr_osrelease, osrelstr);
+
LIST_INIT(&pr->pr_children);
mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF | MTX_DUPOK);
+ TASK_INIT(&pr->pr_task, 0, prison_complete, pr);
#ifdef VIMAGE
/* Allocate a new vnet if specified. */
@@ -1311,7 +1364,7 @@
mtx_lock(&pr->pr_mtx);
/*
* New prisons do not yet have a reference, because we do not
- * want other to see the incomplete prison once the
+ * want others to see the incomplete prison once the
* allprison_lock is downgraded.
*/
} else {
@@ -1525,13 +1578,13 @@
}
#endif
onamelen = namelen = 0;
- if (name != NULL) {
+ if (namelc != NULL) {
/* Give a default name of the jid. Also allow the name to be
* explicitly the jid - but not any other number, and only in
* normal form (no leading zero/etc).
*/
- if (name[0] == '\0')
- snprintf(name = numbuf, sizeof(numbuf), "%d", jid);
+ if (namelc[0] == '\0')
+ snprintf(namelc = numbuf, sizeof(numbuf), "%d", jid);
else if ((strtoul(namelc, &p, 10) != jid ||
namelc[0] < '1' || namelc[0] > '9') && *p == '\0') {
error = EINVAL;
@@ -1543,9 +1596,10 @@
* Make sure the name isn't too long for the prison or its
* children.
*/
- onamelen = strlen(pr->pr_name);
- namelen = strlen(name);
- if (strlen(ppr->pr_name) + namelen + 2 > sizeof(pr->pr_name)) {
+ pnamelen = (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1;
+ onamelen = strlen(pr->pr_name + pnamelen);
+ namelen = strlen(namelc);
+ if (pnamelen + namelen + 1 > sizeof(pr->pr_name)) {
error = ENAMETOOLONG;
goto done_deref_locked;
}
@@ -1562,6 +1616,30 @@
goto done_deref_locked;
}
+ /*
+ * Let modules check their parameters. This requires unlocking and
+ * then re-locking the prison, but this is still a valid state as long
+ * as allprison_lock remains xlocked.
+ */
+ mtx_unlock(&pr->pr_mtx);
+ error = osd_jail_call(pr, PR_METHOD_CHECK, opts);
+ if (error != 0) {
+ prison_deref(pr, created
+ ? PD_LIST_XLOCKED
+ : PD_DEREF | PD_LIST_XLOCKED);
+ goto done_releroot;
+ }
+ mtx_lock(&pr->pr_mtx);
+
+ /* At this point, all valid parameters should have been noted. */
+ TAILQ_FOREACH(opt, opts, link) {
+ if (!opt->seen && strcmp(opt->name, "errmsg")) {
+ error = EINVAL;
+ vfs_opterror(opts, "unknown parameter: %s", opt->name);
+ goto done_deref_locked;
+ }
+ }
+
/* Set the parameters of the prison. */
#ifdef INET
redo_ip4 = 0;
@@ -1635,12 +1713,12 @@
FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
tpr->pr_devfs_rsnum = rsnum;
}
- if (name != NULL) {
+ if (namelc != NULL) {
if (ppr == &prison0)
- strlcpy(pr->pr_name, name, sizeof(pr->pr_name));
+ strlcpy(pr->pr_name, namelc, sizeof(pr->pr_name));
else
snprintf(pr->pr_name, sizeof(pr->pr_name), "%s.%s",
- ppr->pr_name, name);
+ ppr->pr_name, namelc);
/* Change this component of child names. */
FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
bcopy(tpr->pr_name + onamelen, tpr->pr_name + namelen,
@@ -1718,6 +1796,7 @@
* for now, so new ones will remain unseen until after the module
* handlers have completed.
*/
+ born = pr->pr_uref == 0;
if (!created && (ch_flags & PR_PERSIST & (pr_flags ^ pr->pr_flags))) {
if (pr_flags & PR_PERSIST) {
pr->pr_ref++;
@@ -1731,7 +1810,7 @@
mtx_unlock(&pr->pr_mtx);
#ifdef RACCT
- if (created)
+ if (racct_enable && created)
prison_racct_attach(pr);
#endif
@@ -1787,15 +1866,20 @@
/* Let the modules do their work. */
sx_downgrade(&allprison_lock);
- if (created) {
+ if (born) {
error = osd_jail_call(pr, PR_METHOD_CREATE, opts);
if (error) {
- prison_deref(pr, PD_LIST_SLOCKED);
+ (void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL);
+ prison_deref(pr, created
+ ? PD_LIST_SLOCKED
+ : PD_DEREF | PD_LIST_SLOCKED);
goto done_errmsg;
}
}
error = osd_jail_call(pr, PR_METHOD_SET, opts);
if (error) {
+ if (born)
+ (void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL);
prison_deref(pr, created
? PD_LIST_SLOCKED
: PD_DEREF | PD_LIST_SLOCKED);
@@ -1815,10 +1899,12 @@
}
#ifdef RACCT
- if (!created) {
- sx_sunlock(&allprison_lock);
+ if (racct_enable && !created) {
+ if (!(flags & JAIL_ATTACH))
+ sx_sunlock(&allprison_lock);
prison_racct_modify(pr);
- sx_slock(&allprison_lock);
+ if (!(flags & JAIL_ATTACH))
+ sx_slock(&allprison_lock);
}
#endif
@@ -1845,7 +1931,7 @@
sx_sunlock(&allprison_lock);
}
- goto done_errmsg;
+ goto done_free;
done_deref_locked:
prison_deref(pr, created
@@ -1855,26 +1941,21 @@
done_unlock_list:
sx_xunlock(&allprison_lock);
done_releroot:
- if (root != NULL) {
- vfslocked = VFS_LOCK_GIANT(root->v_mount);
+ if (root != NULL)
vrele(root);
- VFS_UNLOCK_GIANT(vfslocked);
- }
done_errmsg:
if (error) {
- vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len);
- if (errmsg_len > 0) {
+ if (vfs_getopt(opts, "errmsg", (void **)&errmsg,
+ &errmsg_len) == 0 && errmsg_len > 0) {
errmsg_pos = 2 * vfs_getopt_pos(opts, "errmsg") + 1;
- if (errmsg_pos > 0) {
- if (optuio->uio_segflg == UIO_SYSSPACE)
- bcopy(errmsg,
- optuio->uio_iov[errmsg_pos].iov_base,
- errmsg_len);
- else
- copyout(errmsg,
- optuio->uio_iov[errmsg_pos].iov_base,
- errmsg_len);
- }
+ if (optuio->uio_segflg == UIO_SYSSPACE)
+ bcopy(errmsg,
+ optuio->uio_iov[errmsg_pos].iov_base,
+ errmsg_len);
+ else
+ copyout(errmsg,
+ optuio->uio_iov[errmsg_pos].iov_base,
+ errmsg_len);
}
}
done_free:
@@ -2130,6 +2211,13 @@
error = vfs_setopt(opts, "nodying", &i, sizeof(i));
if (error != 0 && error != ENOENT)
goto done_deref;
+ error = vfs_setopt(opts, "osreldate", &pr->pr_osreldate,
+ sizeof(pr->pr_osreldate));
+ if (error != 0 && error != ENOENT)
+ goto done_deref;
+ error = vfs_setopts(opts, "osrelease", pr->pr_osrelease);
+ if (error != 0 && error != ENOENT)
+ goto done_deref;
/* Get the module parameters. */
mtx_unlock(&pr->pr_mtx);
@@ -2221,7 +2309,6 @@
/* Remove all descendants of this prison, then remove this prison. */
pr->pr_ref++;
- pr->pr_flags |= PR_REMOVE;
if (!LIST_EMPTY(&pr->pr_children)) {
mtx_unlock(&pr->pr_mtx);
lpr = NULL;
@@ -2230,7 +2317,6 @@
if (cpr->pr_ref > 0) {
tpr = cpr;
cpr->pr_ref++;
- cpr->pr_flags |= PR_REMOVE;
} else {
/* Already removed - do not do it again. */
tpr = NULL;
@@ -2314,7 +2400,14 @@
if (error)
return (error);
- sx_slock(&allprison_lock);
+ /*
+ * Start with exclusive hold on allprison_lock to ensure that a possible
+ * PR_METHOD_REMOVE call isn't concurrent with jail_set or jail_remove.
+ * But then immediately downgrade it since we don't need to stop
+ * readers.
+ */
+ sx_xlock(&allprison_lock);
+ sx_downgrade(&allprison_lock);
pr = prison_find_child(td->td_ucred->cr_prison, uap->jid);
if (pr == NULL) {
sx_sunlock(&allprison_lock);
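
jail_attach() now takes allprison_lock exclusively before downgrading, so any PR_METHOD_REMOVE running under the exclusive lock must finish before the attach proceeds. FreeBSD's sx_downgrade() converts the hold atomically; pthread rwlocks have no equivalent, so the sketch below only models the ordering idea and leaves a window between the unlock and the rdlock:

#include <pthread.h>

static pthread_rwlock_t allprison = PTHREAD_RWLOCK_INITIALIZER;

static void
attach_begin(void)
{
	/* Serialize against any in-flight exclusive-mode remove or set... */
	pthread_rwlock_wrlock(&allprison);
	/* ...then continue as a reader. NOT atomic, unlike sx_downgrade(). */
	pthread_rwlock_unlock(&allprison);
	pthread_rwlock_rdlock(&allprison);
}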
@@ -2337,10 +2430,9 @@
static int
do_jail_attach(struct thread *td, struct prison *pr)
{
- struct prison *ppr;
struct proc *p;
struct ucred *newcred, *oldcred;
- int vfslocked, error;
+ int error;
/*
* XXX: Note that there is a slight race here if two threads
@@ -2365,13 +2457,11 @@
/*
* Reparent the newly attached process to this jail.
*/
- ppr = td->td_ucred->cr_prison;
p = td->td_proc;
error = cpuset_setproc_update_set(p, pr->pr_cpuset);
if (error)
goto e_revert_osd;
- vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount);
vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY);
if ((error = change_dir(pr->pr_root, td)) != 0)
goto e_unlock;
@@ -2381,30 +2471,27 @@
#endif
VOP_UNLOCK(pr->pr_root, 0);
if ((error = change_root(pr->pr_root, td)))
- goto e_unlock_giant;
- VFS_UNLOCK_GIANT(vfslocked);
+ goto e_revert_osd;
newcred = crget();
PROC_LOCK(p);
- oldcred = p->p_ucred;
+ oldcred = crcopysafe(p, newcred);
+ newcred->cr_prison = pr;
+ proc_set_cred(p, newcred);
setsugid(p);
- crcopy(newcred, oldcred);
- newcred->cr_prison = pr;
- p->p_ucred = newcred;
PROC_UNLOCK(p);
#ifdef RACCT
racct_proc_ucred_changed(p, oldcred, newcred);
#endif
+ prison_deref(oldcred->cr_prison, PD_DEREF | PD_DEUREF);
crfree(oldcred);
- prison_deref(ppr, PD_DEREF | PD_DEUREF);
return (0);
+
e_unlock:
VOP_UNLOCK(pr->pr_root, 0);
- e_unlock_giant:
- VFS_UNLOCK_GIANT(vfslocked);
e_revert_osd:
/* Tell modules this thread is still in its old jail after all. */
- (void)osd_jail_call(ppr, PR_METHOD_ATTACH, td);
+ (void)osd_jail_call(td->td_ucred->cr_prison, PR_METHOD_ATTACH, td);
prison_deref(pr, PD_DEREF | PD_DEUREF);
return (error);
}
@@ -2513,16 +2600,13 @@
void
prison_free_locked(struct prison *pr)
{
+ int ref;
mtx_assert(&pr->pr_mtx, MA_OWNED);
- pr->pr_ref--;
- if (pr->pr_ref == 0) {
- mtx_unlock(&pr->pr_mtx);
- TASK_INIT(&pr->pr_task, 0, prison_complete, pr);
+ ref = --pr->pr_ref;
+ mtx_unlock(&pr->pr_mtx);
+ if (ref == 0)
taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
- return;
- }
- mtx_unlock(&pr->pr_mtx);
}
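
prison_free_locked() is reduced to a single unlock path: decrement under the mutex, remember the result, unlock unconditionally, and enqueue the deferred teardown only when the count hit zero (TASK_INIT having moved to prison creation in the jail_set hunk above). The same shape as a userspace sketch, with defer_destroy() standing in for taskqueue_enqueue():

#include <pthread.h>

struct obj {
	pthread_mutex_t	mtx;
	int		ref;
};

static void
defer_destroy(struct obj *o)
{
	/* stand-in for taskqueue_enqueue(taskqueue_thread, &pr->pr_task) */
	(void)o;
}

static void
obj_free_locked(struct obj *o)
{
	int ref;

	/* o->mtx is held on entry, as in prison_free_locked(). */
	ref = --o->ref;
	pthread_mutex_unlock(&o->mtx);
	if (ref == 0)
		defer_destroy(o);
}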
void
@@ -2533,11 +2617,19 @@
prison_free_locked(pr);
}
+/*
+ * Complete a call to either prison_free or prison_proc_free.
+ */
static void
prison_complete(void *context, int pending)
{
+ struct prison *pr = context;
- prison_deref((struct prison *)context, 0);
+ sx_xlock(&allprison_lock);
+ mtx_lock(&pr->pr_mtx);
+ prison_deref(pr, pr->pr_uref
+ ? PD_DEREF | PD_DEUREF | PD_LOCKED | PD_LIST_XLOCKED
+ : PD_LOCKED | PD_LIST_XLOCKED);
}
/*
@@ -2550,20 +2642,48 @@
prison_deref(struct prison *pr, int flags)
{
struct prison *ppr, *tpr;
- int vfslocked;
+ int ref, lasturef;
if (!(flags & PD_LOCKED))
mtx_lock(&pr->pr_mtx);
for (;;) {
if (flags & PD_DEUREF) {
+ KASSERT(pr->pr_uref > 0,
+ ("prison_deref PD_DEUREF on a dead prison (jid=%d)",
+ pr->pr_id));
pr->pr_uref--;
+ lasturef = pr->pr_uref == 0;
+ if (lasturef)
+ pr->pr_ref++;
KASSERT(prison0.pr_uref != 0, ("prison0 pr_uref=0"));
+ } else
+ lasturef = 0;
+ if (flags & PD_DEREF) {
+ KASSERT(pr->pr_ref > 0,
+ ("prison_deref PD_DEREF on a dead prison (jid=%d)",
+ pr->pr_id));
+ pr->pr_ref--;
}
- if (flags & PD_DEREF)
- pr->pr_ref--;
+ ref = pr->pr_ref;
+ mtx_unlock(&pr->pr_mtx);
+
+ /*
+ * Tell the modules if the last user reference was removed
+ * (even it sticks around in dying state).
+ */
+ if (lasturef) {
+ if (!(flags & (PD_LIST_SLOCKED | PD_LIST_XLOCKED))) {
+ sx_xlock(&allprison_lock);
+ flags |= PD_LIST_XLOCKED;
+ }
+ (void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL);
+ mtx_lock(&pr->pr_mtx);
+ ref = --pr->pr_ref;
+ mtx_unlock(&pr->pr_mtx);
+ }
+
/* If the prison still has references, nothing else to do. */
- if (pr->pr_ref > 0) {
- mtx_unlock(&pr->pr_mtx);
+ if (ref > 0) {
if (flags & PD_LIST_SLOCKED)
sx_sunlock(&allprison_lock);
else if (flags & PD_LIST_XLOCKED)
@@ -2571,7 +2691,6 @@
return;
}
- mtx_unlock(&pr->pr_mtx);
if (flags & PD_LIST_SLOCKED) {
if (!sx_try_upgrade(&allprison_lock)) {
sx_sunlock(&allprison_lock);
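
prison_deref() now detects the last user reference going away and pins the prison with a temporary pr_ref across the unlocked PR_METHOD_REMOVE callback, dropping the pin once the modules have been told. The reference dance in isolation, with pthread stand-ins for the prison mutex:

#include <pthread.h>
#include <stdbool.h>

struct jobj {
	pthread_mutex_t	mtx;
	int		ref;	/* structural references */
	int		uref;	/* user (alive-process) references */
};

static void
notify_removed(struct jobj *o)
{
	/* stand-in for osd_jail_call(pr, PR_METHOD_REMOVE, NULL) */
	(void)o;
}

/* Returns the remaining ref count; 0 means the caller must destroy. */
static int
jobj_deuref(struct jobj *o)
{
	bool lasturef;
	int ref;

	pthread_mutex_lock(&o->mtx);
	lasturef = (--o->uref == 0);
	if (lasturef)
		o->ref++;		/* pin across the unlocked callback */
	ref = o->ref;
	pthread_mutex_unlock(&o->mtx);

	if (lasturef) {
		notify_removed(o);	/* may sleep; no locks held */
		pthread_mutex_lock(&o->mtx);
		ref = --o->ref;		/* drop the pin */
		pthread_mutex_unlock(&o->mtx);
	}
	return (ref);
}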
@@ -2591,11 +2710,8 @@
if (pr->pr_vnet != ppr->pr_vnet)
vnet_destroy(pr->pr_vnet);
#endif
- if (pr->pr_root != NULL) {
- vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount);
+ if (pr->pr_root != NULL)
vrele(pr->pr_root);
- VFS_UNLOCK_GIANT(vfslocked);
- }
mtx_destroy(&pr->pr_mtx);
#ifdef INET
free(pr->pr_ip4, M_PRISON);
@@ -2607,7 +2723,8 @@
cpuset_rel(pr->pr_cpuset);
osd_jail_exit(pr);
#ifdef RACCT
- prison_racct_detach(pr);
+ if (racct_enable)
+ prison_racct_detach(pr);
#endif
free(pr, M_PRISON);
@@ -2655,7 +2772,20 @@
mtx_lock(&pr->pr_mtx);
KASSERT(pr->pr_uref > 0,
("Trying to kill a process in a dead prison (jid=%d)", pr->pr_id));
- prison_deref(pr, PD_DEUREF | PD_LOCKED);
+ if (pr->pr_uref > 1)
+ pr->pr_uref--;
+ else {
+ /*
+ * Don't remove the last user reference in this context, which
+ * is expected to be a process that is not only locked, but
+ * also half dead.
+ */
+ pr->pr_ref++;
+ mtx_unlock(&pr->pr_mtx);
+ taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
+ return;
+ }
+ mtx_unlock(&pr->pr_mtx);
}
@@ -3062,7 +3192,7 @@
ii++;
continue;
}
- switch (ij >= ppr->pr_ip4s ? -1 :
+ switch (ij >= ppr->pr_ip6s ? -1 :
qcmp_v6(&pr->pr_ip6[ii], &ppr->pr_ip6[ij])) {
case -1:
bcopy(pr->pr_ip6 + ii + 1, pr->pr_ip6 + ii,
@@ -3901,6 +4031,13 @@
case PRIV_VFS_SETGID:
case PRIV_VFS_STAT:
case PRIV_VFS_STICKYFILE:
+
+ /*
+ * As in the non-jail case, non-root users are expected to be
+ * able to read kernel/physical memory (provided /dev/[k]mem
+ * exists in the jail and they have permission to access it).
+ */
+ case PRIV_KMEM_READ:
return (0);
/*
@@ -3936,7 +4073,7 @@
return (0);
/*
- * Allow jailed root to set certian IPv4/6 (option) headers.
+ * Allow jailed root to set certain IPv4/6 (option) headers.
*/
case PRIV_NETINET_SETHDROPTS:
return (0);
@@ -4173,11 +4310,11 @@
#if defined(INET) || defined(INET6)
SYSCTL_UINT(_security_jail, OID_AUTO, jail_max_af_ips, CTLFLAG_RW,
&jail_max_af_ips, 0,
- "Number of IP addresses a jail may have at most per address family");
+ "Number of IP addresses a jail may have at most per address family (deprecated)");
#endif
/*
- * Default parameters for jail(2) compatability. For historical reasons,
+ * Default parameters for jail(2) compatibility. For historical reasons,
* the sysctl names have varying similarity to the parameter names. Prisons
* just see their own parameters, and can't change them.
*/
@@ -4213,43 +4350,59 @@
SYSCTL_PROC(_security_jail, OID_AUTO, set_hostname_allowed,
CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
NULL, PR_ALLOW_SET_HOSTNAME, sysctl_jail_default_allow, "I",
- "Processes in jail can set their hostnames");
+ "Processes in jail can set their hostnames (deprecated)");
SYSCTL_PROC(_security_jail, OID_AUTO, socket_unixiproute_only,
CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
(void *)1, PR_ALLOW_SOCKET_AF, sysctl_jail_default_allow, "I",
- "Processes in jail are limited to creating UNIX/IP/route sockets only");
+ "Processes in jail are limited to creating UNIX/IP/route sockets only (deprecated)");
SYSCTL_PROC(_security_jail, OID_AUTO, sysvipc_allowed,
CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
NULL, PR_ALLOW_SYSVIPC, sysctl_jail_default_allow, "I",
- "Processes in jail can use System V IPC primitives");
+ "Processes in jail can use System V IPC primitives (deprecated)");
SYSCTL_PROC(_security_jail, OID_AUTO, allow_raw_sockets,
CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
NULL, PR_ALLOW_RAW_SOCKETS, sysctl_jail_default_allow, "I",
- "Prison root can create raw sockets");
+ "Prison root can create raw sockets (deprecated)");
SYSCTL_PROC(_security_jail, OID_AUTO, chflags_allowed,
CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
NULL, PR_ALLOW_CHFLAGS, sysctl_jail_default_allow, "I",
- "Processes in jail can alter system file flags");
+ "Processes in jail can alter system file flags (deprecated)");
SYSCTL_PROC(_security_jail, OID_AUTO, mount_allowed,
CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
NULL, PR_ALLOW_MOUNT, sysctl_jail_default_allow, "I",
- "Processes in jail can mount/unmount jail-friendly file systems");
+ "Processes in jail can mount/unmount jail-friendly file systems (deprecated)");
SYSCTL_PROC(_security_jail, OID_AUTO, mount_devfs_allowed,
CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
NULL, PR_ALLOW_MOUNT_DEVFS, sysctl_jail_default_allow, "I",
- "Processes in jail can mount the devfs file system");
+ "Processes in jail can mount the devfs file system (deprecated)");
+SYSCTL_PROC(_security_jail, OID_AUTO, mount_fdescfs_allowed,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+ NULL, PR_ALLOW_MOUNT_FDESCFS, sysctl_jail_default_allow, "I",
+ "Processes in jail can mount the fdescfs file system (deprecated)");
SYSCTL_PROC(_security_jail, OID_AUTO, mount_nullfs_allowed,
CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
NULL, PR_ALLOW_MOUNT_NULLFS, sysctl_jail_default_allow, "I",
- "Processes in jail can mount the nullfs file system");
+ "Processes in jail can mount the nullfs file system (deprecated)");
SYSCTL_PROC(_security_jail, OID_AUTO, mount_procfs_allowed,
CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
NULL, PR_ALLOW_MOUNT_PROCFS, sysctl_jail_default_allow, "I",
- "Processes in jail can mount the procfs file system");
+ "Processes in jail can mount the procfs file system (deprecated)");
+SYSCTL_PROC(_security_jail, OID_AUTO, mount_linprocfs_allowed,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+ NULL, PR_ALLOW_MOUNT_LINPROCFS, sysctl_jail_default_allow, "I",
+ "Processes in jail can mount the linprocfs file system (deprecated)");
+SYSCTL_PROC(_security_jail, OID_AUTO, mount_linsysfs_allowed,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+ NULL, PR_ALLOW_MOUNT_LINSYSFS, sysctl_jail_default_allow, "I",
+ "Processes in jail can mount the linsysfs file system (deprecated)");
+SYSCTL_PROC(_security_jail, OID_AUTO, mount_tmpfs_allowed,
+ CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+ NULL, PR_ALLOW_MOUNT_TMPFS, sysctl_jail_default_allow, "I",
+ "Processes in jail can mount the tmpfs file system (deprecated)");
SYSCTL_PROC(_security_jail, OID_AUTO, mount_zfs_allowed,
CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
NULL, PR_ALLOW_MOUNT_ZFS, sysctl_jail_default_allow, "I",
- "Processes in jail can mount the zfs file system");
+ "Processes in jail can mount the zfs file system (deprecated)");
static int
sysctl_jail_default_level(SYSCTL_HANDLER_ARGS)
@@ -4270,13 +4423,13 @@
CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
&jail_default_enforce_statfs, offsetof(struct prison, pr_enforce_statfs),
sysctl_jail_default_level, "I",
- "Processes in jail cannot see all mounted file systems");
+ "Processes in jail cannot see all mounted file systems (deprecated)");
SYSCTL_PROC(_security_jail, OID_AUTO, devfs_ruleset,
CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
&jail_default_devfs_rsnum, offsetof(struct prison, pr_devfs_rsnum),
sysctl_jail_default_level, "I",
- "Ruleset for the devfs filesystem in jail");
+ "Ruleset for the devfs filesystem in jail (deprecated)");
/*
* Nodes to describe jail parameters. Maximum length of string parameters
@@ -4318,6 +4471,10 @@
return (0);
}
+/*
+ * CTLFLAG_RDTUN in the following indicates jail parameters that can be set at
+ * jail creation time but cannot be changed in an existing jail.
+ */
SYSCTL_JAIL_PARAM(, jid, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail ID");
SYSCTL_JAIL_PARAM(, parent, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail parent ID");
SYSCTL_JAIL_PARAM_STRING(, name, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail name");
@@ -4324,6 +4481,10 @@
SYSCTL_JAIL_PARAM_STRING(, path, CTLFLAG_RDTUN, MAXPATHLEN, "Jail root path");
SYSCTL_JAIL_PARAM(, securelevel, CTLTYPE_INT | CTLFLAG_RW,
"I", "Jail secure level");
+SYSCTL_JAIL_PARAM(, osreldate, CTLTYPE_INT | CTLFLAG_RDTUN, "I",
+ "Jail value for kern.osreldate and uname -K");
+SYSCTL_JAIL_PARAM_STRING(, osrelease, CTLFLAG_RDTUN, OSRELEASELEN,
+ "Jail value for kern.osrelease and uname -r");
SYSCTL_JAIL_PARAM(, enforce_statfs, CTLTYPE_INT | CTLFLAG_RW,
"I", "Jail cannot see all mounted file systems");
SYSCTL_JAIL_PARAM(, devfs_ruleset, CTLTYPE_INT | CTLFLAG_RW,
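
The new osreldate and osrelease parameters are CTLFLAG_RDTUN, so per the comment above they can only be supplied when the jail is created; this lets a jail advertise an older FreeBSD release to its processes. A hedged sketch of setting them through the jailparam(3) interface of libjail (names, path, and version are example values; error handling for init/import trimmed; link with -ljail):

#include <sys/param.h>
#include <jail.h>
#include <stdio.h>

int
main(void)
{
	struct jailparam params[4];
	int jid;

	jailparam_init(&params[0], "name");
	jailparam_import(&params[0], "legacyjail");
	jailparam_init(&params[1], "path");
	jailparam_import(&params[1], "/jails/legacy");
	jailparam_init(&params[2], "persist");
	jailparam_import(&params[2], "true");
	/* Read-only tunable: only settable at creation time. */
	jailparam_init(&params[3], "osrelease");
	jailparam_import(&params[3], "9.3-RELEASE");

	jid = jailparam_set(params, 4, JAIL_CREATE);
	if (jid == -1)
		fprintf(stderr, "jail_set: %s\n", jail_errmsg);
	else
		printf("created jail %d\n", jid);
	jailparam_free(params, 4);
	return (jid == -1);
}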
@@ -4394,13 +4555,22 @@
"B", "Jail may mount/unmount jail-friendly file systems in general");
SYSCTL_JAIL_PARAM(_allow_mount, devfs, CTLTYPE_INT | CTLFLAG_RW,
"B", "Jail may mount the devfs file system");
+SYSCTL_JAIL_PARAM(_allow_mount, fdescfs, CTLTYPE_INT | CTLFLAG_RW,
+ "B", "Jail may mount the fdescfs file system");
SYSCTL_JAIL_PARAM(_allow_mount, nullfs, CTLTYPE_INT | CTLFLAG_RW,
"B", "Jail may mount the nullfs file system");
SYSCTL_JAIL_PARAM(_allow_mount, procfs, CTLTYPE_INT | CTLFLAG_RW,
"B", "Jail may mount the procfs file system");
+SYSCTL_JAIL_PARAM(_allow_mount, linprocfs, CTLTYPE_INT | CTLFLAG_RW,
+ "B", "Jail may mount the linprocfs file system");
+SYSCTL_JAIL_PARAM(_allow_mount, linsysfs, CTLTYPE_INT | CTLFLAG_RW,
+ "B", "Jail may mount the linsysfs file system");
+SYSCTL_JAIL_PARAM(_allow_mount, tmpfs, CTLTYPE_INT | CTLFLAG_RW,
+ "B", "Jail may mount the tmpfs file system");
SYSCTL_JAIL_PARAM(_allow_mount, zfs, CTLTYPE_INT | CTLFLAG_RW,
"B", "Jail may mount the zfs file system");
+#ifdef RACCT
void
prison_racct_foreach(void (*callback)(struct racct *racct,
void *arg2, void *arg3), void *arg2, void *arg3)
@@ -4407,6 +4577,8 @@
{
struct prison_racct *prr;
+ ASSERT_RACCT_ENABLED();
+
sx_slock(&allprison_lock);
LIST_FOREACH(prr, &allprison_racct, prr_next)
(callback)(prr->prr_racct, arg2, arg3);
@@ -4418,6 +4590,7 @@
{
struct prison_racct *prr;
+ ASSERT_RACCT_ENABLED();
sx_assert(&allprison_lock, SA_XLOCKED);
if (name[0] == '\0' || strlen(name) >= MAXHOSTNAMELEN)
@@ -4448,6 +4621,8 @@
{
struct prison_racct *prr;
+ ASSERT_RACCT_ENABLED();
+
sx_xlock(&allprison_lock);
prr = prison_racct_find_locked(name);
sx_xunlock(&allprison_lock);
@@ -4458,6 +4633,8 @@
prison_racct_hold(struct prison_racct *prr)
{
+ ASSERT_RACCT_ENABLED();
+
refcount_acquire(&prr->prr_refcount);
}
@@ -4465,6 +4642,7 @@
prison_racct_free_locked(struct prison_racct *prr)
{
+ ASSERT_RACCT_ENABLED();
sx_assert(&allprison_lock, SA_XLOCKED);
if (refcount_release(&prr->prr_refcount)) {
@@ -4479,6 +4657,7 @@
{
int old;
+ ASSERT_RACCT_ENABLED();
sx_assert(&allprison_lock, SA_UNLOCKED);
old = prr->prr_refcount;
@@ -4490,12 +4669,12 @@
sx_xunlock(&allprison_lock);
}
-#ifdef RACCT
static void
prison_racct_attach(struct prison *pr)
{
struct prison_racct *prr;
+ ASSERT_RACCT_ENABLED();
sx_assert(&allprison_lock, SA_XLOCKED);
prr = prison_racct_find_locked(pr->pr_name);
@@ -4515,6 +4694,8 @@
struct ucred *cred;
struct prison_racct *oldprr;
+ ASSERT_RACCT_ENABLED();
+
sx_slock(&allproc_lock);
sx_xlock(&allprison_lock);
@@ -4554,6 +4735,7 @@
prison_racct_detach(struct prison *pr)
{
+ ASSERT_RACCT_ENABLED();
sx_assert(&allprison_lock, SA_UNLOCKED);
if (pr->pr_prison_racct == NULL)
Modified: trunk/sys/kern/kern_khelp.c
===================================================================
--- trunk/sys/kern/kern_khelp.c 2018-05-25 20:39:59 UTC (rev 9943)
+++ trunk/sys/kern/kern_khelp.c 2018-05-25 20:46:51 UTC (rev 9944)
@@ -1,5 +1,6 @@
+/* $MidnightBSD$ */
/*-
- * Copyright (c) 2010 Lawrence Stewart <lstewart at freebsd.org>
+ * Copyright (c) 2010,2013 Lawrence Stewart <lstewart at freebsd.org>
* Copyright (c) 2010 The FreeBSD Foundation
* All rights reserved.
*
@@ -35,12 +36,11 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/kern_khelp.c 251778 2013-06-15 06:45:17Z lstewart $");
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/hhook.h>
-#include <sys/jail.h>
#include <sys/khelp.h>
#include <sys/lock.h>
#include <sys/malloc.h>
@@ -52,8 +52,6 @@
#include <sys/rwlock.h>
#include <sys/systm.h>
-#include <net/vnet.h>
-
static struct rwlock khelp_list_lock;
RW_SYSINIT(khelplistlock, &khelp_list_lock, "helper list lock");
@@ -61,6 +59,7 @@
/* Private function prototypes. */
static inline void khelp_remove_osd(struct helper *h, struct osd *hosd);
+void khelp_new_hhook_registered(struct hhook_head *hhh, uint32_t flags);
#define KHELP_LIST_WLOCK() rw_wlock(&khelp_list_lock)
#define KHELP_LIST_WUNLOCK() rw_wunlock(&khelp_list_lock)
@@ -74,33 +73,32 @@
struct helper *tmph;
int error, i, inserted;
- error = 0;
- inserted = 0;
+ error = inserted = 0;
refcount_init(&h->h_refcount, 0);
h->h_id = osd_register(OSD_KHELP, NULL, NULL);
/* It's only safe to add the hooks after osd_register(). */
- if (h->h_nhooks > 0) {
- for (i = 0; i < h->h_nhooks && !error; i++) {
- /* We don't require the module to assign hook_helper. */
- h->h_hooks[i].hook_helper = h;
- error = khelp_add_hhook(&h->h_hooks[i], HHOOK_NOWAIT);
- }
-
- if (error) {
- for (i--; i >= 0; i--)
- khelp_remove_hhook(&h->h_hooks[i]);
-
- osd_deregister(OSD_KHELP, h->h_id);
- }
+ for (i = 0; i < h->h_nhooks && !error; i++) {
+ /* We don't require the module to assign hook_helper. */
+ h->h_hooks[i].hook_helper = h;
+ error = hhook_add_hook_lookup(&h->h_hooks[i], HHOOK_WAITOK);
+ if (error)
+ printf("%s: \"%s\" khelp module unable to "
+ "hook type %d id %d due to error %d\n", __func__,
+ h->h_name, h->h_hooks[i].hook_type,
+ h->h_hooks[i].hook_id, error);
}
- if (!error) {
+ if (error) {
+ for (i--; i >= 0; i--)
+ hhook_remove_hook_lookup(&h->h_hooks[i]);
+ osd_deregister(OSD_KHELP, h->h_id);
+ } else {
KHELP_LIST_WLOCK();
/*
* Keep list of helpers sorted in descending h_id order. Due to
* the way osd_set() works, a sorted list ensures
- * init_helper_osd() will operate with improved efficiency.
+ * khelp_init_osd() will operate with improved efficiency.
*/
TAILQ_FOREACH(tmph, &helpers, h_next) {
if (tmph->h_id < h->h_id) {
@@ -124,8 +122,6 @@
struct helper *tmph;
int error, i;
- error = 0;
-
KHELP_LIST_WLOCK();
if (h->h_refcount > 0)
error = EBUSY;
@@ -142,10 +138,8 @@
KHELP_LIST_WUNLOCK();
if (!error) {
- if (h->h_nhooks > 0) {
- for (i = 0; i < h->h_nhooks; i++)
- khelp_remove_hhook(&h->h_hooks[i]);
- }
+ for (i = 0; i < h->h_nhooks; i++)
+ hhook_remove_hook_lookup(&h->h_hooks[i]);
osd_deregister(OSD_KHELP, h->h_id);
}
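
khelp_register() above keeps the helper list sorted in descending h_id order so that khelp_init_osd() can walk it efficiently. The insert-before-first-smaller-or-tail idiom it uses is standard <sys/queue.h> fare; a self-contained userland sketch with a hypothetical item type:

#include <sys/queue.h>
#include <stdio.h>
#include <stdlib.h>

struct item {
	int			id;
	TAILQ_ENTRY(item)	link;
};

TAILQ_HEAD(itemlist, item);

/* Insert 'n' keeping the list sorted in descending id order. */
static void
insert_sorted(struct itemlist *head, struct item *n)
{
	struct item *it;

	TAILQ_FOREACH(it, head, link) {
		if (it->id < n->id) {
			TAILQ_INSERT_BEFORE(it, n, link);
			return;
		}
	}
	TAILQ_INSERT_TAIL(head, n, link);
}

int
main(void)
{
	struct itemlist head = TAILQ_HEAD_INITIALIZER(head);
	int ids[] = { 3, 7, 1 };
	struct item *it;

	for (size_t i = 0; i < sizeof(ids) / sizeof(ids[0]); i++) {
		it = calloc(1, sizeof(*it));
		it->id = ids[i];
		insert_sorted(&head, it);
	}
	TAILQ_FOREACH(it, &head, link)
		printf("%d ", it->id);	/* prints: 7 3 1 */
	printf("\n");
	return (0);
}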
@@ -263,29 +257,14 @@
int
khelp_add_hhook(struct hookinfo *hki, uint32_t flags)
{
- VNET_ITERATOR_DECL(vnet_iter);
int error;
- error = 0;
-
/*
- * XXXLAS: If a helper is dynamically adding a helper hook function at
- * runtime using this function, we should update the helper's h_hooks
- * struct member to include the additional hookinfo struct.
+ * XXXLAS: Should probably include the functionality to update the
+ * helper's h_hooks struct member.
*/
+ error = hhook_add_hook_lookup(hki, flags);
- VNET_LIST_RLOCK_NOSLEEP();
- VNET_FOREACH(vnet_iter) {
- CURVNET_SET(vnet_iter);
- error = hhook_add_hook_lookup(hki, flags);
- CURVNET_RESTORE();
-#ifdef VIMAGE
- if (error)
- break;
-#endif
- }
- VNET_LIST_RUNLOCK_NOSLEEP();
-
return (error);
}
@@ -292,32 +271,47 @@
int
khelp_remove_hhook(struct hookinfo *hki)
{
- VNET_ITERATOR_DECL(vnet_iter);
int error;
- error = 0;
-
/*
- * XXXLAS: If a helper is dynamically removing a helper hook function at
- * runtime using this function, we should update the helper's h_hooks
- * struct member to remove the defunct hookinfo struct.
+ * XXXLAS: Should probably include the functionality to update the
+ * helper's h_hooks struct member.
*/
+ error = hhook_remove_hook_lookup(hki);
- VNET_LIST_RLOCK_NOSLEEP();
- VNET_FOREACH(vnet_iter) {
- CURVNET_SET(vnet_iter);
- error = hhook_remove_hook_lookup(hki);
- CURVNET_RESTORE();
-#ifdef VIMAGE
- if (error)
- break;
-#endif
- }
- VNET_LIST_RUNLOCK_NOSLEEP();
-
return (error);
}
+/*
+ * Private KPI between hhook and khelp that allows khelp modules to insert hook
+ * functions into hhook points which register after the modules were loaded.
+ */
+void
+khelp_new_hhook_registered(struct hhook_head *hhh, uint32_t flags)
+{
+ struct helper *h;
+ int error, i;
+
+ KHELP_LIST_RLOCK();
+ TAILQ_FOREACH(h, &helpers, h_next) {
+ for (i = 0; i < h->h_nhooks; i++) {
+ if (hhh->hhh_type != h->h_hooks[i].hook_type ||
+ hhh->hhh_id != h->h_hooks[i].hook_id)
+ continue;
+ error = hhook_add_hook(hhh, &h->h_hooks[i], flags);
+ if (error) {
+ printf("%s: \"%s\" khelp module unable to "
+ "hook type %d id %d due to error %d\n",
+ __func__, h->h_name,
+ h->h_hooks[i].hook_type,
+ h->h_hooks[i].hook_id, error);
+ error = 0;
+ }
+ }
+ }
+ KHELP_LIST_RUNLOCK();
+}
+
int
khelp_modevent(module_t mod, int event_type, void *data)
{
@@ -377,95 +371,3 @@
return (error);
}
-
-/*
- * This function is called in two separate situations:
- *
- * - When the kernel is booting, it is called directly by the SYSINIT framework
- * to allow Khelp modules which were compiled into the kernel or loaded by the
- * boot loader to insert their non-virtualised hook functions into the kernel.
- *
- * - When the kernel is booting or a vnet is created, this function is also
- * called indirectly through khelp_vnet_init() by the vnet initialisation code.
- * In this situation, Khelp modules are able to insert their virtualised hook
- * functions into the virtualised hook points in the vnet which is being
- * initialised. In the case where the kernel is not compiled with "options
- * VIMAGE", this step is still run once at boot, but the hook functions get
- * transparently inserted into the standard unvirtualised network stack.
- */
-static void
-khelp_init(const void *vnet)
-{
- struct helper *h;
- int error, i, vinit;
- int32_t htype, hid;
-
- error = 0;
- vinit = vnet != NULL;
-
- KHELP_LIST_RLOCK();
- TAILQ_FOREACH(h, &helpers, h_next) {
- for (i = 0; i < h->h_nhooks && !error; i++) {
- htype = h->h_hooks[i].hook_type;
- hid = h->h_hooks[i].hook_id;
-
- /*
- * If we're doing a virtualised init (vinit != 0) and
- * the hook point is virtualised, or we're doing a plain
- * sysinit at boot and the hook point is not
- * virtualised, insert the hook.
- */
- if ((hhook_head_is_virtualised_lookup(htype, hid) ==
- HHOOK_HEADISINVNET && vinit) ||
- (!hhook_head_is_virtualised_lookup(htype, hid) &&
- !vinit)) {
- error = hhook_add_hook_lookup(&h->h_hooks[i],
- HHOOK_NOWAIT);
- }
- }
-
- if (error) {
- /* Remove any helper's hooks we successfully added. */
- for (i--; i >= 0; i--)
- hhook_remove_hook_lookup(&h->h_hooks[i]);
-
- printf("%s: Failed to add hooks for helper \"%s\" (%p)",
- __func__, h->h_name, h);
- if (vinit)
- printf(" to vnet %p.\n", vnet);
- else
- printf(".\n");
-
- error = 0;
- }
- }
- KHELP_LIST_RUNLOCK();
-}
-
-/*
- * Vnet created and being initialised.
- */
-static void
-khelp_vnet_init(const void *unused __unused)
-{
-
- khelp_init(TD_TO_VNET(curthread));
-}
-
-
-/*
- * As the kernel boots, allow Khelp modules which were compiled into the kernel
- * or loaded by the boot loader to insert their non-virtualised hook functions
- * into the kernel.
- */
-SYSINIT(khelp_init, SI_SUB_PROTO_END, SI_ORDER_FIRST, khelp_init, NULL);
-
-/*
- * When a vnet is created and being initialised, we need to insert the helper
- * hook functions for all currently registered Khelp modules into the vnet's
- * helper hook points. The hhook KPI provides a mechanism for subsystems which
- * export helper hook points to clean up on vnet shutdown, so we don't need a
- * VNET_SYSUNINIT for Khelp.
- */
-VNET_SYSINIT(khelp_vnet_init, SI_SUB_PROTO_END, SI_ORDER_FIRST,
- khelp_vnet_init, NULL);
Modified: trunk/sys/kern/kern_kthread.c
===================================================================
--- trunk/sys/kern/kern_kthread.c 2018-05-25 20:39:59 UTC (rev 9943)
+++ trunk/sys/kern/kern_kthread.c 2018-05-25 20:46:51 UTC (rev 9944)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 1999 Peter Wemm <peter at FreeBSD.org>
* All rights reserved.
@@ -25,7 +26,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/kern_kthread.c 304905 2016-08-27 11:45:05Z kib $");
#include <sys/param.h>
#include <sys/systm.h>
@@ -38,6 +39,7 @@
#include <sys/rwlock.h>
#include <sys/signalvar.h>
#include <sys/sx.h>
+#include <sys/umtx.h>
#include <sys/unistd.h>
#include <sys/wait.h>
#include <sys/sched.h>
@@ -257,12 +259,8 @@
panic("kthread_add called too soon");
/* If no process supplied, put it on proc0 */
- if (p == NULL) {
+ if (p == NULL)
p = &proc0;
- oldtd = &thread0;
- } else {
- oldtd = FIRST_THREAD_IN_PROC(p);
- }
/* Initialize our new td */
newtd = thread_alloc(pages);
@@ -269,9 +267,13 @@
if (newtd == NULL)
return (ENOMEM);
+ PROC_LOCK(p);
+ oldtd = FIRST_THREAD_IN_PROC(p);
+
bzero(&newtd->td_startzero,
__rangeof(struct thread, td_startzero, td_endzero));
-/* XXX check if we should zero. */
+ newtd->td_su = NULL;
+ newtd->td_sleeptimo = 0;
bcopy(&oldtd->td_startcopy, &newtd->td_startcopy,
__rangeof(struct thread, td_startcopy, td_endcopy));
@@ -293,9 +295,7 @@
newtd->td_ucred = crhold(p->p_ucred);
/* this code almost the same as create_thread() in kern_thr.c */
- PROC_LOCK(p);
p->p_flag |= P_HADTHREADS;
- newtd->td_sigmask = oldtd->td_sigmask; /* XXX dubious */
thread_link(newtd, p);
thread_lock(oldtd);
/* let the scheduler know about these things. */
@@ -324,11 +324,13 @@
kthread_exit(void)
{
struct proc *p;
+ struct thread *td;
- p = curthread->td_proc;
+ td = curthread;
+ p = td->td_proc;
/* A module may be waiting for us to exit. */
- wakeup(curthread);
+ wakeup(td);
/*
* The last exiting thread in a kernel process must tear down
@@ -341,8 +343,10 @@
rw_wunlock(&tidhash_lock);
kproc_exit(0);
}
- LIST_REMOVE(curthread, td_hash);
+ LIST_REMOVE(td, td_hash);
rw_wunlock(&tidhash_lock);
+ umtx_thread_exit(td);
+ tdsigcleanup(td);
PROC_SLOCK(p);
thread_exit();
}
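
For context, kthread_exit() is the self-termination half of the kthread(9) KPI this file implements; the wakeup(td) above is what a module's unload path sleeps on until the thread is gone. A hedged sketch of a module-side user of the KPI (hypothetical names; kernel-module code, not userland):

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/proc.h>

static struct thread *worker_td;
static volatile int worker_stop;

static void
worker_main(void *arg __unused)
{

	while (!worker_stop)
		pause("wrkr", hz);	/* periodic work would go here */
	kthread_exit();			/* wakes sleepers on this thread */
}

static int
worker_start(void)
{

	/* p == NULL attaches the thread to proc0; 0 pages = default stack. */
	return (kthread_add(worker_main, NULL, NULL, &worker_td,
	    0, 0, "worker"));
}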
Modified: trunk/sys/kern/kern_ktr.c
===================================================================
--- trunk/sys/kern/kern_ktr.c 2018-05-25 20:39:59 UTC (rev 9943)
+++ trunk/sys/kern/kern_ktr.c 2018-05-25 20:46:51 UTC (rev 9944)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
/*-
* Copyright (c) 2000 John Baldwin <jhb at FreeBSD.org>
* All rights reserved.
@@ -10,9 +11,6 @@
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
- * 3. Neither the name of the author nor the names of any co-contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
@@ -33,7 +31,7 @@
*/
#include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/kern_ktr.c 293853 2016-01-13 21:38:52Z marius $");
#include "opt_ddb.h"
#include "opt_ktr.h"
@@ -47,15 +45,16 @@
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/libkern.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
#include <sys/proc.h>
+#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/time.h>
#include <machine/cpu.h>
-#ifdef __sparc64__
-#include <machine/ktr.h>
-#endif
#ifdef DDB
#include <ddb/ddb.h>
@@ -62,14 +61,25 @@
#include <ddb/db_output.h>
#endif
+#ifndef KTR_BOOT_ENTRIES
+#define KTR_BOOT_ENTRIES 1024
+#endif
+
#ifndef KTR_ENTRIES
#define KTR_ENTRIES 1024
#endif
+/* Limit the allocations to something manageable. */
+#define KTR_ENTRIES_MAX (8 * 1024 * 1024)
+
#ifndef KTR_MASK
#define KTR_MASK (0)
#endif
+#ifndef KTR_CPUMASK
+#define KTR_CPUMASK CPUSET_FSET
+#endif
+
#ifndef KTR_TIME
#define KTR_TIME get_cyclecount()
#endif
@@ -78,41 +88,36 @@
#define KTR_CPU PCPU_GET(cpuid)
#endif
+static MALLOC_DEFINE(M_KTR, "KTR", "KTR");
+
FEATURE(ktr, "Kernel support for KTR kernel tracing facility");
-static SYSCTL_NODE(_debug, OID_AUTO, ktr, CTLFLAG_RD, 0, "KTR options");
+volatile int ktr_idx = 0;
+int ktr_mask = KTR_MASK;
+int ktr_compile = KTR_COMPILE;
+int ktr_entries = KTR_BOOT_ENTRIES;
+int ktr_version = KTR_VERSION;
+struct ktr_entry ktr_buf_init[KTR_BOOT_ENTRIES];
+struct ktr_entry *ktr_buf = ktr_buf_init;
+cpuset_t ktr_cpumask = CPUSET_T_INITIALIZER(KTR_CPUMASK);
+static char ktr_cpumask_str[CPUSETBUFSIZ];
-int ktr_mask = KTR_MASK;
TUNABLE_INT("debug.ktr.mask", &ktr_mask);
-SYSCTL_INT(_debug_ktr, OID_AUTO, mask, CTLFLAG_RW,
- &ktr_mask, 0, "Bitmask of KTR event classes for which logging is enabled");
-int ktr_compile = KTR_COMPILE;
-SYSCTL_INT(_debug_ktr, OID_AUTO, compile, CTLFLAG_RD,
- &ktr_compile, 0, "Bitmask of KTR event classes compiled into the kernel");
+TUNABLE_STR("debug.ktr.cpumask", ktr_cpumask_str, sizeof(ktr_cpumask_str));
-int ktr_entries = KTR_ENTRIES;
-SYSCTL_INT(_debug_ktr, OID_AUTO, entries, CTLFLAG_RD,
- &ktr_entries, 0, "Number of entries in the KTR buffer");
+static SYSCTL_NODE(_debug, OID_AUTO, ktr, CTLFLAG_RD, 0, "KTR options");
-int ktr_version = KTR_VERSION;
SYSCTL_INT(_debug_ktr, OID_AUTO, version, CTLFLAG_RD,
&ktr_version, 0, "Version of the KTR interface");
-cpuset_t ktr_cpumask;
-static char ktr_cpumask_str[CPUSETBUFSIZ];
-TUNABLE_STR("debug.ktr.cpumask", ktr_cpumask_str, sizeof(ktr_cpumask_str));
+SYSCTL_UINT(_debug_ktr, OID_AUTO, compile, CTLFLAG_RD,
+ &ktr_compile, 0, "Bitmask of KTR event classes compiled into the kernel");
static void
ktr_cpumask_initializer(void *dummy __unused)
{
- CPU_FILL(&ktr_cpumask);
-#ifdef KTR_CPUMASK
- if (cpusetobj_strscan(&ktr_cpumask, KTR_CPUMASK) == -1)
- CPU_FILL(&ktr_cpumask);
-#endif
-
/*
* TUNABLE_STR() runs with SI_ORDER_MIDDLE priority, thus it must be
* already set, if necessary.
@@ -147,9 +152,6 @@
sysctl_debug_ktr_cpumask, "S",
"Bitmask of CPUs on which KTR logging is enabled");
-volatile int ktr_idx = 0;
-struct ktr_entry ktr_buf[KTR_ENTRIES];
-
static int
sysctl_debug_ktr_clear(SYSCTL_HANDLER_ARGS)
{
@@ -161,7 +163,7 @@
return (error);
if (clear) {
- bzero(ktr_buf, sizeof(ktr_buf));
+ bzero(ktr_buf, sizeof(*ktr_buf) * ktr_entries);
ktr_idx = 0;
}
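
This bzero change is load-bearing: ktr_buf used to be a fixed array, so sizeof(ktr_buf) covered the whole buffer, but now that it is a pointer to a (possibly reallocated) buffer, sizeof would yield just the pointer size and the clear would silently truncate. A minimal userland illustration of the difference:

#include <stdio.h>
#include <stdlib.h>

struct entry { int a, b; };

int
main(void)
{
	struct entry arr[1024];
	struct entry *ptr = malloc(1024 * sizeof(*ptr));

	/* For an array, sizeof gives the whole buffer... */
	printf("sizeof(arr) = %zu\n", sizeof(arr));	/* 8192 */
	/* ...for a pointer, only the pointer itself. */
	printf("sizeof(ptr) = %zu\n", sizeof(ptr));	/* 8 on LP64 */
	/* Correct: scale the element size by the element count. */
	printf("buffer size = %zu\n", sizeof(*ptr) * 1024);
	free(ptr);
	return (0);
}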
@@ -170,6 +172,94 @@
SYSCTL_PROC(_debug_ktr, OID_AUTO, clear, CTLTYPE_INT|CTLFLAG_RW, 0, 0,
sysctl_debug_ktr_clear, "I", "Clear KTR Buffer");
+/*
+ * This is a sysctl proc so that it is serialized as !MPSAFE along with
+ * the other ktr sysctl procs.
+ */
+static int
+sysctl_debug_ktr_mask(SYSCTL_HANDLER_ARGS)
+{
+ int mask, error;
+
+ mask = ktr_mask;
+ error = sysctl_handle_int(oidp, &mask, 0, req);
+ if (error || !req->newptr)
+ return (error);
+ ktr_mask = mask;
+ return (error);
+}
+
+SYSCTL_PROC(_debug_ktr, OID_AUTO, mask, CTLTYPE_UINT|CTLFLAG_RW, 0, 0,
+ sysctl_debug_ktr_mask, "IU",
+ "Bitmask of KTR event classes for which logging is enabled");
+
+#if KTR_ENTRIES > KTR_BOOT_ENTRIES
+/*
+ * A simplified version of sysctl_debug_ktr_entries.
+ * No need to care about SMP, scheduling, etc.
+ */
+static void
+ktr_entries_initializer(void *dummy __unused)
+{
+ int mask;
+
+ /* Temporarily disable ktr in case malloc() is being traced. */
+ mask = ktr_mask;
+ ktr_mask = 0;
+ ktr_buf = malloc(sizeof(*ktr_buf) * KTR_ENTRIES, M_KTR,
+ M_WAITOK | M_ZERO);
+ memcpy(ktr_buf, ktr_buf_init + ktr_idx,
+ (KTR_BOOT_ENTRIES - ktr_idx) * sizeof(*ktr_buf));
+ if (ktr_idx != 0)
+ memcpy(ktr_buf + KTR_BOOT_ENTRIES - ktr_idx, ktr_buf_init,
+ ktr_idx * sizeof(*ktr_buf));
+ ktr_entries = KTR_ENTRIES;
+ ktr_mask = mask;
+}
+SYSINIT(ktr_entries_initializer, SI_SUB_KMEM, SI_ORDER_ANY,
+ ktr_entries_initializer, NULL);
+#endif
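
ktr_entries_initializer() linearizes the wrapped boot-time ring: entries from the current index through the end of the static buffer are copied first, then the wrapped prefix, so the oldest record lands in slot 0 of the new buffer. The same two-memcpy unwrap in a standalone sketch:

#include <stdio.h>
#include <string.h>

#define N 8

int
main(void)
{
	int ring[N] = { 6, 7, 0, 1, 2, 3, 4, 5 };	/* wrapped */
	int idx = 2;	/* next write position; ring[idx] is oldest */
	int flat[N];

	/* Oldest..end of the array first, then the wrapped prefix. */
	memcpy(flat, ring + idx, (N - idx) * sizeof(*ring));
	if (idx != 0)
		memcpy(flat + N - idx, ring, idx * sizeof(*ring));

	for (int i = 0; i < N; i++)
		printf("%d ", flat[i]);	/* 0 1 2 3 4 5 6 7 */
	printf("\n");
	return (0);
}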
+
+static int
+sysctl_debug_ktr_entries(SYSCTL_HANDLER_ARGS)
+{
+ int entries, error, mask;
+ struct ktr_entry *buf, *oldbuf;
+
+ entries = ktr_entries;
+ error = sysctl_handle_int(oidp, &entries, 0, req);
+ if (error || !req->newptr)
+ return (error);
+ if (entries > KTR_ENTRIES_MAX)
+ return (ERANGE);
+ /* Disable ktr temporarily. */
+ mask = ktr_mask;
+ atomic_store_rel_int(&ktr_mask, 0);
+ /* Wait for threads to go idle. */
+ if ((error = quiesce_all_cpus("ktrent", PCATCH)) != 0) {
+ ktr_mask = mask;
+ return (error);
+ }
+ if (ktr_buf != ktr_buf_init)
+ oldbuf = ktr_buf;
+ else
+ oldbuf = NULL;
+ /* Allocate a new buffer. */
+ buf = malloc(sizeof(*buf) * entries, M_KTR, M_WAITOK | M_ZERO);
+ /* Install the new buffer and restart ktr. */
+ ktr_buf = buf;
+ ktr_entries = entries;
+ ktr_idx = 0;
+ atomic_store_rel_int(&ktr_mask, mask);
+ if (oldbuf != NULL)
+ free(oldbuf, M_KTR);
+
+ return (error);
+}
+
+SYSCTL_PROC(_debug_ktr, OID_AUTO, entries, CTLTYPE_INT|CTLFLAG_RW, 0, 0,
+ sysctl_debug_ktr_entries, "I", "Number of entries in the KTR buffer");
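+
With this handler the trace buffer becomes resizable at runtime (capped at KTR_ENTRIES_MAX): tracing is masked off, quiesce_all_cpus() waits for the other CPUs to go idle, and only then is the buffer swapped. From userland it is an ordinary integer sysctl write, e.g.:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int entries = 65536;

	/* Grow debug.ktr.entries; needs root and a KTR-enabled kernel. */
	if (sysctlbyname("debug.ktr.entries", NULL, NULL,
	    &entries, sizeof(entries)) == -1) {
		perror("sysctlbyname");
		return (1);
	}
	return (0);
}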
+
#ifdef KTR_VERBOSE
int ktr_verbose = KTR_VERBOSE;
TUNABLE_INT("debug.ktr.verbose", &ktr_verbose);
@@ -251,7 +341,7 @@
if (panicstr)
return;
- if ((ktr_mask & mask) == 0)
+ if ((ktr_mask & mask) == 0 || ktr_buf == NULL)
return;
cpu = KTR_CPU;
if (!CPU_ISSET(cpu, &ktr_cpumask))
@@ -283,7 +373,7 @@
{
do {
saveindex = ktr_idx;
- newindex = (saveindex + 1) % KTR_ENTRIES;
+ newindex = (saveindex + 1) % ktr_entries;
} while (atomic_cmpset_rel_int(&ktr_idx, saveindex, newindex) == 0);
entry = &ktr_buf[saveindex];
}
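
The tracepoint reserves its slot lock-free: read the index, compute the successor modulo the now-variable ktr_entries, and retry the compare-and-set until no other CPU has raced in. The same loop expressed with C11 atomics:

#include <stdatomic.h>
#include <stdio.h>

static _Atomic int ring_idx;
static int ring_entries = 1024;

/* Reserve one slot; safe against concurrent callers. */
static int
reserve_slot(void)
{
	int saveindex, newindex;

	do {
		saveindex = atomic_load(&ring_idx);
		newindex = (saveindex + 1) % ring_entries;
	} while (!atomic_compare_exchange_weak(&ring_idx,
	    &saveindex, newindex));
	return (saveindex);
}

int
main(void)
{

	printf("slot %d\n", reserve_slot());
	printf("slot %d\n", reserve_slot());
	return (0);
}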
@@ -338,12 +428,12 @@
DB_SHOW_COMMAND(ktr, db_ktr_all)
{
- tstate.cur = (ktr_idx - 1) % KTR_ENTRIES;
+ tstate.cur = (ktr_idx - 1) % ktr_entries;
tstate.first = -1;
db_ktr_verbose = 0;
- db_ktr_verbose |= (index(modif, 'v') != NULL) ? 2 : 0;
- db_ktr_verbose |= (index(modif, 'V') != NULL) ? 1 : 0; /* just timestap please */
- if (index(modif, 'a') != NULL) {
+ db_ktr_verbose |= (strchr(modif, 'v') != NULL) ? 2 : 0;
+ db_ktr_verbose |= (strchr(modif, 'V') != NULL) ? 1 : 0; /* just timestamp please */
+ if (strchr(modif, 'a') != NULL) {
db_disable_pager();
while (cncheckc() != -1)
if (db_mach_vtrace() == 0)
@@ -360,7 +450,7 @@
{
struct ktr_entry *kp;
- if (tstate.cur == tstate.first) {
+ if (tstate.cur == tstate.first || ktr_buf == NULL) {
db_printf("--- End of trace buffer ---\n");
return (0);
}
@@ -392,7 +482,7 @@
tstate.first = tstate.cur;
if (--tstate.cur < 0)
- tstate.cur = KTR_ENTRIES - 1;
+ tstate.cur = ktr_entries - 1;
return (1);
}