[Midnightbsd-cvs] src [9950] trunk/sys/kern: sync with freebsd

laffer1 at midnightbsd.org
Fri May 25 17:07:10 EDT 2018


Revision: 9950
          http://svnweb.midnightbsd.org/src/?rev=9950
Author:   laffer1
Date:     2018-05-25 17:07:09 -0400 (Fri, 25 May 2018)
Log Message:
-----------
sync with freebsd

Modified Paths:
--------------
    trunk/sys/kern/kern_time.c
    trunk/sys/kern/kern_timeout.c
    trunk/sys/kern/kern_umtx.c
    trunk/sys/kern/kern_uuid.c
    trunk/sys/kern/ksched.c
    trunk/sys/kern/link_elf.c
    trunk/sys/kern/link_elf_obj.c
    trunk/sys/kern/md4c.c
    trunk/sys/kern/md5c.c
    trunk/sys/kern/p1003_1b.c
    trunk/sys/kern/posix4_mib.c
    trunk/sys/kern/sched_4bsd.c
    trunk/sys/kern/sched_ule.c
    trunk/sys/kern/stack_protector.c
    trunk/sys/kern/subr_acl_nfs4.c
    trunk/sys/kern/subr_blist.c
    trunk/sys/kern/subr_bufring.c
    trunk/sys/kern/subr_bus.c
    trunk/sys/kern/subr_clock.c
    trunk/sys/kern/subr_devstat.c
    trunk/sys/kern/subr_disk.c
    trunk/sys/kern/subr_dummy_vdso_tc.c
    trunk/sys/kern/subr_eventhandler.c
    trunk/sys/kern/subr_fattime.c
    trunk/sys/kern/subr_firmware.c
    trunk/sys/kern/subr_hash.c
    trunk/sys/kern/subr_hints.c
    trunk/sys/kern/subr_kdb.c
    trunk/sys/kern/subr_kobj.c
    trunk/sys/kern/subr_lock.c
    trunk/sys/kern/subr_log.c
    trunk/sys/kern/subr_mbpool.c
    trunk/sys/kern/subr_mchain.c
    trunk/sys/kern/subr_module.c
    trunk/sys/kern/subr_msgbuf.c

Added Paths:
-----------
    trunk/sys/kern/subr_bus_dma.c
    trunk/sys/kern/subr_busdma_bufalloc.c
    trunk/sys/kern/subr_counter.c
    trunk/sys/kern/subr_dnvlist.c
    trunk/sys/kern/subr_nvlist.c
    trunk/sys/kern/subr_nvpair.c

Modified: trunk/sys/kern/kern_time.c
===================================================================
--- trunk/sys/kern/kern_time.c	2018-05-25 20:59:46 UTC (rev 9949)
+++ trunk/sys/kern/kern_time.c	2018-05-25 21:07:09 UTC (rev 9950)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
@@ -30,8 +31,10 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/kern_time.c 330422 2018-03-04 23:31:25Z bdrewery $");
 
+#include "opt_ktrace.h"
+
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/limits.h>
@@ -43,6 +46,7 @@
 #include <sys/resourcevar.h>
 #include <sys/signalvar.h>
 #include <sys/kernel.h>
+#include <sys/sleepqueue.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
@@ -53,11 +57,20 @@
 #include <sys/timers.h>
 #include <sys/timetc.h>
 #include <sys/vnode.h>
+#ifdef KTRACE
+#include <sys/ktrace.h>
+#endif
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 
 #define MAX_CLOCKS 	(CLOCK_MONOTONIC+1)
+#define CPUCLOCK_BIT		0x80000000
+#define CPUCLOCK_PROCESS_BIT	0x40000000
+#define CPUCLOCK_ID_MASK	(~(CPUCLOCK_BIT|CPUCLOCK_PROCESS_BIT))
+#define MAKE_THREAD_CPUCLOCK(tid)	(CPUCLOCK_BIT|(tid))
+#define MAKE_PROCESS_CPUCLOCK(pid)	\
+	(CPUCLOCK_BIT|CPUCLOCK_PROCESS_BIT|(pid))
 
 static struct kclock	posix_clocks[MAX_CLOCKS];
 static uma_zone_t	itimer_zone = NULL;
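
For context on the new macros above: the high bit marks a clockid_t as a CPU clock (and makes it negative as an int, which is how kern_clock_gettime() routes such IDs later in this diff), bit 30 selects process versus thread, and the low 30 bits carry the pid or tid. A standalone sketch of the round trip (userland demo only, not part of the commit):

    #include <stdio.h>
    #include <stdint.h>

    #define CPUCLOCK_BIT		0x80000000
    #define CPUCLOCK_PROCESS_BIT	0x40000000
    #define CPUCLOCK_ID_MASK	(~(CPUCLOCK_BIT|CPUCLOCK_PROCESS_BIT))

    int
    main(void)
    {
        /* Encode a process CPU clock for pid 1234, then decode it
         * the way get_cputime() in this diff does. */
        uint32_t clk = CPUCLOCK_BIT | CPUCLOCK_PROCESS_BIT | 1234;

        printf("%s clock, id %u, negative as int: %d\n",
            (clk & CPUCLOCK_PROCESS_BIT) ? "process" : "thread",
            clk & CPUCLOCK_ID_MASK, (int32_t)clk < 0);
        return (0);
    }
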
@@ -91,9 +104,6 @@
 static int	realtimer_delete(struct itimer *);
 static void	realtimer_clocktime(clockid_t, struct timespec *);
 static void	realtimer_expire(void *);
-static int	kern_timer_create(struct thread *, clockid_t,
-			struct sigevent *, int *, int);
-static int	kern_timer_delete(struct thread *, int);
 
 int		register_posix_clock(int, struct kclock *);
 void		itimer_fire(struct itimer *it);
@@ -165,6 +175,57 @@
 }
 
 #ifndef _SYS_SYSPROTO_H_
+struct clock_getcpuclockid2_args {
+	id_t id;
+	int which;
+	clockid_t *clock_id;
+};
+#endif
+/* ARGSUSED */
+int
+sys_clock_getcpuclockid2(struct thread *td, struct clock_getcpuclockid2_args *uap)
+{
+	clockid_t clk_id;
+	int error;
+
+	error = kern_clock_getcpuclockid2(td, uap->id, uap->which, &clk_id);
+	if (error == 0)
+		error = copyout(&clk_id, uap->clock_id, sizeof(clockid_t));
+	return (error);
+}
+
+int
+kern_clock_getcpuclockid2(struct thread *td, id_t id, int which,
+    clockid_t *clk_id)
+{
+	struct proc *p;
+	pid_t pid;
+	lwpid_t tid;
+	int error;
+
+	switch (which) {
+	case CPUCLOCK_WHICH_PID:
+		if (id != 0) {
+			error = pget(id, PGET_CANSEE | PGET_NOTID, &p);
+			if (error != 0)
+				return (error);
+			PROC_UNLOCK(p);
+			pid = id;
+		} else {
+			pid = td->td_proc->p_pid;
+		}
+		*clk_id = MAKE_PROCESS_CPUCLOCK(pid);
+		return (0);
+	case CPUCLOCK_WHICH_TID:
+		tid = id == 0 ? td->td_tid : id;
+		*clk_id = MAKE_THREAD_CPUCLOCK(tid);
+		return (0);
+	default:
+		return (EINVAL);
+	}
+}
+
+#ifndef _SYS_SYSPROTO_H_
 struct clock_gettime_args {
 	clockid_t clock_id;
 	struct	timespec *tp;
@@ -184,12 +245,80 @@
 	return (error);
 }
 
+static inline void 
+cputick2timespec(uint64_t runtime, struct timespec *ats)
+{
+	runtime = cputick2usec(runtime);
+	ats->tv_sec = runtime / 1000000;
+	ats->tv_nsec = runtime % 1000000 * 1000;
+}
+
+static void
+get_thread_cputime(struct thread *targettd, struct timespec *ats)
+{
+	uint64_t runtime, curtime, switchtime;
+
+	if (targettd == NULL) { /* current thread */
+		critical_enter();
+		switchtime = PCPU_GET(switchtime);
+		curtime = cpu_ticks();
+		runtime = curthread->td_runtime;
+		critical_exit();
+		runtime += curtime - switchtime;
+	} else {
+		thread_lock(targettd);
+		runtime = targettd->td_runtime;
+		thread_unlock(targettd);
+	}
+	cputick2timespec(runtime, ats);
+}
+
+static void
+get_process_cputime(struct proc *targetp, struct timespec *ats)
+{
+	uint64_t runtime;
+	struct rusage ru;
+
+	PROC_STATLOCK(targetp);
+	rufetch(targetp, &ru);
+	runtime = targetp->p_rux.rux_runtime;
+	PROC_STATUNLOCK(targetp);
+	cputick2timespec(runtime, ats);
+}
+
+static int
+get_cputime(struct thread *td, clockid_t clock_id, struct timespec *ats)
+{
+	struct proc *p, *p2;
+	struct thread *td2;
+	lwpid_t tid;
+	pid_t pid;
+	int error;
+
+	p = td->td_proc;
+	if ((clock_id & CPUCLOCK_PROCESS_BIT) == 0) {
+		tid = clock_id & CPUCLOCK_ID_MASK;
+		td2 = tdfind(tid, p->p_pid);
+		if (td2 == NULL)
+			return (EINVAL);
+		get_thread_cputime(td2, ats);
+		PROC_UNLOCK(td2->td_proc);
+	} else {
+		pid = clock_id & CPUCLOCK_ID_MASK;
+		error = pget(pid, PGET_CANSEE, &p2);
+		if (error != 0)
+			return (EINVAL);
+		get_process_cputime(p2, ats);
+		PROC_UNLOCK(p2);
+	}
+	return (0);
+}
+
 int
 kern_clock_gettime(struct thread *td, clockid_t clock_id, struct timespec *ats)
 {
 	struct timeval sys, user;
 	struct proc *p;
-	uint64_t runtime, curtime, switchtime;
 
 	p = td->td_proc;
 	switch (clock_id) {
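
The cputick2timespec() helper added above leans on C operator precedence: % and * bind equally and associate left to right, so its last line computes (runtime % 1000000) * 1000. A quick standalone check of the microseconds-to-timespec split (demo only):

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
        uint64_t usec = 2500001;          /* e.g. a cputick2usec() result */
        long sec = usec / 1000000;
        long nsec = usec % 1000000 * 1000;  /* == (usec % 1000000) * 1000 */

        printf("%ld.%09ld\n", sec, nsec);   /* prints 2.500001000 */
        return (0);
    }
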
@@ -202,17 +331,17 @@
 		break;
 	case CLOCK_VIRTUAL:
 		PROC_LOCK(p);
-		PROC_SLOCK(p);
+		PROC_STATLOCK(p);
 		calcru(p, &user, &sys);
-		PROC_SUNLOCK(p);
+		PROC_STATUNLOCK(p);
 		PROC_UNLOCK(p);
 		TIMEVAL_TO_TIMESPEC(&user, ats);
 		break;
 	case CLOCK_PROF:
 		PROC_LOCK(p);
-		PROC_SLOCK(p);
+		PROC_STATLOCK(p);
 		calcru(p, &user, &sys);
-		PROC_SUNLOCK(p);
+		PROC_STATUNLOCK(p);
 		PROC_UNLOCK(p);
 		timevaladd(&user, &sys);
 		TIMEVAL_TO_TIMESPEC(&user, ats);
@@ -232,17 +361,17 @@
 		ats->tv_nsec = 0;
 		break;
 	case CLOCK_THREAD_CPUTIME_ID:
-		critical_enter();
-		switchtime = PCPU_GET(switchtime);
-		curtime = cpu_ticks();
-		runtime = td->td_runtime;
-		critical_exit();
-		runtime = cputick2usec(runtime + curtime - switchtime);
-		ats->tv_sec = runtime / 1000000;
-		ats->tv_nsec = runtime % 1000000 * 1000;
+		get_thread_cputime(NULL, ats);
 		break;
+	case CLOCK_PROCESS_CPUTIME_ID:
+		PROC_LOCK(p);
+		get_process_cputime(p, ats);
+		PROC_UNLOCK(p);
+		break;
 	default:
-		return (EINVAL);
+		if ((int)clock_id >= 0)
+			return (EINVAL);
+		return (get_cputime(td, clock_id, ats));
 	}
 	return (0);
 }
@@ -336,6 +465,8 @@
 		ts->tv_nsec = 0;
 		break;
 	case CLOCK_THREAD_CPUTIME_ID:
+	case CLOCK_PROCESS_CPUTIME_ID:
+	cputime:
 		/* sync with cputick2usec */
 		ts->tv_nsec = 1000000 / cpu_tickrate();
 		if (ts->tv_nsec == 0)
@@ -342,18 +473,21 @@
 			ts->tv_nsec = 1000;
 		break;
 	default:
+		if ((int)clock_id < 0)
+			goto cputime;
 		return (EINVAL);
 	}
 	return (0);
 }
 
-static int nanowait;
+static uint8_t nanowait[MAXCPU];
 
 int
 kern_nanosleep(struct thread *td, struct timespec *rqt, struct timespec *rmt)
 {
-	struct timespec ts, ts2, ts3;
-	struct timeval tv;
+	struct timespec ts;
+	sbintime_t sbt, sbtt, prec, tmp;
+	time_t over;
 	int error;
 
 	if (rqt->tv_nsec < 0 || rqt->tv_nsec >= 1000000000)
@@ -360,30 +494,37 @@
 		return (EINVAL);
 	if (rqt->tv_sec < 0 || (rqt->tv_sec == 0 && rqt->tv_nsec == 0))
 		return (0);
-	getnanouptime(&ts);
-	timespecadd(&ts, rqt);
-	TIMESPEC_TO_TIMEVAL(&tv, rqt);
-	for (;;) {
-		error = tsleep(&nanowait, PWAIT | PCATCH, "nanslp",
-		    tvtohz(&tv));
-		getnanouptime(&ts2);
-		if (error != EWOULDBLOCK) {
-			if (error == ERESTART)
-				error = EINTR;
-			if (rmt != NULL) {
-				timespecsub(&ts, &ts2);
-				if (ts.tv_sec < 0)
-					timespecclear(&ts);
-				*rmt = ts;
-			}
-			return (error);
+	ts = *rqt;
+	if (ts.tv_sec > INT32_MAX / 2) {
+		over = ts.tv_sec - INT32_MAX / 2;
+		ts.tv_sec -= over;
+	} else
+		over = 0;
+	tmp = tstosbt(ts);
+	prec = tmp;
+	prec >>= tc_precexp;
+	if (TIMESEL(&sbt, tmp))
+		sbt += tc_tick_sbt;
+	sbt += tmp;
+	error = tsleep_sbt(&nanowait[curcpu], PWAIT | PCATCH, "nanslp",
+	    sbt, prec, C_ABSOLUTE);
+	if (error != EWOULDBLOCK) {
+		if (error == ERESTART)
+			error = EINTR;
+		if (TIMESEL(&sbtt, tmp))
+			sbtt += tc_tick_sbt;
+		if (rmt != NULL) {
+			ts = sbttots(sbt - sbtt);
+			ts.tv_sec += over;
+			if (ts.tv_sec < 0)
+				timespecclear(&ts);
+			*rmt = ts;
 		}
-		if (timespeccmp(&ts2, &ts, >=))
+		if (sbtt >= sbt)
 			return (0);
-		ts3 = ts;
-		timespecsub(&ts3, &ts2);
-		TIMESPEC_TO_TIMEVAL(&tv, &ts3);
+		return (error);
 	}
+	return (0);
 }
 
 #ifndef _SYS_SYSPROTO_H_
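
sbintime_t, on which the rewritten kern_nanosleep() above is built, is a signed 64-bit count of seconds in 32.32 fixed point, so "prec = tmp >> tc_precexp" scales the permitted wakeup slop with the length of the sleep (tc_precexp is derived from the kern.timecounter.alloweddeviation sysctl). The same format explains the DIAGNOSTIC change further down in kern_timeout.c, where the old 64-bit bintime fraction 36893488147419102 (0.002 * 2^64) becomes simply 2 * SBT_1MS. A simplified conversion sketch (the kernel's tstosbt() rounds more carefully):

    #include <stdio.h>
    #include <stdint.h>

    typedef int64_t sbintime_t;         /* 32.32 fixed-point seconds */

    static sbintime_t
    ts_to_sbt(long sec, long nsec)
    {
        return (((sbintime_t)sec << 32) +
            ((sbintime_t)nsec << 32) / 1000000000);
    }

    int
    main(void)
    {
        int tc_precexp = 5;                         /* assumed tuning */
        sbintime_t sbt = ts_to_sbt(1, 500000000);   /* 1.5 s */

        /* Allowed slop scales with the request: here 1.5 s / 32. */
        printf("sbt %#jx prec %#jx\n", (uintmax_t)sbt,
            (uintmax_t)(sbt >> tc_precexp));
        return (0);
    }
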
@@ -407,7 +548,7 @@
 	    !useracc((caddr_t)uap->rmtp, sizeof(rmt), VM_PROT_WRITE))
 			return (EFAULT);
 	error = kern_nanosleep(td, &rqt, &rmt);
-	if (error && uap->rmtp) {
+	if (error == EINTR && uap->rmtp) {
 		int error2;
 
 		error2 = copyout(&rmt, uap->rmtp, sizeof(rmt));
@@ -554,7 +695,7 @@
 		*aitv = p->p_realtimer;
 		PROC_UNLOCK(p);
 		if (timevalisset(&aitv->it_value)) {
-			getmicrouptime(&ctv);
+			microuptime(&ctv);
 			if (timevalcmp(&aitv->it_value, &ctv, <))
 				timevalclear(&aitv->it_value);
 			else
@@ -561,10 +702,14 @@
 				timevalsub(&aitv->it_value, &ctv);
 		}
 	} else {
-		PROC_SLOCK(p);
+		PROC_ITIMLOCK(p);
 		*aitv = p->p_stats->p_timer[which];
-		PROC_SUNLOCK(p);
+		PROC_ITIMUNLOCK(p);
 	}
+#ifdef KTRACE
+	if (KTRPOINT(td, KTR_STRUCT))
+		ktritimerval(aitv);
+#endif
 	return (0);
 }
 
@@ -599,6 +744,7 @@
 {
 	struct proc *p = td->td_proc;
 	struct timeval ctv;
+	sbintime_t sbt, pr;
 
 	if (aitv == NULL)
 		return (kern_getitimer(td, which, oitv));
@@ -605,11 +751,17 @@
 
 	if (which > ITIMER_PROF)
 		return (EINVAL);
-	if (itimerfix(&aitv->it_value))
+#ifdef KTRACE
+	if (KTRPOINT(td, KTR_STRUCT))
+		ktritimerval(aitv);
+#endif
+	if (itimerfix(&aitv->it_value) ||
+	    aitv->it_value.tv_sec > INT32_MAX / 2)
 		return (EINVAL);
 	if (!timevalisset(&aitv->it_value))
 		timevalclear(&aitv->it_interval);
-	else if (itimerfix(&aitv->it_interval))
+	else if (itimerfix(&aitv->it_interval) ||
+	    aitv->it_interval.tv_sec > INT32_MAX / 2)
 		return (EINVAL);
 
 	if (which == ITIMER_REAL) {
@@ -616,11 +768,13 @@
 		PROC_LOCK(p);
 		if (timevalisset(&p->p_realtimer.it_value))
 			callout_stop(&p->p_itcallout);
-		getmicrouptime(&ctv);
+		microuptime(&ctv);
 		if (timevalisset(&aitv->it_value)) {
-			callout_reset(&p->p_itcallout, tvtohz(&aitv->it_value),
-			    realitexpire, p);
+			pr = tvtosbt(aitv->it_value) >> tc_precexp;
 			timevaladd(&aitv->it_value, &ctv);
+			sbt = tvtosbt(aitv->it_value);
+			callout_reset_sbt(&p->p_itcallout, sbt, pr,
+			    realitexpire, p, C_ABSOLUTE);
 		}
 		*oitv = p->p_realtimer;
 		p->p_realtimer = *aitv;
@@ -632,11 +786,23 @@
 				timevalsub(&oitv->it_value, &ctv);
 		}
 	} else {
-		PROC_SLOCK(p);
+		if (aitv->it_interval.tv_sec == 0 &&
+		    aitv->it_interval.tv_usec != 0 &&
+		    aitv->it_interval.tv_usec < tick)
+			aitv->it_interval.tv_usec = tick;
+		if (aitv->it_value.tv_sec == 0 &&
+		    aitv->it_value.tv_usec != 0 &&
+		    aitv->it_value.tv_usec < tick)
+			aitv->it_value.tv_usec = tick;
+		PROC_ITIMLOCK(p);
 		*oitv = p->p_stats->p_timer[which];
 		p->p_stats->p_timer[which] = *aitv;
-		PROC_SUNLOCK(p);
+		PROC_ITIMUNLOCK(p);
 	}
+#ifdef KTRACE
+	if (KTRPOINT(td, KTR_STRUCT))
+		ktritimerval(oitv);
+#endif
 	return (0);
 }
 
@@ -656,7 +822,8 @@
 realitexpire(void *arg)
 {
 	struct proc *p;
-	struct timeval ctv, ntv;
+	struct timeval ctv;
+	sbintime_t isbt;
 
 	p = (struct proc *)arg;
 	kern_psignal(p, SIGALRM);
@@ -666,19 +833,17 @@
 			wakeup(&p->p_itcallout);
 		return;
 	}
-	for (;;) {
+	isbt = tvtosbt(p->p_realtimer.it_interval);
+	if (isbt >= sbt_timethreshold)
+		getmicrouptime(&ctv);
+	else
+		microuptime(&ctv);
+	do {
 		timevaladd(&p->p_realtimer.it_value,
 		    &p->p_realtimer.it_interval);
-		getmicrouptime(&ctv);
-		if (timevalcmp(&p->p_realtimer.it_value, &ctv, >)) {
-			ntv = p->p_realtimer.it_value;
-			timevalsub(&ntv, &ctv);
-			callout_reset(&p->p_itcallout, tvtohz(&ntv) - 1,
-			    realitexpire, p);
-			return;
-		}
-	}
-	/*NOTREACHED*/
+	} while (timevalcmp(&p->p_realtimer.it_value, &ctv, <=));
+	callout_reset_sbt(&p->p_itcallout, tvtosbt(p->p_realtimer.it_value),
+	    isbt >> tc_precexp, realitexpire, p, C_ABSOLUTE);
 }
 
 /*
@@ -693,8 +858,9 @@
 
 	if (tv->tv_sec < 0 || tv->tv_usec < 0 || tv->tv_usec >= 1000000)
 		return (EINVAL);
-	if (tv->tv_sec == 0 && tv->tv_usec != 0 && tv->tv_usec < tick)
-		tv->tv_usec = tick;
+	if (tv->tv_sec == 0 && tv->tv_usec != 0 &&
+	    tv->tv_usec < (u_int)tick / 16)
+		tv->tv_usec = (u_int)tick / 16;
 	return (0);
 }
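
The rewritten realitexpire() above drops the old reschedule-and-retry loop: it now steps it_value forward by it_interval until the deadline lies in the future, then arms a single absolute (C_ABSOLUTE) callout with a precision of interval >> tc_precexp. The catch-up step in miniature (integer milliseconds, demo only):

    #include <stdio.h>

    int
    main(void)
    {
        /* The timer fired late: now = 10700 ms, it_value was 10000 ms,
         * interval 300 ms.  Advance until the value is in the future,
         * as the do/while in realitexpire() does. */
        int now = 10700, value = 10000, interval = 300;

        do
            value += interval;
        while (value <= now);
        printf("next expiry at %d ms\n", value);    /* 10900 */
        return (0);
    }
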
 
@@ -835,7 +1001,7 @@
 		return (maxpps != 0);
 	} else {
 		(*curpps)++;		/* NB: ignore potential overflow */
-		return (maxpps < 0 || *curpps < maxpps);
+		return (maxpps < 0 || *curpps <= maxpps);
 	}
 }
 
@@ -922,31 +1088,30 @@
 int
 sys_ktimer_create(struct thread *td, struct ktimer_create_args *uap)
 {
-	struct sigevent *evp1, ev;
+	struct sigevent *evp, ev;
 	int id;
 	int error;
 
-	if (uap->evp != NULL) {
+	if (uap->evp == NULL) {
+		evp = NULL;
+	} else {
 		error = copyin(uap->evp, &ev, sizeof(ev));
 		if (error != 0)
 			return (error);
-		evp1 = &ev;
-	} else
-		evp1 = NULL;
-
-	error = kern_timer_create(td, uap->clock_id, evp1, &id, -1);
-
+		evp = &ev;
+	}
+	error = kern_ktimer_create(td, uap->clock_id, evp, &id, -1);
 	if (error == 0) {
 		error = copyout(&id, uap->timerid, sizeof(int));
 		if (error != 0)
-			kern_timer_delete(td, id);
+			kern_ktimer_delete(td, id);
 	}
 	return (error);
 }
 
-static int
-kern_timer_create(struct thread *td, clockid_t clock_id,
-	struct sigevent *evp, int *timerid, int preset_id)
+int
+kern_ktimer_create(struct thread *td, clockid_t clock_id, struct sigevent *evp,
+    int *timerid, int preset_id)
 {
 	struct proc *p = td->td_proc;
 	struct itimer *it;
@@ -1061,7 +1226,8 @@
 int
 sys_ktimer_delete(struct thread *td, struct ktimer_delete_args *uap)
 {
-	return (kern_timer_delete(td, uap->timerid));
+
+	return (kern_ktimer_delete(td, uap->timerid));
 }
 
 static struct itimer *
@@ -1083,8 +1249,8 @@
 	return (it);
 }
 
-static int
-kern_timer_delete(struct thread *td, int timerid)
+int
+kern_ktimer_delete(struct thread *td, int timerid)
 {
 	struct proc *p = td->td_proc;
 	struct itimer *it;
@@ -1126,8 +1292,6 @@
 int
 sys_ktimer_settime(struct thread *td, struct ktimer_settime_args *uap)
 {
-	struct proc *p = td->td_proc;
-	struct itimer *it;
 	struct itimerspec val, oval, *ovalp;
 	int error;
 
@@ -1134,27 +1298,34 @@
 	error = copyin(uap->value, &val, sizeof(val));
 	if (error != 0)
 		return (error);
-	
-	if (uap->ovalue != NULL)
-		ovalp = &oval;
-	else
-		ovalp = NULL;
+	ovalp = uap->ovalue != NULL ? &oval : NULL;
+	error = kern_ktimer_settime(td, uap->timerid, uap->flags, &val, ovalp);
+	if (error == 0 && uap->ovalue != NULL)
+		error = copyout(ovalp, uap->ovalue, sizeof(*ovalp));
+	return (error);
+}
 
+int
+kern_ktimer_settime(struct thread *td, int timer_id, int flags,
+    struct itimerspec *val, struct itimerspec *oval)
+{
+	struct proc *p;
+	struct itimer *it;
+	int error;
+
+	p = td->td_proc;
 	PROC_LOCK(p);
-	if (uap->timerid < 3 ||
-	    (it = itimer_find(p, uap->timerid)) == NULL) {
+	if (timer_id < 3 || (it = itimer_find(p, timer_id)) == NULL) {
 		PROC_UNLOCK(p);
 		error = EINVAL;
 	} else {
 		PROC_UNLOCK(p);
 		itimer_enter(it);
-		error = CLOCK_CALL(it->it_clockid, timer_settime,
-				(it, uap->flags, &val, ovalp));
+		error = CLOCK_CALL(it->it_clockid, timer_settime, (it,
+		    flags, val, oval));
 		itimer_leave(it);
 		ITIMER_UNLOCK(it);
 	}
-	if (error == 0 && uap->ovalue != NULL)
-		error = copyout(ovalp, uap->ovalue, sizeof(*ovalp));
 	return (error);
 }
 
@@ -1167,26 +1338,34 @@
 int
 sys_ktimer_gettime(struct thread *td, struct ktimer_gettime_args *uap)
 {
-	struct proc *p = td->td_proc;
-	struct itimer *it;
 	struct itimerspec val;
 	int error;
 
+	error = kern_ktimer_gettime(td, uap->timerid, &val);
+	if (error == 0)
+		error = copyout(&val, uap->value, sizeof(val));
+	return (error);
+}
+
+int
+kern_ktimer_gettime(struct thread *td, int timer_id, struct itimerspec *val)
+{
+	struct proc *p;
+	struct itimer *it;
+	int error;
+
+	p = td->td_proc;
 	PROC_LOCK(p);
-	if (uap->timerid < 3 ||
-	   (it = itimer_find(p, uap->timerid)) == NULL) {
+	if (timer_id < 3 || (it = itimer_find(p, timer_id)) == NULL) {
 		PROC_UNLOCK(p);
 		error = EINVAL;
 	} else {
 		PROC_UNLOCK(p);
 		itimer_enter(it);
-		error = CLOCK_CALL(it->it_clockid, timer_gettime,
-				(it, &val));
+		error = CLOCK_CALL(it->it_clockid, timer_gettime, (it, val));
 		itimer_leave(it);
 		ITIMER_UNLOCK(it);
 	}
-	if (error == 0)
-		error = copyout(&val, uap->value, sizeof(val));
 	return (error);
 }
 
@@ -1198,13 +1377,20 @@
 int
 sys_ktimer_getoverrun(struct thread *td, struct ktimer_getoverrun_args *uap)
 {
+
+	return (kern_ktimer_getoverrun(td, uap->timerid));
+}
+
+int
+kern_ktimer_getoverrun(struct thread *td, int timer_id)
+{
 	struct proc *p = td->td_proc;
 	struct itimer *it;
 	int error ;
 
 	PROC_LOCK(p);
-	if (uap->timerid < 3 ||
-	    (it = itimer_find(p, uap->timerid)) == NULL) {
+	if (timer_id < 3 ||
+	    (it = itimer_find(p, timer_id)) == NULL) {
 		PROC_UNLOCK(p);
 		error = EINVAL;
 	} else {
@@ -1481,7 +1667,7 @@
 			panic("unhandled event");
 		for (; i < TIMER_MAX; ++i) {
 			if ((it = its->its_timers[i]) != NULL)
-				kern_timer_delete(curthread, i);
+				kern_ktimer_delete(curthread, i);
 		}
 		if (its->its_timers[0] == NULL &&
 		    its->its_timers[1] == NULL &&
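
A side effect of the kern_timer_* to kern_ktimer_* renames running through this file: the POSIX timer operations are no longer static, so other kernel code (compat layers, for instance) can call them directly. A hypothetical consumer, written against the kern_ktimer_settime() signature added above:

    /* Hypothetical in-kernel caller (e.g. a 32-bit compat syscall);
     * timerid and the itimerspec values would come from the compat
     * layer's own copyin/conversion step. */
    static int
    compat_timer_settime(struct thread *td, int timerid,
        struct itimerspec *val, struct itimerspec *oval)
    {
        return (kern_ktimer_settime(td, timerid, 0, val, oval));
    }
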

Modified: trunk/sys/kern/kern_timeout.c
===================================================================
--- trunk/sys/kern/kern_timeout.c	2018-05-25 20:59:46 UTC (rev 9949)
+++ trunk/sys/kern/kern_timeout.c	2018-05-25 21:07:09 UTC (rev 9950)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1982, 1986, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
@@ -35,15 +36,20 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/kern_timeout.c 305853 2016-09-16 00:14:26Z hiren $");
 
+#include "opt_callout_profiling.h"
 #include "opt_kdtrace.h"
+#include "opt_ddb.h"
+#if defined(__arm__)
+#include "opt_timer.h"
+#endif
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/callout.h>
-#include <sys/condvar.h>
+#include <sys/file.h>
 #include <sys/interrupt.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
@@ -56,18 +62,24 @@
 #include <sys/sysctl.h>
 #include <sys/smp.h>
 
+#ifdef DDB
+#include <ddb/ddb.h>
+#include <machine/_inttypes.h>
+#endif
+
 #ifdef SMP
 #include <machine/cpu.h>
 #endif
 
+#ifndef NO_EVENTTIMERS
+DPCPU_DECLARE(sbintime_t, hardclocktime);
+#endif
+
 SDT_PROVIDER_DEFINE(callout_execute);
-SDT_PROBE_DEFINE(callout_execute, kernel, , callout_start, callout-start);
-SDT_PROBE_ARGTYPE(callout_execute, kernel, , callout_start, 0,
-    "struct callout *");
-SDT_PROBE_DEFINE(callout_execute, kernel, , callout_end, callout-end); 
-SDT_PROBE_ARGTYPE(callout_execute, kernel, , callout_end, 0,
-    "struct callout *");
+SDT_PROBE_DEFINE1(callout_execute, , , callout__start, "struct callout *");
+SDT_PROBE_DEFINE1(callout_execute, , , callout__end, "struct callout *");
 
+#ifdef CALLOUT_PROFILING
 static int avg_depth;
 SYSCTL_INT(_debug, OID_AUTO, to_avg_depth, CTLFLAG_RD, &avg_depth, 0,
     "Average number of items examined per softclock call. Units = 1/1000");
@@ -80,65 +92,85 @@
 static int avg_mpcalls;
 SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls, CTLFLAG_RD, &avg_mpcalls, 0,
     "Average number of MP callouts made per softclock call. Units = 1/1000");
+static int avg_depth_dir;
+SYSCTL_INT(_debug, OID_AUTO, to_avg_depth_dir, CTLFLAG_RD, &avg_depth_dir, 0,
+    "Average number of direct callouts examined per callout_process call. "
+    "Units = 1/1000");
+static int avg_lockcalls_dir;
+SYSCTL_INT(_debug, OID_AUTO, to_avg_lockcalls_dir, CTLFLAG_RD,
+    &avg_lockcalls_dir, 0, "Average number of lock direct callouts made per "
+    "callout_process call. Units = 1/1000");
+static int avg_mpcalls_dir;
+SYSCTL_INT(_debug, OID_AUTO, to_avg_mpcalls_dir, CTLFLAG_RD, &avg_mpcalls_dir,
+    0, "Average number of MP direct callouts made per callout_process call. "
+    "Units = 1/1000");
+#endif
+
+static int ncallout;
+SYSCTL_INT(_kern, OID_AUTO, ncallout, CTLFLAG_RDTUN, &ncallout, 0,
+    "Number of entries in callwheel and size of timeout() preallocation");
+
 /*
  * TODO:
  *	allocate more timeout table slots when table overflows.
  */
-int callwheelsize, callwheelbits, callwheelmask;
+u_int callwheelsize, callwheelmask;
 
 /*
- * The callout cpu migration entity represents informations necessary for
- * describing the migrating callout to the new callout cpu.
+ * The callout cpu exec entities represent informations necessary for
+ * describing the state of callouts currently running on the CPU and the ones
+ * necessary for migrating callouts to the new callout cpu. In particular,
+ * the first entry of the array cc_exec_entity holds informations for callout
+ * running in SWI thread context, while the second one holds informations
+ * for callout running directly from hardware interrupt context.
  * The cached informations are very important for deferring migration when
  * the migrating callout is already running.
  */
-struct cc_mig_ent {
+struct cc_exec {
+	struct callout		*cc_curr;
 #ifdef SMP
-	void	(*ce_migration_func)(void *);
-	void	*ce_migration_arg;
-	int	ce_migration_cpu;
-	int	ce_migration_ticks;
+	void			(*ce_migration_func)(void *);
+	void			*ce_migration_arg;
+	int			ce_migration_cpu;
+	sbintime_t		ce_migration_time;
+	sbintime_t		ce_migration_prec;
 #endif
+	bool			cc_cancel;
+	bool			cc_waiting;
 };
-	
+
 /*
  * There is one struct callout_cpu per cpu, holding all relevant
  * state for the callout processing thread on the individual CPU.
- * In particular:
- *	cc_ticks is incremented once per tick in callout_cpu().
- *	It tracks the global 'ticks' but in a way that the individual
- *	threads should not worry about races in the order in which
- *	hardclock() and hardclock_cpu() run on the various CPUs.
- *	cc_softclock is advanced in callout_cpu() to point to the
- *	first entry in cc_callwheel that may need handling. In turn,
- *	a softclock() is scheduled so it can serve the various entries i
- *	such that cc_softclock <= i <= cc_ticks .
- *	XXX maybe cc_softclock and cc_ticks should be volatile ?
- *
- *	cc_ticks is also used in callout_reset_cpu() to determine
- *	when the callout should be served.
  */
 struct callout_cpu {
-	struct cc_mig_ent	cc_migrating_entity;
-	struct mtx		cc_lock;
+	struct mtx_padalign	cc_lock;
+	struct cc_exec 		cc_exec_entity[2];
+	struct callout		*cc_next;
 	struct callout		*cc_callout;
-	struct callout_tailq	*cc_callwheel;
-	struct callout_list	cc_callfree;
-	struct callout		*cc_next;
-	struct callout		*cc_curr;
+	struct callout_list	*cc_callwheel;
+	struct callout_tailq	cc_expireq;
+	struct callout_slist	cc_callfree;
+	sbintime_t		cc_firstevent;
+	sbintime_t		cc_lastscan;
 	void			*cc_cookie;
-	int 			cc_ticks;
-	int 			cc_softticks;
-	int			cc_cancel;
-	int			cc_waiting;
-	int 			cc_firsttick;
+	u_int			cc_bucket;
+	u_int			cc_inited;
+	char			cc_ktr_event_name[20];
 };
 
+#define	callout_migrating(c)	((c)->c_iflags & CALLOUT_DFRMIGRATION)
+
+#define	cc_exec_curr(cc, dir)		cc->cc_exec_entity[dir].cc_curr
+#define	cc_exec_next(cc)		cc->cc_next
+#define	cc_exec_cancel(cc, dir)		cc->cc_exec_entity[dir].cc_cancel
+#define	cc_exec_waiting(cc, dir)	cc->cc_exec_entity[dir].cc_waiting
 #ifdef SMP
-#define	cc_migration_func	cc_migrating_entity.ce_migration_func
-#define	cc_migration_arg	cc_migrating_entity.ce_migration_arg
-#define	cc_migration_cpu	cc_migrating_entity.ce_migration_cpu
-#define	cc_migration_ticks	cc_migrating_entity.ce_migration_ticks
+#define	cc_migration_func(cc, dir)	cc->cc_exec_entity[dir].ce_migration_func
+#define	cc_migration_arg(cc, dir)	cc->cc_exec_entity[dir].ce_migration_arg
+#define	cc_migration_cpu(cc, dir)	cc->cc_exec_entity[dir].ce_migration_cpu
+#define	cc_migration_time(cc, dir)	cc->cc_exec_entity[dir].ce_migration_time
+#define	cc_migration_prec(cc, dir)	cc->cc_exec_entity[dir].ce_migration_prec
 
 struct callout_cpu cc_cpu[MAXCPU];
 #define	CPUBLOCK	MAXCPU
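
The struct reshuffle above is the core of this sync: each CPU's callout state now carries two independent execution slots, indexed by a "direct" flag (0 for callouts run from the softclock SWI thread, 1 for callouts run straight from hardware interrupt context), with the cc_exec_*() macros hiding the indexing. Reduced to a toy (names mirror the diff; demo only):

    #include <stdbool.h>
    #include <stdio.h>

    struct cc_exec {
        void    *cc_curr;       /* callout now executing, if any */
        bool    cc_cancel;
        bool    cc_waiting;
    };

    struct callout_cpu {
        struct cc_exec cc_exec_entity[2];   /* [0] SWI, [1] direct */
    };

    #define cc_exec_curr(cc, dir)   ((cc)->cc_exec_entity[dir].cc_curr)

    int
    main(void)
    {
        struct callout_cpu cc = { 0 };

        cc_exec_curr(&cc, 1) = (void *)0xc0ffee;    /* direct slot busy */
        printf("direct busy: %d, SWI busy: %d\n",
            cc_exec_curr(&cc, 1) != NULL, cc_exec_curr(&cc, 0) != NULL);
        return (0);
    }
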
@@ -154,39 +186,49 @@
 #define	CC_LOCK_ASSERT(cc)	mtx_assert(&(cc)->cc_lock, MA_OWNED)
 
 static int timeout_cpu;
-void (*callout_new_inserted)(int cpu, int ticks) = NULL;
 
+static void	callout_cpu_init(struct callout_cpu *cc, int cpu);
+static void	softclock_call_cc(struct callout *c, struct callout_cpu *cc,
+#ifdef CALLOUT_PROFILING
+		    int *mpcalls, int *lockcalls, int *gcalls,
+#endif
+		    int direct);
+
 static MALLOC_DEFINE(M_CALLOUT, "callout", "Callout datastructures");
 
 /**
  * Locked by cc_lock:
- *   cc_curr         - If a callout is in progress, it is curr_callout.
- *                     If curr_callout is non-NULL, threads waiting in
+ *   cc_curr         - If a callout is in progress, it is cc_curr.
+ *                     If cc_curr is non-NULL, threads waiting in
  *                     callout_drain() will be woken up as soon as the
  *                     relevant callout completes.
- *   cc_cancel       - Changing to 1 with both callout_lock and c_lock held
+ *   cc_cancel       - Changing to 1 with both callout_lock and cc_lock held
  *                     guarantees that the current callout will not run.
  *                     The softclock() function sets this to 0 before it
  *                     drops callout_lock to acquire c_lock, and it calls
  *                     the handler only if curr_cancelled is still 0 after
- *                     c_lock is successfully acquired.
+ *                     cc_lock is successfully acquired.
  *   cc_waiting      - If a thread is waiting in callout_drain(), then
  *                     callout_wait is nonzero.  Set only when
- *                     curr_callout is non-NULL.
+ *                     cc_curr is non-NULL.
  */
 
 /*
- * Resets the migration entity tied to a specific callout cpu.
+ * Resets the execution entity tied to a specific callout cpu.
  */
 static void
-cc_cme_cleanup(struct callout_cpu *cc)
+cc_cce_cleanup(struct callout_cpu *cc, int direct)
 {
 
+	cc_exec_curr(cc, direct) = NULL;
+	cc_exec_cancel(cc, direct) = false;
+	cc_exec_waiting(cc, direct) = false;
 #ifdef SMP
-	cc->cc_migration_cpu = CPUBLOCK;
-	cc->cc_migration_ticks = 0;
-	cc->cc_migration_func = NULL;
-	cc->cc_migration_arg = NULL;
+	cc_migration_cpu(cc, direct) = CPUBLOCK;
+	cc_migration_time(cc, direct) = 0;
+	cc_migration_prec(cc, direct) = 0;
+	cc_migration_func(cc, direct) = NULL;
+	cc_migration_arg(cc, direct) = NULL;
 #endif
 }
 
@@ -194,11 +236,11 @@
  * Checks if migration is requested by a specific callout cpu.
  */
 static int
-cc_cme_migrating(struct callout_cpu *cc)
+cc_cce_migrating(struct callout_cpu *cc, int direct)
 {
 
 #ifdef SMP
-	return (cc->cc_migration_cpu != CPUBLOCK);
+	return (cc_migration_cpu(cc, direct) != CPUBLOCK);
 #else
 	return (0);
 #endif
@@ -205,36 +247,50 @@
 }
 
 /*
- * kern_timeout_callwheel_alloc() - kernel low level callwheel initialization 
- *
- *	This code is called very early in the kernel initialization sequence,
- *	and may be called more then once.
+ * Kernel low level callwheel initialization
+ * called on cpu0 during kernel startup.
  */
-caddr_t
-kern_timeout_callwheel_alloc(caddr_t v)
+static void
+callout_callwheel_init(void *dummy)
 {
 	struct callout_cpu *cc;
 
-	timeout_cpu = PCPU_GET(cpuid);
-	cc = CC_CPU(timeout_cpu);
 	/*
-	 * Calculate callout wheel size
+	 * Calculate the size of the callout wheel and the preallocated
+	 * timeout() structures.
+	 * XXX: Clip callout to result of previous function of maxusers
+	 * maximum 384.  This is still huge, but acceptable.
 	 */
-	for (callwheelsize = 1, callwheelbits = 0;
-	     callwheelsize < ncallout;
-	     callwheelsize <<= 1, ++callwheelbits)
-		;
+	memset(CC_CPU(0), 0, sizeof(cc_cpu));
+	ncallout = imin(16 + maxproc + maxfiles, 18508);
+	TUNABLE_INT_FETCH("kern.ncallout", &ncallout);
+
+	/*
+	 * Calculate callout wheel size, should be next power of two higher
+	 * than 'ncallout'.
+	 */
+	callwheelsize = 1 << fls(ncallout);
 	callwheelmask = callwheelsize - 1;
 
-	cc->cc_callout = (struct callout *)v;
-	v = (caddr_t)(cc->cc_callout + ncallout);
-	cc->cc_callwheel = (struct callout_tailq *)v;
-	v = (caddr_t)(cc->cc_callwheel + callwheelsize);
-	return(v);
+	/*
+	 * Only cpu0 handles timeout(9) and receives a preallocation.
+	 *
+	 * XXX: Once all timeout(9) consumers are converted this can
+	 * be removed.
+	 */
+	timeout_cpu = PCPU_GET(cpuid);
+	cc = CC_CPU(timeout_cpu);
+	cc->cc_callout = malloc(ncallout * sizeof(struct callout),
+	    M_CALLOUT, M_WAITOK);
+	callout_cpu_init(cc, timeout_cpu);
 }
+SYSINIT(callwheel_init, SI_SUB_CPU, SI_ORDER_ANY, callout_callwheel_init, NULL);
 
+/*
+ * Initialize the per-cpu callout structures.
+ */
 static void
-callout_cpu_init(struct callout_cpu *cc)
+callout_cpu_init(struct callout_cpu *cc, int cpu)
 {
 	struct callout *c;
 	int i;
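
The tunable-driven sizing above replaces the old power-of-two loop: ncallout defaults to 16 + maxproc + maxfiles, clipped at 18508 (the value reached at maxusers 384), and the wheel gets the next power of two above it; fls() returns the 1-based index of the most significant set bit. Worked through with assumed autotuned values (maxproc and maxfiles scale with installed RAM):

    #include <stdio.h>

    /* Userland stand-in for the kernel's fls(). */
    static int
    fls_(int x)
    {
        int b = 0;

        while (x != 0) {
            b++;
            x >>= 1;
        }
        return (b);
    }

    int
    main(void)
    {
        int maxproc = 6164, maxfiles = 12328;   /* assumed values */
        int ncallout = 16 + maxproc + maxfiles;

        if (ncallout > 18508)
            ncallout = 18508;
        /* 18508 -> 1 << 15 = 32768 */
        printf("ncallout %d, callwheelsize %d\n", ncallout,
            1 << fls_(ncallout));
        return (0);
    }
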
@@ -241,16 +297,23 @@
 
 	mtx_init(&cc->cc_lock, "callout", NULL, MTX_SPIN | MTX_RECURSE);
 	SLIST_INIT(&cc->cc_callfree);
-	for (i = 0; i < callwheelsize; i++) {
-		TAILQ_INIT(&cc->cc_callwheel[i]);
-	}
-	cc_cme_cleanup(cc);
-	if (cc->cc_callout == NULL)
+	cc->cc_inited = 1;
+	cc->cc_callwheel = malloc(sizeof(struct callout_list) * callwheelsize,
+	    M_CALLOUT, M_WAITOK);
+	for (i = 0; i < callwheelsize; i++)
+		LIST_INIT(&cc->cc_callwheel[i]);
+	TAILQ_INIT(&cc->cc_expireq);
+	cc->cc_firstevent = SBT_MAX;
+	for (i = 0; i < 2; i++)
+		cc_cce_cleanup(cc, i);
+	snprintf(cc->cc_ktr_event_name, sizeof(cc->cc_ktr_event_name),
+	    "callwheel cpu %d", cpu);
+	if (cc->cc_callout == NULL)	/* Only cpu0 handles timeout(9) */
 		return;
 	for (i = 0; i < ncallout; i++) {
 		c = &cc->cc_callout[i];
 		callout_init(c, 0);
-		c->c_flags = CALLOUT_LOCAL_ALLOC;
+		c->c_iflags = CALLOUT_LOCAL_ALLOC;
 		SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle);
 	}
 }
@@ -286,19 +349,6 @@
 #endif
 
 /*
- * kern_timeout_callwheel_init() - initialize previously reserved callwheel
- *				   space.
- *
- *	This code is called just once, after the space reserved for the
- *	callout wheel has been finalized.
- */
-void
-kern_timeout_callwheel_init(void)
-{
-	callout_cpu_init(CC_CPU(timeout_cpu));
-}
-
-/*
  * Start standard softclock thread.
  */
 static void
@@ -318,78 +368,159 @@
 		if (cpu == timeout_cpu)
 			continue;
 		cc = CC_CPU(cpu);
+		cc->cc_callout = NULL;	/* Only cpu0 handles timeout(9). */
+		callout_cpu_init(cc, cpu);
 		if (swi_add(NULL, "clock", softclock, cc, SWI_CLOCK,
 		    INTR_MPSAFE, &cc->cc_cookie))
 			panic("died while creating standard software ithreads");
-		cc->cc_callout = NULL;	/* Only cpu0 handles timeout(). */
-		cc->cc_callwheel = malloc(
-		    sizeof(struct callout_tailq) * callwheelsize, M_CALLOUT,
-		    M_WAITOK);
-		callout_cpu_init(cc);
 	}
 #endif
 }
-
 SYSINIT(start_softclock, SI_SUB_SOFTINTR, SI_ORDER_FIRST, start_softclock, NULL);
 
+#define	CC_HASH_SHIFT	8
+
+static inline u_int
+callout_hash(sbintime_t sbt)
+{
+
+	return (sbt >> (32 - CC_HASH_SHIFT));
+}
+
+static inline u_int
+callout_get_bucket(sbintime_t sbt)
+{
+
+	return (callout_hash(sbt) & callwheelmask);
+}
+
 void
-callout_tick(void)
+callout_process(sbintime_t now)
 {
+	struct callout *tmp, *tmpn;
 	struct callout_cpu *cc;
-	int need_softclock;
-	int bucket;
+	struct callout_list *sc;
+	sbintime_t first, last, max, tmp_max;
+	uint32_t lookahead;
+	u_int firstb, lastb, nowb;
+#ifdef CALLOUT_PROFILING
+	int depth_dir = 0, mpcalls_dir = 0, lockcalls_dir = 0;
+#endif
 
+	cc = CC_SELF();
+	mtx_lock_spin_flags(&cc->cc_lock, MTX_QUIET);
+
+	/* Compute the buckets of the last scan and present times. */
+	firstb = callout_hash(cc->cc_lastscan);
+	cc->cc_lastscan = now;
+	nowb = callout_hash(now);
+
+	/* Compute the last bucket and minimum time of the bucket after it. */
+	if (nowb == firstb)
+		lookahead = (SBT_1S / 16);
+	else if (nowb - firstb == 1)
+		lookahead = (SBT_1S / 8);
+	else
+		lookahead = (SBT_1S / 2);
+	first = last = now;
+	first += (lookahead / 2);
+	last += lookahead;
+	last &= (0xffffffffffffffffLLU << (32 - CC_HASH_SHIFT));
+	lastb = callout_hash(last) - 1;
+	max = last;
+
 	/*
-	 * Process callouts at a very low cpu priority, so we don't keep the
-	 * relatively high clock interrupt priority any longer than necessary.
+	 * Check if we wrapped around the entire wheel from the last scan.
+	 * In case, we need to scan entirely the wheel for pending callouts.
 	 */
-	need_softclock = 0;
-	cc = CC_SELF();
-	mtx_lock_spin_flags(&cc->cc_lock, MTX_QUIET);
-	cc->cc_firsttick = cc->cc_ticks = ticks;
-	for (; (cc->cc_softticks - cc->cc_ticks) <= 0; cc->cc_softticks++) {
-		bucket = cc->cc_softticks & callwheelmask;
-		if (!TAILQ_EMPTY(&cc->cc_callwheel[bucket])) {
-			need_softclock = 1;
-			break;
+	if (lastb - firstb >= callwheelsize) {
+		lastb = firstb + callwheelsize - 1;
+		if (nowb - firstb >= callwheelsize)
+			nowb = lastb;
+	}
+
+	/* Iterate callwheel from firstb to nowb and then up to lastb. */
+	do {
+		sc = &cc->cc_callwheel[firstb & callwheelmask];
+		tmp = LIST_FIRST(sc);
+		while (tmp != NULL) {
+			/* Run the callout if present time within allowed. */
+			if (tmp->c_time <= now) {
+				/*
+				 * Consumer told us the callout may be run
+				 * directly from hardware interrupt context.
+				 */
+				if (tmp->c_iflags & CALLOUT_DIRECT) {
+#ifdef CALLOUT_PROFILING
+					++depth_dir;
+#endif
+					cc_exec_next(cc) =
+					    LIST_NEXT(tmp, c_links.le);
+					cc->cc_bucket = firstb & callwheelmask;
+					LIST_REMOVE(tmp, c_links.le);
+					softclock_call_cc(tmp, cc,
+#ifdef CALLOUT_PROFILING
+					    &mpcalls_dir, &lockcalls_dir, NULL,
+#endif
+					    1);
+					tmp = cc_exec_next(cc);
+					cc_exec_next(cc) = NULL;
+				} else {
+					tmpn = LIST_NEXT(tmp, c_links.le);
+					LIST_REMOVE(tmp, c_links.le);
+					TAILQ_INSERT_TAIL(&cc->cc_expireq,
+					    tmp, c_links.tqe);
+					tmp->c_iflags |= CALLOUT_PROCESSED;
+					tmp = tmpn;
+				}
+				continue;
+			}
+			/* Skip events from distant future. */
+			if (tmp->c_time >= max)
+				goto next;
+			/*
+			 * Event minimal time is bigger than present maximal
+			 * time, so it cannot be aggregated.
+			 */
+			if (tmp->c_time > last) {
+				lastb = nowb;
+				goto next;
+			}
+			/* Update first and last time, respecting this event. */
+			if (tmp->c_time < first)
+				first = tmp->c_time;
+			tmp_max = tmp->c_time + tmp->c_precision;
+			if (tmp_max < last)
+				last = tmp_max;
+next:
+			tmp = LIST_NEXT(tmp, c_links.le);
 		}
-	}
+		/* Proceed with the next bucket. */
+		firstb++;
+		/*
+		 * Stop if we looked after present time and found
+		 * some event we can't execute at now.
+		 * Stop if we looked far enough into the future.
+		 */
+	} while (((int)(firstb - lastb)) <= 0);
+	cc->cc_firstevent = last;
+#ifndef NO_EVENTTIMERS
+	cpu_new_callout(curcpu, last, first);
+#endif
+#ifdef CALLOUT_PROFILING
+	avg_depth_dir += (depth_dir * 1000 - avg_depth_dir) >> 8;
+	avg_mpcalls_dir += (mpcalls_dir * 1000 - avg_mpcalls_dir) >> 8;
+	avg_lockcalls_dir += (lockcalls_dir * 1000 - avg_lockcalls_dir) >> 8;
+#endif
 	mtx_unlock_spin_flags(&cc->cc_lock, MTX_QUIET);
 	/*
 	 * swi_sched acquires the thread lock, so we don't want to call it
 	 * with cc_lock held; incorrect locking order.
 	 */
-	if (need_softclock)
+	if (!TAILQ_EMPTY(&cc->cc_expireq))
 		swi_sched(cc->cc_cookie, 0);
 }
 
-int
-callout_tickstofirst(int limit)
-{
-	struct callout_cpu *cc;
-	struct callout *c;
-	struct callout_tailq *sc;
-	int curticks;
-	int skip = 1;
-
-	cc = CC_SELF();
-	mtx_lock_spin_flags(&cc->cc_lock, MTX_QUIET);
-	curticks = cc->cc_ticks;
-	while( skip < ncallout && skip < limit ) {
-		sc = &cc->cc_callwheel[ (curticks+skip) & callwheelmask ];
-		/* search scanning ticks */
-		TAILQ_FOREACH( c, sc, c_links.tqe ){
-			if (c->c_time - curticks <= ncallout)
-				goto out;
-		}
-		skip++;
-	}
-out:
-	cc->cc_firsttick = curticks + skip;
-	mtx_unlock_spin_flags(&cc->cc_lock, MTX_QUIET);
-	return (skip);
-}
-
 static struct callout_cpu *
 callout_lock(struct callout *c)
 {
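
Stepping back from the callout_process() rewrite above: with CC_HASH_SHIFT = 8, callout_hash() increments once per 1/256 of a second (sbintime_t keeps 32 fractional bits), so each wheel bucket covers roughly 3.9 ms and one second spans 256 consecutive buckets. A standalone check, assuming the 2^15-entry wheel computed earlier:

    #include <stdio.h>
    #include <stdint.h>

    #define CC_HASH_SHIFT   8

    static unsigned
    callout_hash(int64_t sbt)
    {
        return ((uint64_t)sbt >> (32 - CC_HASH_SHIFT));
    }

    int
    main(void)
    {
        unsigned callwheelmask = (1U << 15) - 1;    /* assumed size */
        int64_t one_sec = (int64_t)1 << 32;         /* 1.0 s in 32.32 */

        printf("bucket(1s) = %u, bucket(1s + 1/256s) = %u\n",
            callout_hash(one_sec) & callwheelmask,
            callout_hash(one_sec + (one_sec >> 8)) & callwheelmask);
        return (0);
    }
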
@@ -415,26 +546,44 @@
 }
 
 static void
-callout_cc_add(struct callout *c, struct callout_cpu *cc, int to_ticks,
-    void (*func)(void *), void *arg, int cpu)
+callout_cc_add(struct callout *c, struct callout_cpu *cc,
+    sbintime_t sbt, sbintime_t precision, void (*func)(void *),
+    void *arg, int cpu, int flags)
 {
+	int bucket;
 
 	CC_LOCK_ASSERT(cc);
-
-	if (to_ticks <= 0)
-		to_ticks = 1;
+	if (sbt < cc->cc_lastscan)
+		sbt = cc->cc_lastscan;
 	c->c_arg = arg;
-	c->c_flags |= (CALLOUT_ACTIVE | CALLOUT_PENDING);
+	c->c_iflags |= CALLOUT_PENDING;
+	c->c_iflags &= ~CALLOUT_PROCESSED;
+	c->c_flags |= CALLOUT_ACTIVE;
+	if (flags & C_DIRECT_EXEC)
+		c->c_iflags |= CALLOUT_DIRECT;
 	c->c_func = func;
-	c->c_time = ticks + to_ticks;
-	TAILQ_INSERT_TAIL(&cc->cc_callwheel[c->c_time & callwheelmask], 
-	    c, c_links.tqe);
-	if ((c->c_time - cc->cc_firsttick) < 0 &&
-	    callout_new_inserted != NULL) {
-		cc->cc_firsttick = c->c_time;
-		(*callout_new_inserted)(cpu,
-		    to_ticks + (ticks - cc->cc_ticks));
+	c->c_time = sbt;
+	c->c_precision = precision;
+	bucket = callout_get_bucket(c->c_time);
+	CTR3(KTR_CALLOUT, "precision set for %p: %d.%08x",
+	    c, (int)(c->c_precision >> 32),
+	    (u_int)(c->c_precision & 0xffffffff));
+	LIST_INSERT_HEAD(&cc->cc_callwheel[bucket], c, c_links.le);
+	if (cc->cc_bucket == bucket)
+		cc_exec_next(cc) = c;
+#ifndef NO_EVENTTIMERS
+	/*
+	 * Inform the eventtimers(4) subsystem there's a new callout
+	 * that has been inserted, but only if really required.
+	 */
+	if (SBT_MAX - c->c_time < c->c_precision)
+		c->c_precision = SBT_MAX - c->c_time;
+	sbt = c->c_time + c->c_precision;
+	if (sbt < cc->cc_firstevent) {
+		cc->cc_firstevent = sbt;
+		cpu_new_callout(cpu, sbt, c->c_time);
 	}
+#endif
 }
 
 static void
@@ -441,7 +590,7 @@
 callout_cc_del(struct callout *c, struct callout_cpu *cc)
 {
 
-	if ((c->c_flags & CALLOUT_LOCAL_ALLOC) == 0)
+	if ((c->c_iflags & CALLOUT_LOCAL_ALLOC) == 0)
 		return;
 	c->c_func = NULL;
 	SLIST_INSERT_HEAD(&cc->cc_callfree, c, c_links.sle);
@@ -448,100 +597,122 @@
 }
 
 static void
-softclock_call_cc(struct callout *c, struct callout_cpu *cc, int *mpcalls,
-    int *lockcalls, int *gcalls)
+softclock_call_cc(struct callout *c, struct callout_cpu *cc,
+#ifdef CALLOUT_PROFILING
+    int *mpcalls, int *lockcalls, int *gcalls,
+#endif
+    int direct)
 {
+	struct rm_priotracker tracker;
 	void (*c_func)(void *);
 	void *c_arg;
 	struct lock_class *class;
 	struct lock_object *c_lock;
-	int c_flags, sharedlock;
+	uintptr_t lock_status;
+	int c_iflags;
 #ifdef SMP
 	struct callout_cpu *new_cc;
 	void (*new_func)(void *);
 	void *new_arg;
-	int new_cpu, new_ticks;
+	int flags, new_cpu;
+	sbintime_t new_prec, new_time;
 #endif
-#ifdef DIAGNOSTIC
-	struct bintime bt1, bt2;
+#if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING) 
+	sbintime_t sbt1, sbt2;
 	struct timespec ts2;
-	static uint64_t maxdt = 36893488147419102LL;	/* 2 msec */
+	static sbintime_t maxdt = 2 * SBT_1MS;	/* 2 msec */
 	static timeout_t *lastfunc;
 #endif
 
-	KASSERT((c->c_flags & (CALLOUT_PENDING | CALLOUT_ACTIVE)) ==
-	    (CALLOUT_PENDING | CALLOUT_ACTIVE),
-	    ("softclock_call_cc: pend|act %p %x", c, c->c_flags));
+	KASSERT((c->c_iflags & CALLOUT_PENDING) == CALLOUT_PENDING,
+	    ("softclock_call_cc: pend %p %x", c, c->c_iflags));
+	KASSERT((c->c_flags & CALLOUT_ACTIVE) == CALLOUT_ACTIVE,
+	    ("softclock_call_cc: act %p %x", c, c->c_flags));
 	class = (c->c_lock != NULL) ? LOCK_CLASS(c->c_lock) : NULL;
-	sharedlock = (c->c_flags & CALLOUT_SHAREDLOCK) ? 0 : 1;
+	lock_status = 0;
+	if (c->c_flags & CALLOUT_SHAREDLOCK) {
+		if (class == &lock_class_rm)
+			lock_status = (uintptr_t)&tracker;
+		else
+			lock_status = 1;
+	}
 	c_lock = c->c_lock;
 	c_func = c->c_func;
 	c_arg = c->c_arg;
-	c_flags = c->c_flags;
-	if (c->c_flags & CALLOUT_LOCAL_ALLOC)
-		c->c_flags = CALLOUT_LOCAL_ALLOC;
+	c_iflags = c->c_iflags;
+	if (c->c_iflags & CALLOUT_LOCAL_ALLOC)
+		c->c_iflags = CALLOUT_LOCAL_ALLOC;
 	else
-		c->c_flags &= ~CALLOUT_PENDING;
-	cc->cc_curr = c;
-	cc->cc_cancel = 0;
+		c->c_iflags &= ~CALLOUT_PENDING;
+	
+	cc_exec_curr(cc, direct) = c;
+	cc_exec_cancel(cc, direct) = false;
 	CC_UNLOCK(cc);
 	if (c_lock != NULL) {
-		class->lc_lock(c_lock, sharedlock);
+		class->lc_lock(c_lock, lock_status);
 		/*
 		 * The callout may have been cancelled
 		 * while we switched locks.
 		 */
-		if (cc->cc_cancel) {
+		if (cc_exec_cancel(cc, direct)) {
 			class->lc_unlock(c_lock);
 			goto skip;
 		}
 		/* The callout cannot be stopped now. */
-		cc->cc_cancel = 1;
-
+		cc_exec_cancel(cc, direct) = true;
 		if (c_lock == &Giant.lock_object) {
+#ifdef CALLOUT_PROFILING
 			(*gcalls)++;
-			CTR3(KTR_CALLOUT, "callout %p func %p arg %p",
+#endif
+			CTR3(KTR_CALLOUT, "callout giant %p func %p arg %p",
 			    c, c_func, c_arg);
 		} else {
+#ifdef CALLOUT_PROFILING
 			(*lockcalls)++;
+#endif
 			CTR3(KTR_CALLOUT, "callout lock %p func %p arg %p",
 			    c, c_func, c_arg);
 		}
 	} else {
+#ifdef CALLOUT_PROFILING
 		(*mpcalls)++;
-		CTR3(KTR_CALLOUT, "callout mpsafe %p func %p arg %p",
+#endif
+		CTR3(KTR_CALLOUT, "callout %p func %p arg %p",
 		    c, c_func, c_arg);
 	}
-#ifdef DIAGNOSTIC
-	binuptime(&bt1);
+	KTR_STATE3(KTR_SCHED, "callout", cc->cc_ktr_event_name, "running",
+	    "func:%p", c_func, "arg:%p", c_arg, "direct:%d", direct);
+#if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING)
+	sbt1 = sbinuptime();
 #endif
 	THREAD_NO_SLEEPING();
-	SDT_PROBE(callout_execute, kernel, , callout_start, c, 0, 0, 0, 0);
+	SDT_PROBE1(callout_execute, , , callout__start, c);
 	c_func(c_arg);
-	SDT_PROBE(callout_execute, kernel, , callout_end, c, 0, 0, 0, 0);
+	SDT_PROBE1(callout_execute, , , callout__end, c);
 	THREAD_SLEEPING_OK();
-#ifdef DIAGNOSTIC
-	binuptime(&bt2);
-	bintime_sub(&bt2, &bt1);
-	if (bt2.frac > maxdt) {
-		if (lastfunc != c_func || bt2.frac > maxdt * 2) {
-			bintime2timespec(&bt2, &ts2);
+#if defined(DIAGNOSTIC) || defined(CALLOUT_PROFILING)
+	sbt2 = sbinuptime();
+	sbt2 -= sbt1;
+	if (sbt2 > maxdt) {
+		if (lastfunc != c_func || sbt2 > maxdt * 2) {
+			ts2 = sbttots(sbt2);
 			printf(
 		"Expensive timeout(9) function: %p(%p) %jd.%09ld s\n",
 			    c_func, c_arg, (intmax_t)ts2.tv_sec, ts2.tv_nsec);
 		}
-		maxdt = bt2.frac;
+		maxdt = sbt2;
 		lastfunc = c_func;
 	}
 #endif
+	KTR_STATE0(KTR_SCHED, "callout", cc->cc_ktr_event_name, "idle");
 	CTR1(KTR_CALLOUT, "callout %p finished", c);
-	if ((c_flags & CALLOUT_RETURNUNLOCKED) == 0)
+	if ((c_iflags & CALLOUT_RETURNUNLOCKED) == 0)
 		class->lc_unlock(c_lock);
 skip:
 	CC_LOCK(cc);
-	KASSERT(cc->cc_curr == c, ("mishandled cc_curr"));
-	cc->cc_curr = NULL;
-	if (cc->cc_waiting) {
+	KASSERT(cc_exec_curr(cc, direct) == c, ("mishandled cc_curr"));
+	cc_exec_curr(cc, direct) = NULL;
+	if (cc_exec_waiting(cc, direct)) {
 		/*
 		 * There is someone waiting for the
 		 * callout to complete.
@@ -548,21 +719,21 @@
 		 * If the callout was scheduled for
 		 * migration just cancel it.
 		 */
-		if (cc_cme_migrating(cc)) {
-			cc_cme_cleanup(cc);
+		if (cc_cce_migrating(cc, direct)) {
+			cc_cce_cleanup(cc, direct);
 
 			/*
 			 * It should be assert here that the callout is not
 			 * destroyed but that is not easy.
 			 */
-			c->c_flags &= ~CALLOUT_DFRMIGRATION;
+			c->c_iflags &= ~CALLOUT_DFRMIGRATION;
 		}
-		cc->cc_waiting = 0;
+		cc_exec_waiting(cc, direct) = false;
 		CC_UNLOCK(cc);
-		wakeup(&cc->cc_waiting);
+		wakeup(&cc_exec_waiting(cc, direct));
 		CC_LOCK(cc);
-	} else if (cc_cme_migrating(cc)) {
-		KASSERT((c_flags & CALLOUT_LOCAL_ALLOC) == 0,
+	} else if (cc_cce_migrating(cc, direct)) {
+		KASSERT((c_iflags & CALLOUT_LOCAL_ALLOC) == 0,
 		    ("Migrating legacy callout %p", c));
 #ifdef SMP
 		/*
@@ -569,11 +740,12 @@
 		 * If the callout was scheduled for
 		 * migration just perform it now.
 		 */
-		new_cpu = cc->cc_migration_cpu;
-		new_ticks = cc->cc_migration_ticks;
-		new_func = cc->cc_migration_func;
-		new_arg = cc->cc_migration_arg;
-		cc_cme_cleanup(cc);
+		new_cpu = cc_migration_cpu(cc, direct);
+		new_time = cc_migration_time(cc, direct);
+		new_prec = cc_migration_prec(cc, direct);
+		new_func = cc_migration_func(cc, direct);
+		new_arg = cc_migration_arg(cc, direct);
+		cc_cce_cleanup(cc, direct);
 
 		/*
 		 * It should be assert here that the callout is not destroyed
@@ -581,7 +753,7 @@
 		 *
 		 * As first thing, handle deferred callout stops.
 		 */
-		if ((c->c_flags & CALLOUT_DFRMIGRATION) == 0) {
+		if (!callout_migrating(c)) {
 			CTR3(KTR_CALLOUT,
 			     "deferred cancelled %p func %p arg %p",
 			     c, new_func, new_arg);
@@ -588,11 +760,12 @@
 			callout_cc_del(c, cc);
 			return;
 		}
-		c->c_flags &= ~CALLOUT_DFRMIGRATION;
+		c->c_iflags &= ~CALLOUT_DFRMIGRATION;
 
 		new_cc = callout_cpu_switch(c, cc, new_cpu);
-		callout_cc_add(c, new_cc, new_ticks, new_func, new_arg,
-		    new_cpu);
+		flags = (direct) ? C_DIRECT_EXEC : 0;
+		callout_cc_add(c, new_cc, new_time, new_prec, new_func,
+		    new_arg, new_cpu, flags);
 		CC_UNLOCK(new_cc);
 		CC_LOCK(cc);
 #else
@@ -603,19 +776,19 @@
 	 * If the current callout is locally allocated (from
 	 * timeout(9)) then put it on the freelist.
 	 *
-	 * Note: we need to check the cached copy of c_flags because
+	 * Note: we need to check the cached copy of c_iflags because
 	 * if it was not local, then it's not safe to deref the
 	 * callout pointer.
 	 */
-	KASSERT((c_flags & CALLOUT_LOCAL_ALLOC) == 0 ||
-	    c->c_flags == CALLOUT_LOCAL_ALLOC,
+	KASSERT((c_iflags & CALLOUT_LOCAL_ALLOC) == 0 ||
+	    c->c_iflags == CALLOUT_LOCAL_ALLOC,
 	    ("corrupted callout"));
-	if (c_flags & CALLOUT_LOCAL_ALLOC)
+	if (c_iflags & CALLOUT_LOCAL_ALLOC)
 		callout_cc_del(c, cc);
 }
 
 /*
- * The callout mechanism is based on the work of Adam M. Costello and 
+ * The callout mechanism is based on the work of Adam M. Costello and
  * George Varghese, published in a technical report entitled "Redesigning
  * the BSD Callout and Timer Facilities" and modified slightly for inclusion
  * in FreeBSD by Justin T. Gibbs.  The original work on the data structures
@@ -635,63 +808,29 @@
 {
 	struct callout_cpu *cc;
 	struct callout *c;
-	struct callout_tailq *bucket;
-	int curticks;
-	int steps;	/* #steps since we last allowed interrupts */
-	int depth;
-	int mpcalls;
-	int lockcalls;
-	int gcalls;
+#ifdef CALLOUT_PROFILING
+	int depth = 0, gcalls = 0, lockcalls = 0, mpcalls = 0;
+#endif
 
-#ifndef MAX_SOFTCLOCK_STEPS
-#define MAX_SOFTCLOCK_STEPS 100 /* Maximum allowed value of steps. */
-#endif /* MAX_SOFTCLOCK_STEPS */
-
-	mpcalls = 0;
-	lockcalls = 0;
-	gcalls = 0;
-	depth = 0;
-	steps = 0;
 	cc = (struct callout_cpu *)arg;
 	CC_LOCK(cc);
-	while (cc->cc_softticks - 1 != cc->cc_ticks) {
-		/*
-		 * cc_softticks may be modified by hard clock, so cache
-		 * it while we work on a given bucket.
-		 */
-		curticks = cc->cc_softticks;
-		cc->cc_softticks++;
-		bucket = &cc->cc_callwheel[curticks & callwheelmask];
-		c = TAILQ_FIRST(bucket);
-		while (c != NULL) {
-			depth++;
-			if (c->c_time != curticks) {
-				c = TAILQ_NEXT(c, c_links.tqe);
-				++steps;
-				if (steps >= MAX_SOFTCLOCK_STEPS) {
-					cc->cc_next = c;
-					/* Give interrupts a chance. */
-					CC_UNLOCK(cc);
-					;	/* nothing */
-					CC_LOCK(cc);
-					c = cc->cc_next;
-					steps = 0;
-				}
-			} else {
-				cc->cc_next = TAILQ_NEXT(c, c_links.tqe);
-				TAILQ_REMOVE(bucket, c, c_links.tqe);
-				softclock_call_cc(c, cc, &mpcalls,
-				    &lockcalls, &gcalls);
-				steps = 0;
-				c = cc->cc_next;
-			}
-		}
+	while ((c = TAILQ_FIRST(&cc->cc_expireq)) != NULL) {
+		TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe);
+		softclock_call_cc(c, cc,
+#ifdef CALLOUT_PROFILING
+		    &mpcalls, &lockcalls, &gcalls,
+#endif
+		    0);
+#ifdef CALLOUT_PROFILING
+		++depth;
+#endif
 	}
+#ifdef CALLOUT_PROFILING
 	avg_depth += (depth * 1000 - avg_depth) >> 8;
 	avg_mpcalls += (mpcalls * 1000 - avg_mpcalls) >> 8;
 	avg_lockcalls += (lockcalls * 1000 - avg_lockcalls) >> 8;
 	avg_gcalls += (gcalls * 1000 - avg_gcalls) >> 8;
-	cc->cc_next = NULL;
+#endif
 	CC_UNLOCK(cc);
 }
 
@@ -706,7 +845,7 @@
  *	Initialize a handle so that using it with untimeout is benign.
  *
  *	See AT&T BCI Driver Reference Manual for specification.  This
- *	implementation differs from that one in that although an 
+ *	implementation differs from that one in that although an
  *	identification value is returned from timeout, the original
  *	arguments to timeout as well as the identifier are used to
  *	identify entries for untimeout.
@@ -764,6 +903,56 @@
 	handle->callout = NULL;
 }
 
+void
+callout_when(sbintime_t sbt, sbintime_t precision, int flags,
+    sbintime_t *res, sbintime_t *prec_res)
+{
+	sbintime_t to_sbt, to_pr;
+
+	if ((flags & (C_ABSOLUTE | C_PRECALC)) != 0) {
+		*res = sbt;
+		*prec_res = precision;
+		return;
+	}
+	if ((flags & C_HARDCLOCK) != 0 && sbt < tick_sbt)
+		sbt = tick_sbt;
+	if ((flags & C_HARDCLOCK) != 0 ||
+#ifdef NO_EVENTTIMERS
+	    sbt >= sbt_timethreshold) {
+		to_sbt = getsbinuptime();
+
+		/* Add safety belt for the case of hz > 1000. */
+		to_sbt += tc_tick_sbt - tick_sbt;
+#else
+	    sbt >= sbt_tickthreshold) {
+		/*
+		 * Obtain the time of the last hardclock() call on
+		 * this CPU directly from the kern_clocksource.c.
+		 * This value is per-CPU, but it is equal for all
+		 * active ones.
+		 */
+#ifdef __LP64__
+		to_sbt = DPCPU_GET(hardclocktime);
+#else
+		spinlock_enter();
+		to_sbt = DPCPU_GET(hardclocktime);
+		spinlock_exit();
+#endif
+#endif
+		if ((flags & C_HARDCLOCK) == 0)
+			to_sbt += tick_sbt;
+	} else
+		to_sbt = sbinuptime();
+	if (SBT_MAX - to_sbt < sbt)
+		to_sbt = SBT_MAX;
+	else
+		to_sbt += sbt;
+	*res = to_sbt;
+	to_pr = ((C_PRELGET(flags) < 0) ? sbt >> tc_precexp :
+	    sbt >> C_PRELGET(flags));
+	*prec_res = to_pr > precision ? to_pr : precision;
+}
+
 /*
  * New interface; clients allocate their own callout structures.
  *
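
callout_when() above merges two precision sources: the caller's explicit precision argument and a relative one encoded in the flags as C_PREL(n), meaning "a slop of delay >> n is acceptable" (when no C_PREL is given, C_PRELGET() returns -1 and the global tc_precexp default applies). The larger of the two wins. A sketch of the flag encoding, simplified from sys/callout.h:

    #include <stdio.h>
    #include <stdint.h>

    typedef int64_t sbintime_t;

    /* Simplified: C_PREL(n) stores n+1 in bits 1..7 of the flags,
     * C_PRELGET() recovers n (or -1 if the field was never set). */
    #define C_PREL(n)       (((n) + 1) << 1)
    #define C_PRELGET(f)    ((int)((((f) >> 1) & 0x7f) - 1))

    int
    main(void)
    {
        sbintime_t sbt = (sbintime_t)1 << 32;   /* 1 s relative delay */
        int flags = C_PREL(5);                  /* tolerate sbt / 32 */

        printf("delay %#jx, prec %#jx\n", (uintmax_t)sbt,
            (uintmax_t)(sbt >> C_PRELGET(flags)));
        return (0);
    }
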
@@ -781,28 +970,56 @@
  * callout_deactivate() - marks the callout as having been serviced
  */
 int
-callout_reset_on(struct callout *c, int to_ticks, void (*ftn)(void *),
-    void *arg, int cpu)
+callout_reset_sbt_on(struct callout *c, sbintime_t sbt, sbintime_t prec,
+    void (*ftn)(void *), void *arg, int cpu, int flags)
 {
+	sbintime_t to_sbt, precision;
 	struct callout_cpu *cc;
-	int cancelled = 0;
+	int cancelled, direct;
+	int ignore_cpu=0;
 
+	cancelled = 0;
+	if (cpu == -1) {
+		ignore_cpu = 1;
+	} else if ((cpu >= MAXCPU) ||
+		   ((CC_CPU(cpu))->cc_inited == 0)) {
+		/* Invalid CPU spec */
+		panic("Invalid CPU in callout %d", cpu);
+	}
+	callout_when(sbt, prec, flags, &to_sbt, &precision);
+
+	/* 
+	 * This flag used to be added by callout_cc_add, but the
+	 * first time you call this we could end up with the
+	 * wrong direct flag if we don't do it before we add.
+	 */
+	if (flags & C_DIRECT_EXEC) {
+		direct = 1;
+	} else {
+		direct = 0;
+	}
+	KASSERT(!direct || c->c_lock == NULL,
+	    ("%s: direct callout %p has lock", __func__, c));
+	cc = callout_lock(c);
 	/*
 	 * Don't allow migration of pre-allocated callouts lest they
-	 * become unbalanced.
+	 * become unbalanced or handle the case where the user does
+	 * not care. 
 	 */
-	if (c->c_flags & CALLOUT_LOCAL_ALLOC)
+	if ((c->c_iflags & CALLOUT_LOCAL_ALLOC) ||
+	    ignore_cpu) {
 		cpu = c->c_cpu;
-	cc = callout_lock(c);
-	if (cc->cc_curr == c) {
+	}
+
+	if (cc_exec_curr(cc, direct) == c) {
 		/*
 		 * We're being asked to reschedule a callout which is
 		 * currently in progress.  If there is a lock then we
 		 * can cancel the callout if it has not really started.
 		 */
-		if (c->c_lock != NULL && !cc->cc_cancel)
-			cancelled = cc->cc_cancel = 1;
-		if (cc->cc_waiting) {
+		if (c->c_lock != NULL && !cc_exec_cancel(cc, direct))
+			cancelled = cc_exec_cancel(cc, direct) = true;
+		if (cc_exec_waiting(cc, direct)) {
 			/*
 			 * Someone has called callout_drain to kill this
 			 * callout.  Don't reschedule.
@@ -813,16 +1030,37 @@
 			CC_UNLOCK(cc);
 			return (cancelled);
 		}
+#ifdef SMP
+		if (callout_migrating(c)) {
+			/* 
+			 * This only occurs when a second callout_reset_sbt_on
+			 * is made after a previous one moved it into
+			 * deferred migration (below). Note we do *not* change
+			 * the prev_cpu even though the previous target may
+			 * be different.
+			 */
+			cc_migration_cpu(cc, direct) = cpu;
+			cc_migration_time(cc, direct) = to_sbt;
+			cc_migration_prec(cc, direct) = precision;
+			cc_migration_func(cc, direct) = ftn;
+			cc_migration_arg(cc, direct) = arg;
+			cancelled = 1;
+			CC_UNLOCK(cc);
+			return (cancelled);
+		}
+#endif
 	}
-	if (c->c_flags & CALLOUT_PENDING) {
-		if (cc->cc_next == c) {
-			cc->cc_next = TAILQ_NEXT(c, c_links.tqe);
+	if (c->c_iflags & CALLOUT_PENDING) {
+		if ((c->c_iflags & CALLOUT_PROCESSED) == 0) {
+			if (cc_exec_next(cc) == c)
+				cc_exec_next(cc) = LIST_NEXT(c, c_links.le);
+			LIST_REMOVE(c, c_links.le);
+		} else {
+			TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe);
 		}
-		TAILQ_REMOVE(&cc->cc_callwheel[c->c_time & callwheelmask], c,
-		    c_links.tqe);
-
 		cancelled = 1;
-		c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING);
+		c->c_iflags &= ~ CALLOUT_PENDING;
+		c->c_flags &= ~ CALLOUT_ACTIVE;
 	}
 
 #ifdef SMP
@@ -832,15 +1070,34 @@
 	 * to a more appropriate moment.
 	 */
 	if (c->c_cpu != cpu) {
-		if (cc->cc_curr == c) {
-			cc->cc_migration_cpu = cpu;
-			cc->cc_migration_ticks = to_ticks;
-			cc->cc_migration_func = ftn;
-			cc->cc_migration_arg = arg;
-			c->c_flags |= CALLOUT_DFRMIGRATION;
-			CTR5(KTR_CALLOUT,
-		    "migration of %p func %p arg %p in %d to %u deferred",
-			    c, c->c_func, c->c_arg, to_ticks, cpu);
+		if (cc_exec_curr(cc, direct) == c) {
+			/* 
+			 * Pending will have been removed since we are
+			 * actually executing the callout on another
+			 * CPU. That callout should be waiting on the
+			 * lock the caller holds. If we set both
+			 * active/and/pending after we return and the
+			 * lock on the executing callout proceeds, it
+			 * will then see pending is true and return.
+			 * At the return from the actual callout execution
+			 * the migration will occur in softclock_call_cc
+			 * and this new callout will be placed on the 
+			 * new CPU via a call to callout_cpu_switch() which
+			 * will get the lock on the right CPU followed
+			 * by a call callout_cc_add() which will add it there.
+			 * (see above in softclock_call_cc()).
+			 */
+			cc_migration_cpu(cc, direct) = cpu;
+			cc_migration_time(cc, direct) = to_sbt;
+			cc_migration_prec(cc, direct) = precision;
+			cc_migration_func(cc, direct) = ftn;
+			cc_migration_arg(cc, direct) = arg;
+			c->c_iflags |= (CALLOUT_DFRMIGRATION | CALLOUT_PENDING);
+			c->c_flags |= CALLOUT_ACTIVE;
+			CTR6(KTR_CALLOUT,
+		    "migration of %p func %p arg %p in %d.%08x to %u deferred",
+			    c, c->c_func, c->c_arg, (int)(to_sbt >> 32),
+			    (u_int)(to_sbt & 0xffffffff), cpu);
 			CC_UNLOCK(cc);
 			return (cancelled);
 		}
@@ -848,9 +1105,10 @@
 	}
 #endif
 
-	callout_cc_add(c, cc, to_ticks, ftn, arg, cpu);
-	CTR5(KTR_CALLOUT, "%sscheduled %p func %p arg %p in %d",
-	    cancelled ? "re" : "", c, c->c_func, c->c_arg, to_ticks);
+	callout_cc_add(c, cc, to_sbt, precision, ftn, arg, cpu, flags);
+	CTR6(KTR_CALLOUT, "%sscheduled %p func %p arg %p in %d.%08x",
+	    cancelled ? "re" : "", c, c->c_func, c->c_arg, (int)(to_sbt >> 32),
+	    (u_int)(to_sbt & 0xffffffff));
 	CC_UNLOCK(cc);
 
 	return (cancelled);
@@ -872,19 +1130,20 @@
 }
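
For context, callout_reset_sbt_on() above now takes an sbintime_t deadline plus a
precision window instead of a tick count.  A minimal caller-side sketch, assuming
the callout_reset_sbt() convenience wrapper and SBT_1MS from the updated headers;
foo_softc and foo_tick() are hypothetical names:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/callout.h>

struct foo_softc {
	struct callout	fs_tick;	/* callout_init()ed at attach time */
};

static void
foo_tick(void *arg)
{
	struct foo_softc *sc = arg;

	/* ... periodic work ... */

	/* Re-arm ~50ms out, allowing 1ms of slack for event coalescing. */
	callout_reset_sbt(&sc->fs_tick, 50 * SBT_1MS, SBT_1MS,
	    foo_tick, sc, 0);
}
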
 
 int
-_callout_stop_safe(c, safe)
+_callout_stop_safe(c, flags)
 	struct	callout *c;
-	int	safe;
+	int	flags;
 {
 	struct callout_cpu *cc, *old_cc;
 	struct lock_class *class;
-	int use_lock, sq_locked;
+	int direct, sq_locked, use_lock;
+	int not_on_a_list;
 
 	/*
 	 * Some old subsystems don't hold Giant while running a callout_stop(),
 	 * so just discard this check for the moment.
 	 */
-	if (!safe && c->c_lock != NULL) {
+	if ((flags & CS_DRAIN) == 0 && c->c_lock != NULL) {
 		if (c->c_lock == &Giant.lock_object)
 			use_lock = mtx_owned(&Giant);
 		else {
@@ -894,12 +1153,38 @@
 		}
 	} else
 		use_lock = 0;
-
+	if (c->c_iflags & CALLOUT_DIRECT) {
+		direct = 1;
+	} else {
+		direct = 0;
+	}
 	sq_locked = 0;
 	old_cc = NULL;
 again:
 	cc = callout_lock(c);
 
+	if ((c->c_iflags & (CALLOUT_DFRMIGRATION | CALLOUT_PENDING)) ==
+	    (CALLOUT_DFRMIGRATION | CALLOUT_PENDING) &&
+	    ((c->c_flags & CALLOUT_ACTIVE) == CALLOUT_ACTIVE)) {
+		/*
+		 * Special case where this slipped in while we
+		 * were migrating *as* the callout is about to
+		 * execute. The caller probably holds the lock
+		 * the callout wants.
+		 *
+		 * Get rid of the migration first. Then set
+		 * the flag that tells this code *not* to
+		 * try to remove it from any lists (it's not
+		 * on one yet). When the callout wheel runs,
+		 * it will ignore this callout.
+		 */
+		c->c_iflags &= ~CALLOUT_PENDING;
+		c->c_flags &= ~CALLOUT_ACTIVE;
+		not_on_a_list = 1;
+	} else {
+		not_on_a_list = 0;
+	}
+
 	/*
 	 * If the callout was migrating while the callout cpu lock was
 	 * dropped,  just drop the sleepqueue lock and check the states
@@ -908,7 +1193,7 @@
 	if (sq_locked != 0 && cc != old_cc) {
 #ifdef SMP
 		CC_UNLOCK(cc);
-		sleepq_release(&old_cc->cc_waiting);
+		sleepq_release(&cc_exec_waiting(old_cc, direct));
 		sq_locked = 0;
 		old_cc = NULL;
 		goto again;
@@ -922,7 +1207,7 @@
 	 * don't attempt to remove it from the queue.  We can try to
 	 * stop it by other means however.
 	 */
-	if (!(c->c_flags & CALLOUT_PENDING)) {
+	if (!(c->c_iflags & CALLOUT_PENDING)) {
 		c->c_flags &= ~CALLOUT_ACTIVE;
 
 		/*
@@ -929,16 +1214,16 @@
 		 * If it wasn't on the queue and it isn't the current
 		 * callout, then we can't stop it, so just bail.
 		 */
-		if (cc->cc_curr != c) {
+		if (cc_exec_curr(cc, direct) != c) {
 			CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p",
 			    c, c->c_func, c->c_arg);
 			CC_UNLOCK(cc);
 			if (sq_locked)
-				sleepq_release(&cc->cc_waiting);
+				sleepq_release(&cc_exec_waiting(cc, direct));
 			return (0);
 		}
 
-		if (safe) {
+		if ((flags & CS_DRAIN) != 0) {
 			/*
 			 * The current callout is running (or just
 			 * about to run) and blocking is allowed, so
@@ -945,8 +1230,7 @@
 			 * just wait for the current invocation to
 			 * finish.
 			 */
-			while (cc->cc_curr == c) {
-
+			while (cc_exec_curr(cc, direct) == c) {
 				/*
 				 * Use direct calls to the sleepqueue interface
 				 * instead of cv/msleep in order to avoid
@@ -966,7 +1250,8 @@
 				 */
 				if (!sq_locked) {
 					CC_UNLOCK(cc);
-					sleepq_lock(&cc->cc_waiting);
+					sleepq_lock(
+					    &cc_exec_waiting(cc, direct));
 					sq_locked = 1;
 					old_cc = cc;
 					goto again;
@@ -978,13 +1263,16 @@
 				 * will be packed up, just let softclock()
 				 * take care of it.
 				 */
-				cc->cc_waiting = 1;
+				cc_exec_waiting(cc, direct) = true;
 				DROP_GIANT();
 				CC_UNLOCK(cc);
-				sleepq_add(&cc->cc_waiting,
+				sleepq_add(
+				    &cc_exec_waiting(cc, direct),
 				    &cc->cc_lock.lock_object, "codrain",
 				    SLEEPQ_SLEEP, 0);
-				sleepq_wait(&cc->cc_waiting, 0);
+				sleepq_wait(
+				    &cc_exec_waiting(cc, direct),
+					     0);
 				sq_locked = 0;
 				old_cc = NULL;
 
@@ -992,7 +1280,9 @@
 				PICKUP_GIANT();
 				CC_LOCK(cc);
 			}
-		} else if (use_lock && !cc->cc_cancel) {
+		} else if (use_lock &&
+			   !cc_exec_cancel(cc, direct)) {
+			
 			/*
 			 * The current callout is waiting for its
 			 * lock which we hold.  Cancel the callout
@@ -1000,20 +1290,52 @@
 			 * lock, the callout will be skipped in
 			 * softclock().
 			 */
-			cc->cc_cancel = 1;
+			cc_exec_cancel(cc, direct) = true;
 			CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p",
 			    c, c->c_func, c->c_arg);
-			KASSERT(!cc_cme_migrating(cc),
+			KASSERT(!cc_cce_migrating(cc, direct),
 			    ("callout wrongly scheduled for migration"));
+			if (callout_migrating(c)) {
+				c->c_iflags &= ~CALLOUT_DFRMIGRATION;
+#ifdef SMP
+				cc_migration_cpu(cc, direct) = CPUBLOCK;
+				cc_migration_time(cc, direct) = 0;
+				cc_migration_prec(cc, direct) = 0;
+				cc_migration_func(cc, direct) = NULL;
+				cc_migration_arg(cc, direct) = NULL;
+#endif
+			}
 			CC_UNLOCK(cc);
 			KASSERT(!sq_locked, ("sleepqueue chain locked"));
 			return (1);
-		} else if ((c->c_flags & CALLOUT_DFRMIGRATION) != 0) {
-			c->c_flags &= ~CALLOUT_DFRMIGRATION;
+		} else if (callout_migrating(c)) {
+			/*
+			 * The callout is currently being serviced
+			 * and the "next" callout is scheduled at
+			 * its completion with a migration. We remove
+			 * the migration flag so it *won't* get rescheduled,
+			 * but we can't stop the one that's running, so
+			 * we return 0.
+			 */
+			c->c_iflags &= ~CALLOUT_DFRMIGRATION;
+#ifdef SMP
+			/*
+			 * We can't call cc_cce_cleanup here since
+			 * that would clear .ce_curr while the
+			 * callout is still running.  Clearing only
+			 * the migration state here is what prevents
+			 * the callout from being rescheduled when
+			 * its execution completes.
+			 */
+			cc_migration_cpu(cc, direct) = CPUBLOCK;
+			cc_migration_time(cc, direct) = 0;
+			cc_migration_prec(cc, direct) = 0;
+			cc_migration_func(cc, direct) = NULL;
+			cc_migration_arg(cc, direct) = NULL;
+#endif
 			CTR3(KTR_CALLOUT, "postponing stop %p func %p arg %p",
 			    c, c->c_func, c->c_arg);
 			CC_UNLOCK(cc);
-			return (1);
+			return ((flags & CS_MIGRBLOCK) != 0);
 		}
 		CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p",
 		    c, c->c_func, c->c_arg);
@@ -1022,18 +1344,23 @@
 		return (0);
 	}
 	if (sq_locked)
-		sleepq_release(&cc->cc_waiting);
+		sleepq_release(&cc_exec_waiting(cc, direct));
 
-	c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING);
+	c->c_iflags &= ~CALLOUT_PENDING;
+	c->c_flags &= ~CALLOUT_ACTIVE;
 
 	CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p",
 	    c, c->c_func, c->c_arg);
-	if (cc->cc_next == c)
-		cc->cc_next = TAILQ_NEXT(c, c_links.tqe);
-	TAILQ_REMOVE(&cc->cc_callwheel[c->c_time & callwheelmask], c,
-	    c_links.tqe);
+	if (not_on_a_list == 0) {
+		if ((c->c_iflags & CALLOUT_PROCESSED) == 0) {
+			if (cc_exec_next(cc) == c)
+				cc_exec_next(cc) = LIST_NEXT(c, c_links.le);
+			LIST_REMOVE(c, c_links.le);
+		} else {
+			TAILQ_REMOVE(&cc->cc_expireq, c, c_links.tqe);
+		}
+	}
 	callout_cc_del(c, cc);
-
 	CC_UNLOCK(cc);
 	return (1);
 }
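
_callout_stop_safe() above is the shared backend: callout_stop() calls it without
CS_DRAIN and returns immediately, while callout_drain() passes CS_DRAIN and may
sleep on the "codrain" sleepqueue until a running instance finishes.  A teardown
sketch under those assumptions (foo_softc as in the earlier sketch); note the
drain path can block, so it must not be entered with the callout's associated
lock held:

static void
foo_detach(struct foo_softc *sc)
{

	/* May sleep; on return no foo_tick() invocation can be running. */
	callout_drain(&sc->fs_tick);
}
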
@@ -1046,10 +1373,10 @@
 	bzero(c, sizeof *c);
 	if (mpsafe) {
 		c->c_lock = NULL;
-		c->c_flags = CALLOUT_RETURNUNLOCKED;
+		c->c_iflags = CALLOUT_RETURNUNLOCKED;
 	} else {
 		c->c_lock = &Giant.lock_object;
-		c->c_flags = 0;
+		c->c_iflags = 0;
 	}
 	c->c_cpu = timeout_cpu;
 }
@@ -1069,7 +1396,7 @@
 	KASSERT(lock == NULL || !(LOCK_CLASS(lock)->lc_flags &
 	    (LC_SPINLOCK | LC_SLEEPABLE)), ("%s: invalid lock class",
 	    __func__));
-	c->c_flags = flags & (CALLOUT_RETURNUNLOCKED | CALLOUT_SHAREDLOCK);
+	c->c_iflags = flags & (CALLOUT_RETURNUNLOCKED | CALLOUT_SHAREDLOCK);
 	c->c_cpu = timeout_cpu;
 }
 
@@ -1086,7 +1413,7 @@
 * which set the timer can do the maintenance the timer was for as close
 * as possible to the originally intended time.  Testing this code for a 
 * week showed that resuming from a suspend resulted in 22 to 25 timers 
- * firing, which seemed independant on whether the suspend was 2 hours or
+ * firing, which seemed independent of whether the suspend was 2 hours or
 * 2 days.  Your mileage may vary.   - Ken Key <key at cs.utk.edu>
  */
 void
@@ -1138,3 +1465,152 @@
 	return;
 }
 #endif /* APM_FIXUP_CALLTODO */
+
+static int
+flssbt(sbintime_t sbt)
+{
+
+	sbt += (uint64_t)sbt >> 1;
+	if (sizeof(long) >= sizeof(sbintime_t))
+		return (flsl(sbt));
+	if (sbt >= SBT_1S)
+		return (flsl(((uint64_t)sbt) >> 32) + 32);
+	return (flsl(sbt));
+}
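
flssbt() maps an sbintime_t onto a power-of-two bucket index for the histograms
below; the initial "sbt += sbt >> 1" biases the value by 50% so it rounds to the
nearest bucket rather than always rounding down.  A standalone userland sketch
of the same bucketing (fls64() substitutes for the kernel's flsl(), and the
32-bit-long branch is dropped for brevity):

#include <stdio.h>
#include <stdint.h>

typedef int64_t sbintime_t;
#define	SBT_1S	((sbintime_t)1 << 32)	/* 32.32 fixed point: one second */

/* Portable stand-in for flsl(): position of the highest set bit. */
static int
fls64(uint64_t v)
{
	int i;

	for (i = 0; v != 0; v >>= 1)
		i++;
	return (i);
}

static int
flssbt(sbintime_t sbt)
{

	sbt += (uint64_t)sbt >> 1;	/* bias by 50% to round buckets */
	return (fls64((uint64_t)sbt));
}

int
main(void)
{

	/* 1us, 1ms and 1s land in clearly separated buckets. */
	printf("1us -> 2**%d\n", flssbt(SBT_1S / 1000000));
	printf("1ms -> 2**%d\n", flssbt(SBT_1S / 1000));
	printf("1s  -> 2**%d\n", flssbt(SBT_1S));
	return (0);
}
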
+
+/*
+ * Dump an immediate statistics snapshot of the scheduled callouts.
+ */
+static int
+sysctl_kern_callout_stat(SYSCTL_HANDLER_ARGS)
+{
+	struct callout *tmp;
+	struct callout_cpu *cc;
+	struct callout_list *sc;
+	sbintime_t maxpr, maxt, medpr, medt, now, spr, st, t;
+	int ct[64], cpr[64], ccpbk[32];
+	int error, val, i, count, tcum, pcum, maxc, c, medc;
+#ifdef SMP
+	int cpu;
+#endif
+
+	val = 0;
+	error = sysctl_handle_int(oidp, &val, 0, req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+	count = maxc = 0;
+	st = spr = maxt = maxpr = 0;
+	bzero(ccpbk, sizeof(ccpbk));
+	bzero(ct, sizeof(ct));
+	bzero(cpr, sizeof(cpr));
+	now = sbinuptime();
+#ifdef SMP
+	CPU_FOREACH(cpu) {
+		cc = CC_CPU(cpu);
+#else
+		cc = CC_CPU(timeout_cpu);
+#endif
+		CC_LOCK(cc);
+		for (i = 0; i < callwheelsize; i++) {
+			sc = &cc->cc_callwheel[i];
+			c = 0;
+			LIST_FOREACH(tmp, sc, c_links.le) {
+				c++;
+				t = tmp->c_time - now;
+				if (t < 0)
+					t = 0;
+				st += t / SBT_1US;
+				spr += tmp->c_precision / SBT_1US;
+				if (t > maxt)
+					maxt = t;
+				if (tmp->c_precision > maxpr)
+					maxpr = tmp->c_precision;
+				ct[flssbt(t)]++;
+				cpr[flssbt(tmp->c_precision)]++;
+			}
+			if (c > maxc)
+				maxc = c;
+			ccpbk[fls(c + c / 2)]++;
+			count += c;
+		}
+		CC_UNLOCK(cc);
+#ifdef SMP
+	}
+#endif
+
+	for (i = 0, tcum = 0; i < 64 && tcum < count / 2; i++)
+		tcum += ct[i];
+	medt = (i >= 2) ? (((sbintime_t)1) << (i - 2)) : 0;
+	for (i = 0, pcum = 0; i < 64 && pcum < count / 2; i++)
+		pcum += cpr[i];
+	medpr = (i >= 2) ? (((sbintime_t)1) << (i - 2)) : 0;
+	for (i = 0, c = 0; i < 32 && c < count / 2; i++)
+		c += ccpbk[i];
+	medc = (i >= 2) ? (1 << (i - 2)) : 0;
+
+	printf("Scheduled callouts statistic snapshot:\n");
+	printf("  Callouts: %6d  Buckets: %6d*%-3d  Bucket size: 0.%06ds\n",
+	    count, callwheelsize, mp_ncpus, 1000000 >> CC_HASH_SHIFT);
+	printf("  C/Bk: med %5d         avg %6d.%06jd  max %6d\n",
+	    medc,
+	    count / callwheelsize / mp_ncpus,
+	    (uint64_t)count * 1000000 / callwheelsize / mp_ncpus % 1000000,
+	    maxc);
+	printf("  Time: med %5jd.%06jds avg %6jd.%06jds max %6jd.%06jds\n",
+	    medt / SBT_1S, (medt & 0xffffffff) * 1000000 >> 32,
+	    (st / count) / 1000000, (st / count) % 1000000,
+	    maxt / SBT_1S, (maxt & 0xffffffff) * 1000000 >> 32);
+	printf("  Prec: med %5jd.%06jds avg %6jd.%06jds max %6jd.%06jds\n",
+	    medpr / SBT_1S, (medpr & 0xffffffff) * 1000000 >> 32,
+	    (spr / count) / 1000000, (spr / count) % 1000000,
+	    maxpr / SBT_1S, (maxpr & 0xffffffff) * 1000000 >> 32);
+	printf("  Distribution:       \tbuckets\t   time\t   tcum\t"
+	    "   prec\t   pcum\n");
+	for (i = 0, tcum = pcum = 0; i < 64; i++) {
+		if (ct[i] == 0 && cpr[i] == 0)
+			continue;
+		t = (i != 0) ? (((sbintime_t)1) << (i - 1)) : 0;
+		tcum += ct[i];
+		pcum += cpr[i];
+		printf("  %10jd.%06jds\t 2**%d\t%7d\t%7d\t%7d\t%7d\n",
+		    t / SBT_1S, (t & 0xffffffff) * 1000000 >> 32,
+		    i - 1 - (32 - CC_HASH_SHIFT),
+		    ct[i], tcum, cpr[i], pcum);
+	}
+	return (error);
+}
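
The handler returns early unless a new value is written, so the snapshot is
triggered from the shell with, e.g., sysctl kern.callout_stat=1 (the node is
registered just below); the tables are printf()ed to the console and message
buffer rather than returned through the sysctl itself.
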
+SYSCTL_PROC(_kern, OID_AUTO, callout_stat,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+    0, 0, sysctl_kern_callout_stat, "I",
+    "Dump immediate statistic snapshot of the scheduled callouts");
+
+#ifdef DDB
+static void
+_show_callout(struct callout *c)
+{
+
+	db_printf("callout %p\n", c);
+#define	C_DB_PRINTF(f, e)	db_printf("   %s = " f "\n", #e, c->e);
+	db_printf("   &c_links = %p\n", &(c->c_links));
+	C_DB_PRINTF("%" PRId64,	c_time);
+	C_DB_PRINTF("%" PRId64,	c_precision);
+	C_DB_PRINTF("%p",	c_arg);
+	C_DB_PRINTF("%p",	c_func);
+	C_DB_PRINTF("%p",	c_lock);
+	C_DB_PRINTF("%#x",	c_flags);
+	C_DB_PRINTF("%#x",	c_iflags);
+	C_DB_PRINTF("%d",	c_cpu);
+#undef	C_DB_PRINTF
+}
+
+DB_SHOW_COMMAND(callout, db_show_callout)
+{
+
+	if (!have_addr) {
+		db_printf("usage: show callout <struct callout *>\n");
+		return;
+	}
+
+	_show_callout((struct callout *)addr);
+}
+#endif /* DDB */
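
With a DDB-enabled kernel the command is invoked as the usage string above
shows, i.e. "show callout" followed by the address of a struct callout; it dumps
the c_time/c_precision pair, the argument, function and lock pointers, both flag
words, and the CPU binding.
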

Modified: trunk/sys/kern/kern_umtx.c
===================================================================
--- trunk/sys/kern/kern_umtx.c	2018-05-25 20:59:46 UTC (rev 9949)
+++ trunk/sys/kern/kern_umtx.c	2018-05-25 21:07:09 UTC (rev 9950)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 2004, David Xu <davidxu at freebsd.org>
  * Copyright (c) 2002, Jeffrey Roberson <jeff at freebsd.org>
@@ -26,7 +27,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/kern_umtx.c 330678 2018-03-09 01:21:22Z brooks $");
 
 #include "opt_compat.h"
 #include "opt_umtx_profiling.h"
@@ -39,6 +40,7 @@
 #include <sys/mutex.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
+#include <sys/sbuf.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/sysctl.h>
@@ -64,6 +66,11 @@
 #define _UMUTEX_TRY		1
 #define _UMUTEX_WAIT		2
 
+#ifdef UMTX_PROFILING
+#define	UPROF_PERC_BIGGER(w, f, sw, sf)					\
+	(((w) > (sw)) || ((w) == (sw) && (f) > (sf)))
+#endif
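
UPROF_PERC_BIGGER() compares two percentages kept as integer (whole, fraction)
numerator pairs so the kernel never touches floating point: whole parts are
compared first and ties fall through to the fractions.  A standalone sketch of
the same lexicographic comparison:

#include <stdio.h>

#define	UPROF_PERC_BIGGER(w, f, sw, sf)					\
	(((w) > (sw)) || ((w) == (sw) && (f) > (sf)))

int
main(void)
{

	/* 12.7% vs 12.3%: equal whole parts are decided by the fraction. */
	printf("%d\n", UPROF_PERC_BIGGER(12, 7, 12, 3));	/* 1 */
	printf("%d\n", UPROF_PERC_BIGGER(12, 3, 12, 7));	/* 0 */
	printf("%d\n", UPROF_PERC_BIGGER(13, 0, 12, 9));	/* 1 */
	return (0);
}
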
+
 /* Priority inheritance mutex info. */
 struct umtx_pi {
 	/* Owner thread */
@@ -157,13 +164,12 @@
 	TAILQ_HEAD(,umtx_pi)	uc_pi_list;
 
 #ifdef UMTX_PROFILING
-	int 			length;
-	int			max_length;
+	u_int 			length;
+	u_int			max_length;
 #endif
 };
 
 #define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)
-#define	UMTXQ_BUSY_ASSERT(uc)	KASSERT(&(uc)->uc_busy, ("umtx chain is not busy"))
 
 /*
 * Don't propagate time-sharing priority; there is a security reason,
@@ -187,6 +193,12 @@
 
 #define BUSY_SPINS		200
 
+struct abs_timeout {
+	int clockid;
+	struct timespec cur;
+	struct timespec end;
+};
+
 static uma_zone_t		umtx_pi_zone;
 static struct umtxq_chain	umtxq_chains[2][UMTX_CHAINS];
 static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
@@ -211,7 +223,7 @@
 static void umtxq_unbusy(struct umtx_key *key);
 static void umtxq_insert_queue(struct umtx_q *uq, int q);
 static void umtxq_remove_queue(struct umtx_q *uq, int q);
-static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo);
+static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *);
 static int umtxq_count(struct umtx_key *key);
 static struct umtx_pi *umtx_pi_alloc(int);
 static void umtx_pi_free(struct umtx_pi *pi);
@@ -246,6 +258,117 @@
 		    "max_length1", CTLFLAG_RD, &umtxq_chains[1][i].max_length, 0, NULL);
 	}
 }
+
+static int
+sysctl_debug_umtx_chains_peaks(SYSCTL_HANDLER_ARGS)
+{
+	char buf[512];
+	struct sbuf sb;
+	struct umtxq_chain *uc;
+	u_int fract, i, j, tot, whole;
+	u_int sf0, sf1, sf2, sf3, sf4;
+	u_int si0, si1, si2, si3, si4;
+	u_int sw0, sw1, sw2, sw3, sw4;
+
+	sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN);
+	for (i = 0; i < 2; i++) {
+		tot = 0;
+		for (j = 0; j < UMTX_CHAINS; ++j) {
+			uc = &umtxq_chains[i][j];
+			mtx_lock(&uc->uc_lock);
+			tot += uc->max_length;
+			mtx_unlock(&uc->uc_lock);
+		}
+		if (tot == 0)
+			sbuf_printf(&sb, "%u) Empty ", i);
+		else {
+			sf0 = sf1 = sf2 = sf3 = sf4 = 0;
+			si0 = si1 = si2 = si3 = si4 = 0;
+			sw0 = sw1 = sw2 = sw3 = sw4 = 0;
+			for (j = 0; j < UMTX_CHAINS; j++) {
+				uc = &umtxq_chains[i][j];
+				mtx_lock(&uc->uc_lock);
+				whole = uc->max_length * 100;
+				mtx_unlock(&uc->uc_lock);
+				fract = (whole % tot) * 100;
+				if (UPROF_PERC_BIGGER(whole, fract, sw0, sf0)) {
+					sf0 = fract;
+					si0 = j;
+					sw0 = whole;
+				} else if (UPROF_PERC_BIGGER(whole, fract, sw1,
+				    sf1)) {
+					sf1 = fract;
+					si1 = j;
+					sw1 = whole;
+				} else if (UPROF_PERC_BIGGER(whole, fract, sw2,
+				    sf2)) {
+					sf2 = fract;
+					si2 = j;
+					sw2 = whole;
+				} else if (UPROF_PERC_BIGGER(whole, fract, sw3,
+				    sf3)) {
+					sf3 = fract;
+					si3 = j;
+					sw3 = whole;
+				} else if (UPROF_PERC_BIGGER(whole, fract, sw4,
+				    sf4)) {
+					sf4 = fract;
+					si4 = j;
+					sw4 = whole;
+				}
+			}
+			sbuf_printf(&sb, "queue %u:\n", i);
+			sbuf_printf(&sb, "1st: %u.%u%% idx: %u\n", sw0 / tot,
+			    sf0 / tot, si0);
+			sbuf_printf(&sb, "2nd: %u.%u%% idx: %u\n", sw1 / tot,
+			    sf1 / tot, si1);
+			sbuf_printf(&sb, "3rd: %u.%u%% idx: %u\n", sw2 / tot,
+			    sf2 / tot, si2);
+			sbuf_printf(&sb, "4th: %u.%u%% idx: %u\n", sw3 / tot,
+			    sf3 / tot, si3);
+			sbuf_printf(&sb, "5th: %u.%u%% idx: %u\n", sw4 / tot,
+			    sf4 / tot, si4);
+		}
+	}
+	sbuf_trim(&sb);
+	sbuf_finish(&sb);
+	sysctl_handle_string(oidp, sbuf_data(&sb), sbuf_len(&sb), req);
+	sbuf_delete(&sb);
+	return (0);
+}
+
+static int
+sysctl_debug_umtx_chains_clear(SYSCTL_HANDLER_ARGS)
+{
+	struct umtxq_chain *uc;
+	u_int i, j;
+	int clear, error;
+
+	clear = 0;
+	error = sysctl_handle_int(oidp, &clear, 0, req);
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+
+	if (clear != 0) {
+		for (i = 0; i < 2; ++i) {
+			for (j = 0; j < UMTX_CHAINS; ++j) {
+				uc = &umtxq_chains[i][j];
+				mtx_lock(&uc->uc_lock);
+				uc->length = 0;
+				uc->max_length = 0;	
+				mtx_unlock(&uc->uc_lock);
+			}
+		}
+	}
+	return (0);
+}
+
+SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, clear,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 0,
+    sysctl_debug_umtx_chains_clear, "I", "Clear umtx chains statistics");
+SYSCTL_PROC(_debug_umtx_chains, OID_AUTO, peaks,
+    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 0,
+    sysctl_debug_umtx_chains_peaks, "A", "Highest peaks in chains max length");
 #endif
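
Assuming a kernel built with options UMTX_PROFILING, the two handlers are
reachable as debug.umtx_chains.peaks (a read-only report of the five most loaded
chains per queue) and debug.umtx_chains.clear (write any non-zero value, e.g.
sysctl debug.umtx_chains.clear=1, to reset the length and max_length counters).
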
 
 static void
@@ -274,7 +397,7 @@
 #ifdef UMTX_PROFILING
 	umtx_init_profiling();
 #endif
-	mtx_init(&umtx_lock, "umtx lock", NULL, MTX_SPIN);
+	mtx_init(&umtx_lock, "umtx lock", NULL, MTX_DEF);
 	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
 	    EVENTHANDLER_PRI_ANY);
 }
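
The MTX_SPIN to MTX_DEF change here licenses the blanket
mtx_lock_spin()/mtx_unlock_spin() to mtx_lock()/mtx_unlock() conversion in the
priority-inheritance paths later in this diff: the two lock classes use disjoint
lock/unlock APIs, and a default mutex keeps the PI bookkeeping out of spin-lock
context, so the holder can be preempted instead of running in a critical section
with interrupts disabled for the duration.
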
@@ -387,6 +510,15 @@
 		wakeup_one(uc);
 }
 
+static inline void
+umtxq_unbusy_unlocked(struct umtx_key *key)
+{
+
+	umtxq_lock(key);
+	umtxq_unbusy(key);
+	umtxq_unlock(key);
+}
+
 static struct umtxq_queue *
 umtxq_queue_lookup(struct umtx_key *key, int q)
 {
@@ -419,19 +551,19 @@
 		uh = uq->uq_spare_queue;
 		uh->key = uq->uq_key;
 		LIST_INSERT_HEAD(&uc->uc_queue[q], uh, link);
+#ifdef UMTX_PROFILING
+		uc->length++;
+		if (uc->length > uc->max_length) {
+			uc->max_length = uc->length;
+			if (uc->max_length > max_length)
+				max_length = uc->max_length;	
+		}
+#endif
 	}
 	uq->uq_spare_queue = NULL;
 
 	TAILQ_INSERT_TAIL(&uh->head, uq, uq_link);
 	uh->length++;
-#ifdef UMTX_PROFILING
-	uc->length++;
-	if (uc->length > uc->max_length) {
-		uc->max_length = uc->length;
-		if (uc->max_length > max_length)
-			max_length = uc->max_length;	
-	}
-#endif
 	uq->uq_flags |= UQF_UMTXQ;
 	uq->uq_cur_queue = uh;
 	return;
@@ -449,13 +581,13 @@
 		uh = uq->uq_cur_queue;
 		TAILQ_REMOVE(&uh->head, uq, uq_link);
 		uh->length--;
-#ifdef UMTX_PROFILING
-		uc->length--;
-#endif
 		uq->uq_flags &= ~UQF_UMTXQ;
 		if (TAILQ_EMPTY(&uh->head)) {
 			KASSERT(uh->length == 0,
 			    ("inconsistent umtxq_queue length"));
+#ifdef UMTX_PROFILING
+			uc->length--;
+#endif
 			LIST_REMOVE(uh, link);
 		} else {
 			uh = LIST_FIRST(&uc->uc_spare_queue);
@@ -505,6 +637,32 @@
 	return (0);
 }
 
+static int
+umtxq_check_susp(struct thread *td)
+{
+	struct proc *p;
+	int error;
+
+	/*
+	 * The check for TDF_NEEDSUSPCHK is racy, but it is enough to
+	 * eventually break the lockstep loop.
+	 */
+	if ((td->td_flags & TDF_NEEDSUSPCHK) == 0)
+		return (0);
+	error = 0;
+	p = td->td_proc;
+	PROC_LOCK(p);
+	if (P_SHOULDSTOP(p) ||
+	    ((p->p_flag & P_TRACED) && (td->td_dbgflags & TDB_SUSPEND))) {
+		if (p->p_flag & P_SINGLE_EXIT)
+			error = EINTR;
+		else
+			error = ERESTART;
+	}
+	PROC_UNLOCK(p);
+	return (error);
+}
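
umtxq_check_susp() exists to break the userspace CAS retry loops in this file
out of a lockstep with stop/suspend requests, which a spinning thread would
otherwise never service.  The pattern, condensed from the do_lock_normal()
changes later in this diff (m, owner, id, td, rv and error are the names used
in those hunks):

	for (;;) {
		rv = casueword32(&m->m_owner, UMUTEX_UNOWNED, &owner, id);
		if (rv == -1)
			return (EFAULT);	/* user address faulted */
		if (owner == UMUTEX_UNOWNED)
			return (0);		/* acquired uncontested */
		/* Lost the race; let a pending suspension interrupt us. */
		error = umtxq_check_susp(td);
		if (error != 0)
			return (error);
	}
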
+
 /*
 * Wake up threads waiting on a userland object.
  */
@@ -547,23 +705,88 @@
 	wakeup(uq);
 }
 
+static inline int 
+tstohz(const struct timespec *tsp)
+{
+	struct timeval tv;
+
+	TIMESPEC_TO_TIMEVAL(&tv, tsp);
+	return tvtohz(&tv);
+}
+
+static void
+abs_timeout_init(struct abs_timeout *timo, int clockid, int absolute,
+	const struct timespec *timeout)
+{
+
+	timo->clockid = clockid;
+	if (!absolute) {
+		kern_clock_gettime(curthread, clockid, &timo->end);
+		timo->cur = timo->end;
+		timespecadd(&timo->end, timeout);
+	} else {
+		timo->end = *timeout;
+		kern_clock_gettime(curthread, clockid, &timo->cur);
+	}
+}
+
+static void
+abs_timeout_init2(struct abs_timeout *timo, const struct _umtx_time *umtxtime)
+{
+
+	abs_timeout_init(timo, umtxtime->_clockid,
+		(umtxtime->_flags & UMTX_ABSTIME) != 0,
+		&umtxtime->_timeout);
+}
+
+static inline void
+abs_timeout_update(struct abs_timeout *timo)
+{
+
+	kern_clock_gettime(curthread, timo->clockid, &timo->cur);
+}
+
+static int
+abs_timeout_gethz(struct abs_timeout *timo)
+{
+	struct timespec tts;
+
+	if (timespeccmp(&timo->end, &timo->cur, <=))
+		return (-1); 
+	tts = timo->end;
+	timespecsub(&tts, &timo->cur);
+	return (tstohz(&tts));
+}
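
Together these helpers replace the old per-call ts/ts2/ts3 arithmetic with a
single absolute deadline that is re-checked after every wakeup.  A
self-contained userland analogue, assuming only POSIX clock_gettime() and
substituting milliseconds for the kernel's hz ticks:

#include <stdio.h>
#include <time.h>

struct abs_timeout {
	clockid_t clockid;
	struct timespec cur;
	struct timespec end;
};

static void
abs_timeout_init(struct abs_timeout *timo, clockid_t clockid, int absolute,
    const struct timespec *timeout)
{

	timo->clockid = clockid;
	clock_gettime(clockid, &timo->cur);
	if (absolute) {
		timo->end = *timeout;
		return;
	}
	/* end = now + relative timeout, with nanosecond carry */
	timo->end.tv_sec = timo->cur.tv_sec + timeout->tv_sec;
	timo->end.tv_nsec = timo->cur.tv_nsec + timeout->tv_nsec;
	if (timo->end.tv_nsec >= 1000000000L) {
		timo->end.tv_sec++;
		timo->end.tv_nsec -= 1000000000L;
	}
}

static void
abs_timeout_update(struct abs_timeout *timo)
{

	clock_gettime(timo->clockid, &timo->cur);
}

/* Remaining time in ms, or -1 once the deadline passed (cf. gethz above). */
static long
abs_timeout_getms(const struct abs_timeout *timo)
{
	long ms;

	ms = (timo->end.tv_sec - timo->cur.tv_sec) * 1000L +
	    (timo->end.tv_nsec - timo->cur.tv_nsec) / 1000000L;
	return (ms > 0 ? ms : -1);
}

int
main(void)
{
	struct abs_timeout timo;
	struct timespec rel = { 0, 200000000L };	/* 200ms */

	abs_timeout_init(&timo, CLOCK_MONOTONIC, 0, &rel);
	abs_timeout_update(&timo);
	printf("~%ld ms left\n", abs_timeout_getms(&timo));
	return (0);
}
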
+
 /*
  * Put thread into sleep state, before sleeping, check if
  * thread was removed from umtx queue.
  */
 static inline int
-umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo)
+umtxq_sleep(struct umtx_q *uq, const char *wmesg, struct abs_timeout *abstime)
 {
 	struct umtxq_chain *uc;
-	int error;
+	int error, timo;
 
 	uc = umtxq_getchain(&uq->uq_key);
 	UMTXQ_LOCKED_ASSERT(uc);
-	if (!(uq->uq_flags & UQF_UMTXQ))
-		return (0);
-	error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
-	if (error == EWOULDBLOCK)
-		error = ETIMEDOUT;
+	for (;;) {
+		if (!(uq->uq_flags & UQF_UMTXQ))
+			return (0);
+		if (abstime != NULL) {
+			timo = abs_timeout_gethz(abstime);
+			if (timo < 0)
+				return (ETIMEDOUT);
+		} else
+			timo = 0;
+		error = msleep(uq, &uc->uc_lock, PCATCH | PDROP, wmesg, timo);
+		if (error != EWOULDBLOCK) {
+			umtxq_lock(&uq->uq_key);
+			break;
+		}
+		if (abstime != NULL)
+			abs_timeout_update(abstime);
+		umtxq_lock(&uq->uq_key);
+	}
 	return (error);
 }
 
@@ -627,8 +850,10 @@
  * Lock a umtx object.
  */
 static int
-_do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id, int timo)
+do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id,
+	const struct timespec *timeout)
 {
+	struct abs_timeout timo;
 	struct umtx_q *uq;
 	u_long owner;
 	u_long old;
@@ -635,6 +860,8 @@
 	int error = 0;
 
 	uq = td->td_umtxq;
+	if (timeout != NULL)
+		abs_timeout_init(&timo, CLOCK_REALTIME, 0, timeout);
 
 	/*
 	 * Care must be exercised when dealing with the umtx structure. It
@@ -666,6 +893,10 @@
 			if (owner == -1)
 				return (EFAULT);
 
+			error = umtxq_check_susp(td);
+			if (error != 0)
+				break;
+
 			/* If this failed the lock has changed, restart. */
 			continue;
 		}
@@ -675,7 +906,7 @@
 		 * exit immediately.
 		 */
 		if (error != 0)
-			return (error);
+			break;
 
 		if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK,
 			AUTO_SHARE, &uq->uq_key)) != 0)
@@ -711,48 +942,21 @@
 		 */
 		umtxq_lock(&uq->uq_key);
 		if (old == owner)
-			error = umtxq_sleep(uq, "umtx", timo);
+			error = umtxq_sleep(uq, "umtx", timeout == NULL ? NULL :
+			    &timo);
 		umtxq_remove(uq);
 		umtxq_unlock(&uq->uq_key);
 		umtx_key_release(&uq->uq_key);
+
+		if (error == 0)
+			error = umtxq_check_susp(td);
 	}
 
-	return (0);
-}
-
-/*
- * Lock a umtx object.
- */
-static int
-do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id,
-	struct timespec *timeout)
-{
-	struct timespec ts, ts2, ts3;
-	struct timeval tv;
-	int error;
-
 	if (timeout == NULL) {
-		error = _do_lock_umtx(td, umtx, id, 0);
 		/* Mutex locking is restarted if it is interrupted. */
 		if (error == EINTR)
 			error = ERESTART;
 	} else {
-		getnanouptime(&ts);
-		timespecadd(&ts, timeout);
-		TIMESPEC_TO_TIMEVAL(&tv, timeout);
-		for (;;) {
-			error = _do_lock_umtx(td, umtx, id, tvtohz(&tv));
-			if (error != ETIMEDOUT)
-				break;
-			getnanouptime(&ts2);
-			if (timespeccmp(&ts2, &ts, >=)) {
-				error = ETIMEDOUT;
-				break;
-			}
-			ts3 = ts;
-			timespecsub(&ts3, &ts2);
-			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
-		}
 		/* Timed-locking is not restarted. */
 		if (error == ERESTART)
 			error = EINTR;
@@ -827,8 +1031,10 @@
  * Lock a umtx object.
  */
 static int
-_do_lock_umtx32(struct thread *td, uint32_t *m, uint32_t id, int timo)
+do_lock_umtx32(struct thread *td, uint32_t *m, uint32_t id,
+	const struct timespec *timeout)
 {
+	struct abs_timeout timo;
 	struct umtx_q *uq;
 	uint32_t owner;
 	uint32_t old;
@@ -836,6 +1042,9 @@
 
 	uq = td->td_umtxq;
 
+	if (timeout != NULL)
+		abs_timeout_init(&timo, CLOCK_REALTIME, 0, timeout);
+
 	/*
 	 * Care must be exercised when dealing with the umtx structure. It
 	 * can fault on any access.
@@ -865,6 +1074,10 @@
 			if (owner == -1)
 				return (EFAULT);
 
+			error = umtxq_check_susp(td);
+			if (error != 0)
+				break;
+
 			/* If this failed the lock has changed, restart. */
 			continue;
 		}
@@ -910,48 +1123,21 @@
 		 */
 		umtxq_lock(&uq->uq_key);
 		if (old == owner)
-			error = umtxq_sleep(uq, "umtx", timo);
+			error = umtxq_sleep(uq, "umtx", timeout == NULL ?
+			    NULL : &timo);
 		umtxq_remove(uq);
 		umtxq_unlock(&uq->uq_key);
 		umtx_key_release(&uq->uq_key);
+
+		if (error == 0)
+			error = umtxq_check_susp(td);
 	}
 
-	return (0);
-}
-
-/*
- * Lock a umtx object.
- */
-static int
-do_lock_umtx32(struct thread *td, void *m, uint32_t id,
-	struct timespec *timeout)
-{
-	struct timespec ts, ts2, ts3;
-	struct timeval tv;
-	int error;
-
 	if (timeout == NULL) {
-		error = _do_lock_umtx32(td, m, id, 0);
 		/* Mutex locking is restarted if it is interrupted. */
 		if (error == EINTR)
 			error = ERESTART;
 	} else {
-		getnanouptime(&ts);
-		timespecadd(&ts, timeout);
-		TIMESPEC_TO_TIMEVAL(&tv, timeout);
-		for (;;) {
-			error = _do_lock_umtx32(td, m, id, tvtohz(&tv));
-			if (error != ETIMEDOUT)
-				break;
-			getnanouptime(&ts2);
-			if (timespeccmp(&ts2, &ts, >=)) {
-				error = ETIMEDOUT;
-				break;
-			}
-			ts3 = ts;
-			timespecsub(&ts3, &ts2);
-			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
-		}
 		/* Timed-locking is not restarted. */
 		if (error == ERESTART)
 			error = EINTR;
@@ -1026,12 +1212,12 @@
  */
 static int
 do_wait(struct thread *td, void *addr, u_long id,
-	struct timespec *timeout, int compat32, int is_private)
+	struct _umtx_time *timeout, int compat32, int is_private)
 {
+	struct abs_timeout timo;
 	struct umtx_q *uq;
-	struct timespec ts, ts2, ts3;
-	struct timeval tv;
 	u_long tmp;
+	uint32_t tmp32;
 	int error = 0;
 
 	uq = td->td_umtxq;
@@ -1039,50 +1225,36 @@
 		is_private ? THREAD_SHARE : AUTO_SHARE, &uq->uq_key)) != 0)
 		return (error);
 
+	if (timeout != NULL)
+		abs_timeout_init2(&timo, timeout);
+
 	umtxq_lock(&uq->uq_key);
 	umtxq_insert(uq);
 	umtxq_unlock(&uq->uq_key);
-	if (compat32 == 0)
-		tmp = fuword(addr);
-        else
-		tmp = (unsigned int)fuword32(addr);
-	if (tmp != id) {
-		umtxq_lock(&uq->uq_key);
-		umtxq_remove(uq);
-		umtxq_unlock(&uq->uq_key);
-	} else if (timeout == NULL) {
-		umtxq_lock(&uq->uq_key);
-		error = umtxq_sleep(uq, "uwait", 0);
-		umtxq_remove(uq);
-		umtxq_unlock(&uq->uq_key);
+	if (compat32 == 0) {
+		error = fueword(addr, &tmp);
+		if (error != 0)
+			error = EFAULT;
 	} else {
-		getnanouptime(&ts);
-		timespecadd(&ts, timeout);
-		TIMESPEC_TO_TIMEVAL(&tv, timeout);
-		umtxq_lock(&uq->uq_key);
-		for (;;) {
-			error = umtxq_sleep(uq, "uwait", tvtohz(&tv));
-			if (!(uq->uq_flags & UQF_UMTXQ)) {
-				error = 0;
-				break;
-			}
-			if (error != ETIMEDOUT)
-				break;
-			umtxq_unlock(&uq->uq_key);
-			getnanouptime(&ts2);
-			if (timespeccmp(&ts2, &ts, >=)) {
-				error = ETIMEDOUT;
-				umtxq_lock(&uq->uq_key);
-				break;
-			}
-			ts3 = ts;
-			timespecsub(&ts3, &ts2);
-			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
-			umtxq_lock(&uq->uq_key);
-		}
+		error = fueword32(addr, &tmp32);
+		if (error == 0)
+			tmp = tmp32;
+		else
+			error = EFAULT;
+	}
+	umtxq_lock(&uq->uq_key);
+	if (error == 0) {
+		if (tmp == id)
+			error = umtxq_sleep(uq, "uwait", timeout == NULL ?
+			    NULL : &timo);
+		if ((uq->uq_flags & UQF_UMTXQ) == 0)
+			error = 0;
+		else
+			umtxq_remove(uq);
+	} else if ((uq->uq_flags & UQF_UMTXQ) != 0) {
 		umtxq_remove(uq);
-		umtxq_unlock(&uq->uq_key);
 	}
+	umtxq_unlock(&uq->uq_key);
 	umtx_key_release(&uq->uq_key);
 	if (error == ERESTART)
 		error = EINTR;
@@ -1102,7 +1274,7 @@
 		is_private ? THREAD_SHARE : AUTO_SHARE, &key)) != 0)
 		return (ret);
 	umtxq_lock(&key);
-	ret = umtxq_signal(&key, n_wake);
+	umtxq_signal(&key, n_wake);
 	umtxq_unlock(&key);
 	umtx_key_release(&key);
 	return (0);
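
The fuword32()/casuword32() to fueword32()/casueword32() conversion that recurs
throughout this file separates the fetched value from the error channel; with
the old interfaces a stored 0xffffffff was indistinguishable from a fault.
Condensed before/after, using the m_owner accesses this file converts
repeatedly:

	/* Old: value and error share the -1 encoding. */
	owner = fuword32(&m->m_owner);
	if (owner == -1)	/* also matches a real 0xffffffff */
		return (EFAULT);

	/* New: the return value is pure status; the word arrives via a
	 * pointer and can never alias the error. */
	rv = fueword32(&m->m_owner, &owner);
	if (rv == -1)
		return (EFAULT);
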
@@ -1112,15 +1284,19 @@
  * Lock PTHREAD_PRIO_NONE protocol POSIX mutex.
  */
 static int
-_do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags, int timo,
-	int mode)
+do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags,
+	struct _umtx_time *timeout, int mode)
 {
+	struct abs_timeout timo;
 	struct umtx_q *uq;
 	uint32_t owner, old, id;
-	int error = 0;
+	int error, rv;
 
 	id = td->td_tid;
 	uq = td->td_umtxq;
+	error = 0;
+	if (timeout != NULL)
+		abs_timeout_init2(&timo, timeout);
 
 	/*
 	 * Care must be exercised when dealing with umtx structure. It
@@ -1127,7 +1303,9 @@
 	 * can fault on any access.
 	 */
 	for (;;) {
-		owner = fuword32(__DEVOLATILE(void *, &m->m_owner));
+		rv = fueword32(&m->m_owner, &owner);
+		if (rv == -1)
+			return (EFAULT);
 		if (mode == _UMUTEX_WAIT) {
 			if (owner == UMUTEX_UNOWNED || owner == UMUTEX_CONTESTED)
 				return (0);
@@ -1135,27 +1313,31 @@
 			/*
 			 * Try the uncontested case.  This should be done in userland.
 			 */
-			owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);
+			rv = casueword32(&m->m_owner, UMUTEX_UNOWNED,
+			    &owner, id);
+			/* The address was invalid. */
+			if (rv == -1)
+				return (EFAULT);
 
 			/* The acquire succeeded. */
 			if (owner == UMUTEX_UNOWNED)
 				return (0);
 
-			/* The address was invalid. */
-			if (owner == -1)
-				return (EFAULT);
-
 		/* If no one owns it but it is contested, try to acquire it. */
 			if (owner == UMUTEX_CONTESTED) {
-				owner = casuword32(&m->m_owner,
-				    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
+				rv = casueword32(&m->m_owner,
+				    UMUTEX_CONTESTED, &owner,
+				    id | UMUTEX_CONTESTED);
+				/* The address was invalid. */
+				if (rv == -1)
+					return (EFAULT);
 
 				if (owner == UMUTEX_CONTESTED)
 					return (0);
 
-				/* The address was invalid. */
-				if (owner == -1)
-					return (EFAULT);
+				rv = umtxq_check_susp(td);
+				if (rv != 0)
+					return (rv);
 
 				/* If this failed the lock has changed, restart. */
 				continue;
@@ -1191,10 +1373,11 @@
 		 * either someone else has acquired the lock or it has been
 		 * released.
 		 */
-		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);
+		rv = casueword32(&m->m_owner, owner, &old,
+		    owner | UMUTEX_CONTESTED);
 
 		/* The address was invalid. */
-		if (old == -1) {
+		if (rv == -1) {
 			umtxq_lock(&uq->uq_key);
 			umtxq_remove(uq);
 			umtxq_unbusy(&uq->uq_key);
@@ -1211,10 +1394,14 @@
 		umtxq_lock(&uq->uq_key);
 		umtxq_unbusy(&uq->uq_key);
 		if (old == owner)
-			error = umtxq_sleep(uq, "umtxn", timo);
+			error = umtxq_sleep(uq, "umtxn", timeout == NULL ?
+			    NULL : &timo);
 		umtxq_remove(uq);
 		umtxq_unlock(&uq->uq_key);
 		umtx_key_release(&uq->uq_key);
+
+		if (error == 0)
+			error = umtxq_check_susp(td);
 	}
 
 	return (0);
@@ -1221,9 +1408,6 @@
 }
 
 /*
- * Lock PTHREAD_PRIO_NONE protocol POSIX mutex.
- */
-/*
  * Unlock PTHREAD_PRIO_NONE protocol POSIX mutex.
  */
 static int
@@ -1238,8 +1422,8 @@
 	/*
 	 * Make sure we own this mtx.
 	 */
-	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
-	if (owner == -1)
+	error = fueword32(&m->m_owner, &owner);
+	if (error == -1)
 		return (EFAULT);
 
 	if ((owner & ~UMUTEX_CONTESTED) != id)
@@ -1246,8 +1430,8 @@
 		return (EPERM);
 
 	if ((owner & UMUTEX_CONTESTED) == 0) {
-		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
-		if (old == -1)
+		error = casueword32(&m->m_owner, owner, &old, UMUTEX_UNOWNED);
+		if (error == -1)
 			return (EFAULT);
 		if (old == owner)
 			return (0);
@@ -1269,14 +1453,14 @@
 	 * at most one thread is waiting for it.
 	 * Otherwise, it must be marked as contested.
 	 */
-	old = casuword32(&m->m_owner, owner,
-		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
+	error = casueword32(&m->m_owner, owner, &old,
+	    count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
 	umtxq_lock(&key);
 	umtxq_signal(&key,1);
 	umtxq_unbusy(&key);
 	umtxq_unlock(&key);
 	umtx_key_release(&key);
-	if (old == -1)
+	if (error == -1)
 		return (EFAULT);
 	if (old != owner)
 		return (EINVAL);
@@ -1296,14 +1480,16 @@
 	int error;
 	int count;
 
-	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
-	if (owner == -1)
+	error = fueword32(&m->m_owner, &owner);
+	if (error == -1)
 		return (EFAULT);
 
 	if ((owner & ~UMUTEX_CONTESTED) != 0)
 		return (0);
 
-	flags = fuword32(&m->m_flags);
+	error = fueword32(&m->m_flags, &flags);
+	if (error == -1)
+		return (EFAULT);
 
 	/* We should only ever be in here for contested locks */
 	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
@@ -1315,16 +1501,20 @@
 	count = umtxq_count(&key);
 	umtxq_unlock(&key);
 
-	if (count <= 1)
-		owner = casuword32(&m->m_owner, UMUTEX_CONTESTED, UMUTEX_UNOWNED);
+	if (count <= 1) {
+		error = casueword32(&m->m_owner, UMUTEX_CONTESTED, &owner,
+		    UMUTEX_UNOWNED);
+		if (error == -1)
+			error = EFAULT;
+	}
 
 	umtxq_lock(&key);
-	if (count != 0 && (owner & ~UMUTEX_CONTESTED) == 0)
+	if (error == 0 && count != 0 && (owner & ~UMUTEX_CONTESTED) == 0)
 		umtxq_signal(&key, 1);
 	umtxq_unbusy(&key);
 	umtxq_unlock(&key);
 	umtx_key_release(&key);
-	return (0);
+	return (error);
 }
 
 /*
@@ -1367,31 +1557,47 @@
 	 * any memory.
 	 */
 	if (count > 1) {
-		owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
-		while ((owner & UMUTEX_CONTESTED) ==0) {
-			old = casuword32(&m->m_owner, owner,
-			    owner|UMUTEX_CONTESTED);
+		error = fueword32(&m->m_owner, &owner);
+		if (error == -1)
+			error = EFAULT;
+		while (error == 0 && (owner & UMUTEX_CONTESTED) == 0) {
+			error = casueword32(&m->m_owner, owner, &old,
+			    owner | UMUTEX_CONTESTED);
+			if (error == -1) {
+				error = EFAULT;
+				break;
+			}
 			if (old == owner)
 				break;
 			owner = old;
+			error = umtxq_check_susp(td);
+			if (error != 0)
+				break;
 		}
 	} else if (count == 1) {
-		owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
-		while ((owner & ~UMUTEX_CONTESTED) != 0 &&
+		error = fueword32(&m->m_owner, &owner);
+		if (error == -1)
+			error = EFAULT;
+		while (error == 0 && (owner & ~UMUTEX_CONTESTED) != 0 &&
 		       (owner & UMUTEX_CONTESTED) == 0) {
-			old = casuword32(&m->m_owner, owner,
-			    owner|UMUTEX_CONTESTED);
+			error = casueword32(&m->m_owner, owner, &old,
+			    owner | UMUTEX_CONTESTED);
+			if (error == -1) {
+				error = EFAULT;
+				break;
+			}
 			if (old == owner)
 				break;
 			owner = old;
+			error = umtxq_check_susp(td);
+			if (error != 0)
+				break;
 		}
 	}
 	umtxq_lock(&key);
-	if (owner == -1) {
-		error = EFAULT;
+	if (error == EFAULT) {
 		umtxq_signal(&key, INT_MAX);
-	}
-	else if (count != 0 && (owner & ~UMUTEX_CONTESTED) == 0)
+	} else if (count != 0 && (owner & ~UMUTEX_CONTESTED) == 0)
 		umtxq_signal(&key, 1);
 	umtxq_unbusy(&key);
 	umtxq_unlock(&key);
@@ -1462,7 +1668,48 @@
 	return (1);
 }
 
+static struct umtx_pi *
+umtx_pi_next(struct umtx_pi *pi)
+{
+	struct umtx_q *uq_owner;
+
+	if (pi->pi_owner == NULL)
+		return (NULL);
+	uq_owner = pi->pi_owner->td_umtxq;
+	if (uq_owner == NULL)
+		return (NULL);
+	return (uq_owner->uq_pi_blocked);
+}
+
 /*
+ * Floyd's Cycle-Finding Algorithm.
+ */
+static bool
+umtx_pi_check_loop(struct umtx_pi *pi)
+{
+	struct umtx_pi *pi1;	/* fast iterator */
+
+	mtx_assert(&umtx_lock, MA_OWNED);
+	if (pi == NULL)
+		return (false);
+	pi1 = pi;
+	for (;;) {
+		pi = umtx_pi_next(pi);
+		if (pi == NULL)
+			break;
+		pi1 = umtx_pi_next(pi1);
+		if (pi1 == NULL)
+			break;
+		pi1 = umtx_pi_next(pi1);
+		if (pi1 == NULL)
+			break;
+		if (pi == pi1)
+			return (true);
+	}
+	return (false);
+}
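
umtx_pi_check_loop() is a textbook tortoise-and-hare: the slow cursor follows
one owner-blocked-on link per iteration while the fast cursor follows two, and
they can only meet again if the ownership chain is circular, i.e. the PI
mutexes deadlock.  A standalone sketch of the same detection on a plain
singly-linked list:

#include <stdbool.h>
#include <stdio.h>

struct node {
	struct node *next;
};

static bool
has_loop(struct node *slow)
{
	struct node *fast = slow;

	while (fast != NULL && fast->next != NULL) {
		slow = slow->next;		/* one step */
		fast = fast->next->next;	/* two steps */
		if (slow == fast)
			return (true);
	}
	return (false);
}

int
main(void)
{
	struct node a, b, c;

	a.next = &b; b.next = &c; c.next = NULL;
	printf("%d\n", has_loop(&a));	/* 0: chain terminates */
	c.next = &a;			/* close the cycle */
	printf("%d\n", has_loop(&a));	/* 1: cycle detected */
	return (0);
}
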
+
+/*
  * Propagate priority when a thread is blocked on POSIX
  * PI mutex.
  */ 
@@ -1479,6 +1726,8 @@
 	pi = uq->uq_pi_blocked;
 	if (pi == NULL)
 		return;
+	if (umtx_pi_check_loop(pi))
+		return;
 
 	for (;;) {
 		td = pi->pi_owner;
@@ -1522,6 +1771,8 @@
 
 	mtx_assert(&umtx_lock, MA_OWNED);
 
+	if (umtx_pi_check_loop(pi))
+		return;
 	while (pi != NULL && pi->pi_owner != NULL) {
 		pri = PRI_MAX;
 		uq_owner = pi->pi_owner->td_umtxq;
@@ -1555,23 +1806,35 @@
 	uq_owner = owner->td_umtxq;
 	mtx_assert(&umtx_lock, MA_OWNED);
 	if (pi->pi_owner != NULL)
-		panic("pi_ower != NULL");
+		panic("pi_owner != NULL");
 	pi->pi_owner = owner;
 	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
 }
 
+
 /*
+ * Disown a PI mutex, and remove it from the owned list.
+ */
+static void
+umtx_pi_disown(struct umtx_pi *pi)
+{
+
+	mtx_assert(&umtx_lock, MA_OWNED);
+	TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested, pi, pi_link);
+	pi->pi_owner = NULL;
+}
+
+/*
  * Claim ownership of a PI mutex.
  */
 static int
 umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
 {
-	struct umtx_q *uq, *uq_owner;
+	struct umtx_q *uq;
 
-	uq_owner = owner->td_umtxq;
-	mtx_lock_spin(&umtx_lock);
+	mtx_lock(&umtx_lock);
 	if (pi->pi_owner == owner) {
-		mtx_unlock_spin(&umtx_lock);
+		mtx_unlock(&umtx_lock);
 		return (0);
 	}
 
@@ -1579,7 +1842,7 @@
 		/*
 		 * userland may have already messed up the mutex, sigh.
 		 */
-		mtx_unlock_spin(&umtx_lock);
+		mtx_unlock(&umtx_lock);
 		return (EPERM);
 	}
 	umtx_pi_setowner(pi, owner);
@@ -1593,7 +1856,7 @@
 			sched_lend_user_prio(owner, pri);
 		thread_unlock(owner);
 	}
-	mtx_unlock_spin(&umtx_lock);
+	mtx_unlock(&umtx_lock);
 	return (0);
 }
 
@@ -1608,7 +1871,7 @@
 	struct umtx_pi *pi;
 
 	uq = td->td_umtxq;
-	mtx_lock_spin(&umtx_lock);
+	mtx_lock(&umtx_lock);
 	/*
 	 * Pick up the lock that td is blocked on.
 	 */
@@ -1617,7 +1880,7 @@
 		umtx_pi_adjust_thread(pi, td);
 		umtx_repropagate_priority(pi);
 	}
-	mtx_unlock_spin(&umtx_lock);
+	mtx_unlock(&umtx_lock);
 }
 
 /*
@@ -1625,7 +1888,7 @@
  */
 static int
 umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi,
-	uint32_t owner, const char *wmesg, int timo)
+	uint32_t owner, const char *wmesg, struct abs_timeout *timo)
 {
 	struct umtxq_chain *uc;
 	struct thread *td, *td1;
@@ -1637,14 +1900,14 @@
 	KASSERT(td == curthread, ("inconsistent uq_thread"));
 	uc = umtxq_getchain(&uq->uq_key);
 	UMTXQ_LOCKED_ASSERT(uc);
-	UMTXQ_BUSY_ASSERT(uc);
+	KASSERT(uc->uc_busy != 0, ("umtx chain is not busy"));
 	umtxq_insert(uq);
-	mtx_lock_spin(&umtx_lock);
+	mtx_lock(&umtx_lock);
 	if (pi->pi_owner == NULL) {
-		mtx_unlock_spin(&umtx_lock);
+		mtx_unlock(&umtx_lock);
 		/* XXX Only look up thread in current process. */
 		td1 = tdfind(owner, curproc->p_pid);
-		mtx_lock_spin(&umtx_lock);
+		mtx_lock(&umtx_lock);
 		if (td1 != NULL) {
 			if (pi->pi_owner == NULL)
 				umtx_pi_setowner(pi, td1);
@@ -1668,18 +1931,13 @@
 	td->td_flags |= TDF_UPIBLOCKED;
 	thread_unlock(td);
 	umtx_propagate_priority(td);
-	mtx_unlock_spin(&umtx_lock);
+	mtx_unlock(&umtx_lock);
 	umtxq_unbusy(&uq->uq_key);
 
-	if (uq->uq_flags & UQF_UMTXQ) {
-		error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
-		if (error == EWOULDBLOCK)
-			error = ETIMEDOUT;
-		if (uq->uq_flags & UQF_UMTXQ) {
-			umtxq_remove(uq);
-		}
-	}
-	mtx_lock_spin(&umtx_lock);
+	error = umtxq_sleep(uq, wmesg, timo);
+	umtxq_remove(uq);
+
+	mtx_lock(&umtx_lock);
 	uq->uq_pi_blocked = NULL;
 	thread_lock(td);
 	td->td_flags &= ~TDF_UPIBLOCKED;
@@ -1686,7 +1944,7 @@
 	thread_unlock(td);
 	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
 	umtx_repropagate_priority(pi);
-	mtx_unlock_spin(&umtx_lock);
+	mtx_unlock(&umtx_lock);
 	umtxq_unlock(&uq->uq_key);
 
 	return (error);
@@ -1718,15 +1976,12 @@
 	UMTXQ_LOCKED_ASSERT(uc);
 	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
 	if (--pi->pi_refcount == 0) {
-		mtx_lock_spin(&umtx_lock);
-		if (pi->pi_owner != NULL) {
-			TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested,
-				pi, pi_link);
-			pi->pi_owner = NULL;
-		}
+		mtx_lock(&umtx_lock);
+		if (pi->pi_owner != NULL)
+			umtx_pi_disown(pi);
 		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
 			("blocked queue not empty"));
-		mtx_unlock_spin(&umtx_lock);
+		mtx_unlock(&umtx_lock);
 		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
 		umtx_pi_free(pi);
 	}
@@ -1769,13 +2024,14 @@
  * Lock a PI mutex.
  */
 static int
-_do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags, int timo,
-	int try)
+do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags,
+    struct _umtx_time *timeout, int try)
 {
+	struct abs_timeout timo;
 	struct umtx_q *uq;
 	struct umtx_pi *pi, *new_pi;
 	uint32_t id, owner, old;
-	int error;
+	int error, rv;
 
 	id = td->td_tid;
 	uq = td->td_umtxq;
@@ -1783,6 +2039,10 @@
 	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
 	    &uq->uq_key)) != 0)
 		return (error);
+
+	if (timeout != NULL)
+		abs_timeout_init2(&timo, timeout);
+
 	umtxq_lock(&uq->uq_key);
 	pi = umtx_pi_lookup(&uq->uq_key);
 	if (pi == NULL) {
@@ -1814,7 +2074,12 @@
 		/*
 		 * Try the uncontested case.  This should be done in userland.
 		 */
-		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);
+		rv = casueword32(&m->m_owner, UMUTEX_UNOWNED, &owner, id);
+		/* The address was invalid. */
+		if (rv == -1) {
+			error = EFAULT;
+			break;
+		}
 
 		/* The acquire succeeded. */
 		if (owner == UMUTEX_UNOWNED) {
@@ -1822,16 +2087,15 @@
 			break;
 		}
 
-		/* The address was invalid. */
-		if (owner == -1) {
-			error = EFAULT;
-			break;
-		}
-
 		/* If no one owns it but it is contested, try to acquire it. */
 		if (owner == UMUTEX_CONTESTED) {
-			owner = casuword32(&m->m_owner,
-			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
+			rv = casueword32(&m->m_owner,
+			    UMUTEX_CONTESTED, &owner, id | UMUTEX_CONTESTED);
+			/* The address was invalid. */
+			if (rv == -1) {
+				error = EFAULT;
+				break;
+			}
 
 			if (owner == UMUTEX_CONTESTED) {
 				umtxq_lock(&uq->uq_key);
@@ -1839,21 +2103,29 @@
 				error = umtx_pi_claim(pi, td);
 				umtxq_unbusy(&uq->uq_key);
 				umtxq_unlock(&uq->uq_key);
+				if (error != 0) {
+					/*
+					 * Since we're going to return an
+					 * error, restore the m_owner to its
+					 * previous, unowned state to avoid
+					 * compounding the problem.
+					 */
+					(void)casuword32(&m->m_owner,
+					    id | UMUTEX_CONTESTED,
+					    UMUTEX_CONTESTED);
+				}
 				break;
 			}
 
-			/* The address was invalid. */
-			if (owner == -1) {
-				error = EFAULT;
+			error = umtxq_check_susp(td);
+			if (error != 0)
 				break;
-			}
 
 			/* If this failed the lock has changed, restart. */
 			continue;
 		}
 
-		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
-		    (owner & ~UMUTEX_CONTESTED) == id) {
+		if ((owner & ~UMUTEX_CONTESTED) == id) {
 			error = EDEADLK;
 			break;
 		}
@@ -1880,13 +2152,12 @@
 		 * either someone else has acquired the lock or it has been
 		 * released.
 		 */
-		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);
+		rv = casueword32(&m->m_owner, owner, &old,
+		    owner | UMUTEX_CONTESTED);
 
 		/* The address was invalid. */
-		if (old == -1) {
-			umtxq_lock(&uq->uq_key);
-			umtxq_unbusy(&uq->uq_key);
-			umtxq_unlock(&uq->uq_key);
+		if (rv == -1) {
+			umtxq_unbusy_unlocked(&uq->uq_key);
 			error = EFAULT;
 			break;
 		}
@@ -1897,13 +2168,19 @@
 		 * and we need to retry or we lost a race to the thread
 		 * unlocking the umtx.
 		 */
-		if (old == owner)
+		if (old == owner) {
 			error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
-				 "umtxpi", timo);
-		else {
+			    "umtxpi", timeout == NULL ? NULL : &timo);
+			if (error != 0)
+				continue;
+		} else {
 			umtxq_unbusy(&uq->uq_key);
 			umtxq_unlock(&uq->uq_key);
 		}
+
+		error = umtxq_check_susp(td);
+		if (error != 0)
+			break;
 	}
 
 	umtxq_lock(&uq->uq_key);
@@ -1932,8 +2209,8 @@
 	/*
 	 * Make sure we own this mtx.
 	 */
-	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
-	if (owner == -1)
+	error = fueword32(&m->m_owner, &owner);
+	if (error == -1)
 		return (EFAULT);
 
 	if ((owner & ~UMUTEX_CONTESTED) != id)
@@ -1941,8 +2218,8 @@
 
 	/* This should be done in userland */
 	if ((owner & UMUTEX_CONTESTED) == 0) {
-		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
-		if (old == -1)
+		error = casueword32(&m->m_owner, owner, &old, UMUTEX_UNOWNED);
+		if (error == -1)
 			return (EFAULT);
 		if (old == owner)
 			return (0);
@@ -1958,11 +2235,11 @@
 	umtxq_busy(&key);
 	count = umtxq_count_pi(&key, &uq_first);
 	if (uq_first != NULL) {
-		mtx_lock_spin(&umtx_lock);
+		mtx_lock(&umtx_lock);
 		pi = uq_first->uq_pi_blocked;
 		KASSERT(pi != NULL, ("pi == NULL?"));
-		if (pi->pi_owner != curthread) {
-			mtx_unlock_spin(&umtx_lock);
+		if (pi->pi_owner != td) {
+			mtx_unlock(&umtx_lock);
 			umtxq_unbusy(&key);
 			umtxq_unlock(&key);
 			umtx_key_release(&key);
@@ -1969,9 +2246,8 @@
 			/* userland messed up the mutex */
 			return (EPERM);
 		}
-		uq_me = curthread->td_umtxq;
-		pi->pi_owner = NULL;
-		TAILQ_REMOVE(&uq_me->uq_pi_contested, pi, pi_link);
+		uq_me = td->td_umtxq;
+		umtx_pi_disown(pi);
 		/* get highest priority thread which is still sleeping. */
 		uq_first = TAILQ_FIRST(&pi->pi_blocked);
 		while (uq_first != NULL && 
@@ -1986,12 +2262,31 @@
 					pri = UPRI(uq_first2->uq_thread);
 			}
 		}
-		thread_lock(curthread);
-		sched_lend_user_prio(curthread, pri);
-		thread_unlock(curthread);
-		mtx_unlock_spin(&umtx_lock);
+		thread_lock(td);
+		sched_lend_user_prio(td, pri);
+		thread_unlock(td);
+		mtx_unlock(&umtx_lock);
 		if (uq_first)
 			umtxq_signal_thread(uq_first);
+	} else {
+		pi = umtx_pi_lookup(&key);
+		/*
+		 * A umtx_pi can exist if a signal or timeout removed the
+		 * last waiter from the umtxq, but there is still
+		 * a thread in do_lock_pi() holding the umtx_pi.
+		 */
+		if (pi != NULL) {
+			/*
+			 * The umtx_pi can be unowned, such as when a thread
+			 * has just entered do_lock_pi(), allocated the
+			 * umtx_pi, and unlocked the umtxq.
+			 * If the current thread owns it, it must disown it.
+			 */
+			mtx_lock(&umtx_lock);
+			if (pi->pi_owner == td)
+				umtx_pi_disown(pi);
+			mtx_unlock(&umtx_lock);
+		}
 	}
 	umtxq_unlock(&key);
 
@@ -2000,14 +2295,12 @@
 	 * at most one thread is waiting for it.
 	 * Otherwise, it must be marked as contested.
 	 */
-	old = casuword32(&m->m_owner, owner,
-		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
+	error = casueword32(&m->m_owner, owner, &old,
+	    count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
 
-	umtxq_lock(&key);
-	umtxq_unbusy(&key);
-	umtxq_unlock(&key);
+	umtxq_unbusy_unlocked(&key);
 	umtx_key_release(&key);
-	if (old == -1)
+	if (error == -1)
 		return (EFAULT);
 	if (old != owner)
 		return (EINVAL);
@@ -2018,14 +2311,15 @@
  * Lock a PP mutex.
  */
 static int
-_do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags, int timo,
-	int try)
+do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags,
+    struct _umtx_time *timeout, int try)
 {
+	struct abs_timeout timo;
 	struct umtx_q *uq, *uq2;
 	struct umtx_pi *pi;
 	uint32_t ceiling;
 	uint32_t owner, id;
-	int error, pri, old_inherited_pri, su;
+	int error, pri, old_inherited_pri, su, rv;
 
 	id = td->td_tid;
 	uq = td->td_umtxq;
@@ -2032,6 +2326,10 @@
 	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
 	    &uq->uq_key)) != 0)
 		return (error);
+
+	if (timeout != NULL)
+		abs_timeout_init2(&timo, timeout);
+
 	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
 	for (;;) {
 		old_inherited_pri = uq->uq_inherited_pri;
@@ -2039,15 +2337,20 @@
 		umtxq_busy(&uq->uq_key);
 		umtxq_unlock(&uq->uq_key);
 
-		ceiling = RTP_PRIO_MAX - fuword32(&m->m_ceilings[0]);
+		rv = fueword32(&m->m_ceilings[0], &ceiling);
+		if (rv == -1) {
+			error = EFAULT;
+			goto out;
+		}
+		ceiling = RTP_PRIO_MAX - ceiling;
 		if (ceiling > RTP_PRIO_MAX) {
 			error = EINVAL;
 			goto out;
 		}
 
-		mtx_lock_spin(&umtx_lock);
+		mtx_lock(&umtx_lock);
 		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
-			mtx_unlock_spin(&umtx_lock);
+			mtx_unlock(&umtx_lock);
 			error = EINVAL;
 			goto out;
 		}
@@ -2058,10 +2361,15 @@
 				sched_lend_user_prio(td, uq->uq_inherited_pri);
 			thread_unlock(td);
 		}
-		mtx_unlock_spin(&umtx_lock);
+		mtx_unlock(&umtx_lock);
 
-		owner = casuword32(&m->m_owner,
-		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
+		rv = casueword32(&m->m_owner,
+		    UMUTEX_CONTESTED, &owner, id | UMUTEX_CONTESTED);
+		/* The address was invalid. */
+		if (rv == -1) {
+			error = EFAULT;
+			break;
+		}
 
 		if (owner == UMUTEX_CONTESTED) {
 			error = 0;
@@ -2068,12 +2376,6 @@
 			break;
 		}
 
-		/* The address was invalid. */
-		if (owner == -1) {
-			error = EFAULT;
-			break;
-		}
-
 		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
 		    (owner & ~UMUTEX_CONTESTED) == id) {
 			error = EDEADLK;
@@ -2095,11 +2397,12 @@
 		umtxq_lock(&uq->uq_key);
 		umtxq_insert(uq);
 		umtxq_unbusy(&uq->uq_key);
-		error = umtxq_sleep(uq, "umtxpp", timo);
+		error = umtxq_sleep(uq, "umtxpp", timeout == NULL ?
+		    NULL : &timo);
 		umtxq_remove(uq);
 		umtxq_unlock(&uq->uq_key);
 
-		mtx_lock_spin(&umtx_lock);
+		mtx_lock(&umtx_lock);
 		uq->uq_inherited_pri = old_inherited_pri;
 		pri = PRI_MAX;
 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
@@ -2114,11 +2417,11 @@
 		thread_lock(td);
 		sched_lend_user_prio(td, pri);
 		thread_unlock(td);
-		mtx_unlock_spin(&umtx_lock);
+		mtx_unlock(&umtx_lock);
 	}
 
 	if (error != 0) {
-		mtx_lock_spin(&umtx_lock);
+		mtx_lock(&umtx_lock);
 		uq->uq_inherited_pri = old_inherited_pri;
 		pri = PRI_MAX;
 		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
@@ -2133,13 +2436,11 @@
 		thread_lock(td);
 		sched_lend_user_prio(td, pri);
 		thread_unlock(td);
-		mtx_unlock_spin(&umtx_lock);
+		mtx_unlock(&umtx_lock);
 	}
 
 out:
-	umtxq_lock(&uq->uq_key);
-	umtxq_unbusy(&uq->uq_key);
-	umtxq_unlock(&uq->uq_key);
+	umtxq_unbusy_unlocked(&uq->uq_key);
 	umtx_key_release(&uq->uq_key);
 	return (error);
 }
@@ -2164,8 +2465,8 @@
 	/*
 	 * Make sure we own this mtx.
 	 */
-	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
-	if (owner == -1)
+	error = fueword32(&m->m_owner, &owner);
+	if (error == -1)
 		return (EFAULT);
 
 	if ((owner & ~UMUTEX_CONTESTED) != id)
@@ -2196,8 +2497,7 @@
 	 * to lock the mutex, it is necessary because thread priority
 	 * has to be adjusted for such a mutex.
 	 */
-	error = suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
-		UMUTEX_CONTESTED);
+	error = suword32(&m->m_owner, UMUTEX_CONTESTED);
 
 	umtxq_lock(&key);
 	if (error == 0)
@@ -2208,7 +2508,7 @@
 	if (error == -1)
 		error = EFAULT;
 	else {
-		mtx_lock_spin(&umtx_lock);
+		mtx_lock(&umtx_lock);
 		if (su != 0)
 			uq->uq_inherited_pri = new_inherited_pri;
 		pri = PRI_MAX;
@@ -2224,7 +2524,7 @@
 		thread_lock(td);
 		sched_lend_user_prio(td, pri);
 		thread_unlock(td);
-		mtx_unlock_spin(&umtx_lock);
+		mtx_unlock(&umtx_lock);
 	}
 	umtx_key_release(&key);
 	return (error);
@@ -2238,9 +2538,11 @@
 	uint32_t save_ceiling;
 	uint32_t owner, id;
 	uint32_t flags;
-	int error;
+	int error, rv;
 
-	flags = fuword32(&m->m_flags);
+	error = fueword32(&m->m_flags, &flags);
+	if (error == -1)
+		return (EFAULT);
 	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
 		return (EINVAL);
 	if (ceiling > RTP_PRIO_MAX)
@@ -2255,25 +2557,26 @@
 		umtxq_busy(&uq->uq_key);
 		umtxq_unlock(&uq->uq_key);
 
-		save_ceiling = fuword32(&m->m_ceilings[0]);
+		rv = fueword32(&m->m_ceilings[0], &save_ceiling);
+		if (rv == -1) {
+			error = EFAULT;
+			break;
+		}
 
-		owner = casuword32(&m->m_owner,
-		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
+		rv = casueword32(&m->m_owner,
+		    UMUTEX_CONTESTED, &owner, id | UMUTEX_CONTESTED);
+		if (rv == -1) {
+			error = EFAULT;
+			break;
+		}
 
 		if (owner == UMUTEX_CONTESTED) {
 			suword32(&m->m_ceilings[0], ceiling);
-			suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
-				UMUTEX_CONTESTED);
+			suword32(&m->m_owner, UMUTEX_CONTESTED);
 			error = 0;
 			break;
 		}
 
-		/* The address was invalid. */
-		if (owner == -1) {
-			error = EFAULT;
-			break;
-		}
-
 		if ((owner & ~UMUTEX_CONTESTED) == id) {
 			suword32(&m->m_ceilings[0], ceiling);
 			error = 0;
@@ -2295,7 +2598,7 @@
 		umtxq_lock(&uq->uq_key);
 		umtxq_insert(uq);
 		umtxq_unbusy(&uq->uq_key);
-		error = umtxq_sleep(uq, "umtxpp", 0);
+		error = umtxq_sleep(uq, "umtxpp", NULL);
 		umtxq_remove(uq);
 		umtxq_unlock(&uq->uq_key);
 	}
@@ -2310,59 +2613,37 @@
 	return (error);
 }
 
-static int
-_do_lock_umutex(struct thread *td, struct umutex *m, int flags, int timo,
-	int mode)
-{
-	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
-	case 0:
-		return (_do_lock_normal(td, m, flags, timo, mode));
-	case UMUTEX_PRIO_INHERIT:
-		return (_do_lock_pi(td, m, flags, timo, mode));
-	case UMUTEX_PRIO_PROTECT:
-		return (_do_lock_pp(td, m, flags, timo, mode));
-	}
-	return (EINVAL);
-}
-
 /*
  * Lock a userland POSIX mutex.
  */
 static int
 do_lock_umutex(struct thread *td, struct umutex *m,
-	struct timespec *timeout, int mode)
+    struct _umtx_time *timeout, int mode)
 {
-	struct timespec ts, ts2, ts3;
-	struct timeval tv;
 	uint32_t flags;
 	int error;
 
-	flags = fuword32(&m->m_flags);
-	if (flags == -1)
+	error = fueword32(&m->m_flags, &flags);
+	if (error == -1)
 		return (EFAULT);
 
+	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
+	case 0:
+		error = do_lock_normal(td, m, flags, timeout, mode);
+		break;
+	case UMUTEX_PRIO_INHERIT:
+		error = do_lock_pi(td, m, flags, timeout, mode);
+		break;
+	case UMUTEX_PRIO_PROTECT:
+		error = do_lock_pp(td, m, flags, timeout, mode);
+		break;
+	default:
+		return (EINVAL);
+	}
 	if (timeout == NULL) {
-		error = _do_lock_umutex(td, m, flags, 0, mode);
-		/* Mutex locking is restarted if it is interrupted. */
 		if (error == EINTR && mode != _UMUTEX_WAIT)
 			error = ERESTART;
 	} else {
-		getnanouptime(&ts);
-		timespecadd(&ts, timeout);
-		TIMESPEC_TO_TIMEVAL(&tv, timeout);
-		for (;;) {
-			error = _do_lock_umutex(td, m, flags, tvtohz(&tv), mode);
-			if (error != ETIMEDOUT)
-				break;
-			getnanouptime(&ts2);
-			if (timespeccmp(&ts2, &ts, >=)) {
-				error = ETIMEDOUT;
-				break;
-			}
-			ts3 = ts;
-			timespecsub(&ts3, &ts2);
-			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
-		}
 		/* Timed-locking is not restarted. */
 		if (error == ERESTART)
 			error = EINTR;
@@ -2377,9 +2658,10 @@
 do_unlock_umutex(struct thread *td, struct umutex *m)
 {
 	uint32_t flags;
+	int error;
 
-	flags = fuword32(&m->m_flags);
-	if (flags == -1)
+	error = fueword32(&m->m_flags, &flags);
+	if (error == -1)
 		return (EFAULT);
 
 	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
@@ -2398,24 +2680,29 @@
 do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
 	struct timespec *timeout, u_long wflags)
 {
+	struct abs_timeout timo;
 	struct umtx_q *uq;
-	struct timeval tv;
-	struct timespec cts, ets, tts;
-	uint32_t flags;
-	uint32_t clockid;
+	uint32_t flags, clockid, hasw;
 	int error;
 
 	uq = td->td_umtxq;
-	flags = fuword32(&cv->c_flags);
+	error = fueword32(&cv->c_flags, &flags);
+	if (error == -1)
+		return (EFAULT);
 	error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
 	if (error != 0)
 		return (error);
 
 	if ((wflags & CVWAIT_CLOCKID) != 0) {
-		clockid = fuword32(&cv->c_clockid);
+		error = fueword32(&cv->c_clockid, &clockid);
+		if (error == -1) {
+			umtx_key_release(&uq->uq_key);
+			return (EFAULT);
+		}
 		if (clockid < CLOCK_REALTIME ||
 		    clockid >= CLOCK_THREAD_CPUTIME_ID) {
 			/* hmm, only HW clock id will work. */
+			umtx_key_release(&uq->uq_key);
 			return (EINVAL);
 		}
 	} else {
@@ -2431,45 +2718,22 @@
 	 * Set c_has_waiters to 1 before releasing the user mutex; also
 	 * don't modify the cache line when unnecessary.
 	 */
-	if (fuword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters)) == 0)
-		suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 1);
+	error = fueword32(&cv->c_has_waiters, &hasw);
+	if (error == 0 && hasw == 0)
+		suword32(&cv->c_has_waiters, 1);
 
-	umtxq_lock(&uq->uq_key);
-	umtxq_unbusy(&uq->uq_key);
-	umtxq_unlock(&uq->uq_key);
+	umtxq_unbusy_unlocked(&uq->uq_key);
 
 	error = do_unlock_umutex(td, m);
+
+	if (timeout != NULL)
+		abs_timeout_init(&timo, clockid, ((wflags & CVWAIT_ABSTIME) != 0),
+			timeout);
 	
 	umtxq_lock(&uq->uq_key);
 	if (error == 0) {
-		if (timeout == NULL) {
-			error = umtxq_sleep(uq, "ucond", 0);
-		} else {
-			if ((wflags & CVWAIT_ABSTIME) == 0) {
-				kern_clock_gettime(td, clockid, &ets);
-				timespecadd(&ets, timeout);
-				tts = *timeout;
-			} else { /* absolute time */
-				ets = *timeout;
-				tts = *timeout;
-				kern_clock_gettime(td, clockid, &cts);
-				timespecsub(&tts, &cts);
-			}
-			TIMESPEC_TO_TIMEVAL(&tv, &tts);
-			for (;;) {
-				error = umtxq_sleep(uq, "ucond", tvtohz(&tv));
-				if (error != ETIMEDOUT)
-					break;
-				kern_clock_gettime(td, clockid, &cts);
-				if (timespeccmp(&cts, &ets, >=)) {
-					error = ETIMEDOUT;
-					break;
-				}
-				tts = ets;
-				timespecsub(&tts, &cts);
-				TIMESPEC_TO_TIMEVAL(&tv, &tts);
-			}
-		}
+		error = umtxq_sleep(uq, "ucond", timeout == NULL ?
+		    NULL : &timo);
 	}
 
 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
@@ -2486,9 +2750,7 @@
 			umtxq_remove(uq);
 			if (oldlen == 1) {
 				umtxq_unlock(&uq->uq_key);
-				suword32(
-				    __DEVOLATILE(uint32_t *,
-					 &cv->c_has_waiters), 0);
+				suword32(&cv->c_has_waiters, 0);
 				umtxq_lock(&uq->uq_key);
 			}
 		}
@@ -2512,7 +2774,9 @@
 	int error, cnt, nwake;
 	uint32_t flags;
 
-	flags = fuword32(&cv->c_flags);
+	error = fueword32(&cv->c_flags, &flags);
+	if (error == -1)
+		return (EFAULT);
 	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
 		return (error);	
 	umtxq_lock(&key);
@@ -2521,8 +2785,9 @@
 	nwake = umtxq_signal(&key, 1);
 	if (cnt <= nwake) {
 		umtxq_unlock(&key);
-		error = suword32(
-		    __DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
+		error = suword32(&cv->c_has_waiters, 0);
+		if (error == -1)
+			error = EFAULT;
 		umtxq_lock(&key);
 	}
 	umtxq_unbusy(&key);
@@ -2538,7 +2803,9 @@
 	int error;
 	uint32_t flags;
 
-	flags = fuword32(&cv->c_flags);
+	error = fueword32(&cv->c_flags, &flags);
+	if (error == -1)
+		return (EFAULT);
 	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
 		return (error);	
 
@@ -2547,11 +2814,11 @@
 	umtxq_signal(&key, INT_MAX);
 	umtxq_unlock(&key);
 
-	error = suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
+	error = suword32(&cv->c_has_waiters, 0);
+	if (error == -1)
+		error = EFAULT;
 
-	umtxq_lock(&key);
-	umtxq_unbusy(&key);
-	umtxq_unlock(&key);
+	umtxq_unbusy_unlocked(&key);
 
 	umtx_key_release(&key);
 	return (error);
@@ -2558,26 +2825,37 @@
 }
 
 static int
-do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag, int timo)
+do_rw_rdlock(struct thread *td, struct urwlock *rwlock, long fflag, struct _umtx_time *timeout)
 {
+	struct abs_timeout timo;
 	struct umtx_q *uq;
 	uint32_t flags, wrflags;
 	int32_t state, oldstate;
 	int32_t blocked_readers;
-	int error;
+	int error, error1, rv;
 
 	uq = td->td_umtxq;
-	flags = fuword32(&rwlock->rw_flags);
+	error = fueword32(&rwlock->rw_flags, &flags);
+	if (error == -1)
+		return (EFAULT);
 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
 	if (error != 0)
 		return (error);
 
+	if (timeout != NULL)
+		abs_timeout_init2(&timo, timeout);
+
 	wrflags = URWLOCK_WRITE_OWNER;
 	if (!(fflag & URWLOCK_PREFER_READER) && !(flags & URWLOCK_PREFER_READER))
 		wrflags |= URWLOCK_WRITE_WAITERS;
 
 	for (;;) {
-		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
+		rv = fueword32(&rwlock->rw_state, &state);
+		if (rv == -1) {
+			umtx_key_release(&uq->uq_key);
+			return (EFAULT);
+		}
+
 		/* try to lock it */
 		while (!(state & wrflags)) {
 			if (__predict_false(URWLOCK_READER_COUNT(state) == URWLOCK_MAX_READERS)) {
@@ -2584,11 +2862,19 @@
 				umtx_key_release(&uq->uq_key);
 				return (EAGAIN);
 			}
-			oldstate = casuword32(&rwlock->rw_state, state, state + 1);
+			rv = casueword32(&rwlock->rw_state, state,
+			    &oldstate, state + 1);
+			if (rv == -1) {
+				umtx_key_release(&uq->uq_key);
+				return (EFAULT);
+			}
 			if (oldstate == state) {
 				umtx_key_release(&uq->uq_key);
 				return (0);
 			}
+			error = umtxq_check_susp(td);
+			if (error != 0)
+				break;
 			state = oldstate;
 		}
 
@@ -2604,27 +2890,49 @@
 		 * re-read the state, in case it changed between the try-lock above
 		 * and the check below
 		 */
-		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
+		rv = fueword32(&rwlock->rw_state, &state);
+		if (rv == -1)
+			error = EFAULT;
 
 		/* set read contention bit */
-		while ((state & wrflags) && !(state & URWLOCK_READ_WAITERS)) {
-			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_READ_WAITERS);
+		while (error == 0 && (state & wrflags) &&
+		    !(state & URWLOCK_READ_WAITERS)) {
+			rv = casueword32(&rwlock->rw_state, state,
+			    &oldstate, state | URWLOCK_READ_WAITERS);
+			if (rv == -1) {
+				error = EFAULT;
+				break;
+			}
 			if (oldstate == state)
 				goto sleep;
 			state = oldstate;
+			error = umtxq_check_susp(td);
+			if (error != 0)
+				break;
 		}
+		if (error != 0) {
+			umtxq_unbusy_unlocked(&uq->uq_key);
+			break;
+		}
 
 		/* state is changed while setting flags, restart */
 		if (!(state & wrflags)) {
-			umtxq_lock(&uq->uq_key);
-			umtxq_unbusy(&uq->uq_key);
-			umtxq_unlock(&uq->uq_key);
+			umtxq_unbusy_unlocked(&uq->uq_key);
+			error = umtxq_check_susp(td);
+			if (error != 0)
+				break;
 			continue;
 		}
 
 sleep:
 		/* contention bit is set, before sleeping, increase read waiter count */
-		blocked_readers = fuword32(&rwlock->rw_blocked_readers);
+		rv = fueword32(&rwlock->rw_blocked_readers,
+		    &blocked_readers);
+		if (rv == -1) {
+			umtxq_unbusy_unlocked(&uq->uq_key);
+			error = EFAULT;
+			break;
+		}
 		suword32(&rwlock->rw_blocked_readers, blocked_readers+1);
 
 		while (state & wrflags) {
@@ -2632,7 +2940,8 @@
 			umtxq_insert(uq);
 			umtxq_unbusy(&uq->uq_key);
 
-			error = umtxq_sleep(uq, "urdlck", timo);
+			error = umtxq_sleep(uq, "urdlck", timeout == NULL ?
+			    NULL : &timo);
 
 			umtxq_busy(&uq->uq_key);
 			umtxq_remove(uq);
@@ -2639,54 +2948,53 @@
 			umtxq_unlock(&uq->uq_key);
 			if (error)
 				break;
-			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
+			rv = fueword32(&rwlock->rw_state, &state);
+			if (rv == -1) {
+				error = EFAULT;
+				break;
+			}
 		}
 
 		/* decrease read waiter count, and may clear read contention bit */
-		blocked_readers = fuword32(&rwlock->rw_blocked_readers);
+		rv = fueword32(&rwlock->rw_blocked_readers,
+		    &blocked_readers);
+		if (rv == -1) {
+			umtxq_unbusy_unlocked(&uq->uq_key);
+			error = EFAULT;
+			break;
+		}
 		suword32(&rwlock->rw_blocked_readers, blocked_readers-1);
 		if (blocked_readers == 1) {
-			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
+			rv = fueword32(&rwlock->rw_state, &state);
+			if (rv == -1) {
+				umtxq_unbusy_unlocked(&uq->uq_key);
+				error = EFAULT;
+				break;
+			}
 			for (;;) {
-				oldstate = casuword32(&rwlock->rw_state, state,
-					 state & ~URWLOCK_READ_WAITERS);
+				rv = casueword32(&rwlock->rw_state, state,
+				    &oldstate, state & ~URWLOCK_READ_WAITERS);
+				if (rv == -1) {
+					error = EFAULT;
+					break;
+				}
 				if (oldstate == state)
 					break;
 				state = oldstate;
+				error1 = umtxq_check_susp(td);
+				if (error1 != 0) {
+					if (error == 0)
+						error = error1;
+					break;
+				}
 			}
 		}
 
-		umtxq_lock(&uq->uq_key);
-		umtxq_unbusy(&uq->uq_key);
-		umtxq_unlock(&uq->uq_key);
+		umtxq_unbusy_unlocked(&uq->uq_key);
+		if (error != 0)
+			break;
 	}
 	umtx_key_release(&uq->uq_key);
-	return (error);
-}
-
-static int
-do_rw_rdlock2(struct thread *td, void *obj, long val, struct timespec *timeout)
-{
-	struct timespec ts, ts2, ts3;
-	struct timeval tv;
-	int error;
-
-	getnanouptime(&ts);
-	timespecadd(&ts, timeout);
-	TIMESPEC_TO_TIMEVAL(&tv, timeout);
-	for (;;) {
-		error = do_rw_rdlock(td, obj, val, tvtohz(&tv));
-		if (error != ETIMEDOUT)
-			break;
-		getnanouptime(&ts2);
-		if (timespeccmp(&ts2, &ts, >=)) {
-			error = ETIMEDOUT;
-			break;
-		}
-		ts3 = ts;
-		timespecsub(&ts3, &ts2);
-		TIMESPEC_TO_TIMEVAL(&tv, &ts3);
-	}
 	if (error == ERESTART)
 		error = EINTR;
 	return (error);
@@ -2693,31 +3001,49 @@
 }
 
 static int
-do_rw_wrlock(struct thread *td, struct urwlock *rwlock, int timo)
+do_rw_wrlock(struct thread *td, struct urwlock *rwlock, struct _umtx_time *timeout)
 {
+	struct abs_timeout timo;
 	struct umtx_q *uq;
 	uint32_t flags;
 	int32_t state, oldstate;
 	int32_t blocked_writers;
 	int32_t blocked_readers;
-	int error;
+	int error, error1, rv;
 
 	uq = td->td_umtxq;
-	flags = fuword32(&rwlock->rw_flags);
+	error = fueword32(&rwlock->rw_flags, &flags);
+	if (error == -1)
+		return (EFAULT);
 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
 	if (error != 0)
 		return (error);
 
+	if (timeout != NULL)
+		abs_timeout_init2(&timo, timeout);
+
 	blocked_readers = 0;
 	for (;;) {
-		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
+		rv = fueword32(&rwlock->rw_state, &state);
+		if (rv == -1) {
+			umtx_key_release(&uq->uq_key);
+			return (EFAULT);
+		}
 		while (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
-			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_OWNER);
+			rv = casueword32(&rwlock->rw_state, state,
+			    &oldstate, state | URWLOCK_WRITE_OWNER);
+			if (rv == -1) {
+				umtx_key_release(&uq->uq_key);
+				return (EFAULT);
+			}
 			if (oldstate == state) {
 				umtx_key_release(&uq->uq_key);
 				return (0);
 			}
 			state = oldstate;
+			error = umtxq_check_susp(td);
+			if (error != 0)
+				break;
 		}
 
 		if (error) {
@@ -2742,24 +3068,46 @@
 		 * re-read the state, in case it changed between the try-lock above
 		 * and the check below
 		 */
-		state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
+		rv = fueword32(&rwlock->rw_state, &state);
+		if (rv == -1)
+			error = EFAULT;
 
-		while (((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) &&
-		       (state & URWLOCK_WRITE_WAITERS) == 0) {
-			oldstate = casuword32(&rwlock->rw_state, state, state | URWLOCK_WRITE_WAITERS);
+		while (error == 0 && ((state & URWLOCK_WRITE_OWNER) ||
+		    URWLOCK_READER_COUNT(state) != 0) &&
+		    (state & URWLOCK_WRITE_WAITERS) == 0) {
+			rv = casueword32(&rwlock->rw_state, state,
+			    &oldstate, state | URWLOCK_WRITE_WAITERS);
+			if (rv == -1) {
+				error = EFAULT;
+				break;
+			}
 			if (oldstate == state)
 				goto sleep;
 			state = oldstate;
+			error = umtxq_check_susp(td);
+			if (error != 0)
+				break;
 		}
+		if (error != 0) {
+			umtxq_unbusy_unlocked(&uq->uq_key);
+			break;
+		}
 
 		if (!(state & URWLOCK_WRITE_OWNER) && URWLOCK_READER_COUNT(state) == 0) {
-			umtxq_lock(&uq->uq_key);
-			umtxq_unbusy(&uq->uq_key);
-			umtxq_unlock(&uq->uq_key);
+			umtxq_unbusy_unlocked(&uq->uq_key);
+			error = umtxq_check_susp(td);
+			if (error != 0)
+				break;
 			continue;
 		}
 sleep:
-		blocked_writers = fuword32(&rwlock->rw_blocked_writers);
+		rv = fueword32(&rwlock->rw_blocked_writers,
+		    &blocked_writers);
+		if (rv == -1) {
+			umtxq_unbusy_unlocked(&uq->uq_key);
+			error = EFAULT;
+			break;
+		}
 		suword32(&rwlock->rw_blocked_writers, blocked_writers+1);
 
 		while ((state & URWLOCK_WRITE_OWNER) || URWLOCK_READER_COUNT(state) != 0) {
@@ -2767,7 +3115,8 @@
 			umtxq_insert_queue(uq, UMTX_EXCLUSIVE_QUEUE);
 			umtxq_unbusy(&uq->uq_key);
 
-			error = umtxq_sleep(uq, "uwrlck", timo);
+			error = umtxq_sleep(uq, "uwrlck", timeout == NULL ?
+			    NULL : &timo);
 
 			umtxq_busy(&uq->uq_key);
 			umtxq_remove_queue(uq, UMTX_EXCLUSIVE_QUEUE);
@@ -2774,56 +3123,64 @@
 			umtxq_unlock(&uq->uq_key);
 			if (error)
 				break;
-			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
+			rv = fueword32(&rwlock->rw_state, &state);
+			if (rv == -1) {
+				error = EFAULT;
+				break;
+			}
 		}
 
-		blocked_writers = fuword32(&rwlock->rw_blocked_writers);
+		rv = fueword32(&rwlock->rw_blocked_writers,
+		    &blocked_writers);
+		if (rv == -1) {
+			umtxq_unbusy_unlocked(&uq->uq_key);
+			error = EFAULT;
+			break;
+		}
 		suword32(&rwlock->rw_blocked_writers, blocked_writers-1);
 		if (blocked_writers == 1) {
-			state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
+			rv = fueword32(&rwlock->rw_state, &state);
+			if (rv == -1) {
+				umtxq_unbusy_unlocked(&uq->uq_key);
+				error = EFAULT;
+				break;
+			}
 			for (;;) {
-				oldstate = casuword32(&rwlock->rw_state, state,
-					 state & ~URWLOCK_WRITE_WAITERS);
+				rv = casueword32(&rwlock->rw_state, state,
+				    &oldstate, state & ~URWLOCK_WRITE_WAITERS);
+				if (rv == -1) {
+					error = EFAULT;
+					break;
+				}
 				if (oldstate == state)
 					break;
 				state = oldstate;
+				error1 = umtxq_check_susp(td);
+				/*
+				 * We are leaving the URWLOCK_WRITE_WAITERS
+				 * flag behind, but this should not harm
+				 * correctness.
+				 */
+				if (error1 != 0) {
+					if (error == 0)
+						error = error1;
+					break;
+				}
 			}
-			blocked_readers = fuword32(&rwlock->rw_blocked_readers);
+			rv = fueword32(&rwlock->rw_blocked_readers,
+			    &blocked_readers);
+			if (rv == -1) {
+				umtxq_unbusy_unlocked(&uq->uq_key);
+				error = EFAULT;
+				break;
+			}
 		} else
 			blocked_readers = 0;
 
-		umtxq_lock(&uq->uq_key);
-		umtxq_unbusy(&uq->uq_key);
-		umtxq_unlock(&uq->uq_key);
+		umtxq_unbusy_unlocked(&uq->uq_key);
 	}
 
 	umtx_key_release(&uq->uq_key);
-	return (error);
-}
-
-static int
-do_rw_wrlock2(struct thread *td, void *obj, struct timespec *timeout)
-{
-	struct timespec ts, ts2, ts3;
-	struct timeval tv;
-	int error;
-
-	getnanouptime(&ts);
-	timespecadd(&ts, timeout);
-	TIMESPEC_TO_TIMEVAL(&tv, timeout);
-	for (;;) {
-		error = do_rw_wrlock(td, obj, tvtohz(&tv));
-		if (error != ETIMEDOUT)
-			break;
-		getnanouptime(&ts2);
-		if (timespeccmp(&ts2, &ts, >=)) {
-			error = ETIMEDOUT;
-			break;
-		}
-		ts3 = ts;
-		timespecsub(&ts3, &ts2);
-		TIMESPEC_TO_TIMEVAL(&tv, &ts3);
-	}
 	if (error == ERESTART)
 		error = EINTR;
 	return (error);
@@ -2835,19 +3192,29 @@
 	struct umtx_q *uq;
 	uint32_t flags;
 	int32_t state, oldstate;
-	int error, q, count;
+	int error, rv, q, count;
 
 	uq = td->td_umtxq;
-	flags = fuword32(&rwlock->rw_flags);
+	error = fueword32(&rwlock->rw_flags, &flags);
+	if (error == -1)
+		return (EFAULT);
 	error = umtx_key_get(rwlock, TYPE_RWLOCK, GET_SHARE(flags), &uq->uq_key);
 	if (error != 0)
 		return (error);
 
-	state = fuword32(__DEVOLATILE(int32_t *, &rwlock->rw_state));
+	error = fueword32(&rwlock->rw_state, &state);
+	if (error == -1) {
+		error = EFAULT;
+		goto out;
+	}
 	if (state & URWLOCK_WRITE_OWNER) {
 		for (;;) {
-			oldstate = casuword32(&rwlock->rw_state, state, 
-				state & ~URWLOCK_WRITE_OWNER);
+			rv = casueword32(&rwlock->rw_state, state, 
+			    &oldstate, state & ~URWLOCK_WRITE_OWNER);
+			if (rv == -1) {
+				error = EFAULT;
+				goto out;
+			}
 			if (oldstate != state) {
 				state = oldstate;
 				if (!(oldstate & URWLOCK_WRITE_OWNER)) {
@@ -2854,13 +3221,20 @@
 					error = EPERM;
 					goto out;
 				}
+				error = umtxq_check_susp(td);
+				if (error != 0)
+					goto out;
 			} else
 				break;
 		}
 	} else if (URWLOCK_READER_COUNT(state) != 0) {
 		for (;;) {
-			oldstate = casuword32(&rwlock->rw_state, state,
-				state - 1);
+			rv = casueword32(&rwlock->rw_state, state,
+			    &oldstate, state - 1);
+			if (rv == -1) {
+				error = EFAULT;
+				goto out;
+			}
 			if (oldstate != state) {
 				state = oldstate;
 				if (URWLOCK_READER_COUNT(oldstate) == 0) {
@@ -2867,8 +3241,10 @@
 					error = EPERM;
 					goto out;
 				}
-			}
-			else
+				error = umtxq_check_susp(td);
+				if (error != 0)
+					goto out;
+			} else
 				break;
 		}
 	} else {
@@ -2909,62 +3285,43 @@
 }
 
 static int
-do_sem_wait(struct thread *td, struct _usem *sem, struct timespec *timeout)
+do_sem_wait(struct thread *td, struct _usem *sem, struct _umtx_time *timeout)
 {
+	struct abs_timeout timo;
 	struct umtx_q *uq;
-	struct timeval tv;
-	struct timespec cts, ets, tts;
-	uint32_t flags, count;
-	int error;
+	uint32_t flags, count, count1;
+	int error, rv;
 
 	uq = td->td_umtxq;
-	flags = fuword32(&sem->_flags);
+	error = fueword32(&sem->_flags, &flags);
+	if (error == -1)
+		return (EFAULT);
 	error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &uq->uq_key);
 	if (error != 0)
 		return (error);
+
+	if (timeout != NULL)
+		abs_timeout_init2(&timo, timeout);
+
 	umtxq_lock(&uq->uq_key);
 	umtxq_busy(&uq->uq_key);
 	umtxq_insert(uq);
 	umtxq_unlock(&uq->uq_key);
-
-	if (fuword32(__DEVOLATILE(uint32_t *, &sem->_has_waiters)) == 0)
-		casuword32(__DEVOLATILE(uint32_t *, &sem->_has_waiters), 0, 1);
-
-	count = fuword32(__DEVOLATILE(uint32_t *, &sem->_count));
-	if (count != 0) {
+	rv = casueword32(&sem->_has_waiters, 0, &count1, 1);
+	if (rv == 0)
+		rv = fueword32(&sem->_count, &count);
+	if (rv == -1 || count != 0) {
 		umtxq_lock(&uq->uq_key);
 		umtxq_unbusy(&uq->uq_key);
 		umtxq_remove(uq);
 		umtxq_unlock(&uq->uq_key);
 		umtx_key_release(&uq->uq_key);
-		return (0);
+		return (rv == -1 ? EFAULT : 0);
 	}
-
 	umtxq_lock(&uq->uq_key);
 	umtxq_unbusy(&uq->uq_key);
-	umtxq_unlock(&uq->uq_key);
 
-	umtxq_lock(&uq->uq_key);
-	if (timeout == NULL) {
-		error = umtxq_sleep(uq, "usem", 0);
-	} else {
-		getnanouptime(&ets);
-		timespecadd(&ets, timeout);
-		TIMESPEC_TO_TIMEVAL(&tv, timeout);
-		for (;;) {
-			error = umtxq_sleep(uq, "usem", tvtohz(&tv));
-			if (error != ETIMEDOUT)
-				break;
-			getnanouptime(&cts);
-			if (timespeccmp(&cts, &ets, >=)) {
-				error = ETIMEDOUT;
-				break;
-			}
-			tts = ets;
-			timespecsub(&tts, &cts);
-			TIMESPEC_TO_TIMEVAL(&tv, &tts);
-		}
-	}
+	error = umtxq_sleep(uq, "usem", timeout == NULL ? NULL : &timo);
 
 	if ((uq->uq_flags & UQF_UMTXQ) == 0)
 		error = 0;
@@ -2971,7 +3328,8 @@
 	else {
 		umtxq_remove(uq);
 		/* A relative timeout cannot be restarted. */
-		if (error == ERESTART && timeout != NULL)
+		if (error == ERESTART && timeout != NULL &&
+		    (timeout->_flags & UMTX_ABSTIME) == 0)
 			error = EINTR;
 	}
 	umtxq_unlock(&uq->uq_key);
@@ -2986,21 +3344,31 @@
 do_sem_wake(struct thread *td, struct _usem *sem)
 {
 	struct umtx_key key;
-	int error, cnt, nwake;
+	int error, cnt;
 	uint32_t flags;
 
-	flags = fuword32(&sem->_flags);
+	error = fueword32(&sem->_flags, &flags);
+	if (error == -1)
+		return (EFAULT);
 	if ((error = umtx_key_get(sem, TYPE_SEM, GET_SHARE(flags), &key)) != 0)
 		return (error);	
 	umtxq_lock(&key);
 	umtxq_busy(&key);
 	cnt = umtxq_count(&key);
-	nwake = umtxq_signal(&key, 1);
-	if (cnt <= nwake) {
-		umtxq_unlock(&key);
-		error = suword32(
-		    __DEVOLATILE(uint32_t *, &sem->_has_waiters), 0);
-		umtxq_lock(&key);
+	if (cnt > 0) {
+		/*
+		 * If the count is greater than zero, the memory is still
+		 * being referenced by user code, so we can safely update
+		 * the _has_waiters flag.
+		 */
+		if (cnt == 1) {
+			umtxq_unlock(&key);
+			error = suword32(&sem->_has_waiters, 0);
+			umtxq_lock(&key);
+			if (error == -1)
+				error = EFAULT;
+		}
+		umtxq_signal(&key, 1);
 	}
 	umtxq_unbusy(&key);
 	umtxq_unlock(&key);
@@ -3012,7 +3380,7 @@
 sys__umtx_lock(struct thread *td, struct _umtx_lock_args *uap)
     /* struct umtx *umtx */
 {
-	return _do_lock_umtx(td, uap->umtx, td->td_tid, 0);
+	return do_lock_umtx(td, uap->umtx, td->td_tid, 0);
 }
 
 int
@@ -3037,6 +3405,25 @@
 	return (error);
 }
 
+static inline int
+umtx_copyin_umtx_time(const void *addr, size_t size, struct _umtx_time *tp)
+{
+	int error;
+	
+	if (size <= sizeof(struct timespec)) {
+		tp->_clockid = CLOCK_REALTIME;
+		tp->_flags = 0;
+		error = copyin(addr, &tp->_timeout, sizeof(struct timespec));
+	} else 
+		error = copyin(addr, tp, sizeof(struct _umtx_time));
+	if (error != 0)
+		return (error);
+	if (tp->_timeout.tv_sec < 0 ||
+	    tp->_timeout.tv_nsec >= 1000000000 || tp->_timeout.tv_nsec < 0)
+		return (EINVAL);
+	return (0);
+}
+
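
[The helper above dispatches on the size passed in by the wrapper functions: anything up to sizeof(struct timespec) is read as a bare relative CLOCK_REALTIME timeout, while a larger object is read as the extended struct _umtx_time. A minimal userland sketch of a timed wait under this ABI; the _umtx_op() prototype and constants are assumed from sys/umtx.h and are not part of this commit:

#include <stdint.h>
#include <time.h>
#include <sys/types.h>
#include <sys/umtx.h>

static int
futex_wait_abs(u_long *word, u_long expected, const struct timespec *deadline)
{
	struct _umtx_time tmo;

	tmo._timeout = *deadline;		/* absolute deadline */
	tmo._flags = UMTX_ABSTIME;
	tmo._clockid = CLOCK_MONOTONIC;
	/* uaddr1 carries the size so the kernel takes the extended path. */
	return (_umtx_op(word, UMTX_OP_WAIT, expected,
	    (void *)(uintptr_t)sizeof(tmo), &tmo));
}

Passing sizeof(struct timespec) instead would land in the compatibility branch and behave like the pre-change relative timeout.]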
 static int
 __umtx_op_lock_umtx(struct thread *td, struct _umtx_op_args *uap)
 {
@@ -3064,52 +3451,55 @@
 static int
 __umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
 {
-	struct timespec *ts, timeout;
+	struct _umtx_time timeout, *tm_p;
 	int error;
 
 	if (uap->uaddr2 == NULL)
-		ts = NULL;
+		tm_p = NULL;
 	else {
-		error = umtx_copyin_timeout(uap->uaddr2, &timeout);
+		error = umtx_copyin_umtx_time(
+		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
-		ts = &timeout;
+		tm_p = &timeout;
 	}
-	return do_wait(td, uap->obj, uap->val, ts, 0, 0);
+	return do_wait(td, uap->obj, uap->val, tm_p, 0, 0);
 }
 
 static int
 __umtx_op_wait_uint(struct thread *td, struct _umtx_op_args *uap)
 {
-	struct timespec *ts, timeout;
+	struct _umtx_time timeout, *tm_p;
 	int error;
 
 	if (uap->uaddr2 == NULL)
-		ts = NULL;
+		tm_p = NULL;
 	else {
-		error = umtx_copyin_timeout(uap->uaddr2, &timeout);
+		error = umtx_copyin_umtx_time(
+		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
-		ts = &timeout;
+		tm_p = &timeout;
 	}
-	return do_wait(td, uap->obj, uap->val, ts, 1, 0);
+	return do_wait(td, uap->obj, uap->val, tm_p, 1, 0);
 }
 
 static int
 __umtx_op_wait_uint_private(struct thread *td, struct _umtx_op_args *uap)
 {
-	struct timespec *ts, timeout;
+	struct _umtx_time *tm_p, timeout;
 	int error;
 
 	if (uap->uaddr2 == NULL)
-		ts = NULL;
+		tm_p = NULL;
 	else {
-		error = umtx_copyin_timeout(uap->uaddr2, &timeout);
+		error = umtx_copyin_umtx_time(
+		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
-		ts = &timeout;
+		tm_p = &timeout;
 	}
-	return do_wait(td, uap->obj, uap->val, ts, 1, 1);
+	return do_wait(td, uap->obj, uap->val, tm_p, 1, 1);
 }
 
 static int
@@ -3153,19 +3543,20 @@
 static int
 __umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
 {
-	struct timespec *ts, timeout;
+	struct _umtx_time *tm_p, timeout;
 	int error;
 
 	/* Allow a null timespec (wait forever). */
 	if (uap->uaddr2 == NULL)
-		ts = NULL;
+		tm_p = NULL;
 	else {
-		error = umtx_copyin_timeout(uap->uaddr2, &timeout);
+		error = umtx_copyin_umtx_time(
+		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
-		ts = &timeout;
+		tm_p = &timeout;
 	}
-	return do_lock_umutex(td, uap->obj, ts, 0);
+	return do_lock_umutex(td, uap->obj, tm_p, 0);
 }
 
 static int
@@ -3177,19 +3568,20 @@
 static int
 __umtx_op_wait_umutex(struct thread *td, struct _umtx_op_args *uap)
 {
-	struct timespec *ts, timeout;
+	struct _umtx_time *tm_p, timeout;
 	int error;
 
 	/* Allow a null timespec (wait forever). */
 	if (uap->uaddr2 == NULL)
-		ts = NULL;
+		tm_p = NULL;
 	else {
-		error = umtx_copyin_timeout(uap->uaddr2, &timeout);
+		error = umtx_copyin_umtx_time(
+		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
-		ts = &timeout;
+		tm_p = &timeout;
 	}
-	return do_lock_umutex(td, uap->obj, ts, _UMUTEX_WAIT);
+	return do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT);
 }
 
 static int
@@ -3243,7 +3635,7 @@
 static int
 __umtx_op_rw_rdlock(struct thread *td, struct _umtx_op_args *uap)
 {
-	struct timespec timeout;
+	struct _umtx_time timeout;
 	int error;
 
 	/* Allow a null timespec (wait forever). */
@@ -3250,10 +3642,11 @@
 	if (uap->uaddr2 == NULL) {
 		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
 	} else {
-		error = umtx_copyin_timeout(uap->uaddr2, &timeout);
+		error = umtx_copyin_umtx_time(uap->uaddr2,
+		   (size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
-		error = do_rw_rdlock2(td, uap->obj, uap->val, &timeout);
+		error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
 	}
 	return (error);
 }
@@ -3261,7 +3654,7 @@
 static int
 __umtx_op_rw_wrlock(struct thread *td, struct _umtx_op_args *uap)
 {
-	struct timespec timeout;
+	struct _umtx_time timeout;
 	int error;
 
 	/* Allow a null timespec (wait forever). */
@@ -3268,11 +3661,12 @@
 	if (uap->uaddr2 == NULL) {
 		error = do_rw_wrlock(td, uap->obj, 0);
 	} else {
-		error = umtx_copyin_timeout(uap->uaddr2, &timeout);
+		error = umtx_copyin_umtx_time(uap->uaddr2, 
+		   (size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
 
-		error = do_rw_wrlock2(td, uap->obj, &timeout);
+		error = do_rw_wrlock(td, uap->obj, &timeout);
 	}
 	return (error);
 }
@@ -3286,19 +3680,20 @@
 static int
 __umtx_op_sem_wait(struct thread *td, struct _umtx_op_args *uap)
 {
-	struct timespec *ts, timeout;
+	struct _umtx_time *tm_p, timeout;
 	int error;
 
 	/* Allow a null timespec (wait forever). */
 	if (uap->uaddr2 == NULL)
-		ts = NULL;
+		tm_p = NULL;
 	else {
-		error = umtx_copyin_timeout(uap->uaddr2, &timeout);
+		error = umtx_copyin_umtx_time(
+		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
-		ts = &timeout;
+		tm_p = &timeout;
 	}
-	return (do_sem_wait(td, uap->obj, ts));
+	return (do_sem_wait(td, uap->obj, tm_p));
 }
 
 static int
@@ -3369,6 +3764,12 @@
 	int32_t tv_nsec;
 };
 
+struct umtx_time32 {
+	struct	timespec32	timeout;
+	uint32_t		flags;
+	uint32_t		clockid;
+};
+
 static inline int
 umtx_copyin_timeout32(void *addr, struct timespec *tsp)
 {
@@ -3389,6 +3790,30 @@
 	return (error);
 }
 
+static inline int
+umtx_copyin_umtx_time32(const void *addr, size_t size, struct _umtx_time *tp)
+{
+	struct umtx_time32 t32;
+	int error;
+	
+	t32.clockid = CLOCK_REALTIME;
+	t32.flags   = 0;
+	if (size <= sizeof(struct timespec32))
+		error = copyin(addr, &t32.timeout, sizeof(struct timespec32));
+	else 
+		error = copyin(addr, &t32, sizeof(struct umtx_time32));
+	if (error != 0)
+		return (error);
+	if (t32.timeout.tv_sec < 0 ||
+	    t32.timeout.tv_nsec >= 1000000000 || t32.timeout.tv_nsec < 0)
+		return (EINVAL);
+	tp->_timeout.tv_sec = t32.timeout.tv_sec;
+	tp->_timeout.tv_nsec = t32.timeout.tv_nsec;
+	tp->_flags = t32.flags;
+	tp->_clockid = t32.clockid;
+	return (0);
+}
+
 static int
 __umtx_op_lock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
 {
@@ -3416,54 +3841,57 @@
 static int
 __umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
 {
-	struct timespec *ts, timeout;
+	struct _umtx_time *tm_p, timeout;
 	int error;
 
 	if (uap->uaddr2 == NULL)
-		ts = NULL;
+		tm_p = NULL;
 	else {
-		error = umtx_copyin_timeout32(uap->uaddr2, &timeout);
+		error = umtx_copyin_umtx_time32(uap->uaddr2,
+			(size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
-		ts = &timeout;
+		tm_p = &timeout;
 	}
-	return do_wait(td, uap->obj, uap->val, ts, 1, 0);
+	return do_wait(td, uap->obj, uap->val, tm_p, 1, 0);
 }
 
 static int
 __umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
 {
-	struct timespec *ts, timeout;
+	struct _umtx_time *tm_p, timeout;
 	int error;
 
 	/* Allow a null timespec (wait forever). */
 	if (uap->uaddr2 == NULL)
-		ts = NULL;
+		tm_p = NULL;
 	else {
-		error = umtx_copyin_timeout32(uap->uaddr2, &timeout);
+		error = umtx_copyin_umtx_time32(uap->uaddr2,
+			    (size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
-		ts = &timeout;
+		tm_p = &timeout;
 	}
-	return do_lock_umutex(td, uap->obj, ts, 0);
+	return do_lock_umutex(td, uap->obj, tm_p, 0);
 }
 
 static int
 __umtx_op_wait_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
 {
-	struct timespec *ts, timeout;
+	struct _umtx_time *tm_p, timeout;
 	int error;
 
 	/* Allow a null timespec (wait forever). */
 	if (uap->uaddr2 == NULL)
-		ts = NULL;
+		tm_p = NULL;
 	else {
-		error = umtx_copyin_timeout32(uap->uaddr2, &timeout);
+		error = umtx_copyin_umtx_time32(uap->uaddr2, 
+		    (size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
-		ts = &timeout;
+		tm_p = &timeout;
 	}
-	return do_lock_umutex(td, uap->obj, ts, _UMUTEX_WAIT);
+	return do_lock_umutex(td, uap->obj, tm_p, _UMUTEX_WAIT);
 }
 
 static int
@@ -3487,7 +3915,7 @@
 static int
 __umtx_op_rw_rdlock_compat32(struct thread *td, struct _umtx_op_args *uap)
 {
-	struct timespec timeout;
+	struct _umtx_time timeout;
 	int error;
 
 	/* Allow a null timespec (wait forever). */
@@ -3494,10 +3922,11 @@
 	if (uap->uaddr2 == NULL) {
 		error = do_rw_rdlock(td, uap->obj, uap->val, 0);
 	} else {
-		error = umtx_copyin_timeout32(uap->uaddr2, &timeout);
+		error = umtx_copyin_umtx_time32(uap->uaddr2,
+		    (size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
-		error = do_rw_rdlock2(td, uap->obj, uap->val, &timeout);
+		error = do_rw_rdlock(td, uap->obj, uap->val, &timeout);
 	}
 	return (error);
 }
@@ -3505,7 +3934,7 @@
 static int
 __umtx_op_rw_wrlock_compat32(struct thread *td, struct _umtx_op_args *uap)
 {
-	struct timespec timeout;
+	struct _umtx_time timeout;
 	int error;
 
 	/* Allow a null timespec (wait forever). */
@@ -3512,11 +3941,11 @@
 	if (uap->uaddr2 == NULL) {
 		error = do_rw_wrlock(td, uap->obj, 0);
 	} else {
-		error = umtx_copyin_timeout32(uap->uaddr2, &timeout);
+		error = umtx_copyin_umtx_time32(uap->uaddr2,
+		    (size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
-
-		error = do_rw_wrlock2(td, uap->obj, &timeout);
+		error = do_rw_wrlock(td, uap->obj, &timeout);
 	}
 	return (error);
 }
@@ -3524,36 +3953,38 @@
 static int
 __umtx_op_wait_uint_private_compat32(struct thread *td, struct _umtx_op_args *uap)
 {
-	struct timespec *ts, timeout;
+	struct _umtx_time *tm_p, timeout;
 	int error;
 
 	if (uap->uaddr2 == NULL)
-		ts = NULL;
+		tm_p = NULL;
 	else {
-		error = umtx_copyin_timeout32(uap->uaddr2, &timeout);
+		error = umtx_copyin_umtx_time32(
+		    uap->uaddr2, (size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
-		ts = &timeout;
+		tm_p = &timeout;
 	}
-	return do_wait(td, uap->obj, uap->val, ts, 1, 1);
+	return do_wait(td, uap->obj, uap->val, tm_p, 1, 1);
 }
 
 static int
 __umtx_op_sem_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
 {
-	struct timespec *ts, timeout;
+	struct _umtx_time *tm_p, timeout;
 	int error;
 
 	/* Allow a null timespec (wait forever). */
 	if (uap->uaddr2 == NULL)
-		ts = NULL;
+		tm_p = NULL;
 	else {
-		error = umtx_copyin_timeout32(uap->uaddr2, &timeout);
+		error = umtx_copyin_umtx_time32(uap->uaddr2,
+		    (size_t)uap->uaddr1, &timeout);
 		if (error != 0)
 			return (error);
-		ts = &timeout;
+		tm_p = &timeout;
 	}
-	return (do_sem_wait(td, uap->obj, ts));
+	return (do_sem_wait(td, uap->obj, tm_p));
 }
 
 static int
@@ -3679,13 +4110,13 @@
 	if ((uq = td->td_umtxq) == NULL)
 		return;
 
-	mtx_lock_spin(&umtx_lock);
+	mtx_lock(&umtx_lock);
 	uq->uq_inherited_pri = PRI_MAX;
 	while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
 		pi->pi_owner = NULL;
 		TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
 	}
-	mtx_unlock_spin(&umtx_lock);
+	mtx_unlock(&umtx_lock);
 	thread_lock(td);
 	sched_lend_user_prio(td, PRI_MAX);
 	thread_unlock(td);
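
[Most hunks in this file follow one conversion pattern: fuword32() returns the fetched word and overloads -1 as its error indicator, so a legitimate stored 0xffffffff was indistinguishable from a fault; fueword32() reports status and value separately. A minimal kernel-context sketch of the new idiom, illustrative rather than verbatim from the file:

static int
fetch_flags(struct umutex *m, uint32_t *flagsp)
{

	if (fueword32(&m->m_flags, flagsp) == -1)
		return (EFAULT);	/* a genuine copyin fault */
	return (0);			/* *flagsp may be 0xffffffff */
}

casuword32() is converted to casueword32() for the same reason: the old value comes back through a pointer and the return value is reserved for the fault status.]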

Modified: trunk/sys/kern/kern_uuid.c
===================================================================
--- trunk/sys/kern/kern_uuid.c	2018-05-25 20:59:46 UTC (rev 9949)
+++ trunk/sys/kern/kern_uuid.c	2018-05-25 21:07:09 UTC (rev 9950)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 2002 Marcel Moolenaar
  * All rights reserved.
@@ -25,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/kern_uuid.c 262239 2014-02-20 08:55:59Z brueffer $");
 
 #include <sys/param.h>
 #include <sys/endian.h>
@@ -71,54 +72,41 @@
 
 CTASSERT(sizeof(struct uuid_private) == 16);
 
+struct uuid_macaddr {
+	uint16_t	state;
+#define	UUID_ETHER_EMPTY	0
+#define	UUID_ETHER_RANDOM	1
+#define	UUID_ETHER_UNIQUE	2
+	uint16_t	node[UUID_NODE_LEN>>1];
+};
+
 static struct uuid_private uuid_last;
 
+#define UUID_NETHER	4
+static struct uuid_macaddr uuid_ether[UUID_NETHER];
+
 static struct mtx uuid_mutex;
 MTX_SYSINIT(uuid_lock, &uuid_mutex, "UUID generator mutex lock", MTX_DEF);
 
 /*
- * Return the first MAC address we encounter or, if none was found,
- * construct a sufficiently random multicast address. We don't try
- * to return the same MAC address as previously returned. We always
- * generate a new multicast address if no MAC address exists in the
- * system.
- * It would be nice to know if 'ifnet' or any of its sub-structures
- * has been changed in any way. If not, we could simply skip the
- * scan and safely return the MAC address we returned before.
+ * Return the first MAC address added to the array. If the array is empty,
+ * construct a sufficiently random multicast MAC address first. Any
+ * addresses added later will bump the random MAC address up to the next
+ * index.
  */
 static void
 uuid_node(uint16_t *node)
 {
-	struct ifnet *ifp;
-	struct ifaddr *ifa;
-	struct sockaddr_dl *sdl;
 	int i;
 
-	CURVNET_SET(TD_TO_VNET(curthread));
-	IFNET_RLOCK_NOSLEEP();
-	TAILQ_FOREACH(ifp, &V_ifnet, if_link) {
-		/* Walk the address list */
-		IF_ADDR_RLOCK(ifp);
-		TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
-			sdl = (struct sockaddr_dl*)ifa->ifa_addr;
-			if (sdl != NULL && sdl->sdl_family == AF_LINK &&
-			    sdl->sdl_type == IFT_ETHER) {
-				/* Got a MAC address. */
-				bcopy(LLADDR(sdl), node, UUID_NODE_LEN);
-				IF_ADDR_RUNLOCK(ifp);
-				IFNET_RUNLOCK_NOSLEEP();
-				CURVNET_RESTORE();
-				return;
-			}
-		}
-		IF_ADDR_RUNLOCK(ifp);
+	if (uuid_ether[0].state == UUID_ETHER_EMPTY) {
+		for (i = 0; i < (UUID_NODE_LEN>>1); i++)
+			uuid_ether[0].node[i] = (uint16_t)arc4random();
+		*((uint8_t*)uuid_ether[0].node) |= 0x01;
+		uuid_ether[0].state = UUID_ETHER_RANDOM;
 	}
-	IFNET_RUNLOCK_NOSLEEP();
-
 	for (i = 0; i < (UUID_NODE_LEN>>1); i++)
-		node[i] = (uint16_t)arc4random();
-	*((uint8_t*)node) |= 0x01;
-	CURVNET_RESTORE();
+		node[i] = uuid_ether[0].node[i];
 }
 
 /*
@@ -211,6 +199,76 @@
 }
 
 int
+uuid_ether_add(const uint8_t *addr)
+{
+	int i, sum;
+
+	/*
+	 * Validate input. No multicast (flag 0x1), no locally administered
+	 * (flag 0x2) and no 'all-zeroes' addresses.
+	 */
+	if (addr[0] & 0x03)
+		return (EINVAL);
+	sum = 0;
+	for (i = 0; i < UUID_NODE_LEN; i++)
+		sum += addr[i];
+	if (sum == 0)
+		return (EINVAL);
+
+	mtx_lock(&uuid_mutex);
+
+	/* Make sure the MAC isn't known already and that there's space. */
+	i = 0;
+	while (i < UUID_NETHER && uuid_ether[i].state == UUID_ETHER_UNIQUE) {
+		if (!bcmp(addr, uuid_ether[i].node, UUID_NODE_LEN)) {
+			mtx_unlock(&uuid_mutex);
+			return (EEXIST);
+		}
+		i++;
+	}
+	if (i == UUID_NETHER) {
+		mtx_unlock(&uuid_mutex);
+		return (ENOSPC);
+	}
+
+	/* Insert MAC at index, moving the non-empty entry if possible. */
+	if (uuid_ether[i].state == UUID_ETHER_RANDOM && i < UUID_NETHER - 1)
+		uuid_ether[i + 1] = uuid_ether[i];
+	uuid_ether[i].state = UUID_ETHER_UNIQUE;
+	bcopy(addr, uuid_ether[i].node, UUID_NODE_LEN);
+	mtx_unlock(&uuid_mutex);
+	return (0);
+}
+
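
[The validation at the top of uuid_ether_add() can be exercised in isolation. A userland re-statement with a hypothetical helper name, not part of this commit:

#include <stdio.h>
#include <stdint.h>

static int
ether_addr_usable(const uint8_t addr[6])
{
	int i, sum;

	if (addr[0] & 0x03)	/* multicast (0x1) or locally admin. (0x2) */
		return (0);
	sum = 0;
	for (i = 0; i < 6; i++)
		sum += addr[i];
	return (sum != 0);	/* reject the all-zeroes address */
}

int
main(void)
{
	const uint8_t mac[6] = { 0x00, 0x25, 0x90, 0x01, 0x02, 0x03 };

	printf("usable: %d\n", ether_addr_usable(mac));
	return (0);
}]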
+int
+uuid_ether_del(const uint8_t *addr)
+{
+	int i;
+
+	mtx_lock(&uuid_mutex);
+	i = 0;
+	while (i < UUID_NETHER && uuid_ether[i].state == UUID_ETHER_UNIQUE &&
+	    bcmp(addr, uuid_ether[i].node, UUID_NODE_LEN))
+		i++;
+	if (i == UUID_NETHER || uuid_ether[i].state != UUID_ETHER_UNIQUE) {
+		mtx_unlock(&uuid_mutex);
+		return (ENOENT);
+	}
+
+	/* Remove it by shifting higher index entries down. */
+	while (i < UUID_NETHER - 1 && uuid_ether[i].state != UUID_ETHER_EMPTY) {
+		uuid_ether[i] = uuid_ether[i + 1];
+		i++;
+	}
+	if (uuid_ether[i].state != UUID_ETHER_EMPTY) {
+		uuid_ether[i].state = UUID_ETHER_EMPTY;
+		bzero(uuid_ether[i].node, UUID_NODE_LEN);
+	}
+	mtx_unlock(&uuid_mutex);
+	return (0);
+}
+
+int
 snprintf_uuid(char *buf, size_t sz, struct uuid *uuid)
 {
 	struct uuid_private *id;
@@ -314,7 +372,7 @@
 
 	p = buf;
 	uuid->time_low = be32dec(p);
-	uuid->time_mid = le16dec(p + 4);
+	uuid->time_mid = be16dec(p + 4);
 	uuid->time_hi_and_version = be16dec(p + 6);
 	uuid->clock_seq_hi_and_reserved = p[8];
 	uuid->clock_seq_low = p[9];
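
[The one-character change from le16dec() to be16dec() above fixes a real decode bug: every other field in this decoder is read big-endian, so time_mid failed a round trip with the matching big-endian encoder. A standalone userland illustration using the sys/endian.h inlines:

#include <stdio.h>
#include <stdint.h>
#include <sys/endian.h>

int
main(void)
{
	uint8_t buf[2];

	be16enc(buf, 0x1234);				/* encoder side */
	printf("be16dec: 0x%04x\n", be16dec(buf));	/* 0x1234, correct */
	printf("le16dec: 0x%04x\n", le16dec(buf));	/* 0x3412, the old bug */
	return (0);
}]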

Modified: trunk/sys/kern/ksched.c
===================================================================
--- trunk/sys/kern/ksched.c	2018-05-25 20:59:46 UTC (rev 9949)
+++ trunk/sys/kern/ksched.c	2018-05-25 21:07:09 UTC (rev 9950)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1996, 1997
  *	HD Associates, Inc.  All rights reserved.
@@ -30,11 +31,10 @@
  * SUCH DAMAGE.
  */
 
-/* ksched: Soft real time scheduling based on "rtprio".
- */
+/* ksched: Soft real time scheduling based on "rtprio". */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/ksched.c 287508 2015-09-06 17:36:09Z kib $");
 
 #include "opt_posix.h"
 
@@ -51,8 +51,7 @@
 
 FEATURE(kposix_priority_scheduling, "POSIX P1003.1B realtime extensions");
 
-/* ksched: Real-time extension to support POSIX priority scheduling.
- */
+/* ksched: Real-time extension to support POSIX priority scheduling. */
 
 struct ksched {
 	struct timespec rr_interval;
@@ -61,21 +60,21 @@
 int
 ksched_attach(struct ksched **p)
 {
-	struct ksched *ksched= p31b_malloc(sizeof(*ksched));
+	struct ksched *ksched;
 
+	ksched = malloc(sizeof(*ksched), M_P31B, M_WAITOK);
 	ksched->rr_interval.tv_sec = 0;
-	ksched->rr_interval.tv_nsec = 1000000000L / sched_rr_interval();
-
+	ksched->rr_interval.tv_nsec = 1000000000L / hz * sched_rr_interval();
 	*p = ksched;
-	return 0;
+	return (0);
 }
 
 int
 ksched_detach(struct ksched *ks)
 {
-	p31b_free(ks);
 
-	return 0;
+	free(ks, M_P31B);
+	return (0);
 }
 
 /*
@@ -108,25 +107,22 @@
 getscheduler(struct ksched *ksched, struct thread *td, int *policy)
 {
 	struct rtprio rtp;
-	int e = 0;
+	int e;
 
+	e = 0;
 	pri_to_rtp(td, &rtp);
-	switch (rtp.type)
-	{
-		case RTP_PRIO_FIFO:
+	switch (rtp.type) {
+	case RTP_PRIO_FIFO:
 		*policy = SCHED_FIFO;
 		break;
-
-		case RTP_PRIO_REALTIME:
+	case RTP_PRIO_REALTIME:
 		*policy = SCHED_RR;
 		break;
-
-		default:
+	default:
 		*policy = SCHED_OTHER;
 		break;
 	}
-
-	return e;
+	return (e);
 }
 
 int
@@ -133,22 +129,17 @@
 ksched_setparam(struct ksched *ksched,
     struct thread *td, const struct sched_param *param)
 {
-	int policy;
-	int e;
+	int e, policy;
 
 	e = getscheduler(ksched, td, &policy);
-
 	if (e == 0)
-	{
-			e = ksched_setscheduler(ksched, td, policy, param);
-	}
-
-	return e;
+		e = ksched_setscheduler(ksched, td, policy, param);
+	return (e);
 }
 
 int
-ksched_getparam(struct ksched *ksched,
-    struct thread *td, struct sched_param *param)
+ksched_getparam(struct ksched *ksched, struct thread *td,
+    struct sched_param *param)
 {
 	struct rtprio rtp;
 
@@ -159,13 +150,14 @@
 		if (PRI_MIN_TIMESHARE < rtp.prio) 
 			/*
 		 	 * The interactive score has it to min realtime
-			 * so we must show max (64 most likely
+			 * so we must show max (64 most likely).
 			 */ 
-			param->sched_priority = (PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE);
+			param->sched_priority = PRI_MAX_TIMESHARE -
+			    PRI_MIN_TIMESHARE;
 		else
 			param->sched_priority = tsprio_to_p4prio(rtp.prio);
 	}
-	return 0;
+	return (0);
 }
 
 /*
@@ -176,117 +168,106 @@
  *
  */
 int
-ksched_setscheduler(struct ksched *ksched,
-    struct thread *td, int policy, const struct sched_param *param)
+ksched_setscheduler(struct ksched *ksched, struct thread *td, int policy,
+    const struct sched_param *param)
 {
-	int e = 0;
 	struct rtprio rtp;
+	int e;
 
-	switch(policy)
-	{
-		case SCHED_RR:
-		case SCHED_FIFO:
-
+	e = 0;
+	switch (policy) {
+	case SCHED_RR:
+	case SCHED_FIFO:
 		if (param->sched_priority >= P1B_PRIO_MIN &&
-		    param->sched_priority <= P1B_PRIO_MAX)
-		{
+		    param->sched_priority <= P1B_PRIO_MAX) {
 			rtp.prio = p4prio_to_rtpprio(param->sched_priority);
-			rtp.type = (policy == SCHED_FIFO)
-				? RTP_PRIO_FIFO : RTP_PRIO_REALTIME;
-
+			rtp.type = (policy == SCHED_FIFO) ? RTP_PRIO_FIFO :
+			    RTP_PRIO_REALTIME;
 			rtp_to_pri(&rtp, td);
+		} else {
+			e = EPERM;
 		}
-		else
-			e = EPERM;
-
-
 		break;
-
-		case SCHED_OTHER:
-		if (param->sched_priority >= 0 &&
-			param->sched_priority <= (PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE)) {
+	case SCHED_OTHER:
+		if (param->sched_priority >= 0 && param->sched_priority <=
+		    (PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE)) {
 			rtp.type = RTP_PRIO_NORMAL;
 			rtp.prio = p4prio_to_tsprio(param->sched_priority);
 			rtp_to_pri(&rtp, td);
-		} else
+		} else {
 			e = EINVAL;
-
+		}
 		break;
-		
-		default:
-			e = EINVAL;
-			break;
+	default:
+		e = EINVAL;
+		break;
 	}
-
-	return e;
+	return (e);
 }
 
 int
 ksched_getscheduler(struct ksched *ksched, struct thread *td, int *policy)
 {
-	return getscheduler(ksched, td, policy);
+
+	return (getscheduler(ksched, td, policy));
 }
 
-/* ksched_yield: Yield the CPU.
- */
+/* ksched_yield: Yield the CPU. */
 int
 ksched_yield(struct ksched *ksched)
 {
+
 	sched_relinquish(curthread);
-	return 0;
+	return (0);
 }
 
 int
 ksched_get_priority_max(struct ksched *ksched, int policy, int *prio)
 {
-	int e = 0;
+	int e;
 
-	switch (policy)
-	{
-		case SCHED_FIFO:
-		case SCHED_RR:
-		*prio = RTP_PRIO_MAX;
+	e = 0;
+	switch (policy) {
+	case SCHED_FIFO:
+	case SCHED_RR:
+		*prio = P1B_PRIO_MAX;
 		break;
-
-		case SCHED_OTHER:
+	case SCHED_OTHER:
 		*prio = PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE;
 		break;
-
-		default:
+	default:
 		e = EINVAL;
+		break;
 	}
-
-	return e;
+	return (e);
 }
 
 int
 ksched_get_priority_min(struct ksched *ksched, int policy, int *prio)
 {
-	int e = 0;
+	int e;
 
-	switch (policy)
-	{
-		case SCHED_FIFO:
-		case SCHED_RR:
+	e = 0;
+	switch (policy) {
+	case SCHED_FIFO:
+	case SCHED_RR:
 		*prio = P1B_PRIO_MIN;
 		break;
-
-		case SCHED_OTHER:
+	case SCHED_OTHER:
 		*prio = 0;
 		break;
-
-		default:
+	default:
 		e = EINVAL;
+		break;
 	}
-
-	return e;
+	return (e);
 }
 
 int
-ksched_rr_get_interval(struct ksched *ksched,
-   struct thread *td, struct timespec *timespec)
+ksched_rr_get_interval(struct ksched *ksched, struct thread *td,
+    struct timespec *timespec)
 {
+
 	*timespec = ksched->rr_interval;
-
-	return 0;
+	return (0);
 }

Modified: trunk/sys/kern/link_elf.c
===================================================================
--- trunk/sys/kern/link_elf.c	2018-05-25 20:59:46 UTC (rev 9949)
+++ trunk/sys/kern/link_elf.c	2018-05-25 21:07:09 UTC (rev 9950)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1998-2000 Doug Rabson
  * All rights reserved.
@@ -25,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/link_elf.c 296729 2016-03-12 17:23:15Z kib $");
 
 #include "opt_ddb.h"
 #include "opt_gdb.h"
@@ -158,7 +159,7 @@
 static void	link_elf_reloc_local(linker_file_t);
 static long	link_elf_symtab_get(linker_file_t, const Elf_Sym **);
 static long	link_elf_strtab_get(linker_file_t, caddr_t *);
-static Elf_Addr	elf_lookup(linker_file_t, Elf_Size, int);
+static int	elf_lookup(linker_file_t, Elf_Size, int, Elf_Addr *);
 
 static kobj_method_t link_elf_methods[] = {
 	KOBJMETHOD(linker_lookup_symbol,	link_elf_lookup_symbol),
@@ -575,7 +576,7 @@
 
 static int
 parse_dpcpu(elf_file_t ef)
-{ 
+{
 	int count;
 	int error;
 
@@ -606,7 +607,7 @@
 #ifdef VIMAGE
 static int
 parse_vnet(elf_file_t ef)
-{ 
+{
 	int count;
 	int error;
 
@@ -702,16 +703,6 @@
 	int error;
 
 	ef = (elf_file_t) lf;
-#if 0	/* this will be more trouble than it's worth for now */
-	for (dp = ef->dynamic; dp->d_tag != DT_NULL; dp++) {
-		if (dp->d_tag != DT_NEEDED)
-			continue;
-		modname = ef->strtab + dp->d_un.d_val;
-		error = linker_load_module(modname, lf);
-		if (error != 0)
-			goto out;
-    }
-#endif
 	error = relocate_file(ef);
 	if (error != 0)
 		return (error);
@@ -750,17 +741,15 @@
 	int symstrindex;
 	int symcnt;
 	int strcnt;
-	int vfslocked;
 
 	shdr = NULL;
 	lf = NULL;
 
-	NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE, filename, td);
+	NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, filename, td);
 	flags = FREAD;
 	error = vn_open(&nd, &flags, 0, NULL);
 	if (error != 0)
 		return (error);
-	vfslocked = NDHASGIANT(&nd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	if (nd.ni_vp->v_type != VREG) {
 		error = ENOEXEC;
@@ -884,7 +873,7 @@
 	 */
 	base_offset = trunc_page(segs[0]->p_offset);
 	base_vaddr = trunc_page(segs[0]->p_vaddr);
-	base_vlimit = round_page(segs[nsegs - 1]->p_vaddr + 
+	base_vlimit = round_page(segs[nsegs - 1]->p_vaddr +
 	    segs[nsegs - 1]->p_memsz);
 	mapsize = base_vlimit - base_vaddr;
 
@@ -903,7 +892,7 @@
 	}
 	ef->address = (caddr_t) vm_map_min(kernel_map);
 	error = vm_map_find(kernel_map, ef->object, 0,
-	    (vm_offset_t *) &ef->address, mapsize, 1,
+	    (vm_offset_t *) &ef->address, mapsize, 0, VMFS_OPTIMAL_SPACE,
 	    VM_PROT_ALL, VM_PROT_ALL, 0);
 	if (error != 0) {
 		vm_object_deallocate(ef->object);
@@ -975,16 +964,6 @@
 	vn_lock(nd.ni_vp, LK_EXCLUSIVE | LK_RETRY);
 	if (error != 0)
 		goto out;
-#if 0	/* this will be more trouble than it's worth for now */
-	for (dp = ef->dynamic; dp->d_tag != DT_NULL; dp++) {
-		if (dp->d_tag != DT_NEEDED)
-			continue;
-		modname = ef->strtab + dp->d_un.d_val;
-		error = linker_load_module(modname, lf);
-		if (error != 0)
-			goto out;
-    }
-#endif
 	error = relocate_file(ef);
 	if (error != 0)
 		goto out;
@@ -1047,13 +1026,10 @@
 out:
 	VOP_UNLOCK(nd.ni_vp, 0);
 	vn_close(nd.ni_vp, FREAD, td->td_ucred, td);
-	VFS_UNLOCK_GIANT(vfslocked);
 	if (error != 0 && lf != NULL)
 		linker_file_unload(lf, LINKER_UNLOAD_FORCE);
-	if (shdr != NULL)
-		free(shdr, M_LINKER);
-	if (firstpage != NULL)
-		free(firstpage, M_LINKER);
+	free(shdr, M_LINKER);
+	free(firstpage, M_LINKER);
 
 	return (error);
 }
@@ -1115,19 +1091,13 @@
 		    + (ef->object->size << PAGE_SHIFT));
 	}
 #else
-	if (ef->address != NULL)
-		free(ef->address, M_LINKER);
+	free(ef->address, M_LINKER);
 #endif
-	if (ef->symbase != NULL)
-		free(ef->symbase, M_LINKER);
-	if (ef->strbase != NULL)
-		free(ef->strbase, M_LINKER);
-	if (ef->ctftab != NULL)
-		free(ef->ctftab, M_LINKER);
-	if (ef->ctfoff != NULL)
-		free(ef->ctfoff, M_LINKER);
-	if (ef->typoff != NULL)
-		free(ef->typoff, M_LINKER);
+	free(ef->symbase, M_LINKER);
+	free(ef->strbase, M_LINKER);
+	free(ef->ctftab, M_LINKER);
+	free(ef->ctfoff, M_LINKER);
+	free(ef->typoff, M_LINKER);
 }
 
 static void
@@ -1439,7 +1409,7 @@
 	elf_file_t ef = (elf_file_t)file;
 	const Elf_Sym *symp;
 	int i, error;
-	
+
 	/* Exhaustive search */
 	for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) {
 		if (symp->st_value != 0 &&
@@ -1521,8 +1491,8 @@
  * This is not only more efficient, it's also more correct. It's not always
  * the case that the symbol can be found through the hash table.
  */
-static Elf_Addr
-elf_lookup(linker_file_t lf, Elf_Size symidx, int deps)
+static int
+elf_lookup(linker_file_t lf, Elf_Size symidx, int deps, Elf_Addr *res)
 {
 	elf_file_t ef = (elf_file_t)lf;
 	const Elf_Sym *sym;
@@ -1530,8 +1500,10 @@
 	Elf_Addr addr, start, base;
 
 	/* Don't even try to lookup the symbol if the index is bogus. */
-	if (symidx >= ef->nchains)
-		return (0);
+	if (symidx >= ef->nchains) {
+		*res = 0;
+		return (EINVAL);
+	}
 
 	sym = ef->symtab + symidx;
 
@@ -1541,9 +1513,12 @@
 	 */
 	if (ELF_ST_BIND(sym->st_info) == STB_LOCAL) {
 		/* Force lookup failure when we have an insanity. */
-		if (sym->st_shndx == SHN_UNDEF || sym->st_value == 0)
-			return (0);
-		return ((Elf_Addr)ef->address + sym->st_value);
+		if (sym->st_shndx == SHN_UNDEF || sym->st_value == 0) {
+			*res = 0;
+			return (EINVAL);
+		}
+		*res = ((Elf_Addr)ef->address + sym->st_value);
+		return (0);
 	}
 
 	/*
@@ -1556,10 +1531,16 @@
 	symbol = ef->strtab + sym->st_name;
 
 	/* Force a lookup failure if the symbol name is bogus. */
-	if (*symbol == 0)
-		return (0);
+	if (*symbol == 0) {
+		*res = 0;
+		return (EINVAL);
+	}
 
 	addr = ((Elf_Addr)linker_file_lookup_symbol(lf, symbol, deps));
+	if (addr == 0 && ELF_ST_BIND(sym->st_info) != STB_WEAK) {
+		*res = 0;
+		return (EINVAL);
+	}
 
 	if (elf_set_find(&set_pcpu_list, addr, &start, &base))
 		addr = addr - start + base;
@@ -1567,7 +1548,8 @@
 	else if (elf_set_find(&set_vnet_list, addr, &start, &base))
 		addr = addr - start + base;
 #endif
-	return addr;
+	*res = addr;
+	return (0);
 }
 
 static void
@@ -1613,7 +1595,7 @@
 
 	return (ef->ddbsymcnt);
 }
-    
+
 static long
 link_elf_strtab_get(linker_file_t lf, caddr_t *strtab)
 {
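
[The signature change threaded through this file (and link_elf_obj.c below) moves the looked-up address into an out-parameter so the return value can carry an error. Previously a return of 0 meant failure, which made an undefined weak symbol, whose address legitimately is 0, indistinguishable from a bogus index or name. A kernel-context sketch of the caller side under the new contract, assuming the elf_lookup_fn typedef from sys/linker.h; this is not the exact elf_reloc() code:

static int
resolve_symbol(linker_file_t lf, Elf_Size symidx, elf_lookup_fn *lookup,
    Elf_Addr *addrp)
{
	int error;

	error = lookup(lf, symidx, 1, addrp);	/* 1 = search dependencies */
	if (error != 0)
		return (error);		/* EINVAL: bogus index or name */
	/* *addrp may legitimately be 0 for an undefined weak symbol. */
	return (0);
}]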

Modified: trunk/sys/kern/link_elf_obj.c
===================================================================
--- trunk/sys/kern/link_elf_obj.c	2018-05-25 20:59:46 UTC (rev 9949)
+++ trunk/sys/kern/link_elf_obj.c	2018-05-25 21:07:09 UTC (rev 9950)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1998-2000 Doug Rabson
  * Copyright (c) 2004 Peter Wemm
@@ -26,7 +27,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/link_elf_obj.c 302234 2016-06-27 21:50:30Z bdrewery $");
 
 #include "opt_ddb.h"
 
@@ -140,11 +141,12 @@
 static int	link_elf_each_function_nameval(linker_file_t,
 				linker_function_nameval_callback_t,
 				void *);
-static void	link_elf_reloc_local(linker_file_t);
+static int	link_elf_reloc_local(linker_file_t);
 static long	link_elf_symtab_get(linker_file_t, const Elf_Sym **);
 static long	link_elf_strtab_get(linker_file_t, caddr_t *);
 
-static Elf_Addr elf_obj_lookup(linker_file_t lf, Elf_Size symidx, int deps);
+static int	elf_obj_lookup(linker_file_t lf, Elf_Size symidx, int deps,
+		    Elf_Addr *);
 
 static kobj_method_t link_elf_methods[] = {
 	KOBJMETHOD(linker_lookup_symbol,	link_elf_lookup_symbol),
@@ -173,6 +175,7 @@
 };
 
 static int	relocate_file(elf_file_t ef);
+static void	elf_obj_cleanup_globals_cache(elf_file_t);
 
 static void
 link_elf_error(const char *filename, const char *s)
@@ -255,6 +258,9 @@
 		switch (shdr[i].sh_type) {
 		case SHT_PROGBITS:
 		case SHT_NOBITS:
+#ifdef __amd64__
+		case SHT_AMD64_UNWIND:
+#endif
 			ef->nprogtab++;
 			break;
 		case SHT_SYMTAB:
@@ -325,9 +331,16 @@
 		switch (shdr[i].sh_type) {
 		case SHT_PROGBITS:
 		case SHT_NOBITS:
+#ifdef __amd64__
+		case SHT_AMD64_UNWIND:
+#endif
 			ef->progtab[pb].addr = (void *)shdr[i].sh_addr;
 			if (shdr[i].sh_type == SHT_PROGBITS)
 				ef->progtab[pb].name = "<<PROGBITS>>";
+#ifdef __amd64__
+			else if (shdr[i].sh_type == SHT_AMD64_UNWIND)
+				ef->progtab[pb].name = "<<UNWIND>>";
+#endif
 			else
 				ef->progtab[pb].name = "<<NOBITS>>";
 			ef->progtab[pb].size = shdr[i].sh_size;
@@ -389,15 +402,26 @@
 			break;
 		}
 	}
-	if (pb != ef->nprogtab)
-		panic("lost progbits");
-	if (rl != ef->nreltab)
-		panic("lost reltab");
-	if (ra != ef->nrelatab)
-		panic("lost relatab");
+	if (pb != ef->nprogtab) {
+		printf("%s: lost progbits\n", filename);
+		error = ENOEXEC;
+		goto out;
+	}
+	if (rl != ef->nreltab) {
+		printf("%s: lost reltab\n", filename);
+		error = ENOEXEC;
+		goto out;
+	}
+	if (ra != ef->nrelatab) {
+		printf("%s: lost relatab\n", filename);
+		error = ENOEXEC;
+		goto out;
+	}
 
 	/* Local intra-module relocations */
-	link_elf_reloc_local(lf);
+	error = link_elf_reloc_local(lf);
+	if (error != 0)
+		goto out;
 
 	*result = lf;
 	return (0);
@@ -450,7 +474,6 @@
 	int nsym;
 	int pb, rl, ra;
 	int alignmask;
-	int vfslocked;
 
 	shdr = NULL;
 	lf = NULL;
@@ -457,12 +480,11 @@
 	mapsize = 0;
 	hdr = NULL;
 
-	NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE, filename, td);
+	NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, filename, td);
 	flags = FREAD;
 	error = vn_open(&nd, &flags, 0, NULL);
 	if (error)
 		return error;
-	vfslocked = NDHASGIANT(&nd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	if (nd.ni_vp->v_type != VREG) {
 		error = ENOEXEC;
@@ -553,6 +575,9 @@
 		switch (shdr[i].sh_type) {
 		case SHT_PROGBITS:
 		case SHT_NOBITS:
+#ifdef __amd64__
+		case SHT_AMD64_UNWIND:
+#endif
 			ef->nprogtab++;
 			break;
 		case SHT_SYMTAB:
@@ -599,8 +624,11 @@
 		ef->relatab = malloc(ef->nrelatab * sizeof(*ef->relatab),
 		    M_LINKER, M_WAITOK | M_ZERO);
 
-	if (symtabindex == -1)
-		panic("lost symbol table index");
+	if (symtabindex == -1) {
+		link_elf_error(filename, "lost symbol table index");
+		error = ENOEXEC;
+		goto out;
+	}
 	/* Allocate space for and load the symbol table */
 	ef->ddbsymcnt = shdr[symtabindex].sh_size / sizeof(Elf_Sym);
 	ef->ddbsymtab = malloc(shdr[symtabindex].sh_size, M_LINKER, M_WAITOK);
@@ -615,8 +643,11 @@
 		goto out;
 	}
 
-	if (symstrindex == -1)
-		panic("lost symbol string index");
+	if (symstrindex == -1) {
+		link_elf_error(filename, "lost symbol string index");
+		error = ENOEXEC;
+		goto out;
+	}
 	/* Allocate space for and load the symbol strings */
 	ef->ddbstrcnt = shdr[symstrindex].sh_size;
 	ef->ddbstrtab = malloc(shdr[symstrindex].sh_size, M_LINKER, M_WAITOK);
@@ -659,6 +690,9 @@
 		switch (shdr[i].sh_type) {
 		case SHT_PROGBITS:
 		case SHT_NOBITS:
+#ifdef __amd64__
+		case SHT_AMD64_UNWIND:
+#endif
 			alignmask = shdr[i].sh_addralign - 1;
 			mapsize += alignmask;
 			mapsize &= ~alignmask;
@@ -685,9 +719,14 @@
 	 * location of code and data in the kernel's address space, request a
 	 * mapping that is above the kernel.  
 	 */
+#ifdef __amd64__
 	mapbase = KERNBASE;
+#else
+	mapbase = VM_MIN_KERNEL_ADDRESS;
+#endif
 	error = vm_map_find(kernel_map, ef->object, 0, &mapbase,
-	    round_page(mapsize), TRUE, VM_PROT_ALL, VM_PROT_ALL, FALSE);
+	    round_page(mapsize), 0, VMFS_OPTIMAL_SPACE, VM_PROT_ALL,
+	    VM_PROT_ALL, 0);
 	if (error) {
 		vm_object_deallocate(ef->object);
 		ef->object = 0;
@@ -721,6 +760,9 @@
 		switch (shdr[i].sh_type) {
 		case SHT_PROGBITS:
 		case SHT_NOBITS:
+#ifdef __amd64__
+		case SHT_AMD64_UNWIND:
+#endif
 			alignmask = shdr[i].sh_addralign - 1;
 			mapbase += alignmask;
 			mapbase &= ~alignmask;
@@ -729,6 +771,10 @@
 				    ef->shstrtab + shdr[i].sh_name;
 			else if (shdr[i].sh_type == SHT_PROGBITS)
 				ef->progtab[pb].name = "<<PROGBITS>>";
+#ifdef __amd64__
+			else if (shdr[i].sh_type == SHT_AMD64_UNWIND)
+				ef->progtab[pb].name = "<<UNWIND>>";
+#endif
 			else
 				ef->progtab[pb].name = "<<NOBITS>>";
 			if (ef->progtab[pb].name != NULL && 
@@ -750,7 +796,11 @@
 			}
 			ef->progtab[pb].size = shdr[i].sh_size;
 			ef->progtab[pb].sec = i;
-			if (shdr[i].sh_type == SHT_PROGBITS) {
+			if (shdr[i].sh_type == SHT_PROGBITS
+#ifdef __amd64__
+			    || shdr[i].sh_type == SHT_AMD64_UNWIND
+#endif
+			    ) {
 				error = vn_rdwr(UIO_READ, nd.ni_vp,
 				    ef->progtab[pb].addr,
 				    shdr[i].sh_size, shdr[i].sh_offset,
@@ -826,19 +876,35 @@
 			break;
 		}
 	}
-	if (pb != ef->nprogtab)
-		panic("lost progbits");
-	if (rl != ef->nreltab)
-		panic("lost reltab");
-	if (ra != ef->nrelatab)
-		panic("lost relatab");
-	if (mapbase != (vm_offset_t)ef->address + mapsize)
-		panic("mapbase 0x%lx != address %p + mapsize 0x%lx (0x%lx)\n",
+	if (pb != ef->nprogtab) {
+		link_elf_error(filename, "lost progbits");
+		error = ENOEXEC;
+		goto out;
+	}
+	if (rl != ef->nreltab) {
+		link_elf_error(filename, "lost reltab");
+		error = ENOEXEC;
+		goto out;
+	}
+	if (ra != ef->nrelatab) {
+		link_elf_error(filename, "lost relatab");
+		error = ENOEXEC;
+		goto out;
+	}
+	if (mapbase != (vm_offset_t)ef->address + mapsize) {
+		printf(
+		    "%s: mapbase 0x%lx != address %p + mapsize 0x%lx (0x%lx)\n",
+		    filename != NULL ? filename : "<none>",
 		    (u_long)mapbase, ef->address, (u_long)mapsize,
 		    (u_long)(vm_offset_t)ef->address + mapsize);
+		error = ENOMEM;
+		goto out;
+	}
 
 	/* Local intra-module relocations */
-	link_elf_reloc_local(lf);
+	error = link_elf_reloc_local(lf);
+	if (error != 0)
+		goto out;
 
 	/* Pull in dependencies */
 	VOP_UNLOCK(nd.ni_vp, 0);
@@ -862,11 +928,9 @@
 out:
 	VOP_UNLOCK(nd.ni_vp, 0);
 	vn_close(nd.ni_vp, FREAD, td->td_ucred, td);
-	VFS_UNLOCK_GIANT(vfslocked);
 	if (error && lf)
 		linker_file_unload(lf, LINKER_UNLOAD_FORCE);
-	if (hdr)
-		free(hdr, M_LINKER);
+	free(hdr, M_LINKER);
 
 	return error;
 }
@@ -897,18 +961,12 @@
 		}
 	}
 	if (ef->preloaded) {
-		if (ef->reltab)
-			free(ef->reltab, M_LINKER);
-		if (ef->relatab)
-			free(ef->relatab, M_LINKER);
-		if (ef->progtab)
-			free(ef->progtab, M_LINKER);
-		if (ef->ctftab)
-			free(ef->ctftab, M_LINKER);
-		if (ef->ctfoff)
-			free(ef->ctfoff, M_LINKER);
-		if (ef->typoff)
-			free(ef->typoff, M_LINKER);
+		free(ef->reltab, M_LINKER);
+		free(ef->relatab, M_LINKER);
+		free(ef->progtab, M_LINKER);
+		free(ef->ctftab, M_LINKER);
+		free(ef->ctfoff, M_LINKER);
+		free(ef->typoff, M_LINKER);
 		if (file->filename != NULL)
 			preload_delete_name(file->filename);
 		/* XXX reclaim module memory? */
@@ -916,17 +974,12 @@
 	}
 
 	for (i = 0; i < ef->nreltab; i++)
-		if (ef->reltab[i].rel)
-			free(ef->reltab[i].rel, M_LINKER);
+		free(ef->reltab[i].rel, M_LINKER);
 	for (i = 0; i < ef->nrelatab; i++)
-		if (ef->relatab[i].rela)
-			free(ef->relatab[i].rela, M_LINKER);
-	if (ef->reltab)
-		free(ef->reltab, M_LINKER);
-	if (ef->relatab)
-		free(ef->relatab, M_LINKER);
-	if (ef->progtab)
-		free(ef->progtab, M_LINKER);
+		free(ef->relatab[i].rela, M_LINKER);
+	free(ef->reltab, M_LINKER);
+	free(ef->relatab, M_LINKER);
+	free(ef->progtab, M_LINKER);
 
 	if (ef->object) {
 		vm_map_remove(kernel_map, (vm_offset_t) ef->address,
@@ -933,20 +986,13 @@
 		    (vm_offset_t) ef->address +
 		    (ef->object->size << PAGE_SHIFT));
 	}
-	if (ef->e_shdr)
-		free(ef->e_shdr, M_LINKER);
-	if (ef->ddbsymtab)
-		free(ef->ddbsymtab, M_LINKER);
-	if (ef->ddbstrtab)
-		free(ef->ddbstrtab, M_LINKER);
-	if (ef->shstrtab)
-		free(ef->shstrtab, M_LINKER);
-	if (ef->ctftab)
-		free(ef->ctftab, M_LINKER);
-	if (ef->ctfoff)
-		free(ef->ctfoff, M_LINKER);
-	if (ef->typoff)
-		free(ef->typoff, M_LINKER);
+	free(ef->e_shdr, M_LINKER);
+	free(ef->ddbsymtab, M_LINKER);
+	free(ef->ddbstrtab, M_LINKER);
+	free(ef->shstrtab, M_LINKER);
+	free(ef->ctftab, M_LINKER);
+	free(ef->ctfoff, M_LINKER);
+	free(ef->typoff, M_LINKER);
 }
 
 static const char *
@@ -993,12 +1039,16 @@
 	/* Perform relocations without addend if there are any: */
 	for (i = 0; i < ef->nreltab; i++) {
 		rel = ef->reltab[i].rel;
-		if (rel == NULL)
-			panic("lost a reltab!");
+		if (rel == NULL) {
+			link_elf_error(ef->lf.filename, "lost a reltab!");
+			return (ENOEXEC);
+		}
 		rellim = rel + ef->reltab[i].nrel;
 		base = findbase(ef, ef->reltab[i].sec);
-		if (base == 0)
-			panic("lost base for reltab");
+		if (base == 0) {
+			link_elf_error(ef->lf.filename, "lost base for reltab");
+			return (ENOEXEC);
+		}
 		for ( ; rel < rellim; rel++) {
 			symidx = ELF_R_SYM(rel->r_info);
 			if (symidx >= ef->ddbsymcnt)
@@ -1012,7 +1062,7 @@
 				symname = symbol_name(ef, rel->r_info);
 				printf("link_elf_obj: symbol %s undefined\n",
 				    symname);
-				return ENOENT;
+				return (ENOENT);
 			}
 		}
 	}
@@ -1020,12 +1070,17 @@
 	/* Perform relocations with addend if there are any: */
 	for (i = 0; i < ef->nrelatab; i++) {
 		rela = ef->relatab[i].rela;
-		if (rela == NULL)
-			panic("lost a relatab!");
+		if (rela == NULL) {
+			link_elf_error(ef->lf.filename, "lost a relatab!");
+			return (ENOEXEC);
+		}
 		relalim = rela + ef->relatab[i].nrela;
 		base = findbase(ef, ef->relatab[i].sec);
-		if (base == 0)
-			panic("lost base for relatab");
+		if (base == 0) {
+			link_elf_error(ef->lf.filename,
+			    "lost base for relatab");
+			return (ENOEXEC);
+		}
 		for ( ; rela < relalim; rela++) {
 			symidx = ELF_R_SYM(rela->r_info);
 			if (symidx >= ef->ddbsymcnt)
@@ -1039,12 +1094,19 @@
 				symname = symbol_name(ef, rela->r_info);
 				printf("link_elf_obj: symbol %s undefined\n",
 				    symname);
-				return ENOENT;
+				return (ENOENT);
 			}
 		}
 	}
 
-	return 0;
+	/*
+	 * Only clean SHN_FBSD_CACHED on a successful return.  If we
+	 * modified the symbol table for the object but found an
+	 * unresolved symbol, there is no reason to roll back.
+	 */
+	elf_obj_cleanup_globals_cache(ef);
+
+	return (0);
 }
 
 static int
@@ -1192,6 +1254,21 @@
 	return (0);
 }
 
+static void
+elf_obj_cleanup_globals_cache(elf_file_t ef)
+{
+	Elf_Sym *sym;
+	Elf_Size i;
+
+	for (i = 0; i < ef->ddbsymcnt; i++) {
+		sym = ef->ddbsymtab + i;
+		if (sym->st_shndx == SHN_FBSD_CACHED) {
+			sym->st_shndx = SHN_UNDEF;
+			sym->st_value = 0;
+		}
+	}
+}
+
 /*
  * Symbol lookup function that can be used when the symbol index is known (i.e.
  * in relocations). It uses the symbol index instead of doing a fully fledged
@@ -1199,46 +1276,71 @@
  * This is not only more efficient, it's also more correct. It's not always
  * the case that the symbol can be found through the hash table.
  */
-static Elf_Addr
-elf_obj_lookup(linker_file_t lf, Elf_Size symidx, int deps)
+static int
+elf_obj_lookup(linker_file_t lf, Elf_Size symidx, int deps, Elf_Addr *res)
 {
 	elf_file_t ef = (elf_file_t)lf;
-	const Elf_Sym *sym;
+	Elf_Sym *sym;
 	const char *symbol;
-	Elf_Addr ret;
+	Elf_Addr res1;
 
 	/* Don't even try to lookup the symbol if the index is bogus. */
-	if (symidx >= ef->ddbsymcnt)
-		return (0);
+	if (symidx >= ef->ddbsymcnt) {
+		*res = 0;
+		return (EINVAL);
+	}
 
 	sym = ef->ddbsymtab + symidx;
 
 	/* Quick answer if there is a definition included. */
-	if (sym->st_shndx != SHN_UNDEF)
-		return (sym->st_value);
+	if (sym->st_shndx != SHN_UNDEF) {
+		*res = sym->st_value;
+		return (0);
+	}
 
 	/* If we get here, then it is undefined and needs a lookup. */
 	switch (ELF_ST_BIND(sym->st_info)) {
 	case STB_LOCAL:
 		/* Local, but undefined? huh? */
-		return (0);
+		*res = 0;
+		return (EINVAL);
 
 	case STB_GLOBAL:
+	case STB_WEAK:
 		/* Relative to Data or Function name */
 		symbol = ef->ddbstrtab + sym->st_name;
 
 		/* Force a lookup failure if the symbol name is bogus. */
-		if (*symbol == 0)
+		if (*symbol == 0) {
+			*res = 0;
+			return (EINVAL);
+		}
+		res1 = (Elf_Addr)linker_file_lookup_symbol(lf, symbol, deps);
+
+		/*
+		 * Cache global lookups during module relocation. The failure
+		 * case is particularly expensive for callers, who must scan
+		 * through the entire globals table doing strcmp(). Cache to
+		 * avoid doing such work repeatedly.
+		 *
+		 * After relocation is complete, undefined globals will be
+		 * restored to SHN_UNDEF in elf_obj_cleanup_globals_cache(),
+		 * above.
+		 */
+		if (res1 != 0) {
+			sym->st_shndx = SHN_FBSD_CACHED;
+			sym->st_value = res1;
+			*res = res1;
 			return (0);
-		ret = ((Elf_Addr)linker_file_lookup_symbol(lf, symbol, deps));
-		return ret;
+		} else if (ELF_ST_BIND(sym->st_info) == STB_WEAK) {
+			sym->st_value = 0;
+			*res = 0;
+			return (0);
+		}
+		return (EINVAL);
 
-	case STB_WEAK:
-		printf("link_elf_obj: Weak symbols not supported\n");
-		return (0);
-
 	default:
-		return (0);
+		return (EINVAL);
 	}
 }
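
Note on the elf_obj_lookup() rework above: the function now reports failure
through its return value and hands the address back via *res, treats STB_WEAK
like STB_GLOBAL (an unresolved weak symbol legitimately resolves to 0 instead
of being rejected outright), and caches successful global lookups by writing
the address into st_value and tagging st_shndx with SHN_FBSD_CACHED, which
elf_obj_cleanup_globals_cache() resets once relocation finishes.  A minimal
userland sketch of that caching pattern; SHN_CACHED_MODEL and
slow_global_lookup() are invented stand-ins for the kernel's SHN_FBSD_CACHED
and linker_file_lookup_symbol():

#include <errno.h>
#include <stdint.h>

#define SHN_UNDEF_MODEL	 0
#define SHN_CACHED_MODEL 0xff00	/* hypothetical stand-in for SHN_FBSD_CACHED */

struct sym {
	const char *name;
	unsigned    shndx;	/* SHN_UNDEF_MODEL while unresolved */
	uintptr_t   value;	/* resolved address once known */
	int         weak;	/* nonzero for STB_WEAK symbols */
};

/* Stand-in for linker_file_lookup_symbol(); returns 0 when not found. */
extern uintptr_t slow_global_lookup(const char *name);

static int
lookup_cached(struct sym *s, uintptr_t *res)
{
	uintptr_t addr;

	if (s->shndx != SHN_UNDEF_MODEL) {	/* defined locally or cached */
		*res = s->value;
		return (0);
	}
	addr = slow_global_lookup(s->name);
	if (addr != 0) {
		s->shndx = SHN_CACHED_MODEL;	/* cache the expensive hit */
		s->value = addr;
		*res = addr;
		return (0);
	}
	if (s->weak) {			/* unresolved weak -> 0 is legal */
		s->value = 0;
		*res = 0;
		return (0);
	}
	*res = 0;
	return (EINVAL);
}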
 
@@ -1287,7 +1389,7 @@
 	}
 }
 
-static void
+static int
 link_elf_reloc_local(linker_file_t lf)
 {
 	elf_file_t ef = (elf_file_t)lf;
@@ -1305,12 +1407,16 @@
 	/* Perform relocations without addend if there are any: */
 	for (i = 0; i < ef->nreltab; i++) {
 		rel = ef->reltab[i].rel;
-		if (rel == NULL)
-			panic("lost a reltab!");
+		if (rel == NULL) {
+			link_elf_error(ef->lf.filename, "lost a reltab");
+			return (ENOEXEC);
+		}
 		rellim = rel + ef->reltab[i].nrel;
 		base = findbase(ef, ef->reltab[i].sec);
-		if (base == 0)
-			panic("lost base for reltab");
+		if (base == 0) {
+			link_elf_error(ef->lf.filename, "lost base for reltab");
+			return (ENOEXEC);
+		}
 		for ( ; rel < rellim; rel++) {
 			symidx = ELF_R_SYM(rel->r_info);
 			if (symidx >= ef->ddbsymcnt)
@@ -1327,12 +1433,16 @@
 	/* Perform relocations with addend if there are any: */
 	for (i = 0; i < ef->nrelatab; i++) {
 		rela = ef->relatab[i].rela;
-		if (rela == NULL)
-			panic("lost a relatab!");
+		if (rela == NULL) {
+			link_elf_error(ef->lf.filename, "lost a relatab!");
+			return (ENOEXEC);
+		}
 		relalim = rela + ef->relatab[i].nrela;
 		base = findbase(ef, ef->relatab[i].sec);
-		if (base == 0)
-			panic("lost base for relatab");
+		if (base == 0) {
+			link_elf_error(ef->lf.filename, "lost base for relatab");
+			return (ENOEXEC);
+		}
 		for ( ; rela < relalim; rela++) {
 			symidx = ELF_R_SYM(rela->r_info);
 			if (symidx >= ef->ddbsymcnt)
@@ -1345,6 +1455,7 @@
 			    elf_obj_lookup);
 		}
 	}
+	return (0);
 }
 
 static long
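
Taken together, the link_elf_obj.c hunks stop panicking on malformed module
files and instead report the problem through link_elf_error() and return
ENOEXEC (or ENOMEM for the mapping-size mismatch), make link_elf_reloc_local()
propagate an error, drop the obsolete VFS_UNLOCK_GIANT() call, and collapse
every "if (p) free(p)" into a bare free(), which is safe because free(NULL)
is a no-op for kernel malloc(9) just as in userland.  A sketch of the
resulting error-path shape, in plain userland C:

#include <errno.h>
#include <stdlib.h>

/* Error paths unwind through one label; free(NULL) needs no guard. */
static int
load_object(void)
{
	char *hdr = NULL;
	int error = 0;

	hdr = malloc(4096);
	if (hdr == NULL) {
		error = ENOMEM;
		goto out;
	}
	/* ... parse sections; on inconsistency: error = ENOEXEC; goto out; */
out:
	free(hdr);
	return (error);
}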

Modified: trunk/sys/kern/md4c.c
===================================================================
--- trunk/sys/kern/md4c.c	2018-05-25 20:59:46 UTC (rev 9949)
+++ trunk/sys/kern/md4c.c	2018-05-25 21:07:09 UTC (rev 9950)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /* MD4C.C - RSA Data Security, Inc., MD4 message-digest algorithm
  */
 
@@ -24,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/md4c.c 139804 2005-01-06 23:35:40Z imp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>

Modified: trunk/sys/kern/md5c.c
===================================================================
--- trunk/sys/kern/md5c.c	2018-05-25 20:59:46 UTC (rev 9949)
+++ trunk/sys/kern/md5c.c	2018-05-25 21:07:09 UTC (rev 9950)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * MD5C.C - RSA Data Security, Inc., MD5 message-digest algorithm
  *
@@ -30,7 +31,7 @@
  * This file should be kept in sync with src/lib/libmd/md5c.c
  */
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/md5c.c 157304 2006-03-30 18:45:50Z pjd $");
 
 #include <sys/types.h>
 

Modified: trunk/sys/kern/p1003_1b.c
===================================================================
--- trunk/sys/kern/p1003_1b.c	2018-05-25 20:59:46 UTC (rev 9949)
+++ trunk/sys/kern/p1003_1b.c	2018-05-25 21:07:09 UTC (rev 9950)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1996, 1997, 1998
  *	HD Associates, Inc.  All rights reserved.
@@ -34,7 +35,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/p1003_1b.c 293485 2016-01-09 14:44:41Z dchagin $");
 
 #include "opt_posix.h"
 
@@ -130,16 +131,29 @@
 		targettd = FIRST_THREAD_IN_PROC(targetp);
 	}
 
-	e = p_cansched(td, targetp);
-	if (e == 0) {
-		e = ksched_setparam(ksched, targettd,
-			(const struct sched_param *)&sched_param);
-	}
+	e = kern_sched_setparam(td, targettd, &sched_param);
 	PROC_UNLOCK(targetp);
 	return (e);
 }
 
 int
+kern_sched_setparam(struct thread *td, struct thread *targettd,
+    struct sched_param *param)
+{
+	struct proc *targetp;
+	int error;
+
+	targetp = targettd->td_proc;
+	PROC_LOCK_ASSERT(targetp, MA_OWNED);
+
+	error = p_cansched(td, targetp);
+	if (error == 0)
+		error = ksched_setparam(ksched, targettd,
+		    (const struct sched_param *)param);
+	return (error);
+}
+
+int
 sys_sched_getparam(struct thread *td, struct sched_getparam_args *uap)
 {
 	int e;
@@ -159,10 +173,7 @@
 		targettd = FIRST_THREAD_IN_PROC(targetp);
 	}
 
-	e = p_cansee(td, targetp);
-	if (e == 0) {
-		e = ksched_getparam(ksched, targettd, &sched_param);
-	}
+	e = kern_sched_getparam(td, targettd, &sched_param);
 	PROC_UNLOCK(targetp);
 	if (e == 0)
 		e = copyout(&sched_param, uap->param, sizeof(sched_param));
@@ -170,6 +181,22 @@
 }
 
 int
+kern_sched_getparam(struct thread *td, struct thread *targettd,
+    struct sched_param *param)
+{
+	struct proc *targetp;
+	int error;
+
+	targetp = targettd->td_proc;
+	PROC_LOCK_ASSERT(targetp, MA_OWNED);
+
+	error = p_cansee(td, targetp);
+	if (error == 0)
+		error = ksched_getparam(ksched, targettd, param);
+	return (error);
+}
+
+int
 sys_sched_setscheduler(struct thread *td, struct sched_setscheduler_args *uap)
 {
 	int e;
@@ -177,11 +204,6 @@
 	struct thread *targettd;
 	struct proc *targetp;
 
-	/* Don't allow non root user to set a scheduler policy. */
-	e = priv_check(td, PRIV_SCHED_SET);
-	if (e)
-		return (e);
-
 	e = copyin(uap->param, &sched_param, sizeof(sched_param));
 	if (e)
 		return (e);
@@ -197,16 +219,35 @@
 		targettd = FIRST_THREAD_IN_PROC(targetp);
 	}
 
-	e = p_cansched(td, targetp);
-	if (e == 0) {
-		e = ksched_setscheduler(ksched, targettd,
-			uap->policy, (const struct sched_param *)&sched_param);
-	}
+	e = kern_sched_setscheduler(td, targettd, uap->policy,
+	    &sched_param);
 	PROC_UNLOCK(targetp);
 	return (e);
 }
 
 int
+kern_sched_setscheduler(struct thread *td, struct thread *targettd,
+    int policy, struct sched_param *param)
+{
+	struct proc *targetp;
+	int error;
+
+	targetp = targettd->td_proc;
+	PROC_LOCK_ASSERT(targetp, MA_OWNED);
+
+	/* Don't allow a non-root user to set a scheduler policy. */
+	error = priv_check(td, PRIV_SCHED_SET);
+	if (error)
+		return (error);
+
+	error = p_cansched(td, targetp);
+	if (error == 0)
+		error = ksched_setscheduler(ksched, targettd, policy,
+		    (const struct sched_param *)param);
+	return (error);
+}
+
+int
 sys_sched_getscheduler(struct thread *td, struct sched_getscheduler_args *uap)
 {
 	int e, policy;
@@ -224,17 +265,31 @@
 		targettd = FIRST_THREAD_IN_PROC(targetp);
 	}
 
-	e = p_cansee(td, targetp);
-	if (e == 0) {
-		e = ksched_getscheduler(ksched, targettd, &policy);
+	e = kern_sched_getscheduler(td, targettd, &policy);
+	PROC_UNLOCK(targetp);
+	if (e == 0)
 		td->td_retval[0] = policy;
-	}
-	PROC_UNLOCK(targetp);
 
 	return (e);
 }
 
 int
+kern_sched_getscheduler(struct thread *td, struct thread *targettd,
+    int *policy)
+{
+	struct proc *targetp;
+	int error;
+
+	targetp = targettd->td_proc;
+	PROC_LOCK_ASSERT(targetp, MA_OWNED);
+
+	error = p_cansee(td, targetp);
+	if (error == 0)
+		error = ksched_getscheduler(ksched, targettd, policy);
+	return (error);
+}
+
+int
 sys_sched_yield(struct thread *td, struct sched_yield_args *uap)
 {
 
@@ -296,13 +351,26 @@
 		targettd = FIRST_THREAD_IN_PROC(targetp);
 	}
 
-	e = p_cansee(td, targetp);
-	if (e == 0)
-		e = ksched_rr_get_interval(ksched, targettd, ts);
+	e = kern_sched_rr_get_interval_td(td, targettd, ts);
 	PROC_UNLOCK(targetp);
 	return (e);
 }
 
+int
+kern_sched_rr_get_interval_td(struct thread *td, struct thread *targettd,
+    struct timespec *ts)
+{
+	struct proc *p;
+	int error;
+
+	p = targettd->td_proc;
+	PROC_LOCK_ASSERT(p, MA_OWNED);
+
+	error = p_cansee(td, p);
+	if (error == 0)
+		error = ksched_rr_get_interval(ksched, targettd, ts);
+	return (error);
+}
 #endif
 
 static void
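
The p1003_1b.c hunks split each POSIX scheduling syscall into a thin
copyin/copyout wrapper plus a kern_sched_*() helper that takes kernel-space
arguments and asserts the target process lock, so other kernel consumers
(emulation layers, for instance) can reuse the permission checks without a
user-space round trip.  Note that priv_check(td, PRIV_SCHED_SET) moves from
sys_sched_setscheduler() into kern_sched_setscheduler(), keeping the policy
check next to the operation it guards.  A hedged sketch of the split;
can_sched(), set_param() and copy_param_in() are invented stand-ins for
p_cansched(), ksched_setparam() and copyin():

#include <errno.h>

struct thread;				/* opaque in this sketch */
struct sched_param { int sched_priority; };

extern int can_sched(struct thread *td, struct thread *target);
extern int set_param(struct thread *target, const struct sched_param *p);
extern int copy_param_in(const void *uaddr, struct sched_param *p);

/* kern_*-style helper: kernel-space argument, caller holds the proc lock. */
static int
kern_setparam(struct thread *td, struct thread *target,
    const struct sched_param *p)
{
	int error;

	error = can_sched(td, target);
	if (error == 0)
		error = set_param(target, p);
	return (error);
}

/* Syscall-style wrapper: copies from user space, then calls the helper. */
static int
sys_setparam(struct thread *td, struct thread *target, const void *uparam)
{
	struct sched_param p;
	int error;

	error = copy_param_in(uparam, &p);
	if (error != 0)
		return (error);
	return (kern_setparam(td, target, &p));
}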

Modified: trunk/sys/kern/posix4_mib.c
===================================================================
--- trunk/sys/kern/posix4_mib.c	2018-05-25 20:59:46 UTC (rev 9949)
+++ trunk/sys/kern/posix4_mib.c	2018-05-25 21:07:09 UTC (rev 9950)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1998
  *	HD Associates, Inc.  All rights reserved.
@@ -31,7 +32,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/posix4_mib.c 299613 2016-05-13 07:56:14Z ngie $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -114,9 +115,9 @@
 	num = arg2;
 	if (!P31B_VALID(num))
 		return (EINVAL);
-	val = facility_initialized[num] ? facility[num - 1] : 0;
+	val = facility_initialized[num - 1] ? facility[num - 1] : 0;
 	error = sysctl_handle_int(oidp, &val, 0, req);
-	if (error == 0 && req->newptr != NULL && facility_initialized[num])
+	if (error == 0 && req->newptr != NULL && facility_initialized[num - 1])
 		facility[num - 1] = val;
 	return (error);
 }
@@ -138,7 +139,7 @@
 {
 
 	facility[num - 1] = 0;
-	facility_initialized[num -1] = 0;
+	facility_initialized[num - 1] = 0;
 }
 
 int
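
The posix4_mib.c change is an off-by-one fix: facility numbers start at 1 and
facility[] is indexed with num - 1, but facility_initialized[] was being
indexed with num, so the sysctl handler consulted the wrong slot and could
read one element past the intended entry.  A tiny model of the corrected
convention; the array size here is illustrative only:

#include <stdbool.h>

#define P31B_FACILITIES	32	/* illustrative size, not the kernel's */

static int  facility[P31B_FACILITIES];
static bool facility_initialized[P31B_FACILITIES];

/*
 * Facilities are numbered 1..P31B_FACILITIES, so *both* arrays must be
 * indexed with num - 1.  Mixing num and num - 1, as the old code did,
 * consults the wrong element.
 */
static int
facility_get(int num)
{

	if (num < 1 || num > P31B_FACILITIES)
		return (0);
	return (facility_initialized[num - 1] ? facility[num - 1] : 0);
}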

Modified: trunk/sys/kern/sched_4bsd.c
===================================================================
--- trunk/sys/kern/sched_4bsd.c	2018-05-25 20:59:46 UTC (rev 9949)
+++ trunk/sys/kern/sched_4bsd.c	2018-05-25 21:07:09 UTC (rev 9950)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1982, 1986, 1990, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
@@ -33,7 +34,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/sched_4bsd.c 316841 2017-04-14 14:44:06Z avg $");
 
 #include "opt_hwpmc_hooks.h"
 #include "opt_sched.h"
@@ -143,7 +144,7 @@
         schedcpu_thread,
         NULL
 };
-SYSINIT(schedcpu, SI_SUB_RUN_SCHEDULER, SI_ORDER_FIRST, kproc_start,
+SYSINIT(schedcpu, SI_SUB_LAST, SI_ORDER_FIRST, kproc_start,
     &sched_kp);
 SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL);
 
@@ -255,20 +256,20 @@
 
 SDT_PROVIDER_DEFINE(sched);
 
-SDT_PROBE_DEFINE3(sched, , , change_pri, change-pri, "struct thread *", 
+SDT_PROBE_DEFINE3(sched, , , change__pri, "struct thread *", 
     "struct proc *", "uint8_t");
-SDT_PROBE_DEFINE3(sched, , , dequeue, dequeue, "struct thread *", 
+SDT_PROBE_DEFINE3(sched, , , dequeue, "struct thread *", 
     "struct proc *", "void *");
-SDT_PROBE_DEFINE4(sched, , , enqueue, enqueue, "struct thread *", 
+SDT_PROBE_DEFINE4(sched, , , enqueue, "struct thread *", 
     "struct proc *", "void *", "int");
-SDT_PROBE_DEFINE4(sched, , , lend_pri, lend-pri, "struct thread *", 
+SDT_PROBE_DEFINE4(sched, , , lend__pri, "struct thread *", 
     "struct proc *", "uint8_t", "struct thread *");
-SDT_PROBE_DEFINE2(sched, , , load_change, load-change, "int", "int");
-SDT_PROBE_DEFINE2(sched, , , off_cpu, off-cpu, "struct thread *",
+SDT_PROBE_DEFINE2(sched, , , load__change, "int", "int");
+SDT_PROBE_DEFINE2(sched, , , off__cpu, "struct thread *",
     "struct proc *");
-SDT_PROBE_DEFINE(sched, , , on_cpu, on-cpu);
-SDT_PROBE_DEFINE(sched, , , remain_cpu, remain-cpu);
-SDT_PROBE_DEFINE2(sched, , , surrender, surrender, "struct thread *",
+SDT_PROBE_DEFINE(sched, , , on__cpu);
+SDT_PROBE_DEFINE(sched, , , remain__cpu);
+SDT_PROBE_DEFINE2(sched, , , surrender, "struct thread *",
     "struct proc *");
 
 static __inline void
@@ -277,7 +278,7 @@
 
 	sched_tdcnt++;
 	KTR_COUNTER0(KTR_SCHED, "load", "global load", sched_tdcnt);
-	SDT_PROBE2(sched, , , load_change, NOCPU, sched_tdcnt);
+	SDT_PROBE2(sched, , , load__change, NOCPU, sched_tdcnt);
 }
 
 static __inline void
@@ -286,7 +287,7 @@
 
 	sched_tdcnt--;
 	KTR_COUNTER0(KTR_SCHED, "load", "global load", sched_tdcnt);
-	SDT_PROBE2(sched, , , load_change, NOCPU, sched_tdcnt);
+	SDT_PROBE2(sched, , , load__change, NOCPU, sched_tdcnt);
 }
 /*
  * Arrange to reschedule if necessary, taking the priorities and
@@ -304,9 +305,8 @@
 /*
  * This function is called when a thread is about to be put on run queue
  * because it has been made runnable or its priority has been adjusted.  It
- * determines if the new thread should be immediately preempted to.  If so,
- * it switches to it and eventually returns true.  If not, it returns false
- * so that the caller may place the thread on an appropriate run queue.
+ * determines if the new thread should preempt the current thread.  If so,
+ * it sets td_owepreempt to request a preemption.
  */
 int
 maybe_preempt(struct thread *td)
@@ -352,29 +352,8 @@
 		return (0);
 #endif
 
-	if (ctd->td_critnest > 1) {
-		CTR1(KTR_PROC, "maybe_preempt: in critical section %d",
-		    ctd->td_critnest);
-		ctd->td_owepreempt = 1;
-		return (0);
-	}
-	/*
-	 * Thread is runnable but not yet put on system run queue.
-	 */
-	MPASS(ctd->td_lock == td->td_lock);
-	MPASS(TD_ON_RUNQ(td));
-	TD_SET_RUNNING(td);
-	CTR3(KTR_PROC, "preempting to thread %p (pid %d, %s)\n", td,
-	    td->td_proc->p_pid, td->td_name);
-	mi_switch(SW_INVOL | SW_PREEMPT | SWT_PREEMPT, td);
-	/*
-	 * td's lock pointer may have changed.  We have to return with it
-	 * locked.
-	 */
-	spinlock_enter();
-	thread_unlock(ctd);
-	thread_lock(td);
-	spinlock_exit();
+	CTR0(KTR_PROC, "maybe_preempt: scheduling preemption");
+	ctd->td_owepreempt = 1;
 	return (1);
 #else
 	return (0);
@@ -793,6 +772,8 @@
 {
 	struct td_sched *ts;
 
+	childtd->td_oncpu = NOCPU;
+	childtd->td_lastcpu = NOCPU;
 	childtd->td_estcpu = td->td_estcpu;
 	childtd->td_lock = &sched_lock;
 	childtd->td_cpuset = cpuset_ref(td->td_cpuset);
@@ -836,12 +817,12 @@
 	KTR_POINT3(KTR_SCHED, "thread", sched_tdname(td), "priority change",
 	    "prio:%d", td->td_priority, "new prio:%d", prio, KTR_ATTR_LINKED,
 	    sched_tdname(curthread));
-	SDT_PROBE3(sched, , , change_pri, td, td->td_proc, prio);
+	SDT_PROBE3(sched, , , change__pri, td, td->td_proc, prio);
 	if (td != curthread && prio > td->td_priority) {
 		KTR_POINT3(KTR_SCHED, "thread", sched_tdname(curthread),
 		    "lend prio", "prio:%d", td->td_priority, "new prio:%d",
 		    prio, KTR_ATTR_LINKED, sched_tdname(td));
-		SDT_PROBE4(sched, , , lend_pri, td, td->td_proc, prio, 
+		SDT_PROBE4(sched, , , lend__pri, td, td->td_proc, prio, 
 		    curthread);
 	}
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
@@ -983,7 +964,8 @@
 		sched_load_rem();
 
 	td->td_lastcpu = td->td_oncpu;
-	preempted = !(td->td_flags & TDF_SLICEEND);
+	preempted = (td->td_flags & TDF_SLICEEND) == 0 &&
+	    (flags & SW_PREEMPT) != 0;
 	td->td_flags &= ~(TDF_NEEDRESCHED | TDF_SLICEEND);
 	td->td_owepreempt = 0;
 	td->td_oncpu = NOCPU;
@@ -1027,6 +1009,16 @@
 		MPASS(newtd->td_lock == &sched_lock);
 	}
 
+#if (KTR_COMPILE & KTR_SCHED) != 0
+	if (TD_IS_IDLETHREAD(td))
+		KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "idle",
+		    "prio:%d", td->td_priority);
+	else
+		KTR_STATE3(KTR_SCHED, "thread", sched_tdname(td), KTDSTATE(td),
+		    "prio:%d", td->td_priority, "wmesg:\"%s\"", td->td_wmesg,
+		    "lockname:\"%s\"", td->td_lockname);
+#endif
+
 	if (td != newtd) {
 #ifdef	HWPMC_HOOKS
 		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
@@ -1033,7 +1025,7 @@
 			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
 #endif
 
-		SDT_PROBE2(sched, , , off_cpu, td, td->td_proc);
+		SDT_PROBE2(sched, , , off__cpu, newtd, newtd->td_proc);
 
                 /* I feel sleepy */
 		lock_profile_release_lock(&sched_lock.lock_object);
@@ -1067,14 +1059,17 @@
 		 * need to reap it.
 		 */
 
-		SDT_PROBE0(sched, , , on_cpu);
+		SDT_PROBE0(sched, , , on__cpu);
 #ifdef	HWPMC_HOOKS
 		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
 			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
 #endif
 	} else
-		SDT_PROBE0(sched, , , remain_cpu);
+		SDT_PROBE0(sched, , , remain__cpu);
 
+	KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "running",
+	    "prio:%d", td->td_priority);
+
 #ifdef SMP
 	if (td->td_flags & TDF_IDLETD)
 		CPU_SET(PCPU_GET(cpuid), &idle_cpus_mask);
@@ -1232,7 +1227,7 @@
 
 	mtx_assert(&sched_lock, MA_OWNED);
 
-	if (THREAD_CAN_SCHED(td, td->td_lastcpu))
+	if (td->td_lastcpu != NOCPU && THREAD_CAN_SCHED(td, td->td_lastcpu))
 		best = td->td_lastcpu;
 	else
 		best = NOCPU;
@@ -1323,6 +1318,12 @@
 		ts->ts_runq = &runq;
 	}
 
+	if ((td->td_flags & TDF_NOLOAD) == 0)
+		sched_load_add();
+	runq_add(ts->ts_runq, td, flags);
+	if (cpu != NOCPU)
+		runq_length[cpu]++;
+
 	cpuid = PCPU_GET(cpuid);
 	if (single_cpu && cpu != cpuid) {
 	        kick_other_cpu(td->td_priority, cpu);
@@ -1339,18 +1340,10 @@
 		}
 
 		if (!forwarded) {
-			if ((flags & SRQ_YIELDING) == 0 && maybe_preempt(td))
-				return;
-			else
+			if (!maybe_preempt(td))
 				maybe_resched(td);
 		}
 	}
-
-	if ((td->td_flags & TDF_NOLOAD) == 0)
-		sched_load_add();
-	runq_add(ts->ts_runq, td, flags);
-	if (cpu != NOCPU)
-		runq_length[cpu]++;
 }
 #else /* SMP */
 {
@@ -1384,23 +1377,11 @@
 	CTR2(KTR_RUNQ, "sched_add: adding td_sched:%p (td:%p) to runq", ts, td);
 	ts->ts_runq = &runq;
 
-	/*
-	 * If we are yielding (on the way out anyhow) or the thread
-	 * being saved is US, then don't try be smart about preemption
-	 * or kicking off another CPU as it won't help and may hinder.
-	 * In the YIEDLING case, we are about to run whoever is being
-	 * put in the queue anyhow, and in the OURSELF case, we are
-	 * puting ourself on the run queue which also only happens
-	 * when we are about to yield.
-	 */
-	if ((flags & SRQ_YIELDING) == 0) {
-		if (maybe_preempt(td))
-			return;
-	}
 	if ((td->td_flags & TDF_NOLOAD) == 0)
 		sched_load_add();
 	runq_add(ts->ts_runq, td, flags);
-	maybe_resched(td);
+	if (!maybe_preempt(td))
+		maybe_resched(td);
 }
 #endif /* SMP */
 
@@ -1585,7 +1566,7 @@
 	return (ts->ts_pctcpu);
 }
 
-#ifdef	RACCT
+#ifdef RACCT
 /*
  * Calculates the contribution to the thread cpu usage for the latest
  * (unfinished) second.
@@ -1632,6 +1613,7 @@
 {
 	struct pcpuidlestat *stat;
 
+	THREAD_NO_SLEEPING();
 	stat = DPCPU_PTR(idlestat);
 	for (;;) {
 		mtx_assert(&Giant, MA_NOTOWNED);
@@ -1670,6 +1652,8 @@
 	} else {
 		lock_profile_release_lock(&sched_lock.lock_object);
 		MPASS(td->td_lock == &sched_lock);
+		td->td_lastcpu = td->td_oncpu;
+		td->td_oncpu = NOCPU;
 	}
 	mtx_assert(&sched_lock, MA_OWNED);
 	KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count"));
@@ -1689,6 +1673,10 @@
 	lock_profile_obtain_lock_success(&sched_lock.lock_object,
 	    0, 0, __FILE__, __LINE__);
 	THREAD_LOCK_ASSERT(td, MA_OWNED | MA_NOTRECURSED);
+
+	KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "running",
+	    "prio:%d", td->td_priority);
+	SDT_PROBE0(sched, , , on__cpu);
 }
 
 char *
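
Three things happen in sched_4bsd.c.  The SDT probe definitions lose their
separate DTrace-name argument and spell names with double underscores, which
the framework renders as dashes (change__pri becomes change-pri).
maybe_preempt() no longer performs the context switch itself; it just sets
td_owepreempt, deferring the switch to a safe point such as critical_exit().
And sched_add() now puts the thread on the run queue before the preemption
check, so the queues are consistent whenever the deferred switch fires.  A
minimal model of deferred preemption, with do_switch() standing in for
mi_switch():

#include <stdbool.h>

struct cpu {
	int  critnest;		/* critical section nesting level */
	bool owepreempt;	/* a preemption is owed to a waiter */
};

extern void do_switch(struct cpu *c);	/* stand-in for mi_switch() */

/* maybe_preempt()-style request: never switch here, only mark the debt. */
static void
request_preempt(struct cpu *c)
{

	c->owepreempt = true;
}

/* critical_exit()-style release: pay the debt at a known-safe point. */
static void
exit_critical(struct cpu *c)
{

	if (--c->critnest == 0 && c->owepreempt) {
		c->owepreempt = false;
		do_switch(c);
	}
}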

Modified: trunk/sys/kern/sched_ule.c
===================================================================
--- trunk/sys/kern/sched_ule.c	2018-05-25 20:59:46 UTC (rev 9949)
+++ trunk/sys/kern/sched_ule.c	2018-05-25 21:07:09 UTC (rev 9950)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 2002-2007, Jeffrey Roberson <jeff at freebsd.org>
  * All rights reserved.
@@ -36,7 +37,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/sched_ule.c 316841 2017-04-14 14:44:06Z avg $");
 
 #include "opt_hwpmc_hooks.h"
 #include "opt_kdtrace.h"
@@ -77,10 +78,6 @@
 #include <machine/cpu.h>
 #include <machine/smp.h>
 
-#if defined(__powerpc__) && defined(E500)
-#error "This architecture is not currently compatible with ULE"
-#endif
-
 #define	KTR_ULE	0
 
 #define	TS_NAME_LEN (MAXCOMLEN + sizeof(" td ") + sizeof(__XSTRING(UINT_MAX)))
@@ -189,6 +186,12 @@
 #define	SCHED_INTERACT_HALF	(SCHED_INTERACT_MAX / 2)
 #define	SCHED_INTERACT_THRESH	(30)
 
+/*
+ * These parameters determine the slice behavior for batch work.
+ */
+#define	SCHED_SLICE_DEFAULT_DIVISOR	10	/* ~94 ms, 12 stathz ticks. */
+#define	SCHED_SLICE_MIN_DIVISOR		6	/* DEFAULT/MIN = ~16 ms. */
+
 /* Flags kept in td_flags. */
 #define	TDF_SLICEEND	TDF_SCHED2	/* Thread time slice is over. */
 
@@ -201,9 +204,10 @@
  * preempt_thresh:	Priority threshold for preemption and remote IPIs.
  */
 static int sched_interact = SCHED_INTERACT_THRESH;
-static int realstathz = 127;
 static int tickincr = 8 << SCHED_TICK_SHIFT;
-static int sched_slice = 12;
+static int realstathz = 127;	/* reset during boot. */
+static int sched_slice = 10;	/* reset during boot. */
+static int sched_slice_min = 1;	/* reset during boot. */
 #ifdef PREEMPTION
 #ifdef FULL_PREEMPTION
 static int preempt_thresh = PRI_MAX_IDLE;
@@ -223,8 +227,12 @@
  * locking in sched_pickcpu();
  */
 struct tdq {
-	/* Ordered to improve efficiency of cpu_search() and switch(). */
-	struct mtx	tdq_lock;		/* run queue lock. */
+	/* 
+	 * Ordered to improve efficiency of cpu_search() and switch().
+	 * tdq_lock is padded to avoid false sharing with tdq_load and
+	 * tdq_cpu_idle.
+	 */
+	struct mtx_padalign tdq_lock;		/* run queue lock. */
 	struct cpu_group *tdq_cg;		/* Pointer to cpu topology. */
 	volatile int	tdq_load;		/* Aggregate load. */
 	volatile int	tdq_cpu_idle;		/* cpu_idle() is active. */
@@ -287,7 +295,7 @@
 #define	TDQ_LOCK(t)		mtx_lock_spin(TDQ_LOCKPTR((t)))
 #define	TDQ_LOCK_FLAGS(t, f)	mtx_lock_spin_flags(TDQ_LOCKPTR((t)), (f))
 #define	TDQ_UNLOCK(t)		mtx_unlock_spin(TDQ_LOCKPTR((t)))
-#define	TDQ_LOCKPTR(t)		(&(t)->tdq_lock)
+#define	TDQ_LOCKPTR(t)		((struct mtx *)(&(t)->tdq_lock))
 
 static void sched_priority(struct thread *);
 static void sched_thread_priority(struct thread *, u_char);
@@ -333,20 +341,20 @@
 
 SDT_PROVIDER_DEFINE(sched);
 
-SDT_PROBE_DEFINE3(sched, , , change_pri, change-pri, "struct thread *", 
+SDT_PROBE_DEFINE3(sched, , , change__pri, "struct thread *", 
     "struct proc *", "uint8_t");
-SDT_PROBE_DEFINE3(sched, , , dequeue, dequeue, "struct thread *", 
+SDT_PROBE_DEFINE3(sched, , , dequeue, "struct thread *", 
     "struct proc *", "void *");
-SDT_PROBE_DEFINE4(sched, , , enqueue, enqueue, "struct thread *", 
+SDT_PROBE_DEFINE4(sched, , , enqueue, "struct thread *", 
     "struct proc *", "void *", "int");
-SDT_PROBE_DEFINE4(sched, , , lend_pri, lend-pri, "struct thread *", 
+SDT_PROBE_DEFINE4(sched, , , lend__pri, "struct thread *", 
     "struct proc *", "uint8_t", "struct thread *");
-SDT_PROBE_DEFINE2(sched, , , load_change, load-change, "int", "int");
-SDT_PROBE_DEFINE2(sched, , , off_cpu, off-cpu, "struct thread *", 
+SDT_PROBE_DEFINE2(sched, , , load__change, "int", "int");
+SDT_PROBE_DEFINE2(sched, , , off__cpu, "struct thread *", 
     "struct proc *");
-SDT_PROBE_DEFINE(sched, , , on_cpu, on-cpu);
-SDT_PROBE_DEFINE(sched, , , remain_cpu, remain-cpu);
-SDT_PROBE_DEFINE2(sched, , , surrender, surrender, "struct thread *", 
+SDT_PROBE_DEFINE(sched, , , on__cpu);
+SDT_PROBE_DEFINE(sched, , , remain__cpu);
+SDT_PROBE_DEFINE2(sched, , , surrender, "struct thread *", 
     "struct proc *");
 
 /*
@@ -531,7 +539,7 @@
 	if ((td->td_flags & TDF_NOLOAD) == 0)
 		tdq->tdq_sysload++;
 	KTR_COUNTER0(KTR_SCHED, "load", tdq->tdq_loadname, tdq->tdq_load);
-	SDT_PROBE2(sched, , , load_change, (int)TDQ_ID(tdq), tdq->tdq_load);
+	SDT_PROBE2(sched, , , load__change, (int)TDQ_ID(tdq), tdq->tdq_load);
 }
 
 /*
@@ -551,10 +559,34 @@
 	if ((td->td_flags & TDF_NOLOAD) == 0)
 		tdq->tdq_sysload--;
 	KTR_COUNTER0(KTR_SCHED, "load", tdq->tdq_loadname, tdq->tdq_load);
-	SDT_PROBE2(sched, , , load_change, (int)TDQ_ID(tdq), tdq->tdq_load);
+	SDT_PROBE2(sched, , , load__change, (int)TDQ_ID(tdq), tdq->tdq_load);
 }
 
 /*
+ * Bound timeshare latency by decreasing slice size as load increases.  We
+ * consider the maximum latency as the sum of the threads waiting to run
+ * aside from curthread and target no more than sched_slice latency but
+ * no less than sched_slice_min runtime.
+ */
+static inline int
+tdq_slice(struct tdq *tdq)
+{
+	int load;
+
+	/*
+	 * It is safe to use sys_load here because this is called from
+	 * contexts where timeshare threads are running and so there
+	 * cannot be higher priority load in the system.
+	 */
+	load = tdq->tdq_sysload - 1;
+	if (load >= SCHED_SLICE_MIN_DIVISOR)
+		return (sched_slice_min);
+	if (load <= 1)
+		return (sched_slice);
+	return (sched_slice / load);
+}
+
+/*
  * Set lowpri to its exact value by searching the run-queue and
  * evaluating curthread.  curthread may be passed as an optimization.
  */
@@ -591,12 +623,14 @@
 	for ((cpu) = 0; (cpu) <= mp_maxid; (cpu)++)		\
 		if (CPU_ISSET(cpu, &mask))
 
-static __inline int cpu_search(const struct cpu_group *cg, struct cpu_search *low,
-    struct cpu_search *high, const int match);
-int cpu_search_lowest(const struct cpu_group *cg, struct cpu_search *low);
-int cpu_search_highest(const struct cpu_group *cg, struct cpu_search *high);
-int cpu_search_both(const struct cpu_group *cg, struct cpu_search *low,
+static __always_inline int cpu_search(const struct cpu_group *cg,
+    struct cpu_search *low, struct cpu_search *high, const int match);
+int __noinline cpu_search_lowest(const struct cpu_group *cg,
+    struct cpu_search *low);
+int __noinline cpu_search_highest(const struct cpu_group *cg,
     struct cpu_search *high);
+int __noinline cpu_search_both(const struct cpu_group *cg,
+    struct cpu_search *low, struct cpu_search *high);
 
 /*
  * Search the tree of cpu_groups for the lowest or highest loaded cpu
@@ -609,7 +643,7 @@
  * match argument.  It is reduced to the minimum set for each case.  It is
  * also recursive to the depth of the tree.
  */
-static __inline int
+static __always_inline int
 cpu_search(const struct cpu_group *cg, struct cpu_search *low,
     struct cpu_search *high, const int match)
 {
@@ -632,10 +666,14 @@
 	}
 
 	/* Iterate through the child CPU groups and then remaining CPUs. */
-	for (i = cg->cg_children, cpu = mp_maxid; i >= 0; ) {
+	for (i = cg->cg_children, cpu = mp_maxid; ; ) {
 		if (i == 0) {
+#ifdef HAVE_INLINE_FFSL
+			cpu = CPU_FFS(&cpumask) - 1;
+#else
 			while (cpu >= 0 && !CPU_ISSET(cpu, &cpumask))
 				cpu--;
+#endif
 			if (cpu < 0)
 				break;
 			child = NULL;
@@ -660,6 +698,7 @@
 				break;
 			}
 		} else {			/* Handle child CPU. */
+			CPU_CLR(cpu, &cpumask);
 			tdq = TDQ_CPU(cpu);
 			load = tdq->tdq_load * 256;
 			rndptr = DPCPU_PTR(randomval);
@@ -707,8 +746,11 @@
 			i--;
 			if (i == 0 && CPU_EMPTY(&cpumask))
 				break;
-		} else
+		}
+#ifndef HAVE_INLINE_FFSL
+		else
 			cpu--;
+#endif
 	}
 	return (total);
 }
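
Where HAVE_INLINE_FFSL is available, the loop above now jumps straight to the
next candidate with CPU_FFS() instead of decrementing cpu until CPU_ISSET()
hits, and the new CPU_CLR() removes each visited CPU from cpumask so the
find-first-set step always lands on a fresh member.  The same walk in
miniature, using ffsl() from <strings.h> (the cast in the call assumes a
two's-complement long):

#include <strings.h>

/*
 * Visit every set bit of a mask the way the new loop walks cpumask:
 * ffsl() finds the lowest set bit, the caller handles it, and the bit
 * is cleared so the next ffsl() lands on a fresh member.
 */
static void
visit_set_bits(unsigned long mask, void (*visit)(int))
{
	int bit;

	while (mask != 0) {
		bit = ffsl((long)mask) - 1;	/* lowest set bit, 0-based */
		visit(bit);
		mask &= ~(1UL << bit);		/* drop it, like CPU_CLR() */
	}
}
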
@@ -771,30 +813,6 @@
 	return high.cs_cpu;
 }
 
-/*
- * Simultaneously find the highest and lowest loaded cpu reachable via
- * cg.
- */
-static inline void
-sched_both(const struct cpu_group *cg, cpuset_t mask, int *lowcpu, int *highcpu)
-{
-	struct cpu_search high;
-	struct cpu_search low;
-
-	low.cs_cpu = -1;
-	low.cs_prefer = -1;
-	low.cs_pri = -1;
-	low.cs_limit = INT_MAX;
-	low.cs_mask = mask;
-	high.cs_cpu = -1;
-	high.cs_limit = -1;
-	high.cs_mask = mask;
-	cpu_search_both(cg, &low, &high);
-	*lowcpu = low.cs_cpu;
-	*highcpu = high.cs_cpu;
-	return;
-}
-
 static void
 sched_balance_group(struct cpu_group *cg)
 {
@@ -905,10 +923,8 @@
 		 * reschedule with the new workload.
 		 */
 		cpu = TDQ_ID(low);
-		sched_pin();
 		if (cpu != PCPU_GET(cpuid))
 			ipi_cpu(cpu, IPI_PREEMPT);
-		sched_unpin();
 	}
 	tdq_unlock_pair(high, low);
 	return (moved);
@@ -1022,6 +1038,14 @@
 	ctd = pcpu_find(cpu)->pc_curthread;
 	if (!sched_shouldpreempt(pri, ctd->td_priority, 1))
 		return;
+
+	/*
+	 * Make sure that tdq_load updated before calling this function
+	 * is globally visible before we read tdq_cpu_idle.  Idle thread
+	 * accesses both of them without locks, and the order is important.
+	 */
+	mb();
+
 	if (TD_IS_IDLETHREAD(ctd)) {
 		/*
 		 * If the MD code has an idle wakeup routine try that before
@@ -1382,7 +1406,8 @@
 	int incr;
 
 	realstathz = stathz ? stathz : hz;
-	sched_slice = realstathz / 10;	/* ~100ms */
+	sched_slice = realstathz / SCHED_SLICE_DEFAULT_DIVISOR;
+	sched_slice_min = sched_slice / SCHED_SLICE_MIN_DIVISOR;
 	hogticks = imax(1, (2 * hz * sched_slice + realstathz / 2) /
 	    realstathz);
 
@@ -1407,7 +1432,7 @@
 	affinity = SCHED_AFFINITY_DEFAULT;
 #endif
 	if (sched_idlespinthresh < 0)
-		sched_idlespinthresh = imax(16, 2 * hz / realstathz);
+		sched_idlespinthresh = 2 * max(10000, 6 * hz) / realstathz;
 }
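
With the code's own defaults the slice arithmetic works out as follows:
realstathz = 127, so sched_slice = 127 / 10 = 12 stathz ticks (~94 ms,
matching the SCHED_SLICE_DEFAULT_DIVISOR comment) and sched_slice_min =
12 / 6 = 2 ticks (~16 ms); tdq_slice() then scales between the two as the
runnable load grows.  A standalone program reproducing the computation,
mirroring tdq_slice()'s load thresholds:

#include <stdio.h>

#define SLICE_DEFAULT_DIVISOR	10	/* mirrors SCHED_SLICE_DEFAULT_DIVISOR */
#define SLICE_MIN_DIVISOR	6	/* mirrors SCHED_SLICE_MIN_DIVISOR */

int
main(void)
{
	int realstathz = 127;					/* typical stathz */
	int sched_slice = realstathz / SLICE_DEFAULT_DIVISOR;	/* 12 ticks */
	int sched_slice_min = sched_slice / SLICE_MIN_DIVISOR;	/* 2 ticks */
	int load, slice;

	for (load = 1; load <= 8; load++) {
		/* Same thresholds as tdq_slice(); load excludes curthread. */
		if (load >= SLICE_MIN_DIVISOR)
			slice = sched_slice_min;
		else if (load <= 1)
			slice = sched_slice;
		else
			slice = sched_slice / load;
		printf("load %d -> %2d ticks (~%d ms)\n",
		    load, slice, slice * 1000 / realstathz);
	}
	return (0);
}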
 
 
@@ -1491,7 +1516,7 @@
 		pri = SCHED_PRI_MIN;
 		if (td->td_sched->ts_ticks)
 			pri += min(SCHED_PRI_TICKS(td->td_sched),
-			    SCHED_PRI_RANGE);
+			    SCHED_PRI_RANGE - 1);
 		pri += SCHED_PRI_NICE(td->td_proc->p_nice);
 		KASSERT(pri >= PRI_MIN_BATCH && pri <= PRI_MAX_BATCH,
 		    ("sched_priority: invalid priority %d: nice %d, " 
@@ -1583,7 +1608,7 @@
 	thread0.td_sched = &td_sched0;
 	td_sched0.ts_ltick = ticks;
 	td_sched0.ts_ftick = ticks;
-	td_sched0.ts_slice = sched_slice;
+	td_sched0.ts_slice = 0;
 }
 
 /*
@@ -1638,12 +1663,12 @@
 	KTR_POINT3(KTR_SCHED, "thread", sched_tdname(td), "prio",
 	    "prio:%d", td->td_priority, "new prio:%d", prio,
 	    KTR_ATTR_LINKED, sched_tdname(curthread));
-	SDT_PROBE3(sched, , , change_pri, td, td->td_proc, prio);
+	SDT_PROBE3(sched, , , change__pri, td, td->td_proc, prio);
 	if (td != curthread && prio < td->td_priority) {
 		KTR_POINT3(KTR_SCHED, "thread", sched_tdname(curthread),
 		    "lend prio", "prio:%d", td->td_priority, "new prio:%d",
 		    prio, KTR_ATTR_LINKED, sched_tdname(td));
-		SDT_PROBE4(sched, , , lend_pri, td, td->td_proc, prio, 
+		SDT_PROBE4(sched, , , lend__pri, td, td->td_proc, prio, 
 		    curthread);
 	} 
 	ts = td->td_sched;
@@ -1846,10 +1871,12 @@
 	ts->ts_rltick = ticks;
 	td->td_lastcpu = td->td_oncpu;
 	td->td_oncpu = NOCPU;
-	preempted = !(td->td_flags & TDF_SLICEEND);
+	preempted = (td->td_flags & TDF_SLICEEND) == 0 &&
+	    (flags & SW_PREEMPT) != 0;
 	td->td_flags &= ~(TDF_NEEDRESCHED | TDF_SLICEEND);
 	td->td_owepreempt = 0;
-	tdq->tdq_switchcnt++;
+	if (!TD_IS_IDLETHREAD(td))
+		tdq->tdq_switchcnt++;
 	/*
 	 * The lock pointer in an idle thread should never change.  Reset it
 	 * to CAN_RUN as well.
@@ -1880,6 +1907,17 @@
 		mtx = thread_lock_block(td);
 		tdq_load_rem(tdq, td);
 	}
+
+#if (KTR_COMPILE & KTR_SCHED) != 0
+	if (TD_IS_IDLETHREAD(td))
+		KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "idle",
+		    "prio:%d", td->td_priority);
+	else
+		KTR_STATE3(KTR_SCHED, "thread", sched_tdname(td), KTDSTATE(td),
+		    "prio:%d", td->td_priority, "wmesg:\"%s\"", td->td_wmesg,
+		    "lockname:\"%s\"", td->td_lockname);
+#endif
+
 	/*
 	 * We enter here with the thread blocked and assigned to the
 	 * appropriate cpu run-queue or sleep-queue and with the current
@@ -1895,7 +1933,7 @@
 		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
 			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
 #endif
-		SDT_PROBE2(sched, , , off_cpu, td, td->td_proc);
+		SDT_PROBE2(sched, , , off__cpu, newtd, newtd->td_proc);
 		lock_profile_release_lock(&TDQ_LOCKPTR(tdq)->lock_object);
 		TDQ_LOCKPTR(tdq)->mtx_lock = (uintptr_t)newtd;
 		sched_pctcpu_update(newtd->td_sched, 0);
@@ -1921,7 +1959,7 @@
 		lock_profile_obtain_lock_success(
 		    &TDQ_LOCKPTR(tdq)->lock_object, 0, 0, __FILE__, __LINE__);
 
-		SDT_PROBE0(sched, , , on_cpu);
+		SDT_PROBE0(sched, , , on__cpu);
 #ifdef	HWPMC_HOOKS
 		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
 			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
@@ -1928,8 +1966,12 @@
 #endif
 	} else {
 		thread_unblock_switch(td, mtx);
-		SDT_PROBE0(sched, , , remain_cpu);
+		SDT_PROBE0(sched, , , remain__cpu);
 	}
+
+	KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "running",
+	    "prio:%d", td->td_priority);
+
 	/*
 	 * Assert that all went well and return.
 	 */
@@ -2001,8 +2043,10 @@
 		sched_interact_update(td);
 		sched_pctcpu_update(ts, 0);
 	}
-	/* Reset the slice value after we sleep. */
-	ts->ts_slice = sched_slice;
+	/*
+	 * Reset the slice value since we slept and advanced the round-robin.
+	 */
+	ts->ts_slice = 0;
 	sched_add(td, SRQ_BORING);
 }
 
@@ -2034,7 +2078,9 @@
 {
 	struct td_sched *ts;
 	struct td_sched *ts2;
+	struct tdq *tdq;
 
+	tdq = TDQ_SELF();
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	/*
 	 * Initialize child.
@@ -2041,7 +2087,9 @@
 	 */
 	ts = td->td_sched;
 	ts2 = child->td_sched;
-	child->td_lock = TDQ_LOCKPTR(TDQ_SELF());
+	child->td_oncpu = NOCPU;
+	child->td_lastcpu = NOCPU;
+	child->td_lock = TDQ_LOCKPTR(tdq);
 	child->td_cpuset = cpuset_ref(td->td_cpuset);
 	ts2->ts_cpu = ts->ts_cpu;
 	ts2->ts_flags = 0;
@@ -2060,7 +2108,8 @@
 	 */
 	ts2->ts_slptime = ts->ts_slptime;
 	ts2->ts_runtime = ts->ts_runtime;
-	ts2->ts_slice = 1;	/* Attempt to quickly learn interactivity. */
+	/* Attempt to quickly learn interactivity. */
+	ts2->ts_slice = tdq_slice(tdq) - sched_slice_min;
 #ifdef KTR
 	bzero(ts2->ts_name, sizeof(ts2->ts_name));
 #endif
@@ -2225,8 +2274,8 @@
 	 * Force a context switch if the current thread has used up a full
 	 * time slice (default is 100ms).
 	 */
-	if (!TD_IS_IDLETHREAD(td) && --ts->ts_slice <= 0) {
-		ts->ts_slice = sched_slice;
+	if (!TD_IS_IDLETHREAD(td) && ++ts->ts_slice >= tdq_slice(tdq)) {
+		ts->ts_slice = 0;
 		td->td_flags |= TDF_NEEDRESCHED | TDF_SLICEEND;
 	}
 }
@@ -2575,18 +2624,31 @@
 {
 	struct thread *td;
 	struct tdq *tdq;
-	int switchcnt;
+	int oldswitchcnt, switchcnt;
 	int i;
 
 	mtx_assert(&Giant, MA_NOTOWNED);
 	td = curthread;
 	tdq = TDQ_SELF();
+	THREAD_NO_SLEEPING();
+	oldswitchcnt = -1;
 	for (;;) {
+		if (tdq->tdq_load) {
+			thread_lock(td);
+			mi_switch(SW_VOL | SWT_IDLE, NULL);
+			thread_unlock(td);
+		}
+		switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
 #ifdef SMP
-		if (tdq_idled(tdq) == 0)
-			continue;
+		if (switchcnt != oldswitchcnt) {
+			oldswitchcnt = switchcnt;
+			if (tdq_idled(tdq) == 0)
+				continue;
+		}
+		switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
+#else
+		oldswitchcnt = switchcnt;
 #endif
-		switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
 		/*
 		 * If we're switching very frequently, spin while checking
 		 * for load rather than entering a low power state that 
@@ -2601,20 +2663,32 @@
 				cpu_spinwait();
 			}
 		}
+
+		/* If there was context switch during spin, restart it. */
 		switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
-		if (tdq->tdq_load == 0) {
-			tdq->tdq_cpu_idle = 1;
-			if (tdq->tdq_load == 0) {
-				cpu_idle(switchcnt > sched_idlespinthresh * 4);
-				tdq->tdq_switchcnt++;
-			}
-			tdq->tdq_cpu_idle = 0;
-		}
-		if (tdq->tdq_load) {
-			thread_lock(td);
-			mi_switch(SW_VOL | SWT_IDLE, NULL);
-			thread_unlock(td);
-		}
+		if (tdq->tdq_load != 0 || switchcnt != oldswitchcnt)
+			continue;
+
+		/* Run main MD idle handler. */
+		tdq->tdq_cpu_idle = 1;
+		/*
+		 * Make sure that tdq_cpu_idle update is globally visible
+		 * before cpu_idle() read tdq_load.  The order is important
+		 * to avoid race with tdq_notify.
+		 */
+		mb();
+		cpu_idle(switchcnt * 4 > sched_idlespinthresh);
+		tdq->tdq_cpu_idle = 0;
+
+		/*
+		 * Account thread-less hardware interrupts and
+		 * other wakeup reasons equal to context switches.
+		 */
+		switchcnt = tdq->tdq_switchcnt + tdq->tdq_oldswitchcnt;
+		if (switchcnt != oldswitchcnt)
+			continue;
+		tdq->tdq_switchcnt++;
+		oldswitchcnt++;
 	}
 }
 
@@ -2638,6 +2712,8 @@
 		MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
 		tdq_load_rem(tdq, td);
 		lock_profile_release_lock(&TDQ_LOCKPTR(tdq)->lock_object);
+		td->td_lastcpu = td->td_oncpu;
+		td->td_oncpu = NOCPU;
 	}
 	KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count"));
 	newtd = choosethread();
@@ -2652,7 +2728,6 @@
 void
 sched_fork_exit(struct thread *td)
 {
-	struct td_sched *ts;
 	struct tdq *tdq;
 	int cpuid;
 
@@ -2662,7 +2737,6 @@
 	 */
 	cpuid = PCPU_GET(cpuid);
 	tdq = TDQ_CPU(cpuid);
-	ts = td->td_sched;
 	if (TD_IS_IDLETHREAD(td))
 		td->td_lock = TDQ_LOCKPTR(tdq);
 	MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
@@ -2670,6 +2744,10 @@
 	TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED);
 	lock_profile_obtain_lock_success(
 	    &TDQ_LOCKPTR(tdq)->lock_object, 0, 0, __FILE__, __LINE__);
+
+	KTR_STATE1(KTR_SCHED, "thread", sched_tdname(td), "running",
+	    "prio:%d", td->td_priority);
+	SDT_PROBE0(sched, , , on__cpu);
 }
 
 /*
@@ -2796,6 +2874,7 @@
 	if (new_val <= 0)
 		return (EINVAL);
 	sched_slice = imax(1, (new_val + period / 2) / period);
+	sched_slice_min = sched_slice / SCHED_SLICE_MIN_DIVISOR;
 	hogticks = imax(1, (2 * hz * sched_slice + realstathz / 2) /
 	    realstathz);
 	return (0);
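
Two of the sched_ule.c hunks form a barrier pair: tdq_notify() makes its
tdq_load update globally visible before reading the idle state, and
sched_idletd() makes tdq_cpu_idle visible before its final read of tdq_load.
With both sides ordered, a waker that misses the idle flag implies the idle
thread must see the new load, so a wakeup cannot fall between the idle
thread's last check and cpu_idle().  A userland model of the pairing in C11
atomics (the kernel uses plain stores plus mb(), not <stdatomic.h>):

#include <stdatomic.h>
#include <stdbool.h>

static atomic_int load;			/* models tdq_load */
static atomic_bool cpu_is_idle;		/* models tdq_cpu_idle */

/* Waker side: publish the new load before checking for an idle CPU. */
static bool
notify_needs_kick(void)
{

	atomic_fetch_add(&load, 1);
	atomic_thread_fence(memory_order_seq_cst);  /* mb() in tdq_notify() */
	return (atomic_load(&cpu_is_idle));
}

/* Idle side: advertise idleness before the final check for work. */
static bool
idle_may_sleep(void)
{

	atomic_store(&cpu_is_idle, true);
	atomic_thread_fence(memory_order_seq_cst);  /* mb() in sched_idletd() */
	if (atomic_load(&load) != 0) {
		atomic_store(&cpu_is_idle, false);  /* work arrived; stay awake */
		return (false);
	}
	return (true);
}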

Modified: trunk/sys/kern/stack_protector.c
===================================================================
--- trunk/sys/kern/stack_protector.c	2018-05-25 20:59:46 UTC (rev 9949)
+++ trunk/sys/kern/stack_protector.c	2018-05-25 21:07:09 UTC (rev 9950)
@@ -1,5 +1,6 @@
+/* $MidnightBSD$ */
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/stack_protector.c 198295 2009-10-20 16:36:51Z ru $");
 
 #include <sys/types.h>
 #include <sys/param.h>

Modified: trunk/sys/kern/subr_acl_nfs4.c
===================================================================
--- trunk/sys/kern/subr_acl_nfs4.c	2018-05-25 20:59:46 UTC (rev 9949)
+++ trunk/sys/kern/subr_acl_nfs4.c	2018-05-25 21:07:09 UTC (rev 9950)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 2008-2010 Edward Tomasz Napierała <trasz at FreeBSD.org>
  * All rights reserved.
@@ -32,9 +33,11 @@
 
 #ifdef _KERNEL
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/subr_acl_nfs4.c 290893 2015-11-15 23:54:34Z ngie $");
 
 #include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
 #include <sys/systm.h>
 #include <sys/mount.h>
 #include <sys/priv.h>
@@ -1066,6 +1069,7 @@
 		child_aclp->acl_cnt++;
 
 		entry->ae_flags &= ~ACL_ENTRY_INHERIT_ONLY;
+		entry->ae_flags |= ACL_ENTRY_INHERITED;
 
 		/*
 		 * If the type of the ACE is neither ALLOW nor DENY,
@@ -1370,3 +1374,46 @@
 
 	return (0);
 }
+
+#ifdef	_KERNEL
+static int
+acl_nfs4_modload(module_t module, int what, void *arg)
+{
+	int ret;
+
+	ret = 0;
+
+	switch (what) {
+	case MOD_LOAD:
+	case MOD_SHUTDOWN:
+		break;
+
+	case MOD_QUIESCE:
+		/* XXX TODO */
+		ret = 0;
+		break;
+
+	case MOD_UNLOAD:
+		/* XXX TODO */
+		ret = 0;
+		break;
+	default:
+		ret = EINVAL;
+		break;
+	}
+
+	return (ret);
+}
+
+static moduledata_t acl_nfs4_mod = {
+	"acl_nfs4",
+	acl_nfs4_modload,
+	NULL
+};
+
+/*
+ * XXX TODO: which subsystem, order?
+ */
+DECLARE_MODULE(acl_nfs4, acl_nfs4_mod, SI_SUB_VFS, SI_ORDER_FIRST);
+MODULE_VERSION(acl_nfs4, 1);
+#endif	/* _KERNEL */
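
The new boilerplate registers subr_acl_nfs4.c with the module system at
SI_SUB_VFS (the XXX comment flags the subsystem/order choice as provisional)
and exports MODULE_VERSION(acl_nfs4, 1) so dependents can pin against it.  A
sketch of such a dependent, where "example" is a made-up module name:

#include <sys/param.h>
#include <sys/errno.h>
#include <sys/kernel.h>
#include <sys/module.h>

/* Hypothetical consumer that refuses to load unless acl_nfs4 is present. */
static int
example_modevent(module_t mod, int what, void *arg)
{

	switch (what) {
	case MOD_LOAD:
	case MOD_UNLOAD:
		return (0);
	default:
		return (EOPNOTSUPP);
	}
}

static moduledata_t example_mod = {
	"example",
	example_modevent,
	NULL
};

DECLARE_MODULE(example, example_mod, SI_SUB_VFS, SI_ORDER_ANY);
/* Matches MODULE_VERSION(acl_nfs4, 1): min 1, preferred 1, max 1. */
MODULE_DEPEND(example, acl_nfs4, 1, 1, 1);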

Modified: trunk/sys/kern/subr_blist.c
===================================================================
--- trunk/sys/kern/subr_blist.c	2018-05-25 20:59:46 UTC (rev 9949)
+++ trunk/sys/kern/subr_blist.c	2018-05-25 21:07:09 UTC (rev 9950)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1998 Matthew Dillon.  All Rights Reserved.
  * Redistribution and use in source and binary forms, with or without
@@ -57,8 +58,8 @@
  *	The non-blocking features of the blist code are used in the swap code
  *	(vm/swap_pager.c).
  *
- *	LAYOUT: The radix tree is layed out recursively using a
- *	linear array.  Each meta node is immediately followed (layed out
+ *	LAYOUT: The radix tree is laid out recursively using a
+ *	linear array.  Each meta node is immediately followed (laid out
  *	sequentially in memory) by BLIST_META_RADIX lower level nodes.  This
  *	is a recursive structure but one that can be easily scanned through
  *	a very simple 'skip' calculation.  In order to support large radixes, 
@@ -80,7 +81,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/subr_blist.c 321459 2017-07-25 04:13:43Z alc $");
 
 #ifdef _KERNEL
 
@@ -99,19 +100,18 @@
 #define BLIST_DEBUG
 #endif
 
-#define SWAPBLK_NONE ((daddr_t)-1)
-
 #include <sys/types.h>
+#include <sys/malloc.h>
 #include <stdio.h>
 #include <string.h>
 #include <stdlib.h>
 #include <stdarg.h>
+#include <stdbool.h>
 
+#define	bitcount64(x)	__bitcount64((uint64_t)(x))
 #define malloc(a,b,c)	calloc(a, 1)
 #define free(a,b)	free(a)
 
-typedef unsigned int u_daddr_t;
-
 #include <sys/blist.h>
 
 void panic(const char *ctl, ...);
@@ -122,22 +122,23 @@
  * static support functions
  */
 
-static daddr_t blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count);
-static daddr_t blst_meta_alloc(blmeta_t *scan, daddr_t blk, 
-				daddr_t count, daddr_t radix, int skip);
+static daddr_t	blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count,
+		    daddr_t cursor);
+static daddr_t	blst_meta_alloc(blmeta_t *scan, daddr_t blk, daddr_t count,
+		    daddr_t radix, daddr_t skip, daddr_t cursor);
 static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count);
 static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, 
-					daddr_t radix, int skip, daddr_t blk);
+		    daddr_t radix, daddr_t skip, daddr_t blk);
 static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, 
 				daddr_t skip, blist_t dest, daddr_t count);
-static int blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count);
-static int blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count,
-				daddr_t radix, int skip, daddr_t blk);
-static daddr_t	blst_radix_init(blmeta_t *scan, daddr_t radix, 
-						int skip, daddr_t count);
+static daddr_t blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count);
+static daddr_t blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count,
+		    daddr_t radix, daddr_t skip, daddr_t blk);
+static daddr_t	blst_radix_init(blmeta_t *scan, daddr_t radix, daddr_t skip,
+		    daddr_t count);
 #ifndef _KERNEL
-static void	blst_radix_print(blmeta_t *scan, daddr_t blk, 
-					daddr_t radix, int skip, int tab);
+static void	blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix,
+		    daddr_t skip, int tab);
 #endif
 
 #ifdef _KERNEL
@@ -159,27 +160,33 @@
 blist_create(daddr_t blocks, int flags)
 {
 	blist_t bl;
-	int radix;
-	int skip = 0;
+	daddr_t nodes, radix, skip;
 
 	/*
 	 * Calculate radix and skip field used for scanning.
 	 */
 	radix = BLIST_BMAP_RADIX;
-
+	skip = 0;
 	while (radix < blocks) {
 		radix *= BLIST_META_RADIX;
 		skip = (skip + 1) * BLIST_META_RADIX;
 	}
+	nodes = 1 + blst_radix_init(NULL, radix, skip, blocks);
 
-	bl = malloc(sizeof(struct blist), M_SWAP, flags | M_ZERO);
+	bl = malloc(sizeof(struct blist), M_SWAP, flags);
+	if (bl == NULL)
+		return (NULL);
 
 	bl->bl_blocks = blocks;
 	bl->bl_radix = radix;
 	bl->bl_skip = skip;
-	bl->bl_rootblks = 1 +
-	    blst_radix_init(NULL, bl->bl_radix, bl->bl_skip, blocks);
-	bl->bl_root = malloc(sizeof(blmeta_t) * bl->bl_rootblks, M_SWAP, flags);
+	bl->bl_cursor = 0;
+	bl->bl_root = malloc(nodes * sizeof(blmeta_t), M_SWAP, flags);
+	if (bl->bl_root == NULL) {
+		free(bl, M_SWAP);
+		return (NULL);
+	}
+	blst_radix_init(bl->bl_root, radix, skip, blocks);
 
 #if defined(BLIST_DEBUG)
 	printf(
@@ -187,14 +194,13 @@
 		", requiring %lldK of ram\n",
 		(long long)bl->bl_blocks,
 		(long long)bl->bl_blocks * 4 / 1024,
-		(long long)(bl->bl_rootblks * sizeof(blmeta_t) + 1023) / 1024
+		(long long)(nodes * sizeof(blmeta_t) + 1023) / 1024
 	);
 	printf("BLIST raw radix tree contains %lld records\n",
-	    (long long)bl->bl_rootblks);
+	    (long long)nodes);
 #endif
-	blst_radix_init(bl->bl_root, bl->bl_radix, bl->bl_skip, blocks);
 
-	return(bl);
+	return (bl);
 }
 
 void 
@@ -205,7 +211,7 @@
 }
 
 /*
- * blist_alloc() - reserve space in the block bitmap.  Return the base
+ * blist_alloc() -   reserve space in the block bitmap.  Return the base
  *		     of a contiguous region or SWAPBLK_NONE if space could
  *		     not be allocated.
  */
@@ -213,20 +219,45 @@
 daddr_t 
 blist_alloc(blist_t bl, daddr_t count)
 {
-	daddr_t blk = SWAPBLK_NONE;
+	daddr_t blk;
 
-	if (bl) {
+	/*
+	 * This loop iterates at most twice.  An allocation failure in the
+	 * first iteration leads to a second iteration only if the cursor was
+	 * non-zero.  When the cursor is zero, an allocation failure will
+	 * reduce the hint, stopping further iterations.
+	 */
+	while (count <= bl->bl_root->bm_bighint) {
 		if (bl->bl_radix == BLIST_BMAP_RADIX)
-			blk = blst_leaf_alloc(bl->bl_root, 0, count);
+			blk = blst_leaf_alloc(bl->bl_root, 0, count,
+			    bl->bl_cursor);
 		else
-			blk = blst_meta_alloc(bl->bl_root, 0, count, bl->bl_radix, bl->bl_skip);
-		if (blk != SWAPBLK_NONE)
-			bl->bl_free -= count;
+			blk = blst_meta_alloc(bl->bl_root, 0, count,
+			    bl->bl_radix, bl->bl_skip, bl->bl_cursor);
+		if (blk != SWAPBLK_NONE) {
+			bl->bl_cursor = blk + count;
+			return (blk);
+		} else if (bl->bl_cursor != 0)
+			bl->bl_cursor = 0;
 	}
-	return(blk);
+	return (SWAPBLK_NONE);
 }
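
blist_alloc() now keeps a cursor: each search starts where the previous
allocation ended, keeping successive allocations roughly sequential (which
helps swap I/O locality), and as the new comment explains, a failed search
retries at most once from block 0 before reporting SWAPBLK_NONE.  A compact
model of the retry policy, with search_from() standing in for the radix-tree
descent:

/* search_from() stands in for the radix-tree walk; returns -1 on failure. */
extern long search_from(long start, long count);

static long
alloc_with_cursor(long *cursor, long count)
{
	long blk;

	blk = search_from(*cursor, count);	/* first pass: from the cursor */
	if (blk == -1 && *cursor != 0) {
		*cursor = 0;			/* second and final pass */
		blk = search_from(0, count);
	}
	if (blk != -1)
		*cursor = blk + count;		/* next search resumes here */
	return (blk);
}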
 
 /*
+ * blist_avail() -	return the number of free blocks.
+ */
+
+daddr_t
+blist_avail(blist_t bl)
+{
+
+	if (bl->bl_radix == BLIST_BMAP_RADIX)
+		return (bitcount64(bl->bl_root->u.bmu_bitmap));
+	else
+		return (bl->bl_root->u.bmu_avail);
+}
+
+/*
  * blist_free() -	free up space in the block bitmap.  Return the base
  *		     	of a contiguous region.  Panic if an inconsistency is
  *			found.
@@ -239,8 +270,8 @@
 		if (bl->bl_radix == BLIST_BMAP_RADIX)
 			blst_leaf_free(bl->bl_root, blkno, count);
 		else
-			blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix, bl->bl_skip, 0);
-		bl->bl_free += count;
+			blst_meta_free(bl->bl_root, blkno, count,
+			    bl->bl_radix, bl->bl_skip, 0);
 	}
 }
 
@@ -251,10 +282,10 @@
  *			actually filled that were free before the call.
  */
 
-int
+daddr_t
 blist_fill(blist_t bl, daddr_t blkno, daddr_t count)
 {
-	int filled;
+	daddr_t filled;
 
 	if (bl) {
 		if (bl->bl_radix == BLIST_BMAP_RADIX)
@@ -262,10 +293,9 @@
 		else
 			filled = blst_meta_fill(bl->bl_root, blkno, count,
 			    bl->bl_radix, bl->bl_skip, 0);
-		bl->bl_free -= filled;
-		return filled;
-	} else
-		return 0;
+		return (filled);
+	}
+	return (0);
 }
 
 /*
@@ -325,77 +355,92 @@
 /*
  * blist_leaf_alloc() -	allocate at a leaf in the radix tree (a bitmap).
  *
- *	This is the core of the allocator and is optimized for the 1 block
- *	and the BLIST_BMAP_RADIX block allocation cases.  Other cases are
- *	somewhat slower.  The 1 block allocation case is log2 and extremely
- *	quick.
+ *	This is the core of the allocator and is optimized for the
+ *	BLIST_BMAP_RADIX block allocation case.  Otherwise, execution
+ *	time is proportional to log2(count) + log2(BLIST_BMAP_RADIX).
  */
 
 static daddr_t
-blst_leaf_alloc(
-	blmeta_t *scan,
-	daddr_t blk,
-	int count
-) {
-	u_daddr_t orig = scan->u.bmu_bitmap;
+blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count, daddr_t cursor)
+{
+	u_daddr_t mask;
+	int count1, hi, lo, mid, num_shifts, range1, range_ext;
 
-	if (orig == 0) {
+	if (count == BLIST_BMAP_RADIX) {
 		/*
-		 * Optimize bitmap all-allocated case.  Also, count = 1
-		 * case assumes at least 1 bit is free in the bitmap, so
-		 * we have to take care of this case here.
+		 * Optimize allocation of BLIST_BMAP_RADIX bits.  If this wasn't
+		 * a special case, then forming the final value of 'mask' below
+		 * would require special handling to avoid an invalid left shift
+		 * when count equals the number of bits in mask.
 		 */
+		if (~scan->u.bmu_bitmap != 0) {
+			scan->bm_bighint = BLIST_BMAP_RADIX - 1;
+			return (SWAPBLK_NONE);
+		}
+		if (cursor != blk)
+			return (SWAPBLK_NONE);
+		scan->u.bmu_bitmap = 0;
 		scan->bm_bighint = 0;
-		return(SWAPBLK_NONE);
+		return (blk);
 	}
-	if (count == 1) {
+	range1 = 0;
+	count1 = count - 1;
+	num_shifts = fls(count1);
+	mask = scan->u.bmu_bitmap;
+	while (mask != 0 && num_shifts > 0) {
 		/*
-		 * Optimized code to allocate one bit out of the bitmap
+		 * If bit i is set in mask, then bits in [i, i+range1] are set
+		 * in scan->u.bmu_bitmap.  The value of range1 is equal to
+		 * count1 >> num_shifts.  Grow range and reduce num_shifts to 0,
+		 * while preserving these invariants.  The updates to mask leave
+		 * fewer bits set, but each bit that remains set represents a
+		 * longer string of consecutive bits set in scan->u.bmu_bitmap.
 		 */
-		u_daddr_t mask;
-		int j = BLIST_BMAP_RADIX/2;
-		int r = 0;
-
-		mask = (u_daddr_t)-1 >> (BLIST_BMAP_RADIX/2);
-
-		while (j) {
-			if ((orig & mask) == 0) {
-			    r += j;
-			    orig >>= j;
-			}
-			j >>= 1;
-			mask >>= j;
-		}
-		scan->u.bmu_bitmap &= ~(1 << r);
-		return(blk + r);
+		num_shifts--;
+		range_ext = range1 + ((count1 >> num_shifts) & 1);
+		mask &= mask >> range_ext;
+		range1 += range_ext;
 	}
-	if (count <= BLIST_BMAP_RADIX) {
+	if (mask == 0) {
 		/*
-		 * non-optimized code to allocate N bits out of the bitmap.
-		 * The more bits, the faster the code runs.  It will run
-		 * the slowest allocating 2 bits, but since there aren't any
-		 * memory ops in the core loop (or shouldn't be, anyway),
-		 * you probably won't notice the difference.
+		 * Update bighint.  There is no allocation bigger than range1
+		 * available in this leaf.
 		 */
-		int j;
-		int n = BLIST_BMAP_RADIX - count;
-		u_daddr_t mask;
+		scan->bm_bighint = range1;
+		return (SWAPBLK_NONE);
+	}
 
-		mask = (u_daddr_t)-1 >> n;
+	/*
+	 * Discard any candidates that appear before the cursor.
+	 */
+	lo = cursor - blk;
+	mask &= ~(u_daddr_t)0 << lo;
 
-		for (j = 0; j <= n; ++j) {
-			if ((orig & mask) == mask) {
-				scan->u.bmu_bitmap &= ~mask;
-				return(blk + j);
-			}
-			mask = (mask << 1);
-		}
+	if (mask == 0)
+		return (SWAPBLK_NONE);
+
+	/*
+	 * The least significant set bit in mask marks the start of the first
+	 * available range of sufficient size.  Clear all the bits but that one,
+	 * and then perform a binary search to find its position.
+	 */
+	mask &= -mask;
+	hi = BLIST_BMAP_RADIX - count1;
+	while (lo + 1 < hi) {
+		mid = (lo + hi) >> 1;
+		if ((mask >> mid) != 0)
+			lo = mid;
+		else
+			hi = mid;
 	}
+
 	/*
-	 * We couldn't allocate count in this subtree, update bighint.
+	 * Set in mask exactly the bits being allocated, and clear them from
+	 * the set of available bits.
 	 */
-	scan->bm_bighint = count - 1;
-	return(SWAPBLK_NONE);
+	mask = (mask << count) - mask;
+	scan->u.bmu_bitmap &= ~mask;
+	return (blk + lo);
 }
 
 /*
@@ -408,62 +453,72 @@
  */
 
 static daddr_t
-blst_meta_alloc(
-	blmeta_t *scan, 
-	daddr_t blk,
-	daddr_t count,
-	daddr_t radix, 
-	int skip
-) {
-	int i;
-	int next_skip = ((u_int)skip / BLIST_META_RADIX);
+blst_meta_alloc(blmeta_t *scan, daddr_t blk, daddr_t count, daddr_t radix,
+    daddr_t skip, daddr_t cursor)
+{
+	daddr_t i, next_skip, r;
+	int child;
+	bool scan_from_start;
 
-	if (scan->u.bmu_avail == 0)  {
+	if (scan->u.bmu_avail < count) {
 		/*
-		 * ALL-ALLOCATED special case
+		 * The meta node's hint must be too large if the allocation
+		 * exceeds the number of free blocks.  Reduce the hint, and
+		 * return failure.
 		 */
-		scan->bm_bighint = count;
-		return(SWAPBLK_NONE);
+		scan->bm_bighint = scan->u.bmu_avail;
+		return (SWAPBLK_NONE);
 	}
+	next_skip = skip / BLIST_META_RADIX;
 
+	/*
+	 * An ALL-FREE meta node requires special handling before allocating
+	 * any of its blocks.
+	 */
 	if (scan->u.bmu_avail == radix) {
 		radix /= BLIST_META_RADIX;
 
 		/*
-		 * ALL-FREE special case, initialize uninitialize
-		 * sublevel.
+		 * Reinitialize each of the meta node's children.  An ALL-FREE
+		 * meta node cannot have a terminator in any subtree.
 		 */
 		for (i = 1; i <= skip; i += next_skip) {
-			if (scan[i].bm_bighint == (daddr_t)-1)
-				break;
-			if (next_skip == 1) {
+			if (next_skip == 1)
 				scan[i].u.bmu_bitmap = (u_daddr_t)-1;
-				scan[i].bm_bighint = BLIST_BMAP_RADIX;
-			} else {
-				scan[i].bm_bighint = radix;
+			else
 				scan[i].u.bmu_avail = radix;
-			}
+			scan[i].bm_bighint = radix;
 		}
 	} else {
 		radix /= BLIST_META_RADIX;
 	}
 
-	for (i = 1; i <= skip; i += next_skip) {
+	if (count > radix) {
+		/*
+		 * The allocation exceeds the number of blocks that are
+		 * managed by a subtree of this meta node.
+		 */
+		panic("allocation too large");
+	}
+	scan_from_start = cursor == blk;
+	child = (cursor - blk) / radix;
+	blk += child * radix;
+	for (i = 1 + child * next_skip; i <= skip; i += next_skip) {
 		if (count <= scan[i].bm_bighint) {
 			/*
-			 * count fits in object
+			 * The allocation might fit in the i'th subtree.
 			 */
-			daddr_t r;
 			if (next_skip == 1) {
-				r = blst_leaf_alloc(&scan[i], blk, count);
+				r = blst_leaf_alloc(&scan[i], blk, count,
+				    cursor > blk ? cursor : blk);
 			} else {
-				r = blst_meta_alloc(&scan[i], blk, count, radix, next_skip - 1);
+				r = blst_meta_alloc(&scan[i], blk, count,
+				    radix, next_skip - 1, cursor > blk ?
+				    cursor : blk);
 			}
 			if (r != SWAPBLK_NONE) {
 				scan->u.bmu_avail -= count;
-				if (scan->bm_bighint > scan->u.bmu_avail)
-					scan->bm_bighint = scan->u.bmu_avail;
-				return(r);
+				return (r);
 			}
 		} else if (scan[i].bm_bighint == (daddr_t)-1) {
 			/*
@@ -470,12 +525,6 @@
 			 * Terminator
 			 */
 			break;
-		} else if (count > radix) {
-			/*
-			 * count does not fit in object even if it were
-			 * complete free.
-			 */
-			panic("blist_meta_alloc: allocation too large");
 		}
 		blk += radix;
 	}
@@ -483,9 +532,10 @@
 	/*
 	 * We couldn't allocate count in this subtree, update bighint.
 	 */
-	if (scan->bm_bighint >= count)
+	if (scan_from_start && scan->bm_bighint >= count)
 		scan->bm_bighint = count - 1;
-	return(SWAPBLK_NONE);
+
+	return (SWAPBLK_NONE);
 }
 
 /*
@@ -538,16 +588,11 @@
  */
 
 static void 
-blst_meta_free(
-	blmeta_t *scan, 
-	daddr_t freeBlk,
-	daddr_t count,
-	daddr_t radix, 
-	int skip,
-	daddr_t blk
-) {
-	int i;
-	int next_skip = ((u_int)skip / BLIST_META_RADIX);
+blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, daddr_t radix,
+    daddr_t skip, daddr_t blk)
+{
+	daddr_t i, next_skip, v;
+	int child;
 
 #if 0
 	printf("free (%llx,%lld) FROM (%llx,%lld)\n",
@@ -555,6 +600,7 @@
 	    (long long)blk, (long long)radix
 	);
 #endif
+	next_skip = skip / BLIST_META_RADIX;
 
 	if (scan->u.bmu_avail == 0) {
 		/*
@@ -599,13 +645,10 @@
 
 	radix /= BLIST_META_RADIX;
 
-	i = (freeBlk - blk) / radix;
-	blk += i * radix;
-	i = i * next_skip + 1;
-
+	child = (freeBlk - blk) / radix;
+	blk += child * radix;
+	i = 1 + child * next_skip;
 	while (i <= skip && blk < freeBlk + count) {
-		daddr_t v;
-
 		v = blk + radix - freeBlk;
 		if (v > count)
 			v = count;
@@ -642,8 +685,7 @@
 	blist_t dest,
 	daddr_t count
 ) {
-	int next_skip;
-	int i;
+	daddr_t i, next_skip;
 
 	/*
 	 * Leaf node
@@ -658,7 +700,7 @@
 			int i;
 
 			for (i = 0; i < BLIST_BMAP_RADIX && i < count; ++i) {
-				if (v & (1 << i))
+				if (v & ((u_daddr_t)1 << i))
 					blist_free(dest, blk + i, 1);
 			}
 		}
@@ -688,7 +730,7 @@
 
 
 	radix /= BLIST_META_RADIX;
-	next_skip = ((u_int)skip / BLIST_META_RADIX);
+	next_skip = skip / BLIST_META_RADIX;
 
 	for (i = 1; count && i <= skip; i += next_skip) {
 		if (scan[i].bm_bighint == (daddr_t)-1)
@@ -729,23 +771,21 @@
  *	the number of blocks allocated by the call.
  */
 
-static int
+static daddr_t
 blst_leaf_fill(blmeta_t *scan, daddr_t blk, int count)
 {
 	int n = blk & (BLIST_BMAP_RADIX - 1);
-	int nblks;
-	u_daddr_t mask, bitmap;
+	daddr_t nblks;
+	u_daddr_t mask;
 
 	mask = ((u_daddr_t)-1 << n) &
 	    ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n));
 
-	/* Count the number of blocks we're about to allocate */
-	bitmap = scan->u.bmu_bitmap & mask;
-	for (nblks = 0; bitmap != 0; nblks++)
-		bitmap &= bitmap - 1;
+	/* Count the number of blocks that we are allocating. */
+	nblks = bitcount64(scan->u.bmu_bitmap & mask);
 
 	scan->u.bmu_bitmap &= ~mask;
-	return nblks;
+	return (nblks);
 }
 
 /*
@@ -756,19 +796,20 @@
  *	range must be within the extent of this node.  Returns the
  *	number of blocks allocated by the call.
  */
-static int
-blst_meta_fill(
-	blmeta_t *scan,
-	daddr_t allocBlk,
-	daddr_t count,
-	daddr_t radix, 
-	int skip,
-	daddr_t blk
-) {
-	int i;
-	int next_skip = ((u_int)skip / BLIST_META_RADIX);
-	int nblks = 0;
+static daddr_t
+blst_meta_fill(blmeta_t *scan, daddr_t allocBlk, daddr_t count, daddr_t radix,
+    daddr_t skip, daddr_t blk)
+{
+	daddr_t i, nblks, next_skip, v;
+	int child;
 
+	if (count > radix) {
+		/*
+		 * The allocation exceeds the number of blocks that are
+		 * managed by this meta node.
+		 */
+		panic("allocation too large");
+	}
 	if (count == radix || scan->u.bmu_avail == 0)  {
 		/*
 		 * ALL-ALLOCATED special case
@@ -775,19 +816,23 @@
 		 */
 		nblks = scan->u.bmu_avail;
 		scan->u.bmu_avail = 0;
-		scan->bm_bighint = count;
+		scan->bm_bighint = 0;
 		return nblks;
 	}
+	next_skip = skip / BLIST_META_RADIX;
 
+	/*
+	 * An ALL-FREE meta node requires special handling before allocating
+	 * any of its blocks.
+	 */
 	if (scan->u.bmu_avail == radix) {
 		radix /= BLIST_META_RADIX;
 
 		/*
-		 * ALL-FREE special case, initialize sublevel
+		 * Reinitialize each of the meta node's children.  An ALL-FREE
+		 * meta node cannot have a terminator in any subtree.
 		 */
 		for (i = 1; i <= skip; i += next_skip) {
-			if (scan[i].bm_bighint == (daddr_t)-1)
-				break;
 			if (next_skip == 1) {
 				scan[i].u.bmu_bitmap = (u_daddr_t)-1;
 				scan[i].bm_bighint = BLIST_BMAP_RADIX;
@@ -800,16 +845,11 @@
 		radix /= BLIST_META_RADIX;
 	}
 
-	if (count > radix)
-		panic("blist_meta_fill: allocation too large");
-
-	i = (allocBlk - blk) / radix;
-	blk += i * radix;
-	i = i * next_skip + 1;
-
+	nblks = 0;
+	child = (allocBlk - blk) / radix;
+	blk += child * radix;
+	i = 1 + child * next_skip;
 	while (i <= skip && blk < allocBlk + count) {
-		daddr_t v;
-
 		v = blk + radix - allocBlk;
 		if (v > count)
 			v = count;
@@ -842,12 +882,12 @@
  */
 
 static daddr_t	
-blst_radix_init(blmeta_t *scan, daddr_t radix, int skip, daddr_t count)
+blst_radix_init(blmeta_t *scan, daddr_t radix, daddr_t skip, daddr_t count)
 {
-	int i;
-	int next_skip;
-	daddr_t memindex = 0;
+	daddr_t i, memindex, next_skip;
 
+	memindex = 0;
+
 	/*
 	 * Leaf node
 	 */
@@ -872,7 +912,7 @@
 	}
 
 	radix /= BLIST_META_RADIX;
-	next_skip = ((u_int)skip / BLIST_META_RADIX);
+	next_skip = skip / BLIST_META_RADIX;
 
 	for (i = 1; i <= skip; i += next_skip) {
 		if (count >= radix) {
@@ -914,15 +954,14 @@
 #ifdef BLIST_DEBUG
 
 static void	
-blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int skip, int tab)
+blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, daddr_t skip,
+    int tab)
 {
-	int i;
-	int next_skip;
-	int lastState = 0;
+	daddr_t i, next_skip;
 
 	if (radix == BLIST_BMAP_RADIX) {
 		printf(
-		    "%*.*s(%08llx,%lld): bitmap %08llx big=%lld\n", 
+		    "%*.*s(%08llx,%lld): bitmap %016llx big=%lld\n", 
 		    tab, tab, "",
 		    (long long)blk, (long long)radix,
 		    (long long)scan->u.bmu_bitmap,
@@ -960,7 +999,7 @@
 	);
 
 	radix /= BLIST_META_RADIX;
-	next_skip = ((u_int)skip / BLIST_META_RADIX);
+	next_skip = skip / BLIST_META_RADIX;
 	tab += 4;
 
 	for (i = 1; i <= skip; i += next_skip) {
@@ -970,7 +1009,6 @@
 			    tab, tab, "",
 			    (long long)blk, (long long)radix
 			);
-			lastState = 0;
 			break;
 		}
 		blst_radix_print(
@@ -1016,11 +1054,10 @@
 
 	for (;;) {
 		char buf[1024];
-		daddr_t da = 0;
-		daddr_t count = 0;
+		long long da = 0;
+		long long count = 0;
 
-
-		printf("%lld/%lld/%lld> ", (long long)bl->bl_free,
+		printf("%lld/%lld/%lld> ", (long long)blist_avail(bl),
 		    (long long)size, (long long)bl->bl_radix);
 		fflush(stdout);
 		if (fgets(buf, sizeof(buf), stdin) == NULL)
@@ -1028,7 +1065,7 @@
 		switch(buf[0]) {
 		case 'r':
 			if (sscanf(buf + 1, "%lld", &count) == 1) {
-				blist_resize(&bl, count, 1);
+				blist_resize(&bl, count, 1, M_WAITOK);
 			} else {
 				printf("?\n");
 			}
@@ -1044,8 +1081,7 @@
 			}
 			break;
 		case 'f':
-			if (sscanf(buf + 1, "%llx %lld",
-			    (long long *)&da, (long long *)&count) == 2) {
+			if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
 				blist_free(bl, da, count);
 			} else {
 				printf("?\n");
@@ -1052,10 +1088,9 @@
 			}
 			break;
 		case 'l':
-			if (sscanf(buf + 1, "%llx %lld",
-			    (long long *)&da, (long long *)&count) == 2) {
-				printf("    n=%d\n",
-				    blist_fill(bl, da, count));
+			if (sscanf(buf + 1, "%llx %lld", &da, &count) == 2) {
+				printf("    n=%jd\n",
+				    (intmax_t)blist_fill(bl, da, count));
 			} else {
 				printf("?\n");
 			}

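The rewritten blst_leaf_alloc above trades the old shift-and-compare scan
for plain bit arithmetic: mask & -mask isolates the least significant set
bit (the first fit at or after the cursor), a short binary search recovers
that bit's index, and (bit << count) - bit widens the single bit into the
run of count bits to clear; blst_leaf_fill likewise swaps its
clear-lowest-bit popcount loop for bitcount64().  A minimal stand-alone
sketch of the leaf-allocation arithmetic, in user-space C for illustration
only (not kernel source):

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t mask = 0x0000000000f0f000ULL;	/* candidate start bits */
	int count = 4;				/* blocks to allocate */
	int lo, hi, mid;

	/* Isolate the least significant set bit: the first fit. */
	mask &= -mask;

	/* Binary search for the bit's index. */
	lo = 0;
	hi = 64;
	while (lo + 1 < hi) {
		mid = (lo + hi) >> 1;
		if ((mask >> mid) != 0)
			lo = mid;
		else
			hi = mid;
	}
	printf("first fit starts at bit %d\n", lo);	/* 12 here */

	/* Widen the single bit into a run of "count" bits. */
	mask = (mask << count) - mask;
	printf("allocation mask %016jx\n", (uintmax_t)mask); /* ...f000 */
	return (0);
}
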
Modified: trunk/sys/kern/subr_bufring.c
===================================================================
--- trunk/sys/kern/subr_bufring.c	2018-05-25 20:59:46 UTC (rev 9949)
+++ trunk/sys/kern/subr_bufring.c	2018-05-25 21:07:09 UTC (rev 9950)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 2007, 2008 Kip Macy <kmacy at freebsd.org>
  * All rights reserved.
@@ -25,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/subr_bufring.c 207673 2010-05-05 20:39:02Z joel $");
 
 
 #include <sys/param.h>

Modified: trunk/sys/kern/subr_bus.c
===================================================================
--- trunk/sys/kern/subr_bus.c	2018-05-25 20:59:46 UTC (rev 9949)
+++ trunk/sys/kern/subr_bus.c	2018-05-25 21:07:09 UTC (rev 9950)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1997,1998,2003 Doug Rabson
  * All rights reserved.
@@ -25,9 +26,10 @@
  */
 
 #include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/subr_bus.c 308402 2016-11-07 09:19:04Z hselasky $");
 
 #include "opt_bus.h"
+#include "opt_random.h"
 
 #include <sys/param.h>
 #include <sys/conf.h>
@@ -40,10 +42,12 @@
 #include <sys/module.h>
 #include <sys/mutex.h>
 #include <sys/poll.h>
+#include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/condvar.h>
 #include <sys/queue.h>
 #include <machine/bus.h>
+#include <sys/random.h>
 #include <sys/rman.h>
 #include <sys/selinfo.h>
 #include <sys/signalvar.h>
@@ -52,9 +56,11 @@
 #include <sys/uio.h>
 #include <sys/bus.h>
 #include <sys/interrupt.h>
+#include <sys/cpuset.h>
 
 #include <net/vnet.h>
 
+#include <machine/cpu.h>
 #include <machine/stdarg.h>
 
 #include <vm/uma.h>
@@ -143,6 +149,8 @@
 static MALLOC_DEFINE(M_BUS, "bus", "Bus data structures");
 static MALLOC_DEFINE(M_BUS_SC, "bus-sc", "Bus data structures, softc");
 
+static void devctl2_init(void);
+
 #ifdef BUS_DEBUG
 
 static int bus_debug = 1;
@@ -281,6 +289,7 @@
 device_sysctl_init(device_t dev)
 {
 	devclass_t dc = dev->devclass;
+	int domain;
 
 	if (dev->sysctl_tree != NULL)
 		return;
@@ -310,6 +319,10 @@
 	    OID_AUTO, "%parent", CTLTYPE_STRING | CTLFLAG_RD,
 	    dev, DEVICE_SYSCTL_PARENT, device_sysctl_handler, "A",
 	    "parent device");
+	if (bus_get_domain(dev, &domain) == 0)
+		SYSCTL_ADD_INT(&dev->sysctl_ctx,
+		    SYSCTL_CHILDREN(dev->sysctl_tree), OID_AUTO, "%domain",
+		    CTLFLAG_RD, NULL, domain, "NUMA domain");
 }
 
 static void
@@ -355,15 +368,16 @@
 /* Deprecated way to adjust queue length */
 static int sysctl_devctl_disable(SYSCTL_HANDLER_ARGS);
 /* XXX Need to support old-style tunable hw.bus.devctl_disable" */
-SYSCTL_PROC(_hw_bus, OID_AUTO, devctl_disable, CTLTYPE_INT | CTLFLAG_RW, NULL,
-    0, sysctl_devctl_disable, "I", "devctl disable -- deprecated");
+SYSCTL_PROC(_hw_bus, OID_AUTO, devctl_disable, CTLTYPE_INT | CTLFLAG_RW |
+    CTLFLAG_MPSAFE, NULL, 0, sysctl_devctl_disable, "I",
+    "devctl disable -- deprecated");
 
 #define DEVCTL_DEFAULT_QUEUE_LEN 1000
 static int sysctl_devctl_queue(SYSCTL_HANDLER_ARGS);
 static int devctl_queue_length = DEVCTL_DEFAULT_QUEUE_LEN;
 TUNABLE_INT("hw.bus.devctl_queue", &devctl_queue_length);
-SYSCTL_PROC(_hw_bus, OID_AUTO, devctl_queue, CTLTYPE_INT | CTLFLAG_RW, NULL,
-    0, sysctl_devctl_queue, "I", "devctl queue length");
+SYSCTL_PROC(_hw_bus, OID_AUTO, devctl_queue, CTLTYPE_INT | CTLFLAG_RW |
+    CTLFLAG_MPSAFE, NULL, 0, sysctl_devctl_queue, "I", "devctl queue length");
 
 static d_open_t		devopen;
 static d_close_t	devclose;
@@ -370,15 +384,16 @@
 static d_read_t		devread;
 static d_ioctl_t	devioctl;
 static d_poll_t		devpoll;
+static d_kqfilter_t	devkqfilter;
 
 static struct cdevsw dev_cdevsw = {
 	.d_version =	D_VERSION,
-	.d_flags =	D_NEEDGIANT,
 	.d_open =	devopen,
 	.d_close =	devclose,
 	.d_read =	devread,
 	.d_ioctl =	devioctl,
 	.d_poll =	devpoll,
+	.d_kqfilter =	devkqfilter,
 	.d_name =	"devctl",
 };
 
@@ -395,13 +410,23 @@
 	int	inuse;
 	int	nonblock;
 	int	queued;
+	int	async;
 	struct mtx mtx;
 	struct cv cv;
 	struct selinfo sel;
 	struct devq devq;
-	struct proc *async_proc;
+	struct sigio *sigio;
 } devsoftc;
 
+static void	filt_devctl_detach(struct knote *kn);
+static int	filt_devctl_read(struct knote *kn, long hint);
+
+struct filterops devctl_rfiltops = {
+	.f_isfd = 1,
+	.f_detach = filt_devctl_detach,
+	.f_event = filt_devctl_read,
+};
+
 static struct cdev *devctl_dev;
 
 static void
@@ -412,17 +437,22 @@
 	mtx_init(&devsoftc.mtx, "dev mtx", "devd", MTX_DEF);
 	cv_init(&devsoftc.cv, "dev cv");
 	TAILQ_INIT(&devsoftc.devq);
+	knlist_init_mtx(&devsoftc.sel.si_note, &devsoftc.mtx);
+	devctl2_init();
 }
 
 static int
 devopen(struct cdev *dev, int oflags, int devtype, struct thread *td)
 {
-	if (devsoftc.inuse)
+
+	mtx_lock(&devsoftc.mtx);
+	if (devsoftc.inuse) {
+		mtx_unlock(&devsoftc.mtx);
 		return (EBUSY);
+	}
 	/* move to init */
 	devsoftc.inuse = 1;
-	devsoftc.nonblock = 0;
-	devsoftc.async_proc = NULL;
+	mtx_unlock(&devsoftc.mtx);
 	return (0);
 }
 
@@ -429,11 +459,14 @@
 static int
 devclose(struct cdev *dev, int fflag, int devtype, struct thread *td)
 {
+
+	mtx_lock(&devsoftc.mtx);
 	devsoftc.inuse = 0;
-	mtx_lock(&devsoftc.mtx);
+	devsoftc.nonblock = 0;
+	devsoftc.async = 0;
 	cv_broadcast(&devsoftc.cv);
+	funsetown(&devsoftc.sigio);
 	mtx_unlock(&devsoftc.mtx);
-	devsoftc.async_proc = NULL;
 	return (0);
 }
 
@@ -489,17 +522,20 @@
 		return (0);
 	case FIOASYNC:
 		if (*(int*)data)
-			devsoftc.async_proc = td->td_proc;
+			devsoftc.async = 1;
 		else
-			devsoftc.async_proc = NULL;
+			devsoftc.async = 0;
 		return (0);
+	case FIOSETOWN:
+		return fsetown(*(int *)data, &devsoftc.sigio);
+	case FIOGETOWN:
+		*(int *)data = fgetown(&devsoftc.sigio);
+		return (0);
 
 		/* (un)Support for other fcntl() calls. */
 	case FIOCLEX:
 	case FIONCLEX:
 	case FIONREAD:
-	case FIOSETOWN:
-	case FIOGETOWN:
 	default:
 		break;
 	}
@@ -523,6 +559,34 @@
 	return (revents);
 }
 
+static int
+devkqfilter(struct cdev *dev, struct knote *kn)
+{
+	int error;
+
+	if (kn->kn_filter == EVFILT_READ) {
+		kn->kn_fop = &devctl_rfiltops;
+		knlist_add(&devsoftc.sel.si_note, kn, 0);
+		error = 0;
+	} else
+		error = EINVAL;
+	return (error);
+}
+
+static void
+filt_devctl_detach(struct knote *kn)
+{
+
+	knlist_remove(&devsoftc.sel.si_note, kn, 0);
+}
+
+static int
+filt_devctl_read(struct knote *kn, long hint)
+{
+	kn->kn_data = devsoftc.queued;
+	return (kn->kn_data != 0);
+}
+
 /**
  * @brief Return whether the userland process is running
  */
@@ -543,7 +607,6 @@
 devctl_queue_data_f(char *data, int flags)
 {
 	struct dev_event_info *n1 = NULL, *n2 = NULL;
-	struct proc *p;
 
 	if (strlen(data) == 0)
 		goto out;
@@ -571,14 +634,11 @@
 	TAILQ_INSERT_TAIL(&devsoftc.devq, n1, dei_link);
 	devsoftc.queued++;
 	cv_broadcast(&devsoftc.cv);
+	KNOTE_LOCKED(&devsoftc.sel.si_note, 0);
 	mtx_unlock(&devsoftc.mtx);
 	selwakeup(&devsoftc.sel);
-	p = devsoftc.async_proc;
-	if (p != NULL) {
-		PROC_LOCK(p);
-		kern_psignal(p, SIGIO);
-		PROC_UNLOCK(p);
-	}
+	if (devsoftc.async && devsoftc.sigio != NULL)
+		pgsigio(&devsoftc.sigio, SIGIO, 0);
 	return;
 out:
 	/*
@@ -1812,6 +1872,8 @@
 
 	PDEBUG(("%s at %s with order %u as unit %d",
 	    name, DEVICENAME(dev), order, unit));
+	KASSERT(name != NULL || unit == -1,
+	    ("child device with wildcard name and specific unit number"));
 
 	child = make_device(dev, name, unit);
 	if (child == NULL)
@@ -1862,7 +1924,11 @@
 
 	PDEBUG(("%s from %s", DEVICENAME(child), DEVICENAME(dev)));
 
-	/* remove children first */
+	/* detach parent before deleting children, if any */
+	if ((error = device_detach(child)) != 0)
+		return (error);
+	
+	/* remove children second */
 	while ((grandchild = TAILQ_FIRST(&child->children)) != NULL) {
 		error = device_delete_child(child, grandchild);
 		if (error)
@@ -1869,8 +1935,6 @@
 			return (error);
 	}
 
-	if ((error = device_detach(child)) != 0)
-		return (error);
 	if (child->devclass)
 		devclass_delete_device(child->devclass, child);
 	if (child->parent)
@@ -2023,9 +2087,15 @@
 			if (!hasclass) {
 				if (device_set_devclass(child,
 				    dl->driver->name) != 0) {
+					char const * devname =
+					    device_get_name(child);
+					if (devname == NULL)
+						devname = "(unknown)";
 					printf("driver bug: Unable to set "
-					    "devclass (devname: %s)\n",
-					    device_get_name(child));
+					    "devclass (class: %s "
+					    "devname: %s)\n",
+					    dl->driver->name,
+					    devname);
 					(void)device_set_driver(child, NULL);
 					continue;
 				}
@@ -2053,6 +2123,16 @@
 			}
 
 			/*
+			 * Probes that return BUS_PROBE_NOWILDCARD or lower
+			 * only match on devices whose driver was explicitly
+			 * specified.
+			 */
+			if (result <= BUS_PROBE_NOWILDCARD &&
+			    !(child->flags & DF_FIXEDCLASS)) {
+				result = ENXIO;
+			}
+
+			/*
 			 * The driver returned an error so it
 			 * certainly doesn't match.
 			 */
@@ -2067,14 +2147,6 @@
 			 * of pri for the first match.
 			 */
 			if (best == NULL || result > pri) {
-				/*
-				 * Probes that return BUS_PROBE_NOWILDCARD
-				 * or lower only match when they are set
-				 * in stone by the parent bus.
-				 */
-				if (result <= BUS_PROBE_NOWILDCARD &&
-				    child->flags & DF_WILDCARD)
-					continue;
 				best = dl;
 				pri = result;
 				continue;
@@ -2758,6 +2830,7 @@
 int
 device_attach(device_t dev)
 {
+	uint64_t attachtime;
 	int error;
 
 	if (resource_disabled(dev->driver->name, dev->unit)) {
@@ -2770,6 +2843,7 @@
 	device_sysctl_init(dev);
 	if (!device_is_quiet(dev))
 		device_print_child(dev->parent, dev);
+	attachtime = get_cyclecount();
 	dev->state = DS_ATTACHING;
 	if ((error = DEVICE_ATTACH(dev)) != 0) {
 		printf("device_attach: %s%d attach returned %d\n",
@@ -2782,6 +2856,17 @@
 		dev->state = DS_NOTPRESENT;
 		return (error);
 	}
+	attachtime = get_cyclecount() - attachtime;
+	/*
+	 * 4 bits per device is a reasonable value for desktop and server
+	 * hardware with good get_cyclecount() implementations, but may
+	 * need to be adjusted on other platforms.
+	 */
+#ifdef RANDOM_DEBUG
+	printf("%s(): feeding %d bit(s) of entropy from %s%d\n",
+	    __func__, 4, dev->driver->name, dev->unit);
+#endif
+	random_harvest(&attachtime, sizeof(attachtime), 4, RANDOM_ATTACH);
 	device_sysctl_update(dev);
 	if (dev->busy)
 		dev->state = DS_BUSY;
@@ -3230,7 +3315,10 @@
 			rle->flags |= RLE_ALLOCATED;
 			return (rle->res);
 		}
-		panic("resource_list_alloc: resource entry is busy");
+		device_printf(bus,
+		    "resource entry %#x type %d for child %s is busy\n", *rid,
+		    type, device_get_nameunit(child));
+		return (NULL);
 	}
 
 	if (isdefault) {
@@ -3314,9 +3402,51 @@
 }
 
 /**
+ * @brief Release all active resources of a given type
+ *
+ * Release all active resources of a specified type.  This is intended
+ * to be used to cleanup resources leaked by a driver after detach or
+ * a failed attach.
+ *
+ * @param rl		the resource list which was allocated from
+ * @param bus		the parent device of @p child
+ * @param child		the device whose active resources are being released
+ * @param type		the type of resources to release
+ * 
+ * @retval 0		success
+ * @retval EBUSY	at least one resource was active
+ */
+int
+resource_list_release_active(struct resource_list *rl, device_t bus,
+    device_t child, int type)
+{
+	struct resource_list_entry *rle;
+	int error, retval;
+
+	retval = 0;
+	STAILQ_FOREACH(rle, rl, link) {
+		if (rle->type != type)
+			continue;
+		if (rle->res == NULL)
+			continue;
+		if ((rle->flags & (RLE_RESERVED | RLE_ALLOCATED)) ==
+		    RLE_RESERVED)
+			continue;
+		retval = EBUSY;
+		error = resource_list_release(rl, bus, child, type,
+		    rman_get_rid(rle->res), rle->res);
+		if (error != 0)
+			device_printf(bus,
+			    "Failed to release active resource: %d\n", error);
+	}
+	return (retval);
+}
+
+
+/**
  * @brief Fully release a reserved resource
  *
- * Fully releases a resouce reserved via resource_list_reserve().
+ * Fully releases a resource reserved via resource_list_reserve().
  *
  * @param rl		the resource list which was allocated from
  * @param bus		the parent device of @p child
@@ -3604,6 +3734,25 @@
 /**
  * @brief Helper function for implementing BUS_PRINT_CHILD().
  *
+ * This function prints out the VM domain for the given device.
+ *
+ * @returns the number of characters printed
+ */
+int
+bus_print_child_domain(device_t dev, device_t child)
+{
+	int domain;
+
+	/* No domain? Don't print anything */
+	if (BUS_GET_DOMAIN(dev, child, &domain) != 0)
+		return (0);
+
+	return (printf(" numa-domain %d", domain));
+}
+
+/**
+ * @brief Helper function for implementing BUS_PRINT_CHILD().
+ *
  * This function simply calls bus_print_child_header() followed by
  * bus_print_child_footer().
  *
@@ -3615,6 +3764,7 @@
 	int	retval = 0;
 
 	retval += bus_print_child_header(dev, child);
+	retval += bus_print_child_domain(dev, child);
 	retval += bus_print_child_footer(dev, child);
 
 	return (retval);
@@ -4029,6 +4179,16 @@
 	return (BUS_CHILD_PRESENT(device_get_parent(dev), dev));
 }
 
+int
+bus_generic_get_domain(device_t dev, device_t child, int *domain)
+{
+
+	if (dev->parent)
+		return (BUS_GET_DOMAIN(dev->parent, dev, domain));
+
+	return (ENOENT);
+}
+
 /*
  * Some convenience functions to make it easier for drivers to use the
  * resource-management functions.  All these really do is hide the
@@ -4361,6 +4521,18 @@
 	return (BUS_GET_DMA_TAG(parent, dev));
 }
 
+/**
+ * @brief Wrapper function for BUS_GET_DOMAIN().
+ *
+ * This function simply calls the BUS_GET_DOMAIN() method of the
+ * parent of @p dev.
+ */
+int
+bus_get_domain(device_t dev, int *domain)
+{
+	return (BUS_GET_DOMAIN(device_get_parent(dev), dev, domain));
+}
+
 /* Resume all devices and then notify userland that we're up again. */
 static int
 root_resume(device_t dev)
@@ -4395,7 +4567,7 @@
 }
 
 /*
- * If we get here, assume that the device is permanant and really is
+ * If we get here, assume that the device is permanent and really is
  * present in the system.  Removable bus drivers are expected to intercept
  * this call long before it gets here.  We return -1 so that drivers that
  * really care can check vs -1 or some ERRNO returned higher in the food
@@ -4833,3 +5005,259 @@
 		return (0);
 	return (bus_release_resource(dev, type, rman_get_rid(r), r));
 }
+
+device_t
+device_lookup_by_name(const char *name)
+{
+	device_t dev;
+
+	TAILQ_FOREACH(dev, &bus_data_devices, devlink) {
+		if (dev->nameunit != NULL && strcmp(dev->nameunit, name) == 0)
+			return (dev);
+	}
+	return (NULL);
+}
+
+/*
+ * /dev/devctl2 implementation.  The existing /dev/devctl device has
+ * implicit semantics on open, so it could not be reused for this.
+ * Another option would be to call this /dev/bus?
+ */
+static int
+find_device(struct devreq *req, device_t *devp)
+{
+	device_t dev;
+
+	/*
+	 * First, ensure that the name is nul terminated.
+	 */
+	if (memchr(req->dr_name, '\0', sizeof(req->dr_name)) == NULL)
+		return (EINVAL);
+
+	/*
+	 * Second, try to find an attached device whose name matches
+	 * 'name'.
+	 */
+	dev = device_lookup_by_name(req->dr_name);
+	if (dev != NULL) {
+		*devp = dev;
+		return (0);
+	}
+
+	/* Finally, give device enumerators a chance. */
+	dev = NULL;
+	EVENTHANDLER_INVOKE(dev_lookup, req->dr_name, &dev);
+	if (dev == NULL)
+		return (ENOENT);
+	*devp = dev;
+	return (0);
+}
+
+static bool
+driver_exists(struct device *bus, const char *driver)
+{
+	devclass_t dc;
+
+	for (dc = bus->devclass; dc != NULL; dc = dc->parent) {
+		if (devclass_find_driver_internal(dc, driver) != NULL)
+			return (true);
+	}
+	return (false);
+}
+
+static int
+devctl2_ioctl(struct cdev *cdev, u_long cmd, caddr_t data, int fflag,
+    struct thread *td)
+{
+	struct devreq *req;
+	device_t dev;
+	int error, old;
+
+	/* Locate the device to control. */
+	mtx_lock(&Giant);
+	req = (struct devreq *)data;
+	switch (cmd) {
+	case DEV_ATTACH:
+	case DEV_DETACH:
+	case DEV_ENABLE:
+	case DEV_DISABLE:
+	case DEV_SET_DRIVER:
+	case DEV_CLEAR_DRIVER:
+		error = priv_check(td, PRIV_DRIVER);
+		if (error == 0)
+			error = find_device(req, &dev);
+		break;
+	default:
+		error = ENOTTY;
+		break;
+	}
+	if (error) {
+		mtx_unlock(&Giant);
+		return (error);
+	}
+
+	/* Perform the requested operation. */
+	switch (cmd) {
+	case DEV_ATTACH:
+		if (device_is_attached(dev) && (dev->flags & DF_REBID) == 0)
+			error = EBUSY;
+		else if (!device_is_enabled(dev))
+			error = ENXIO;
+		else
+			error = device_probe_and_attach(dev);
+		break;
+	case DEV_DETACH:
+		if (!device_is_attached(dev)) {
+			error = ENXIO;
+			break;
+		}
+		if (!(req->dr_flags & DEVF_FORCE_DETACH)) {
+			error = device_quiesce(dev);
+			if (error)
+				break;
+		}
+		error = device_detach(dev);
+		break;
+	case DEV_ENABLE:
+		if (device_is_enabled(dev)) {
+			error = EBUSY;
+			break;
+		}
+
+		/*
+		 * If the device has been probed but not attached (e.g.
+		 * when it has been disabled by a loader hint), just
+		 * attach the device rather than doing a full probe.
+		 */
+		device_enable(dev);
+		if (device_is_alive(dev)) {
+			/*
+			 * If the device was disabled via a hint, clear
+			 * the hint.
+			 */
+			if (resource_disabled(dev->driver->name, dev->unit))
+				resource_unset_value(dev->driver->name,
+				    dev->unit, "disabled");
+			error = device_attach(dev);
+		} else
+			error = device_probe_and_attach(dev);
+		break;
+	case DEV_DISABLE:
+		if (!device_is_enabled(dev)) {
+			error = ENXIO;
+			break;
+		}
+
+		if (!(req->dr_flags & DEVF_FORCE_DETACH)) {
+			error = device_quiesce(dev);
+			if (error)
+				break;
+		}
+
+		/*
+		 * Force DF_FIXEDCLASS on around detach to preserve
+		 * the existing name.
+		 */
+		old = dev->flags;
+		dev->flags |= DF_FIXEDCLASS;
+		error = device_detach(dev);
+		if (!(old & DF_FIXEDCLASS))
+			dev->flags &= ~DF_FIXEDCLASS;
+		if (error == 0)
+			device_disable(dev);
+		break;
+	case DEV_SET_DRIVER: {
+		devclass_t dc;
+		char driver[128];
+
+		error = copyinstr(req->dr_data, driver, sizeof(driver), NULL);
+		if (error)
+			break;
+		if (driver[0] == '\0') {
+			error = EINVAL;
+			break;
+		}
+		if (dev->devclass != NULL &&
+		    strcmp(driver, dev->devclass->name) == 0)
+			/* XXX: Could possibly force DF_FIXEDCLASS on? */
+			break;
+
+		/*
+		 * Scan drivers for this device's bus looking for at
+		 * least one matching driver.
+		 */
+		if (dev->parent == NULL) {
+			error = EINVAL;
+			break;
+		}
+		if (!driver_exists(dev->parent, driver)) {
+			error = ENOENT;
+			break;
+		}
+		dc = devclass_create(driver);
+		if (dc == NULL) {
+			error = ENOMEM;
+			break;
+		}
+
+		/* Detach device if necessary. */
+		if (device_is_attached(dev)) {
+			if (req->dr_flags & DEVF_SET_DRIVER_DETACH)
+				error = device_detach(dev);
+			else
+				error = EBUSY;
+			if (error)
+				break;
+		}
+
+		/* Clear any previously-fixed device class and unit. */
+		if (dev->flags & DF_FIXEDCLASS)
+			devclass_delete_device(dev->devclass, dev);
+		dev->flags |= DF_WILDCARD;
+		dev->unit = -1;
+
+		/* Force the new device class. */
+		error = devclass_add_device(dc, dev);
+		if (error)
+			break;
+		dev->flags |= DF_FIXEDCLASS;
+		error = device_probe_and_attach(dev);
+		break;
+	}
+	case DEV_CLEAR_DRIVER:
+		if (!(dev->flags & DF_FIXEDCLASS)) {
+			error = 0;
+			break;
+		}
+		if (device_is_attached(dev)) {
+			if (req->dr_flags & DEVF_CLEAR_DRIVER_DETACH)
+				error = device_detach(dev);
+			else
+				error = EBUSY;
+			if (error)
+				break;
+		}
+
+		dev->flags &= ~DF_FIXEDCLASS;
+		dev->flags |= DF_WILDCARD;
+		devclass_delete_device(dev->devclass, dev);
+		error = device_probe_and_attach(dev);
+		break;
+	}
+	mtx_unlock(&Giant);
+	return (error);
+}
+
+static struct cdevsw devctl2_cdevsw = {
+	.d_version =	D_VERSION,
+	.d_ioctl =	devctl2_ioctl,
+	.d_name =	"devctl2",
+};
+
+static void
+devctl2_init(void)
+{
+
+	make_dev_credf(MAKEDEV_ETERNAL, &devctl2_cdevsw, 0, NULL,
+	    UID_ROOT, GID_WHEEL, 0600, "devctl2");
+}

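The devctl2 handler above is driven from userland through struct devreq
ioctls under Giant, with priv_check() gating every operation.  A
hypothetical consumer sketch, assuming the struct devreq layout and the
DEV_* ioctl definitions that accompany this change in <sys/bus.h>; this
is what a minimal detach utility might look like, not code from the
commit:

#include <sys/types.h>
#include <sys/bus.h>
#include <sys/ioctl.h>
#include <err.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int
main(int argc, char **argv)
{
	struct devreq req;
	int fd;

	if (argc != 2)
		errx(1, "usage: detach device");
	fd = open("/dev/devctl2", O_RDONLY);
	if (fd == -1)
		err(1, "/dev/devctl2");
	memset(&req, 0, sizeof(req));
	strlcpy(req.dr_name, argv[1], sizeof(req.dr_name));
	/* Leave DEVF_FORCE_DETACH clear so the driver may quiesce first. */
	if (ioctl(fd, DEV_DETACH, &req) == -1)
		err(1, "DEV_DETACH %s", argv[1]);
	close(fd);
	return (0);
}

The new kqueue filter on /dev/devctl serves the complementary read side:
a daemon can now wait for attach/detach events with EVFILT_READ instead
of relying on signal-driven I/O.
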
Added: trunk/sys/kern/subr_bus_dma.c
===================================================================
--- trunk/sys/kern/subr_bus_dma.c	                        (rev 0)
+++ trunk/sys/kern/subr_bus_dma.c	2018-05-25 21:07:09 UTC (rev 9950)
@@ -0,0 +1,582 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 2012 EMC Corp.
+ * All rights reserved.
+ *
+ * Copyright (c) 1997, 1998 Justin T. Gibbs.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/10/sys/kern/subr_bus_dma.c 292348 2015-12-16 19:01:14Z ken $");
+
+#include "opt_bus.h"
+
+#include <sys/param.h>
+#include <sys/conf.h>
+#include <sys/systm.h>
+#include <sys/bio.h>
+#include <sys/bus.h>
+#include <sys/callout.h>
+#include <sys/mbuf.h>
+#include <sys/memdesc.h>
+#include <sys/proc.h>
+#include <sys/uio.h>
+
+#include <vm/vm.h>
+#include <vm/vm_page.h>
+#include <vm/vm_map.h>
+#include <vm/pmap.h>
+
+#include <cam/cam.h>
+#include <cam/cam_ccb.h>
+
+#include <machine/bus.h>
+
+/*
+ * Load up data starting at offset within a region specified by a
+ * list of virtual address ranges until either length or the region
+ * are exhausted.
+ */
+static int
+_bus_dmamap_load_vlist(bus_dma_tag_t dmat, bus_dmamap_t map,
+    bus_dma_segment_t *list, int sglist_cnt, struct pmap *pmap, int *nsegs,
+    int flags, size_t offset, size_t length)
+{
+	int error;
+
+	error = 0;
+	for (; sglist_cnt > 0 && length != 0; sglist_cnt--, list++) {
+		char *addr;
+		size_t ds_len;
+
+		KASSERT((offset < list->ds_len),
+		    ("Invalid mid-segment offset"));
+		addr = (char *)(uintptr_t)list->ds_addr + offset;
+		ds_len = list->ds_len - offset;
+		offset = 0;
+		if (ds_len > length)
+			ds_len = length;
+		length -= ds_len;
+		KASSERT((ds_len != 0), ("Segment length is zero"));
+		error = _bus_dmamap_load_buffer(dmat, map, addr, ds_len, pmap,
+		    flags, NULL, nsegs);
+		if (error)
+			break;
+	}
+	return (error);
+}
+
+/*
+ * Load a list of physical addresses.
+ */
+static int
+_bus_dmamap_load_plist(bus_dma_tag_t dmat, bus_dmamap_t map,
+    bus_dma_segment_t *list, int sglist_cnt, int *nsegs, int flags)
+{
+	int error;
+
+	error = 0;
+	for (; sglist_cnt > 0; sglist_cnt--, list++) {
+		error = _bus_dmamap_load_phys(dmat, map,
+		    (vm_paddr_t)list->ds_addr, list->ds_len, flags, NULL,
+		    nsegs);
+		if (error)
+			break;
+	}
+	return (error);
+}
+
+/*
+ * Load an mbuf chain.
+ */
+static int
+_bus_dmamap_load_mbuf_sg(bus_dma_tag_t dmat, bus_dmamap_t map,
+    struct mbuf *m0, bus_dma_segment_t *segs, int *nsegs, int flags)
+{
+	struct mbuf *m;
+	int error;
+
+	error = 0;
+	for (m = m0; m != NULL && error == 0; m = m->m_next) {
+		if (m->m_len > 0) {
+			error = _bus_dmamap_load_buffer(dmat, map, m->m_data,
+			    m->m_len, kernel_pmap, flags | BUS_DMA_LOAD_MBUF,
+			    segs, nsegs);
+		}
+	}
+	CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d",
+	    __func__, dmat, flags, error, *nsegs);
+	return (error);
+}
+
+/*
+ * Load tlen data starting at offset within a region specified by a list of
+ * physical pages.
+ */
+static int
+_bus_dmamap_load_pages(bus_dma_tag_t dmat, bus_dmamap_t map,
+    vm_page_t *pages, bus_size_t tlen, int offset, int *nsegs, int flags)
+{
+	vm_paddr_t paddr;
+	bus_size_t len;
+	int error, i;
+ 
+	for (i = 0, error = 0; error == 0 && tlen > 0; i++, tlen -= len) {
+		len = min(PAGE_SIZE - offset, tlen);
+		paddr = VM_PAGE_TO_PHYS(pages[i]) + offset;
+		error = _bus_dmamap_load_phys(dmat, map, paddr, len,
+		    flags, NULL, nsegs);
+		offset = 0;
+	}
+	return (error);
+}
+ 
+/*
+ * Load from block io.
+ */
+static int
+_bus_dmamap_load_bio(bus_dma_tag_t dmat, bus_dmamap_t map, struct bio *bio,
+    int *nsegs, int flags)
+{
+
+	if ((bio->bio_flags & BIO_VLIST) != 0) {
+		bus_dma_segment_t *segs = (bus_dma_segment_t *)bio->bio_data;
+		return (_bus_dmamap_load_vlist(dmat, map, segs, bio->bio_ma_n,
+		    kernel_pmap, nsegs, flags, bio->bio_ma_offset,
+		    bio->bio_bcount));
+	}
+
+	if ((bio->bio_flags & BIO_UNMAPPED) != 0)
+		return (_bus_dmamap_load_pages(dmat, map, bio->bio_ma,
+		    bio->bio_bcount, bio->bio_ma_offset, nsegs, flags));
+
+	return (_bus_dmamap_load_buffer(dmat, map, bio->bio_data,
+	    bio->bio_bcount, kernel_pmap, flags, NULL, nsegs));
+}
+
+int
+bus_dmamap_load_ma_triv(bus_dma_tag_t dmat, bus_dmamap_t map,
+    struct vm_page **ma, bus_size_t tlen, int ma_offs, int flags,
+    bus_dma_segment_t *segs, int *segp)
+{
+	vm_paddr_t paddr;
+	bus_size_t len;
+	int error, i;
+
+	error = 0;
+	for (i = 0; tlen > 0; i++, tlen -= len) {
+		len = min(PAGE_SIZE - ma_offs, tlen);
+		paddr = VM_PAGE_TO_PHYS(ma[i]) + ma_offs;
+		error = _bus_dmamap_load_phys(dmat, map, paddr, len,
+		    flags, segs, segp);
+		if (error != 0)
+			break;
+		ma_offs = 0;
+	}
+	return (error);
+}
+
+/*
+ * Load a cam control block.
+ */
+static int
+_bus_dmamap_load_ccb(bus_dma_tag_t dmat, bus_dmamap_t map, union ccb *ccb,
+		    int *nsegs, int flags)
+{
+	struct ccb_hdr *ccb_h;
+	void *data_ptr;
+	int error;
+	uint32_t dxfer_len;
+	uint16_t sglist_cnt;
+
+	error = 0;
+	ccb_h = &ccb->ccb_h;
+	switch (ccb_h->func_code) {
+	case XPT_SCSI_IO: {
+		struct ccb_scsiio *csio;
+
+		csio = &ccb->csio;
+		data_ptr = csio->data_ptr;
+		dxfer_len = csio->dxfer_len;
+		sglist_cnt = csio->sglist_cnt;
+		break;
+	}
+	case XPT_CONT_TARGET_IO: {
+		struct ccb_scsiio *ctio;
+
+		ctio = &ccb->ctio;
+		data_ptr = ctio->data_ptr;
+		dxfer_len = ctio->dxfer_len;
+		sglist_cnt = ctio->sglist_cnt;
+		break;
+	}
+	case XPT_ATA_IO: {
+		struct ccb_ataio *ataio;
+
+		ataio = &ccb->ataio;
+		data_ptr = ataio->data_ptr;
+		dxfer_len = ataio->dxfer_len;
+		sglist_cnt = 0;
+		break;
+	}
+	default:
+		panic("_bus_dmamap_load_ccb: Unsupported func code %d",
+		    ccb_h->func_code);
+	}
+
+	switch ((ccb_h->flags & CAM_DATA_MASK)) {
+	case CAM_DATA_VADDR:
+		error = _bus_dmamap_load_buffer(dmat, map, data_ptr, dxfer_len,
+		    kernel_pmap, flags, NULL, nsegs);
+		break;
+	case CAM_DATA_PADDR:
+		error = _bus_dmamap_load_phys(dmat, map,
+		    (vm_paddr_t)(uintptr_t)data_ptr, dxfer_len, flags, NULL,
+		    nsegs);
+		break;
+	case CAM_DATA_SG:
+		error = _bus_dmamap_load_vlist(dmat, map,
+		    (bus_dma_segment_t *)data_ptr, sglist_cnt, kernel_pmap,
+		    nsegs, flags, 0, dxfer_len);
+		break;
+	case CAM_DATA_SG_PADDR:
+		error = _bus_dmamap_load_plist(dmat, map,
+		    (bus_dma_segment_t *)data_ptr, sglist_cnt, nsegs, flags);
+		break;
+	case CAM_DATA_BIO:
+		error = _bus_dmamap_load_bio(dmat, map, (struct bio *)data_ptr,
+		    nsegs, flags);
+		break;
+	default:
+		panic("_bus_dmamap_load_ccb: flags 0x%X unimplemented",
+		    ccb_h->flags);
+	}
+	return (error);
+}
+
+/*
+ * Load a uio.
+ */
+static int
+_bus_dmamap_load_uio(bus_dma_tag_t dmat, bus_dmamap_t map, struct uio *uio,
+    int *nsegs, int flags)
+{
+	bus_size_t resid;
+	bus_size_t minlen;
+	struct iovec *iov;
+	pmap_t pmap;
+	caddr_t addr;
+	int error, i;
+
+	if (uio->uio_segflg == UIO_USERSPACE) {
+		KASSERT(uio->uio_td != NULL,
+			("bus_dmamap_load_uio: USERSPACE but no proc"));
+		pmap = vmspace_pmap(uio->uio_td->td_proc->p_vmspace);
+	} else
+		pmap = kernel_pmap;
+	resid = uio->uio_resid;
+	iov = uio->uio_iov;
+	error = 0;
+
+	for (i = 0; i < uio->uio_iovcnt && resid != 0 && !error; i++) {
+		/*
+		 * Now at the first iovec to load.  Load each iovec
+		 * until we have exhausted the residual count.
+		 */
+
+		addr = (caddr_t) iov[i].iov_base;
+		minlen = resid < iov[i].iov_len ? resid : iov[i].iov_len;
+		if (minlen > 0) {
+			error = _bus_dmamap_load_buffer(dmat, map, addr,
+			    minlen, pmap, flags, NULL, nsegs);
+			resid -= minlen;
+		}
+	}
+
+	return (error);
+}
+
+/*
+ * Map the buffer buf into bus space using the dmamap map.
+ */
+int
+bus_dmamap_load(bus_dma_tag_t dmat, bus_dmamap_t map, void *buf,
+    bus_size_t buflen, bus_dmamap_callback_t *callback,
+    void *callback_arg, int flags)
+{
+	bus_dma_segment_t *segs;
+	struct memdesc mem;
+	int error;
+	int nsegs;
+
+	if ((flags & BUS_DMA_NOWAIT) == 0) {
+		mem = memdesc_vaddr(buf, buflen);
+		_bus_dmamap_waitok(dmat, map, &mem, callback, callback_arg);
+	}
+
+	nsegs = -1;
+	error = _bus_dmamap_load_buffer(dmat, map, buf, buflen, kernel_pmap,
+	    flags, NULL, &nsegs);
+	nsegs++;
+
+	CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d",
+	    __func__, dmat, flags, error, nsegs);
+
+	if (error == EINPROGRESS)
+		return (error);
+
+	segs = _bus_dmamap_complete(dmat, map, NULL, nsegs, error);
+	if (error)
+		(*callback)(callback_arg, segs, 0, error);
+	else
+		(*callback)(callback_arg, segs, nsegs, 0);
+
+	/*
+	 * Return ENOMEM to the caller so that it can pass it up the stack.
+	 * This error only happens when NOWAIT is set, so deferral is disabled.
+	 */
+	if (error == ENOMEM)
+		return (error);
+
+	return (0);
+}
+
+int
+bus_dmamap_load_mbuf(bus_dma_tag_t dmat, bus_dmamap_t map, struct mbuf *m0,
+    bus_dmamap_callback2_t *callback, void *callback_arg, int flags)
+{
+	bus_dma_segment_t *segs;
+	int nsegs, error;
+
+	M_ASSERTPKTHDR(m0);
+
+	flags |= BUS_DMA_NOWAIT;
+	nsegs = -1;
+	error = _bus_dmamap_load_mbuf_sg(dmat, map, m0, NULL, &nsegs, flags);
+	++nsegs;
+
+	segs = _bus_dmamap_complete(dmat, map, NULL, nsegs, error);
+	if (error)
+		(*callback)(callback_arg, segs, 0, 0, error);
+	else
+		(*callback)(callback_arg, segs, nsegs, m0->m_pkthdr.len, error);
+
+	CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d",
+	    __func__, dmat, flags, error, nsegs);
+	return (error);
+}
+
+int
+bus_dmamap_load_mbuf_sg(bus_dma_tag_t dmat, bus_dmamap_t map, struct mbuf *m0,
+    bus_dma_segment_t *segs, int *nsegs, int flags)
+{
+	int error;
+
+	flags |= BUS_DMA_NOWAIT;
+	*nsegs = -1;
+	error = _bus_dmamap_load_mbuf_sg(dmat, map, m0, segs, nsegs, flags);
+	++*nsegs;
+	_bus_dmamap_complete(dmat, map, segs, *nsegs, error);
+	return (error);
+}
+
+int
+bus_dmamap_load_uio(bus_dma_tag_t dmat, bus_dmamap_t map, struct uio *uio,
+    bus_dmamap_callback2_t *callback, void *callback_arg, int flags)
+{
+	bus_dma_segment_t *segs;
+	int nsegs, error;
+
+	flags |= BUS_DMA_NOWAIT;
+	nsegs = -1;
+	error = _bus_dmamap_load_uio(dmat, map, uio, &nsegs, flags);
+	nsegs++;
+
+	segs = _bus_dmamap_complete(dmat, map, NULL, nsegs, error);
+	if (error)
+		(*callback)(callback_arg, segs, 0, 0, error);
+	else
+		(*callback)(callback_arg, segs, nsegs, uio->uio_resid, error);
+
+	CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d",
+	    __func__, dmat, flags, error, nsegs);
+	return (error);
+}
+
+int
+bus_dmamap_load_ccb(bus_dma_tag_t dmat, bus_dmamap_t map, union ccb *ccb,
+		    bus_dmamap_callback_t *callback, void *callback_arg,
+		    int flags)
+{
+	bus_dma_segment_t *segs;
+	struct ccb_hdr *ccb_h;
+	struct memdesc mem;
+	int error;
+	int nsegs;
+
+	ccb_h = &ccb->ccb_h;
+	if ((ccb_h->flags & CAM_DIR_MASK) == CAM_DIR_NONE) {
+		callback(callback_arg, NULL, 0, 0);
+		return (0);
+	}
+	if ((flags & BUS_DMA_NOWAIT) == 0) {
+		mem = memdesc_ccb(ccb);
+		_bus_dmamap_waitok(dmat, map, &mem, callback, callback_arg);
+	}
+	nsegs = -1;
+	error = _bus_dmamap_load_ccb(dmat, map, ccb, &nsegs, flags);
+	nsegs++;
+
+	CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d",
+	    __func__, dmat, flags, error, nsegs);
+
+	if (error == EINPROGRESS)
+		return (error);
+
+	segs = _bus_dmamap_complete(dmat, map, NULL, nsegs, error);
+	if (error)
+		(*callback)(callback_arg, segs, 0, error);
+	else
+		(*callback)(callback_arg, segs, nsegs, error);
+	/*
+	 * Return ENOMEM to the caller so that it can pass it up the stack.
+	 * This error only happens when NOWAIT is set, so deferral is disabled.
+	 */
+	if (error == ENOMEM)
+		return (error);
+
+	return (0);
+}
+
+int
+bus_dmamap_load_bio(bus_dma_tag_t dmat, bus_dmamap_t map, struct bio *bio,
+		    bus_dmamap_callback_t *callback, void *callback_arg,
+		    int flags)
+{
+	bus_dma_segment_t *segs;
+	struct memdesc mem;
+	int error;
+	int nsegs;
+
+	if ((flags & BUS_DMA_NOWAIT) == 0) {
+		mem = memdesc_bio(bio);
+		_bus_dmamap_waitok(dmat, map, &mem, callback, callback_arg);
+	}
+	nsegs = -1;
+	error = _bus_dmamap_load_bio(dmat, map, bio, &nsegs, flags);
+	nsegs++;
+
+	CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d",
+	    __func__, dmat, flags, error, nsegs);
+
+	if (error == EINPROGRESS)
+		return (error);
+
+	segs = _bus_dmamap_complete(dmat, map, NULL, nsegs, error);
+	if (error)
+		(*callback)(callback_arg, segs, 0, error);
+	else
+		(*callback)(callback_arg, segs, nsegs, error);
+	/*
+	 * Return ENOMEM to the caller so that it can pass it up the stack.
+	 * This error only happens when NOWAIT is set, so deferral is disabled.
+	 */
+	if (error == ENOMEM)
+		return (error);
+
+	return (0);
+}
+
+int
+bus_dmamap_load_mem(bus_dma_tag_t dmat, bus_dmamap_t map,
+    struct memdesc *mem, bus_dmamap_callback_t *callback,
+    void *callback_arg, int flags)
+{
+	bus_dma_segment_t *segs;
+	int error;
+	int nsegs;
+
+	if ((flags & BUS_DMA_NOWAIT) == 0)
+		_bus_dmamap_waitok(dmat, map, mem, callback, callback_arg);
+
+	nsegs = -1;
+	error = 0;
+	switch (mem->md_type) {
+	case MEMDESC_VADDR:
+		error = _bus_dmamap_load_buffer(dmat, map, mem->u.md_vaddr,
+		    mem->md_opaque, kernel_pmap, flags, NULL, &nsegs);
+		break;
+	case MEMDESC_PADDR:
+		error = _bus_dmamap_load_phys(dmat, map, mem->u.md_paddr,
+		    mem->md_opaque, flags, NULL, &nsegs);
+		break;
+	case MEMDESC_VLIST:
+		error = _bus_dmamap_load_vlist(dmat, map, mem->u.md_list,
+		    mem->md_opaque, kernel_pmap, &nsegs, flags, 0, SIZE_T_MAX);
+		break;
+	case MEMDESC_PLIST:
+		error = _bus_dmamap_load_plist(dmat, map, mem->u.md_list,
+		    mem->md_opaque, &nsegs, flags);
+		break;
+	case MEMDESC_BIO:
+		error = _bus_dmamap_load_bio(dmat, map, mem->u.md_bio,
+		    &nsegs, flags);
+		break;
+	case MEMDESC_UIO:
+		error = _bus_dmamap_load_uio(dmat, map, mem->u.md_uio,
+		    &nsegs, flags);
+		break;
+	case MEMDESC_MBUF:
+		error = _bus_dmamap_load_mbuf_sg(dmat, map, mem->u.md_mbuf,
+		    NULL, &nsegs, flags);
+		break;
+	case MEMDESC_CCB:
+		error = _bus_dmamap_load_ccb(dmat, map, mem->u.md_ccb, &nsegs,
+		    flags);
+		break;
+	}
+	nsegs++;
+
+	CTR5(KTR_BUSDMA, "%s: tag %p tag flags 0x%x error %d nsegs %d",
+	    __func__, dmat, flags, error, nsegs);
+
+	if (error == EINPROGRESS)
+		return (error);
+
+	segs = _bus_dmamap_complete(dmat, map, NULL, nsegs, error);
+	if (error)
+		(*callback)(callback_arg, segs, 0, error);
+	else
+		(*callback)(callback_arg, segs, nsegs, 0);
+
+	/*
+	 * Return ENOMEM to the caller so that it can pass it up the stack.
+	 * This error only happens when NOWAIT is set, so deferral is disabled.
+	 */
+	if (error == ENOMEM)
+		return (error);
+
+	return (0);
+}


Property changes on: trunk/sys/kern/subr_bus_dma.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
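
All of the bus_dmamap_load_*() wrappers above follow one contract: nsegs
starts at -1 and is post-incremented so that an empty load reports zero
segments, errors are delivered through the callback with nsegs forced to
0, and EINPROGRESS means the load was deferred and the callback will run
later.  A driver-flavored sketch of that contract; my_softc and its
fields are hypothetical, and the tag and map are assumed to have been
created elsewhere:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <machine/bus.h>

struct my_softc {
	bus_dma_tag_t	dmat;
	bus_dmamap_t	map;
	int		sg_count;
};

static void
my_load_cb(void *arg, bus_dma_segment_t *segs, int nsegs, int error)
{
	struct my_softc *sc = arg;

	if (error != 0) {
		/* The wrappers pass nsegs == 0 on failure. */
		sc->sg_count = 0;
		return;
	}
	/* Program the device's S/G list from segs[0 .. nsegs - 1]. */
	sc->sg_count = nsegs;
}

static int
my_start_io(struct my_softc *sc, void *buf, bus_size_t len)
{
	int error;

	error = bus_dmamap_load(sc->dmat, sc->map, buf, len, my_load_cb,
	    sc, 0);
	/*
	 * Without BUS_DMA_NOWAIT, EINPROGRESS only signals deferral;
	 * any other failure has already been reported to the callback.
	 */
	return (error == EINPROGRESS ? 0 : error);
}
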
Added: trunk/sys/kern/subr_busdma_bufalloc.c
===================================================================
--- trunk/sys/kern/subr_busdma_bufalloc.c	                        (rev 0)
+++ trunk/sys/kern/subr_busdma_bufalloc.c	2018-05-25 21:07:09 UTC (rev 9950)
@@ -0,0 +1,175 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 2012 Ian Lepore
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/10/sys/kern/subr_busdma_bufalloc.c 294677 2016-01-24 19:21:53Z ian $");
+
+/*
+ * Buffer allocation support routines for bus_dmamem_alloc implementations.
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/bus.h>
+#include <sys/busdma_bufalloc.h>
+#include <sys/malloc.h>
+
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_kern.h>
+#include <vm/uma.h>
+
+/*
+ * We manage buffer zones up to a page in size.  Buffers larger than a page can
+ * be managed by one of the kernel's page-oriented memory allocation routines as
+ * efficiently as what we can do here.  Also, a page is the largest size for
+ * which we can guarantee contiguity when using uma, and contiguity is one of the
+ * requirements we have to fulfill.
+ */
+#define	MIN_ZONE_BUFSIZE	32
+#define	MAX_ZONE_BUFSIZE	PAGE_SIZE
+
+/*
+ * The static array of 12 bufzones is big enough to handle all the zones for the
+ * smallest supported allocation size of 32 through the largest supported page
+ * size of 64K.  If you up the biggest page size number, up the array size too.
+ * Basically the size of the array needs to be log2(maxsize)-log2(minsize)+1,
+ * but I don't know of an easy way to express that as a compile-time constant.
+ */
+#if PAGE_SIZE > 65536
+#error Unsupported page size
+#endif
+
+struct busdma_bufalloc {
+	bus_size_t		min_size;
+	size_t			num_zones;
+	struct busdma_bufzone	buf_zones[12];
+};
+
+busdma_bufalloc_t 
+busdma_bufalloc_create(const char *name, bus_size_t minimum_alignment,
+    uma_alloc alloc_func, uma_free free_func, u_int32_t zcreate_flags)
+{
+	struct busdma_bufalloc *ba;
+	struct busdma_bufzone *bz;
+	int i;
+	bus_size_t cursize;
+
+	ba = malloc(sizeof(struct busdma_bufalloc), M_DEVBUF, 
+	    M_ZERO | M_WAITOK);
+
+	ba->min_size = MAX(MIN_ZONE_BUFSIZE, minimum_alignment);
+
+	/*
+	 * Each uma zone is created with an alignment of size-1, meaning that
+	 * the alignment is equal to the size (i.e., 64-byte buffers are aligned
+	 * to 64-byte boundaries, etc.).  This allows for a fast, efficient test
+	 * when deciding whether a pool buffer meets the constraints of a given
+	 * tag used for allocation: the buffer is usable if tag->alignment <=
+	 * bufzone->size.
+	 */
+	for (i = 0, bz = ba->buf_zones, cursize = ba->min_size;
+	    i < nitems(ba->buf_zones) && cursize <= MAX_ZONE_BUFSIZE;
+	    ++i, ++bz, cursize <<= 1) {
+		snprintf(bz->name, sizeof(bz->name), "dma %.10s %ju",
+		    name, (uintmax_t)cursize);
+		bz->size = cursize;
+		bz->umazone = uma_zcreate(bz->name, bz->size,
+		    NULL, NULL, NULL, NULL, bz->size - 1, zcreate_flags);
+		if (bz->umazone == NULL) {
+			busdma_bufalloc_destroy(ba);
+			return (NULL);
+		}
+		if (alloc_func != NULL)
+			uma_zone_set_allocf(bz->umazone, alloc_func);
+		if (free_func != NULL)
+			uma_zone_set_freef(bz->umazone, free_func);
+		++ba->num_zones;
+	}
+
+	return (ba);
+}
+
+void 
+busdma_bufalloc_destroy(busdma_bufalloc_t ba)
+{
+	struct busdma_bufzone *bz;
+	int i;
+
+	if (ba == NULL)
+		return;
+
+	for (i = 0, bz = ba->buf_zones; i < ba->num_zones; ++i, ++bz) {
+		uma_zdestroy(bz->umazone);
+	}
+
+	free(ba, M_DEVBUF);
+}
+
+struct busdma_bufzone * 
+busdma_bufalloc_findzone(busdma_bufalloc_t ba, bus_size_t size)
+{
+	struct busdma_bufzone *bz;
+	int i;
+
+	if (size > MAX_ZONE_BUFSIZE)
+		return (NULL);
+
+	for (i = 0, bz = ba->buf_zones; i < ba->num_zones; ++i, ++bz) {
+		if (bz->size >= size)
+			return (bz);
+	}
+
+	panic("Didn't find a buffer zone of the right size");
+}
+
+void *
+busdma_bufalloc_alloc_uncacheable(uma_zone_t zone, vm_size_t size,
+    uint8_t *pflag, int wait)
+{
+#ifdef VM_MEMATTR_UNCACHEABLE
+
+	/* Inform UMA that this allocator uses kernel_arena/object. */
+	*pflag = UMA_SLAB_KERNEL;
+
+	return ((void *)kmem_alloc_attr(kernel_arena, size, wait, 0,
+	    BUS_SPACE_MAXADDR, VM_MEMATTR_UNCACHEABLE));
+
+#else
+
+	panic("VM_MEMATTR_UNCACHEABLE unavailable");
+
+#endif	/* VM_MEMATTR_UNCACHEABLE */
+}
+
+void 
+busdma_bufalloc_free_uncacheable(void *item, vm_size_t size, uint8_t pflag)
+{
+
+	kmem_free(kernel_arena, (vm_offset_t)item, size);
+}
+


Property changes on: trunk/sys/kern/subr_busdma_bufalloc.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
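
busdma_bufalloc_findzone() above depends on the zones forming a sorted
power-of-two ladder starting at min_size, so the first zone with
size >= request wins; and because each zone is created with alignment
size - 1, a buffer from that zone also satisfies any tag whose alignment
is at most the zone size.  A user-space sketch of the selection rule,
illustrative only:

#include <stdio.h>
#include <stddef.h>

int
main(void)
{
	/* Power-of-two ladder, as built by busdma_bufalloc_create(). */
	size_t zones[] = { 32, 64, 128, 256, 512, 1024, 2048, 4096 };
	size_t request = 96;
	size_t i;

	for (i = 0; i < sizeof(zones) / sizeof(zones[0]); i++) {
		if (zones[i] >= request) {
			/* 96 bytes -> the 128-byte zone. */
			printf("request %zu -> zone %zu\n", request,
			    zones[i]);
			return (0);
		}
	}
	printf("request %zu exceeds the largest zone\n", request);
	return (1);
}
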
Modified: trunk/sys/kern/subr_clock.c
===================================================================
--- trunk/sys/kern/subr_clock.c	2018-05-25 20:59:46 UTC (rev 9949)
+++ trunk/sys/kern/subr_clock.c	2018-05-25 21:07:09 UTC (rev 9950)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1988 University of Utah.
  * Copyright (c) 1982, 1990, 1993
@@ -39,7 +40,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/subr_clock.c 275932 2014-12-19 09:34:14Z kib $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -46,6 +47,7 @@
 #include <sys/kernel.h>
 #include <sys/bus.h>
 #include <sys/clock.h>
+#include <sys/limits.h>
 #include <sys/sysctl.h>
 #include <sys/timetc.h>
 
@@ -132,7 +134,6 @@
 int
 clock_ct_to_ts(struct clocktime *ct, struct timespec *ts)
 {
-	time_t secs;
 	int i, year, days;
 
 	year = ct->year;
@@ -147,7 +148,7 @@
 	if (ct->mon < 1 || ct->mon > 12 || ct->day < 1 ||
 	    ct->day > days_in_month(year, ct->mon) ||
 	    ct->hour > 23 ||  ct->min > 59 || ct->sec > 59 ||
-	    ct->year > 2037) {		/* time_t overflow */
+	    (sizeof(time_t) == 4 && year > 2037)) {	/* time_t overflow */
 		if (ct_debug)
 			printf(" = EINVAL\n");
 		return (EINVAL);
@@ -166,11 +167,10 @@
 	  	days += days_in_month(year, i);
 	days += (ct->day - 1);
 
-	/* Add hours, minutes, seconds. */
-	secs = ((days * 24 + ct->hour) * 60 + ct->min) * 60 + ct->sec;
+	ts->tv_sec = (((time_t)days * 24 + ct->hour) * 60 + ct->min) * 60 +
+	    ct->sec;
+	ts->tv_nsec = ct->nsec;
 
-	ts->tv_sec = secs;
-	ts->tv_nsec = ct->nsec;
 	if (ct_debug)
 		printf(" = %ld.%09ld\n", (long)ts->tv_sec, (long)ts->tv_nsec);
 	return (0);

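The clock_ct_to_ts() change above folds the seconds computation into a
single expression with a (time_t) cast on days, keeping the intermediate
products out of 32-bit int range, while the sizeof(time_t) == 4 test
retains the old 2037 cutoff only where time_t itself would overflow.  A
user-space sketch of why the cast matters, using a hypothetical day
count large enough that the resulting seconds exceed INT32_MAX:

#include <stdio.h>
#include <stdint.h>
#include <time.h>

int
main(void)
{
	int days = 25000;	/* roughly mid-2038, in days since the epoch */
	int hour = 12, min = 0, sec = 0;
	time_t secs;

	/*
	 * Without the cast, days * 24 * 60 * 60 is evaluated in int and
	 * wraps past 2^31 - 1; the cast promotes the whole chain to
	 * time_t (assumed 64-bit here).
	 */
	secs = (((time_t)days * 24 + hour) * 60 + min) * 60 + sec;
	printf("%jd\n", (intmax_t)secs);	/* 2160043200 */
	return (0);
}
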
Added: trunk/sys/kern/subr_counter.c
===================================================================
--- trunk/sys/kern/subr_counter.c	                        (rev 0)
+++ trunk/sys/kern/subr_counter.c	2018-05-25 21:07:09 UTC (rev 9950)
@@ -0,0 +1,97 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 2012 Gleb Smirnoff <glebius at FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/10/sys/kern/subr_counter.c 262739 2014-03-04 14:46:30Z glebius $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
+#include <vm/uma.h>
+
+#define IN_SUBR_COUNTER_C
+#include <sys/counter.h>
+ 
+void
+counter_u64_zero(counter_u64_t c)
+{
+
+	counter_u64_zero_inline(c);
+}
+
+uint64_t
+counter_u64_fetch(counter_u64_t c)
+{
+
+	return (counter_u64_fetch_inline(c));
+}
+
+counter_u64_t
+counter_u64_alloc(int flags)
+{
+	counter_u64_t r;
+
+	r = uma_zalloc(pcpu_zone_64, flags);
+	if (r != NULL)
+		counter_u64_zero(r);
+
+	return (r);
+}
+
+void
+counter_u64_free(counter_u64_t c)
+{
+
+	uma_zfree(pcpu_zone_64, c);
+}
+
+int
+sysctl_handle_counter_u64(SYSCTL_HANDLER_ARGS)
+{
+	uint64_t out;
+	int error;
+
+	out = counter_u64_fetch(*(counter_u64_t *)arg1);
+
+	error = SYSCTL_OUT(req, &out, sizeof(uint64_t));
+
+	if (error || !req->newptr)
+		return (error);
+
+	/*
+	 * Any write attempt to a counter zeroes it.
+	 */
+	counter_u64_zero(*(counter_u64_t *)arg1);
+
+	return (0);
+}


Property changes on: trunk/sys/kern/subr_counter.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
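
The new subr_counter.c above wires counter(9) into UMA's per-CPU zone:
allocation hands back a pcpu_zone_64 item, increments touch only the
local CPU's slot, and fetch sums the slots on demand.  A sketch of the
consumer pattern with hypothetical names; counter_u64_add() is the
inline per-CPU increment from <sys/counter.h>:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/counter.h>
#include <sys/malloc.h>

static counter_u64_t my_pkts;

static void
my_init(void)
{

	my_pkts = counter_u64_alloc(M_WAITOK);	/* zeroed on allocation */
}

static void
my_input(void)
{

	/* Lockless per-CPU increment: no atomics, no line bouncing. */
	counter_u64_add(my_pkts, 1);
}

static uint64_t
my_report(void)
{

	/* Sums all per-CPU slots; not a consistent snapshot. */
	return (counter_u64_fetch(my_pkts));
}
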
Modified: trunk/sys/kern/subr_devstat.c
===================================================================
--- trunk/sys/kern/subr_devstat.c	2018-05-25 20:59:46 UTC (rev 9949)
+++ trunk/sys/kern/subr_devstat.c	2018-05-25 21:07:09 UTC (rev 9950)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1997, 1998, 1999 Kenneth D. Merry.
  * All rights reserved.
@@ -27,7 +28,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/subr_devstat.c 302234 2016-06-27 21:50:30Z bdrewery $");
 
 #include "opt_kdtrace.h"
 
@@ -36,6 +37,7 @@
 #include <sys/systm.h>
 #include <sys/bio.h>
 #include <sys/devicestat.h>
+#include <sys/sdt.h>
 #include <sys/sysctl.h>
 #include <sys/malloc.h>
 #include <sys/lock.h>
@@ -46,58 +48,22 @@
 
 #include <machine/atomic.h>
 
-#ifdef KDTRACE_HOOKS
-#include <sys/dtrace_bsd.h>
+SDT_PROVIDER_DEFINE(io);
 
-dtrace_io_start_probe_func_t dtrace_io_start_probe;
-dtrace_io_done_probe_func_t dtrace_io_done_probe;
-dtrace_io_wait_start_probe_func_t dtrace_io_wait_start_probe;
-dtrace_io_wait_done_probe_func_t dtrace_io_wait_done_probe;
+SDT_PROBE_DEFINE2(io, , , start, "struct bio *", "struct devstat *");
+SDT_PROBE_DEFINE2(io, , , done, "struct bio *", "struct devstat *");
+SDT_PROBE_DEFINE2(io, , , wait__start, "struct bio *",
+    "struct devstat *");
+SDT_PROBE_DEFINE2(io, , , wait__done, "struct bio *",
+    "struct devstat *");
 
-uint32_t	dtio_start_id;
-uint32_t	dtio_done_id;
-uint32_t	dtio_wait_start_id;
-uint32_t	dtio_wait_done_id;
+#define	DTRACE_DEVSTAT_START()		SDT_PROBE2(io, , , start, NULL, ds)
+#define	DTRACE_DEVSTAT_BIO_START()	SDT_PROBE2(io, , , start, bp, ds)
+#define	DTRACE_DEVSTAT_DONE()		SDT_PROBE2(io, , , done, NULL, ds)
+#define	DTRACE_DEVSTAT_BIO_DONE()	SDT_PROBE2(io, , , done, bp, ds)
+#define	DTRACE_DEVSTAT_WAIT_START()	SDT_PROBE2(io, , , wait__start, NULL, ds)
+#define	DTRACE_DEVSTAT_WAIT_DONE()	SDT_PROBE2(io, , , wait__done, NULL, ds)
 
-#define DTRACE_DEVSTAT_START() \
-	if (dtrace_io_start_probe != NULL) \
-		(*dtrace_io_start_probe)(dtio_start_id, NULL, ds);
-
-#define DTRACE_DEVSTAT_BIO_START() \
-	if (dtrace_io_start_probe != NULL) \
-		(*dtrace_io_start_probe)(dtio_start_id, bp, ds);
-
-#define DTRACE_DEVSTAT_DONE() \
-	if (dtrace_io_done_probe != NULL) \
-		(*dtrace_io_done_probe)(dtio_done_id, NULL, ds);
-
-#define DTRACE_DEVSTAT_BIO_DONE() \
-	if (dtrace_io_done_probe != NULL) \
-		(*dtrace_io_done_probe)(dtio_done_id, bp, ds);
-
-#define DTRACE_DEVSTAT_WAIT_START() \
-	if (dtrace_io_wait_start_probe != NULL) \
-		(*dtrace_io_wait_start_probe)(dtio_wait_start_id, NULL, ds);
-
-#define DTRACE_DEVSTAT_WAIT_DONE() \
-	if (dtrace_io_wait_done_probe != NULL) \
-		(*dtrace_io_wait_done_probe)(dtio_wait_done_id, NULL, ds);
-
-#else /* ! KDTRACE_HOOKS */
-
-#define DTRACE_DEVSTAT_START()
-
-#define DTRACE_DEVSTAT_BIO_START()
-
-#define DTRACE_DEVSTAT_DONE()
-
-#define DTRACE_DEVSTAT_BIO_DONE()
-
-#define DTRACE_DEVSTAT_WAIT_START()
-
-#define DTRACE_DEVSTAT_WAIT_DONE()
-#endif /* KDTRACE_HOOKS */
-
 static int devstat_num_devs;
 static long devstat_generation = 1;
 static int devstat_version = DEVSTAT_VERSION;
@@ -131,6 +97,7 @@
 	ds = devstat_alloc();
 	mtx_lock(&devstat_mutex);
 	if (unit_number == -1) {
+		ds->unit_number = unit_number;
 		ds->id = dev_name;
 		binuptime(&ds->creation_time);
 		devstat_generation++;
@@ -242,7 +209,7 @@
 
 	/* Remove this entry from the devstat queue */
 	atomic_add_acq_int(&ds->sequence1, 1);
-	if (ds->id == NULL) {
+	if (ds->unit_number != -1) {
 		devstat_num_devs--;
 		STAILQ_REMOVE(devstat_head, ds, devstat, dev_links);
 	}
@@ -374,6 +341,14 @@
 void
 devstat_end_transaction_bio(struct devstat *ds, struct bio *bp)
 {
+
+	devstat_end_transaction_bio_bt(ds, bp, NULL);
+}
+
+void
+devstat_end_transaction_bio_bt(struct devstat *ds, struct bio *bp,
+    struct bintime *now)
+{
 	devstat_trans_flags flg;
 
 	/* sanity check */
@@ -390,7 +365,7 @@
 		flg = DEVSTAT_NO_DATA;
 
 	devstat_end_transaction(ds, bp->bio_bcount - bp->bio_resid,
-				DEVSTAT_TAG_SIMPLE, flg, NULL, &bp->bio_t0);
+				DEVSTAT_TAG_SIMPLE, flg, now, &bp->bio_t0);
 	DTRACE_DEVSTAT_BIO_DONE();
 }
 
@@ -417,7 +392,7 @@
 	 * XXX devstat_generation should really be "volatile" but that
 	 * XXX freaks out the sysctl macro below.  The places where we
 	 * XXX change it and inspect it are bracketed in the mutex which
-	 * XXX guarantees us proper write barriers.  I don't belive the
+	 * XXX guarantees us proper write barriers.  I don't believe the
 	 * XXX compiler is allowed to optimize mygen away across calls
 	 * XXX to other functions, so the following is believed to be safe.
 	 */
@@ -533,7 +508,7 @@
 	mtx_assert(&devstat_mutex, MA_NOTOWNED);
 	if (!once) {
 		make_dev_credf(MAKEDEV_ETERNAL | MAKEDEV_CHECKNAME,
-		    &devstat_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0440,
+		    &devstat_cdevsw, 0, NULL, UID_ROOT, GID_WHEEL, 0444,
 		    DEVSTAT_DEVICE_NAME);
 		once = 1;
 	}
@@ -603,4 +578,4 @@
 }
 
 SYSCTL_INT(_debug_sizeof, OID_AUTO, devstat, CTLFLAG_RD,
-    NULL, sizeof(struct devstat), "sizeof(struct devstat)");
+    SYSCTL_NULL_INT_PTR, sizeof(struct devstat), "sizeof(struct devstat)");
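
The new devstat_end_transaction_bio_bt() variant takes a caller-supplied
bintime, so a driver completing a batch of bios can pay for one clock read
rather than one per bio. A hedged sketch; sc, its done_queue, and
sc_devstat are hypothetical driver fields:

	struct bintime now;
	struct bio *bp;

	binuptime(&now);		/* one timestamp for the batch */
	while ((bp = bioq_takefirst(&sc->done_queue)) != NULL) {
		devstat_end_transaction_bio_bt(sc->sc_devstat, bp, &now);
		biodone(bp);
	}
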

Modified: trunk/sys/kern/subr_disk.c
===================================================================
--- trunk/sys/kern/subr_disk.c	2018-05-25 20:59:46 UTC (rev 9949)
+++ trunk/sys/kern/subr_disk.c	2018-05-25 21:07:09 UTC (rev 9950)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * ----------------------------------------------------------------------------
  * "THE BEER-WARE LICENSE" (Revision 42):
@@ -12,7 +13,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/subr_disk.c 212160 2010-09-02 19:40:28Z gibbs $");
 
 #include "opt_geom.h"
 

Added: trunk/sys/kern/subr_dnvlist.c
===================================================================
--- trunk/sys/kern/subr_dnvlist.c	                        (rev 0)
+++ trunk/sys/kern/subr_dnvlist.c	2018-05-25 21:07:09 UTC (rev 9950)
@@ -0,0 +1,129 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 2013 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/10/sys/kern/subr_dnvlist.c 292973 2015-12-31 03:28:14Z ngie $");
+
+#ifdef _KERNEL
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+
+#include <machine/stdarg.h>
+
+#else
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdlib.h>
+#endif
+
+#include <sys/nv.h>
+#include <sys/nv_impl.h>
+
+#include <sys/dnv.h>
+
+#define	DNVLIST_GET(ftype, type)					\
+ftype									\
+dnvlist_get_##type(const nvlist_t *nvl, const char *name, ftype defval)	\
+{									\
+									\
+	if (nvlist_exists_##type(nvl, name))				\
+		return (nvlist_get_##type(nvl, name));			\
+	else								\
+		return (defval);					\
+}
+
+DNVLIST_GET(bool, bool)
+DNVLIST_GET(uint64_t, number)
+DNVLIST_GET(const char *, string)
+DNVLIST_GET(const nvlist_t *, nvlist)
+#ifndef _KERNEL
+DNVLIST_GET(int, descriptor)
+#endif
+
+#undef	DNVLIST_GET
+
+const void *
+dnvlist_get_binary(const nvlist_t *nvl, const char *name, size_t *sizep,
+    const void *defval, size_t defsize)
+{
+	const void *value;
+
+	if (nvlist_exists_binary(nvl, name))
+		value = nvlist_get_binary(nvl, name, sizep);
+	else {
+		if (sizep != NULL)
+			*sizep = defsize;
+		value = defval;
+	}
+	return (value);
+}
+
+#define	DNVLIST_TAKE(ftype, type)					\
+ftype									\
+dnvlist_take_##type(nvlist_t *nvl, const char *name, ftype defval)	\
+{									\
+									\
+	if (nvlist_exists_##type(nvl, name))				\
+		return (nvlist_take_##type(nvl, name));			\
+	else								\
+		return (defval);					\
+}
+
+DNVLIST_TAKE(bool, bool)
+DNVLIST_TAKE(uint64_t, number)
+DNVLIST_TAKE(char *, string)
+DNVLIST_TAKE(nvlist_t *, nvlist)
+#ifndef _KERNEL
+DNVLIST_TAKE(int, descriptor)
+#endif
+
+#undef	DNVLIST_TAKE
+
+void *
+dnvlist_take_binary(nvlist_t *nvl, const char *name, size_t *sizep,
+    void *defval, size_t defsize)
+{
+	void *value;
+
+	if (nvlist_exists_binary(nvl, name))
+		value = nvlist_take_binary(nvl, name, sizep);
+	else {
+		if (sizep != NULL)
+			*sizep = defsize;
+		value = defval;
+	}
+	return (value);
+}
+


Property changes on: trunk/sys/kern/subr_dnvlist.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
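
A short user-space sketch of the default-value accessors generated above,
assuming the nvlist_add_*() setters from nv(9); key names and defaults are
made up for illustration (link with -lnv):

#include <sys/nv.h>
#include <sys/dnv.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	nvlist_t *nvl;

	nvl = nvlist_create(0);
	nvlist_add_number(nvl, "timeout", 30);

	/* Key present: the stored value wins over the default. */
	printf("%ju\n", (uintmax_t)dnvlist_get_number(nvl, "timeout", 60));
	/* Key absent: the supplied default is returned instead. */
	printf("%ju\n", (uintmax_t)dnvlist_get_number(nvl, "retries", 5));

	nvlist_destroy(nvl);
	return (0);
}
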
Modified: trunk/sys/kern/subr_dummy_vdso_tc.c
===================================================================
--- trunk/sys/kern/subr_dummy_vdso_tc.c	2018-05-25 20:59:46 UTC (rev 9949)
+++ trunk/sys/kern/subr_dummy_vdso_tc.c	2018-05-25 21:07:09 UTC (rev 9950)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright 2012 Konstantin Belousov <kib at FreeBSD.ORG>.
  * All rights reserved.
@@ -25,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/subr_dummy_vdso_tc.c 237433 2012-06-22 07:06:40Z kib $");
 
 #include "opt_compat.h"
 

Modified: trunk/sys/kern/subr_eventhandler.c
===================================================================
--- trunk/sys/kern/subr_eventhandler.c	2018-05-25 20:59:46 UTC (rev 9949)
+++ trunk/sys/kern/subr_eventhandler.c	2018-05-25 21:07:09 UTC (rev 9950)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1999 Michael Smith <msmith at freebsd.org>
  * All rights reserved.
@@ -25,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/subr_eventhandler.c 205345 2010-03-19 19:51:03Z bz $");
 
 #include <sys/param.h>
 #include <sys/kernel.h>

Modified: trunk/sys/kern/subr_fattime.c
===================================================================
--- trunk/sys/kern/subr_fattime.c	2018-05-25 20:59:46 UTC (rev 9949)
+++ trunk/sys/kern/subr_fattime.c	2018-05-25 21:07:09 UTC (rev 9950)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 2006 Poul-Henning Kamp
  * All rights reserved.
@@ -23,7 +24,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/kern/subr_fattime.c 266368 2014-05-17 22:03:44Z ian $
  *
  * Convert MS-DOS FAT format timestamps to and from unix timespecs
  *
@@ -49,9 +50,9 @@
  * "New Technology".  Anyway...
  *
  * The 'utc' argument determines if the resulting FATTIME timestamp
- * should b on the UTC or local timezone calendar.
+ * should be on the UTC or local timezone calendar.
  *
- * The conversion functions below cut time into four-year leap-second
+ * The conversion functions below cut time into four-year leap-year
  * cycles rather than single years and uses table lookups inside those
  * cycles to get the months and years sorted out.
  *
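
For orientation, the packed layout these functions translate is the
standard 16-bit FAT time/date pair; a hand-decoding sketch (these are not
the file's actual helpers):

#include <stdint.h>
#include <stdio.h>

static void
fat_decode(uint16_t fdate, uint16_t ftime)
{
	int year, mon, day, hour, min, sec;

	year = ((fdate >> 9) & 0x7f) + 1980;	/* epoch is 1980 */
	mon  = (fdate >> 5) & 0x0f;		/* 1..12 */
	day  = fdate & 0x1f;			/* 1..31 */
	hour = (ftime >> 11) & 0x1f;		/* 0..23 */
	min  = (ftime >> 5) & 0x3f;		/* 0..59 */
	sec  = (ftime & 0x1f) * 2;		/* 2-second resolution */
	printf("%04d-%02d-%02d %02d:%02d:%02d\n",
	    year, mon, day, hour, min, sec);
}
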

Modified: trunk/sys/kern/subr_firmware.c
===================================================================
--- trunk/sys/kern/subr_firmware.c	2018-05-25 20:59:46 UTC (rev 9949)
+++ trunk/sys/kern/subr_firmware.c	2018-05-25 21:07:09 UTC (rev 9950)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 2005-2008, Sam Leffler <sam at errno.com>
  * All rights reserved.
@@ -25,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/subr_firmware.c 237546 2012-06-25 05:41:16Z kevlo $");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
@@ -175,7 +176,10 @@
     unsigned int version, const struct firmware *parent)
 {
 	struct priv_fw *match, *frp;
+	char *str;
 
+	str = strdup(imagename, M_TEMP);
+
 	mtx_lock(&firmware_mtx);
 	/*
 	 * Do a lookup to make sure the name is unique or find a free slot.
@@ -185,6 +189,7 @@
 		mtx_unlock(&firmware_mtx);
 		printf("%s: image %s already registered!\n",
 			__func__, imagename);
+		free(str, M_TEMP);
 		return NULL;
 	}
 	if (frp == NULL) {
@@ -191,10 +196,11 @@
 		mtx_unlock(&firmware_mtx);
 		printf("%s: cannot register image %s, firmware table full!\n",
 		    __func__, imagename);
+		free(str, M_TEMP);
 		return NULL;
 	}
 	bzero(frp, sizeof(*frp));	/* start from a clean record */
-	frp->fw.name = imagename;
+	frp->fw.name = str;
 	frp->fw.data = data;
 	frp->fw.datasize = datasize;
 	frp->fw.version = version;
@@ -230,7 +236,7 @@
 		err = 0;
 	} else if (fp->refcnt != 0) {	/* cannot unregister */
 		err = EBUSY;
-	}  else {
+	} else {
 		linker_file_t x = fp->file;	/* save value */
 
 		/*
@@ -238,6 +244,7 @@
 		 * do not forget anything. Then restore 'file' which is
 		 * non-null for autoloaded images.
 		 */
+		free((void *) (uintptr_t) fp->fw.name, M_TEMP);
 		bzero(fp, sizeof(struct priv_fw));
 		fp->file = x;
 		err = 0;
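
Since firmware_register() now strdup()s the image name, a caller's name
buffer no longer has to outlive the registration. A hedged sketch of the
registration call; the mydev names are hypothetical and the image bytes
are elided:

static const uint8_t mydev_fw_image[] = { 0x00 /* ... image bytes ... */ };

static int
mydev_register_fw(void)
{

	if (firmware_register("mydev_fw", mydev_fw_image,
	    sizeof(mydev_fw_image), 1, NULL) == NULL)
		return (ENXIO);
	return (0);
}
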

Modified: trunk/sys/kern/subr_hash.c
===================================================================
--- trunk/sys/kern/subr_hash.c	2018-05-25 20:59:46 UTC (rev 9949)
+++ trunk/sys/kern/subr_hash.c	2018-05-25 21:07:09 UTC (rev 9950)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1982, 1986, 1991, 1993
  *	The Regents of the University of California.  All rights reserved.
@@ -35,7 +36,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/subr_hash.c 230486 2012-01-23 16:31:46Z glebius $");
 
 #include <sys/param.h>
 #include <sys/systm.h>

Modified: trunk/sys/kern/subr_hints.c
===================================================================
--- trunk/sys/kern/subr_hints.c	2018-05-25 20:59:46 UTC (rev 9949)
+++ trunk/sys/kern/subr_hints.c	2018-05-25 21:07:09 UTC (rev 9950)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 2000,2001 Peter Wemm <peter at FreeBSD.org>
  * All rights reserved.
@@ -25,11 +26,13 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/subr_hints.c 295131 2016-02-01 23:07:31Z jhb $");
 
 #include <sys/param.h>
 #include <sys/lock.h>
+#include <sys/malloc.h>
 #include <sys/mutex.h>
+#include <sys/sysctl.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 
@@ -42,6 +45,85 @@
 static char *hintp;
 
 /*
+ * Define the kern.hintmode sysctl, which accepts only the value 2; setting
+ * it switches from static KENV mode to dynamic KENV, so that systems with
+ * hints compiled into the kernel can see and modify the KENV (and hints too).
+ */
+
+static int
+sysctl_hintmode(SYSCTL_HANDLER_ARGS)
+{
+	const char *cp;
+	char *line, *eq;
+	int eqidx, error, from_kenv, i, value;
+
+	from_kenv = 0;
+	cp = kern_envp;
+	value = hintmode;
+
+	/* Fetch candidate for new hintmode value */
+	error = sysctl_handle_int(oidp, &value, 0, req);
+	if (error || req->newptr == NULL)
+		return (error);
+
+	if (value != 2)
+		/* Only accept switching to hintmode 2 */
+		return (EINVAL);
+
+	/* Migrate from static to dynamic hints */
+	switch (hintmode) {
+	case 0:
+		if (dynamic_kenv) {
+			/*
+			 * Already here. But assign hintmode to 2, to not
+			 * check it in the future.
+			 */
+			hintmode = 2;
+			return (0);
+		}
+		from_kenv = 1;
+		cp = kern_envp;
+		break;
+	case 1:
+		cp = static_hints;
+		break;
+	case 2:
+		/* Nothing to do, hintmode already 2 */
+		return (0);
+	}
+
+	while (cp) {
+		i = strlen(cp);
+		if (i == 0)
+			break;
+		if (from_kenv) {
+			/* The kenv may contain entries other than hints. */
+			if (strncmp(cp, "hint.", 5) != 0) {
+				cp += i + 1;
+				continue;
+			}
+		}
+		eq = strchr(cp, '=');
+		if (eq == NULL) {
+			/* Malformed hint; skip it. */
+			cp += i + 1;
+			continue;
+		}
+		eqidx = eq - cp;
+
+		line = malloc(i+1, M_TEMP, M_WAITOK);
+		strcpy(line, cp);
+		line[eqidx] = '\0';
+		setenv(line, line + eqidx + 1);
+		free(line, M_TEMP);
+		cp += i + 1;
+	}
+
+	hintmode = value;
+	use_kenv = 1;
+	return (0);
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, hintmode, CTLTYPE_INT|CTLFLAG_RW,
+    &hintmode, 0, sysctl_hintmode, "I", "Get/set current hintmode");
+
+/*
  * Evil wildcarding resource string lookup.
  * This walks the supplied env string table and returns a match.
  * The start point can be remembered for incremental searches.
@@ -129,12 +211,11 @@
 		if (strncmp(cp, "hint.", 5) != 0)
 			hit = 0;
 		else
-			n = sscanf(cp, "hint.%32[^.].%d.%32[^=]=%128s",
+			n = sscanf(cp, "hint.%32[^.].%d.%32[^=]=%127s",
 			    r_name, &r_unit, r_resname, r_value);
 		if (hit && n != 4) {
 			printf("CONFIG: invalid hint '%s'\n", cp);
-			/* XXX: abuse bogus index() declaration */
-			p = index(cp, 'h');
+			p = strchr(cp, 'h');
 			*p = 'H';
 			hit = 0;
 		}
@@ -172,18 +253,18 @@
 	s = cp;
 	/* This is a bit of a hack, but at least is reentrant */
 	/* Note that it returns some !unterminated! strings. */
-	s = index(s, '.') + 1;		/* start of device */
+	s = strchr(s, '.') + 1;		/* start of device */
 	if (ret_name)
 		*ret_name = s;
-	s = index(s, '.') + 1;		/* start of unit */
+	s = strchr(s, '.') + 1;		/* start of unit */
 	if (ret_namelen && ret_name)
 		*ret_namelen = s - *ret_name - 1; /* device length */
 	if (ret_unit)
 		*ret_unit = r_unit;
-	s = index(s, '.') + 1;		/* start of resname */
+	s = strchr(s, '.') + 1;		/* start of resname */
 	if (ret_resname)
 		*ret_resname = s;
-	s = index(s, '=') + 1;		/* start of value */
+	s = strchr(s, '=') + 1;		/* start of value */
 	if (ret_resnamelen && ret_resname)
 		*ret_resnamelen = s - *ret_resname - 1; /* value len */
 	if (ret_value)
@@ -381,3 +462,31 @@
 	       return (0);
 	return (value);
 }
+
+/*
+ * Clear a value associated with a device by removing it from
+ * the kernel environment.  This only removes a hint for an
+ * exact unit.
+ */
+int
+resource_unset_value(const char *name, int unit, const char *resname)
+{
+	char varname[128];
+	const char *retname, *retvalue;
+	int error, line;
+	size_t len;
+
+	line = 0;
+	error = resource_find(&line, NULL, name, &unit, resname, NULL,
+	    &retname, NULL, NULL, NULL, NULL, &retvalue);
+	if (error)
+		return (error);
+
+	retname -= strlen("hint.");
+	len = retvalue - retname - 1;
+	if (len > sizeof(varname) - 1)
+		return (ENAMETOOLONG);
+	memcpy(varname, retname, len);
+	varname[len] = '\0';
+	return (unsetenv(varname));
+}
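
resource_unset_value() rounds out the existing resource_*_value() getters;
a sketch of reading a hint and then removing it from the kernel
environment (the device and resource names are illustrative):

static void
mydev_eat_hint(void)
{
	int irq;

	if (resource_int_value("sio", 0, "irq", &irq) == 0) {
		printf("hint.sio.0.irq is %d\n", irq);
		/* Drop hint.sio.0.irq; only the exact unit is removed. */
		resource_unset_value("sio", 0, "irq");
	}
}
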

Modified: trunk/sys/kern/subr_kdb.c
===================================================================
--- trunk/sys/kern/subr_kdb.c	2018-05-25 20:59:46 UTC (rev 9949)
+++ trunk/sys/kern/subr_kdb.c	2018-05-25 21:07:09 UTC (rev 9950)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 2004 The FreeBSD Project
  * All rights reserved.
@@ -25,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/subr_kdb.c 325460 2017-11-05 22:34:27Z ngie $");
 
 #include "opt_kdb.h"
 #include "opt_stack.h"
@@ -91,25 +92,30 @@
 SYSCTL_PROC(_debug_kdb, OID_AUTO, current, CTLTYPE_STRING | CTLFLAG_RW, NULL,
     0, kdb_sysctl_current, "A", "currently selected KDB backend");
 
-SYSCTL_PROC(_debug_kdb, OID_AUTO, enter, CTLTYPE_INT | CTLFLAG_RW, NULL, 0,
+SYSCTL_PROC(_debug_kdb, OID_AUTO, enter,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE, NULL, 0,
     kdb_sysctl_enter, "I", "set to enter the debugger");
 
-SYSCTL_PROC(_debug_kdb, OID_AUTO, panic, CTLTYPE_INT | CTLFLAG_RW, NULL, 0,
+SYSCTL_PROC(_debug_kdb, OID_AUTO, panic,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE, NULL, 0,
     kdb_sysctl_panic, "I", "set to panic the kernel");
 
-SYSCTL_PROC(_debug_kdb, OID_AUTO, trap, CTLTYPE_INT | CTLFLAG_RW, NULL, 0,
+SYSCTL_PROC(_debug_kdb, OID_AUTO, trap,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE, NULL, 0,
     kdb_sysctl_trap, "I", "set to cause a page fault via data access");
 
-SYSCTL_PROC(_debug_kdb, OID_AUTO, trap_code, CTLTYPE_INT | CTLFLAG_RW, NULL, 0,
+SYSCTL_PROC(_debug_kdb, OID_AUTO, trap_code,
+    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_SECURE, NULL, 0,
     kdb_sysctl_trap_code, "I", "set to cause a page fault via code access");
 
-SYSCTL_INT(_debug_kdb, OID_AUTO, break_to_debugger, CTLTYPE_INT | CTLFLAG_RW |
-    CTLFLAG_TUN, &kdb_break_to_debugger, 0, "Enable break to debugger");
+SYSCTL_INT(_debug_kdb, OID_AUTO, break_to_debugger,
+    CTLFLAG_RWTUN | CTLFLAG_SECURE,
+    &kdb_break_to_debugger, 0, "Enable break to debugger");
 TUNABLE_INT("debug.kdb.break_to_debugger", &kdb_break_to_debugger);
 
-SYSCTL_INT(_debug_kdb, OID_AUTO, alt_break_to_debugger, CTLTYPE_INT |
-    CTLFLAG_RW | CTLFLAG_TUN, &kdb_alt_break_to_debugger, 0,
-    "Enable alternative break to debugger");
+SYSCTL_INT(_debug_kdb, OID_AUTO, alt_break_to_debugger,
+    CTLFLAG_RWTUN | CTLFLAG_SECURE,
+    &kdb_alt_break_to_debugger, 0, "Enable alternative break to debugger");
 TUNABLE_INT("debug.kdb.alt_break_to_debugger", &kdb_alt_break_to_debugger);
 
 /*
@@ -498,6 +504,8 @@
 	if (!kdb_active || kdb_jmpbufp == NULL)
 		return;
 
+	printf("KDB: reentering\n");
+	kdb_backtrace();
 	longjmp(kdb_jmpbufp, 1);
 	/* NOTREACHED */
 }
@@ -508,12 +516,12 @@
 
 struct pcb *
 kdb_thr_ctx(struct thread *thr)
-{  
+{
 #if defined(SMP) && defined(KDB_STOPPEDPCB)
 	struct pcpu *pc;
 #endif
- 
-	if (thr == curthread) 
+
+	if (thr == curthread)
 		return (&kdb_pcb);
 
 #if defined(SMP) && defined(KDB_STOPPEDPCB)

Modified: trunk/sys/kern/subr_kobj.c
===================================================================
--- trunk/sys/kern/subr_kobj.c	2018-05-25 20:59:46 UTC (rev 9949)
+++ trunk/sys/kern/subr_kobj.c	2018-05-25 21:07:09 UTC (rev 9950)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 2000,2003 Doug Rabson
  * All rights reserved.
@@ -25,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/subr_kobj.c 318275 2017-05-14 14:21:11Z marius $");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
@@ -83,7 +84,7 @@
  * desc pointer is NULL, it is guaranteed never to match any read
  * descriptors.
  */
-static struct kobj_method null_method = {
+static const struct kobj_method null_method = {
 	0, 0,
 };
 
@@ -213,19 +214,11 @@
 {
 	kobj_method_t *ce;
 
-#ifdef KOBJ_STATS
-	/*
-	 * Correct for the 'hit' assumption in KOBJOPLOOKUP and record
-	 * a 'miss'.
-	 */
-	kobj_lookup_hits--;
-	kobj_lookup_misses++;
-#endif
-
 	ce = kobj_lookup_method_mi(cls, desc);
 	if (!ce)
-		ce = desc->deflt;
-	*cep = ce;
+		ce = &desc->deflt;
+	if (cep)
+		*cep = ce;
 	return ce;
 }
 

Modified: trunk/sys/kern/subr_lock.c
===================================================================
--- trunk/sys/kern/subr_lock.c	2018-05-25 20:59:46 UTC (rev 9949)
+++ trunk/sys/kern/subr_lock.c	2018-05-25 21:07:09 UTC (rev 9950)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 2006 John Baldwin <jhb at FreeBSD.org>
  * All rights reserved.
@@ -10,9 +11,6 @@
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
- * 3. Neither the name of the author nor the names of any co-contributors
- *    may be used to endorse or promote products derived from this software
- *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
@@ -33,7 +31,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/subr_lock.c 323870 2017-09-21 19:24:11Z marius $");
 
 #include "opt_ddb.h"
 #include "opt_mprof.h"
@@ -58,6 +56,7 @@
 #endif
 
 #include <machine/cpufunc.h>
+#include <machine/cpu.h>
 
 CTASSERT(LOCK_CLASS_MAX == 15);
 
@@ -66,6 +65,7 @@
 	&lock_class_mtx_sleep,
 	&lock_class_sx,
 	&lock_class_rm,
+	&lock_class_rm_sleepable,
 	&lock_class_rw,
 	&lock_class_lockmgr,
 };
@@ -77,8 +77,8 @@
 	int i;
 
 	/* Check for double-init and zero object. */
-	KASSERT(!lock_initalized(lock), ("lock \"%s\" %p already initialized",
-	    name, lock));
+	KASSERT(flags & LO_NEW || !lock_initalized(lock),
+	    ("lock \"%s\" %p already initialized", name, lock));
 
 	/* Look up lock class to find its index. */
 	for (i = 0; i < LOCK_CLASS_MAX; i++)
@@ -105,6 +105,34 @@
 	lock->lo_flags &= ~LO_INITIALIZED;
 }
 
+void
+lock_delay(struct lock_delay_arg *la)
+{
+	u_int i, delay, backoff, min, max;
+	struct lock_delay_config *lc = la->config;
+
+	delay = la->delay;
+
+	if (delay == 0)
+		delay = lc->initial;
+	else {
+		delay += lc->step;
+		max = lc->max;
+		if (delay > max)
+			delay = max;
+	}
+
+	backoff = cpu_ticks() % delay;
+	min = lc->min;
+	if (backoff < min)
+		backoff = min;
+	for (i = 0; i < backoff; i++)
+		cpu_spinwait();
+
+	la->delay = delay;
+	la->spin_cnt += backoff;
+}
+
 #ifdef DDB
 DB_SHOW_COMMAND(lock, db_show_lock)
 {
@@ -240,34 +268,13 @@
 }
 SYSINIT(lockprof, SI_SUB_SMP, SI_ORDER_ANY, lock_prof_init, NULL);
 
-/*
- * To be certain that lock profiling has idled on all cpus before we
- * reset, we schedule the resetting thread on all active cpus.  Since
- * all operations happen within critical sections we can be sure that
- * it is safe to zero the profiling structures.
- */
 static void
-lock_prof_idle(void)
-{
-	struct thread *td;
-	int cpu;
-
-	td = curthread;
-	thread_lock(td);
-	CPU_FOREACH(cpu) {
-		sched_bind(td, cpu);
-	}
-	sched_unbind(td);
-	thread_unlock(td);
-}
-
-static void
 lock_prof_reset_wait(void)
 {
 
 	/*
-	 * Spin relinquishing our cpu so that lock_prof_idle may
-	 * run on it.
+	 * Spin relinquishing our cpu so that quiesce_all_cpus may
+	 * complete.
 	 */
 	while (lock_prof_resetting)
 		sched_relinquish(curthread);
@@ -289,7 +296,7 @@
 	atomic_store_rel_int(&lock_prof_resetting, 1);
 	enabled = lock_prof_enable;
 	lock_prof_enable = 0;
-	lock_prof_idle();
+	quiesce_all_cpus("profreset", 0);
 	/*
 	 * Some objects may have migrated between CPUs.  Clear all links
 	 * before we zero the structures.  Some items may still be linked
@@ -401,7 +408,7 @@
 	    "max", "wait_max", "total", "wait_total", "count", "avg", "wait_avg", "cnt_hold", "cnt_lock", "name");
 	enabled = lock_prof_enable;
 	lock_prof_enable = 0;
-	lock_prof_idle();
+	quiesce_all_cpus("profstat", 0);
 	t = ticks;
 	for (cpu = 0; cpu <= mp_maxid; cpu++) {
 		if (lp_cpu[cpu] == NULL)
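
lock_delay() centralizes the randomized, bounded exponential backoff that
spinning lock primitives use. A hedged sketch of the intended loop shape;
my_delay_config and the lock's state word are hypothetical, and the config
(initial/step/min/max) is assumed to be initialized elsewhere:

static struct lock_delay_config my_delay_config;

static void
my_lock_acquire(volatile u_int *state)
{
	struct lock_delay_arg lda = { .config = &my_delay_config };

	/* Spin with a growing, cpu_ticks()-randomized backoff per try. */
	while (atomic_cmpset_acq_int(state, 0, 1) == 0)
		lock_delay(&lda);
}
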

Modified: trunk/sys/kern/subr_log.c
===================================================================
--- trunk/sys/kern/subr_log.c	2018-05-25 20:59:46 UTC (rev 9949)
+++ trunk/sys/kern/subr_log.c	2018-05-25 21:07:09 UTC (rev 9950)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1982, 1986, 1993
  *	The Regents of the University of California.  All rights reserved.
@@ -34,7 +35,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/subr_log.c 247798 2013-03-04 16:07:55Z davide $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -117,8 +118,8 @@
 		return (EBUSY);
 	}
 	log_open = 1;
-	callout_reset(&logsoftc.sc_callout, hz / log_wakeups_per_second,
-	    logtimeout, NULL);
+	callout_reset_sbt(&logsoftc.sc_callout,
+	    SBT_1S / log_wakeups_per_second, 0, logtimeout, NULL, C_PREL(1));
 	mtx_unlock(&msgbuf_lock);
 
 	fsetown(td->td_proc->p_pid, &logsoftc.sc_sigio);	/* signal process only */
@@ -233,15 +234,8 @@
 
 	if (!log_open)
 		return;
-	if (log_wakeups_per_second < 1) {
-		printf("syslog wakeup is less than one.  Adjusting to 1.\n");
-		log_wakeups_per_second = 1;
-	}
-	if (msgbuftrigger == 0) {
-		callout_schedule(&logsoftc.sc_callout,
-		    hz / log_wakeups_per_second);
-		return;
-	}
+	if (msgbuftrigger == 0)
+		goto done;
 	msgbuftrigger = 0;
 	selwakeuppri(&logsoftc.sc_selp, LOG_RDPRI);
 	KNOTE_LOCKED(&logsoftc.sc_selp.si_note, 0);
@@ -248,7 +242,13 @@
 	if ((logsoftc.sc_state & LOG_ASYNC) && logsoftc.sc_sigio != NULL)
 		pgsigio(&logsoftc.sc_sigio, SIGIO, 0);
 	cv_broadcastpri(&log_wakeup, LOG_RDPRI);
-	callout_schedule(&logsoftc.sc_callout, hz / log_wakeups_per_second);
+done:
+	if (log_wakeups_per_second < 1) {
+		printf("syslog wakeup is less than one.  Adjusting to 1.\n");
+		log_wakeups_per_second = 1;
+	}
+	callout_reset_sbt(&logsoftc.sc_callout,
+	    SBT_1S / log_wakeups_per_second, 0, logtimeout, NULL, C_PREL(1));
 }
 
 /*ARGSUSED*/
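
The switch from tick-based callout_reset() to callout_reset_sbt() states
the period directly in sbintime_t units, and C_PREL(1) grants a precision
tolerance of half the period so the wakeup can be coalesced with others.
The rearm pattern in isolation (sc and my_tick are placeholders):

	sbintime_t period;

	period = SBT_1S / 10;		/* 100 ms between wakeups */
	callout_reset_sbt(&sc->sc_callout, period, 0, my_tick, sc,
	    C_PREL(1));
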

Modified: trunk/sys/kern/subr_mbpool.c
===================================================================
--- trunk/sys/kern/subr_mbpool.c	2018-05-25 20:59:46 UTC (rev 9949)
+++ trunk/sys/kern/subr_mbpool.c	2018-05-25 21:07:09 UTC (rev 9950)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 2003
  *	Fraunhofer Institute for Open Communication Systems (FhG Fokus).
@@ -28,7 +29,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/subr_mbpool.c 302234 2016-06-27 21:50:30Z bdrewery $");
 
 #include <sys/param.h>
 #include <sys/lock.h>
@@ -40,6 +41,7 @@
 
 #include <machine/bus.h>
 
+#include <sys/mbuf.h>
 #include <sys/mbpool.h>
 
 MODULE_VERSION(libmbpool, 1);
@@ -209,16 +211,13 @@
 	pg = &p->pages[p->npages];
 
 	error = bus_dmamem_alloc(p->dmat, &pg->va, BUS_DMA_NOWAIT, &pg->map);
-	if (error != 0) {
-		free(pg, M_MBPOOL);
+	if (error != 0)
 		return;
-	}
 
 	error = bus_dmamap_load(p->dmat, pg->map, pg->va, p->page_size,
 	    mbp_callback, &pg->phy, 0);
 	if (error != 0) {
 		bus_dmamem_free(p->dmat, pg->va, pg->map);
-		free(pg, M_MBPOOL);
 		return;
 	}
 
@@ -282,14 +281,16 @@
 /*
  * Mbuf system external mbuf free routine
  */
-void
-mbp_ext_free(void *buf, void *arg)
+int
+mbp_ext_free(struct mbuf *m, void *buf, void *arg)
 {
 	mbp_free(arg, buf);
+
+	return (EXT_FREE_OK);
 }
 
 /*
- * Free all buffers that are marked as beeing on the card
+ * Free all buffers that are marked as being on the card
  */
 void
 mbp_card_free(struct mbpool *p)

Modified: trunk/sys/kern/subr_mchain.c
===================================================================
--- trunk/sys/kern/subr_mchain.c	2018-05-25 20:59:46 UTC (rev 9949)
+++ trunk/sys/kern/subr_mchain.c	2018-05-25 21:07:09 UTC (rev 9950)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 2000, 2001 Boris Popov
  * All rights reserved.
@@ -28,7 +29,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/subr_mchain.c 302234 2016-06-27 21:50:30Z bdrewery $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -59,10 +60,10 @@
 {
 	struct mbuf *m;
 
-	m = m_gethdr(M_WAIT, MT_DATA);
+	m = m_gethdr(M_WAITOK, MT_DATA);
 	m->m_len = 0;
 	mb_initm(mbp, m);
-	return 0;
+	return (0);
 }
 
 void
@@ -89,19 +90,19 @@
 
 	m = mbp->mb_top;
 	mbp->mb_top = NULL;
-	return m;
+	return (m);
 }
 
 int
 mb_fixhdr(struct mbchain *mbp)
 {
-	return mbp->mb_top->m_pkthdr.len = m_fixhdr(mbp->mb_top);
+	return (mbp->mb_top->m_pkthdr.len = m_fixhdr(mbp->mb_top));
 }
 
 /*
  * Check whether an object of size 'size' fits at the current position and
  * allocate new mbuf if not. Advance pointers and increase length of mbuf(s).
- * Return pointer to the object placeholder or NULL if any error occured.
+ * Return pointer to the object placeholder or NULL if any error occurred.
  * Note: size should be <= MLEN 
  */
 caddr_t
@@ -114,7 +115,7 @@
 		panic("mb_reserve: size = %d\n", size);
 	m = mbp->mb_cur;
 	if (mbp->mb_mleft < size) {
-		mn = m_get(M_WAIT, MT_DATA);
+		mn = m_get(M_WAITOK, MT_DATA);
 		mbp->mb_cur = m->m_next = mn;
 		m = mn;
 		m->m_len = 0;
@@ -124,7 +125,7 @@
 	mbp->mb_count += size;
 	bpos = mtod(m, caddr_t) + m->m_len;
 	m->m_len += size;
-	return bpos;
+	return (bpos);
 }
 
 int
@@ -131,21 +132,21 @@
 mb_put_padbyte(struct mbchain *mbp)
 {
 	caddr_t dst;
-	char x = 0;
+	uint8_t x = 0;
 
 	dst = mtod(mbp->mb_cur, caddr_t) + mbp->mb_cur->m_len;
 
-	/* only add padding if address is odd */
+	/* Only add padding if address is odd */
 	if ((unsigned long)dst & 1)
-		return mb_put_mem(mbp, (caddr_t)&x, 1, MB_MSYSTEM);
+		return (mb_put_mem(mbp, (caddr_t)&x, sizeof(x), MB_MSYSTEM));
 	else
-	return 0;
+		return (0);
 }
 
 int
 mb_put_uint8(struct mbchain *mbp, uint8_t x)
 {
-	return mb_put_mem(mbp, (caddr_t)&x, sizeof(x), MB_MSYSTEM);
+	return (mb_put_mem(mbp, (caddr_t)&x, sizeof(x), MB_MSYSTEM));
 }
 
 int
@@ -152,7 +153,7 @@
 mb_put_uint16be(struct mbchain *mbp, uint16_t x)
 {
 	x = htobe16(x);
-	return mb_put_mem(mbp, (caddr_t)&x, sizeof(x), MB_MSYSTEM);
+	return (mb_put_mem(mbp, (caddr_t)&x, sizeof(x), MB_MSYSTEM));
 }
 
 int
@@ -159,7 +160,7 @@
 mb_put_uint16le(struct mbchain *mbp, uint16_t x)
 {
 	x = htole16(x);
-	return mb_put_mem(mbp, (caddr_t)&x, sizeof(x), MB_MSYSTEM);
+	return (mb_put_mem(mbp, (caddr_t)&x, sizeof(x), MB_MSYSTEM));
 }
 
 int
@@ -166,7 +167,7 @@
 mb_put_uint32be(struct mbchain *mbp, uint32_t x)
 {
 	x = htobe32(x);
-	return mb_put_mem(mbp, (caddr_t)&x, sizeof(x), MB_MSYSTEM);
+	return (mb_put_mem(mbp, (caddr_t)&x, sizeof(x), MB_MSYSTEM));
 }
 
 int
@@ -173,7 +174,7 @@
 mb_put_uint32le(struct mbchain *mbp, uint32_t x)
 {
 	x = htole32(x);
-	return mb_put_mem(mbp, (caddr_t)&x, sizeof(x), MB_MSYSTEM);
+	return (mb_put_mem(mbp, (caddr_t)&x, sizeof(x), MB_MSYSTEM));
 }
 
 int
@@ -180,7 +181,7 @@
 mb_put_int64be(struct mbchain *mbp, int64_t x)
 {
 	x = htobe64(x);
-	return mb_put_mem(mbp, (caddr_t)&x, sizeof(x), MB_MSYSTEM);
+	return (mb_put_mem(mbp, (caddr_t)&x, sizeof(x), MB_MSYSTEM));
 }
 
 int
@@ -187,7 +188,7 @@
 mb_put_int64le(struct mbchain *mbp, int64_t x)
 {
 	x = htole64(x);
-	return mb_put_mem(mbp, (caddr_t)&x, sizeof(x), MB_MSYSTEM);
+	return (mb_put_mem(mbp, (caddr_t)&x, sizeof(x), MB_MSYSTEM));
 }
 
 int
@@ -205,7 +206,7 @@
 	while (size > 0) {
 		if (mleft == 0) {
 			if (m->m_next == NULL)
-				m = m_getm(m, size, M_WAIT, MT_DATA);
+				m = m_getm(m, size, M_WAITOK, MT_DATA);
 			else
 				m = m->m_next;
 			mleft = M_TRAILINGSPACE(m);
@@ -220,7 +221,7 @@
 			dstlen = mleft;
 			error = mbp->mb_copy(mbp, source, dst, &srclen, &dstlen);
 			if (error)
-				return error;
+				return (error);
 			break;
 		    case MB_MINLINE:
 			for (src = source, count = cplen; count; count--)
@@ -232,7 +233,7 @@
 		    case MB_MUSER:
 			error = copyin(source, dst, cplen);
 			if (error)
-				return error;
+				return (error);
 			break;
 		    case MB_MZERO:
 			bzero(dst, cplen);
@@ -246,7 +247,7 @@
 	}
 	mbp->mb_cur = m;
 	mbp->mb_mleft = mleft;
-	return 0;
+	return (0);
 }
 
 int
@@ -261,7 +262,7 @@
 	}
 	mbp->mb_mleft = M_TRAILINGSPACE(m);
 	mbp->mb_cur = m;
-	return 0;
+	return (0);
 }
 
 /*
@@ -277,7 +278,7 @@
 
 	while (size > 0 && uiop->uio_resid) {
 		if (uiop->uio_iovcnt <= 0 || uiop->uio_iov == NULL)
-			return EFBIG;
+			return (EFBIG);
 		left = uiop->uio_iov->iov_len;
 		if (left == 0) {
 			uiop->uio_iov++;
@@ -288,7 +289,7 @@
 			left = size;
 		error = mb_put_mem(mbp, uiop->uio_iov->iov_base, left, mtype);
 		if (error)
-			return error;
+			return (error);
 		uiop->uio_offset += left;
 		uiop->uio_resid -= left;
 		uiop->uio_iov->iov_base =
@@ -296,7 +297,7 @@
 		uiop->uio_iov->iov_len -= left;
 		size -= left;
 	}
-	return 0;
+	return (0);
 }
 
 /*
@@ -307,10 +308,10 @@
 {
 	struct mbuf *m;
 
-	m = m_gethdr(M_WAIT, MT_DATA);
+	m = m_gethdr(M_WAITOK, MT_DATA);
 	m->m_len = 0;
 	md_initm(mdp, m);
-	return 0;
+	return (0);
 }
 
 void
@@ -360,25 +361,25 @@
 	struct mbuf *m;
 
 	if (mdp->md_top == NULL)
-		return ENOENT;
+		return (ENOENT);
 	m = mdp->md_top->m_nextpkt;
 	md_done(mdp);
 	if (m == NULL)
-		return ENOENT;
+		return (ENOENT);
 	md_initm(mdp, m);
-	return 0;
+	return (0);
 }
 
 int
 md_get_uint8(struct mdchain *mdp, uint8_t *x)
 {
-	return md_get_mem(mdp, x, 1, MB_MINLINE);
+	return (md_get_mem(mdp, x, 1, MB_MINLINE));
 }
 
 int
 md_get_uint16(struct mdchain *mdp, uint16_t *x)
 {
-	return md_get_mem(mdp, (caddr_t)x, 2, MB_MINLINE);
+	return (md_get_mem(mdp, (caddr_t)x, 2, MB_MINLINE));
 }
 
 int
@@ -389,7 +390,7 @@
 
 	if (x != NULL)
 		*x = le16toh(v);
-	return error;
+	return (error);
 }
 
 int
@@ -400,13 +401,13 @@
 
 	if (x != NULL)
 		*x = be16toh(v);
-	return error;
+	return (error);
 }
 
 int
 md_get_uint32(struct mdchain *mdp, uint32_t *x)
 {
-	return md_get_mem(mdp, (caddr_t)x, 4, MB_MINLINE);
+	return (md_get_mem(mdp, (caddr_t)x, 4, MB_MINLINE));
 }
 
 int
@@ -418,7 +419,7 @@
 	error = md_get_uint32(mdp, &v);
 	if (x != NULL)
 		*x = be32toh(v);
-	return error;
+	return (error);
 }
 
 int
@@ -430,13 +431,13 @@
 	error = md_get_uint32(mdp, &v);
 	if (x != NULL)
 		*x = le32toh(v);
-	return error;
+	return (error);
 }
 
 int
 md_get_int64(struct mdchain *mdp, int64_t *x)
 {
-	return md_get_mem(mdp, (caddr_t)x, 8, MB_MINLINE);
+	return (md_get_mem(mdp, (caddr_t)x, 8, MB_MINLINE));
 }
 
 int
@@ -448,7 +449,7 @@
 	error = md_get_int64(mdp, &v);
 	if (x != NULL)
 		*x = be64toh(v);
-	return error;
+	return (error);
 }
 
 int
@@ -460,7 +461,7 @@
 	error = md_get_int64(mdp, &v);
 	if (x != NULL)
 		*x = le64toh(v);
-	return error;
+	return (error);
 }
 
 int
@@ -474,7 +475,7 @@
 	while (size > 0) {
 		if (m == NULL) {
 			MBERROR("incomplete copy\n");
-			return EBADRPC;
+			return (EBADRPC);
 		}
 		s = mdp->md_pos;
 		count = mtod(m, u_char*) + m->m_len - s;
@@ -506,7 +507,7 @@
 		}
 		target += count;
 	}
-	return 0;
+	return (0);
 }
 
 int
@@ -514,10 +515,10 @@
 {
 	struct mbuf *m = mdp->md_cur, *rm;
 
-	rm = m_copym(m, mdp->md_pos - mtod(m, u_char*), size, M_WAIT);
+	rm = m_copym(m, mdp->md_pos - mtod(m, u_char*), size, M_WAITOK);
 	md_get_mem(mdp, NULL, size, MB_MZERO);
 	*ret = rm;
-	return 0;
+	return (0);
 }
 
 int
@@ -530,7 +531,7 @@
 	mtype = (uiop->uio_segflg == UIO_SYSSPACE) ? MB_MSYSTEM : MB_MUSER;
 	while (size > 0 && uiop->uio_resid) {
 		if (uiop->uio_iovcnt <= 0 || uiop->uio_iov == NULL)
-			return EFBIG;
+			return (EFBIG);
 		left = uiop->uio_iov->iov_len;
 		if (left == 0) {
 			uiop->uio_iov++;
@@ -542,7 +543,7 @@
 			left = size;
 		error = md_get_mem(mdp, uiocp, left, mtype);
 		if (error)
-			return error;
+			return (error);
 		uiop->uio_offset += left;
 		uiop->uio_resid -= left;
 		uiop->uio_iov->iov_base =
@@ -550,5 +551,5 @@
 		uiop->uio_iov->iov_len -= left;
 		size -= left;
 	}
-	return 0;
+	return (0);
 }
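
The mb_put_*() encoders and md_get_*() decoders form a small serializer
over mbuf chains; a sketch of the encoding side (the payload parameters
are hypothetical):

static struct mbuf *
encode_request(caddr_t payload, int paylen)
{
	struct mbchain mb;

	mb_init(&mb);
	mb_put_uint16le(&mb, 0x0001);		/* command, little-endian */
	mb_put_uint32be(&mb, paylen);		/* length, big-endian */
	mb_put_mem(&mb, payload, paylen, MB_MSYSTEM);
	return (mb_detach(&mb));		/* caller owns the chain */
}
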

Modified: trunk/sys/kern/subr_module.c
===================================================================
--- trunk/sys/kern/subr_module.c	2018-05-25 20:59:46 UTC (rev 9949)
+++ trunk/sys/kern/subr_module.c	2018-05-25 21:07:09 UTC (rev 9950)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 1998 Michael Smith
  * All rights reserved.
@@ -25,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: stable/10/sys/kern/subr_module.c 218494 2011-02-09 19:08:21Z marcel $");
 
 #include <sys/param.h>
 #include <sys/systm.h>

Modified: trunk/sys/kern/subr_msgbuf.c
===================================================================
--- trunk/sys/kern/subr_msgbuf.c	2018-05-25 20:59:46 UTC (rev 9949)
+++ trunk/sys/kern/subr_msgbuf.c	2018-05-25 21:07:09 UTC (rev 9950)
@@ -1,3 +1,4 @@
+/* $MidnightBSD$ */
 /*-
  * Copyright (c) 2003 Ian Dowse.  All rights reserved.
  *
@@ -22,7 +23,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $MidnightBSD$
+ * $FreeBSD: stable/10/sys/kern/subr_msgbuf.c 302234 2016-06-27 21:50:30Z bdrewery $
  */
 
 /*
@@ -50,7 +51,7 @@
 
 /*
  * Timestamps in msgbuf are useful when trying to diagnose when core dumps
- * or other actions occured.
+ * or other actions occurred.
  */
 static int msgbuf_show_timestamp = 0;
 SYSCTL_INT(_kern, OID_AUTO, msgbuf_show_timestamp, CTLFLAG_RW | CTLFLAG_TUN,

Added: trunk/sys/kern/subr_nvlist.c
===================================================================
--- trunk/sys/kern/subr_nvlist.c	                        (rev 0)
+++ trunk/sys/kern/subr_nvlist.c	2018-05-25 21:07:09 UTC (rev 9950)
@@ -0,0 +1,1476 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 2009-2013 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/10/sys/kern/subr_nvlist.c 292973 2015-12-31 03:28:14Z ngie $");
+
+#include <sys/param.h>
+#include <sys/endian.h>
+#include <sys/queue.h>
+
+#ifdef _KERNEL
+
+#include <sys/errno.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/systm.h>
+
+#include <machine/stdarg.h>
+
+#else
+#include <sys/socket.h>
+
+#include <errno.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdint.h>
+#define	_WITH_DPRINTF
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "msgio.h"
+#endif
+
+#ifdef HAVE_PJDLOG
+#include <pjdlog.h>
+#endif
+
+#include <sys/nv.h>
+#include <sys/nv_impl.h>
+#include <sys/nvlist_impl.h>
+#include <sys/nvpair_impl.h>
+
+#ifndef	HAVE_PJDLOG
+#ifdef _KERNEL
+#define	PJDLOG_ASSERT(...)		MPASS(__VA_ARGS__)
+#define	PJDLOG_RASSERT(expr, ...)	KASSERT(expr, (__VA_ARGS__))
+#define	PJDLOG_ABORT(...)		panic(__VA_ARGS__)
+#else
+#include <assert.h>
+#define	PJDLOG_ASSERT(...)		assert(__VA_ARGS__)
+#define	PJDLOG_RASSERT(expr, ...)	assert(expr)
+#define	PJDLOG_ABORT(...)		do {				\
+	fprintf(stderr, "%s:%u: ", __FILE__, __LINE__);			\
+	fprintf(stderr, __VA_ARGS__);					\
+	fprintf(stderr, "\n");						\
+	abort();							\
+} while (0)
+#endif
+#endif
+
+#define	NV_FLAG_PRIVATE_MASK	(NV_FLAG_BIG_ENDIAN)
+#define	NV_FLAG_PUBLIC_MASK	(NV_FLAG_IGNORE_CASE)
+#define	NV_FLAG_ALL_MASK	(NV_FLAG_PRIVATE_MASK | NV_FLAG_PUBLIC_MASK)
+
+#define	NVLIST_MAGIC	0x6e766c	/* "nvl" */
+struct nvlist {
+	int		 nvl_magic;
+	int		 nvl_error;
+	int		 nvl_flags;
+	nvpair_t	*nvl_parent;
+	struct nvl_head	 nvl_head;
+};
+
+#define	NVLIST_ASSERT(nvl)	do {					\
+	PJDLOG_ASSERT((nvl) != NULL);					\
+	PJDLOG_ASSERT((nvl)->nvl_magic == NVLIST_MAGIC);		\
+} while (0)
+
+#ifdef _KERNEL
+MALLOC_DEFINE(M_NVLIST, "nvlist", "kernel nvlist");
+#endif
+
+#define	NVPAIR_ASSERT(nvp)	nvpair_assert(nvp)
+
+#define	NVLIST_HEADER_MAGIC	0x6c
+#define	NVLIST_HEADER_VERSION	0x00
+struct nvlist_header {
+	uint8_t		nvlh_magic;
+	uint8_t		nvlh_version;
+	uint8_t		nvlh_flags;
+	uint64_t	nvlh_descriptors;
+	uint64_t	nvlh_size;
+} __packed;
+
+nvlist_t *
+nvlist_create(int flags)
+{
+	nvlist_t *nvl;
+
+	PJDLOG_ASSERT((flags & ~(NV_FLAG_PUBLIC_MASK)) == 0);
+
+	nvl = nv_malloc(sizeof(*nvl));
+	nvl->nvl_error = 0;
+	nvl->nvl_flags = flags;
+	nvl->nvl_parent = NULL;
+	TAILQ_INIT(&nvl->nvl_head);
+	nvl->nvl_magic = NVLIST_MAGIC;
+
+	return (nvl);
+}
+
+void
+nvlist_destroy(nvlist_t *nvl)
+{
+	nvpair_t *nvp;
+	int serrno;
+
+	if (nvl == NULL)
+		return;
+
+	SAVE_ERRNO(serrno);
+
+	NVLIST_ASSERT(nvl);
+
+	while ((nvp = nvlist_first_nvpair(nvl)) != NULL) {
+		nvlist_remove_nvpair(nvl, nvp);
+		nvpair_free(nvp);
+	}
+	nvl->nvl_magic = 0;
+	nv_free(nvl);
+
+	RESTORE_ERRNO(serrno);
+}
+
+void
+nvlist_set_error(nvlist_t *nvl, int error)
+{
+
+	PJDLOG_ASSERT(error != 0);
+
+	/*
+	 * Check for error != 0 so that we don't do the wrong thing if somebody
+	 * tries to abuse this API when asserts are disabled.
+	 */
+	if (nvl != NULL && error != 0 && nvl->nvl_error == 0)
+		nvl->nvl_error = error;
+}
+
+int
+nvlist_error(const nvlist_t *nvl)
+{
+
+	if (nvl == NULL)
+		return (ENOMEM);
+
+	NVLIST_ASSERT(nvl);
+
+	return (nvl->nvl_error);
+}
+
+nvpair_t *
+nvlist_get_nvpair_parent(const nvlist_t *nvl)
+{
+
+	NVLIST_ASSERT(nvl);
+
+	return (nvl->nvl_parent);
+}
+
+const nvlist_t *
+nvlist_get_parent(const nvlist_t *nvl, void **cookiep)
+{
+	nvpair_t *nvp;
+
+	NVLIST_ASSERT(nvl);
+
+	nvp = nvl->nvl_parent;
+	if (cookiep != NULL)
+		*cookiep = nvp;
+	if (nvp == NULL)
+		return (NULL);
+
+	return (nvpair_nvlist(nvp));
+}
+
+void
+nvlist_set_parent(nvlist_t *nvl, nvpair_t *parent)
+{
+
+	NVLIST_ASSERT(nvl);
+
+	nvl->nvl_parent = parent;
+}
+
+bool
+nvlist_empty(const nvlist_t *nvl)
+{
+
+	NVLIST_ASSERT(nvl);
+	PJDLOG_ASSERT(nvl->nvl_error == 0);
+
+	return (nvlist_first_nvpair(nvl) == NULL);
+}
+
+int
+nvlist_flags(const nvlist_t *nvl)
+{
+
+	NVLIST_ASSERT(nvl);
+	PJDLOG_ASSERT(nvl->nvl_error == 0);
+	PJDLOG_ASSERT((nvl->nvl_flags & ~(NV_FLAG_PUBLIC_MASK)) == 0);
+
+	return (nvl->nvl_flags);
+}
+
+static void
+nvlist_report_missing(int type, const char *name)
+{
+
+	PJDLOG_ABORT("Element '%s' of type %s doesn't exist.",
+	    name, nvpair_type_string(type));
+}
+
+static nvpair_t *
+nvlist_find(const nvlist_t *nvl, int type, const char *name)
+{
+	nvpair_t *nvp;
+
+	NVLIST_ASSERT(nvl);
+	PJDLOG_ASSERT(nvl->nvl_error == 0);
+	PJDLOG_ASSERT(type == NV_TYPE_NONE ||
+	    (type >= NV_TYPE_FIRST && type <= NV_TYPE_LAST));
+
+	for (nvp = nvlist_first_nvpair(nvl); nvp != NULL;
+	    nvp = nvlist_next_nvpair(nvl, nvp)) {
+		if (type != NV_TYPE_NONE && nvpair_type(nvp) != type)
+			continue;
+		if ((nvl->nvl_flags & NV_FLAG_IGNORE_CASE) != 0) {
+			if (strcasecmp(nvpair_name(nvp), name) != 0)
+				continue;
+		} else {
+			if (strcmp(nvpair_name(nvp), name) != 0)
+				continue;
+		}
+		break;
+	}
+
+	if (nvp == NULL)
+		RESTORE_ERRNO(ENOENT);
+
+	return (nvp);
+}
+
+bool
+nvlist_exists_type(const nvlist_t *nvl, const char *name, int type)
+{
+
+	NVLIST_ASSERT(nvl);
+	PJDLOG_ASSERT(nvl->nvl_error == 0);
+	PJDLOG_ASSERT(type == NV_TYPE_NONE ||
+	    (type >= NV_TYPE_FIRST && type <= NV_TYPE_LAST));
+
+	return (nvlist_find(nvl, type, name) != NULL);
+}
+
+void
+nvlist_free_type(nvlist_t *nvl, const char *name, int type)
+{
+	nvpair_t *nvp;
+
+	NVLIST_ASSERT(nvl);
+	PJDLOG_ASSERT(nvl->nvl_error == 0);
+	PJDLOG_ASSERT(type == NV_TYPE_NONE ||
+	    (type >= NV_TYPE_FIRST && type <= NV_TYPE_LAST));
+
+	nvp = nvlist_find(nvl, type, name);
+	if (nvp != NULL)
+		nvlist_free_nvpair(nvl, nvp);
+	else
+		nvlist_report_missing(type, name);
+}
+
+nvlist_t *
+nvlist_clone(const nvlist_t *nvl)
+{
+	nvlist_t *newnvl;
+	nvpair_t *nvp, *newnvp;
+
+	NVLIST_ASSERT(nvl);
+
+	if (nvl->nvl_error != 0) {
+		RESTORE_ERRNO(nvl->nvl_error);
+		return (NULL);
+	}
+
+	newnvl = nvlist_create(nvl->nvl_flags & NV_FLAG_PUBLIC_MASK);
+	for (nvp = nvlist_first_nvpair(nvl); nvp != NULL;
+	    nvp = nvlist_next_nvpair(nvl, nvp)) {
+		newnvp = nvpair_clone(nvp);
+		if (newnvp == NULL)
+			break;
+		nvlist_move_nvpair(newnvl, newnvp);
+	}
+	if (nvp != NULL) {
+		nvlist_destroy(newnvl);
+		return (NULL);
+	}
+	return (newnvl);
+}
+
+#ifndef _KERNEL
+static bool
+nvlist_dump_error_check(const nvlist_t *nvl, int fd, int level)
+{
+
+	if (nvlist_error(nvl) != 0) {
+		dprintf(fd, "%*serror: %d\n", level * 4, "",
+		    nvlist_error(nvl));
+		return (true);
+	}
+
+	return (false);
+}
+
+/*
+ * Dump the contents of the nvlist to the given file descriptor.
+ */
+void
+nvlist_dump(const nvlist_t *nvl, int fd)
+{
+	const nvlist_t *tmpnvl;
+	nvpair_t *nvp, *tmpnvp;
+	void *cookie;
+	int level;
+
+	level = 0;
+	if (nvlist_dump_error_check(nvl, fd, level))
+		return;
+
+	nvp = nvlist_first_nvpair(nvl);
+	while (nvp != NULL) {
+		dprintf(fd, "%*s%s (%s):", level * 4, "", nvpair_name(nvp),
+		    nvpair_type_string(nvpair_type(nvp)));
+		switch (nvpair_type(nvp)) {
+		case NV_TYPE_NULL:
+			dprintf(fd, " null\n");
+			break;
+		case NV_TYPE_BOOL:
+			dprintf(fd, " %s\n", nvpair_get_bool(nvp) ?
+			    "TRUE" : "FALSE");
+			break;
+		case NV_TYPE_NUMBER:
+			dprintf(fd, " %ju (%jd) (0x%jx)\n",
+			    (uintmax_t)nvpair_get_number(nvp),
+			    (intmax_t)nvpair_get_number(nvp),
+			    (uintmax_t)nvpair_get_number(nvp));
+			break;
+		case NV_TYPE_STRING:
+			dprintf(fd, " [%s]\n", nvpair_get_string(nvp));
+			break;
+		case NV_TYPE_NVLIST:
+			dprintf(fd, "\n");
+			tmpnvl = nvpair_get_nvlist(nvp);
+			if (nvlist_dump_error_check(tmpnvl, fd, level + 1))
+				break;
+			tmpnvp = nvlist_first_nvpair(tmpnvl);
+			if (tmpnvp != NULL) {
+				nvl = tmpnvl;
+				nvp = tmpnvp;
+				level++;
+				continue;
+			}
+			break;
+		case NV_TYPE_DESCRIPTOR:
+			dprintf(fd, " %d\n", nvpair_get_descriptor(nvp));
+			break;
+		case NV_TYPE_BINARY:
+		    {
+			const unsigned char *binary;
+			unsigned int ii;
+			size_t size;
+
+			binary = nvpair_get_binary(nvp, &size);
+			dprintf(fd, " %zu ", size);
+			for (ii = 0; ii < size; ii++)
+				dprintf(fd, "%02hhx", binary[ii]);
+			dprintf(fd, "\n");
+			break;
+		    }
+		default:
+			PJDLOG_ABORT("Unknown type: %d.", nvpair_type(nvp));
+		}
+
+		while ((nvp = nvlist_next_nvpair(nvl, nvp)) == NULL) {
+			cookie = NULL;
+			nvl = nvlist_get_parent(nvl, &cookie);
+			if (nvl == NULL)
+				return;
+			nvp = cookie;
+			level--;
+		}
+	}
+}
+
+void
+nvlist_fdump(const nvlist_t *nvl, FILE *fp)
+{
+
+	fflush(fp);
+	nvlist_dump(nvl, fileno(fp));
+}
+#endif
+
+/*
+ * Return the size the nvlist will occupy after nvlist_pack().
+ */
+size_t
+nvlist_size(const nvlist_t *nvl)
+{
+	const nvlist_t *tmpnvl;
+	const nvpair_t *nvp, *tmpnvp;
+	void *cookie;
+	size_t size;
+
+	NVLIST_ASSERT(nvl);
+	PJDLOG_ASSERT(nvl->nvl_error == 0);
+
+	size = sizeof(struct nvlist_header);
+	nvp = nvlist_first_nvpair(nvl);
+	while (nvp != NULL) {
+		size += nvpair_header_size();
+		size += strlen(nvpair_name(nvp)) + 1;
+		if (nvpair_type(nvp) == NV_TYPE_NVLIST) {
+			size += sizeof(struct nvlist_header);
+			size += nvpair_header_size() + 1;
+			tmpnvl = nvpair_get_nvlist(nvp);
+			PJDLOG_ASSERT(tmpnvl->nvl_error == 0);
+			tmpnvp = nvlist_first_nvpair(tmpnvl);
+			if (tmpnvp != NULL) {
+				nvl = tmpnvl;
+				nvp = tmpnvp;
+				continue;
+			}
+		} else {
+			size += nvpair_size(nvp);
+		}
+
+		while ((nvp = nvlist_next_nvpair(nvl, nvp)) == NULL) {
+			cookie = NULL;
+			nvl = nvlist_get_parent(nvl, &cookie);
+			if (nvl == NULL)
+				goto out;
+			nvp = cookie;
+		}
+	}
+
+out:
+	return (size);
+}
+
+#ifndef _KERNEL
+static int *
+nvlist_xdescriptors(const nvlist_t *nvl, int *descs, int level)
+{
+	const nvpair_t *nvp;
+
+	NVLIST_ASSERT(nvl);
+	PJDLOG_ASSERT(nvl->nvl_error == 0);
+	PJDLOG_ASSERT(level < 3);
+
+	for (nvp = nvlist_first_nvpair(nvl); nvp != NULL;
+	    nvp = nvlist_next_nvpair(nvl, nvp)) {
+		switch (nvpair_type(nvp)) {
+		case NV_TYPE_DESCRIPTOR:
+			*descs = nvpair_get_descriptor(nvp);
+			descs++;
+			break;
+		case NV_TYPE_NVLIST:
+			descs = nvlist_xdescriptors(nvpair_get_nvlist(nvp),
+			    descs, level + 1);
+			break;
+		}
+	}
+
+	return (descs);
+}
+#endif
+
+#ifndef _KERNEL
+int *
+nvlist_descriptors(const nvlist_t *nvl, size_t *nitemsp)
+{
+	size_t nitems;
+	int *fds;
+
+	nitems = nvlist_ndescriptors(nvl);
+	fds = nv_malloc(sizeof(fds[0]) * (nitems + 1));
+	if (fds == NULL)
+		return (NULL);
+	if (nitems > 0)
+		nvlist_xdescriptors(nvl, fds, 0);
+	fds[nitems] = -1;
+	if (nitemsp != NULL)
+		*nitemsp = nitems;
+	return (fds);
+}
+#endif
+
+static size_t
+nvlist_xndescriptors(const nvlist_t *nvl, int level)
+{
+#ifndef _KERNEL
+	const nvpair_t *nvp;
+	size_t ndescs;
+
+	NVLIST_ASSERT(nvl);
+	PJDLOG_ASSERT(nvl->nvl_error == 0);
+	PJDLOG_ASSERT(level < 3);
+
+	ndescs = 0;
+	for (nvp = nvlist_first_nvpair(nvl); nvp != NULL;
+	    nvp = nvlist_next_nvpair(nvl, nvp)) {
+		switch (nvpair_type(nvp)) {
+		case NV_TYPE_DESCRIPTOR:
+			ndescs++;
+			break;
+		case NV_TYPE_NVLIST:
+			ndescs += nvlist_xndescriptors(nvpair_get_nvlist(nvp),
+			    level + 1);
+			break;
+		}
+	}
+
+	return (ndescs);
+#else
+	return (0);
+#endif
+}
+
+size_t
+nvlist_ndescriptors(const nvlist_t *nvl)
+{
+
+	return (nvlist_xndescriptors(nvl, 0));
+}
+
+static unsigned char *
+nvlist_pack_header(const nvlist_t *nvl, unsigned char *ptr, size_t *leftp)
+{
+	struct nvlist_header nvlhdr;
+
+	NVLIST_ASSERT(nvl);
+
+	nvlhdr.nvlh_magic = NVLIST_HEADER_MAGIC;
+	nvlhdr.nvlh_version = NVLIST_HEADER_VERSION;
+	nvlhdr.nvlh_flags = nvl->nvl_flags;
+#if BYTE_ORDER == BIG_ENDIAN
+	nvlhdr.nvlh_flags |= NV_FLAG_BIG_ENDIAN;
+#endif
+	nvlhdr.nvlh_descriptors = nvlist_ndescriptors(nvl);
+	nvlhdr.nvlh_size = *leftp - sizeof(nvlhdr);
+	PJDLOG_ASSERT(*leftp >= sizeof(nvlhdr));
+	memcpy(ptr, &nvlhdr, sizeof(nvlhdr));
+	ptr += sizeof(nvlhdr);
+	*leftp -= sizeof(nvlhdr);
+
+	return (ptr);
+}
+
+void *
+nvlist_xpack(const nvlist_t *nvl, int64_t *fdidxp, size_t *sizep)
+{
+	unsigned char *buf, *ptr;
+	size_t left, size;
+	const nvlist_t *tmpnvl;
+	nvpair_t *nvp, *tmpnvp;
+	void *cookie;
+
+	NVLIST_ASSERT(nvl);
+
+	if (nvl->nvl_error != 0) {
+		RESTORE_ERRNO(nvl->nvl_error);
+		return (NULL);
+	}
+
+	size = nvlist_size(nvl);
+	buf = nv_malloc(size);
+	if (buf == NULL)
+		return (NULL);
+
+	ptr = buf;
+	left = size;
+
+	ptr = nvlist_pack_header(nvl, ptr, &left);
+
+	nvp = nvlist_first_nvpair(nvl);
+	while (nvp != NULL) {
+		NVPAIR_ASSERT(nvp);
+
+		nvpair_init_datasize(nvp);
+		ptr = nvpair_pack_header(nvp, ptr, &left);
+		if (ptr == NULL) {
+			nv_free(buf);
+			return (NULL);
+		}
+		switch (nvpair_type(nvp)) {
+		case NV_TYPE_NULL:
+			ptr = nvpair_pack_null(nvp, ptr, &left);
+			break;
+		case NV_TYPE_BOOL:
+			ptr = nvpair_pack_bool(nvp, ptr, &left);
+			break;
+		case NV_TYPE_NUMBER:
+			ptr = nvpair_pack_number(nvp, ptr, &left);
+			break;
+		case NV_TYPE_STRING:
+			ptr = nvpair_pack_string(nvp, ptr, &left);
+			break;
+		case NV_TYPE_NVLIST:
+			tmpnvl = nvpair_get_nvlist(nvp);
+			ptr = nvlist_pack_header(tmpnvl, ptr, &left);
+			if (ptr == NULL)
+				goto out;
+			tmpnvp = nvlist_first_nvpair(tmpnvl);
+			if (tmpnvp != NULL) {
+				nvl = tmpnvl;
+				nvp = tmpnvp;
+				continue;
+			}
+			ptr = nvpair_pack_nvlist_up(ptr, &left);
+			break;
+#ifndef _KERNEL
+		case NV_TYPE_DESCRIPTOR:
+			ptr = nvpair_pack_descriptor(nvp, ptr, fdidxp, &left);
+			break;
+#endif
+		case NV_TYPE_BINARY:
+			ptr = nvpair_pack_binary(nvp, ptr, &left);
+			break;
+		default:
+			PJDLOG_ABORT("Invalid type (%d).", nvpair_type(nvp));
+		}
+		if (ptr == NULL) {
+			nv_free(buf);
+			return (NULL);
+		}
+		while ((nvp = nvlist_next_nvpair(nvl, nvp)) == NULL) {
+			cookie = NULL;
+			nvl = nvlist_get_parent(nvl, &cookie);
+			if (nvl == NULL)
+				goto out;
+			nvp = cookie;
+			ptr = nvpair_pack_nvlist_up(ptr, &left);
+			if (ptr == NULL)
+				goto out;
+		}
+	}
+
+out:
+	if (sizep != NULL)
+		*sizep = size;
+	return (buf);
+}
+
+void *
+nvlist_pack(const nvlist_t *nvl, size_t *sizep)
+{
+
+	NVLIST_ASSERT(nvl);
+
+	if (nvl->nvl_error != 0) {
+		RESTORE_ERRNO(nvl->nvl_error);
+		return (NULL);
+	}
+
+	if (nvlist_ndescriptors(nvl) > 0) {
+		RESTORE_ERRNO(EOPNOTSUPP);
+		return (NULL);
+	}
+
+	return (nvlist_xpack(nvl, NULL, sizep));
+}
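
nvlist_pack() and nvlist_unpack() round-trip an nvlist through the flat
binary form described by struct nvlist_header above. A user-space sketch,
assuming the nvlist_add_*() setters from nv(9):

#include <sys/nv.h>
#include <stdlib.h>

int
main(void)
{
	nvlist_t *nvl, *copy;
	void *buf;
	size_t size;

	nvl = nvlist_create(0);
	nvlist_add_string(nvl, "user", "root");
	nvlist_add_number(nvl, "uid", 0);

	buf = nvlist_pack(nvl, &size);	/* NULL on error or with fds */
	if (buf == NULL)
		return (1);
	copy = nvlist_unpack(buf, size);

	nvlist_destroy(nvl);
	nvlist_destroy(copy);
	free(buf);
	return (copy == NULL);
}
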
+
+static bool
+nvlist_check_header(struct nvlist_header *nvlhdrp)
+{
+
+	if (nvlhdrp->nvlh_magic != NVLIST_HEADER_MAGIC) {
+		RESTORE_ERRNO(EINVAL);
+		return (false);
+	}
+	if ((nvlhdrp->nvlh_flags & ~NV_FLAG_ALL_MASK) != 0) {
+		RESTORE_ERRNO(EINVAL);
+		return (false);
+	}
+#if BYTE_ORDER == BIG_ENDIAN
+	if ((nvlhdrp->nvlh_flags & NV_FLAG_BIG_ENDIAN) == 0) {
+		nvlhdrp->nvlh_size = le64toh(nvlhdrp->nvlh_size);
+		nvlhdrp->nvlh_descriptors = le64toh(nvlhdrp->nvlh_descriptors);
+	}
+#else
+	if ((nvlhdrp->nvlh_flags & NV_FLAG_BIG_ENDIAN) != 0) {
+		nvlhdrp->nvlh_size = be64toh(nvlhdrp->nvlh_size);
+		nvlhdrp->nvlh_descriptors = be64toh(nvlhdrp->nvlh_descriptors);
+	}
+#endif
+	return (true);
+}
+
+const unsigned char *
+nvlist_unpack_header(nvlist_t *nvl, const unsigned char *ptr, size_t nfds,
+    bool *isbep, size_t *leftp)
+{
+	struct nvlist_header nvlhdr;
+
+	if (*leftp < sizeof(nvlhdr))
+		goto failed;
+
+	memcpy(&nvlhdr, ptr, sizeof(nvlhdr));
+
+	if (!nvlist_check_header(&nvlhdr))
+		goto failed;
+
+	if (nvlhdr.nvlh_size != *leftp - sizeof(nvlhdr))
+		goto failed;
+
+	/*
+	 * nvlh_descriptors might be smaller than nfds in embedded nvlists.
+	 */
+	if (nvlhdr.nvlh_descriptors > nfds)
+		goto failed;
+
+	if ((nvlhdr.nvlh_flags & ~NV_FLAG_ALL_MASK) != 0)
+		goto failed;
+
+	nvl->nvl_flags = (nvlhdr.nvlh_flags & NV_FLAG_PUBLIC_MASK);
+
+	ptr += sizeof(nvlhdr);
+	if (isbep != NULL)
+		*isbep = (((int)nvlhdr.nvlh_flags & NV_FLAG_BIG_ENDIAN) != 0);
+	*leftp -= sizeof(nvlhdr);
+
+	return (ptr);
+failed:
+	RESTORE_ERRNO(EINVAL);
+	return (NULL);
+}
+
+nvlist_t *
+nvlist_xunpack(const void *buf, size_t size, const int *fds, size_t nfds)
+{
+	const unsigned char *ptr;
+	nvlist_t *nvl, *retnvl, *tmpnvl;
+	nvpair_t *nvp;
+	size_t left;
+	bool isbe;
+
+	left = size;
+	ptr = buf;
+
+	tmpnvl = NULL;
+	nvl = retnvl = nvlist_create(0);
+	if (nvl == NULL)
+		goto failed;
+
+	ptr = nvlist_unpack_header(nvl, ptr, nfds, &isbe, &left);
+	if (ptr == NULL)
+		goto failed;
+
+	while (left > 0) {
+		ptr = nvpair_unpack(isbe, ptr, &left, &nvp);
+		if (ptr == NULL)
+			goto failed;
+		switch (nvpair_type(nvp)) {
+		case NV_TYPE_NULL:
+			ptr = nvpair_unpack_null(isbe, nvp, ptr, &left);
+			break;
+		case NV_TYPE_BOOL:
+			ptr = nvpair_unpack_bool(isbe, nvp, ptr, &left);
+			break;
+		case NV_TYPE_NUMBER:
+			ptr = nvpair_unpack_number(isbe, nvp, ptr, &left);
+			break;
+		case NV_TYPE_STRING:
+			ptr = nvpair_unpack_string(isbe, nvp, ptr, &left);
+			break;
+		case NV_TYPE_NVLIST:
+			ptr = nvpair_unpack_nvlist(isbe, nvp, ptr, &left, nfds,
+			    &tmpnvl);
+			if (tmpnvl != NULL)
+				nvlist_set_parent(tmpnvl, nvp);
+			break;
+#ifndef _KERNEL
+		case NV_TYPE_DESCRIPTOR:
+			ptr = nvpair_unpack_descriptor(isbe, nvp, ptr, &left,
+			    fds, nfds);
+			break;
+#endif
+		case NV_TYPE_BINARY:
+			ptr = nvpair_unpack_binary(isbe, nvp, ptr, &left);
+			break;
+		case NV_TYPE_NVLIST_UP:
+			if (nvl->nvl_parent == NULL)
+				goto failed;
+			nvl = nvpair_nvlist(nvl->nvl_parent);
+			continue;
+		default:
+			PJDLOG_ABORT("Invalid type (%d).", nvpair_type(nvp));
+		}
+		if (ptr == NULL)
+			goto failed;
+		nvlist_move_nvpair(nvl, nvp);
+		if (tmpnvl != NULL) {
+			nvl = tmpnvl;
+			tmpnvl = NULL;
+		}
+	}
+
+	return (retnvl);
+failed:
+	nvlist_destroy(retnvl);
+	return (NULL);
+}
+
+nvlist_t *
+nvlist_unpack(const void *buf, size_t size)
+{
+
+	return (nvlist_xunpack(buf, size, NULL, 0));
+}
+
+#ifndef _KERNEL
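+/*
+ * A packed nvlist travels as one binary blob; any descriptors are
+ * passed separately over the socket (fd_send() hands them to the peer
+ * in a control message), in the order in which nvpair_pack_descriptor()
+ * assigned their indices.
+ */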
+int
+nvlist_send(int sock, const nvlist_t *nvl)
+{
+	size_t datasize, nfds;
+	int *fds;
+	void *data;
+	int64_t fdidx;
+	int serrno, ret;
+
+	if (nvlist_error(nvl) != 0) {
+		errno = nvlist_error(nvl);
+		return (-1);
+	}
+
+	fds = nvlist_descriptors(nvl, &nfds);
+	if (fds == NULL)
+		return (-1);
+
+	ret = -1;
+	data = NULL;
+	fdidx = 0;
+
+	data = nvlist_xpack(nvl, &fdidx, &datasize);
+	if (data == NULL)
+		goto out;
+
+	if (buf_send(sock, data, datasize) == -1)
+		goto out;
+
+	if (nfds > 0) {
+		if (fd_send(sock, fds, nfds) == -1)
+			goto out;
+	}
+
+	ret = 0;
+out:
+	serrno = errno;
+	free(fds);
+	free(data);
+	errno = serrno;
+	return (ret);
+}
+
+nvlist_t *
+nvlist_recv(int sock)
+{
+	struct nvlist_header nvlhdr;
+	nvlist_t *nvl, *ret;
+	unsigned char *buf;
+	size_t nfds, size, i;
+	int serrno, *fds;
+
+	if (buf_recv(sock, &nvlhdr, sizeof(nvlhdr)) == -1)
+		return (NULL);
+
+	if (!nvlist_check_header(&nvlhdr))
+		return (NULL);
+
+	nfds = (size_t)nvlhdr.nvlh_descriptors;
+	size = sizeof(nvlhdr) + (size_t)nvlhdr.nvlh_size;
+
+	buf = malloc(size);
+	if (buf == NULL)
+		return (NULL);
+
+	memcpy(buf, &nvlhdr, sizeof(nvlhdr));
+
+	ret = NULL;
+	fds = NULL;
+
+	if (buf_recv(sock, buf + sizeof(nvlhdr), size - sizeof(nvlhdr)) == -1)
+		goto out;
+
+	if (nfds > 0) {
+		fds = malloc(nfds * sizeof(fds[0]));
+		if (fds == NULL)
+			goto out;
+		if (fd_recv(sock, fds, nfds) == -1)
+			goto out;
+	}
+
+	nvl = nvlist_xunpack(buf, size, fds, nfds);
+	if (nvl == NULL) {
+		for (i = 0; i < nfds; i++)
+			close(fds[i]);
+		goto out;
+	}
+
+	ret = nvl;
+out:
+	serrno = errno;
+	free(buf);
+	free(fds);
+	errno = serrno;
+
+	return (ret);
+}
+
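+/*
+ * Synchronous request/response helper: send the nvlist, give up
+ * ownership of it, then block in nvlist_recv() for the peer's reply.
+ */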
+nvlist_t *
+nvlist_xfer(int sock, nvlist_t *nvl)
+{
+
+	if (nvlist_send(sock, nvl) < 0) {
+		nvlist_destroy(nvl);
+		return (NULL);
+	}
+	nvlist_destroy(nvl);
+	return (nvlist_recv(sock));
+}
+#endif
+
+nvpair_t *
+nvlist_first_nvpair(const nvlist_t *nvl)
+{
+
+	NVLIST_ASSERT(nvl);
+
+	return (TAILQ_FIRST(&nvl->nvl_head));
+}
+
+nvpair_t *
+nvlist_next_nvpair(const nvlist_t *nvl, const nvpair_t *nvp)
+{
+	nvpair_t *retnvp;
+
+	NVLIST_ASSERT(nvl);
+	NVPAIR_ASSERT(nvp);
+	PJDLOG_ASSERT(nvpair_nvlist(nvp) == nvl);
+
+	retnvp = nvpair_next(nvp);
+	PJDLOG_ASSERT(retnvp == NULL || nvpair_nvlist(retnvp) == nvl);
+
+	return (retnvp);
+}
+
+nvpair_t *
+nvlist_prev_nvpair(const nvlist_t *nvl, const nvpair_t *nvp)
+{
+	nvpair_t *retnvp;
+
+	NVLIST_ASSERT(nvl);
+	NVPAIR_ASSERT(nvp);
+	PJDLOG_ASSERT(nvpair_nvlist(nvp) == nvl);
+
+	retnvp = nvpair_prev(nvp);
+	PJDLOG_ASSERT(retnvp == NULL || nvpair_nvlist(retnvp) == nvl);
+
+	return (retnvp);
+}
+
+const char *
+nvlist_next(const nvlist_t *nvl, int *typep, void **cookiep)
+{
+	nvpair_t *nvp;
+
+	NVLIST_ASSERT(nvl);
+	PJDLOG_ASSERT(cookiep != NULL);
+
+	if (*cookiep == NULL)
+		nvp = nvlist_first_nvpair(nvl);
+	else
+		nvp = nvlist_next_nvpair(nvl, *cookiep);
+	if (nvp == NULL)
+		return (NULL);
+	if (typep != NULL)
+		*typep = nvpair_type(nvp);
+	*cookiep = nvp;
+	return (nvpair_name(nvp));
+}
+
+bool
+nvlist_exists(const nvlist_t *nvl, const char *name)
+{
+
+	return (nvlist_find(nvl, NV_TYPE_NONE, name) != NULL);
+}
+
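+/*
+ * Generate one typed nvlist_exists_<type>() wrapper per value type;
+ * each one simply restricts nvlist_find() to the given type.
+ */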
+#define	NVLIST_EXISTS(type, TYPE)					\
+bool									\
+nvlist_exists_##type(const nvlist_t *nvl, const char *name)		\
+{									\
+									\
+	return (nvlist_find(nvl, NV_TYPE_##TYPE, name) != NULL);	\
+}
+
+NVLIST_EXISTS(null, NULL)
+NVLIST_EXISTS(bool, BOOL)
+NVLIST_EXISTS(number, NUMBER)
+NVLIST_EXISTS(string, STRING)
+NVLIST_EXISTS(nvlist, NVLIST)
+#ifndef _KERNEL
+NVLIST_EXISTS(descriptor, DESCRIPTOR)
+#endif
+NVLIST_EXISTS(binary, BINARY)
+
+#undef	NVLIST_EXISTS
+
+void
+nvlist_add_nvpair(nvlist_t *nvl, const nvpair_t *nvp)
+{
+	nvpair_t *newnvp;
+
+	NVPAIR_ASSERT(nvp);
+
+	if (nvlist_error(nvl) != 0) {
+		RESTORE_ERRNO(nvlist_error(nvl));
+		return;
+	}
+	if (nvlist_exists(nvl, nvpair_name(nvp))) {
+		nvl->nvl_error = EEXIST;
+		RESTORE_ERRNO(nvlist_error(nvl));
+		return;
+	}
+
+	newnvp = nvpair_clone(nvp);
+	if (newnvp == NULL) {
+		nvl->nvl_error = ERRNO_OR_DEFAULT(ENOMEM);
+		RESTORE_ERRNO(nvlist_error(nvl));
+		return;
+	}
+
+	nvpair_insert(&nvl->nvl_head, newnvp, nvl);
+}
+
+void
+nvlist_add_stringf(nvlist_t *nvl, const char *name, const char *valuefmt, ...)
+{
+	va_list valueap;
+
+	va_start(valueap, valuefmt);
+	nvlist_add_stringv(nvl, name, valuefmt, valueap);
+	va_end(valueap);
+}
+
+void
+nvlist_add_stringv(nvlist_t *nvl, const char *name, const char *valuefmt,
+    va_list valueap)
+{
+	nvpair_t *nvp;
+
+	if (nvlist_error(nvl) != 0) {
+		RESTORE_ERRNO(nvlist_error(nvl));
+		return;
+	}
+
+	nvp = nvpair_create_stringv(name, valuefmt, valueap);
+	if (nvp == NULL) {
+		nvl->nvl_error = ERRNO_OR_DEFAULT(ENOMEM);
+		RESTORE_ERRNO(nvl->nvl_error);
+	} else
+		nvlist_move_nvpair(nvl, nvp);
+}
+
+void
+nvlist_add_null(nvlist_t *nvl, const char *name)
+{
+	nvpair_t *nvp;
+
+	if (nvlist_error(nvl) != 0) {
+		RESTORE_ERRNO(nvlist_error(nvl));
+		return;
+	}
+
+	nvp = nvpair_create_null(name);
+	if (nvp == NULL) {
+		nvl->nvl_error = ERRNO_OR_DEFAULT(ENOMEM);
+		RESTORE_ERRNO(nvl->nvl_error);
+	} else
+		nvlist_move_nvpair(nvl, nvp);
+}
+
+void
+nvlist_add_bool(nvlist_t *nvl, const char *name, bool value)
+{
+	nvpair_t *nvp;
+
+	if (nvlist_error(nvl) != 0) {
+		RESTORE_ERRNO(nvlist_error(nvl));
+		return;
+	}
+
+	nvp = nvpair_create_bool(name, value);
+	if (nvp == NULL) {
+		nvl->nvl_error = ERRNO_OR_DEFAULT(ENOMEM);
+		RESTORE_ERRNO(nvl->nvl_error);
+	} else
+		nvlist_move_nvpair(nvl, nvp);
+}
+
+void
+nvlist_add_number(nvlist_t *nvl, const char *name, uint64_t value)
+{
+	nvpair_t *nvp;
+
+	if (nvlist_error(nvl) != 0) {
+		RESTORE_ERRNO(nvlist_error(nvl));
+		return;
+	}
+
+	nvp = nvpair_create_number(name, value);
+	if (nvp == NULL) {
+		nvl->nvl_error = ERRNO_OR_DEFAULT(ENOMEM);
+		RESTORE_ERRNO(nvl->nvl_error);
+	} else
+		nvlist_move_nvpair(nvl, nvp);
+}
+
+void
+nvlist_add_string(nvlist_t *nvl, const char *name, const char *value)
+{
+	nvpair_t *nvp;
+
+	if (nvlist_error(nvl) != 0) {
+		RESTORE_ERRNO(nvlist_error(nvl));
+		return;
+	}
+
+	nvp = nvpair_create_string(name, value);
+	if (nvp == NULL) {
+		nvl->nvl_error = ERRNO_OR_DEFAULT(ENOMEM);
+		RESTORE_ERRNO(nvl->nvl_error);
+	} else
+		nvlist_move_nvpair(nvl, nvp);
+}
+
+void
+nvlist_add_nvlist(nvlist_t *nvl, const char *name, const nvlist_t *value)
+{
+	nvpair_t *nvp;
+
+	if (nvlist_error(nvl) != 0) {
+		RESTORE_ERRNO(nvlist_error(nvl));
+		return;
+	}
+
+	nvp = nvpair_create_nvlist(name, value);
+	if (nvp == NULL) {
+		nvl->nvl_error = ERRNO_OR_DEFAULT(ENOMEM);
+		RESTORE_ERRNO(nvl->nvl_error);
+	} else
+		nvlist_move_nvpair(nvl, nvp);
+}
+
+#ifndef _KERNEL
+void
+nvlist_add_descriptor(nvlist_t *nvl, const char *name, int value)
+{
+	nvpair_t *nvp;
+
+	if (nvlist_error(nvl) != 0) {
+		errno = nvlist_error(nvl);
+		return;
+	}
+
+	nvp = nvpair_create_descriptor(name, value);
+	if (nvp == NULL)
+		nvl->nvl_error = errno = (errno != 0 ? errno : ENOMEM);
+	else
+		nvlist_move_nvpair(nvl, nvp);
+}
+#endif
+
+void
+nvlist_add_binary(nvlist_t *nvl, const char *name, const void *value,
+    size_t size)
+{
+	nvpair_t *nvp;
+
+	if (nvlist_error(nvl) != 0) {
+		RESTORE_ERRNO(nvlist_error(nvl));
+		return;
+	}
+
+	nvp = nvpair_create_binary(name, value, size);
+	if (nvp == NULL) {
+		nvl->nvl_error = ERRNO_OR_DEFAULT(ENOMEM);
+		RESTORE_ERRNO(nvl->nvl_error);
+	} else
+		nvlist_move_nvpair(nvl, nvp);
+}
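+
+/*
+ * A minimal userland usage sketch of the typed add/get API (names and
+ * values are illustrative only; error handling omitted):
+ *
+ *	nvlist_t *nvl = nvlist_create(0);
+ *
+ *	nvlist_add_string(nvl, "host", "example.org");
+ *	nvlist_add_number(nvl, "port", 80);
+ *	if (nvlist_error(nvl) == 0)
+ *		printf("%s\n", nvlist_get_string(nvl, "host"));
+ *	nvlist_destroy(nvl);
+ */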
+
+void
+nvlist_move_nvpair(nvlist_t *nvl, nvpair_t *nvp)
+{
+
+	NVPAIR_ASSERT(nvp);
+	PJDLOG_ASSERT(nvpair_nvlist(nvp) == NULL);
+
+	if (nvlist_error(nvl) != 0) {
+		nvpair_free(nvp);
+		RESTORE_ERRNO(nvlist_error(nvl));
+		return;
+	}
+	if (nvlist_exists(nvl, nvpair_name(nvp))) {
+		nvpair_free(nvp);
+		nvl->nvl_error = EEXIST;
+		RESTORE_ERRNO(nvl->nvl_error);
+		return;
+	}
+
+	nvpair_insert(&nvl->nvl_head, nvp, nvl);
+}
+
+void
+nvlist_move_string(nvlist_t *nvl, const char *name, char *value)
+{
+	nvpair_t *nvp;
+
+	if (nvlist_error(nvl) != 0) {
+		nv_free(value);
+		RESTORE_ERRNO(nvlist_error(nvl));
+		return;
+	}
+
+	nvp = nvpair_move_string(name, value);
+	if (nvp == NULL) {
+		nvl->nvl_error = ERRNO_OR_DEFAULT(ENOMEM);
+		RESTORE_ERRNO(nvl->nvl_error);
+	} else
+		nvlist_move_nvpair(nvl, nvp);
+}
+
+void
+nvlist_move_nvlist(nvlist_t *nvl, const char *name, nvlist_t *value)
+{
+	nvpair_t *nvp;
+
+	if (nvlist_error(nvl) != 0) {
+		if (value != NULL && nvlist_get_nvpair_parent(value) == NULL)
+			nvlist_destroy(value);
+		RESTORE_ERRNO(nvlist_error(nvl));
+		return;
+	}
+
+	nvp = nvpair_move_nvlist(name, value);
+	if (nvp == NULL) {
+		nvl->nvl_error = ERRNO_OR_DEFAULT(ENOMEM);
+		RESTORE_ERRNO(nvl->nvl_error);
+	} else
+		nvlist_move_nvpair(nvl, nvp);
+}
+
+#ifndef _KERNEL
+void
+nvlist_move_descriptor(nvlist_t *nvl, const char *name, int value)
+{
+	nvpair_t *nvp;
+
+	if (nvlist_error(nvl) != 0) {
+		close(value);
+		errno = nvlist_error(nvl);
+		return;
+	}
+
+	nvp = nvpair_move_descriptor(name, value);
+	if (nvp == NULL)
+		nvl->nvl_error = errno = (errno != 0 ? errno : ENOMEM);
+	else
+		nvlist_move_nvpair(nvl, nvp);
+}
+#endif
+
+void
+nvlist_move_binary(nvlist_t *nvl, const char *name, void *value, size_t size)
+{
+	nvpair_t *nvp;
+
+	if (nvlist_error(nvl) != 0) {
+		nv_free(value);
+		RESTORE_ERRNO(nvlist_error(nvl));
+		return;
+	}
+
+	nvp = nvpair_move_binary(name, value, size);
+	if (nvp == NULL) {
+		nvl->nvl_error = ERRNO_OR_DEFAULT(ENOMEM);
+		RESTORE_ERRNO(nvl->nvl_error);
+	} else
+		nvlist_move_nvpair(nvl, nvp);
+}
+
+const nvpair_t *
+nvlist_get_nvpair(const nvlist_t *nvl, const char *name)
+{
+
+	return (nvlist_find(nvl, NV_TYPE_NONE, name));
+}
+
+#define	NVLIST_GET(ftype, type, TYPE)					\
+ftype									\
+nvlist_get_##type(const nvlist_t *nvl, const char *name)		\
+{									\
+	const nvpair_t *nvp;						\
+									\
+	nvp = nvlist_find(nvl, NV_TYPE_##TYPE, name);			\
+	if (nvp == NULL)						\
+		nvlist_report_missing(NV_TYPE_##TYPE, name);		\
+	return (nvpair_get_##type(nvp));				\
+}
+
+NVLIST_GET(bool, bool, BOOL)
+NVLIST_GET(uint64_t, number, NUMBER)
+NVLIST_GET(const char *, string, STRING)
+NVLIST_GET(const nvlist_t *, nvlist, NVLIST)
+#ifndef _KERNEL
+NVLIST_GET(int, descriptor, DESCRIPTOR)
+#endif
+
+#undef	NVLIST_GET
+
+const void *
+nvlist_get_binary(const nvlist_t *nvl, const char *name, size_t *sizep)
+{
+	nvpair_t *nvp;
+
+	nvp = nvlist_find(nvl, NV_TYPE_BINARY, name);
+	if (nvp == NULL)
+		nvlist_report_missing(NV_TYPE_BINARY, name);
+
+	return (nvpair_get_binary(nvp, sizep));
+}
+
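+/*
+ * The "take" accessors transfer ownership to the caller: the value is
+ * returned, the pair is unlinked from the list, and only the nvpair
+ * structure itself is freed, leaving the value intact.
+ */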
+#define	NVLIST_TAKE(ftype, type, TYPE)					\
+ftype									\
+nvlist_take_##type(nvlist_t *nvl, const char *name)			\
+{									\
+	nvpair_t *nvp;							\
+	ftype value;							\
+									\
+	nvp = nvlist_find(nvl, NV_TYPE_##TYPE, name);			\
+	if (nvp == NULL)						\
+		nvlist_report_missing(NV_TYPE_##TYPE, name);		\
+	value = (ftype)(intptr_t)nvpair_get_##type(nvp);		\
+	nvlist_remove_nvpair(nvl, nvp);					\
+	nvpair_free_structure(nvp);					\
+	return (value);							\
+}
+
+NVLIST_TAKE(bool, bool, BOOL)
+NVLIST_TAKE(uint64_t, number, NUMBER)
+NVLIST_TAKE(char *, string, STRING)
+NVLIST_TAKE(nvlist_t *, nvlist, NVLIST)
+#ifndef _KERNEL
+NVLIST_TAKE(int, descriptor, DESCRIPTOR)
+#endif
+
+#undef	NVLIST_TAKE
+
+void *
+nvlist_take_binary(nvlist_t *nvl, const char *name, size_t *sizep)
+{
+	nvpair_t *nvp;
+	void *value;
+
+	nvp = nvlist_find(nvl, NV_TYPE_BINARY, name);
+	if (nvp == NULL)
+		nvlist_report_missing(NV_TYPE_BINARY, name);
+
+	value = (void *)(intptr_t)nvpair_get_binary(nvp, sizep);
+	nvlist_remove_nvpair(nvl, nvp);
+	nvpair_free_structure(nvp);
+	return (value);
+}
+
+void
+nvlist_remove_nvpair(nvlist_t *nvl, nvpair_t *nvp)
+{
+
+	NVLIST_ASSERT(nvl);
+	NVPAIR_ASSERT(nvp);
+	PJDLOG_ASSERT(nvpair_nvlist(nvp) == nvl);
+
+	nvpair_remove(&nvl->nvl_head, nvp, nvl);
+}
+
+void
+nvlist_free(nvlist_t *nvl, const char *name)
+{
+
+	nvlist_free_type(nvl, name, NV_TYPE_NONE);
+}
+
+#define	NVLIST_FREE(type, TYPE)						\
+void									\
+nvlist_free_##type(nvlist_t *nvl, const char *name)			\
+{									\
+									\
+	nvlist_free_type(nvl, name, NV_TYPE_##TYPE);			\
+}
+
+NVLIST_FREE(null, NULL)
+NVLIST_FREE(bool, BOOL)
+NVLIST_FREE(number, NUMBER)
+NVLIST_FREE(string, STRING)
+NVLIST_FREE(nvlist, NVLIST)
+#ifndef _KERNEL
+NVLIST_FREE(descriptor, DESCRIPTOR)
+#endif
+NVLIST_FREE(binary, BINARY)
+
+#undef	NVLIST_FREE
+
+void
+nvlist_free_nvpair(nvlist_t *nvl, nvpair_t *nvp)
+{
+
+	NVLIST_ASSERT(nvl);
+	NVPAIR_ASSERT(nvp);
+	PJDLOG_ASSERT(nvpair_nvlist(nvp) == nvl);
+
+	nvlist_remove_nvpair(nvl, nvp);
+	nvpair_free(nvp);
+}
+


Property changes on: trunk/sys/kern/subr_nvlist.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property
Added: trunk/sys/kern/subr_nvpair.c
===================================================================
--- trunk/sys/kern/subr_nvpair.c	                        (rev 0)
+++ trunk/sys/kern/subr_nvpair.c	2018-05-25 21:07:09 UTC (rev 9950)
@@ -0,0 +1,1112 @@
+/* $MidnightBSD$ */
+/*-
+ * Copyright (c) 2009-2013 The FreeBSD Foundation
+ * All rights reserved.
+ *
+ * This software was developed by Pawel Jakub Dawidek under sponsorship from
+ * the FreeBSD Foundation.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: stable/10/sys/kern/subr_nvpair.c 292973 2015-12-31 03:28:14Z ngie $");
+
+#include <sys/param.h>
+#include <sys/endian.h>
+#include <sys/queue.h>
+
+#ifdef _KERNEL
+
+#include <sys/errno.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/systm.h>
+
+#include <machine/stdarg.h>
+
+#else
+#include <errno.h>
+#include <fcntl.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "common_impl.h"
+#endif
+
+#ifdef HAVE_PJDLOG
+#include <pjdlog.h>
+#endif
+
+#include <sys/nv.h>
+#include <sys/nv_impl.h>
+#include <sys/nvlist_impl.h>
+#include <sys/nvpair_impl.h>
+
+#ifndef	HAVE_PJDLOG
+#ifdef _KERNEL
+#define	PJDLOG_ASSERT(...)		MPASS(__VA_ARGS__)
+#define	PJDLOG_RASSERT(expr, ...)	KASSERT(expr, (__VA_ARGS__))
+#define	PJDLOG_ABORT(...)		panic(__VA_ARGS__)
+#else
+#include <assert.h>
+#define	PJDLOG_ASSERT(...)		assert(__VA_ARGS__)
+#define	PJDLOG_RASSERT(expr, ...)	assert(expr)
+#define	PJDLOG_ABORT(...)		abort()
+#endif
+#endif
+
+#define	NVPAIR_MAGIC	0x6e7670	/* "nvp" */
+struct nvpair {
+	int		 nvp_magic;	/* NVPAIR_MAGIC while the pair is alive. */
+	char		*nvp_name;	/* NUL-terminated name, stored right after the struct. */
+	int		 nvp_type;	/* One of the NV_TYPE_* constants. */
+	uint64_t	 nvp_data;	/* Value, or a pointer to it cast to uint64_t. */
+	size_t		 nvp_datasize;	/* Size of the packed value in bytes. */
+	nvlist_t	*nvp_list;	/* The nvlist we are linked into, if any. */
+	TAILQ_ENTRY(nvpair) nvp_next;
+};
+
+#define	NVPAIR_ASSERT(nvp)	do {					\
+	PJDLOG_ASSERT((nvp) != NULL);					\
+	PJDLOG_ASSERT((nvp)->nvp_magic == NVPAIR_MAGIC);		\
+} while (0)
+
+/* On-the-wire header of a packed nvpair; the name and data follow it. */
+struct nvpair_header {
+	uint8_t		nvph_type;
+	uint16_t	nvph_namesize;	/* Name length, including the terminating NUL. */
+	uint64_t	nvph_datasize;
+} __packed;
+
+void
+nvpair_assert(const nvpair_t *nvp)
+{
+
+	NVPAIR_ASSERT(nvp);
+}
+
+nvlist_t *
+nvpair_nvlist(const nvpair_t *nvp)
+{
+
+	NVPAIR_ASSERT(nvp);
+
+	return (nvp->nvp_list);
+}
+
+nvpair_t *
+nvpair_next(const nvpair_t *nvp)
+{
+
+	NVPAIR_ASSERT(nvp);
+	PJDLOG_ASSERT(nvp->nvp_list != NULL);
+
+	return (TAILQ_NEXT(nvp, nvp_next));
+}
+
+nvpair_t *
+nvpair_prev(const nvpair_t *nvp)
+{
+
+	NVPAIR_ASSERT(nvp);
+	PJDLOG_ASSERT(nvp->nvp_list != NULL);
+
+	return (TAILQ_PREV(nvp, nvl_head, nvp_next));
+}
+
+void
+nvpair_insert(struct nvl_head *head, nvpair_t *nvp, nvlist_t *nvl)
+{
+
+	NVPAIR_ASSERT(nvp);
+	PJDLOG_ASSERT(nvp->nvp_list == NULL);
+	PJDLOG_ASSERT(!nvlist_exists(nvl, nvpair_name(nvp)));
+
+	TAILQ_INSERT_TAIL(head, nvp, nvp_next);
+	nvp->nvp_list = nvl;
+}
+
+static void
+nvpair_remove_nvlist(nvpair_t *nvp)
+{
+	nvlist_t *nvl;
+
+	/* XXX: DECONST is bad, mkay? */
+	nvl = __DECONST(nvlist_t *, nvpair_get_nvlist(nvp));
+	PJDLOG_ASSERT(nvl != NULL);
+	nvlist_set_parent(nvl, NULL);
+}
+
+void
+nvpair_remove(struct nvl_head *head, nvpair_t *nvp, const nvlist_t *nvl)
+{
+
+	NVPAIR_ASSERT(nvp);
+	PJDLOG_ASSERT(nvp->nvp_list == nvl);
+
+	if (nvpair_type(nvp) == NV_TYPE_NVLIST)
+		nvpair_remove_nvlist(nvp);
+
+	TAILQ_REMOVE(head, nvp, nvp_next);
+	nvp->nvp_list = NULL;
+}
+
+nvpair_t *
+nvpair_clone(const nvpair_t *nvp)
+{
+	nvpair_t *newnvp;
+	const char *name;
+	const void *data;
+	size_t datasize;
+
+	NVPAIR_ASSERT(nvp);
+
+	name = nvpair_name(nvp);
+
+	switch (nvpair_type(nvp)) {
+	case NV_TYPE_NULL:
+		newnvp = nvpair_create_null(name);
+		break;
+	case NV_TYPE_BOOL:
+		newnvp = nvpair_create_bool(name, nvpair_get_bool(nvp));
+		break;
+	case NV_TYPE_NUMBER:
+		newnvp = nvpair_create_number(name, nvpair_get_number(nvp));
+		break;
+	case NV_TYPE_STRING:
+		newnvp = nvpair_create_string(name, nvpair_get_string(nvp));
+		break;
+	case NV_TYPE_NVLIST:
+		newnvp = nvpair_create_nvlist(name, nvpair_get_nvlist(nvp));
+		break;
+#ifndef _KERNEL
+	case NV_TYPE_DESCRIPTOR:
+		newnvp = nvpair_create_descriptor(name,
+		    nvpair_get_descriptor(nvp));
+		break;
+#endif
+	case NV_TYPE_BINARY:
+		data = nvpair_get_binary(nvp, &datasize);
+		newnvp = nvpair_create_binary(name, data, datasize);
+		break;
+	default:
+		PJDLOG_ABORT("Unknown type: %d.", nvpair_type(nvp));
+	}
+
+	return (newnvp);
+}
+
+size_t
+nvpair_header_size(void)
+{
+
+	return (sizeof(struct nvpair_header));
+}
+
+size_t
+nvpair_size(const nvpair_t *nvp)
+{
+
+	NVPAIR_ASSERT(nvp);
+
+	return (nvp->nvp_datasize);
+}
+
+unsigned char *
+nvpair_pack_header(const nvpair_t *nvp, unsigned char *ptr, size_t *leftp)
+{
+	struct nvpair_header nvphdr;
+	size_t namesize;
+
+	NVPAIR_ASSERT(nvp);
+
+	nvphdr.nvph_type = nvp->nvp_type;
+	namesize = strlen(nvp->nvp_name) + 1;
+	PJDLOG_ASSERT(namesize > 0 && namesize <= UINT16_MAX);
+	nvphdr.nvph_namesize = namesize;
+	nvphdr.nvph_datasize = nvp->nvp_datasize;
+	PJDLOG_ASSERT(*leftp >= sizeof(nvphdr));
+	memcpy(ptr, &nvphdr, sizeof(nvphdr));
+	ptr += sizeof(nvphdr);
+	*leftp -= sizeof(nvphdr);
+
+	PJDLOG_ASSERT(*leftp >= namesize);
+	memcpy(ptr, nvp->nvp_name, namesize);
+	ptr += namesize;
+	*leftp -= namesize;
+
+	return (ptr);
+}
+
+unsigned char *
+nvpair_pack_null(const nvpair_t *nvp, unsigned char *ptr,
+    size_t *leftp __unused)
+{
+
+	NVPAIR_ASSERT(nvp);
+	PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_NULL);
+
+	return (ptr);
+}
+
+unsigned char *
+nvpair_pack_bool(const nvpair_t *nvp, unsigned char *ptr, size_t *leftp)
+{
+	uint8_t value;
+
+	NVPAIR_ASSERT(nvp);
+	PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_BOOL);
+
+	value = (uint8_t)nvp->nvp_data;
+
+	PJDLOG_ASSERT(*leftp >= sizeof(value));
+	memcpy(ptr, &value, sizeof(value));
+	ptr += sizeof(value);
+	*leftp -= sizeof(value);
+
+	return (ptr);
+}
+
+unsigned char *
+nvpair_pack_number(const nvpair_t *nvp, unsigned char *ptr, size_t *leftp)
+{
+	uint64_t value;
+
+	NVPAIR_ASSERT(nvp);
+	PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_NUMBER);
+
+	value = (uint64_t)nvp->nvp_data;
+
+	PJDLOG_ASSERT(*leftp >= sizeof(value));
+	memcpy(ptr, &value, sizeof(value));
+	ptr += sizeof(value);
+	*leftp -= sizeof(value);
+
+	return (ptr);
+}
+
+unsigned char *
+nvpair_pack_string(const nvpair_t *nvp, unsigned char *ptr, size_t *leftp)
+{
+
+	NVPAIR_ASSERT(nvp);
+	PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_STRING);
+
+	PJDLOG_ASSERT(*leftp >= nvp->nvp_datasize);
+	memcpy(ptr, (const void *)(intptr_t)nvp->nvp_data, nvp->nvp_datasize);
+	ptr += nvp->nvp_datasize;
+	*leftp -= nvp->nvp_datasize;
+
+	return (ptr);
+}
+
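+/*
+ * Emit a synthetic NV_TYPE_NVLIST_UP marker (empty name, no data)
+ * telling the unpacker to ascend one level out of an embedded nvlist.
+ */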
+unsigned char *
+nvpair_pack_nvlist_up(unsigned char *ptr, size_t *leftp)
+{
+	struct nvpair_header nvphdr;
+	size_t namesize;
+	const char *name = "";
+
+	namesize = 1;
+	nvphdr.nvph_type = NV_TYPE_NVLIST_UP;
+	nvphdr.nvph_namesize = namesize;
+	nvphdr.nvph_datasize = 0;
+	PJDLOG_ASSERT(*leftp >= sizeof(nvphdr));
+	memcpy(ptr, &nvphdr, sizeof(nvphdr));
+	ptr += sizeof(nvphdr);
+	*leftp -= sizeof(nvphdr);
+
+	PJDLOG_ASSERT(*leftp >= namesize);
+	memcpy(ptr, name, namesize);
+	ptr += namesize;
+	*leftp -= namesize;
+
+	return (ptr);
+}
+
+#ifndef _KERNEL
+unsigned char *
+nvpair_pack_descriptor(const nvpair_t *nvp, unsigned char *ptr, int64_t *fdidxp,
+    size_t *leftp)
+{
+	int64_t value;
+
+	NVPAIR_ASSERT(nvp);
+	PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_DESCRIPTOR);
+
+	value = (int64_t)nvp->nvp_data;
+	if (value != -1) {
+		/*
+		 * If there is a real descriptor here, we change its number
+		 * If there is a real descriptor here, we change its number
+		 * to its position in the array of descriptors sent via the
+		 * control message.
+		PJDLOG_ASSERT(fdidxp != NULL);
+
+		value = *fdidxp;
+		(*fdidxp)++;
+	}
+
+	PJDLOG_ASSERT(*leftp >= sizeof(value));
+	memcpy(ptr, &value, sizeof(value));
+	ptr += sizeof(value);
+	*leftp -= sizeof(value);
+
+	return (ptr);
+}
+#endif
+
+unsigned char *
+nvpair_pack_binary(const nvpair_t *nvp, unsigned char *ptr, size_t *leftp)
+{
+
+	NVPAIR_ASSERT(nvp);
+	PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_BINARY);
+
+	PJDLOG_ASSERT(*leftp >= nvp->nvp_datasize);
+	memcpy(ptr, (const void *)(intptr_t)nvp->nvp_data, nvp->nvp_datasize);
+	ptr += nvp->nvp_datasize;
+	*leftp -= nvp->nvp_datasize;
+
+	return (ptr);
+}
+
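+/*
+ * Refresh the cached data size of an NV_TYPE_NVLIST pair before it is
+ * packed; the embedded nvlist may have changed since the pair was
+ * created.
+ */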
+void
+nvpair_init_datasize(nvpair_t *nvp)
+{
+
+	NVPAIR_ASSERT(nvp);
+
+	if (nvp->nvp_type == NV_TYPE_NVLIST) {
+		if (nvp->nvp_data == 0) {
+			nvp->nvp_datasize = 0;
+		} else {
+			nvp->nvp_datasize =
+			    nvlist_size((const nvlist_t *)(intptr_t)nvp->nvp_data);
+		}
+	}
+}
+
+const unsigned char *
+nvpair_unpack_header(bool isbe, nvpair_t *nvp, const unsigned char *ptr,
+    size_t *leftp)
+{
+	struct nvpair_header nvphdr;
+
+	if (*leftp < sizeof(nvphdr))
+		goto failed;
+
+	memcpy(&nvphdr, ptr, sizeof(nvphdr));
+	ptr += sizeof(nvphdr);
+	*leftp -= sizeof(nvphdr);
+
+#if NV_TYPE_FIRST > 0
+	if (nvphdr.nvph_type < NV_TYPE_FIRST)
+		goto failed;
+#endif
+	if (nvphdr.nvph_type > NV_TYPE_LAST &&
+	    nvphdr.nvph_type != NV_TYPE_NVLIST_UP) {
+		goto failed;
+	}
+
+#if BYTE_ORDER == BIG_ENDIAN
+	if (!isbe) {
+		nvphdr.nvph_namesize = le16toh(nvphdr.nvph_namesize);
+		nvphdr.nvph_datasize = le64toh(nvphdr.nvph_datasize);
+	}
+#else
+	if (isbe) {
+		nvphdr.nvph_namesize = be16toh(nvphdr.nvph_namesize);
+		nvphdr.nvph_datasize = be64toh(nvphdr.nvph_datasize);
+	}
+#endif
+
+	if (nvphdr.nvph_namesize > NV_NAME_MAX)
+		goto failed;
+	if (*leftp < nvphdr.nvph_namesize)
+		goto failed;
+	if (nvphdr.nvph_namesize < 1)
+		goto failed;
+	if (strnlen((const char *)ptr, nvphdr.nvph_namesize) !=
+	    (size_t)(nvphdr.nvph_namesize - 1)) {
+		goto failed;
+	}
+
+	memcpy(nvp->nvp_name, ptr, nvphdr.nvph_namesize);
+	ptr += nvphdr.nvph_namesize;
+	*leftp -= nvphdr.nvph_namesize;
+
+	if (*leftp < nvphdr.nvph_datasize)
+		goto failed;
+
+	nvp->nvp_type = nvphdr.nvph_type;
+	nvp->nvp_data = 0;
+	nvp->nvp_datasize = nvphdr.nvph_datasize;
+
+	return (ptr);
+failed:
+	RESTORE_ERRNO(EINVAL);
+	return (NULL);
+}
+
+const unsigned char *
+nvpair_unpack_null(bool isbe __unused, nvpair_t *nvp, const unsigned char *ptr,
+    size_t *leftp __unused)
+{
+
+	PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_NULL);
+
+	if (nvp->nvp_datasize != 0) {
+		RESTORE_ERRNO(EINVAL);
+		return (NULL);
+	}
+
+	return (ptr);
+}
+
+const unsigned char *
+nvpair_unpack_bool(bool isbe __unused, nvpair_t *nvp, const unsigned char *ptr,
+    size_t *leftp)
+{
+	uint8_t value;
+
+	PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_BOOL);
+
+	if (nvp->nvp_datasize != sizeof(value)) {
+		RESTORE_ERRNO(EINVAL);
+		return (NULL);
+	}
+	if (*leftp < sizeof(value)) {
+		RESTORE_ERRNO(EINVAL);
+		return (NULL);
+	}
+
+	memcpy(&value, ptr, sizeof(value));
+	ptr += sizeof(value);
+	*leftp -= sizeof(value);
+
+	if (value != 0 && value != 1) {
+		RESTORE_ERRNO(EINVAL);
+		return (NULL);
+	}
+
+	nvp->nvp_data = (uint64_t)value;
+
+	return (ptr);
+}
+
+const unsigned char *
+nvpair_unpack_number(bool isbe, nvpair_t *nvp, const unsigned char *ptr,
+     size_t *leftp)
+{
+
+	PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_NUMBER);
+
+	if (nvp->nvp_datasize != sizeof(uint64_t)) {
+		RESTORE_ERRNO(EINVAL);
+		return (NULL);
+	}
+	if (*leftp < sizeof(uint64_t)) {
+		RESTORE_ERRNO(EINVAL);
+		return (NULL);
+	}
+
+	if (isbe)
+		nvp->nvp_data = be64dec(ptr);
+	else
+		nvp->nvp_data = le64dec(ptr);
+	ptr += sizeof(uint64_t);
+	*leftp -= sizeof(uint64_t);
+
+	return (ptr);
+}
+
+const unsigned char *
+nvpair_unpack_string(bool isbe __unused, nvpair_t *nvp,
+    const unsigned char *ptr, size_t *leftp)
+{
+
+	PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_STRING);
+
+	if (*leftp < nvp->nvp_datasize || nvp->nvp_datasize == 0) {
+		RESTORE_ERRNO(EINVAL);
+		return (NULL);
+	}
+
+	if (strnlen((const char *)ptr, nvp->nvp_datasize) !=
+	    nvp->nvp_datasize - 1) {
+		RESTORE_ERRNO(EINVAL);
+		return (NULL);
+	}
+
+	nvp->nvp_data = (uint64_t)(uintptr_t)nv_strdup((const char *)ptr);
+	if (nvp->nvp_data == 0)
+		return (NULL);
+
+	ptr += nvp->nvp_datasize;
+	*leftp -= nvp->nvp_datasize;
+
+	return (ptr);
+}
+
+const unsigned char *
+nvpair_unpack_nvlist(bool isbe __unused, nvpair_t *nvp,
+    const unsigned char *ptr, size_t *leftp, size_t nfds, nvlist_t **child)
+{
+	nvlist_t *value;
+
+	PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_NVLIST);
+
+	if (*leftp < nvp->nvp_datasize || nvp->nvp_datasize == 0) {
+		RESTORE_ERRNO(EINVAL);
+		return (NULL);
+	}
+
+	value = nvlist_create(0);
+	if (value == NULL)
+		return (NULL);
+
+	ptr = nvlist_unpack_header(value, ptr, nfds, NULL, leftp);
+	if (ptr == NULL) {
+		nvlist_destroy(value);
+		return (NULL);
+	}
+
+	nvp->nvp_data = (uint64_t)(uintptr_t)value;
+	*child = value;
+
+	return (ptr);
+}
+
+#ifndef _KERNEL
+const unsigned char *
+nvpair_unpack_descriptor(bool isbe, nvpair_t *nvp, const unsigned char *ptr,
+    size_t *leftp, const int *fds, size_t nfds)
+{
+	int64_t idx;
+
+	PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_DESCRIPTOR);
+
+	if (nvp->nvp_datasize != sizeof(idx)) {
+		errno = EINVAL;
+		return (NULL);
+	}
+	if (*leftp < sizeof(idx)) {
+		errno = EINVAL;
+		return (NULL);
+	}
+
+	if (isbe)
+		idx = be64dec(ptr);
+	else
+		idx = le64dec(ptr);
+
+	if (idx < 0) {
+		errno = EINVAL;
+		return (NULL);
+	}
+
+	if ((size_t)idx >= nfds) {
+		errno = EINVAL;
+		return (NULL);
+	}
+
+	nvp->nvp_data = (uint64_t)fds[idx];
+
+	ptr += sizeof(idx);
+	*leftp -= sizeof(idx);
+
+	return (ptr);
+}
+#endif
+
+const unsigned char *
+nvpair_unpack_binary(bool isbe __unused, nvpair_t *nvp,
+    const unsigned char *ptr, size_t *leftp)
+{
+	void *value;
+
+	PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_BINARY);
+
+	if (*leftp < nvp->nvp_datasize || nvp->nvp_datasize == 0) {
+		RESTORE_ERRNO(EINVAL);
+		return (NULL);
+	}
+
+	value = nv_malloc(nvp->nvp_datasize);
+	if (value == NULL)
+		return (NULL);
+
+	memcpy(value, ptr, nvp->nvp_datasize);
+	ptr += nvp->nvp_datasize;
+	*leftp -= nvp->nvp_datasize;
+
+	nvp->nvp_data = (uint64_t)(uintptr_t)value;
+
+	return (ptr);
+}
+
+const unsigned char *
+nvpair_unpack(bool isbe, const unsigned char *ptr, size_t *leftp,
+    nvpair_t **nvpp)
+{
+	nvpair_t *nvp, *tmp;
+
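+	/*
+	 * Allocate room for the largest allowed name up front; once the
+	 * header is unpacked, the allocation is shrunk to the real name
+	 * length with nv_realloc().
+	 */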
+	nvp = nv_calloc(1, sizeof(*nvp) + NV_NAME_MAX);
+	if (nvp == NULL)
+		return (NULL);
+	nvp->nvp_name = (char *)(nvp + 1);
+
+	ptr = nvpair_unpack_header(isbe, nvp, ptr, leftp);
+	if (ptr == NULL)
+		goto failed;
+	tmp = nv_realloc(nvp, sizeof(*nvp) + strlen(nvp->nvp_name) + 1);
+	if (tmp == NULL)
+		goto failed;
+	nvp = tmp;
+
+	/* Update nvp_name after realloc(). */
+	nvp->nvp_name = (char *)(nvp + 1);
+	nvp->nvp_data = 0x00;
+	nvp->nvp_magic = NVPAIR_MAGIC;
+	*nvpp = nvp;
+	return (ptr);
+failed:
+	nv_free(nvp);
+	return (NULL);
+}
+
+int
+nvpair_type(const nvpair_t *nvp)
+{
+
+	NVPAIR_ASSERT(nvp);
+
+	return (nvp->nvp_type);
+}
+
+const char *
+nvpair_name(const nvpair_t *nvp)
+{
+
+	NVPAIR_ASSERT(nvp);
+
+	return (nvp->nvp_name);
+}
+
+static nvpair_t *
+nvpair_allocv(const char *name, int type, uint64_t data, size_t datasize)
+{
+	nvpair_t *nvp;
+	size_t namelen;
+
+	PJDLOG_ASSERT(type >= NV_TYPE_FIRST && type <= NV_TYPE_LAST);
+
+	namelen = strlen(name);
+	if (namelen >= NV_NAME_MAX) {
+		RESTORE_ERRNO(ENAMETOOLONG);
+		return (NULL);
+	}
+
+	nvp = nv_calloc(1, sizeof(*nvp) + namelen + 1);
+	if (nvp != NULL) {
+		nvp->nvp_name = (char *)(nvp + 1);
+		memcpy(nvp->nvp_name, name, namelen);
+		nvp->nvp_name[namelen] = '\0';
+		nvp->nvp_type = type;
+		nvp->nvp_data = data;
+		nvp->nvp_datasize = datasize;
+		nvp->nvp_magic = NVPAIR_MAGIC;
+	}
+
+	return (nvp);
+}
+
+nvpair_t *
+nvpair_create_stringf(const char *name, const char *valuefmt, ...)
+{
+	va_list valueap;
+	nvpair_t *nvp;
+
+	va_start(valueap, valuefmt);
+	nvp = nvpair_create_stringv(name, valuefmt, valueap);
+	va_end(valueap);
+
+	return (nvp);
+}
+
+nvpair_t *
+nvpair_create_stringv(const char *name, const char *valuefmt, va_list valueap)
+{
+	nvpair_t *nvp;
+	char *str;
+	int len;
+
+	len = nv_vasprintf(&str, valuefmt, valueap);
+	if (len < 0)
+		return (NULL);
+	nvp = nvpair_create_string(name, str);
+	if (nvp == NULL)
+		nv_free(str);
+	return (nvp);
+}
+
+nvpair_t *
+nvpair_create_null(const char *name)
+{
+
+	return (nvpair_allocv(name, NV_TYPE_NULL, 0, 0));
+}
+
+nvpair_t *
+nvpair_create_bool(const char *name, bool value)
+{
+
+	return (nvpair_allocv(name, NV_TYPE_BOOL, value ? 1 : 0,
+	    sizeof(uint8_t)));
+}
+
+nvpair_t *
+nvpair_create_number(const char *name, uint64_t value)
+{
+
+	return (nvpair_allocv(name, NV_TYPE_NUMBER, value, sizeof(value)));
+}
+
+nvpair_t *
+nvpair_create_string(const char *name, const char *value)
+{
+	nvpair_t *nvp;
+	size_t size;
+	char *data;
+
+	if (value == NULL) {
+		RESTORE_ERRNO(EINVAL);
+		return (NULL);
+	}
+
+	data = nv_strdup(value);
+	if (data == NULL)
+		return (NULL);
+	size = strlen(value) + 1;
+
+	nvp = nvpair_allocv(name, NV_TYPE_STRING, (uint64_t)(uintptr_t)data,
+	    size);
+	if (nvp == NULL)
+		nv_free(data);
+
+	return (nvp);
+}
+
+nvpair_t *
+nvpair_create_nvlist(const char *name, const nvlist_t *value)
+{
+	nvlist_t *nvl;
+	nvpair_t *nvp;
+
+	if (value == NULL) {
+		RESTORE_ERRNO(EINVAL);
+		return (NULL);
+	}
+
+	nvl = nvlist_clone(value);
+	if (nvl == NULL)
+		return (NULL);
+
+	nvp = nvpair_allocv(name, NV_TYPE_NVLIST, (uint64_t)(uintptr_t)nvl, 0);
+	if (nvp == NULL)
+		nvlist_destroy(nvl);
+	else
+		nvlist_set_parent(nvl, nvp);
+
+	return (nvp);
+}
+
+#ifndef _KERNEL
+nvpair_t *
+nvpair_create_descriptor(const char *name, int value)
+{
+	nvpair_t *nvp;
+
+	if (value < 0 || !fd_is_valid(value)) {
+		errno = EBADF;
+		return (NULL);
+	}
+
+	value = fcntl(value, F_DUPFD_CLOEXEC, 0);
+	if (value < 0)
+		return (NULL);
+
+	nvp = nvpair_allocv(name, NV_TYPE_DESCRIPTOR, (uint64_t)value,
+	    sizeof(int64_t));
+	if (nvp == NULL)
+		close(value);
+
+	return (nvp);
+}
+#endif
+
+nvpair_t *
+nvpair_create_binary(const char *name, const void *value, size_t size)
+{
+	nvpair_t *nvp;
+	void *data;
+
+	if (value == NULL || size == 0) {
+		RESTORE_ERRNO(EINVAL);
+		return (NULL);
+	}
+
+	data = nv_malloc(size);
+	if (data == NULL)
+		return (NULL);
+	memcpy(data, value, size);
+
+	nvp = nvpair_allocv(name, NV_TYPE_BINARY, (uint64_t)(uintptr_t)data,
+	    size);
+	if (nvp == NULL)
+		nv_free(data);
+
+	return (nvp);
+}
+
+nvpair_t *
+nvpair_move_string(const char *name, char *value)
+{
+	nvpair_t *nvp;
+	int serrno;
+
+	if (value == NULL) {
+		RESTORE_ERRNO(EINVAL);
+		return (NULL);
+	}
+
+	nvp = nvpair_allocv(name, NV_TYPE_STRING, (uint64_t)(uintptr_t)value,
+	    strlen(value) + 1);
+	if (nvp == NULL) {
+		SAVE_ERRNO(serrno);
+		nv_free(value);
+		RESTORE_ERRNO(serrno);
+	}
+
+	return (nvp);
+}
+
+nvpair_t *
+nvpair_move_nvlist(const char *name, nvlist_t *value)
+{
+	nvpair_t *nvp;
+
+	if (value == NULL || nvlist_get_nvpair_parent(value) != NULL) {
+		RESTORE_ERRNO(EINVAL);
+		return (NULL);
+	}
+
+	if (nvlist_error(value) != 0) {
+		RESTORE_ERRNO(nvlist_error(value));
+		nvlist_destroy(value);
+		return (NULL);
+	}
+
+	nvp = nvpair_allocv(name, NV_TYPE_NVLIST, (uint64_t)(uintptr_t)value,
+	    0);
+	if (nvp == NULL)
+		nvlist_destroy(value);
+	else
+		nvlist_set_parent(value, nvp);
+
+	return (nvp);
+}
+
+#ifndef _KERNEL
+nvpair_t *
+nvpair_move_descriptor(const char *name, int value)
+{
+	nvpair_t *nvp;
+	int serrno;
+
+	if (value < 0 || !fd_is_valid(value)) {
+		errno = EBADF;
+		return (NULL);
+	}
+
+	nvp = nvpair_allocv(name, NV_TYPE_DESCRIPTOR, (uint64_t)value,
+	    sizeof(int64_t));
+	if (nvp == NULL) {
+		serrno = errno;
+		close(value);
+		errno = serrno;
+	}
+
+	return (nvp);
+}
+#endif
+
+nvpair_t *
+nvpair_move_binary(const char *name, void *value, size_t size)
+{
+	nvpair_t *nvp;
+	int serrno;
+
+	if (value == NULL || size == 0) {
+		RESTORE_ERRNO(EINVAL);
+		return (NULL);
+	}
+
+	nvp = nvpair_allocv(name, NV_TYPE_BINARY, (uint64_t)(uintptr_t)value,
+	    size);
+	if (nvp == NULL) {
+		SAVE_ERRNO(serrno);
+		nv_free(value);
+		RESTORE_ERRNO(serrno);
+	}
+
+	return (nvp);
+}
+
+bool
+nvpair_get_bool(const nvpair_t *nvp)
+{
+
+	NVPAIR_ASSERT(nvp);
+
+	return (nvp->nvp_data == 1);
+}
+
+uint64_t
+nvpair_get_number(const nvpair_t *nvp)
+{
+
+	NVPAIR_ASSERT(nvp);
+
+	return (nvp->nvp_data);
+}
+
+const char *
+nvpair_get_string(const nvpair_t *nvp)
+{
+
+	NVPAIR_ASSERT(nvp);
+	PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_STRING);
+
+	return ((const char *)(intptr_t)nvp->nvp_data);
+}
+
+const nvlist_t *
+nvpair_get_nvlist(const nvpair_t *nvp)
+{
+
+	NVPAIR_ASSERT(nvp);
+	PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_NVLIST);
+
+	return ((const nvlist_t *)(intptr_t)nvp->nvp_data);
+}
+
+#ifndef _KERNEL
+int
+nvpair_get_descriptor(const nvpair_t *nvp)
+{
+
+	NVPAIR_ASSERT(nvp);
+	PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_DESCRIPTOR);
+
+	return ((int)nvp->nvp_data);
+}
+#endif
+
+const void *
+nvpair_get_binary(const nvpair_t *nvp, size_t *sizep)
+{
+
+	NVPAIR_ASSERT(nvp);
+	PJDLOG_ASSERT(nvp->nvp_type == NV_TYPE_BINARY);
+
+	if (sizep != NULL)
+		*sizep = nvp->nvp_datasize;
+	return ((const void *)(intptr_t)nvp->nvp_data);
+}
+
+void
+nvpair_free(nvpair_t *nvp)
+{
+
+	NVPAIR_ASSERT(nvp);
+	PJDLOG_ASSERT(nvp->nvp_list == NULL);
+
+	nvp->nvp_magic = 0;
+	switch (nvp->nvp_type) {
+#ifndef _KERNEL
+	case NV_TYPE_DESCRIPTOR:
+		close((int)nvp->nvp_data);
+		break;
+#endif
+	case NV_TYPE_NVLIST:
+		nvlist_destroy((nvlist_t *)(intptr_t)nvp->nvp_data);
+		break;
+	case NV_TYPE_STRING:
+		nv_free((char *)(intptr_t)nvp->nvp_data);
+		break;
+	case NV_TYPE_BINARY:
+		nv_free((void *)(intptr_t)nvp->nvp_data);
+		break;
+	}
+	nv_free(nvp);
+}
+
+void
+nvpair_free_structure(nvpair_t *nvp)
+{
+
+	NVPAIR_ASSERT(nvp);
+	PJDLOG_ASSERT(nvp->nvp_list == NULL);
+
+	nvp->nvp_magic = 0;
+	nv_free(nvp);
+}
+
+const char *
+nvpair_type_string(int type)
+{
+
+	switch (type) {
+	case NV_TYPE_NULL:
+		return ("NULL");
+	case NV_TYPE_BOOL:
+		return ("BOOL");
+	case NV_TYPE_NUMBER:
+		return ("NUMBER");
+	case NV_TYPE_STRING:
+		return ("STRING");
+	case NV_TYPE_NVLIST:
+		return ("NVLIST");
+	case NV_TYPE_DESCRIPTOR:
+		return ("DESCRIPTOR");
+	case NV_TYPE_BINARY:
+		return ("BINARY");
+	default:
+		return ("<UNKNOWN>");
+	}
+}
+


Property changes on: trunk/sys/kern/subr_nvpair.c
___________________________________________________________________
Added: svn:eol-style
## -0,0 +1 ##
+native
\ No newline at end of property
Added: svn:keywords
## -0,0 +1 ##
+MidnightBSD=%H
\ No newline at end of property
Added: svn:mime-type
## -0,0 +1 ##
+text/plain
\ No newline at end of property

