[Midnightbsd-cvs] src: sys/kern: Sync with FreeBSD

laffer1 at midnightbsd.org
Sat Sep 27 19:02:19 EDT 2008


Log Message:
-----------
Sync with FreeBSD

Modified Files:
--------------
    src/sys/kern:
        Makefile (r1.1.1.1 -> r1.2)
        bus_if.m (r1.1.1.1 -> r1.2)
        imgact_aout.c (r1.2 -> r1.3)
        imgact_elf.c (r1.2 -> r1.3)
        imgact_gzip.c (r1.2 -> r1.3)
        inflate.c (r1.1.1.1 -> r1.2)
        init_main.c (r1.3 -> r1.4)
        init_sysent.c (r1.2 -> r1.3)
        kern_acct.c (r1.1.1.2 -> r1.2)
        kern_alq.c (r1.1.1.1 -> r1.2)
        kern_clock.c (r1.2 -> r1.3)
        kern_condvar.c (r1.2 -> r1.3)
        kern_conf.c (r1.1.1.1 -> r1.2)
        kern_context.c (r1.1.1.1 -> r1.2)
        kern_cpu.c (r1.2 -> r1.3)
        kern_descrip.c (r1.5 -> r1.6)
        kern_environment.c (r1.1.1.1 -> r1.2)
        kern_event.c (r1.3 -> r1.4)
        kern_exec.c (r1.2 -> r1.3)
        kern_exit.c (r1.2 -> r1.3)
        kern_fork.c (r1.2 -> r1.3)
        kern_idle.c (r1.1.1.1 -> r1.2)
        kern_intr.c (r1.2 -> r1.3)
        kern_jail.c (r1.1.1.1 -> r1.2)
        kern_kse.c (r1.3 -> r1.4)
        kern_kthread.c (r1.2 -> r1.3)
        kern_ktr.c (r1.1.1.1 -> r1.2)
        kern_ktrace.c (r1.2 -> r1.3)
        kern_linker.c (r1.1.1.1 -> r1.2)
        kern_lock.c (r1.2 -> r1.3)
        kern_lockf.c (r1.1.1.1 -> r1.2)
        kern_malloc.c (r1.2 -> r1.3)
        kern_mbuf.c (r1.3 -> r1.4)
        kern_mib.c (r1.1.1.1 -> r1.2)
        kern_module.c (r1.1.1.1 -> r1.2)
        kern_mtxpool.c (r1.1.1.1 -> r1.2)
        kern_mutex.c (r1.1.1.2 -> r1.2)
        kern_ntptime.c (r1.1.1.1 -> r1.2)
        kern_pmc.c (r1.1.1.1 -> r1.2)
        kern_poll.c (r1.2 -> r1.3)
        kern_proc.c (r1.2 -> r1.3)
        kern_prot.c (r1.1.1.1 -> r1.2)
        kern_resource.c (r1.1.1.2 -> r1.2)
        kern_shutdown.c (r1.2 -> r1.3)
        kern_sig.c (r1.2 -> r1.3)
        kern_subr.c (r1.1.1.1 -> r1.2)
        kern_switch.c (r1.1.1.1 -> r1.2)
        kern_sx.c (r1.1.1.2 -> r1.2)
        kern_synch.c (r1.2 -> r1.3)
        kern_syscalls.c (r1.1.1.1 -> r1.2)
        kern_sysctl.c (r1.2 -> r1.3)
        kern_tc.c (r1.1.1.1 -> r1.2)
        kern_thr.c (r1.2 -> r1.3)
        kern_thread.c (r1.4 -> r1.5)
        kern_time.c (r1.1.1.2 -> r1.2)
        kern_timeout.c (r1.1.1.1 -> r1.2)
        kern_umtx.c (r1.2 -> r1.3)
        kern_uuid.c (r1.1.1.1 -> r1.2)
        kern_xxx.c (r1.1.1.1 -> r1.2)
        link_elf.c (r1.1.1.2 -> r1.2)
        link_elf_obj.c (r1.1.1.2 -> r1.2)
        makesyscalls.sh (r1.1.1.1 -> r1.2)
        md5c.c (r1.1.1.1 -> r1.2)
        sched_4bsd.c (r1.2 -> r1.3)
        sched_ule.c (r1.1.1.1 -> r1.2)
        subr_autoconf.c (r1.1.1.1 -> r1.2)
        subr_bus.c (r1.1.1.1 -> r1.2)
        subr_clock.c (r1.1.1.1 -> r1.2)
        subr_disk.c (r1.1.1.2 -> r1.2)
        subr_firmware.c (r1.1.1.1 -> r1.2)
        subr_hints.c (r1.1.1.1 -> r1.2)
        subr_kdb.c (r1.1.1.1 -> r1.2)
        subr_kobj.c (r1.1.1.1 -> r1.2)
        subr_mbpool.c (r1.1.1.1 -> r1.2)
        subr_mchain.c (r1.1.1.1 -> r1.2)
        subr_param.c (r1.1.1.1 -> r1.2)
        subr_pcpu.c (r1.1.1.1 -> r1.2)
        subr_power.c (r1.1.1.1 -> r1.2)
        subr_prf.c (r1.2 -> r1.3)
        subr_prof.c (r1.1.1.1 -> r1.2)
        subr_rman.c (r1.1.1.2 -> r1.2)
        subr_sbuf.c (r1.1.1.1 -> r1.2)
        subr_sleepqueue.c (r1.2 -> r1.3)
        subr_smp.c (r1.2 -> r1.3)
        subr_stack.c (r1.1 -> r1.2)
        subr_taskqueue.c (r1.2 -> r1.3)
        subr_trap.c (r1.1.1.1 -> r1.2)
        subr_turnstile.c (r1.1.1.2 -> r1.2)
        subr_unit.c (r1.1.1.1 -> r1.2)
        subr_witness.c (r1.3 -> r1.4)
        sys_generic.c (r1.2 -> r1.3)
        sys_pipe.c (r1.2 -> r1.3)
        sys_process.c (r1.2 -> r1.3)
        sys_socket.c (r1.1.1.1 -> r1.2)
        syscalls.c (r1.2 -> r1.3)
        syscalls.master (r1.2 -> r1.3)
        sysv_ipc.c (r1.1.1.1 -> r1.2)
        sysv_msg.c (r1.1.1.1 -> r1.2)
        sysv_sem.c (r1.1.1.1 -> r1.2)
        sysv_shm.c (r1.1.1.1 -> r1.2)
        tty.c (r1.3 -> r1.4)
        tty_compat.c (r1.1.1.1 -> r1.2)
        tty_cons.c (r1.1.1.1 -> r1.2)
        tty_pty.c (r1.2 -> r1.3)
        tty_tty.c (r1.1.1.1 -> r1.2)
        uipc_cow.c (r1.1.1.1 -> r1.2)
        uipc_domain.c (r1.2 -> r1.3)
        uipc_mbuf.c (r1.5 -> r1.6)
        uipc_mbuf2.c (r1.1.1.1 -> r1.2)
        uipc_sem.c (r1.1.1.2 -> r1.2)
        uipc_socket.c (r1.2 -> r1.3)
        uipc_syscalls.c (r1.3 -> r1.4)
        uipc_usrreq.c (r1.2 -> r1.3)
        vfs_aio.c (r1.3 -> r1.4)
        vfs_bio.c (r1.6 -> r1.7)
        vfs_cache.c (r1.2 -> r1.3)
        vfs_cluster.c (r1.2 -> r1.3)
        vfs_default.c (r1.2 -> r1.3)
        vfs_export.c (r1.1.1.1 -> r1.2)
        vfs_hash.c (r1.1.1.1 -> r1.2)
        vfs_init.c (r1.1.1.1 -> r1.2)
        vfs_lookup.c (r1.2 -> r1.3)
        vfs_mount.c (r1.4 -> r1.5)
        vfs_subr.c (r1.2 -> r1.3)
        vfs_syscalls.c (r1.2 -> r1.3)
        vfs_vnops.c (r1.2 -> r1.3)
        vnode_if.src (r1.1.1.1 -> r1.2)

Added Files:
-----------
    src/sys/kern:
        kern_priv.c (r1.1)
        kern_rwlock.c (r1.1)
        ksched.c (r1.1)
        p1003_1b.c (r1.1)
        posix4_mib.c (r1.1)
        serdev_if.m (r1.1)
        subr_acl_posix1e.c (r1.1)
        subr_fattime.c (r1.1)
        subr_lock.c (r1.1)
        subr_rtc.c (r1.1)
        systrace_args.c (r1.1)
        tty_pts.c (r1.1)
        uipc_debug.c (r1.1)
        uipc_mqueue.c (r1.1)
        uipc_sockbuf.c (r1.1)
        vfs_acl.c (r1.1)
        vfs_extattr.c (r1.1)

Removed Files:
-------------
    src/sys/kern:
        kern_acl.c
        kern_mac.c
        uipc_proto.c
        uipc_socket2.c

-------------- next part --------------
--- /dev/null
+++ sys/kern/uipc_debug.c
@@ -0,0 +1,522 @@
+/*-
+ * Copyright (c) 2007 Robert N. M. Watson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Debugger routines relating to sockets, protocols, etc, for use in DDB.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: src/sys/kern/uipc_debug.c,v 1.2 2007/05/03 14:42:41 rwatson Exp $");
+
+#include "opt_ddb.h"
+
+#include <sys/param.h>
+#include <sys/domain.h>
+#include <sys/kernel.h>
+#include <sys/protosw.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+
+#ifdef DDB
+#include <ddb/ddb.h>
+
+static void
+db_print_sotype(short so_type)
+{
+
+	switch (so_type) {
+	case SOCK_STREAM:
+		db_printf("SOCK_STREAM");
+		break;
+
+	case SOCK_DGRAM:
+		db_printf("SOCK_DGRAM");
+		break;
+
+	case SOCK_RAW:
+		db_printf("SOCK_RAW");
+		break;
+
+	case SOCK_RDM:
+		db_printf("SOCK_RDM");
+		break;
+
+	case SOCK_SEQPACKET:
+		db_printf("SOCK_SEQPACKET");
+		break;
+
+	default:
+		db_printf("unknown");
+		break;
+	}
+}
+
+static void
+db_print_sooptions(short so_options)
+{
+	int comma;
+
+	comma = 0;
+	if (so_options & SO_DEBUG) {
+		db_printf("%sSO_DEBUG", comma ? ", " : "");
+		comma = 1;
+	}
+	if (so_options & SO_ACCEPTCONN) {
+		db_printf("%sSO_ACCEPTCONN", comma ? ", " : "");
+		comma = 1;
+	}
+	if (so_options & SO_REUSEADDR) {
+		db_printf("%sSO_REUSEADDR", comma ? ", " : "");
+		comma = 1;
+	}
+	if (so_options & SO_KEEPALIVE) {
+		db_printf("%sSO_KEEPALIVE", comma ? ", " : "");
+		comma = 1;
+	}
+	if (so_options & SO_DONTROUTE) {
+		db_printf("%sSO_DONTROUTE", comma ? ", " : "");
+		comma = 1;
+	}
+	if (so_options & SO_BROADCAST) {
+		db_printf("%sSO_BROADCAST", comma ? ", " : "");
+		comma = 1;
+	}
+	if (so_options & SO_USELOOPBACK) {
+		db_printf("%sSO_USELOOPBACK", comma ? ", " : "");
+		comma = 1;
+	}
+	if (so_options & SO_LINGER) {
+		db_printf("%sSO_LINGER", comma ? ", " : "");
+		comma = 1;
+	}
+	if (so_options & SO_OOBINLINE) {
+		db_printf("%sSO_OOBINLINE", comma ? ", " : "");
+		comma = 1;
+	}
+	if (so_options & SO_REUSEPORT) {
+		db_printf("%sSO_REUSEPORT", comma ? ", " : "");
+		comma = 1;
+	}
+	if (so_options & SO_TIMESTAMP) {
+		db_printf("%sSO_TIMESTAMP", comma ? ", " : "");
+		comma = 1;
+	}
+	if (so_options & SO_NOSIGPIPE) {
+		db_printf("%sSO_NOSIGPIPE", comma ? ", " : "");
+		comma = 1;
+	}
+	if (so_options & SO_ACCEPTFILTER) {
+		db_printf("%sSO_ACCEPTFILTER", comma ? ", " : "");
+		comma = 1;
+	}
+	if (so_options & SO_BINTIME) {
+		db_printf("%sSO_BINTIME", comma ? ", " : "");
+		comma = 1;
+	}
+}
+
+static void
+db_print_sostate(short so_state)
+{
+	int comma;
+
+	comma = 0;
+	if (so_state & SS_NOFDREF) {
+		db_printf("%sSS_FDREF", comma ? ", " : "");
+		comma = 1;
+	}
+	if (so_state & SS_ISCONNECTED) {
+		db_printf("%sSS_ISCONNECTED", comma ? ", " : "");
+		comma = 1;
+	}
+	if (so_state & SS_ISCONNECTING) {
+		db_printf("%sSS_ISCONNECTING", comma ? ", " : "");
+		comma = 1;
+	}
+	if (so_state & SS_ISDISCONNECTING) {
+		db_printf("%sSS_ISDISCONNECTING", comma ? ", " : "");
+		comma = 1;
+	}
+	if (so_state & SS_NBIO) {
+		db_printf("%sSS_NBIO", comma ? ", " : "");
+		comma = 1;
+	}
+	if (so_state & SS_ASYNC) {
+		db_printf("%sSS_ASYNC", comma ? ", " : "");
+		comma = 1;
+	}
+	if (so_state & SS_ISCONFIRMING) {
+		db_printf("%sSS_ISCONFIRMING", comma ? ", " : "");
+		comma = 1;
+	}
+	comma = 0;
+	if (so_state & SS_PROTOREF) {
+		db_printf("%sSS_PROTOREF", comma ? ", " : "");
+		comma = 1;
+	}
+}
+
+static void
+db_print_soqstate(int so_qstate)
+{
+	int comma;
+
+	comma = 0;
+	if (so_qstate & SQ_INCOMP) {
+		db_printf("%sSQ_INCOMP", comma ? ", " : "");
+		comma = 1;
+	}
+	if (so_qstate & SQ_COMP) {
+		db_printf("%sSQ_COMP", comma ? ", " : "");
+		comma = 1;
+	}
+}
+
+static void
+db_print_sbstate(short sb_state)
+{
+	int comma;
+
+	comma = 0;
+	if (sb_state & SBS_CANTSENDMORE) {
+		db_printf("%sSS_CANTSENDMORE", comma ? ", " : "");
+		comma = 1;
+	}
+	if (sb_state & SBS_CANTRCVMORE) {
+		db_printf("%sSS_CANTRCVMORE", comma ? ", " : "");
+		comma = 1;
+	}
+	if (sb_state & SBS_RCVATMARK) {
+		db_printf("%sSS_RCVATMARK", comma ? ", " : "");
+		comma = 1;
+	}
+}
+
+static void
+db_print_indent(int indent)
+{
+	int i;
+
+	for (i = 0; i < indent; i++)
+		db_printf(" ");
+}
+
+static void
+db_print_domain(struct domain *d, const char *domainname, int indent)
+{
+
+	db_print_indent(indent);
+	db_printf("%s at %p\n", domainname, d);
+
+	indent += 2;
+
+	db_print_indent(indent);
+	db_printf("dom_family: %d   ", d->dom_family);
+	db_printf("dom_name: %s\n", d->dom_name);
+
+	db_print_indent(indent);
+	db_printf("dom_init: %p   ", d->dom_init);
+	db_printf("dom_externalize: %p   ", d->dom_externalize);
+	db_printf("dom_dispose: %p\n", d->dom_dispose);
+
+	db_print_indent(indent);
+	db_printf("dom_protosw: %p   ", d->dom_protosw);
+	db_printf("dom_next: %p\n", d->dom_next);
+
+	db_print_indent(indent);
+	db_printf("dom_rtattach: %p   ", d->dom_rtattach);
+	db_printf("dom_rtoffset: %d   ", d->dom_rtoffset);
+	db_printf("dom_maxrtkey: %d\n", d->dom_maxrtkey);
+
+	db_print_indent(indent);
+	db_printf("dom_ifattach: %p   ", d->dom_ifattach);
+	db_printf("dom_ifdetach: %p\n", d->dom_ifdetach);
+}
+
+static void
+db_print_prflags(short pr_flags)
+{
+	int comma;
+
+	comma = 0;
+	if (pr_flags & PR_ATOMIC) {
+		db_printf("%sPR_ATOMIC", comma ? ", " : "");
+		comma = 1;
+	}
+	if (pr_flags & PR_ADDR) {
+		db_printf("%sPR_ADDR", comma ? ", " : "");
+		comma = 1;
+	}
+	if (pr_flags & PR_CONNREQUIRED) {
+		db_printf("%sPR_CONNREQUIRED", comma ? ", " : "");
+		comma = 1;
+	}
+	if (pr_flags & PR_WANTRCVD) {
+		db_printf("%sPR_WANTRCVD", comma ? ", " : "");
+		comma = 1;
+	}
+	if (pr_flags & PR_RIGHTS) {
+		db_printf("%sPR_RIGHTS", comma ? ", " : "");
+		comma = 1;
+	}
+	if (pr_flags & PR_IMPLOPCL) {
+		db_printf("%sPR_IMPLOPCL", comma ? ", " : "");
+		comma = 1;
+	}
+	if (pr_flags & PR_LASTHDR) {
+		db_printf("%sPR_LASTHDR", comma ? ", " : "");
+		comma = 1;
+	}
+}
+
+static void
+db_print_protosw(struct protosw *pr, const char *prname, int indent)
+{
+
+	db_print_indent(indent);
+	db_printf("%s at %p\n", prname, pr);
+
+	indent += 2;
+
+	db_print_indent(indent);
+	db_printf("pr_type: %d   ", pr->pr_type);
+	db_printf("pr_domain: %p\n", pr->pr_domain);
+	if (pr->pr_domain != NULL)
+		db_print_domain(pr->pr_domain, "pr_domain", indent);
+
+	db_print_indent(indent);
+	db_printf("pr_protocol: %d\n", pr->pr_protocol);
+
+	db_print_indent(indent);
+	db_printf("pr_flags: %d (", pr->pr_flags);
+	db_print_prflags(pr->pr_flags);
+	db_printf(")\n");
+
+	db_print_indent(indent);
+	db_printf("pr_input: %p   ", pr->pr_input);
+	db_printf("pr_output: %p   ", pr->pr_output);
+	db_printf("pr_ctlinput: %p\n", pr->pr_ctlinput);
+
+	db_print_indent(indent);
+	db_printf("pr_ctloutput: %p   ", pr->pr_ctloutput);
+	db_printf("pr_ousrreq: %p   ", pr->pr_ousrreq);
+	db_printf("pr_init: %p\n", pr->pr_init);
+
+	db_print_indent(indent);
+	db_printf("pr_fasttimo: %p   ", pr->pr_fasttimo);
+	db_printf("pr_slowtimo: %p   ", pr->pr_slowtimo);
+	db_printf("pr_drain: %p\n", pr->pr_drain);
+
+	db_print_indent(indent);
+	db_printf("pr_ousrreq: %p\n", pr->pr_ousrreq);
+}
+
+static void
+db_print_sbflags(short sb_flags)
+{
+	int comma;
+
+	comma = 0;
+	if (sb_flags & SB_WAIT) {
+		db_printf("%sSB_WAIT", comma ? ", " : "");
+		comma = 1;
+	}
+	if (sb_flags & SB_SEL) {
+		db_printf("%sSB_SEL", comma ? ", " : "");
+		comma = 1;
+	}
+	if (sb_flags & SB_ASYNC) {
+		db_printf("%sSB_ASYNC", comma ? ", " : "");
+		comma = 1;
+	}
+	if (sb_flags & SB_UPCALL) {
+		db_printf("%sSB_UPCALL", comma ? ", " : "");
+		comma = 1;
+	}
+	if (sb_flags & SB_NOINTR) {
+		db_printf("%sSB_NOINTR", comma ? ", " : "");
+		comma = 1;
+	}
+	if (sb_flags & SB_AIO) {
+		db_printf("%sSB_AIO", comma ? ", " : "");
+		comma = 1;
+	}
+	if (sb_flags & SB_KNOTE) {
+		db_printf("%sSB_KNOTE", comma ? ", " : "");
+		comma = 1;
+	}
+	if (sb_flags & SB_AUTOSIZE) {
+		db_printf("%sSB_AUTOSIZE", comma ? ", " : "");
+		comma = 1;
+	}
+}
+
+static void
+db_print_sockbuf(struct sockbuf *sb, const char *sockbufname, int indent)
+{
+
+	db_print_indent(indent);
+	db_printf("%s at %p\n", sockbufname, sb);
+
+	indent += 2;
+
+	db_print_indent(indent);
+	db_printf("sb_state: 0x%x (", sb->sb_state);
+	db_print_sbstate(sb->sb_state);
+	db_printf(")\n");
+
+	db_print_indent(indent);
+	db_printf("sb_mb: %p   ", sb->sb_mb);
+	db_printf("sb_mbtail: %p   ", sb->sb_mbtail);
+	db_printf("sb_lastrecord: %p\n", sb->sb_lastrecord);
+
+	db_print_indent(indent);
+	db_printf("sb_cc: %d   ", sb->sb_cc);
+	db_printf("sb_hiwat: %d   ", sb->sb_hiwat);
+	db_printf("sb_mbcnt: %d   ", sb->sb_mbcnt);
+	db_printf("sb_mbmax: %d\n", sb->sb_mbmax);
+
+	db_print_indent(indent);
+	db_printf("sb_ctl: %d   ", sb->sb_ctl);
+	db_printf("sb_lowat: %d   ", sb->sb_lowat);
+	db_printf("sb_timeo: %d\n", sb->sb_timeo);
+
+	db_print_indent(indent);
+	db_printf("sb_flags: 0x%x (", sb->sb_flags);
+	db_print_sbflags(sb->sb_flags);
+	db_printf(")\n");
+}
+
+static void
+db_print_socket(struct socket *so, const char *socketname, int indent)
+{
+
+	db_print_indent(indent);
+	db_printf("%s at %p\n", socketname, so);
+
+	indent += 2;
+
+	db_print_indent(indent);
+	db_printf("so_count: %d   ", so->so_count);
+	db_printf("so_type: %d (", so->so_type);
+	db_print_sotype(so->so_type);
+	db_printf(")\n");
+
+	db_print_indent(indent);
+	db_printf("so_options: 0x%x (", so->so_options);
+	db_print_sooptions(so->so_options);
+	db_printf(")\n");
+
+	db_print_indent(indent);
+	db_printf("so_linger: %d   ", so->so_linger);
+	db_printf("so_state: 0x%x (", so->so_state);
+	db_print_sostate(so->so_state);
+	db_printf(")\n");
+
+	db_print_indent(indent);
+	db_printf("so_qstate: 0x%x (", so->so_qstate);
+	db_print_soqstate(so->so_qstate);
+	db_printf(")   ");
+	db_printf("so_pcb: %p   ", so->so_pcb);
+	db_printf("so_proto: %p\n", so->so_proto);
+
+	if (so->so_proto != NULL)
+		db_print_protosw(so->so_proto, "so_proto", indent);
+
+	db_print_indent(indent);
+	db_printf("so_head: %p   ", so->so_head);
+	db_printf("so_incomp first: %p   ", TAILQ_FIRST(&so->so_incomp));
+	db_printf("so_comp first: %p\n", TAILQ_FIRST(&so->so_comp));
+
+	db_print_indent(indent);
+	/* so_list skipped */
+	db_printf("so_qlen: %d   ", so->so_qlen);
+	db_printf("so_incqlen: %d   ", so->so_incqlen);
+	db_printf("so_qlimit: %d   ", so->so_qlimit);
+	db_printf("so_timeo: %d   ", so->so_timeo);
+	db_printf("so_error: %d\n", so->so_error);
+
+	db_print_indent(indent);
+	db_printf("so_sigio: %p   ", so->so_sigio);
+	db_printf("so_oobmark: %lu   ", so->so_oobmark);
+	db_printf("so_aiojobq first: %p\n", TAILQ_FIRST(&so->so_aiojobq));
+
+	db_print_sockbuf(&so->so_rcv, "so_rcv", indent);
+	db_print_sockbuf(&so->so_snd, "so_snd", indent);
+}
+
+DB_SHOW_COMMAND(socket, db_show_socket)
+{
+	struct socket *so;
+
+	if (!have_addr) {
+		db_printf("usage: show socket <addr>\n");
+		return;
+	}
+	so = (struct socket *)addr;
+
+	db_print_socket(so, "socket", 0);
+}
+
+DB_SHOW_COMMAND(sockbuf, db_show_sockbuf)
+{
+	struct sockbuf *sb;
+
+	if (!have_addr) {
+		db_printf("usage: show sockbuf <addr>\n");
+		return;
+	}
+	sb = (struct sockbuf *)addr;
+
+	db_print_sockbuf(sb, "sockbuf", 0);
+}
+
+DB_SHOW_COMMAND(protosw, db_show_protosw)
+{
+	struct protosw *pr;
+
+	if (!have_addr) {
+		db_printf("usage: show protosw <addr>\n");
+		return;
+	}
+	pr = (struct protosw *)addr;
+
+	db_print_protosw(pr, "protosw", 0);
+}
+
+DB_SHOW_COMMAND(domain, db_show_domain)
+{
+	struct domain *d;
+
+	if (!have_addr) {
+		db_printf("usage: show protosw <addr>\n");
+		return;
+	}
+	d = (struct domain *)addr;
+
+	db_print_domain(d, "domain", 0);
+}
+#endif
Index: subr_smp.c
===================================================================
RCS file: /home/cvs/src/sys/kern/subr_smp.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/subr_smp.c -L sys/kern/subr_smp.c -u -r1.2 -r1.3
--- sys/kern/subr_smp.c
+++ sys/kern/subr_smp.c
@@ -33,9 +33,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/subr_smp.c,v 1.196 2005/06/30 03:38:10 peter Exp $");
-
-#include "opt_kdb.h"
+__FBSDID("$FreeBSD: src/sys/kern/subr_smp.c,v 1.201 2007/09/11 22:54:09 attilio Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -49,6 +47,7 @@
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 
+#include <machine/cpu.h>
 #include <machine/smp.h>
 
 #include "opt_sched.h"
@@ -109,7 +108,7 @@
 static void (*smp_rv_action_func)(void *arg);
 static void (*smp_rv_teardown_func)(void *arg);
 static void *smp_rv_func_arg;
-static volatile int smp_rv_waiters[2];
+static volatile int smp_rv_waiters[3];
 
 /* 
  * Shared mutex to restrict busywaits between smp_rendezvous() and
@@ -145,11 +144,11 @@
 
 	mtx_init(&smp_ipi_mtx, "smp rendezvous", NULL, MTX_SPIN);
 	cpu_mp_start();
-	printf("MidnightBSD/SMP: Multiprocessor System Detected: %d CPUs\n",
+	printf("FreeBSD/SMP: Multiprocessor System Detected: %d CPUs\n",
 	    mp_ncpus);
 	cpu_mp_announce();
 }
-SYSINIT(cpu_mp, SI_SUB_CPU, SI_ORDER_SECOND, mp_start, NULL)
+SYSINIT(cpu_mp, SI_SUB_CPU, SI_ORDER_THIRD, mp_start, NULL)
 
 void
 forward_signal(struct thread *td)
@@ -161,7 +160,7 @@
 	 * this thread, so all we need to do is poke it if it is currently
 	 * executing so that it executes ast().
 	 */
-	mtx_assert(&sched_lock, MA_OWNED);
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	KASSERT(TD_IS_RUNNING(td),
 	    ("forward_signal: thread is not TDS_RUNNING"));
 
@@ -189,8 +188,6 @@
 	struct thread *td;
 	cpumask_t id, map, me;
 
-	mtx_assert(&sched_lock, MA_OWNED);
-
 	CTR0(KTR_SMP, "forward_roundrobin()");
 
 	if (!smp_started || cold || panicstr)
@@ -203,7 +200,7 @@
 		td = pc->pc_curthread;
 		id = pc->pc_cpumask;
 		if (id != me && (id & stopped_cpus) == 0 &&
-		    td != pc->pc_idlethread) {
+		    !TD_IS_IDLETHREAD(td)) {
 			td->td_flags |= TDF_NEEDRESCHED;
 			map |= id;
 		}
@@ -242,37 +239,9 @@
 	ipi_selected(map, IPI_STOP);
 
 	i = 0;
-	while ((atomic_load_acq_int(&stopped_cpus) & map) != map) {
-		/* spin */
-		i++;
-#ifdef DIAGNOSTIC
-		if (i == 100000) {
-			printf("timeout stopping cpus\n");
-			break;
-		}
-#endif
-	}
-
-	return 1;
-}
-
-#ifdef KDB_STOP_NMI
-int
-stop_cpus_nmi(cpumask_t map)
-{
-	int i;
-
-	if (!smp_started)
-		return 0;
-
-	CTR1(KTR_SMP, "stop_cpus(%x)", map);
-
-	/* send the stop IPI to all CPUs in map */
-	ipi_nmi_selected(map);
-
-	i = 0;
-	while ((atomic_load_acq_int(&stopped_cpus) & map) != map) {
+	while ((stopped_cpus & map) != map) {
 		/* spin */
+		cpu_spinwait();
 		i++;
 #ifdef DIAGNOSTIC
 		if (i == 100000) {
@@ -284,7 +253,6 @@
 
 	return 1;
 }
-#endif /* KDB_STOP_NMI */
 
 /*
  * Called by a CPU to restart stopped CPUs. 
@@ -312,8 +280,8 @@
 	atomic_store_rel_int(&started_cpus, map);
 
 	/* wait for each to clear its bit */
-	while ((atomic_load_acq_int(&stopped_cpus) & map) != 0)
-		;	/* nothing */
+	while ((stopped_cpus & map) != 0)
+		cpu_spinwait();
 
 	return 1;
 }
@@ -331,20 +299,29 @@
 smp_rendezvous_action(void)
 {
 
+	/* Ensure we have up-to-date values. */
+	atomic_add_acq_int(&smp_rv_waiters[0], 1);
+	while (smp_rv_waiters[0] < mp_ncpus)
+		cpu_spinwait();
+
 	/* setup function */
 	if (smp_rv_setup_func != NULL)
 		smp_rv_setup_func(smp_rv_func_arg);
+
 	/* spin on entry rendezvous */
-	atomic_add_int(&smp_rv_waiters[0], 1);
-	while (atomic_load_acq_int(&smp_rv_waiters[0]) < mp_ncpus)
-		;	/* nothing */
+	atomic_add_int(&smp_rv_waiters[1], 1);
+	while (smp_rv_waiters[1] < mp_ncpus)
+		cpu_spinwait();
+
 	/* action function */
 	if (smp_rv_action_func != NULL)
 		smp_rv_action_func(smp_rv_func_arg);
+
 	/* spin on exit rendezvous */
-	atomic_add_int(&smp_rv_waiters[1], 1);
-	while (atomic_load_acq_int(&smp_rv_waiters[1]) < mp_ncpus)
-		;	/* nothing */
+	atomic_add_int(&smp_rv_waiters[2], 1);
+	while (smp_rv_waiters[2] < mp_ncpus)
+		cpu_spinwait();
+
 	/* teardown function */
 	if (smp_rv_teardown_func != NULL)
 		smp_rv_teardown_func(smp_rv_func_arg);
@@ -375,8 +352,9 @@
 	smp_rv_action_func = action_func;
 	smp_rv_teardown_func = teardown_func;
 	smp_rv_func_arg = arg;
-	smp_rv_waiters[0] = 0;
 	smp_rv_waiters[1] = 0;
+	smp_rv_waiters[2] = 0;
+	atomic_store_rel_int(&smp_rv_waiters[0], 0);
 
 	/* signal other processors, which will enter the IPI with interrupts off */
 	ipi_all_but_self(IPI_RENDEZVOUS);
Index: subr_mbpool.c
===================================================================
RCS file: /home/cvs/src/sys/kern/subr_mbpool.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/subr_mbpool.c -L sys/kern/subr_mbpool.c -u -r1.1.1.1 -r1.2
--- sys/kern/subr_mbpool.c
+++ sys/kern/subr_mbpool.c
@@ -28,7 +28,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/subr_mbpool.c,v 1.3 2005/01/06 23:35:39 imp Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/subr_mbpool.c,v 1.4 2007/05/27 17:38:36 rwatson Exp $");
 
 #include <sys/param.h>
 #include <sys/lock.h>
@@ -338,7 +338,7 @@
 	}
 	mtx_lock(&p->free_lock);
 	SLIST_FOREACH(cf, &p->free_list, link)
-		*free++;
+		(*free)++;
 	mtx_unlock(&p->free_lock);
 }
 
Index: kern_conf.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_conf.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/kern_conf.c -L sys/kern/kern_conf.c -u -r1.1.1.1 -r1.2
--- sys/kern/kern_conf.c
+++ sys/kern/kern_conf.c
@@ -25,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_conf.c,v 1.186.2.5 2005/11/06 15:58:06 jhb Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_conf.c,v 1.208.2.1 2007/12/07 03:45:16 thompsa Exp $");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
@@ -39,9 +39,11 @@
 #include <sys/vnode.h>
 #include <sys/queue.h>
 #include <sys/poll.h>
+#include <sys/sx.h>
 #include <sys/ctype.h>
 #include <sys/tty.h>
 #include <sys/ucred.h>
+#include <sys/taskqueue.h>
 #include <machine/stdarg.h>
 
 #include <fs/devfs/devfs_int.h>
@@ -50,9 +52,15 @@
 
 struct mtx devmtx;
 static void destroy_devl(struct cdev *dev);
-static struct cdev *make_dev_credv(struct cdevsw *devsw, int minornr,
-	    struct ucred *cr, uid_t uid, gid_t gid, int mode, const char *fmt,
-	    va_list ap);
+static int destroy_dev_sched_cbl(struct cdev *dev,
+    void (*cb)(void *), void *arg);
+static struct cdev *make_dev_credv(int flags,
+    struct cdevsw *devsw, int minornr,
+    struct ucred *cr, uid_t uid, gid_t gid, int mode, const char *fmt,
+    va_list ap);
+
+static struct cdev_priv_list cdevp_free_list =
+    TAILQ_HEAD_INITIALIZER(cdevp_free_list);
 
 void
 dev_lock(void)
@@ -61,6 +69,31 @@
 	mtx_lock(&devmtx);
 }
 
+static void
+dev_unlock_and_free(void)
+{
+	struct cdev_priv *cdp;
+
+	mtx_assert(&devmtx, MA_OWNED);
+	while ((cdp = TAILQ_FIRST(&cdevp_free_list)) != NULL) {
+		TAILQ_REMOVE(&cdevp_free_list, cdp, cdp_list);
+		mtx_unlock(&devmtx);
+		devfs_free(&cdp->cdp_c);
+		mtx_lock(&devmtx);
+	}
+	mtx_unlock(&devmtx);
+}
+
+static void
+dev_free_devlocked(struct cdev *cdev)
+{
+	struct cdev_priv *cdp;
+
+	mtx_assert(&devmtx, MA_OWNED);
+	cdp = cdev->si_priv;
+	TAILQ_INSERT_HEAD(&cdevp_free_list, cdp, cdp_list);
+}
+
 void
 dev_unlock(void)
 {
@@ -102,7 +135,7 @@
 		;
 	else 
 #endif
-if (dev->si_devsw == NULL && dev->si_refcount == 0) {
+	if (dev->si_devsw == NULL && dev->si_refcount == 0) {
 		LIST_REMOVE(dev, si_list);
 		flag = 1;
 	}
@@ -115,12 +148,40 @@
 dev_refthread(struct cdev *dev)
 {
 	struct cdevsw *csw;
+	struct cdev_priv *cdp;
 
 	mtx_assert(&devmtx, MA_NOTOWNED);
 	dev_lock();
 	csw = dev->si_devsw;
-	if (csw != NULL)
-		dev->si_threadcount++;
+	if (csw != NULL) {
+		cdp = dev->si_priv;
+		if ((cdp->cdp_flags & CDP_SCHED_DTR) == 0)
+			dev->si_threadcount++;
+		else
+			csw = NULL;
+	}
+	dev_unlock();
+	return (csw);
+}
+
+struct cdevsw *
+devvn_refthread(struct vnode *vp, struct cdev **devp)
+{
+	struct cdevsw *csw;
+	struct cdev_priv *cdp;
+
+	mtx_assert(&devmtx, MA_NOTOWNED);
+	csw = NULL;
+	dev_lock();
+	*devp = vp->v_rdev;
+	if (*devp != NULL) {
+		cdp = (*devp)->si_priv;
+		if ((cdp->cdp_flags & CDP_SCHED_DTR) == 0) {
+			csw = (*devp)->si_devsw;
+			if (csw != NULL)
+				(*devp)->si_threadcount++;
+		}
+	}
 	dev_unlock();
 	return (csw);
 }
@@ -246,13 +307,13 @@
 }
 
 static int
-giant_fdopen(struct cdev *dev, int oflags, struct thread *td, int fdidx)
+giant_fdopen(struct cdev *dev, int oflags, struct thread *td, struct file *fp)
 {
 	int retval;
 
 	mtx_lock(&Giant);
 	retval = dev->si_devsw->d_gianttrick->
-	    d_fdopen(dev, oflags, td, fdidx);
+	    d_fdopen(dev, oflags, td, fp);
 	mtx_unlock(&Giant);
 	return (retval);
 }
@@ -399,7 +460,7 @@
 	udev = y;
 	LIST_FOREACH(si2, &csw->d_devs, si_list) {
 		if (si2->si_drv0 == udev) {
-			devfs_free(si);
+			dev_free_devlocked(si);
 			return (si2);
 		}
 	}
@@ -449,7 +510,8 @@
 	if (devsw->d_version != D_VERSION_01) {
 		printf(
 		    "WARNING: Device driver \"%s\" has wrong version %s\n",
-		    devsw->d_name, "and is disabled.  Recompile KLD module.");
+		    devsw->d_name == NULL ? "???" : devsw->d_name,
+		    "and is disabled.  Recompile KLD module.");
 		devsw->d_open = dead_open;
 		devsw->d_close = dead_close;
 		devsw->d_read = dead_read;
@@ -507,8 +569,9 @@
 	dev_unlock();
 }
 
-static struct cdev *
-make_dev_credv(struct cdevsw *devsw, int minornr, struct ucred *cr, uid_t uid,
+struct cdev *
+make_dev_credv(int flags, struct cdevsw *devsw, int minornr,
+    struct ucred *cr, uid_t uid,
     gid_t gid, int mode, const char *fmt, va_list ap)
 {
 	struct cdev *dev;
@@ -522,6 +585,8 @@
 	dev = devfs_alloc();
 	dev_lock();
 	dev = newdev(devsw, minornr, dev);
+	if (flags & MAKEDEV_REF)
+		dev_refl(dev);
 	if (dev->si_flags & SI_CHEAPCLONE &&
 	    dev->si_flags & SI_NAMED) {
 		/*
@@ -529,7 +594,7 @@
 		 * simplifies cloning devices.
 		 * XXX: still ??
 		 */
-		dev_unlock();
+		dev_unlock_and_free();
 		return (dev);
 	}
 	KASSERT(!(dev->si_flags & SI_NAMED),
@@ -543,15 +608,18 @@
 	}
 		
 	dev->si_flags |= SI_NAMED;
+#ifdef MAC
 	if (cr != NULL)
 		dev->si_cred = crhold(cr);
 	else
+#endif
 		dev->si_cred = NULL;
 	dev->si_uid = uid;
 	dev->si_gid = gid;
 	dev->si_mode = mode;
 
 	devfs_create(dev);
+	clean_unrhdrl(devfs_inos);
 	dev_unlock();
 	return (dev);
 }
@@ -564,7 +632,7 @@
 	va_list ap;
 
 	va_start(ap, fmt);
-	dev = make_dev_credv(devsw, minornr, NULL, uid, gid, mode, fmt, ap);
+	dev = make_dev_credv(0, devsw, minornr, NULL, uid, gid, mode, fmt, ap);
 	va_end(ap);
 	return (dev);
 }
@@ -577,7 +645,23 @@
 	va_list ap;
 
 	va_start(ap, fmt);
-	dev = make_dev_credv(devsw, minornr, cr, uid, gid, mode, fmt, ap);
+	dev = make_dev_credv(0, devsw, minornr, cr, uid, gid, mode, fmt, ap);
+	va_end(ap);
+
+	return (dev);
+}
+
+struct cdev *
+make_dev_credf(int flags, struct cdevsw *devsw, int minornr,
+    struct ucred *cr, uid_t uid,
+    gid_t gid, int mode, const char *fmt, ...)
+{
+	struct cdev *dev;
+	va_list ap;
+
+	va_start(ap, fmt);
+	dev = make_dev_credv(flags, devsw, minornr, cr, uid, gid, mode,
+	    fmt, ap);
 	va_end(ap);
 
 	return (dev);
@@ -622,6 +706,7 @@
 	va_end(ap);
 
 	devfs_create(dev);
+	clean_unrhdrl(devfs_inos);
 	dev_unlock();
 	dev_depends(pdev, dev);
 	return (dev);
@@ -635,7 +720,7 @@
 	mtx_assert(&devmtx, MA_OWNED);
 	KASSERT(dev->si_flags & SI_NAMED,
 	    ("WARNING: Driver mistake: destroy_dev on %d\n", minor(dev)));
-		
+
 	devfs_destroy(dev);
 
 	/* Remove name marking */
@@ -657,16 +742,20 @@
 		dev->si_flags &= ~SI_CLONELIST;
 	}
 
+	dev->si_refcount++;	/* Avoid race with dev_rel() */
 	csw = dev->si_devsw;
 	dev->si_devsw = NULL;	/* already NULL for SI_ALIAS */
 	while (csw != NULL && csw->d_purge != NULL && dev->si_threadcount) {
-		printf("Purging %lu threads from %s\n",
-		    dev->si_threadcount, devtoname(dev));
 		csw->d_purge(dev);
 		msleep(csw, &devmtx, PRIBIO, "devprg", hz/10);
+		if (dev->si_threadcount)
+			printf("Still %lu threads in %s\n",
+			    dev->si_threadcount, devtoname(dev));
+	}
+	while (dev->si_threadcount != 0) {
+		/* Use unique dummy wait ident */
+		msleep(&csw, &devmtx, PRIBIO, "devdrn", hz / 10);
 	}
-	if (csw != NULL && csw->d_purge != NULL)
-		printf("All threads purged from %s\n", devtoname(dev));
 
 	dev->si_drv1 = 0;
 	dev->si_drv2 = 0;
@@ -677,15 +766,18 @@
 		LIST_REMOVE(dev, si_list);
 
 		/* If cdevsw has no more struct cdev *'s, clean it */
-		if (LIST_EMPTY(&csw->d_devs))
+		if (LIST_EMPTY(&csw->d_devs)) {
 			fini_cdevsw(csw);
+			wakeup(&csw->d_devs);
+		}
 	}
 	dev->si_flags &= ~SI_ALIAS;
+	dev->si_refcount--;	/* Avoid race with dev_rel() */
 
 	if (dev->si_refcount > 0) {
 		LIST_INSERT_HEAD(&dead_cdevsw.d_devs, dev, si_list);
 	} else {
-		devfs_free(dev);
+		dev_free_devlocked(dev);
 	}
 }
 
@@ -695,7 +787,7 @@
 
 	dev_lock();
 	destroy_devl(dev);
-	dev_unlock();
+	dev_unlock_and_free();
 }
 
 const char *
@@ -779,7 +871,7 @@
 }
 
 int
-clone_create(struct clonedevs **cdp, struct cdevsw *csw, int *up, struct cdev **dp, u_int extra)
+clone_create(struct clonedevs **cdp, struct cdevsw *csw, int *up, struct cdev **dp, int extra)
 {
 	struct clonedevs *cd;
 	struct cdev *dev, *ndev, *dl, *de;
@@ -815,8 +907,8 @@
 		u = dev2unit(dev);
 		if (u == (unit | extra)) {
 			*dp = dev;
-			devfs_free(ndev);
 			dev_unlock();
+			devfs_free(ndev);
 			return (0);
 		}
 		if (unit == -1 && u == low) {
@@ -852,7 +944,7 @@
 		LIST_INSERT_HEAD(&cd->head, dev, si_clone);
 	dev->si_flags |= SI_CLONELIST;
 	*up = unit;
-	dev_unlock();
+	dev_unlock_and_free();
 	return (1);
 }
 
@@ -863,21 +955,126 @@
 void
 clone_cleanup(struct clonedevs **cdp)
 {
-	struct cdev *dev, *tdev;
+	struct cdev *dev;
+	struct cdev_priv *cp;
 	struct clonedevs *cd;
 	
 	cd = *cdp;
 	if (cd == NULL)
 		return;
 	dev_lock();
-	LIST_FOREACH_SAFE(dev, &cd->head, si_clone, tdev) {
+	while (!LIST_EMPTY(&cd->head)) {
+		dev = LIST_FIRST(&cd->head);
+		LIST_REMOVE(dev, si_clone);
 		KASSERT(dev->si_flags & SI_CLONELIST,
 		    ("Dev %p(%s) should be on clonelist", dev, dev->si_name));
-		KASSERT(dev->si_flags & SI_NAMED,
-		    ("Driver has goofed in cloning underways udev %x", dev->si_drv0));
-		destroy_devl(dev);
+		dev->si_flags &= ~SI_CLONELIST;
+		cp = dev->si_priv;
+		if (!(cp->cdp_flags & CDP_SCHED_DTR)) {
+			cp->cdp_flags |= CDP_SCHED_DTR;
+			KASSERT(dev->si_flags & SI_NAMED,
+				("Driver has goofed in cloning underways udev %x", dev->si_drv0));
+			destroy_devl(dev);
+		}
 	}
 	dev_unlock();
 	free(cd, M_DEVBUF);
 	*cdp = NULL;
 }
+
+static TAILQ_HEAD(, cdev_priv) dev_ddtr =
+	TAILQ_HEAD_INITIALIZER(dev_ddtr);
+static struct task dev_dtr_task;
+
+static void
+destroy_dev_tq(void *ctx, int pending)
+{
+	struct cdev_priv *cp;
+	struct cdev *dev;
+	void (*cb)(void *);
+	void *cb_arg;
+
+	dev_lock();
+	while (!TAILQ_EMPTY(&dev_ddtr)) {
+		cp = TAILQ_FIRST(&dev_ddtr);
+		dev = &cp->cdp_c;
+		KASSERT(cp->cdp_flags & CDP_SCHED_DTR,
+		    ("cdev %p in dev_destroy_tq without CDP_SCHED_DTR", cp));
+		TAILQ_REMOVE(&dev_ddtr, cp, cdp_dtr_list);
+		cb = cp->cdp_dtr_cb;
+		cb_arg = cp->cdp_dtr_cb_arg;
+		destroy_devl(dev);
+		dev_unlock();
+		dev_rel(dev);
+		if (cb != NULL)
+			cb(cb_arg);
+		dev_lock();
+	}
+	dev_unlock();
+}
+
+/*
+ * devmtx shall be locked on entry. devmtx will be unlocked after
+ * function return.
+ */
+static int
+destroy_dev_sched_cbl(struct cdev *dev, void (*cb)(void *), void *arg)
+{
+	struct cdev_priv *cp;
+
+	mtx_assert(&devmtx, MA_OWNED);
+	cp = dev->si_priv;
+	if (cp->cdp_flags & CDP_SCHED_DTR) {
+		dev_unlock();
+		return (0);
+	}
+	dev_refl(dev);
+	cp->cdp_flags |= CDP_SCHED_DTR;
+	cp->cdp_dtr_cb = cb;
+	cp->cdp_dtr_cb_arg = arg;
+	TAILQ_INSERT_TAIL(&dev_ddtr, cp, cdp_dtr_list);
+	dev_unlock();
+	taskqueue_enqueue(taskqueue_swi_giant, &dev_dtr_task);
+	return (1);
+}
+
+int
+destroy_dev_sched_cb(struct cdev *dev, void (*cb)(void *), void *arg)
+{
+	dev_lock();
+	return (destroy_dev_sched_cbl(dev, cb, arg));
+}
+
+int
+destroy_dev_sched(struct cdev *dev)
+{
+	return (destroy_dev_sched_cb(dev, NULL, NULL));
+}
+
+void
+destroy_dev_drain(struct cdevsw *csw)
+{
+
+	dev_lock();
+	while (!LIST_EMPTY(&csw->d_devs)) {
+		msleep(&csw->d_devs, &devmtx, PRIBIO, "devscd", hz/10);
+	}
+	dev_unlock();
+}
+
+void
+drain_dev_clone_events(void)
+{
+
+	sx_xlock(&clone_drain_lock);
+	sx_xunlock(&clone_drain_lock);
+}
+
+static void
+devdtr_init(void *dummy __unused)
+{
+
+	TASK_INIT(&dev_dtr_task, 0, destroy_dev_tq, NULL);
+}
+
+SYSINIT(devdtr, SI_SUB_DEVFS, SI_ORDER_SECOND, devdtr_init, NULL);
Index: kern_mbuf.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_mbuf.c,v
retrieving revision 1.3
retrieving revision 1.4
diff -L sys/kern/kern_mbuf.c -L sys/kern/kern_mbuf.c -u -r1.3 -r1.4
--- sys/kern/kern_mbuf.c
+++ sys/kern/kern_mbuf.c
@@ -26,13 +26,12 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_mbuf.c,v 1.9.2.8 2006/05/16 07:27:48 ps Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_mbuf.c,v 1.32.2.1 2007/12/15 23:16:04 rrs Exp $");
 
 #include "opt_mac.h"
 #include "opt_param.h"
 
 #include <sys/param.h>
-#include <sys/mac.h>
 #include <sys/malloc.h>
 #include <sys/systm.h>
 #include <sys/mbuf.h>
@@ -43,6 +42,8 @@
 #include <sys/smp.h>
 #include <sys/sysctl.h>
 
+#include <security/mac/mac_framework.h>
+
 #include <vm/vm.h>
 #include <vm/vm_page.h>
 #include <vm/uma.h>
@@ -106,19 +107,98 @@
 
 	/* This has to be done before VM init. */
 	nmbclusters = 1024 + maxusers * 64;
+	nmbjumbop = nmbclusters / 2;
+	nmbjumbo9 = nmbjumbop / 2;
+	nmbjumbo16 = nmbjumbo9 / 2;
 	TUNABLE_INT_FETCH("kern.ipc.nmbclusters", &nmbclusters);
 }
 SYSINIT(tunable_mbinit, SI_SUB_TUNABLES, SI_ORDER_ANY, tunable_mbinit, NULL);
 
-SYSCTL_DECL(_kern_ipc);
-SYSCTL_INT(_kern_ipc, OID_AUTO, nmbclusters, CTLFLAG_RW, &nmbclusters, 0,
-    "Maximum number of mbuf clusters allowed.");
-SYSCTL_INT(_kern_ipc, OID_AUTO, nmbjumbop, CTLFLAG_RW, &nmbjumbop, 0,
-    "Maximum number of mbuf page size jumbo clusters allowed");
-SYSCTL_INT(_kern_ipc, OID_AUTO, nmbjumbo9, CTLFLAG_RW, &nmbjumbo9, 0,
-    "Maximum number of mbuf 9k jumbo clusters allowed");
-SYSCTL_INT(_kern_ipc, OID_AUTO, nmbjumbo16, CTLFLAG_RW, &nmbjumbo16, 0,
+/* XXX: These should be tuneables. Can't change UMA limits on the fly. */
+static int
+sysctl_nmbclusters(SYSCTL_HANDLER_ARGS)
+{
+	int error, newnmbclusters;
+
+	newnmbclusters = nmbclusters;
+	error = sysctl_handle_int(oidp, &newnmbclusters, 0, req); 
+	if (error == 0 && req->newptr) {
+		if (newnmbclusters > nmbclusters) {
+			nmbclusters = newnmbclusters;
+			uma_zone_set_max(zone_clust, nmbclusters);
+			EVENTHANDLER_INVOKE(nmbclusters_change);
+		} else
+			error = EINVAL;
+	}
+	return (error);
+}
+SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbclusters, CTLTYPE_INT|CTLFLAG_RW,
+&nmbclusters, 0, sysctl_nmbclusters, "IU",
+    "Maximum number of mbuf clusters allowed");
+
+static int
+sysctl_nmbjumbop(SYSCTL_HANDLER_ARGS)
+{
+	int error, newnmbjumbop;
+
+	newnmbjumbop = nmbjumbop;
+	error = sysctl_handle_int(oidp, &newnmbjumbop, 0, req); 
+	if (error == 0 && req->newptr) {
+		if (newnmbjumbop> nmbjumbop) {
+			nmbjumbop = newnmbjumbop;
+			uma_zone_set_max(zone_jumbop, nmbjumbop);
+		} else
+			error = EINVAL;
+	}
+	return (error);
+}
+SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbop, CTLTYPE_INT|CTLFLAG_RW,
+&nmbjumbop, 0, sysctl_nmbjumbop, "IU",
+	 "Maximum number of mbuf page size jumbo clusters allowed");
+
+
+static int
+sysctl_nmbjumbo9(SYSCTL_HANDLER_ARGS)
+{
+	int error, newnmbjumbo9;
+
+	newnmbjumbo9 = nmbjumbo9;
+	error = sysctl_handle_int(oidp, &newnmbjumbo9, 0, req); 
+	if (error == 0 && req->newptr) {
+		if (newnmbjumbo9> nmbjumbo9) {
+			nmbjumbo9 = newnmbjumbo9;
+			uma_zone_set_max(zone_jumbo9, nmbjumbo9);
+		} else
+			error = EINVAL;
+	}
+	return (error);
+}
+SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo9, CTLTYPE_INT|CTLFLAG_RW,
+&nmbjumbo9, 0, sysctl_nmbjumbo9, "IU",
+	"Maximum number of mbuf 9k jumbo clusters allowed"); 
+
+static int
+sysctl_nmbjumbo16(SYSCTL_HANDLER_ARGS)
+{
+	int error, newnmbjumbo16;
+
+	newnmbjumbo16 = nmbjumbo16;
+	error = sysctl_handle_int(oidp, &newnmbjumbo16, 0, req); 
+	if (error == 0 && req->newptr) {
+		if (newnmbjumbo16> nmbjumbo16) {
+			nmbjumbo16 = newnmbjumbo16;
+			uma_zone_set_max(zone_jumbo16, nmbjumbo16);
+		} else
+			error = EINVAL;
+	}
+	return (error);
+}
+SYSCTL_PROC(_kern_ipc, OID_AUTO, nmbjumbo16, CTLTYPE_INT|CTLFLAG_RW,
+&nmbjumbo16, 0, sysctl_nmbjumbo16, "IU",
     "Maximum number of mbuf 16k jumbo clusters allowed");
+
+
+
 SYSCTL_STRUCT(_kern_ipc, OID_AUTO, mbstat, CTLFLAG_RD, &mbstat, mbstat,
     "Mbuf general information and statistics");
 
@@ -131,6 +211,7 @@
 uma_zone_t	zone_jumbop;
 uma_zone_t	zone_jumbo9;
 uma_zone_t	zone_jumbo16;
+uma_zone_t	zone_ext_refcnt;
 
 /*
  * Local prototypes.
@@ -178,7 +259,6 @@
 	    NULL, NULL,
 #endif
 	    UMA_ALIGN_PTR, UMA_ZONE_REFCNT);
-
 	if (nmbclusters > 0)
 		uma_zone_set_max(zone_clust, nmbclusters);
 
@@ -219,6 +299,11 @@
 	if (nmbjumbo16 > 0)
 		uma_zone_set_max(zone_jumbo16, nmbjumbo16);
 
+	zone_ext_refcnt = uma_zcreate(MBUF_EXTREFCNT_MEM_NAME, sizeof(u_int),
+	    NULL, NULL,
+	    NULL, NULL,
+	    UMA_ALIGN_PTR, UMA_ZONE_ZINIT);
+
 	/* uma_prealloc() goes here... */
 
 	/*
@@ -294,6 +379,8 @@
 		m->m_pkthdr.header = NULL;
 		m->m_pkthdr.csum_flags = 0;
 		m->m_pkthdr.csum_data = 0;
+		m->m_pkthdr.tso_segsz = 0;
+		m->m_pkthdr.ether_vtag = 0;
 		SLIST_INIT(&m->m_pkthdr.tags);
 #ifdef MAC
 		/* If the label init fails, fail the alloc */
@@ -303,7 +390,6 @@
 #endif
 	} else
 		m->m_data = m->m_dat;
-	mbstat.m_mbufs += 1;	/* XXX */
 	return (0);
 }
 
@@ -314,14 +400,18 @@
 mb_dtor_mbuf(void *mem, int size, void *arg)
 {
 	struct mbuf *m;
+	unsigned long flags; 
 
 	m = (struct mbuf *)mem;
-	if ((m->m_flags & M_PKTHDR) != 0)
+	flags = (unsigned long)arg;
+	
+	if ((flags & MB_NOTAGS) == 0 && (m->m_flags & M_PKTHDR) != 0)
 		m_tag_delete_chain(m, NULL);
+	KASSERT((m->m_flags & M_EXT) == 0, ("%s: M_EXT set", __func__));
+	KASSERT((m->m_flags & M_NOFREE) == 0, ("%s: M_NOFREE set", __func__));	
 #ifdef INVARIANTS
 	trash_dtor(mem, size, arg);
 #endif
-	mbstat.m_mbufs -= 1;	/* XXX */
 }
 
 /*
@@ -343,11 +433,18 @@
 	KASSERT(m->m_ext.ext_args == NULL, ("%s: ext_args != NULL", __func__));
 	KASSERT(m->m_ext.ext_size == MCLBYTES, ("%s: ext_size != MCLBYTES", __func__));
 	KASSERT(m->m_ext.ext_type == EXT_PACKET, ("%s: ext_type != EXT_PACKET", __func__));
+	KASSERT(*m->m_ext.ref_cnt == 1, ("%s: ref_cnt != 1", __func__));
 #ifdef INVARIANTS
 	trash_dtor(m->m_ext.ext_buf, MCLBYTES, arg);
 #endif
-	mbstat.m_mbufs -= 1;	/* XXX */
-	mbstat.m_mclusts -= 1;	/* XXX */
+	/*
+	 * If there are processes blocked on zone_clust, waiting for pages to be freed up,
+	 * cause them to be woken up by draining the packet zone. We are exposed to a race here 
+	 * (in the check for the UMA_ZFLAG_FULL) where we might miss the flag set, but that is 
+	 * deliberate. We don't want to acquire the zone lock for every mbuf free.
+	 */
+ 	if (uma_zone_exhausted_nolock(zone_clust))
+ 		zone_drain(zone_pack);
 }
 
 /*
@@ -362,32 +459,41 @@
 mb_ctor_clust(void *mem, int size, void *arg, int how)
 {
 	struct mbuf *m;
-	int type = 0;
-
+	u_int *refcnt;
+	int type;
+	uma_zone_t zone;
+	
 #ifdef INVARIANTS
 	trash_ctor(mem, size, arg, how);
 #endif
- 	m = (struct mbuf *)arg;
-	if (m != NULL) {
-		switch (size) {
-		case MCLBYTES:
-			type = EXT_CLUSTER;
-			break;
+	switch (size) {
+	case MCLBYTES:
+		type = EXT_CLUSTER;
+		zone = zone_clust;
+		break;
 #if MJUMPAGESIZE != MCLBYTES
-		case MJUMPAGESIZE:
-			type = EXT_JUMBOP;
-			break;
-#endif
-		case MJUM9BYTES:
-			type = EXT_JUMBO9;
-			break;
-		case MJUM16BYTES:
-			type = EXT_JUMBO16;
-			break;
-		default:
-			panic("unknown cluster size");
-			break;
-		}
+	case MJUMPAGESIZE:
+		type = EXT_JUMBOP;
+		zone = zone_jumbop;
+		break;
+#endif
+	case MJUM9BYTES:
+		type = EXT_JUMBO9;
+		zone = zone_jumbo9;
+		break;
+	case MJUM16BYTES:
+		type = EXT_JUMBO16;
+		zone = zone_jumbo16;
+		break;
+	default:
+		panic("unknown cluster size");
+		break;
+	}
+
+	m = (struct mbuf *)arg;
+	refcnt = uma_find_refcnt(zone, mem);
+	*refcnt = 1;			
+	if (m != NULL) {
 		m->m_ext.ext_buf = (caddr_t)mem;
 		m->m_data = m->m_ext.ext_buf;
 		m->m_flags |= M_EXT;
@@ -395,9 +501,9 @@
 		m->m_ext.ext_args = NULL;
 		m->m_ext.ext_size = size;
 		m->m_ext.ext_type = type;
-		m->m_ext.ref_cnt = NULL;	/* Lazy counter assign. */
+		m->m_ext.ref_cnt = refcnt;
 	}
-	mbstat.m_mclusts += 1;	/* XXX */
+
 	return (0);
 }
 
@@ -408,9 +514,15 @@
 mb_dtor_clust(void *mem, int size, void *arg)
 {
 #ifdef INVARIANTS
+	uma_zone_t zone;
+
+	zone = m_getzone(size);
+	KASSERT(*(uma_find_refcnt(zone, mem)) <= 1,
+		("%s: refcnt incorrect %u", __func__,
+		 *(uma_find_refcnt(zone, mem))) );
+
 	trash_dtor(mem, size, arg);
 #endif
-	mbstat.m_mclusts -= 1;	/* XXX */
 }
 
 /*
@@ -422,7 +534,7 @@
 {
 	struct mbuf *m;
 
-	m = (struct mbuf *)mem;
+	m = (struct mbuf *)mem;		/* m is virgin. */
 	if (uma_zalloc_arg(zone_clust, m, how) == NULL ||
 	    m->m_ext.ext_buf == NULL)
 		return (ENOMEM);
@@ -430,7 +542,6 @@
 #ifdef INVARIANTS
 	trash_init(m->m_ext.ext_buf, MCLBYTES, how);
 #endif
-	mbstat.m_mclusts -= 1;	/* XXX */
 	return (0);
 }
 
@@ -448,8 +559,6 @@
 	trash_fini(m->m_ext.ext_buf, MCLBYTES);
 #endif
 	uma_zfree_arg(zone_clust, m->m_ext.ext_buf, NULL);
-	m->m_ext.ext_buf = NULL;
-	mbstat.m_mclusts += 1;	/* XXX */
 #ifdef INVARIANTS
 	trash_dtor(mem, size, NULL);
 #endif
@@ -483,14 +592,15 @@
 	m->m_len = 0;
 	m->m_flags = (flags | M_EXT);
 	m->m_type = type;
-	m->m_ext.ref_cnt = NULL;	/* Lazy counter assign. */
-
+	    
 	if (flags & M_PKTHDR) {
 		m->m_pkthdr.rcvif = NULL;
 		m->m_pkthdr.len = 0;
 		m->m_pkthdr.header = NULL;
 		m->m_pkthdr.csum_flags = 0;
 		m->m_pkthdr.csum_data = 0;
+		m->m_pkthdr.tso_segsz = 0;
+		m->m_pkthdr.ether_vtag = 0;
 		SLIST_INIT(&m->m_pkthdr.tags);
 #ifdef MAC
 		/* If the label init fails, fail the alloc */
@@ -501,8 +611,6 @@
 	}
 	/* m_ext is already initialized. */
 
-	mbstat.m_mbufs += 1;	/* XXX */
-	mbstat.m_mclusts += 1;	/* XXX */
 	return (0);
 }
 
@@ -522,7 +630,6 @@
 	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK | WARN_PANIC, NULL,
 	    "mb_reclaim()");
 
-	mbstat.m_drain++;
 	for (dp = domains; dp != NULL; dp = dp->dom_next)
 		for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
 			if (pr->pr_drain != NULL)
Index: vfs_syscalls.c
===================================================================
RCS file: /home/cvs/src/sys/kern/vfs_syscalls.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/vfs_syscalls.c -L sys/kern/vfs_syscalls.c -u -r1.2 -r1.3
--- sys/kern/vfs_syscalls.c
+++ sys/kern/vfs_syscalls.c
@@ -35,7 +35,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/vfs_syscalls.c,v 1.392.2.7 2006/03/13 03:06:39 jeff Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/vfs_syscalls.c,v 1.443 2007/09/10 00:00:16 rwatson Exp $");
 
 #include "opt_compat.h"
 #include "opt_mac.h"
@@ -45,7 +45,6 @@
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/sysent.h>
-#include <sys/mac.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
@@ -55,21 +54,25 @@
 #include <sys/kernel.h>
 #include <sys/fcntl.h>
 #include <sys/file.h>
+#include <sys/filio.h>
 #include <sys/limits.h>
 #include <sys/linker.h>
 #include <sys/stat.h>
 #include <sys/sx.h>
 #include <sys/unistd.h>
 #include <sys/vnode.h>
+#include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/dirent.h>
-#include <sys/extattr.h>
 #include <sys/jail.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 
 #include <machine/stdarg.h>
 
+#include <security/audit/audit.h>
+#include <security/mac/mac_framework.h>
+
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
@@ -85,11 +88,6 @@
 static int vn_access(struct vnode *vp, int user_flags, struct ucred *cred,
     struct thread *td);
 
-static int extattr_list_vp(struct vnode *vp, int attrnamespace, void *data,
-    size_t nbytes, struct thread *td);
-
-int (*union_dircheckp)(struct thread *td, struct vnode **, struct file *);
-
 /*
  * The module initialization routine for POSIX asynchronous I/O will
  * set this to the version of AIO that it implements.  (Zero means
@@ -98,6 +96,11 @@
  */
 int async_io_version;
 
+#ifdef DEBUG
+static int syncprt = 0;
+SYSCTL_INT(_debug, OID_AUTO, syncprt, CTLFLAG_RW, &syncprt, 0, "");
+#endif
+
 /*
  * Sync each mounted filesystem.
  */
@@ -106,12 +109,6 @@
 	int     dummy;
 };
 #endif
-
-#ifdef DEBUG
-static int syncprt = 0;
-SYSCTL_INT(_debug, OID_AUTO, syncprt, CTLFLAG_RW, &syncprt, 0, "");
-#endif
-
 /* ARGSUSED */
 int
 sync(td, uap)
@@ -119,40 +116,37 @@
 	struct sync_args *uap;
 {
 	struct mount *mp, *nmp;
-	int asyncflag;
+	int vfslocked;
 
-	mtx_lock(&Giant);
 	mtx_lock(&mountlist_mtx);
 	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
 		if (vfs_busy(mp, LK_NOWAIT, &mountlist_mtx, td)) {
 			nmp = TAILQ_NEXT(mp, mnt_list);
 			continue;
 		}
+		vfslocked = VFS_LOCK_GIANT(mp);
 		if ((mp->mnt_flag & MNT_RDONLY) == 0 &&
 		    vn_start_write(NULL, &mp, V_NOWAIT) == 0) {
-			asyncflag = mp->mnt_flag & MNT_ASYNC;
-			mp->mnt_flag &= ~MNT_ASYNC;
+			MNT_ILOCK(mp);
+			mp->mnt_noasync++;
+			mp->mnt_kern_flag &= ~MNTK_ASYNC;
+			MNT_IUNLOCK(mp);
 			vfs_msync(mp, MNT_NOWAIT);
 			VFS_SYNC(mp, MNT_NOWAIT, td);
-			mp->mnt_flag |= asyncflag;
+			MNT_ILOCK(mp);
+			mp->mnt_noasync--;
+			if ((mp->mnt_flag & MNT_ASYNC) != 0 &&
+			    mp->mnt_noasync == 0)
+				mp->mnt_kern_flag |= MNTK_ASYNC;
+			MNT_IUNLOCK(mp);
 			vn_finished_write(mp);
 		}
+		VFS_UNLOCK_GIANT(vfslocked);
 		mtx_lock(&mountlist_mtx);
 		nmp = TAILQ_NEXT(mp, mnt_list);
 		vfs_unbusy(mp, td);
 	}
 	mtx_unlock(&mountlist_mtx);
-#if 0
-/*
- * XXX don't call vfs_bufstats() yet because that routine
- * was not imported in the Lite2 merge.
- */
-#ifdef DIAGNOSTIC
-	if (syncprt)
-		vfs_bufstats();
-#endif /* DIAGNOSTIC */
-#endif
-	mtx_unlock(&Giant);
 	return (0);
 }
 
@@ -164,8 +158,6 @@
 
 /*
  * Change filesystem quotas.
- *
- * MP SAFE
  */
 #ifndef _SYS_SYSPROTO_H_
 struct quotactl_args {
@@ -185,33 +177,76 @@
 		caddr_t arg;
 	} */ *uap;
 {
-	struct mount *mp, *vmp;
+	struct mount *mp;
+	int vfslocked;
 	int error;
 	struct nameidata nd;
 
+	AUDIT_ARG(cmd, uap->cmd);
+	AUDIT_ARG(uid, uap->uid);
 	if (jailed(td->td_ucred) && !prison_quotas)
 		return (EPERM);
-	mtx_lock(&Giant);
-	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->path, td);
-	if ((error = namei(&nd)) != 0) {
-		mtx_unlock(&Giant);
+	NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1,
+	   UIO_USERSPACE, uap->path, td);
+	if ((error = namei(&nd)) != 0)
 		return (error);
-	}
+	vfslocked = NDHASGIANT(&nd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
-	error = vn_start_write(nd.ni_vp, &vmp, V_WAIT | PCATCH);
 	mp = nd.ni_vp->v_mount;
-	vrele(nd.ni_vp);
-	if (error) {
-		mtx_unlock(&Giant);
+	if ((error = vfs_busy(mp, 0, NULL, td))) {
+		vrele(nd.ni_vp);
+		VFS_UNLOCK_GIANT(vfslocked);
 		return (error);
 	}
+	vrele(nd.ni_vp);
 	error = VFS_QUOTACTL(mp, uap->cmd, uap->uid, uap->arg, td);
-	vn_finished_write(vmp);
-	mtx_unlock(&Giant);
+	vfs_unbusy(mp, td);
+	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
+ * Used by statfs conversion routines to scale the block size up if
+ * necessary so that all of the block counts are <= 'max_size'.  Note
+ * that 'max_size' should be a bitmask, i.e. 2^n - 1 for some non-zero
+ * value of 'n'.
+ */
+void
+statfs_scale_blocks(struct statfs *sf, long max_size)
+{
+	uint64_t count;
+	int shift;
+
+	KASSERT(powerof2(max_size + 1), ("%s: invalid max_size", __func__));
+
+	/*
+	 * Attempt to scale the block counts to give a more accurate
+	 * overview to userland of the ratio of free space to used
+	 * space.  To do this, find the largest block count and compute
+	 * a divisor that lets it fit into a signed integer <= max_size.
+	 */
+	if (sf->f_bavail < 0)
+		count = -sf->f_bavail;
+	else
+		count = sf->f_bavail;
+	count = MAX(sf->f_blocks, MAX(sf->f_bfree, count));
+	if (count <= max_size)
+		return;
+
+	count >>= flsl(max_size);
+	shift = 0;
+	while (count > 0) {
+		shift++;
+		count >>=1;
+	}
+
+	sf->f_bsize <<= shift;
+	sf->f_blocks >>= shift;
+	sf->f_bfree >>= shift;
+	sf->f_bavail >>= shift;
+}
+
+/*
  * Get filesystem statistics.
  */
 #ifndef _SYS_SYSPROTO_H_
@@ -243,27 +278,24 @@
 {
 	struct mount *mp;
 	struct statfs *sp, sb;
+	int vfslocked;
 	int error;
 	struct nameidata nd;
 
-	mtx_lock(&Giant);
-	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, pathseg, path, td);
+	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
+	    pathseg, path, td);
 	error = namei(&nd);
-	if (error) {
-		mtx_unlock(&Giant);
+	if (error)
 		return (error);
-	}
+	vfslocked = NDHASGIANT(&nd);
 	mp = nd.ni_vp->v_mount;
 	vfs_ref(mp);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vput(nd.ni_vp);
 #ifdef MAC
 	error = mac_check_mount_stat(td->td_ucred, mp);
-	if (error) {
-		vfs_rel(mp);
-		mtx_unlock(&Giant);
-		return (error);
-	}
+	if (error)
+		goto out;
 #endif
 	/*
 	 * Set these in case the underlying filesystem fails to do so.
@@ -273,20 +305,21 @@
 	sp->f_namemax = NAME_MAX;
 	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
 	error = VFS_STATFS(mp, sp, td);
-	vfs_rel(mp);
-	if (error) {
-		mtx_unlock(&Giant);
-		return (error);
-	}
-	if (suser(td)) {
+	if (error)
+		goto out;
+	if (priv_check(td, PRIV_VFS_GENERATION)) {
 		bcopy(sp, &sb, sizeof(sb));
 		sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
 		prison_enforce_statfs(td->td_ucred, mp, &sb);
 		sp = &sb;
 	}
-	mtx_unlock(&Giant);
 	*buf = *sp;
-	return (0);
+out:
+	vfs_rel(mp);
+	VFS_UNLOCK_GIANT(vfslocked);
+	if (mtx_owned(&Giant))
+		printf("statfs(%d): %s: %d\n", vfslocked, path, error);
+	return (error);
 }
 
 /*
@@ -321,14 +354,16 @@
 	struct file *fp;
 	struct mount *mp;
 	struct statfs *sp, sb;
+	int vfslocked;
 	struct vnode *vp;
 	int error;
 
+	AUDIT_ARG(fd, fd);
 	error = getvnode(td->td_proc->p_fd, fd, &fp);
 	if (error)
 		return (error);
-	mtx_lock(&Giant);
 	vp = fp->f_vnode;
+	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 #ifdef AUDIT
 	AUDIT_ARG(vnode, vp, ARG_VNODE1);
@@ -339,18 +374,13 @@
 	VOP_UNLOCK(vp, 0, td);
 	fdrop(fp, td);
 	if (vp->v_iflag & VI_DOOMED) {
-		if (mp)
-			vfs_rel(mp);
-		mtx_unlock(&Giant);
-		return (EBADF);
+		error = EBADF;
+		goto out;
 	}
 #ifdef MAC
 	error = mac_check_mount_stat(td->td_ucred, mp);
-	if (error) {
-		vfs_rel(mp);
-		mtx_unlock(&Giant);
-		return (error);
-	}
+	if (error)
+		goto out;
 #endif
 	/*
 	 * Set these in case the underlying filesystem fails to do so.
@@ -360,20 +390,20 @@
 	sp->f_namemax = NAME_MAX;
 	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
 	error = VFS_STATFS(mp, sp, td);
-	vfs_rel(mp);
-	if (error) {
-		mtx_unlock(&Giant);
-		return (error);
-	}
-	if (suser(td)) {
+	if (error)
+		goto out;
+	if (priv_check(td, PRIV_VFS_GENERATION)) {
 		bcopy(sp, &sb, sizeof(sb));
 		sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
 		prison_enforce_statfs(td->td_ucred, mp, &sb);
 		sp = &sb;
 	}
-	mtx_unlock(&Giant);
 	*buf = *sp;
-	return (0);
+out:
+	if (mp)
+		vfs_rel(mp);
+	VFS_UNLOCK_GIANT(vfslocked);
+	return (error);
 }
 
 /*
@@ -412,6 +442,7 @@
 	struct mount *mp, *nmp;
 	struct statfs *sfsp, *sp, sb;
 	size_t count, maxcount;
+	int vfslocked;
 	int error;
 
 	maxcount = bufsize / sizeof(struct statfs);
@@ -432,7 +463,6 @@
 		    M_WAITOK);
 	}
 	count = 0;
-	mtx_lock(&Giant);
 	mtx_lock(&mountlist_mtx);
 	for (mp = TAILQ_FIRST(&mountlist); mp != NULL; mp = nmp) {
 		if (prison_canseemount(td->td_ucred, mp) != 0) {
@@ -449,6 +479,7 @@
 			nmp = TAILQ_NEXT(mp, mnt_list);
 			continue;
 		}
+		vfslocked = VFS_LOCK_GIANT(mp);
 		if (sfsp && count < maxcount) {
 			sp = &mp->mnt_stat;
 			/*
@@ -466,12 +497,13 @@
 			if (((flags & (MNT_LAZY|MNT_NOWAIT)) == 0 ||
 			    (flags & MNT_WAIT)) &&
 			    (error = VFS_STATFS(mp, sp, td))) {
+				VFS_UNLOCK_GIANT(vfslocked);
 				mtx_lock(&mountlist_mtx);
 				nmp = TAILQ_NEXT(mp, mnt_list);
 				vfs_unbusy(mp, td);
 				continue;
 			}
-			if (suser(td)) {
+			if (priv_check(td, PRIV_VFS_GENERATION)) {
 				bcopy(sp, &sb, sizeof(sb));
 				sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
 				prison_enforce_statfs(td->td_ucred, mp, &sb);
@@ -483,19 +515,19 @@
 				error = copyout(sp, sfsp, sizeof(*sp));
 				if (error) {
 					vfs_unbusy(mp, td);
-					mtx_unlock(&Giant);
+					VFS_UNLOCK_GIANT(vfslocked);
 					return (error);
 				}
 			}
 			sfsp++;
 		}
+		VFS_UNLOCK_GIANT(vfslocked);
 		count++;
 		mtx_lock(&mountlist_mtx);
 		nmp = TAILQ_NEXT(mp, mnt_list);
 		vfs_unbusy(mp, td);
 	}
 	mtx_unlock(&mountlist_mtx);
-	mtx_unlock(&Giant);
 	if (sfsp && count > maxcount)
 		td->td_retval[0] = maxcount;
 	else
@@ -645,12 +677,13 @@
 	struct ostatfs *osp;
 {
 
+	statfs_scale_blocks(nsp, LONG_MAX);
 	bzero(osp, sizeof(*osp));
-	osp->f_bsize = MIN(nsp->f_bsize, LONG_MAX);
+	osp->f_bsize = nsp->f_bsize;
 	osp->f_iosize = MIN(nsp->f_iosize, LONG_MAX);
-	osp->f_blocks = MIN(nsp->f_blocks, LONG_MAX);
-	osp->f_bfree = MIN(nsp->f_bfree, LONG_MAX);
-	osp->f_bavail = MIN(nsp->f_bavail, LONG_MAX);
+	osp->f_blocks = nsp->f_blocks;
+	osp->f_bfree = nsp->f_bfree;
+	osp->f_bavail = nsp->f_bavail;
 	osp->f_files = MIN(nsp->f_files, LONG_MAX);
 	osp->f_ffree = MIN(nsp->f_ffree, LONG_MAX);
 	osp->f_owner = nsp->f_owner;
@@ -692,21 +725,16 @@
 	int vfslocked;
 	int error;
 
+	AUDIT_ARG(fd, uap->fd);
 	if ((error = getvnode(fdp, uap->fd, &fp)) != 0)
 		return (error);
 	vp = fp->f_vnode;
-	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	VREF(vp);
 	fdrop(fp, td);
+	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
-	if (vp->v_type != VDIR)
-		error = ENOTDIR;
-#ifdef MAC
-	else if ((error = mac_check_vnode_chdir(td->td_ucred, vp)) != 0) {
-	}
-#endif
-	else
-		error = VOP_ACCESS(vp, VEXEC, td->td_ucred, td);
+	AUDIT_ARG(vnode, vp, ARG_VNODE1);
+	error = change_dir(vp, td);
 	while (!error && (mp = vp->v_mountedhere) != NULL) {
 		int tvfslocked;
 		if (vfs_busy(mp, 0, 0, td))
@@ -730,10 +758,10 @@
 	}
 	VOP_UNLOCK(vp, 0, td);
 	VFS_UNLOCK_GIANT(vfslocked);
-	FILEDESC_LOCK_FAST(fdp);
+	FILEDESC_XLOCK(fdp);
 	vpold = fdp->fd_cdir;
 	fdp->fd_cdir = vp;
-	FILEDESC_UNLOCK_FAST(fdp);
+	FILEDESC_XUNLOCK(fdp);
 	vfslocked = VFS_LOCK_GIANT(vpold->v_mount);
 	vrele(vpold);
 	VFS_UNLOCK_GIANT(vfslocked);
@@ -768,7 +796,8 @@
 	struct vnode *vp;
 	int vfslocked;
 
-	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE, pathseg, path, td);
+	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1 | MPSAFE,
+	    pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
@@ -781,10 +810,10 @@
 	VOP_UNLOCK(nd.ni_vp, 0, td);
 	VFS_UNLOCK_GIANT(vfslocked);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
-	FILEDESC_LOCK_FAST(fdp);
+	FILEDESC_XLOCK(fdp);
 	vp = fdp->fd_cdir;
 	fdp->fd_cdir = nd.ni_vp;
-	FILEDESC_UNLOCK_FAST(fdp);
+	FILEDESC_XUNLOCK(fdp);
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	vrele(vp);
 	VFS_UNLOCK_GIANT(vfslocked);
@@ -803,7 +832,8 @@
 	struct file *fp;
 	int fd;
 
-	FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);
+	FILEDESC_LOCK_ASSERT(fdp);
+
 	for (fd = 0; fd < fdp->fd_nfiles ; fd++) {
 		fp = fget_locked(fdp, fd);
 		if (fp == NULL)
@@ -849,10 +879,10 @@
 	struct nameidata nd;
 	int vfslocked;
 
-	error = suser_cred(td->td_ucred, SUSER_ALLOWJAIL);
+	error = priv_check(td, PRIV_VFS_CHROOT);
 	if (error)
 		return (error);
-	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE,
+	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
 	    UIO_USERSPACE, uap->path, td);
 	error = namei(&nd);
 	if (error)
@@ -903,8 +933,8 @@
 
 /*
  * Common routine for kern_chroot() and jail_attach().  The caller is
- * responsible for invoking suser() and mac_check_chroot() to authorize this
- * operation.
+ * responsible for invoking priv_check() and mac_check_chroot() to authorize
+ * this operation.
  */
 int
 change_root(vp, td)
@@ -918,12 +948,12 @@
 
 	VFS_ASSERT_GIANT(vp->v_mount);
 	fdp = td->td_proc->p_fd;
-	FILEDESC_LOCK(fdp);
+	FILEDESC_XLOCK(fdp);
 	if (chroot_allow_open_directories == 0 ||
 	    (chroot_allow_open_directories == 1 && fdp->fd_rdir != rootvnode)) {
 		error = chroot_refuse_vdir_fds(fdp);
 		if (error) {
-			FILEDESC_UNLOCK(fdp);
+			FILEDESC_XUNLOCK(fdp);
 			return (error);
 		}
 	}
@@ -934,7 +964,7 @@
 		fdp->fd_jdir = vp;
 		VREF(fdp->fd_jdir);
 	}
-	FILEDESC_UNLOCK(fdp);
+	FILEDESC_XUNLOCK(fdp);
 	vfslocked = VFS_LOCK_GIANT(oldvp->v_mount);
 	vrele(oldvp);
 	VFS_UNLOCK_GIANT(vfslocked);
@@ -942,10 +972,8 @@
 }
 
 /*
- * Check permissions, allocate an open file structure,
- * and call the device open routine if any.
- *
- * MP SAFE
+ * Check permissions, allocate an open file structure, and call the device
+ * open routine if any.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct open_args {
@@ -963,12 +991,8 @@
 		int mode;
 	} */ *uap;
 {
-	int error;
 
-	error = kern_open(td, uap->path, UIO_USERSPACE, uap->flags, uap->mode);
-	if (mtx_owned(&Giant))
-		printf("open: %s: %d\n", uap->path, error);
-	return (error);
+	return kern_open(td, uap->path, UIO_USERSPACE, uap->flags, uap->mode);
 }
 
 int
@@ -988,6 +1012,8 @@
 	struct nameidata nd;
 	int vfslocked;
 
+	AUDIT_ARG(fflags, flags);
+	AUDIT_ARG(mode, mode);
 	if ((flags & O_ACCMODE) == O_ACCMODE)
 		return (EINVAL);
 	flags = FFLAGS(flags);
@@ -997,9 +1023,9 @@
 	/* An extra reference on `nfp' has been held for us by falloc(). */
 	fp = nfp;
 	cmode = ((mode &~ fdp->fd_cmask) & ALLPERMS) &~ S_ISTXT;
-	NDINIT(&nd, LOOKUP, FOLLOW, pathseg, path, td);
+	NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1 | MPSAFE, pathseg, path, td);
 	td->td_dupfd = -1;		/* XXX check for fdopen */
-	error = vn_open(&nd, &flags, cmode, indx);
+	error = vn_open(&nd, &flags, cmode, fp);
 	if (error) {
 		/*
 		 * If the vn_open replaced the method vector, something
@@ -1013,11 +1039,6 @@
 		}
 
 		/*
-		 * release our own reference
-		 */
-		fdrop(fp, td);
-
-		/*
 		 * handle special fdopen() case.  bleh.  dupfdopen() is
 		 * responsible for dropping the old contents of ofiles[indx]
 		 * if it succeeds.
@@ -1027,6 +1048,7 @@
 		    (error =
 			dupfdopen(td, fdp, indx, td->td_dupfd, flags, error)) == 0) {
 			td->td_retval[0] = indx;
+			fdrop(fp, td);
 			return (0);
 		}
 		/*
@@ -1034,6 +1056,7 @@
 		 * replaced or closed it.
 		 */
 		fdclose(fdp, fp, indx, td);
+		fdrop(fp, td);
 
 		if (error == ERESTART)
 			error = EINTR;
@@ -1044,41 +1067,16 @@
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vp = nd.ni_vp;
 
-	/*
-	 * There should be 2 references on the file, one from the descriptor
-	 * table, and one for us.
-	 *
-	 * Handle the case where someone closed the file (via its file
-	 * descriptor) while we were blocked.  The end result should look
-	 * like opening the file succeeded but it was immediately closed.
-	 * We call vn_close() manually because we haven't yet hooked up
-	 * the various 'struct file' fields.
-	 */
-	FILEDESC_LOCK(fdp);
 	FILE_LOCK(fp);
-	if (fp->f_count == 1) {
-		mp = vp->v_mount;
-		KASSERT(fdp->fd_ofiles[indx] != fp,
-		    ("Open file descriptor lost all refs"));
-		FILE_UNLOCK(fp);
-		FILEDESC_UNLOCK(fdp);
-		VOP_UNLOCK(vp, 0, td);
-		vn_close(vp, flags & FMASK, fp->f_cred, td);
-		VFS_UNLOCK_GIANT(vfslocked);
-		fdrop(fp, td);
-		td->td_retval[0] = indx;
-		return (0);
-	}
 	fp->f_vnode = vp;
 	if (fp->f_data == NULL)
 		fp->f_data = vp;
 	fp->f_flag = flags & FMASK;
-	if (fp->f_ops == &badfileops)
-		fp->f_ops = &vnops;
 	fp->f_seqcount = 1;
 	fp->f_type = (vp->v_type == VFIFO ? DTYPE_FIFO : DTYPE_VNODE);
+	if (fp->f_ops == &badfileops)
+		fp->f_ops = &vnops;
 	FILE_UNLOCK(fp);
-	FILEDESC_UNLOCK(fdp);
 
 	VOP_UNLOCK(vp, 0, td);
 	if (flags & (O_EXLOCK | O_SHLOCK)) {
@@ -1132,8 +1130,6 @@
 #ifdef COMPAT_43
 /*
  * Create a file.
- *
- * MP SAFE
  */
 #ifndef _SYS_SYSPROTO_H_
 struct ocreat_args {
@@ -1190,20 +1186,29 @@
 	struct nameidata nd;
 	int vfslocked;
 
+	AUDIT_ARG(mode, mode);
+	AUDIT_ARG(dev, dev);
 	switch (mode & S_IFMT) {
 	case S_IFCHR:
 	case S_IFBLK:
-		error = suser(td);
+		error = priv_check(td, PRIV_VFS_MKNOD_DEV);
+		break;
+	case S_IFMT:
+		error = priv_check(td, PRIV_VFS_MKNOD_BAD);
+		break;
+	case S_IFWHT:
+		error = priv_check(td, PRIV_VFS_MKNOD_WHT);
 		break;
 	default:
-		error = suser_cred(td->td_ucred, SUSER_ALLOWJAIL);
+		error = EINVAL;
 		break;
 	}
 	if (error)
 		return (error);
 restart:
 	bwillwrite();
-	NDINIT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE, pathseg, path, td);
+	NDINIT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1,
+	    pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
@@ -1219,10 +1224,10 @@
 		return (EEXIST);
 	} else {
 		VATTR_NULL(&vattr);
-		FILEDESC_LOCK_FAST(td->td_proc->p_fd);
+		FILEDESC_SLOCK(td->td_proc->p_fd);
 		vattr.va_mode = (mode & ALLPERMS) &
 		    ~td->td_proc->p_fd->fd_cmask;
-		FILEDESC_UNLOCK_FAST(td->td_proc->p_fd);
+		FILEDESC_SUNLOCK(td->td_proc->p_fd);
 		vattr.va_rdev = dev;
 		whiteout = 0;
 
@@ -1240,8 +1245,7 @@
 			whiteout = 1;
 			break;
 		default:
-			error = EINVAL;
-			break;
+			panic("kern_mknod: invalid mode");
 		}
 	}
 	if (vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
@@ -1305,9 +1309,11 @@
 	struct nameidata nd;
 	int vfslocked;
 
+	AUDIT_ARG(mode, mode);
 restart:
 	bwillwrite();
-	NDINIT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE, pathseg, path, td);
+	NDINIT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1,
+	    pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
@@ -1331,9 +1337,9 @@
 	}
 	VATTR_NULL(&vattr);
 	vattr.va_type = VFIFO;
-	FILEDESC_LOCK_FAST(td->td_proc->p_fd);
+	FILEDESC_SLOCK(td->td_proc->p_fd);
 	vattr.va_mode = (mode & ALLPERMS) & ~td->td_proc->p_fd->fd_cmask;
-	FILEDESC_UNLOCK_FAST(td->td_proc->p_fd);
+	FILEDESC_SUNLOCK(td->td_proc->p_fd);
 #ifdef MAC
 	error = mac_check_vnode_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
 	    &vattr);
@@ -1377,8 +1383,6 @@
 	return (error);
 }
 
-SYSCTL_DECL(_security_bsd);
-
 static int hardlink_check_uid = 0;
 SYSCTL_INT(_security_bsd, OID_AUTO, hardlink_check_uid, CTLFLAG_RW,
     &hardlink_check_uid, 0,
@@ -1396,9 +1400,6 @@
 	struct vattr va;
 	int error;
 
-	if (suser_cred(cred, SUSER_ALLOWJAIL) == 0)
-		return (0);
-
 	if (!hardlink_check_uid && !hardlink_check_gid)
 		return (0);
 
@@ -1406,14 +1407,16 @@
 	if (error != 0)
 		return (error);
 
-	if (hardlink_check_uid) {
-		if (cred->cr_uid != va.va_uid)
-			return (EPERM);
+	if (hardlink_check_uid && cred->cr_uid != va.va_uid) {
+		error = priv_check_cred(cred, PRIV_VFS_LINK, 0);
+		if (error)
+			return (error);
 	}
 
-	if (hardlink_check_gid) {
-		if (!groupmember(va.va_gid, cred))
-			return (EPERM);
+	if (hardlink_check_gid && !groupmember(va.va_gid, cred)) {
+		error = priv_check_cred(cred, PRIV_VFS_LINK, 0);
+		if (error)
+			return (error);
 	}
 
 	return (0);
@@ -1430,7 +1433,7 @@
 	int error;
 
 	bwillwrite();
-	NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, segflg, path, td);
+	NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, segflg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
@@ -1446,7 +1449,8 @@
 		VFS_UNLOCK_GIANT(vfslocked);
 		return (error);
 	}
-	NDINIT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE, segflg, link, td);
+	NDINIT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE2,
+	    segflg, link, td);
 	if ((error = namei(&nd)) == 0) {
 		lvfslocked = NDHASGIANT(&nd);
 		if (nd.ni_vp != NULL) {
@@ -1518,9 +1522,10 @@
 		if ((error = copyinstr(path, syspath, MAXPATHLEN, NULL)) != 0)
 			goto out;
 	}
+	AUDIT_ARG(text, syspath);
 restart:
 	bwillwrite();
-	NDINIT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE,
+	NDINIT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1,
 	    segflg, link, td);
 	if ((error = namei(&nd)) != 0)
 		goto out;
@@ -1545,9 +1550,9 @@
 		goto restart;
 	}
 	VATTR_NULL(&vattr);
-	FILEDESC_LOCK_FAST(td->td_proc->p_fd);
+	FILEDESC_SLOCK(td->td_proc->p_fd);
 	vattr.va_mode = ACCESSPERMS &~ td->td_proc->p_fd->fd_cmask;
-	FILEDESC_UNLOCK_FAST(td->td_proc->p_fd);
+	FILEDESC_SUNLOCK(td->td_proc->p_fd);
 #ifdef MAC
 	vattr.va_type = VLNK;
 	error = mac_check_vnode_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
@@ -1589,8 +1594,8 @@
 
 restart:
 	bwillwrite();
-	NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | MPSAFE, UIO_USERSPACE,
-	    uap->path, td);
+	NDINIT(&nd, DELETE, LOCKPARENT | DOWHITEOUT | MPSAFE | AUDITVNODE1,
+	    UIO_USERSPACE, uap->path, td);
 	error = namei(&nd);
 	if (error)
 		return (error);
@@ -1656,7 +1661,8 @@
 
 restart:
 	bwillwrite();
-	NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF | MPSAFE, pathseg, path, td);
+	NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF | MPSAFE | AUDITVNODE1,
+	    pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error == EINVAL ? EPERM : error);
 	vfslocked = NDHASGIANT(&nd);
@@ -1687,7 +1693,7 @@
 			goto restart;
 		}
 #ifdef MAC
-		error = mac_check_vnode_delete(td->td_ucred, nd.ni_dvp, vp,
+		error = mac_check_vnode_unlink(td->td_ucred, nd.ni_dvp, vp,
 		    &nd.ni_cnd);
 		if (error)
 			goto out;
@@ -1774,6 +1780,12 @@
 		break;
 	case L_SET:
 		break;
+	case SEEK_DATA:
+		error = fo_ioctl(fp, FIOSEEKDATA, &offset, cred, td);
+		break;
+	case SEEK_HOLE:
+		error = fo_ioctl(fp, FIOSEEKHOLE, &offset, cred, td);
+		break;
 	default:
 		error = EINVAL;
 	}
@@ -1815,16 +1827,28 @@
 		off_t offset;
 		int whence;
 	} */ nuap;
-	int error;
 
 	nuap.fd = uap->fd;
 	nuap.offset = uap->offset;
 	nuap.whence = uap->whence;
-	error = lseek(td, &nuap);
-	return (error);
+	return (lseek(td, &nuap));
 }
 #endif /* COMPAT_43 */
 
+/* Version with the 'pad' argument */
+int
+freebsd6_lseek(td, uap)
+	struct thread *td;
+	register struct freebsd6_lseek_args *uap;
+{
+	struct lseek_args ouap;
+
+	ouap.fd = uap->fd;
+	ouap.offset = uap->offset;
+	ouap.whence = uap->whence;
+	return (lseek(td, &ouap));
+}
+
 /*
  * Check access permissions using passed credentials.
  */
@@ -1898,7 +1922,8 @@
 	tmpcred->cr_uid = cred->cr_ruid;
 	tmpcred->cr_groups[0] = cred->cr_rgid;
 	td->td_ucred = tmpcred;
-	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE, pathseg, path, td);
+	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
+	    pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		goto out1;
 	vfslocked = NDHASGIANT(&nd);
@@ -1931,18 +1956,25 @@
 		int flags;
 	} */ *uap;
 {
+
+	return (kern_eaccess(td, uap->path, UIO_USERSPACE, uap->flags));
+}
+
+int
+kern_eaccess(struct thread *td, char *path, enum uio_seg pathseg, int flags)
+{
 	struct nameidata nd;
 	struct vnode *vp;
 	int vfslocked;
 	int error;
 
-	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE, UIO_USERSPACE,
-	    uap->path, td);
+	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
+	    pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vp = nd.ni_vp;
 	vfslocked = NDHASGIANT(&nd);
-	error = vn_access(vp, uap->flags, td->td_ucred, td);
+	error = vn_access(vp, flags, td->td_ucred, td);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vput(vp);
 	VFS_UNLOCK_GIANT(vfslocked);
@@ -2071,7 +2103,8 @@
 	struct stat sb;
 	int error, vfslocked;
 
-	NDINIT(&nd, LOOKUP, FOLLOW | LOCKSHARED | LOCKLEAF | MPSAFE,
+	NDINIT(&nd, LOOKUP,
+	    FOLLOW | LOCKSHARED | LOCKLEAF | MPSAFE | AUDITVNODE1,
 	    pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
@@ -2080,6 +2113,8 @@
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vput(nd.ni_vp);
 	VFS_UNLOCK_GIANT(vfslocked);
+	if (mtx_owned(&Giant))
+		printf("stat(%d): %s\n", vfslocked, path);
 	if (error)
 		return (error);
 	*sbp = sb;
@@ -2120,7 +2155,8 @@
 	struct nameidata nd;
 	int error, vfslocked;
 
-	NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED | MPSAFE,
+	NDINIT(&nd, LOOKUP,
+	    NOFOLLOW | LOCKLEAF | LOCKSHARED | MPSAFE | AUDITVNODE1,
 	    pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
@@ -2245,7 +2281,8 @@
 	struct nameidata nd;
 	int error, vfslocked;
 
-	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE, pathseg, path, td);
+	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
+	    pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
@@ -2296,7 +2333,8 @@
 	struct nameidata nd;
 	int vfslocked;
 
-	NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | MPSAFE, pathseg, path, td);
+	NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
+	    pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
@@ -2350,7 +2388,7 @@
 	 * chown can't fail when done as root.
 	 */
 	if (vp->v_type == VCHR || vp->v_type == VBLK) {
-		error = suser_cred(td->td_ucred, SUSER_ALLOWJAIL);
+		error = priv_check(td, PRIV_VFS_CHFLAGS_DEV);
 		if (error)
 			return (error);
 	}
@@ -2392,7 +2430,9 @@
 	struct nameidata nd;
 	int vfslocked;
 
-	NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_USERSPACE, uap->path, td);
+	AUDIT_ARG(fflags, uap->flags);
+	NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, UIO_USERSPACE,
+	    uap->path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
@@ -2418,7 +2458,9 @@
 	struct nameidata nd;
 	int vfslocked;
 
-	NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_USERSPACE, uap->path, td);
+	AUDIT_ARG(fflags, uap->flags);
+	NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE | AUDITVNODE1, UIO_USERSPACE,
+	    uap->path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
@@ -2450,12 +2492,19 @@
 	int vfslocked;
 	int error;
 
+	AUDIT_ARG(fd, uap->fd);
+	AUDIT_ARG(fflags, uap->flags);
 	if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0)
 		return (error);
 	vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
+#ifdef AUDIT
+	vn_lock(fp->f_vnode, LK_EXCLUSIVE | LK_RETRY, td);
+	AUDIT_ARG(vnode, fp->f_vnode, ARG_VNODE1);
+	VOP_UNLOCK(fp->f_vnode, 0, td);
+#endif
 	error = setfflags(td, fp->f_vnode, uap->flags);
-	fdrop(fp, td);
 	VFS_UNLOCK_GIANT(vfslocked);
+	fdrop(fp, td);
 	return (error);
 }
 
@@ -2516,7 +2565,8 @@
 	struct nameidata nd;
 	int vfslocked;
 
-	NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, pathseg, path, td);
+	AUDIT_ARG(mode, mode);
+	NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
@@ -2548,7 +2598,9 @@
 	struct nameidata nd;
 	int vfslocked;
 
-	NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_USERSPACE, uap->path, td);
+	AUDIT_ARG(mode, (mode_t)uap->mode);
+	NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE | AUDITVNODE1, UIO_USERSPACE,
+	    uap->path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
@@ -2580,12 +2632,19 @@
 	int vfslocked;
 	int error;
 
+	AUDIT_ARG(fd, uap->fd);
+	AUDIT_ARG(mode, uap->mode);
 	if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0)
 		return (error);
 	vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
+#ifdef AUDIT
+	vn_lock(fp->f_vnode, LK_EXCLUSIVE | LK_RETRY, td);
+	AUDIT_ARG(vnode, fp->f_vnode, ARG_VNODE1);
+	VOP_UNLOCK(fp->f_vnode, 0, td);
+#endif
 	error = setfmode(td, fp->f_vnode, uap->mode);
-	fdrop(fp, td);
 	VFS_UNLOCK_GIANT(vfslocked);
+	fdrop(fp, td);
 	return (error);
 }
 
@@ -2652,7 +2711,8 @@
 	struct nameidata nd;
 	int vfslocked;
 
-	NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, pathseg, path, td);
+	AUDIT_ARG(owner, uid, gid);
+	NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
@@ -2694,7 +2754,8 @@
 	struct nameidata nd;
 	int vfslocked;
 
-	NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE, pathseg, path, td);
+	AUDIT_ARG(owner, uid, gid);
+	NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
@@ -2728,12 +2789,19 @@
 	int vfslocked;
 	int error;
 
+	AUDIT_ARG(fd, uap->fd);
+	AUDIT_ARG(owner, uap->uid, uap->gid);
 	if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0)
 		return (error);
 	vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
+#ifdef AUDIT
+	vn_lock(fp->f_vnode, LK_EXCLUSIVE | LK_RETRY, td);
+	AUDIT_ARG(vnode, fp->f_vnode, ARG_VNODE1);
+	VOP_UNLOCK(fp->f_vnode, 0, td);
+#endif
 	error = setfown(td, fp->f_vnode, uap->uid, uap->gid);
-	fdrop(fp, td);
 	VFS_UNLOCK_GIANT(vfslocked);
+	fdrop(fp, td);
 	return (error);
 }
 
@@ -2848,7 +2916,7 @@
 
 	if ((error = getutimes(tptr, tptrseg, ts)) != 0)
 		return (error);
-	NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, pathseg, path, td);
+	NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
@@ -2892,7 +2960,7 @@
 
 	if ((error = getutimes(tptr, tptrseg, ts)) != 0)
 		return (error);
-	NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE, pathseg, path, td);
+	NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
@@ -2933,14 +3001,20 @@
 	int vfslocked;
 	int error;
 
+	AUDIT_ARG(fd, fd);
 	if ((error = getutimes(tptr, tptrseg, ts)) != 0)
 		return (error);
 	if ((error = getvnode(td->td_proc->p_fd, fd, &fp)) != 0)
 		return (error);
 	vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
+#ifdef AUDIT
+	vn_lock(fp->f_vnode, LK_EXCLUSIVE | LK_RETRY, td);
+	AUDIT_ARG(vnode, fp->f_vnode, ARG_VNODE1);
+	VOP_UNLOCK(fp->f_vnode, 0, td);
+#endif
 	error = setutimes(td, fp->f_vnode, ts, 2, tptr == NULL);
-	fdrop(fp, td);
 	VFS_UNLOCK_GIANT(vfslocked);
+	fdrop(fp, td);
 	return (error);
 }
 
@@ -2979,7 +3053,7 @@
 
 	if (length < 0)
 		return(EINVAL);
-	NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, pathseg, path, td);
+	NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE | AUDITVNODE1, pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
@@ -3036,6 +3110,7 @@
 	int vfslocked;
 	int error;
 
+	AUDIT_ARG(fd, uap->fd);
 	if (uap->length < 0)
 		return(EINVAL);
 	if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0)
@@ -3050,6 +3125,7 @@
 		goto drop;
 	VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	AUDIT_ARG(vnode, vp, ARG_VNODE1);
 	if (vp->v_type == VDIR)
 		error = EISDIR;
 #ifdef MAC
@@ -3128,6 +3204,27 @@
 }
 #endif /* COMPAT_43 */
 
+/* Versions with the pad argument */
+int
+freebsd6_truncate(struct thread *td, struct freebsd6_truncate_args *uap)
+{
+	struct truncate_args ouap;
+
+	ouap.path = uap->path;
+	ouap.length = uap->length;
+	return (truncate(td, &ouap));
+}
+
+int
+freebsd6_ftruncate(struct thread *td, struct freebsd6_ftruncate_args *uap)
+{
+	struct ftruncate_args ouap;
+
+	ouap.fd = uap->fd;
+	ouap.length = uap->length;
+	return (ftruncate(td, &ouap));
+}
+
 /*
  * Sync an open file.
  */
@@ -3149,6 +3246,7 @@
 	int vfslocked;
 	int error;
 
+	AUDIT_ARG(fd, uap->fd);
 	if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0)
 		return (error);
 	vp = fp->f_vnode;
@@ -3156,6 +3254,7 @@
 	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
 		goto drop;
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	AUDIT_ARG(vnode, vp, ARG_VNODE1);
 	if (vp->v_object != NULL) {
 		VM_OBJECT_LOCK(vp->v_object);
 		vm_object_page_clean(vp->v_object, 0, 0, 0);
@@ -3172,8 +3271,8 @@
 }
 
 /*
- * Rename files.  Source and destination must either both be directories,
- * or both not be directories.  If target is a directory, it must be empty.
+ * Rename files.  Source and destination must either both be directories, or
+ * both not be directories.  If target is a directory, it must be empty.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct rename_args {
@@ -3205,11 +3304,11 @@
 
 	bwillwrite();
 #ifdef MAC
-	NDINIT(&fromnd, DELETE, LOCKPARENT | LOCKLEAF | SAVESTART | MPSAFE,
-	    pathseg, from, td);
+	NDINIT(&fromnd, DELETE, LOCKPARENT | LOCKLEAF | SAVESTART | MPSAFE |
+	    AUDITVNODE1, pathseg, from, td);
 #else
-	NDINIT(&fromnd, DELETE, WANTPARENT | SAVESTART | MPSAFE,
-	    pathseg, from, td);
+	NDINIT(&fromnd, DELETE, WANTPARENT | SAVESTART | MPSAFE |
+	    AUDITVNODE1, pathseg, from, td);
 #endif
 	if ((error = namei(&fromnd)) != 0)
 		return (error);
@@ -3219,7 +3318,8 @@
 	error = mac_check_vnode_rename_from(td->td_ucred, fromnd.ni_dvp,
 	    fromnd.ni_vp, &fromnd.ni_cnd);
 	VOP_UNLOCK(fromnd.ni_dvp, 0, td);
-	VOP_UNLOCK(fromnd.ni_vp, 0, td);
+	if (fromnd.ni_dvp != fromnd.ni_vp)
+		VOP_UNLOCK(fromnd.ni_vp, 0, td);
 #endif
 	fvp = fromnd.ni_vp;
 	if (error == 0)
@@ -3231,7 +3331,7 @@
 		goto out1;
 	}
 	NDINIT(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART |
-	    MPSAFE, pathseg, to, td);
+	    MPSAFE | AUDITVNODE2, pathseg, to, td);
 	if (fromnd.ni_vp->v_type == VDIR)
 		tond.ni_cnd.cn_flags |= WILLBEDIR;
 	if ((error = namei(&tond)) != 0) {
@@ -3337,9 +3437,11 @@
 	struct nameidata nd;
 	int vfslocked;
 
+	AUDIT_ARG(mode, mode);
 restart:
 	bwillwrite();
-	NDINIT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE, segflg, path, td);
+	NDINIT(&nd, CREATE, LOCKPARENT | SAVENAME | MPSAFE | AUDITVNODE1,
+	    segflg, path, td);
 	nd.ni_cnd.cn_flags |= WILLBEDIR;
 	if ((error = namei(&nd)) != 0)
 		return (error);
@@ -3370,9 +3472,9 @@
 	}
 	VATTR_NULL(&vattr);
 	vattr.va_type = VDIR;
-	FILEDESC_LOCK_FAST(td->td_proc->p_fd);
+	FILEDESC_SLOCK(td->td_proc->p_fd);
 	vattr.va_mode = (mode & ACCESSPERMS) &~ td->td_proc->p_fd->fd_cmask;
-	FILEDESC_UNLOCK_FAST(td->td_proc->p_fd);
+	FILEDESC_SUNLOCK(td->td_proc->p_fd);
 #ifdef MAC
 	error = mac_check_vnode_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
 	    &vattr);
@@ -3423,7 +3525,8 @@
 
 restart:
 	bwillwrite();
-	NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF | MPSAFE, pathseg, path, td);
+	NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF | MPSAFE | AUDITVNODE1,
+	    pathseg, path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
@@ -3447,7 +3550,7 @@
 		goto out;
 	}
 #ifdef MAC
-	error = mac_check_vnode_delete(td->td_ucred, nd.ni_dvp, vp,
+	error = mac_check_vnode_unlink(td->td_ucred, nd.ni_dvp, vp,
 	    &nd.ni_cnd);
 	if (error)
 		goto out;
@@ -3507,7 +3610,7 @@
 	struct iovec aiov, kiov;
 	struct dirent *dp, *edp;
 	caddr_t dirbuf;
-	int error, eofflag, readcnt;
+	int error, eofflag, readcnt, vfslocked;
 	long loff;
 
 	/* XXX arbitrary sanity limit on `count'. */
@@ -3521,7 +3624,9 @@
 	}
 	vp = fp->f_vnode;
 unionread:
+	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	if (vp->v_type != VDIR) {
+		VFS_UNLOCK_GIANT(vfslocked);
 		fdrop(fp, td);
 		return (EINVAL);
 	}
@@ -3539,6 +3644,7 @@
 	error = mac_check_vnode_readdir(td->td_ucred, vp);
 	if (error) {
 		VOP_UNLOCK(vp, 0, td);
+		VFS_UNLOCK_GIANT(vfslocked);
 		fdrop(fp, td);
 		return (error);
 	}
@@ -3594,39 +3700,27 @@
 		}
 		FREE(dirbuf, M_TEMP);
 	}
-	VOP_UNLOCK(vp, 0, td);
 	if (error) {
+		VOP_UNLOCK(vp, 0, td);
+		VFS_UNLOCK_GIANT(vfslocked);
 		fdrop(fp, td);
 		return (error);
 	}
-	if (uap->count == auio.uio_resid) {
-		if (union_dircheckp) {
-			error = union_dircheckp(td, &vp, fp);
-			if (error == -1)
-				goto unionread;
-			if (error) {
-				fdrop(fp, td);
-				return (error);
-			}
-		}
-		/*
-		 * XXX We could delay dropping the lock above but
-		 * union_dircheckp complicates things.
-		 */
-		vn_lock(vp, LK_EXCLUSIVE|LK_RETRY, td);
-		if ((vp->v_vflag & VV_ROOT) &&
-		    (vp->v_mount->mnt_flag & MNT_UNION)) {
-			struct vnode *tvp = vp;
-			vp = vp->v_mount->mnt_vnodecovered;
-			VREF(vp);
-			fp->f_vnode = vp;
-			fp->f_data = vp;
-			fp->f_offset = 0;
-			vput(tvp);
-			goto unionread;
-		}
-		VOP_UNLOCK(vp, 0, td);
+	if (uap->count == auio.uio_resid &&
+	    (vp->v_vflag & VV_ROOT) &&
+	    (vp->v_mount->mnt_flag & MNT_UNION)) {
+		struct vnode *tvp = vp;
+		vp = vp->v_mount->mnt_vnodecovered;
+		VREF(vp);
+		fp->f_vnode = vp;
+		fp->f_data = vp;
+		fp->f_offset = 0;
+		vput(tvp);
+		VFS_UNLOCK_GIANT(vfslocked);
+		goto unionread;
 	}
+	VOP_UNLOCK(vp, 0, td);
+	VFS_UNLOCK_GIANT(vfslocked);
 	error = copyout(&loff, uap->basep, sizeof(long));
 	fdrop(fp, td);
 	td->td_retval[0] = uap->count - auio.uio_resid;
@@ -3663,6 +3757,7 @@
 	long loff;
 	int error, eofflag;
 
+	AUDIT_ARG(fd, uap->fd);
 	if ((error = getvnode(td->td_proc->p_fd, uap->fd, &fp)) != 0)
 		return (error);
 	if ((fp->f_flag & FREAD) == 0) {
@@ -3673,6 +3768,7 @@
 unionread:
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	if (vp->v_type != VDIR) {
+		VFS_UNLOCK_GIANT(vfslocked);
 		error = EINVAL;
 		goto fail;
 	}
@@ -3686,6 +3782,7 @@
 	auio.uio_resid = uap->count;
 	/* vn_lock(vp, LK_SHARED | LK_RETRY, td); */
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	AUDIT_ARG(vnode, vp, ARG_VNODE1);
 	loff = auio.uio_offset = fp->f_offset;
 #ifdef MAC
 	error = mac_check_vnode_readdir(td->td_ucred, vp);
@@ -3694,47 +3791,35 @@
 		error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL,
 		    NULL);
 	fp->f_offset = auio.uio_offset;
-	VOP_UNLOCK(vp, 0, td);
-	if (error)
-		goto fail;
-	if (uap->count == auio.uio_resid) {
-		if (union_dircheckp) {
-			error = union_dircheckp(td, &vp, fp);
-			if (error == -1) {
-				VFS_UNLOCK_GIANT(vfslocked);
-				goto unionread;
-			}
-			if (error)
-				goto fail;
-		}
-		/*
-		 * XXX We could delay dropping the lock above but
-		 * union_dircheckp complicates things.
-		 */
-		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
-		if ((vp->v_vflag & VV_ROOT) &&
-		    (vp->v_mount->mnt_flag & MNT_UNION)) {
-			struct vnode *tvp = vp;
-			vp = vp->v_mount->mnt_vnodecovered;
-			VREF(vp);
-			fp->f_vnode = vp;
-			fp->f_data = vp;
-			fp->f_offset = 0;
-			vput(tvp);
-			VFS_UNLOCK_GIANT(vfslocked);
-			goto unionread;
-		}
+	if (error) {
 		VOP_UNLOCK(vp, 0, td);
+		VFS_UNLOCK_GIANT(vfslocked);
+		goto fail;
 	}
+	if (uap->count == auio.uio_resid &&
+	    (vp->v_vflag & VV_ROOT) &&
+	    (vp->v_mount->mnt_flag & MNT_UNION)) {
+		struct vnode *tvp = vp;
+		vp = vp->v_mount->mnt_vnodecovered;
+		VREF(vp);
+		fp->f_vnode = vp;
+		fp->f_data = vp;
+		fp->f_offset = 0;
+		vput(tvp);
+		VFS_UNLOCK_GIANT(vfslocked);
+		goto unionread;
+	}
+	VOP_UNLOCK(vp, 0, td);
+	VFS_UNLOCK_GIANT(vfslocked);
 	if (uap->basep != NULL) {
 		error = copyout(&loff, uap->basep, sizeof(long));
 	}
 	td->td_retval[0] = uap->count - auio.uio_resid;
 fail:
-	VFS_UNLOCK_GIANT(vfslocked);
 	fdrop(fp, td);
 	return (error);
 }
+
 #ifndef _SYS_SYSPROTO_H_
 struct getdents_args {
 	int fd;
@@ -3761,8 +3846,6 @@
 
 /*
  * Set the mode mask for creation of filesystem nodes.
- *
- * MP SAFE
  */
 #ifndef _SYS_SYSPROTO_H_
 struct umask_args {
@@ -3778,17 +3861,17 @@
 {
 	register struct filedesc *fdp;
 
-	FILEDESC_LOCK_FAST(td->td_proc->p_fd);
+	FILEDESC_XLOCK(td->td_proc->p_fd);
 	fdp = td->td_proc->p_fd;
 	td->td_retval[0] = fdp->fd_cmask;
 	fdp->fd_cmask = uap->newmask & ALLPERMS;
-	FILEDESC_UNLOCK_FAST(td->td_proc->p_fd);
+	FILEDESC_XUNLOCK(td->td_proc->p_fd);
 	return (0);
 }
 
 /*
- * Void all references to file by ripping underlying filesystem
- * away from vnode.
+ * Void all references to file by ripping underlying filesystem away from
+ * vnode.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct revoke_args {
@@ -3808,8 +3891,8 @@
 	struct nameidata nd;
 	int vfslocked;
 
-	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE, UIO_USERSPACE,
-	    uap->path, td);
+	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
+	    UIO_USERSPACE, uap->path, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	vfslocked = NDHASGIANT(&nd);
@@ -3828,7 +3911,7 @@
 	if (error)
 		goto out;
 	if (td->td_ucred->cr_uid != vattr.va_uid) {
-		error = suser_cred(td->td_ucred, SUSER_ALLOWJAIL);
+		error = priv_check(td, PRIV_VFS_ADMIN);
 		if (error)
 			goto out;
 	}
@@ -3857,7 +3940,7 @@
 	if (fdp == NULL)
 		error = EBADF;
 	else {
-		FILEDESC_LOCK(fdp);
+		FILEDESC_SLOCK(fdp);
 		if ((u_int)fd >= fdp->fd_nfiles ||
 		    (fp = fdp->fd_ofiles[fd]) == NULL)
 			error = EBADF;
@@ -3868,14 +3951,14 @@
 			fhold(fp);
 			error = 0;
 		}
-		FILEDESC_UNLOCK(fdp);
+		FILEDESC_SUNLOCK(fdp);
 	}
 	*fpp = fp;
 	return (error);
 }
 
 /*
- * Get (NFS) file handle
+ * Get an (NFS) file handle.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct lgetfh_args {
@@ -3894,10 +3977,10 @@
 	int vfslocked;
 	int error;
 
-	error = suser(td);
+	error = priv_check(td, PRIV_VFS_GETFH);
 	if (error)
 		return (error);
-	NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | MPSAFE,
+	NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
 	    UIO_USERSPACE, uap->fname, td);
 	error = namei(&nd);
 	if (error)
@@ -3907,7 +3990,7 @@
 	vp = nd.ni_vp;
 	bzero(&fh, sizeof(fh));
 	fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
-	error = VFS_VPTOFH(vp, &fh.fh_fid);
+	error = VOP_VPTOFH(vp, &fh.fh_fid);
 	vput(vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	if (error)
@@ -3933,10 +4016,10 @@
 	int vfslocked;
 	int error;
 
-	error = suser(td);
+	error = priv_check(td, PRIV_VFS_GETFH);
 	if (error)
 		return (error);
-	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE,
+	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
 	    UIO_USERSPACE, uap->fname, td);
 	error = namei(&nd);
 	if (error)
@@ -3946,7 +4029,7 @@
 	vp = nd.ni_vp;
 	bzero(&fh, sizeof(fh));
 	fh.fh_fsid = vp->v_mount->mnt_stat.f_fsid;
-	error = VFS_VPTOFH(vp, &fh.fh_fid);
+	error = VOP_VPTOFH(vp, &fh.fh_fid);
 	vput(vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	if (error)
@@ -3956,13 +4039,11 @@
 }
 
 /*
- * syscall for the rpc.lockd to use to translate a NFS file handle into
- * an open descriptor.
+ * syscall for the rpc.lockd to use to translate a NFS file handle into an
+ * open descriptor.
  *
- * warning: do not remove the suser() call or this becomes one giant
+ * warning: do not remove the priv_check() call or this becomes one giant
  * security hole.
- *
- * MP SAFE
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fhopen_args {
@@ -3989,9 +4070,10 @@
 	register struct filedesc *fdp = p->p_fd;
 	int fmode, mode, error, type;
 	struct file *nfp;
+	int vfslocked;
 	int indx;
 
-	error = suser(td);
+	error = priv_check(td, PRIV_VFS_FHOPEN);
 	if (error)
 		return (error);
 	fmode = FFLAGS(uap->flags);
@@ -4002,12 +4084,10 @@
 	if (error)
 		return(error);
 	/* find the mount point */
-	mtx_lock(&Giant);
 	mp = vfs_getvfs(&fhp.fh_fsid);
-	if (mp == NULL) {
-		error = ESTALE;
-		goto out;
-	}
+	if (mp == NULL)
+		return (ESTALE);
+	vfslocked = VFS_LOCK_GIANT(mp);
 	/* now give me my vnode, it gets returned to me locked */
 	error = VFS_FHTOVP(mp, &fhp.fh_fid, &vp);
 	if (error)
@@ -4081,7 +4161,7 @@
 		if (error)
 			goto bad;
 	}
-	error = VOP_OPEN(vp, fmode, td->td_ucred, td, -1);
+	error = VOP_OPEN(vp, fmode, td->td_ucred, td, NULL);
 	if (error)
 		goto bad;
 
@@ -4100,11 +4180,13 @@
 	/* An extra reference on `nfp' has been held for us by falloc(). */
 	fp = nfp;
 
+	FILE_LOCK(nfp);
 	nfp->f_vnode = vp;
 	nfp->f_data = vp;
 	nfp->f_flag = fmode & FMASK;
-	nfp->f_ops = &vnops;
 	nfp->f_type = DTYPE_VNODE;
+	nfp->f_ops = &vnops;
+	FILE_UNLOCK(nfp);
 	if (fmode & (O_EXLOCK | O_SHLOCK)) {
 		lf.l_whence = SEEK_SET;
 		lf.l_start = 0;
@@ -4138,21 +4220,21 @@
 
 	VOP_UNLOCK(vp, 0, td);
 	fdrop(fp, td);
-	mtx_unlock(&Giant);
+	vfs_rel(mp);
+	VFS_UNLOCK_GIANT(vfslocked);
 	td->td_retval[0] = indx;
 	return (0);
 
 bad:
 	vput(vp);
 out:
-	mtx_unlock(&Giant);
+	vfs_rel(mp);
+	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
  * Stat an (NFS) file handle.
- *
- * MP SAFE
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fhstat_args {
@@ -4172,26 +4254,27 @@
 	fhandle_t fh;
 	struct mount *mp;
 	struct vnode *vp;
+	int vfslocked;
 	int error;
 
-	error = suser(td);
+	error = priv_check(td, PRIV_VFS_FHSTAT);
 	if (error)
 		return (error);
 	error = copyin(uap->u_fhp, &fh, sizeof(fhandle_t));
 	if (error)
 		return (error);
-	mtx_lock(&Giant);
-	if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL) {
-		mtx_unlock(&Giant);
+	if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL)
 		return (ESTALE);
-	}
+	vfslocked = VFS_LOCK_GIANT(mp);
 	if ((error = VFS_FHTOVP(mp, &fh.fh_fid, &vp))) {
-		mtx_unlock(&Giant);
+		vfs_rel(mp);
+		VFS_UNLOCK_GIANT(vfslocked);
 		return (error);
 	}
 	error = vn_stat(vp, &sb, td->td_ucred, NOCRED, td);
 	vput(vp);
-	mtx_unlock(&Giant);
+	vfs_rel(mp);
+	VFS_UNLOCK_GIANT(vfslocked);
 	if (error)
 		return (error);
 	error = copyout(&sb, uap->sb, sizeof(sb));
@@ -4200,8 +4283,6 @@
 
 /*
  * Implement fstatfs() for (NFS) file handles.
- *
- * MP SAFE
  */
 #ifndef _SYS_SYSPROTO_H_
 struct fhstatfs_args {
@@ -4236,39 +4317,29 @@
 	struct statfs *sp;
 	struct mount *mp;
 	struct vnode *vp;
+	int vfslocked;
 	int error;
 
-	error = suser(td);
+	error = priv_check(td, PRIV_VFS_FHSTATFS);
 	if (error)
 		return (error);
-	mtx_lock(&Giant);
-	if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL) {
-		mtx_unlock(&Giant);
+	if ((mp = vfs_getvfs(&fh.fh_fsid)) == NULL)
 		return (ESTALE);
-	}
+	vfslocked = VFS_LOCK_GIANT(mp);
 	error = VFS_FHTOVP(mp, &fh.fh_fid, &vp);
 	if (error) {
-		mtx_unlock(&Giant);
+		VFS_UNLOCK_GIANT(vfslocked);
+		vfs_rel(mp);
 		return (error);
 	}
-	mp = vp->v_mount;
-	if (mp)
-		vfs_ref(mp);
 	vput(vp);
-	if (mp == NULL)
-		return (EBADF);
 	error = prison_canseemount(td->td_ucred, mp);
-	if (error) {
-		vfs_rel(mp);
-		return (error);
-	}
+	if (error)
+		goto out;
 #ifdef MAC
 	error = mac_check_mount_stat(td->td_ucred, mp);
-	if (error) {
-		vfs_rel(mp);
-		mtx_unlock(&Giant);
-		return (error);
-	}
+	if (error)
+		goto out;
 #endif
 	/*
 	 * Set these in case the underlying filesystem fails to do so.
@@ -4278,714 +4349,10 @@
 	sp->f_namemax = NAME_MAX;
 	sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
 	error = VFS_STATFS(mp, sp, td);
-	vfs_rel(mp);
-	mtx_unlock(&Giant);
-	if (error)
-		return (error);
-	*buf = *sp;
-	return (0);
-}
-
-/*
- * Syscall to push extended attribute configuration information into the
- * VFS.  Accepts a path, which it converts to a mountpoint, as well as
- * a command (int cmd), and attribute name and misc data.  For now, the
- * attribute name is left in userspace for consumption by the VFS_op.
- * It will probably be changed to be copied into sysspace by the
- * syscall in the future, once issues with various consumers of the
- * attribute code have raised their hands.
- *
- * Currently this is used only by UFS Extended Attributes.
- */
-int
-extattrctl(td, uap)
-	struct thread *td;
-	struct extattrctl_args /* {
-		const char *path;
-		int cmd;
-		const char *filename;
-		int attrnamespace;
-		const char *attrname;
-	} */ *uap;
-{
-	struct vnode *filename_vp;
-	struct nameidata nd;
-	struct mount *mp, *mp_writable;
-	char attrname[EXTATTR_MAXNAMELEN];
-	int vfslocked, fnvfslocked, error;
-
-	/*
-	 * uap->attrname is not always defined.  We check again later when we
-	 * invoke the VFS call so as to pass in NULL there if needed.
-	 */
-	if (uap->attrname != NULL) {
-		error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN,
-		    NULL);
-		if (error)
-			return (error);
-	}
-
-	vfslocked = fnvfslocked = 0;
-	/*
-	 * uap->filename is not always defined.  If it is, grab a vnode lock,
-	 * which VFS_EXTATTRCTL() will later release.
-	 */
-	filename_vp = NULL;
-	if (uap->filename != NULL) {
-		NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | LOCKLEAF,
-		    UIO_USERSPACE, uap->filename, td);
-		error = namei(&nd);
-		if (error)
-			return (error);
-		fnvfslocked = NDHASGIANT(&nd);
-		filename_vp = nd.ni_vp;
-		NDFREE(&nd, NDF_NO_VP_RELE | NDF_NO_VP_UNLOCK);
-	}
-
-	/* uap->path is always defined. */
-	NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW, UIO_USERSPACE, uap->path, td);
-	error = namei(&nd);
-	if (error) {
-		if (filename_vp != NULL)
-			vput(filename_vp);
-		goto out;
-	}
-	vfslocked = NDHASGIANT(&nd);
-	mp = nd.ni_vp->v_mount;
-	error = vn_start_write(nd.ni_vp, &mp_writable, V_WAIT | PCATCH);
-	NDFREE(&nd, 0);
-	if (error) {
-		if (filename_vp != NULL)
-			vput(filename_vp);
-		goto out;
-	}
-
-	error = VFS_EXTATTRCTL(mp, uap->cmd, filename_vp, uap->attrnamespace,
-	    uap->attrname != NULL ? attrname : NULL, td);
-
-	vn_finished_write(mp_writable);
-	/*
-	 * VFS_EXTATTRCTL will have unlocked, but not de-ref'd,
-	 * filename_vp, so vrele it if it is defined.
-	 */
-	if (filename_vp != NULL)
-		vrele(filename_vp);
+	if (error == 0)
+		*buf = *sp;
 out:
-	VFS_UNLOCK_GIANT(fnvfslocked);
-	VFS_UNLOCK_GIANT(vfslocked);
-	return (error);
-}
-
-/*-
- * Set a named extended attribute on a file or directory
- *
- * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace",
- *            kernelspace string pointer "attrname", userspace buffer
- *            pointer "data", buffer length "nbytes", thread "td".
- * Returns: 0 on success, an error number otherwise
- * Locks: none
- * References: vp must be a valid reference for the duration of the call
- */
-static int
-extattr_set_vp(struct vnode *vp, int attrnamespace, const char *attrname,
-    void *data, size_t nbytes, struct thread *td)
-{
-	struct mount *mp;
-	struct uio auio;
-	struct iovec aiov;
-	ssize_t cnt;
-	int error;
-
-	VFS_ASSERT_GIANT(vp->v_mount);
-	error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
-	if (error)
-		return (error);
-	VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
-
-	aiov.iov_base = data;
-	aiov.iov_len = nbytes;
-	auio.uio_iov = &aiov;
-	auio.uio_iovcnt = 1;
-	auio.uio_offset = 0;
-	if (nbytes > INT_MAX) {
-		error = EINVAL;
-		goto done;
-	}
-	auio.uio_resid = nbytes;
-	auio.uio_rw = UIO_WRITE;
-	auio.uio_segflg = UIO_USERSPACE;
-	auio.uio_td = td;
-	cnt = nbytes;
-
-#ifdef MAC
-	error = mac_check_vnode_setextattr(td->td_ucred, vp, attrnamespace,
-	    attrname, &auio);
-	if (error)
-		goto done;
-#endif
-
-	error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio,
-	    td->td_ucred, td);
-	cnt -= auio.uio_resid;
-	td->td_retval[0] = cnt;
-
-done:
-	VOP_UNLOCK(vp, 0, td);
-	vn_finished_write(mp);
-	return (error);
-}
-
-int
-extattr_set_fd(td, uap)
-	struct thread *td;
-	struct extattr_set_fd_args /* {
-		int fd;
-		int attrnamespace;
-		const char *attrname;
-		void *data;
-		size_t nbytes;
-	} */ *uap;
-{
-	struct file *fp;
-	char attrname[EXTATTR_MAXNAMELEN];
-	int vfslocked, error;
-
-	error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
-	if (error)
-		return (error);
-
-	error = getvnode(td->td_proc->p_fd, uap->fd, &fp);
-	if (error)
-		return (error);
-
-	vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
-	error = extattr_set_vp(fp->f_vnode, uap->attrnamespace,
-	    attrname, uap->data, uap->nbytes, td);
-	fdrop(fp, td);
-	VFS_UNLOCK_GIANT(vfslocked);
-
-	return (error);
-}
-
-int
-extattr_set_file(td, uap)
-	struct thread *td;
-	struct extattr_set_file_args /* {
-		const char *path;
-		int attrnamespace;
-		const char *attrname;
-		void *data;
-		size_t nbytes;
-	} */ *uap;
-{
-	struct nameidata nd;
-	char attrname[EXTATTR_MAXNAMELEN];
-	int vfslocked, error;
-
-	error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
-	if (error)
-		return (error);
-
-	NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW, UIO_USERSPACE, uap->path, td);
-	error = namei(&nd);
-	if (error)
-		return (error);
-	NDFREE(&nd, NDF_ONLY_PNBUF);
-
-	vfslocked = NDHASGIANT(&nd);
-	error = extattr_set_vp(nd.ni_vp, uap->attrnamespace, attrname,
-	    uap->data, uap->nbytes, td);
-
-	vrele(nd.ni_vp);
-	VFS_UNLOCK_GIANT(vfslocked);
-	return (error);
-}
-
-int
-extattr_set_link(td, uap)
-	struct thread *td;
-	struct extattr_set_link_args /* {
-		const char *path;
-		int attrnamespace;
-		const char *attrname;
-		void *data;
-		size_t nbytes;
-	} */ *uap;
-{
-	struct nameidata nd;
-	char attrname[EXTATTR_MAXNAMELEN];
-	int vfslocked, error;
-
-	error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
-	if (error)
-		return (error);
-
-	NDINIT(&nd, LOOKUP, MPSAFE | NOFOLLOW, UIO_USERSPACE, uap->path, td);
-	error = namei(&nd);
-	if (error)
-		return (error);
-	NDFREE(&nd, NDF_ONLY_PNBUF);
-
-	vfslocked = NDHASGIANT(&nd);
-	error = extattr_set_vp(nd.ni_vp, uap->attrnamespace, attrname,
-	    uap->data, uap->nbytes, td);
-
-	vrele(nd.ni_vp);
-	VFS_UNLOCK_GIANT(vfslocked);
-	return (error);
-}
-
-/*-
- * Get a named extended attribute on a file or directory
- *
- * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace",
- *            kernelspace string pointer "attrname", userspace buffer
- *            pointer "data", buffer length "nbytes", thread "td".
- * Returns: 0 on success, an error number otherwise
- * Locks: none
- * References: vp must be a valid reference for the duration of the call
- */
-static int
-extattr_get_vp(struct vnode *vp, int attrnamespace, const char *attrname,
-    void *data, size_t nbytes, struct thread *td)
-{
-	struct uio auio, *auiop;
-	struct iovec aiov;
-	ssize_t cnt;
-	size_t size, *sizep;
-	int error;
-
-	VFS_ASSERT_GIANT(vp->v_mount);
-	VOP_LEASE(vp, td, td->td_ucred, LEASE_READ);
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
-
-	/*
-	 * Slightly unusual semantics: if the user provides a NULL data
-	 * pointer, they don't want to receive the data, just the
-	 * maximum read length.
-	 */
-	auiop = NULL;
-	sizep = NULL;
-	cnt = 0;
-	if (data != NULL) {
-		aiov.iov_base = data;
-		aiov.iov_len = nbytes;
-		auio.uio_iov = &aiov;
-		auio.uio_iovcnt = 1;
-		auio.uio_offset = 0;
-		if (nbytes > INT_MAX) {
-			error = EINVAL;
-			goto done;
-		}
-		auio.uio_resid = nbytes;
-		auio.uio_rw = UIO_READ;
-		auio.uio_segflg = UIO_USERSPACE;
-		auio.uio_td = td;
-		auiop = &auio;
-		cnt = nbytes;
-	} else
-		sizep = &size;
-
-#ifdef MAC
-	error = mac_check_vnode_getextattr(td->td_ucred, vp, attrnamespace,
-	    attrname, &auio);
-	if (error)
-		goto done;
-#endif
-
-	error = VOP_GETEXTATTR(vp, attrnamespace, attrname, auiop, sizep,
-	    td->td_ucred, td);
-
-	if (auiop != NULL) {
-		cnt -= auio.uio_resid;
-		td->td_retval[0] = cnt;
-	} else
-		td->td_retval[0] = size;
-
-done:
-	VOP_UNLOCK(vp, 0, td);
-	return (error);
-}
-
-int
-extattr_get_fd(td, uap)
-	struct thread *td;
-	struct extattr_get_fd_args /* {
-		int fd;
-		int attrnamespace;
-		const char *attrname;
-		void *data;
-		size_t nbytes;
-	} */ *uap;
-{
-	struct file *fp;
-	char attrname[EXTATTR_MAXNAMELEN];
-	int vfslocked, error;
-
-	error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
-	if (error)
-		return (error);
-
-	error = getvnode(td->td_proc->p_fd, uap->fd, &fp);
-	if (error)
-		return (error);
-
-	vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
-	error = extattr_get_vp(fp->f_vnode, uap->attrnamespace,
-	    attrname, uap->data, uap->nbytes, td);
-
-	fdrop(fp, td);
-	VFS_UNLOCK_GIANT(vfslocked);
-	return (error);
-}
-
-int
-extattr_get_file(td, uap)
-	struct thread *td;
-	struct extattr_get_file_args /* {
-		const char *path;
-		int attrnamespace;
-		const char *attrname;
-		void *data;
-		size_t nbytes;
-	} */ *uap;
-{
-	struct nameidata nd;
-	char attrname[EXTATTR_MAXNAMELEN];
-	int vfslocked, error;
-
-	error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
-	if (error)
-		return (error);
-
-	NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW, UIO_USERSPACE, uap->path, td);
-	error = namei(&nd);
-	if (error)
-		return (error);
-	NDFREE(&nd, NDF_ONLY_PNBUF);
-
-	vfslocked = NDHASGIANT(&nd);
-	error = extattr_get_vp(nd.ni_vp, uap->attrnamespace, attrname,
-	    uap->data, uap->nbytes, td);
-
-	vrele(nd.ni_vp);
-	VFS_UNLOCK_GIANT(vfslocked);
-	return (error);
-}
-
-int
-extattr_get_link(td, uap)
-	struct thread *td;
-	struct extattr_get_link_args /* {
-		const char *path;
-		int attrnamespace;
-		const char *attrname;
-		void *data;
-		size_t nbytes;
-	} */ *uap;
-{
-	struct nameidata nd;
-	char attrname[EXTATTR_MAXNAMELEN];
-	int vfslocked, error;
-
-	error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
-	if (error)
-		return (error);
-
-	NDINIT(&nd, LOOKUP, MPSAFE | NOFOLLOW, UIO_USERSPACE, uap->path, td);
-	error = namei(&nd);
-	if (error)
-		return (error);
-	NDFREE(&nd, NDF_ONLY_PNBUF);
-
-	vfslocked = NDHASGIANT(&nd);
-	error = extattr_get_vp(nd.ni_vp, uap->attrnamespace, attrname,
-	    uap->data, uap->nbytes, td);
-
-	vrele(nd.ni_vp);
-	VFS_UNLOCK_GIANT(vfslocked);
-	return (error);
-}
-
-/*
- * extattr_delete_vp(): Delete a named extended attribute on a file or
- *                      directory
- *
- * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace",
- *            kernelspace string pointer "attrname", proc "p"
- * Returns: 0 on success, an error number otherwise
- * Locks: none
- * References: vp must be a valid reference for the duration of the call
- */
-static int
-extattr_delete_vp(struct vnode *vp, int attrnamespace, const char *attrname,
-    struct thread *td)
-{
-	struct mount *mp;
-	int error;
-
-	VFS_ASSERT_GIANT(vp->v_mount);
-	error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
-	if (error)
-		return (error);
-	VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
-
-#ifdef MAC
-	error = mac_check_vnode_deleteextattr(td->td_ucred, vp, attrnamespace,
-	    attrname);
-	if (error)
-		goto done;
-#endif
-
-	error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, td->td_ucred,
-	    td);
-	if (error == EOPNOTSUPP)
-		error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
-		    td->td_ucred, td);
-#ifdef MAC
-done:
-#endif
-	VOP_UNLOCK(vp, 0, td);
-	vn_finished_write(mp);
-	return (error);
-}
-
-int
-extattr_delete_fd(td, uap)
-	struct thread *td;
-	struct extattr_delete_fd_args /* {
-		int fd;
-		int attrnamespace;
-		const char *attrname;
-	} */ *uap;
-{
-	struct file *fp;
-	char attrname[EXTATTR_MAXNAMELEN];
-	int vfslocked, error;
-
-	error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
-	if (error)
-		return (error);
-
-	error = getvnode(td->td_proc->p_fd, uap->fd, &fp);
-	if (error)
-		return (error);
-
-	vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
-	error = extattr_delete_vp(fp->f_vnode, uap->attrnamespace,
-	    attrname, td);
-	fdrop(fp, td);
-	VFS_UNLOCK_GIANT(vfslocked);
-	return (error);
-}
-
-int
-extattr_delete_file(td, uap)
-	struct thread *td;
-	struct extattr_delete_file_args /* {
-		const char *path;
-		int attrnamespace;
-		const char *attrname;
-	} */ *uap;
-{
-	struct nameidata nd;
-	char attrname[EXTATTR_MAXNAMELEN];
-	int vfslocked, error;
-
-	error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
-	if (error)
-		return(error);
-
-	NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW, UIO_USERSPACE, uap->path, td);
-	error = namei(&nd);
-	if (error)
-		return(error);
-	NDFREE(&nd, NDF_ONLY_PNBUF);
-
-	vfslocked = NDHASGIANT(&nd);
-	error = extattr_delete_vp(nd.ni_vp, uap->attrnamespace, attrname, td);
-	vrele(nd.ni_vp);
-	VFS_UNLOCK_GIANT(vfslocked);
-	return(error);
-}
-
-int
-extattr_delete_link(td, uap)
-	struct thread *td;
-	struct extattr_delete_link_args /* {
-		const char *path;
-		int attrnamespace;
-		const char *attrname;
-	} */ *uap;
-{
-	struct nameidata nd;
-	char attrname[EXTATTR_MAXNAMELEN];
-	int vfslocked, error;
-
-	error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
-	if (error)
-		return(error);
-
-	NDINIT(&nd, LOOKUP, MPSAFE | NOFOLLOW, UIO_USERSPACE, uap->path, td);
-	error = namei(&nd);
-	if (error)
-		return(error);
-	NDFREE(&nd, NDF_ONLY_PNBUF);
-
-	vfslocked = NDHASGIANT(&nd);
-	error = extattr_delete_vp(nd.ni_vp, uap->attrnamespace, attrname, td);
-	vrele(nd.ni_vp);
-	VFS_UNLOCK_GIANT(vfslocked);
-	return(error);
-}
-
-/*-
- * Retrieve a list of extended attributes on a file or directory.
- *
- * Arguments: unlocked vnode "vp", attribute namespace 'attrnamespace",
- *            userspace buffer pointer "data", buffer length "nbytes",
- *            thread "td".
- * Returns: 0 on success, an error number otherwise
- * Locks: none
- * References: vp must be a valid reference for the duration of the call
- */
-static int
-extattr_list_vp(struct vnode *vp, int attrnamespace, void *data,
-    size_t nbytes, struct thread *td)
-{
-	struct uio auio, *auiop;
-	size_t size, *sizep;
-	struct iovec aiov;
-	ssize_t cnt;
-	int error;
-
-	VFS_ASSERT_GIANT(vp->v_mount);
-	VOP_LEASE(vp, td, td->td_ucred, LEASE_READ);
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
-
-	auiop = NULL;
-	sizep = NULL;
-	cnt = 0;
-	if (data != NULL) {
-		aiov.iov_base = data;
-		aiov.iov_len = nbytes;
-		auio.uio_iov = &aiov;
-		auio.uio_iovcnt = 1;
-		auio.uio_offset = 0;
-		if (nbytes > INT_MAX) {
-			error = EINVAL;
-			goto done;
-		}
-		auio.uio_resid = nbytes;
-		auio.uio_rw = UIO_READ;
-		auio.uio_segflg = UIO_USERSPACE;
-		auio.uio_td = td;
-		auiop = &auio;
-		cnt = nbytes;
-	} else
-		sizep = &size;
-
-#ifdef MAC
-	error = mac_check_vnode_listextattr(td->td_ucred, vp, attrnamespace);
-	if (error)
-		goto done;
-#endif
-
-	error = VOP_LISTEXTATTR(vp, attrnamespace, auiop, sizep,
-	    td->td_ucred, td);
-
-	if (auiop != NULL) {
-		cnt -= auio.uio_resid;
-		td->td_retval[0] = cnt;
-	} else
-		td->td_retval[0] = size;
-
-done:
-	VOP_UNLOCK(vp, 0, td);
-	return (error);
-}
-
-
-int
-extattr_list_fd(td, uap)
-	struct thread *td;
-	struct extattr_list_fd_args /* {
-		int fd;
-		int attrnamespace;
-		void *data;
-		size_t nbytes;
-	} */ *uap;
-{
-	struct file *fp;
-	int vfslocked, error;
-
-	error = getvnode(td->td_proc->p_fd, uap->fd, &fp);
-	if (error)
-		return (error);
-
-	vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
-	error = extattr_list_vp(fp->f_vnode, uap->attrnamespace, uap->data,
-	    uap->nbytes, td);
-
-	fdrop(fp, td);
-	VFS_UNLOCK_GIANT(vfslocked);
-	return (error);
-}
-
-int
-extattr_list_file(td, uap)
-	struct thread*td;
-	struct extattr_list_file_args /* {
-		const char *path;
-		int attrnamespace;
-		void *data;
-		size_t nbytes;
-	} */ *uap;
-{
-	struct nameidata nd;
-	int vfslocked, error;
-
-	NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW, UIO_USERSPACE, uap->path, td);
-	error = namei(&nd);
-	if (error)
-		return (error);
-	NDFREE(&nd, NDF_ONLY_PNBUF);
-
-	vfslocked = NDHASGIANT(&nd);
-	error = extattr_list_vp(nd.ni_vp, uap->attrnamespace, uap->data,
-	    uap->nbytes, td);
-
-	vrele(nd.ni_vp);
-	VFS_UNLOCK_GIANT(vfslocked);
-	return (error);
-}
-
-int
-extattr_list_link(td, uap)
-	struct thread*td;
-	struct extattr_list_link_args /* {
-		const char *path;
-		int attrnamespace;
-		void *data;
-		size_t nbytes;
-	} */ *uap;
-{
-	struct nameidata nd;
-	int vfslocked, error;
-
-	NDINIT(&nd, LOOKUP, MPSAFE | NOFOLLOW, UIO_USERSPACE, uap->path, td);
-	error = namei(&nd);
-	if (error)
-		return (error);
-	NDFREE(&nd, NDF_ONLY_PNBUF);
-
-	vfslocked = NDHASGIANT(&nd);
-	error = extattr_list_vp(nd.ni_vp, uap->attrnamespace, uap->data,
-	    uap->nbytes, td);
-
-	vrele(nd.ni_vp);
+	vfs_rel(mp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
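
The vfs_syscalls.c hunks above all follow one pattern: the unconditional
mtx_lock(&Giant)/mtx_unlock(&Giant) pairs are replaced with per-mount
VFS_LOCK_GIANT()/VFS_UNLOCK_GIANT() brackets, the mount reference taken by
vfs_getvfs() is released on a common exit path with vfs_rel(), and the old
suser()/suser_cred() checks become named priv_check() privileges.  A minimal
sketch of that shape, using a hypothetical fhstat-like helper that is
illustrative only and not part of the patch (kernel context; relies on
sys/mount.h, sys/priv.h and sys/vnode.h):

	static int
	example_fhstat(struct thread *td, fhandle_t *fhp, struct stat *sbp)
	{
		struct mount *mp;
		struct vnode *vp;
		int vfslocked, error;

		/* Named privilege instead of a bare suser() check. */
		error = priv_check(td, PRIV_VFS_FHSTAT);
		if (error)
			return (error);
		/* vfs_getvfs() returns a referenced mount point or NULL. */
		if ((mp = vfs_getvfs(&fhp->fh_fsid)) == NULL)
			return (ESTALE);
		/* Giant is acquired only if this filesystem still needs it. */
		vfslocked = VFS_LOCK_GIANT(mp);
		error = VFS_FHTOVP(mp, &fhp->fh_fid, &vp);
		if (error == 0) {
			error = vn_stat(vp, sbp, td->td_ucred, NOCRED, td);
			vput(vp);
		}
		/* Common exit path: drop the mount ref, then Giant if held. */
		vfs_rel(mp);
		VFS_UNLOCK_GIANT(vfslocked);
		return (error);
	}
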
Index: sys_pipe.c
===================================================================
RCS file: /home/cvs/src/sys/kern/sys_pipe.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/sys_pipe.c -L sys/kern/sys_pipe.c -u -r1.2 -r1.3
--- sys/kern/sys_pipe.c
+++ sys/kern/sys_pipe.c
@@ -89,7 +89,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/sys_pipe.c,v 1.184.2.2 2006/01/31 15:44:51 glebius Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/sys_pipe.c,v 1.191.2.1 2007/11/25 11:11:28 dumbbell Exp $");
 
 #include "opt_mac.h"
 
@@ -101,7 +101,6 @@
 #include <sys/filio.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
-#include <sys/mac.h>
 #include <sys/mutex.h>
 #include <sys/ttycom.h>
 #include <sys/stat.h>
@@ -117,6 +116,8 @@
 #include <sys/uio.h>
 #include <sys/event.h>
 
+#include <security/mac/mac_framework.h>
+
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_object.h>
@@ -174,19 +175,14 @@
 #define MINPIPESIZE (PIPE_SIZE/3)
 #define MAXPIPESIZE (2*PIPE_SIZE/3)
 
-static int amountpipes;
 static int amountpipekva;
 static int pipefragretry;
 static int pipeallocfail;
 static int piperesizefail;
 static int piperesizeallowed = 1;
 
-SYSCTL_DECL(_kern_ipc);
-
 SYSCTL_INT(_kern_ipc, OID_AUTO, maxpipekva, CTLFLAG_RDTUN,
 	   &maxpipekva, 0, "Pipe KVA limit");
-SYSCTL_INT(_kern_ipc, OID_AUTO, pipes, CTLFLAG_RD,
-	   &amountpipes, 0, "Current # of pipes");
 SYSCTL_INT(_kern_ipc, OID_AUTO, pipekva, CTLFLAG_RD,
 	   &amountpipekva, 0, "Pipe KVA usage");
 SYSCTL_INT(_kern_ipc, OID_AUTO, pipefragretry, CTLFLAG_RD,
@@ -215,7 +211,6 @@
 static int pipespace_new(struct pipe *cpipe, int size);
 
 static int	pipe_zone_ctor(void *mem, int size, void *arg, int flags);
-static void	pipe_zone_dtor(void *mem, int size, void *arg);
 static int	pipe_zone_init(void *mem, int size, int flags);
 static void	pipe_zone_fini(void *mem, int size);
 
@@ -227,8 +222,8 @@
 pipeinit(void *dummy __unused)
 {
 
-	pipe_zone = uma_zcreate("PIPE", sizeof(struct pipepair),
-	    pipe_zone_ctor, pipe_zone_dtor, pipe_zone_init, pipe_zone_fini,
+	pipe_zone = uma_zcreate("pipe", sizeof(struct pipepair),
+	    pipe_zone_ctor, NULL, pipe_zone_init, pipe_zone_fini,
 	    UMA_ALIGN_PTR, 0);
 	KASSERT(pipe_zone != NULL, ("pipe_zone not initialized"));
 }
@@ -278,22 +273,9 @@
 	 */
 	pp->pp_label = NULL;
 
-	atomic_add_int(&amountpipes, 2);
 	return (0);
 }
 
-static void
-pipe_zone_dtor(void *mem, int size, void *arg)
-{
-	struct pipepair *pp;
-
-	KASSERT(size == sizeof(*pp), ("pipe_zone_dtor: wrong size"));
-
-	pp = (struct pipepair *)mem;
-
-	atomic_subtract_int(&amountpipes, 2);
-}
-
 static int
 pipe_zone_init(void *mem, int size, int flags)
 {
@@ -320,10 +302,9 @@
 }
 
 /*
- * The pipe system call for the DTYPE_PIPE type of pipes.  If we fail,
- * let the zone pick up the pieces via pipeclose().
+ * The pipe system call for the DTYPE_PIPE type of pipes.  If we fail, let
+ * the zone pick up the pieces via pipeclose().
  */
-
 /* ARGSUSED */
 int
 pipe(td, uap)
@@ -897,9 +878,9 @@
 	while (wpipe->pipe_state & PIPE_DIRECTW) {
 		if (wpipe->pipe_state & PIPE_WANTR) {
 			wpipe->pipe_state &= ~PIPE_WANTR;
-			pipeselwakeup(wpipe);
 			wakeup(wpipe);
 		}
+		pipeselwakeup(wpipe);
 		wpipe->pipe_state |= PIPE_WANTW;
 		pipeunlock(wpipe);
 		error = msleep(wpipe, PIPE_MTX(wpipe),
@@ -913,9 +894,9 @@
 	if (wpipe->pipe_buffer.cnt > 0) {
 		if (wpipe->pipe_state & PIPE_WANTR) {
 			wpipe->pipe_state &= ~PIPE_WANTR;
-			pipeselwakeup(wpipe);
 			wakeup(wpipe);
 		}
+		pipeselwakeup(wpipe);
 		wpipe->pipe_state |= PIPE_WANTW;
 		pipeunlock(wpipe);
 		error = msleep(wpipe, PIPE_MTX(wpipe),
@@ -1077,8 +1058,9 @@
 		 * The direct write mechanism will detect the reader going
 		 * away on us.
 		 */
-		if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) &&
-		    (wpipe->pipe_buffer.size >= PIPE_MINDIRECT) &&
+		if (uio->uio_segflg == UIO_USERSPACE &&
+		    uio->uio_iov->iov_len >= PIPE_MINDIRECT &&
+		    wpipe->pipe_buffer.size >= PIPE_MINDIRECT &&
 		    (fp->f_flag & FNONBLOCK) == 0) {
 			pipeunlock(wpipe);
 			error = pipe_direct_write(wpipe, uio);
@@ -1098,9 +1080,10 @@
 		if (wpipe->pipe_state & PIPE_DIRECTW) {
 			if (wpipe->pipe_state & PIPE_WANTR) {
 				wpipe->pipe_state &= ~PIPE_WANTR;
-				pipeselwakeup(wpipe);
 				wakeup(wpipe);
 			}
+			pipeselwakeup(wpipe);
+			wpipe->pipe_state |= PIPE_WANTW;
 			pipeunlock(wpipe);
 			error = msleep(wpipe, PIPE_MTX(rpipe), PRIBIO | PCATCH,
 			    "pipbww", 0);
Index: sysv_shm.c
===================================================================
RCS file: /home/cvs/src/sys/kern/sysv_shm.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/sysv_shm.c -L sys/kern/sysv_shm.c -u -r1.1.1.1 -r1.2
--- sys/kern/sysv_shm.c
+++ sys/kern/sysv_shm.c
@@ -60,7 +60,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/sysv_shm.c,v 1.102 2005/05/12 20:04:48 jhb Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/sysv_shm.c,v 1.111 2007/03/05 13:10:57 rwatson Exp $");
 
 #include "opt_compat.h"
 #include "opt_sysvipc.h"
@@ -84,7 +84,8 @@
 #include <sys/sysent.h>
 #include <sys/sysproto.h>
 #include <sys/jail.h>
-#include <sys/mac.h>
+
+#include <security/mac/mac_framework.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
@@ -94,28 +95,26 @@
 #include <vm/vm_page.h>
 #include <vm/vm_pager.h>
 
-#ifdef MAC_DEBUG
-#define MPRINTF(a)      printf a
-#else
-#define MPRINTF(a)	
-#endif
-
 static MALLOC_DEFINE(M_SHM, "shm", "SVID compatible shared memory segments");
 
+#if defined(__i386__) && (defined(COMPAT_FREEBSD4) || defined(COMPAT_43))
 struct oshmctl_args;
 static int oshmctl(struct thread *td, struct oshmctl_args *uap);
+#endif
 
 static int shmget_allocate_segment(struct thread *td,
     struct shmget_args *uap, int mode);
 static int shmget_existing(struct thread *td, struct shmget_args *uap,
     int mode, int segnum);
 
+#if defined(__i386__) && (defined(COMPAT_FREEBSD4) || defined(COMPAT_43))
 /* XXX casting to (sy_call_t *) is bogus, as usual. */
 static sy_call_t *shmcalls[] = {
 	(sy_call_t *)shmat, (sy_call_t *)oshmctl,
 	(sy_call_t *)shmdt, (sy_call_t *)shmget,
 	(sy_call_t *)shmctl
 };
+#endif
 
 #define	SHMSEG_FREE     	0x0200
 #define	SHMSEG_REMOVED  	0x0400
@@ -176,16 +175,15 @@
 static int shm_use_phys;
 static int shm_allow_removed;
 
-SYSCTL_DECL(_kern_ipc);
-SYSCTL_INT(_kern_ipc, OID_AUTO, shmmax, CTLFLAG_RW, &shminfo.shmmax, 0,
+SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmmax, CTLFLAG_RW, &shminfo.shmmax, 0,
     "Maximum shared memory segment size");
-SYSCTL_INT(_kern_ipc, OID_AUTO, shmmin, CTLFLAG_RW, &shminfo.shmmin, 0,
+SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmmin, CTLFLAG_RW, &shminfo.shmmin, 0,
     "Minimum shared memory segment size");
-SYSCTL_INT(_kern_ipc, OID_AUTO, shmmni, CTLFLAG_RDTUN, &shminfo.shmmni, 0,
+SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmmni, CTLFLAG_RDTUN, &shminfo.shmmni, 0,
     "Number of shared memory identifiers");
-SYSCTL_INT(_kern_ipc, OID_AUTO, shmseg, CTLFLAG_RDTUN, &shminfo.shmseg, 0,
+SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmseg, CTLFLAG_RDTUN, &shminfo.shmseg, 0,
     "Number of segments per process");
-SYSCTL_INT(_kern_ipc, OID_AUTO, shmall, CTLFLAG_RW, &shminfo.shmall, 0,
+SYSCTL_ULONG(_kern_ipc, OID_AUTO, shmall, CTLFLAG_RW, &shminfo.shmall, 0,
     "Maximum number of pages available for shared memory");
 SYSCTL_INT(_kern_ipc, OID_AUTO, shm_use_phys, CTLFLAG_RW,
     &shm_use_phys, 0, "Enable/Disable locking of shared memory pages in core");
@@ -291,10 +289,6 @@
 	const void *shmaddr;
 };
 #endif
-
-/*
- * MPSAFE
- */
 int
 shmdt(td, uap)
 	struct thread *td;
@@ -329,10 +323,8 @@
 #ifdef MAC
 	shmsegptr = &shmsegs[IPCID_TO_IX(shmmap_s->shmid)];
 	error = mac_check_sysv_shmdt(td->td_ucred, shmsegptr);
-	if (error != 0) {
-		MPRINTF(("mac_check_sysv_shmdt returned %d\n", error));
+	if (error != 0)
 		goto done2;
-	}
 #endif
 	error = shm_delete_mapping(p->p_vmspace, shmmap_s);
 done2:
@@ -347,10 +339,6 @@
 	int shmflg;
 };
 #endif
-
-/*
- * MPSAFE
- */
 int
 kern_shmat(td, shmid, shmaddr, shmflg)
 	struct thread *td;
@@ -390,10 +378,8 @@
 		goto done2;
 #ifdef MAC
 	error = mac_check_sysv_shmat(td->td_ucred, shmseg, shmflg);
-	if (error != 0) {
-	 	MPRINTF(("mac_check_sysv_shmat returned %d\n", error));
+	if (error != 0)
 		goto done2;
-	}
 #endif
 	for (i = 0; i < shminfo.shmseg; i++) {
 		if (shmmap_s->shmid == -1)
@@ -464,6 +450,7 @@
 	return kern_shmat(td, uap->shmid, uap->shmaddr, uap->shmflg);
 }
 
+#if defined(__i386__) && (defined(COMPAT_FREEBSD4) || defined(COMPAT_43))
 struct oshmid_ds {
 	struct	ipc_perm shm_perm;	/* operation perms */
 	int	shm_segsz;		/* size of segment (bytes) */
@@ -481,10 +468,6 @@
 	int cmd;
 	struct oshmid_ds *ubuf;
 };
-
-/*
- * MPSAFE
- */
 static int
 oshmctl(td, uap)
 	struct thread *td;
@@ -510,11 +493,8 @@
 			goto done2;
 #ifdef MAC
 		error = mac_check_sysv_shmctl(td->td_ucred, shmseg, uap->cmd);
-		if (error != 0) {
-			MPRINTF(("mac_check_sysv_shmctl returned %d\n",
-			    error));
+		if (error != 0)
 			goto done2;
-		}
 #endif
 		outbuf.shm_perm = shmseg->u.shm_perm;
 		outbuf.shm_segsz = shmseg->u.shm_segsz;
@@ -540,6 +520,7 @@
 	return (EINVAL);
 #endif
 }
+#endif
 
 #ifndef _SYS_SYSPROTO_H_
 struct shmctl_args {
@@ -548,10 +529,6 @@
 	struct shmid_ds *buf;
 };
 #endif
-
-/*
- * MPSAFE
- */
 int
 kern_shmctl(td, shmid, cmd, buf, bufsz)
 	struct thread *td;
@@ -599,10 +576,8 @@
 	}
 #ifdef MAC
 	error = mac_check_sysv_shmctl(td->td_ucred, shmseg, cmd);
-	if (error != 0) {
-		MPRINTF(("mac_check_sysv_shmctl returned %d\n", error));
+	if (error != 0)
 		goto done2;
-	}
 #endif
 	switch (cmd) {
 	case SHM_STAT:
@@ -700,7 +675,6 @@
 	int shmflg;
 };
 #endif
-
 static int
 shmget_existing(td, uap, mode, segnum)
 	struct thread *td;
@@ -726,14 +700,11 @@
 	}
 	if ((uap->shmflg & (IPC_CREAT | IPC_EXCL)) == (IPC_CREAT | IPC_EXCL))
 		return (EEXIST);
-	error = ipcperm(td, &shmseg->u.shm_perm, mode);
 #ifdef MAC
 	error = mac_check_sysv_shmget(td->td_ucred, shmseg, uap->shmflg);
 	if (error != 0)
-		MPRINTF(("mac_check_sysv_shmget returned %d\n", error));
-#endif
-	if (error)
 		return (error);
+#endif
 	if (uap->size && uap->size > shmseg->u.shm_segsz)
 		return (EINVAL);
 	td->td_retval[0] = IXSEQ_TO_IPCID(segnum, shmseg->u.shm_perm);
@@ -825,9 +796,6 @@
 	return (0);
 }
 
-/*
- * MPSAFE
- */
 int
 shmget(td, uap)
 	struct thread *td;
@@ -860,9 +828,6 @@
 	return (error);
 }
 
-/*
- * MPSAFE
- */
 int
 shmsys(td, uap)
 	struct thread *td;
@@ -874,6 +839,7 @@
 		int	a4;
 	} */ *uap;
 {
+#if defined(__i386__) && (defined(COMPAT_FREEBSD4) || defined(COMPAT_43))
 	int error;
 
 	if (!jail_sysvipc_allowed && jailed(td->td_ucred))
@@ -885,6 +851,9 @@
 	error = (*shmcalls[uap->which])(td, &uap->a2);
 	mtx_unlock(&Giant);
 	return (error);
+#else
+	return (nosys(td, NULL));
+#endif
 }
 
 static void
@@ -955,15 +924,15 @@
 {
 	int i;
 
-	TUNABLE_INT_FETCH("kern.ipc.shmmaxpgs", &shminfo.shmall);
+	TUNABLE_ULONG_FETCH("kern.ipc.shmmaxpgs", &shminfo.shmall);
 	for (i = PAGE_SIZE; i > 0; i--) {
 		shminfo.shmmax = shminfo.shmall * i;
 		if (shminfo.shmmax >= shminfo.shmall)
 			break;
 	}
-	TUNABLE_INT_FETCH("kern.ipc.shmmin", &shminfo.shmmin);
-	TUNABLE_INT_FETCH("kern.ipc.shmmni", &shminfo.shmmni);
-	TUNABLE_INT_FETCH("kern.ipc.shmseg", &shminfo.shmseg);
+	TUNABLE_ULONG_FETCH("kern.ipc.shmmin", &shminfo.shmmin);
+	TUNABLE_ULONG_FETCH("kern.ipc.shmmni", &shminfo.shmmni);
+	TUNABLE_ULONG_FETCH("kern.ipc.shmseg", &shminfo.shmseg);
 	TUNABLE_INT_FETCH("kern.ipc.shm_use_phys", &shm_use_phys);
 
 	shmalloced = shminfo.shmmni;
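
With this change the shminfo limits are exported with SYSCTL_ULONG and fetched with TUNABLE_ULONG_FETCH, matching their u_long type, and the old shmsys()/oshmctl() compatibility path is compiled only for i386 with COMPAT_43 or COMPAT_FREEBSD4. As a reminder of what kern.ipc.shmmax and friends govern, here is a small userland program that creates, maps and removes one System V segment; asking for more than shmmax makes shmget(2) fail (typically with EINVAL). The 1 MiB size is only an example.

#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/shm.h>

#include <errno.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	size_t len = 1UL << 20;		/* 1 MiB; assumed to be under shmmax */
	int id;
	void *p;

	/* Create a private segment; the size is checked against shmmax. */
	id = shmget(IPC_PRIVATE, len, IPC_CREAT | 0600);
	if (id == -1) {
		fprintf(stderr, "shmget: %s\n", strerror(errno));
		return (1);
	}

	/* Map it, touch it, then detach and remove it. */
	p = shmat(id, NULL, 0);
	if (p == (void *)-1) {
		fprintf(stderr, "shmat: %s\n", strerror(errno));
	} else {
		memset(p, 0, len);
		shmdt(p);
	}
	shmctl(id, IPC_RMID, NULL);
	return (0);
}
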
Index: kern_lock.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_lock.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/kern_lock.c -L sys/kern/kern_lock.c -u -r1.2 -r1.3
--- sys/kern/kern_lock.c
+++ sys/kern/kern_lock.c
@@ -41,7 +41,10 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_lock.c,v 1.89.2.3 2006/03/13 03:05:50 jeff Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_lock.c,v 1.110 2007/05/18 15:04:59 jhb Exp $");
+
+#include "opt_ddb.h"
+#include "opt_global.h"
 
 #include <sys/param.h>
 #include <sys/kdb.h>
@@ -52,20 +55,52 @@
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/systm.h>
+#include <sys/lock_profile.h>
 #ifdef DEBUG_LOCKS
 #include <sys/stack.h>
 #endif
 
+#ifdef DDB
+#include <ddb/ddb.h>
+static void	db_show_lockmgr(struct lock_object *lock);
+#endif
+static void	lock_lockmgr(struct lock_object *lock, int how);
+static int	unlock_lockmgr(struct lock_object *lock);
+
+struct lock_class lock_class_lockmgr = {
+	.lc_name = "lockmgr",
+	.lc_flags = LC_SLEEPLOCK | LC_SLEEPABLE | LC_RECURSABLE | LC_UPGRADABLE,
+#ifdef DDB
+	.lc_ddb_show = db_show_lockmgr,
+#endif
+	.lc_lock = lock_lockmgr,
+	.lc_unlock = unlock_lockmgr,
+};
+
 /*
  * Locking primitives implementation.
  * Locks provide shared/exclusive sychronization.
  */
 
+void
+lock_lockmgr(struct lock_object *lock, int how)
+{
+
+	panic("lockmgr locks do not support sleep interlocking");
+}
+
+int
+unlock_lockmgr(struct lock_object *lock)
+{
+
+	panic("lockmgr locks do not support sleep interlocking");
+}
+
 #define	COUNT(td, x)	if ((td)) (td)->td_locks += (x)
 #define LK_ALL (LK_HAVE_EXCL | LK_WANT_EXCL | LK_WANT_UPGRADE | \
 	LK_SHARE_NONZERO | LK_WAIT_NONZERO)
 
-static int acquire(struct lock **lkpp, int extflags, int wanted);
+static int acquire(struct lock **lkpp, int extflags, int wanted, int *contested, uint64_t *waittime);
 static int acquiredrain(struct lock *lkp, int extflags) ;
 
 static __inline void
@@ -93,7 +128,7 @@
 }
 
 static int
-acquire(struct lock **lkpp, int extflags, int wanted)
+acquire(struct lock **lkpp, int extflags, int wanted, int *contested, uint64_t *waittime)
 {
 	struct lock *lkp = *lkpp;
 	int error;
@@ -104,6 +139,9 @@
 	if ((extflags & LK_NOWAIT) && (lkp->lk_flags & wanted))
 		return EBUSY;
 	error = 0;
+	if ((lkp->lk_flags & wanted) != 0)
+		lock_profile_obtain_lock_failed(&lkp->lk_object, contested, waittime);
+	
 	while ((lkp->lk_flags & wanted) != 0) {
 		CTR2(KTR_LOCK,
 		    "acquire(): lkp == %p, lk_flags == 0x%x sleeping",
@@ -142,16 +180,16 @@
  * accepted shared locks and shared-to-exclusive upgrades to go away.
  */
 int
-lockmgr(lkp, flags, interlkp, td)
-	struct lock *lkp;
-	u_int flags;
-	struct mtx *interlkp;
-	struct thread *td;
+_lockmgr(struct lock *lkp, u_int flags, struct mtx *interlkp, 
+	 struct thread *td, char *file, int line)
+
 {
 	int error;
 	struct thread *thr;
 	int extflags, lockflags;
-
+	int contested = 0;
+	uint64_t waitstart = 0;
+	
 	error = 0;
 	if (td == NULL)
 		thr = LK_KERNPROC;
@@ -179,7 +217,7 @@
 
 	if ((flags & (LK_NOWAIT|LK_RELEASE)) == 0)
 		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK,
-		    &lkp->lk_interlock->mtx_object,
+		    &lkp->lk_interlock->lock_object,
 		    "Acquiring lockmgr lock \"%s\"", lkp->lk_wmesg);
 
 	if (panicstr != NULL) {
@@ -209,10 +247,13 @@
 			lockflags = LK_HAVE_EXCL;
 			if (td != NULL && !(td->td_pflags & TDP_DEADLKTREAT))
 				lockflags |= LK_WANT_EXCL | LK_WANT_UPGRADE;
-			error = acquire(&lkp, extflags, lockflags);
+			error = acquire(&lkp, extflags, lockflags, &contested, &waitstart);
 			if (error)
 				break;
 			sharelock(td, lkp, 1);
+			if (lkp->lk_sharecount == 1)
+				lock_profile_obtain_lock_success(&lkp->lk_object, contested, waitstart, file, line);
+
 #if defined(DEBUG_LOCKS)
 			stack_save(&lkp->lk_stack);
 #endif
@@ -223,6 +264,8 @@
 		 * An alternative would be to fail with EDEADLK.
 		 */
 		sharelock(td, lkp, 1);
+		if (lkp->lk_sharecount == 1)
+			lock_profile_obtain_lock_success(&lkp->lk_object, contested, waitstart, file, line);
 		/* FALLTHROUGH downgrade */
 
 	case LK_DOWNGRADE:
@@ -266,6 +309,8 @@
 		if (lkp->lk_sharecount <= 0)
 			panic("lockmgr: upgrade without shared");
 		shareunlock(td, lkp, 1);
+		if (lkp->lk_sharecount == 0)
+			lock_profile_release_lock(&lkp->lk_object);
 		/*
 		 * If we are just polling, check to see if we will block.
 		 */
@@ -282,7 +327,7 @@
 			 * drop to zero, then take exclusive lock.
 			 */
 			lkp->lk_flags |= LK_WANT_UPGRADE;
-			error = acquire(&lkp, extflags, LK_SHARE_NONZERO);
+			error = acquire(&lkp, extflags, LK_SHARE_NONZERO, &contested, &waitstart);
 			lkp->lk_flags &= ~LK_WANT_UPGRADE;
 
 			if (error) {
@@ -296,6 +341,7 @@
 			lkp->lk_lockholder = thr;
 			lkp->lk_exclusivecount = 1;
 			COUNT(td, 1);
+			lock_profile_obtain_lock_success(&lkp->lk_object, contested, waitstart, file, line);
 #if defined(DEBUG_LOCKS)
 			stack_save(&lkp->lk_stack);
 #endif
@@ -335,14 +381,14 @@
 		/*
 		 * Try to acquire the want_exclusive flag.
 		 */
-		error = acquire(&lkp, extflags, (LK_HAVE_EXCL | LK_WANT_EXCL));
+		error = acquire(&lkp, extflags, (LK_HAVE_EXCL | LK_WANT_EXCL), &contested, &waitstart);
 		if (error)
 			break;
 		lkp->lk_flags |= LK_WANT_EXCL;
 		/*
 		 * Wait for shared locks and upgrades to finish.
 		 */
-		error = acquire(&lkp, extflags, LK_HAVE_EXCL | LK_WANT_UPGRADE | LK_SHARE_NONZERO);
+		error = acquire(&lkp, extflags, LK_HAVE_EXCL | LK_WANT_UPGRADE | LK_SHARE_NONZERO, &contested, &waitstart);
 		lkp->lk_flags &= ~LK_WANT_EXCL;
 		if (error) {
 			if (lkp->lk_flags & LK_WAIT_NONZERO)		
@@ -355,6 +401,7 @@
 			panic("lockmgr: non-zero exclusive count");
 		lkp->lk_exclusivecount = 1;
 		COUNT(td, 1);
+		lock_profile_obtain_lock_success(&lkp->lk_object, contested, waitstart, file, line);
 #if defined(DEBUG_LOCKS)
 		stack_save(&lkp->lk_stack);
 #endif
@@ -374,11 +421,18 @@
 				lkp->lk_flags &= ~LK_HAVE_EXCL;
 				lkp->lk_lockholder = LK_NOPROC;
 				lkp->lk_exclusivecount = 0;
+				lock_profile_release_lock(&lkp->lk_object);
 			} else {
 				lkp->lk_exclusivecount--;
 			}
 		} else if (lkp->lk_flags & LK_SHARE_NONZERO)
 			shareunlock(td, lkp, 1);
+		else  {
+			printf("lockmgr: thread %p unlocking unheld lock\n",
+			    thr);
+			kdb_backtrace();
+		}
+
 		if (lkp->lk_flags & LK_WAIT_NONZERO)
 			wakeup((void *)lkp);
 		break;
@@ -490,13 +544,14 @@
 	lkp->lk_waitcount = 0;
 	lkp->lk_exclusivecount = 0;
 	lkp->lk_prio = prio;
-	lkp->lk_wmesg = wmesg;
 	lkp->lk_timo = timo;
 	lkp->lk_lockholder = LK_NOPROC;
 	lkp->lk_newlock = NULL;
 #ifdef DEBUG_LOCKS
 	stack_zero(&lkp->lk_stack);
 #endif
+	lock_init(&lkp->lk_object, &lock_class_lockmgr, wmesg, NULL,
+	    LO_RECURSABLE | LO_SLEEPABLE | LO_UPGRADABLE);
 }
 
 /*
@@ -506,8 +561,10 @@
 lockdestroy(lkp)
 	struct lock *lkp;
 {
+
 	CTR2(KTR_LOCK, "lockdestroy(): lkp == %p (lk_wmesg == \"%s\")",
 	    lkp, lkp->lk_wmesg);
+	lock_destroy(&lkp->lk_object);
 }
 
 /*
@@ -554,6 +611,21 @@
 }
 
 /*
+ * Determine the number of waiters on a lock.
+ */
+int
+lockwaiters(lkp)
+	struct lock *lkp;
+{
+	int count;
+
+	mtx_lock(lkp->lk_interlock);
+	count = lkp->lk_waitcount;
+	mtx_unlock(lkp->lk_interlock);
+	return (count);
+}
+
+/*
  * Print out information about state of a lock. Used by VOP_PRINT
  * routines to display status about contained locks.
  */
@@ -575,3 +647,71 @@
 	stack_print(&lkp->lk_stack);
 #endif
 }
+
+#ifdef DDB
+/*
+ * Check to see if a thread that is blocked on a sleep queue is actually
+ * blocked on a 'struct lock'.  If so, output some details and return true.
+ * If the lock has an exclusive owner, return that in *ownerp.
+ */
+int
+lockmgr_chain(struct thread *td, struct thread **ownerp)
+{
+	struct lock *lkp;
+
+	lkp = td->td_wchan;
+
+	/* Simple test to see if wchan points to a lockmgr lock. */
+	if (LOCK_CLASS(&lkp->lk_object) == &lock_class_lockmgr &&
+	    lkp->lk_wmesg == td->td_wmesg)
+		goto ok;
+
+	/*
+	 * If this thread is doing a DRAIN, then it would be asleep on
+	 * &lkp->lk_flags rather than lkp.
+	 */
+	lkp = (struct lock *)((char *)td->td_wchan -
+	    offsetof(struct lock, lk_flags));
+	if (LOCK_CLASS(&lkp->lk_object) == &lock_class_lockmgr &&
+	    lkp->lk_wmesg == td->td_wmesg && (lkp->lk_flags & LK_WAITDRAIN))
+		goto ok;
+
+	/* Doesn't seem to be a lockmgr lock. */
+	return (0);
+
+ok:
+	/* Ok, we think we have a lockmgr lock, so output some details. */
+	db_printf("blocked on lk \"%s\" ", lkp->lk_wmesg);
+	if (lkp->lk_sharecount) {
+		db_printf("SHARED (count %d)\n", lkp->lk_sharecount);
+		*ownerp = NULL;
+	} else {
+		db_printf("EXCL (count %d)\n", lkp->lk_exclusivecount);
+		*ownerp = lkp->lk_lockholder;
+	}
+	return (1);
+}
+
+void
+db_show_lockmgr(struct lock_object *lock)
+{
+	struct thread *td;
+	struct lock *lkp;
+
+	lkp = (struct lock *)lock;
+
+	db_printf(" lock type: %s\n", lkp->lk_wmesg);
+	db_printf(" state: ");
+	if (lkp->lk_sharecount)
+		db_printf("SHARED (count %d)\n", lkp->lk_sharecount);
+	else if (lkp->lk_flags & LK_HAVE_EXCL) {
+		td = lkp->lk_lockholder;
+		db_printf("EXCL (count %d) %p ", lkp->lk_exclusivecount, td);
+		db_printf("(tid %d, pid %d, \"%s\")\n", td->td_tid,
+		    td->td_proc->p_pid, td->td_proc->p_comm);
+	} else
+		db_printf("UNLOCKED\n");
+	if (lkp->lk_waitcount > 0)
+		db_printf(" waiters: %d\n", lkp->lk_waitcount);
+}
+#endif
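
The new lockmgr_chain() helper above recovers a struct lock from a wait channel: a thread draining a lock sleeps on &lkp->lk_flags rather than on lkp itself, so the helper subtracts offsetof(struct lock, lk_flags) to get back to the containing structure. A tiny standalone illustration of that offsetof() trick, using a simplified stand-in for the kernel's struct lock:

#include <stddef.h>
#include <stdio.h>

struct lock_s {
	int	lk_flags;
	int	lk_sharecount;
	char	lk_wmesg[16];
};

int
main(void)
{
	struct lock_s lk = { 0, 2, "vnlock" };
	void *wchan = &lk.lk_flags;	/* what a draining thread sleeps on */

	/* Recover the containing structure from the member address. */
	struct lock_s *recovered =
	    (struct lock_s *)((char *)wchan - offsetof(struct lock_s, lk_flags));

	printf("wmesg=%s sharecount=%d\n",
	    recovered->lk_wmesg, recovered->lk_sharecount);
	return (0);
}
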
Index: kern_shutdown.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_shutdown.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/kern_shutdown.c -L sys/kern/kern_shutdown.c -u -r1.2 -r1.3
--- sys/kern/kern_shutdown.c
+++ sys/kern/kern_shutdown.c
@@ -35,7 +35,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_shutdown.c,v 1.174.2.3 2006/03/13 03:05:54 jeff Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_shutdown.c,v 1.182.4.1 2008/01/30 21:21:50 ru Exp $");
 
 #include "opt_kdb.h"
 #include "opt_mac.h"
@@ -53,9 +53,9 @@
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
-#include <sys/mac.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
+#include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/reboot.h>
 #include <sys/resourcevar.h>
@@ -68,6 +68,14 @@
 #include <machine/pcb.h>
 #include <machine/smp.h>
 
+#include <security/mac/mac_framework.h>
+
+#include <vm/vm.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_pager.h>
+#include <vm/swap_pager.h>
+
 #include <sys/signalvar.h>
 
 #ifndef PANIC_REBOOT_WAIT_TIME
@@ -142,9 +150,7 @@
 SYSINIT(shutdown_conf, SI_SUB_INTRINSIC, SI_ORDER_ANY, shutdown_conf, NULL)
 
 /*
- * The system call that results in a reboot
- *
- * MPSAFE
+ * The system call that results in a reboot.
  */
 /* ARGSUSED */
 int
@@ -157,7 +163,7 @@
 	error = mac_check_system_reboot(td->td_ucred, uap->opt);
 #endif
 	if (error == 0)
-		error = suser(td);
+		error = priv_check(td, PRIV_REBOOT);
 	if (error == 0) {
 		mtx_lock(&Giant);
 		boot(uap->opt);
@@ -261,9 +267,9 @@
 	 * systems don't shutdown properly (i.e., ACPI power off) if we
 	 * run on another processor.
 	 */
-	mtx_lock_spin(&sched_lock);
+	thread_lock(curthread);
 	sched_bind(curthread, 0);
-	mtx_unlock_spin(&sched_lock);
+	thread_unlock(curthread);
 	KASSERT(PCPU_GET(cpuid) == 0, ("boot: not running on cpu 0"));
 #endif
 	/* We're in the process of rebooting. */
@@ -334,9 +340,9 @@
 			 */
 			DROP_GIANT();
 			for (subiter = 0; subiter < 50 * iter; subiter++) {
-				mtx_lock_spin(&sched_lock);
+				thread_lock(curthread);
 				mi_switch(SW_VOL, NULL);
-				mtx_unlock_spin(&sched_lock);
+				thread_unlock(curthread);
 				DELAY(1000);
 			}
 			PICKUP_GIANT();
@@ -384,6 +390,7 @@
 			if (panicstr == 0)
 				vfs_unmountall();
 		}
+		swapoff_all();
 		DELAY(100000);		/* wait for console output to finish */
 	}
 
@@ -486,8 +493,6 @@
  * Panic is called on unresolvable fatal errors.  It prints "panic: mesg",
  * and then reboots.  If we are called twice, then we avoid trying to sync
  * the disks as this often leads to recursive panics.
- *
- * MPSAFE
  */
 void
 panic(const char *fmt, ...)
@@ -550,9 +555,9 @@
 	}
 #endif
 #endif
-	mtx_lock_spin(&sched_lock);
+	/*thread_lock(td); */
 	td->td_flags |= TDF_INPANIC;
-	mtx_unlock_spin(&sched_lock);
+	/* thread_unlock(td); */
 	if (!sync_on_panic)
 		bootopt |= RB_NOSYNC;
 	boot(bootopt);
@@ -626,6 +631,20 @@
 	return (0);
 }
 
+/* Call dumper with bounds checking. */
+int
+dump_write(struct dumperinfo *di, void *virtual, vm_offset_t physical,
+    off_t offset, size_t length)
+{
+
+	if (length != 0 && (offset < di->mediaoffset ||
+	    offset - di->mediaoffset + length > di->mediasize)) {
+		printf("Attempt to write outside dump device boundaries.\n");
+		return (ENXIO);
+	}
+	return (di->dumper(di->priv, virtual, physical, offset, length));
+}
+
 #if defined(__powerpc__)
 void
 dumpsys(struct dumperinfo *di __unused)
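
dump_write() above refuses writes that fall outside [mediaoffset, mediaoffset + mediasize). The same bounds check in standalone form, with a stand-in for struct dumperinfo and a few sample cases:

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

struct dumpinfo_s {
	int64_t	mediaoffset;	/* first byte of the dump area on the device */
	int64_t	mediasize;	/* size of the dump area in bytes */
};

/* Return 0 if [offset, offset + length) lies inside the dump area. */
static int
check_dump_bounds(const struct dumpinfo_s *di, int64_t offset, size_t length)
{
	if (length != 0 && (offset < di->mediaoffset ||
	    offset - di->mediaoffset + (int64_t)length > di->mediasize))
		return (-1);
	return (0);
}

int
main(void)
{
	struct dumpinfo_s di = { 4096, 1024 * 1024 };

	printf("inside:  %d\n", check_dump_bounds(&di, 4096, 512));
	printf("before:  %d\n", check_dump_bounds(&di, 0, 512));
	printf("overrun: %d\n", check_dump_bounds(&di, 4096 + 1024 * 1024 - 4, 8));
	return (0);
}
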
Index: subr_autoconf.c
===================================================================
RCS file: /home/cvs/src/sys/kern/subr_autoconf.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/subr_autoconf.c -L sys/kern/subr_autoconf.c -u -r1.1.1.1 -r1.2
--- sys/kern/subr_autoconf.c
+++ sys/kern/subr_autoconf.c
@@ -35,10 +35,12 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/subr_autoconf.c,v 1.22 2005/01/06 23:35:39 imp Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/subr_autoconf.c,v 1.23 2006/07/19 18:53:56 jhb Exp $");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
 #include <sys/systm.h>
 
 /*
@@ -50,26 +52,32 @@
  */
 static TAILQ_HEAD(, intr_config_hook) intr_config_hook_list =
 	TAILQ_HEAD_INITIALIZER(intr_config_hook_list);
-
+static struct mtx intr_config_hook_lock;
+MTX_SYSINIT(intr_config_hook, &intr_config_hook_lock, "intr config", MTX_DEF);
 
 /* ARGSUSED */
 static void run_interrupt_driven_config_hooks(void *dummy);
+
 static void
 run_interrupt_driven_config_hooks(dummy)
 	void *dummy;
 {
 	struct intr_config_hook *hook_entry, *next_entry;
 
-	for (hook_entry = TAILQ_FIRST(&intr_config_hook_list);
-	     hook_entry != NULL;
-	     hook_entry = next_entry) {
+	mtx_lock(&intr_config_hook_lock);
+	TAILQ_FOREACH_SAFE(hook_entry, &intr_config_hook_list, ich_links,
+	    next_entry) {
 		next_entry = TAILQ_NEXT(hook_entry, ich_links);
+		mtx_unlock(&intr_config_hook_lock);
 		(*hook_entry->ich_func)(hook_entry->ich_arg);
+		mtx_lock(&intr_config_hook_lock);
 	}
 
 	while (!TAILQ_EMPTY(&intr_config_hook_list)) {
-		tsleep(&intr_config_hook_list, PCONFIG, "conifhk", 0);
+		msleep(&intr_config_hook_list, &intr_config_hook_lock, PCONFIG,
+		    "conifhk", 0);
 	}
+	mtx_unlock(&intr_config_hook_lock);
 }
 SYSINIT(intr_config_hooks, SI_SUB_INT_CONFIG_HOOKS, SI_ORDER_FIRST,
 	run_interrupt_driven_config_hooks, NULL)
@@ -85,17 +93,18 @@
 {
 	struct intr_config_hook *hook_entry;
 
-	for (hook_entry = TAILQ_FIRST(&intr_config_hook_list);
-	     hook_entry != NULL;
-	     hook_entry = TAILQ_NEXT(hook_entry, ich_links))
+	mtx_lock(&intr_config_hook_lock);
+	TAILQ_FOREACH(hook_entry, &intr_config_hook_list, ich_links)
 		if (hook_entry == hook)
 			break;
 	if (hook_entry != NULL) {
+		mtx_unlock(&intr_config_hook_lock);
 		printf("config_intrhook_establish: establishing an "
 		       "already established hook.\n");
 		return (1);
 	}
 	TAILQ_INSERT_TAIL(&intr_config_hook_list, hook, ich_links);
+	mtx_unlock(&intr_config_hook_lock);
 	if (cold == 0)
 		/* XXX Sufficient for modules loaded after initial config??? */
 		run_interrupt_driven_config_hooks(NULL);	
@@ -108,9 +117,8 @@
 {
 	struct intr_config_hook *hook_entry;
 
-	for (hook_entry = TAILQ_FIRST(&intr_config_hook_list);
-	     hook_entry != NULL;
-	     hook_entry = TAILQ_NEXT(hook_entry, ich_links))
+	mtx_lock(&intr_config_hook_lock);
+	TAILQ_FOREACH(hook_entry, &intr_config_hook_list, ich_links)
 		if (hook_entry == hook)
 			break;
 	if (hook_entry == NULL)
@@ -118,6 +126,8 @@
 		      "unestablished hook");
 
 	TAILQ_REMOVE(&intr_config_hook_list, hook, ich_links);
+
 	/* Wakeup anyone watching the list */
 	wakeup(&intr_config_hook_list);
+	mtx_unlock(&intr_config_hook_lock);
 }
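
The hook list is now protected by a real mutex, and tsleep() becomes msleep() so the emptiness test and the sleep are atomic with respect to config_intrhook_disestablish(), which issues its wakeup() before dropping the lock; without that ordering a wakeup could slip in between the test and the sleep and be lost. The same pattern in userland terms, with a pthread mutex and condition variable (all names here are illustrative):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t hook_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  hook_cv   = PTHREAD_COND_INITIALIZER;
static int             hooks_pending = 1;

/* Waits until the last "hook" is disestablished. */
static void *
waiter(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&hook_lock);
	while (hooks_pending > 0)
		/* Atomically releases hook_lock while asleep, like msleep(). */
		pthread_cond_wait(&hook_cv, &hook_lock);
	pthread_mutex_unlock(&hook_lock);
	printf("all hooks done\n");
	return (NULL);
}

int
main(void)
{
	pthread_t td;

	pthread_create(&td, NULL, waiter, NULL);
	sleep(1);

	pthread_mutex_lock(&hook_lock);
	hooks_pending = 0;
	/* Signal while still holding the lock, mirroring the wakeup()
	 * issued before mtx_unlock() above, so the wakeup cannot be lost. */
	pthread_cond_signal(&hook_cv);
	pthread_mutex_unlock(&hook_lock);

	pthread_join(td, NULL);
	return (0);
}
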
Index: kern_umtx.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_umtx.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/kern_umtx.c -L sys/kern/kern_umtx.c -u -r1.2 -r1.3
--- sys/kern/kern_umtx.c
+++ sys/kern/kern_umtx.c
@@ -26,20 +26,24 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_umtx.c,v 1.33.2.1 2006/01/16 05:48:40 davidxu Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_umtx.c,v 1.61.2.1 2007/12/20 07:15:40 davidxu Exp $");
 
+#include "opt_compat.h"
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
+#include <sys/priv.h>
 #include <sys/proc.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/systm.h>
 #include <sys/sysproto.h>
 #include <sys/eventhandler.h>
-#include <sys/thr.h>
 #include <sys/umtx.h>
 
 #include <vm/vm.h>
@@ -48,81 +52,204 @@
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 
-#define UMTX_PRIVATE	0
-#define UMTX_SHARED	1
+#include <machine/cpu.h>
 
-#define UMTX_STATIC_SHARED
+#ifdef COMPAT_IA32
+#include <compat/freebsd32/freebsd32_proto.h>
+#endif
+
+#define TYPE_SIMPLE_LOCK	0
+#define TYPE_SIMPLE_WAIT	1
+#define TYPE_NORMAL_UMUTEX	2
+#define TYPE_PI_UMUTEX		3
+#define TYPE_PP_UMUTEX		4
+#define TYPE_CV			5	
 
+/* Key to represent a unique userland synchronization object */
 struct umtx_key {
+	int	hash;
 	int	type;
+	int	shared;
 	union {
 		struct {
 			vm_object_t	object;
-			long		offset;
+			uintptr_t	offset;
 		} shared;
 		struct {
-			struct umtx	*umtx;
-			long		pid;
+			struct vmspace	*vs;
+			uintptr_t	addr;
 		} private;
 		struct {
-			void		*ptr;
-			long		word;
+			void		*a;
+			uintptr_t	b;
 		} both;
 	} info;
 };
 
+/* Priority inheritance mutex info. */
+struct umtx_pi {
+	/* Owner thread */
+	struct thread		*pi_owner;
+
+	/* Reference count */
+	int			pi_refcount;
+
+ 	/* List entry to link umtx PIs held by a thread */
+	TAILQ_ENTRY(umtx_pi)	pi_link;
+
+	/* List entry in hash */
+	TAILQ_ENTRY(umtx_pi)	pi_hashlink;
+
+	/* List for waiters */
+	TAILQ_HEAD(,umtx_q)	pi_blocked;
+
+	/* Identify a userland lock object */
+	struct umtx_key		pi_key;
+};
+
+/* A waiter on a userland synchronization object. */
 struct umtx_q {
-	LIST_ENTRY(umtx_q)	uq_next;	/* Linked list for the hash. */
-	struct umtx_key		uq_key;		/* Umtx key. */
-	struct thread		*uq_thread;	/* The thread waits on. */
-	LIST_ENTRY(umtx_q)	uq_rqnext;	/* Linked list for requeuing. */
-	vm_offset_t		uq_addr;	/* Umtx's virtual address. */
+	/* Linked list for the hash. */
+	TAILQ_ENTRY(umtx_q)	uq_link;
+
+	/* Umtx key. */
+	struct umtx_key		uq_key;
+
+	/* Umtx flags. */
+	int			uq_flags;
+#define UQF_UMTXQ	0x0001
+
+	/* The thread waits on. */
+	struct thread		*uq_thread;
+
+	/*
+	 * Blocked on PI mutex.  Readers may hold either the chain
+	 * lock or umtx_lock; writers must hold both the chain lock
+	 * and umtx_lock.
+	 */
+	struct umtx_pi		*uq_pi_blocked;
+
+	/* On blocked list */
+	TAILQ_ENTRY(umtx_q)	uq_lockq;
+
+	/* Thread contending with us */
+	TAILQ_HEAD(,umtx_pi)	uq_pi_contested;
+
+	/* Inherited priority from PP mutex */
+	u_char			uq_inherited_pri;
 };
 
-LIST_HEAD(umtx_head, umtx_q);
+TAILQ_HEAD(umtxq_head, umtx_q);
+
+/* Userland lock object's wait-queue chain */
 struct umtxq_chain {
-	struct mtx		uc_lock;	/* Lock for this chain. */
-	struct umtx_head	uc_queue;	/* List of sleep queues. */
-#define	UCF_BUSY		0x01
-#define	UCF_WANT		0x02
-	int			uc_flags;
+	/* Lock for this chain. */
+	struct mtx		uc_lock;
+
+	/* List of sleep queues. */
+	struct umtxq_head	uc_queue;
+
+	/* Busy flag */
+	char			uc_busy;
+
+	/* Chain lock waiters */
+	int			uc_waiters;
+
+	/* All PI in the list */
+	TAILQ_HEAD(,umtx_pi)	uc_pi_list;
 };
 
+#define	UMTXQ_LOCKED_ASSERT(uc)		mtx_assert(&(uc)->uc_lock, MA_OWNED)
+
+/*
+ * Don't propagate time-sharing priority; there is a security reason.
+ * A user could simply create a PI mutex, let thread A lock it, and
+ * let another thread B block on it.  Because B is sleeping, its
+ * priority is boosted, and priority propagation then boosts A's
+ * priority as well; A's priority would never be lowered even if it
+ * used 100% CPU, which is unfair to other processes.
+ */
+
+#define UPRI(td)	(((td)->td_user_pri >= PRI_MIN_TIMESHARE &&\
+			  (td)->td_user_pri <= PRI_MAX_TIMESHARE) ?\
+			 PRI_MAX_TIMESHARE : (td)->td_user_pri)
+
 #define	GOLDEN_RATIO_PRIME	2654404609U
 #define	UMTX_CHAINS		128
 #define	UMTX_SHIFTS		(__WORD_BIT - 7)
 
-static struct umtxq_chain umtxq_chains[UMTX_CHAINS];
+#define THREAD_SHARE		0
+#define PROCESS_SHARE		1
+#define AUTO_SHARE		2
+
+#define	GET_SHARE(flags)	\
+    (((flags) & USYNC_PROCESS_SHARED) == 0 ? THREAD_SHARE : PROCESS_SHARE)
+
+static uma_zone_t		umtx_pi_zone;
+static struct umtxq_chain	umtxq_chains[UMTX_CHAINS];
 static MALLOC_DEFINE(M_UMTX, "umtx", "UMTX queue memory");
+static int			umtx_pi_allocated;
 
-static void umtxq_init_chains(void *);
-static int umtxq_hash(struct umtx_key *key);
-static struct mtx *umtxq_mtx(int chain);
+SYSCTL_NODE(_debug, OID_AUTO, umtx, CTLFLAG_RW, 0, "umtx debug");
+SYSCTL_INT(_debug_umtx, OID_AUTO, umtx_pi_allocated, CTLFLAG_RD,
+    &umtx_pi_allocated, 0, "Allocated umtx_pi");
+
+static void umtxq_sysinit(void *);
+static void umtxq_hash(struct umtx_key *key);
+static struct umtxq_chain *umtxq_getchain(struct umtx_key *key);
 static void umtxq_lock(struct umtx_key *key);
 static void umtxq_unlock(struct umtx_key *key);
 static void umtxq_busy(struct umtx_key *key);
 static void umtxq_unbusy(struct umtx_key *key);
 static void umtxq_insert(struct umtx_q *uq);
 static void umtxq_remove(struct umtx_q *uq);
-static int umtxq_sleep(struct thread *td, struct umtx_key *key,
-	int prio, const char *wmesg, int timo);
+static int umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo);
 static int umtxq_count(struct umtx_key *key);
 static int umtxq_signal(struct umtx_key *key, int nr_wakeup);
-#ifdef UMTX_DYNAMIC_SHARED
-static void fork_handler(void *arg, struct proc *p1, struct proc *p2,
-	int flags);
-#endif
 static int umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2);
-static int umtx_key_get(struct thread *td, struct umtx *umtx,
+static int umtx_key_get(void *addr, int type, int share,
 	struct umtx_key *key);
 static void umtx_key_release(struct umtx_key *key);
+static struct umtx_pi *umtx_pi_alloc(int);
+static void umtx_pi_free(struct umtx_pi *pi);
+static void umtx_pi_adjust_locked(struct thread *td, u_char oldpri);
+static int do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags);
+static void umtx_thread_cleanup(struct thread *td);
+static void umtx_exec_hook(void *arg __unused, struct proc *p __unused,
+	struct image_params *imgp __unused);
+SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_sysinit, NULL);
+
+static struct mtx umtx_lock;
 
-SYSINIT(umtx, SI_SUB_EVENTHANDLER+1, SI_ORDER_MIDDLE, umtxq_init_chains, NULL);
+static void
+umtxq_sysinit(void *arg __unused)
+{
+	int i;
+
+	umtx_pi_zone = uma_zcreate("umtx pi", sizeof(struct umtx_pi),
+		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+	for (i = 0; i < UMTX_CHAINS; ++i) {
+		mtx_init(&umtxq_chains[i].uc_lock, "umtxql", NULL,
+			 MTX_DEF | MTX_DUPOK);
+		TAILQ_INIT(&umtxq_chains[i].uc_queue);
+		TAILQ_INIT(&umtxq_chains[i].uc_pi_list);
+		umtxq_chains[i].uc_busy = 0;
+		umtxq_chains[i].uc_waiters = 0;
+	}
+	mtx_init(&umtx_lock, "umtx lock", NULL, MTX_SPIN);
+	EVENTHANDLER_REGISTER(process_exec, umtx_exec_hook, NULL,
+	    EVENTHANDLER_PRI_ANY);
+}
 
 struct umtx_q *
 umtxq_alloc(void)
 {
-	return (malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK));
+	struct umtx_q *uq;
+
+	uq = malloc(sizeof(struct umtx_q), M_UMTX, M_WAITOK | M_ZERO);
+	TAILQ_INIT(&uq->uq_pi_contested);
+	uq->uq_inherited_pri = PRI_MAX;
+	return (uq);
 }
 
 void
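
The chains added in the hunk above are indexed by a Fibonacci-style hash of the key: umtxq_hash() (in the next hunk) multiplies by GOLDEN_RATIO_PRIME and keeps the top bits as the chain number. A standalone sketch using the same constants; the sample addresses are arbitrary:

#include <stdio.h>
#include <stdint.h>

#define GOLDEN_RATIO_PRIME	2654404609U
#define UMTX_CHAINS		128
#define UMTX_SHIFTS		(32 - 7)	/* __WORD_BIT - 7 in the diff */

static unsigned
chain_hash(uintptr_t a, uintptr_t b)
{
	unsigned n = (unsigned)(a + b);	/* info.both.a + info.both.b */

	return (((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS);
}

int
main(void)
{
	/* Two nearby userland addresses still spread across chains. */
	printf("%u\n", chain_hash(0x800a1000, 0));
	printf("%u\n", chain_hash(0x800a1040, 0));
	return (0);
}
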
@@ -131,83 +258,84 @@
 	free(uq, M_UMTX);
 }
 
-static void
-umtxq_init_chains(void *arg __unused)
-{
-	int i;
-
-	for (i = 0; i < UMTX_CHAINS; ++i) {
-		mtx_init(&umtxq_chains[i].uc_lock, "umtxq_lock", NULL,
-			 MTX_DEF | MTX_DUPOK);
-		LIST_INIT(&umtxq_chains[i].uc_queue);
-		umtxq_chains[i].uc_flags = 0;
-	}
-#ifdef UMTX_DYNAMIC_SHARED
-	EVENTHANDLER_REGISTER(process_fork, fork_handler, 0, 10000);
-#endif
-}
-
-static inline int
+static inline void
 umtxq_hash(struct umtx_key *key)
 {
-	unsigned n = (uintptr_t)key->info.both.ptr + key->info.both.word;
-	return (((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS);
+	unsigned n = (uintptr_t)key->info.both.a + key->info.both.b;
+	key->hash = ((n * GOLDEN_RATIO_PRIME) >> UMTX_SHIFTS) % UMTX_CHAINS;
 }
 
 static inline int
 umtx_key_match(const struct umtx_key *k1, const struct umtx_key *k2)
 {
 	return (k1->type == k2->type &&
-		k1->info.both.ptr == k2->info.both.ptr &&
-	        k1->info.both.word == k2->info.both.word);
+		k1->info.both.a == k2->info.both.a &&
+	        k1->info.both.b == k2->info.both.b);
 }
 
-static inline struct mtx *
-umtxq_mtx(int chain)
+static inline struct umtxq_chain *
+umtxq_getchain(struct umtx_key *key)
 {
-	return (&umtxq_chains[chain].uc_lock);
+	return (&umtxq_chains[key->hash]);
 }
 
+/*
+ * Set the chain to the busy state when the following operation
+ * may block (a kernel mutex cannot be used).
+ */
 static inline void
 umtxq_busy(struct umtx_key *key)
 {
-	int chain = umtxq_hash(key);
+	struct umtxq_chain *uc;
 
-	mtx_assert(umtxq_mtx(chain), MA_OWNED);
-	while (umtxq_chains[chain].uc_flags & UCF_BUSY) {
-		umtxq_chains[chain].uc_flags |= UCF_WANT;
-		msleep(&umtxq_chains[chain], umtxq_mtx(chain),
-		       0, "umtxq_busy", 0);
+	uc = umtxq_getchain(key);
+	mtx_assert(&uc->uc_lock, MA_OWNED);
+	while (uc->uc_busy != 0) {
+		uc->uc_waiters++;
+		msleep(uc, &uc->uc_lock, 0, "umtxqb", 0);
+		uc->uc_waiters--;
 	}
-	umtxq_chains[chain].uc_flags |= UCF_BUSY;
+	uc->uc_busy = 1;
 }
 
+/*
+ * Unbusy a chain.
+ */
 static inline void
 umtxq_unbusy(struct umtx_key *key)
 {
-	int chain = umtxq_hash(key);
+	struct umtxq_chain *uc;
 
-	mtx_assert(umtxq_mtx(chain), MA_OWNED);
-	KASSERT(umtxq_chains[chain].uc_flags & UCF_BUSY, ("not busy"));
-	umtxq_chains[chain].uc_flags &= ~UCF_BUSY;
-	if (umtxq_chains[chain].uc_flags & UCF_WANT) {
-		umtxq_chains[chain].uc_flags &= ~UCF_WANT;
-		wakeup(&umtxq_chains[chain]);
-	}
+	uc = umtxq_getchain(key);
+	mtx_assert(&uc->uc_lock, MA_OWNED);
+	KASSERT(uc->uc_busy != 0, ("not busy"));
+	uc->uc_busy = 0;
+	if (uc->uc_waiters)
+		wakeup_one(uc);
 }
 
+/*
+ * Lock a chain.
+ */
 static inline void
 umtxq_lock(struct umtx_key *key)
 {
-	int chain = umtxq_hash(key);
-	mtx_lock(umtxq_mtx(chain));
+	struct umtxq_chain *uc;
+
+	uc = umtxq_getchain(key);
+	mtx_lock(&uc->uc_lock);
 }
 
+/*
+ * Unlock a chain.
+ */
 static inline void
 umtxq_unlock(struct umtx_key *key)
 {
-	int chain = umtxq_hash(key);
-	mtx_unlock(umtxq_mtx(chain));
+	struct umtxq_chain *uc;
+
+	uc = umtxq_getchain(key);
+	mtx_unlock(&uc->uc_lock);
 }
 
 /*
@@ -216,15 +344,12 @@
 static inline void
 umtxq_insert(struct umtx_q *uq)
 {
-	struct umtx_head *head;
-	int chain = umtxq_hash(&uq->uq_key);
+	struct umtxq_chain *uc;
 
-	mtx_assert(umtxq_mtx(chain), MA_OWNED);
-	head = &umtxq_chains[chain].uc_queue;
-	LIST_INSERT_HEAD(head, uq, uq_next);
-	mtx_lock_spin(&sched_lock);
-	uq->uq_thread->td_flags |= TDF_UMTXQ;
-	mtx_unlock_spin(&sched_lock);
+	uc = umtxq_getchain(&uq->uq_key);
+	UMTXQ_LOCKED_ASSERT(uc);
+	TAILQ_INSERT_TAIL(&uc->uc_queue, uq, uq_link);
+	uq->uq_flags |= UQF_UMTXQ;
 }
 
 /*
@@ -233,53 +358,78 @@
 static inline void
 umtxq_remove(struct umtx_q *uq)
 {
-	mtx_assert(umtxq_mtx(umtxq_hash(&uq->uq_key)), MA_OWNED);
-	if (uq->uq_thread->td_flags & TDF_UMTXQ) {
-		LIST_REMOVE(uq, uq_next);
-		/* turning off TDF_UMTXQ should be the last thing. */
-		mtx_lock_spin(&sched_lock);
-		uq->uq_thread->td_flags &= ~TDF_UMTXQ;
-		mtx_unlock_spin(&sched_lock);
+	struct umtxq_chain *uc;
+
+	uc = umtxq_getchain(&uq->uq_key);
+	UMTXQ_LOCKED_ASSERT(uc);
+	if (uq->uq_flags & UQF_UMTXQ) {
+		TAILQ_REMOVE(&uc->uc_queue, uq, uq_link);
+		uq->uq_flags &= ~UQF_UMTXQ;
 	}
 }
 
+/*
+ * Check if there are multiple waiters
+ */
 static int
 umtxq_count(struct umtx_key *key)
 {
+	struct umtxq_chain *uc;
+	struct umtx_q *uq;
+	int count = 0;
+
+	uc = umtxq_getchain(key);
+	UMTXQ_LOCKED_ASSERT(uc);
+	TAILQ_FOREACH(uq, &uc->uc_queue, uq_link) {
+		if (umtx_key_match(&uq->uq_key, key)) {
+			if (++count > 1)
+				break;
+		}
+	}
+	return (count);
+}
+
+/*
+ * Check if there are multiple PI waiters and return the first
+ * waiter.
+ */
+static int
+umtxq_count_pi(struct umtx_key *key, struct umtx_q **first)
+{
+	struct umtxq_chain *uc;
 	struct umtx_q *uq;
-	struct umtx_head *head;
-	int chain, count = 0;
+	int count = 0;
 
-	chain = umtxq_hash(key);
-	mtx_assert(umtxq_mtx(chain), MA_OWNED);
-	head = &umtxq_chains[chain].uc_queue;
-	LIST_FOREACH(uq, head, uq_next) {
+	*first = NULL;
+	uc = umtxq_getchain(key);
+	UMTXQ_LOCKED_ASSERT(uc);
+	TAILQ_FOREACH(uq, &uc->uc_queue, uq_link) {
 		if (umtx_key_match(&uq->uq_key, key)) {
 			if (++count > 1)
 				break;
+			*first = uq;
 		}
 	}
 	return (count);
 }
 
+/*
+ * Wake up threads waiting on a userland object.
+ */
 static int
 umtxq_signal(struct umtx_key *key, int n_wake)
 {
+	struct umtxq_chain *uc;
 	struct umtx_q *uq, *next;
-	struct umtx_head *head;
-	struct thread *blocked = NULL;
-	int chain, ret;
+	int ret;
 
 	ret = 0;
-	chain = umtxq_hash(key);
-	mtx_assert(umtxq_mtx(chain), MA_OWNED);
-	head = &umtxq_chains[chain].uc_queue;
-	for (uq = LIST_FIRST(head); uq; uq = next) {
-		next = LIST_NEXT(uq, uq_next);
+	uc = umtxq_getchain(key);
+	UMTXQ_LOCKED_ASSERT(uc);
+	TAILQ_FOREACH_SAFE(uq, &uc->uc_queue, uq_link, next) {
 		if (umtx_key_match(&uq->uq_key, key)) {
-			blocked = uq->uq_thread;
 			umtxq_remove(uq);
-			wakeup(blocked);
+			wakeup(uq);
 			if (++ret >= n_wake)
 				break;
 		}
@@ -287,180 +437,118 @@
 	return (ret);
 }
 
+/*
+ * Wake up specified thread.
+ */
+static inline void
+umtxq_signal_thread(struct umtx_q *uq)
+{
+	struct umtxq_chain *uc;
+
+	uc = umtxq_getchain(&uq->uq_key);
+	UMTXQ_LOCKED_ASSERT(uc);
+	umtxq_remove(uq);
+	wakeup(uq);
+}
+
+/*
+ * Put the thread to sleep; before sleeping, check whether the
+ * thread was already removed from the umtx queue.
+ */
 static inline int
-umtxq_sleep(struct thread *td, struct umtx_key *key, int priority,
-	    const char *wmesg, int timo)
+umtxq_sleep(struct umtx_q *uq, const char *wmesg, int timo)
 {
-	int chain = umtxq_hash(key);
-	int error = msleep(td, umtxq_mtx(chain), priority, wmesg, timo);
+	struct umtxq_chain *uc;
+	int error;
+
+	uc = umtxq_getchain(&uq->uq_key);
+	UMTXQ_LOCKED_ASSERT(uc);
+	if (!(uq->uq_flags & UQF_UMTXQ))
+		return (0);
+	error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
 	if (error == EWOULDBLOCK)
 		error = ETIMEDOUT;
 	return (error);
 }
 
+/*
+ * Convert userspace address into unique logical address.
+ */
 static int
-umtx_key_get(struct thread *td, struct umtx *umtx, struct umtx_key *key)
+umtx_key_get(void *addr, int type, int share, struct umtx_key *key)
 {
-#if defined(UMTX_DYNAMIC_SHARED) || defined(UMTX_STATIC_SHARED)
+	struct thread *td = curthread;
 	vm_map_t map;
 	vm_map_entry_t entry;
 	vm_pindex_t pindex;
 	vm_prot_t prot;
 	boolean_t wired;
 
-	map = &td->td_proc->p_vmspace->vm_map;
-	if (vm_map_lookup(&map, (vm_offset_t)umtx, VM_PROT_WRITE,
-	    &entry, &key->info.shared.object, &pindex, &prot,
-	    &wired) != KERN_SUCCESS) {
-		return EFAULT;
+	key->type = type;
+	if (share == THREAD_SHARE) {
+		key->shared = 0;
+		key->info.private.vs = td->td_proc->p_vmspace;
+		key->info.private.addr = (uintptr_t)addr;
+	} else {
+		MPASS(share == PROCESS_SHARE || share == AUTO_SHARE);
+		map = &td->td_proc->p_vmspace->vm_map;
+		if (vm_map_lookup(&map, (vm_offset_t)addr, VM_PROT_WRITE,
+		    &entry, &key->info.shared.object, &pindex, &prot,
+		    &wired) != KERN_SUCCESS) {
+			return EFAULT;
+		}
+
+		if ((share == PROCESS_SHARE) ||
+		    (share == AUTO_SHARE &&
+		     VM_INHERIT_SHARE == entry->inheritance)) {
+			key->shared = 1;
+			key->info.shared.offset = entry->offset + entry->start -
+				(vm_offset_t)addr;
+			vm_object_reference(key->info.shared.object);
+		} else {
+			key->shared = 0;
+			key->info.private.vs = td->td_proc->p_vmspace;
+			key->info.private.addr = (uintptr_t)addr;
+		}
+		vm_map_lookup_done(map, entry);
 	}
-#endif
 
-#if defined(UMTX_DYNAMIC_SHARED)
-	key->type = UMTX_SHARED;
-	key->info.shared.offset = entry->offset + entry->start - 
-		(vm_offset_t)umtx;
-	/*
-	 * Add object reference, if we don't do this, a buggy application
-	 * deallocates the object, the object will be reused by other
-	 * applications, then unlock will wake wrong thread.
-	 */
-	vm_object_reference(key->info.shared.object);
-	vm_map_lookup_done(map, entry);
-#elif defined(UMTX_STATIC_SHARED)
-	if (VM_INHERIT_SHARE == entry->inheritance) {
-		key->type = UMTX_SHARED;
-		key->info.shared.offset = entry->offset + entry->start -
-			(vm_offset_t)umtx;
-		vm_object_reference(key->info.shared.object);
-	} else {
-		key->type = UMTX_PRIVATE;
-		key->info.private.umtx = umtx;
-		key->info.private.pid  = td->td_proc->p_pid;
-	}
-	vm_map_lookup_done(map, entry);
-#else
-	key->type = UMTX_PRIVATE;
-	key->info.private.umtx = umtx;
-	key->info.private.pid  = td->td_proc->p_pid;
-#endif
+	umtxq_hash(key);
 	return (0);
 }
 
+/*
+ * Release key.
+ */
 static inline void
 umtx_key_release(struct umtx_key *key)
 {
-	if (key->type == UMTX_SHARED)
+	if (key->shared)
 		vm_object_deallocate(key->info.shared.object);
 }
 
-static inline int
-umtxq_queue_me(struct thread *td, struct umtx *umtx, struct umtx_q *uq)
-{
-	int error;
-
-	if ((error = umtx_key_get(td, umtx, &uq->uq_key)) != 0)
-		return (error);
-
-	uq->uq_addr = (vm_offset_t)umtx;
-	uq->uq_thread = td;
-	umtxq_lock(&uq->uq_key);
-	/* hmm, for condition variable, we don't need busy flag. */
-	umtxq_busy(&uq->uq_key);
-	umtxq_insert(uq);
-	umtxq_unbusy(&uq->uq_key);
-	umtxq_unlock(&uq->uq_key);
-	return (0);
-}
-
-#if defined(UMTX_DYNAMIC_SHARED)
-static void
-fork_handler(void *arg, struct proc *p1, struct proc *p2, int flags)
-{
-	vm_map_t map;
-	vm_map_entry_t entry;
-	vm_object_t object;
-	vm_pindex_t pindex;
-	vm_prot_t prot;
-	boolean_t wired;
-	struct umtx_key key;
-	LIST_HEAD(, umtx_q) workq;
-	struct umtx_q *uq;
-	struct thread *td;
-	int onq;
-
-	LIST_INIT(&workq);
-
-	/* Collect threads waiting on umtxq */
-	PROC_LOCK(p1);
-	FOREACH_THREAD_IN_PROC(p1, td) {
-		if (td->td_flags & TDF_UMTXQ) {
-			uq = td->td_umtxq;
-			if (uq)
-				LIST_INSERT_HEAD(&workq, uq, uq_rqnext);
-		}
-	}
-	PROC_UNLOCK(p1);
-
-	LIST_FOREACH(uq, &workq, uq_rqnext) {
-		map = &p1->p_vmspace->vm_map;
-		if (vm_map_lookup(&map, uq->uq_addr, VM_PROT_WRITE,
-		    &entry, &object, &pindex, &prot, &wired) != KERN_SUCCESS) {
-			continue;
-		}
-		key.type = UMTX_SHARED;
-		key.info.shared.object = object;
-		key.info.shared.offset = entry->offset + entry->start -
-			uq->uq_addr;
-		if (umtx_key_match(&key, &uq->uq_key)) {
-			vm_map_lookup_done(map, entry);
-			continue;
-		}
-		
-		umtxq_lock(&uq->uq_key);
-		umtxq_busy(&uq->uq_key);
-		if (uq->uq_thread->td_flags & TDF_UMTXQ) {
-			umtxq_remove(uq);
-			onq = 1;
-		} else
-			onq = 0;
-		umtxq_unbusy(&uq->uq_key);
-		umtxq_unlock(&uq->uq_key);
-		if (onq) {
-			vm_object_deallocate(uq->uq_key.info.shared.object);
-			uq->uq_key = key;
-			umtxq_lock(&uq->uq_key);
-			umtxq_busy(&uq->uq_key);
-			umtxq_insert(uq);
-			umtxq_unbusy(&uq->uq_key);
-			umtxq_unlock(&uq->uq_key);
-			vm_object_reference(uq->uq_key.info.shared.object);
-		}
-		vm_map_lookup_done(map, entry);
-	}
-}
-#endif
-
+/*
+ * Lock a umtx object.
+ */
 static int
-_do_lock(struct thread *td, struct umtx *umtx, long id, int timo)
+_do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id, int timo)
 {
 	struct umtx_q *uq;
-	intptr_t owner;
-	intptr_t old;
+	u_long owner;
+	u_long old;
 	int error = 0;
 
 	uq = td->td_umtxq;
+
 	/*
-	 * Care must be exercised when dealing with umtx structure.  It
+	 * Care must be exercised when dealing with umtx structure. It
 	 * can fault on any access.
 	 */
-
 	for (;;) {
 		/*
 		 * Try the uncontested case.  This should be done in userland.
 		 */
-		owner = casuptr((intptr_t *)&umtx->u_owner,
-		    UMTX_UNOWNED, id);
+		owner = casuword(&umtx->u_owner, UMTX_UNOWNED, id);
 
 		/* The acquire succeeded. */
 		if (owner == UMTX_UNOWNED)
@@ -472,7 +560,7 @@
 
 		/* If no one owns it but it is contested try to acquire it. */
 		if (owner == UMTX_CONTESTED) {
-			owner = casuptr((intptr_t *)&umtx->u_owner,
+			owner = casuword(&umtx->u_owner,
 			    UMTX_CONTESTED, id | UMTX_CONTESTED);
 
 			if (owner == UMTX_CONTESTED)
@@ -490,24 +578,31 @@
 		 * If we caught a signal, we have retried and now
 		 * exit immediately.
 		 */
-		if (error || (error = umtxq_queue_me(td, umtx, uq)) != 0)
+		if (error != 0)
 			return (error);
 
+		if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK,
+			AUTO_SHARE, &uq->uq_key)) != 0)
+			return (error);
+
+		umtxq_lock(&uq->uq_key);
+		umtxq_busy(&uq->uq_key);
+		umtxq_insert(uq);
+		umtxq_unbusy(&uq->uq_key);
+		umtxq_unlock(&uq->uq_key);
+
 		/*
 		 * Set the contested bit so that a release in user space
 		 * knows to use the system call for unlock.  If this fails
 		 * either some one else has acquired the lock or it has been
 		 * released.
 		 */
-		old = casuptr((intptr_t *)&umtx->u_owner, owner,
-		    owner | UMTX_CONTESTED);
+		old = casuword(&umtx->u_owner, owner, owner | UMTX_CONTESTED);
 
 		/* The address was invalid. */
 		if (old == -1) {
 			umtxq_lock(&uq->uq_key);
-			umtxq_busy(&uq->uq_key);
 			umtxq_remove(uq);
-			umtxq_unbusy(&uq->uq_key);
 			umtxq_unlock(&uq->uq_key);
 			umtx_key_release(&uq->uq_key);
 			return (EFAULT);
@@ -519,14 +614,9 @@
 		 * unlocking the umtx.
 		 */
 		umtxq_lock(&uq->uq_key);
-		if (old == owner && (td->td_flags & TDF_UMTXQ)) {
-			error = umtxq_sleep(td, &uq->uq_key,
-				       PCATCH,
-				       "umtx", timo);
-		}
-		umtxq_busy(&uq->uq_key);
+		if (old == owner)
+			error = umtxq_sleep(uq, "umtx", timo);
 		umtxq_remove(uq);
-		umtxq_unbusy(&uq->uq_key);
 		umtxq_unlock(&uq->uq_key);
 		umtx_key_release(&uq->uq_key);
 	}
@@ -534,8 +624,11 @@
 	return (0);
 }
 
+/*
+ * Lock a umtx object.
+ */
 static int
-do_lock(struct thread *td, struct umtx *umtx, long id,
+do_lock_umtx(struct thread *td, struct umtx *umtx, u_long id,
 	struct timespec *timeout)
 {
 	struct timespec ts, ts2, ts3;
@@ -543,13 +636,16 @@
 	int error;
 
 	if (timeout == NULL) {
-		error = _do_lock(td, umtx, id, 0);
+		error = _do_lock_umtx(td, umtx, id, 0);
+		/* Mutex locking is restarted if it is interrupted. */
+		if (error == EINTR)
+			error = ERESTART;
 	} else {
 		getnanouptime(&ts);
 		timespecadd(&ts, timeout);
 		TIMESPEC_TO_TIMEVAL(&tv, timeout);
 		for (;;) {
-			error = _do_lock(td, umtx, id, tvtohz(&tv));
+			error = _do_lock_umtx(td, umtx, id, tvtohz(&tv));
 			if (error != ETIMEDOUT)
 				break;
 			getnanouptime(&ts2);
@@ -561,41 +657,48 @@
 			timespecsub(&ts3, &ts2);
 			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
 		}
+		/* Timed-locking is not restarted. */
+		if (error == ERESTART)
+			error = EINTR;
 	}
-	/*
-	 * This lets userland back off critical region if needed.
-	 */
-	if (error == ERESTART)
-		error = EINTR;
 	return (error);
 }
 
+/*
+ * Unlock a umtx object.
+ */
 static int
-do_unlock(struct thread *td, struct umtx *umtx, long id)
+do_unlock_umtx(struct thread *td, struct umtx *umtx, u_long id)
 {
 	struct umtx_key key;
-	intptr_t owner;
-	intptr_t old;
+	u_long owner;
+	u_long old;
 	int error;
 	int count;
 
 	/*
 	 * Make sure we own this mtx.
-	 *
-	 * XXX Need a {fu,su}ptr this is not correct on arch where
-	 * sizeof(intptr_t) != sizeof(long).
 	 */
-	if ((owner = fuword(&umtx->u_owner)) == -1)
+	owner = fuword(__DEVOLATILE(u_long *, &umtx->u_owner));
+	if (owner == -1)
 		return (EFAULT);
 
 	if ((owner & ~UMTX_CONTESTED) != id)
 		return (EPERM);
 
-	/* We should only ever be in here for contested locks */
-	if ((owner & UMTX_CONTESTED) == 0)
-		return (EINVAL);
+	/* This should be done in userland */
+	if ((owner & UMTX_CONTESTED) == 0) {
+		old = casuword(&umtx->u_owner, owner, UMTX_UNOWNED);
+		if (old == -1)
+			return (EFAULT);
+		if (old == owner)
+			return (0);
+		owner = old;
+	}
 
-	if ((error = umtx_key_get(td, umtx, &key)) != 0)
+	/* We should only ever be in here for contested locks */
+	if ((error = umtx_key_get(umtx, TYPE_SIMPLE_LOCK, AUTO_SHARE,
+		&key)) != 0)
 		return (error);
 
 	umtxq_lock(&key);
@@ -608,10 +711,10 @@
 	 * there is zero or one thread only waiting for it.
 	 * Otherwise, it must be marked as contested.
 	 */
-	old = casuptr((intptr_t *)&umtx->u_owner, owner,
-			count <= 1 ? UMTX_UNOWNED : UMTX_CONTESTED);
+	old = casuword(&umtx->u_owner, owner,
+		count <= 1 ? UMTX_UNOWNED : UMTX_CONTESTED);
 	umtxq_lock(&key);
-	umtxq_signal(&key, 0);
+	umtxq_signal(&key,1);
 	umtxq_unbusy(&key);
 	umtxq_unlock(&key);
 	umtx_key_release(&key);
@@ -622,49 +725,126 @@
 	return (0);
 }
 
+#ifdef COMPAT_IA32
+
+/*
+ * Lock a umtx object.
+ */
 static int
-do_wait(struct thread *td, struct umtx *umtx, long id, struct timespec *timeout)
+_do_lock_umtx32(struct thread *td, uint32_t *m, uint32_t id, int timo)
 {
 	struct umtx_q *uq;
-	struct timespec ts, ts2, ts3;
-	struct timeval tv;
-	long tmp;
+	uint32_t owner;
+	uint32_t old;
 	int error = 0;
 
 	uq = td->td_umtxq;
-	if ((error = umtxq_queue_me(td, umtx, uq)) != 0)
-		return (error);
-	tmp = fuword(&umtx->u_owner);
-	if (tmp != id) {
-		umtxq_lock(&uq->uq_key);
-		umtxq_remove(uq);
-		umtxq_unlock(&uq->uq_key);
-	} else if (timeout == NULL) {
+
+	/*
+	 * Care must be exercised when dealing with umtx structure. It
+	 * can fault on any access.
+	 */
+	for (;;) {
+		/*
+		 * Try the uncontested case.  This should be done in userland.
+		 */
+		owner = casuword32(m, UMUTEX_UNOWNED, id);
+
+		/* The acquire succeeded. */
+		if (owner == UMUTEX_UNOWNED)
+			return (0);
+
+		/* The address was invalid. */
+		if (owner == -1)
+			return (EFAULT);
+
+		/* If no one owns it but it is contested try to acquire it. */
+		if (owner == UMUTEX_CONTESTED) {
+			owner = casuword32(m,
+			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
+			if (owner == UMUTEX_CONTESTED)
+				return (0);
+
+			/* The address was invalid. */
+			if (owner == -1)
+				return (EFAULT);
+
+			/* If this failed the lock has changed, restart. */
+			continue;
+		}
+
+		/*
+		 * If we caught a signal, we have retried and now
+		 * exit immediately.
+		 */
+		if (error != 0)
+			return (error);
+
+		if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK,
+			AUTO_SHARE, &uq->uq_key)) != 0)
+			return (error);
+
 		umtxq_lock(&uq->uq_key);
-		if (td->td_flags & TDF_UMTXQ)
-			error = umtxq_sleep(td, &uq->uq_key,
-			       PCATCH, "ucond", 0);
-		if (!(td->td_flags & TDF_UMTXQ))
-			error = 0;
-		else
-			umtxq_remove(uq);
+		umtxq_busy(&uq->uq_key);
+		umtxq_insert(uq);
+		umtxq_unbusy(&uq->uq_key);
 		umtxq_unlock(&uq->uq_key);
-	} else {
-		getnanouptime(&ts);
-		timespecadd(&ts, timeout);
-		TIMESPEC_TO_TIMEVAL(&tv, timeout);
-		for (;;) {
+
+		/*
+		 * Set the contested bit so that a release in user space
+		 * knows to use the system call for unlock.  If this fails
+		 * either some one else has acquired the lock or it has been
+		 * released.
+		 */
+		old = casuword32(m, owner, owner | UMUTEX_CONTESTED);
+
+		/* The address was invalid. */
+		if (old == -1) {
 			umtxq_lock(&uq->uq_key);
-			if (td->td_flags & TDF_UMTXQ) {
-				error = umtxq_sleep(td, &uq->uq_key,
-					    PCATCH,
-					    "ucond", tvtohz(&tv));
-			}
-			if (!(td->td_flags & TDF_UMTXQ)) {
-				umtxq_unlock(&uq->uq_key);
-				goto out;
-			}
+			umtxq_remove(uq);
 			umtxq_unlock(&uq->uq_key);
+			umtx_key_release(&uq->uq_key);
+			return (EFAULT);
+		}
+
+		/*
+		 * We set the contested bit, sleep. Otherwise the lock changed
+		 * and we need to retry or we lost a race to the thread
+		 * unlocking the umtx.
+		 */
+		umtxq_lock(&uq->uq_key);
+		if (old == owner)
+			error = umtxq_sleep(uq, "umtx", timo);
+		umtxq_remove(uq);
+		umtxq_unlock(&uq->uq_key);
+		umtx_key_release(&uq->uq_key);
+	}
+
+	return (0);
+}
+
+/*
+ * Lock a umtx object.
+ */
+static int
+do_lock_umtx32(struct thread *td, void *m, uint32_t id,
+	struct timespec *timeout)
+{
+	struct timespec ts, ts2, ts3;
+	struct timeval tv;
+	int error;
+
+	if (timeout == NULL) {
+		error = _do_lock_umtx32(td, m, id, 0);
+		/* Mutex locking is restarted if it is interrupted. */
+		if (error == EINTR)
+			error = ERESTART;
+	} else {
+		getnanouptime(&ts);
+		timespecadd(&ts, timeout);
+		TIMESPEC_TO_TIMEVAL(&tv, timeout);
+		for (;;) {
+			error = _do_lock_umtx32(td, m, id, tvtohz(&tv));
 			if (error != ETIMEDOUT)
 				break;
 			getnanouptime(&ts2);
@@ -676,24 +856,152 @@
 			timespecsub(&ts3, &ts2);
 			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
 		}
+		/* Timed-locking is not restarted. */
+		if (error == ERESTART)
+			error = EINTR;
+	}
+	return (error);
+}
+
+/*
+ * Unlock a umtx object.
+ */
+static int
+do_unlock_umtx32(struct thread *td, uint32_t *m, uint32_t id)
+{
+	struct umtx_key key;
+	uint32_t owner;
+	uint32_t old;
+	int error;
+	int count;
+
+	/*
+	 * Make sure we own this mtx.
+	 */
+	owner = fuword32(m);
+	if (owner == -1)
+		return (EFAULT);
+
+	if ((owner & ~UMUTEX_CONTESTED) != id)
+		return (EPERM);
+
+	/* This should be done in userland */
+	if ((owner & UMUTEX_CONTESTED) == 0) {
+		old = casuword32(m, owner, UMUTEX_UNOWNED);
+		if (old == -1)
+			return (EFAULT);
+		if (old == owner)
+			return (0);
+		owner = old;
+	}
+
+	/* We should only ever be in here for contested locks */
+	if ((error = umtx_key_get(m, TYPE_SIMPLE_LOCK, AUTO_SHARE,
+		&key)) != 0)
+		return (error);
+
+	umtxq_lock(&key);
+	umtxq_busy(&key);
+	count = umtxq_count(&key);
+	umtxq_unlock(&key);
+
+	/*
+	 * When unlocking the umtx, it must be marked as unowned if
+	 * there is zero or one thread only waiting for it.
+	 * Otherwise, it must be marked as contested.
+	 */
+	old = casuword32(m, owner,
+		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
+	umtxq_lock(&key);
+	umtxq_signal(&key,1);
+	umtxq_unbusy(&key);
+	umtxq_unlock(&key);
+	umtx_key_release(&key);
+	if (old == -1)
+		return (EFAULT);
+	if (old != owner)
+		return (EINVAL);
+	return (0);
+}
+#endif
+
+/*
+ * Fetch and compare value, sleep on the address if value is not changed.
+ */
+static int
+do_wait(struct thread *td, void *addr, u_long id,
+	struct timespec *timeout, int compat32)
+{
+	struct umtx_q *uq;
+	struct timespec ts, ts2, ts3;
+	struct timeval tv;
+	u_long tmp;
+	int error = 0;
+
+	uq = td->td_umtxq;
+	if ((error = umtx_key_get(addr, TYPE_SIMPLE_WAIT, AUTO_SHARE,
+	    &uq->uq_key)) != 0)
+		return (error);
+
+	umtxq_lock(&uq->uq_key);
+	umtxq_insert(uq);
+	umtxq_unlock(&uq->uq_key);
+	if (compat32 == 0)
+		tmp = fuword(addr);
+        else
+		tmp = fuword32(addr);
+	if (tmp != id) {
+		umtxq_lock(&uq->uq_key);
+		umtxq_remove(uq);
+		umtxq_unlock(&uq->uq_key);
+	} else if (timeout == NULL) {
+		umtxq_lock(&uq->uq_key);
+		error = umtxq_sleep(uq, "uwait", 0);
+		umtxq_remove(uq);
+		umtxq_unlock(&uq->uq_key);
+	} else {
+		getnanouptime(&ts);
+		timespecadd(&ts, timeout);
+		TIMESPEC_TO_TIMEVAL(&tv, timeout);
 		umtxq_lock(&uq->uq_key);
+		for (;;) {
+			error = umtxq_sleep(uq, "uwait", tvtohz(&tv));
+			if (!(uq->uq_flags & UQF_UMTXQ))
+				break;
+			if (error != ETIMEDOUT)
+				break;
+			umtxq_unlock(&uq->uq_key);
+			getnanouptime(&ts2);
+			if (timespeccmp(&ts2, &ts, >=)) {
+				error = ETIMEDOUT;
+				umtxq_lock(&uq->uq_key);
+				break;
+			}
+			ts3 = ts;
+			timespecsub(&ts3, &ts2);
+			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
+			umtxq_lock(&uq->uq_key);
+		}
 		umtxq_remove(uq);
 		umtxq_unlock(&uq->uq_key);
 	}
-out:
 	umtx_key_release(&uq->uq_key);
 	if (error == ERESTART)
 		error = EINTR;
 	return (error);
 }
 
+/*
+ * Wake up threads sleeping on the specified address.
+ */
 int
 kern_umtx_wake(struct thread *td, void *uaddr, int n_wake)
 {
 	struct umtx_key key;
 	int ret;
 	
-	if ((ret = umtx_key_get(td, uaddr, &key)) != 0)
+	if ((ret = umtx_key_get(uaddr, TYPE_SIMPLE_WAIT, AUTO_SHARE,
+	   &key)) != 0)
 		return (ret);
 	umtxq_lock(&key);
 	ret = umtxq_signal(&key, n_wake);
@@ -702,71 +1010,1726 @@
 	return (0);
 }
 
-int
-_umtx_lock(struct thread *td, struct _umtx_lock_args *uap)
-    /* struct umtx *umtx */
+/*
+ * Lock PTHREAD_PRIO_NONE protocol POSIX mutex.
+ */
+static int
+_do_lock_normal(struct thread *td, struct umutex *m, uint32_t flags, int timo,
+	int try)
 {
-	return _do_lock(td, uap->umtx, td->td_tid, 0);
-}
+	struct umtx_q *uq;
+	uint32_t owner, old, id;
+	int error = 0;
 
-int
-_umtx_unlock(struct thread *td, struct _umtx_unlock_args *uap)
-    /* struct umtx *umtx */
-{
-	return do_unlock(td, uap->umtx, td->td_tid);
+	id = td->td_tid;
+	uq = td->td_umtxq;
+
+	/*
+	 * Care must be exercised when dealing with umtx structure. It
+	 * can fault on any access.
+	 */
+	for (;;) {
+		/*
+		 * Try the uncontested case.  This should be done in userland.
+		 */
+		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);
+
+		/* The acquire succeeded. */
+		if (owner == UMUTEX_UNOWNED)
+			return (0);
+
+		/* The address was invalid. */
+		if (owner == -1)
+			return (EFAULT);
+
+		/* If no one owns it but it is contested try to acquire it. */
+		if (owner == UMUTEX_CONTESTED) {
+			owner = casuword32(&m->m_owner,
+			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
+
+			if (owner == UMUTEX_CONTESTED)
+				return (0);
+
+			/* The address was invalid. */
+			if (owner == -1)
+				return (EFAULT);
+
+			/* If this failed the lock has changed, restart. */
+			continue;
+		}
+
+		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
+		    (owner & ~UMUTEX_CONTESTED) == id)
+			return (EDEADLK);
+
+		if (try != 0)
+			return (EBUSY);
+
+		/*
+		 * If we caught a signal, we have retried and now
+		 * exit immediately.
+		 */
+		if (error != 0)
+			return (error);
+
+		if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX,
+		    GET_SHARE(flags), &uq->uq_key)) != 0)
+			return (error);
+
+		umtxq_lock(&uq->uq_key);
+		umtxq_busy(&uq->uq_key);
+		umtxq_insert(uq);
+		umtxq_unbusy(&uq->uq_key);
+		umtxq_unlock(&uq->uq_key);
+
+		/*
+		 * Set the contested bit so that a release in user space
+		 * knows to use the system call for unlock.  If this fails
+		 * either some one else has acquired the lock or it has been
+		 * released.
+		 */
+		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);
+
+		/* The address was invalid. */
+		if (old == -1) {
+			umtxq_lock(&uq->uq_key);
+			umtxq_remove(uq);
+			umtxq_unlock(&uq->uq_key);
+			umtx_key_release(&uq->uq_key);
+			return (EFAULT);
+		}
+
+		/*
+		 * We set the contested bit, sleep. Otherwise the lock changed
+		 * and we need to retry or we lost a race to the thread
+		 * unlocking the umtx.
+		 */
+		umtxq_lock(&uq->uq_key);
+		if (old == owner)
+			error = umtxq_sleep(uq, "umtxn", timo);
+		umtxq_remove(uq);
+		umtxq_unlock(&uq->uq_key);
+		umtx_key_release(&uq->uq_key);
+	}
+
+	return (0);
 }
 
-int
-_umtx_op(struct thread *td, struct _umtx_op_args *uap)
+/*
+ * Unlock PTHREAD_PRIO_NONE protocol POSIX mutex.
+ */
+static int
+do_unlock_normal(struct thread *td, struct umutex *m, uint32_t flags)
 {
-	struct timespec timeout;
-	struct timespec *ts;
+	struct umtx_key key;
+	uint32_t owner, old, id;
 	int error;
+	int count;
+
+	id = td->td_tid;
+	/*
+	 * Make sure we own this mtx.
+	 */
+	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
+	if (owner == -1)
+		return (EFAULT);
+
+	if ((owner & ~UMUTEX_CONTESTED) != id)
+		return (EPERM);
 
-	switch(uap->op) {
-	case UMTX_OP_LOCK:
-		/* Allow a null timespec (wait forever). */
-		if (uap->uaddr2 == NULL)
-			ts = NULL;
-		else {
-			error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
-			if (error != 0)
-				break;
-			if (timeout.tv_nsec >= 1000000000 ||
-			    timeout.tv_nsec < 0) {
-				error = EINVAL;
-				break;
-			}
-			ts = &timeout;
-		}
-		error = do_lock(td, uap->umtx, uap->id, ts);
-		break;
-	case UMTX_OP_UNLOCK:
-		error = do_unlock(td, uap->umtx, uap->id);
-		break;
-	case UMTX_OP_WAIT:
-		/* Allow a null timespec (wait forever). */
-		if (uap->uaddr2 == NULL)
-			ts = NULL;
-		else {
-			error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
-			if (error != 0)
-				break;
-			if (timeout.tv_nsec >= 1000000000 ||
-			    timeout.tv_nsec < 0) {
-				error = EINVAL;
-				break;
-			}
-			ts = &timeout;
-		}
-		error = do_wait(td, uap->umtx, uap->id, ts);
-		break;
-	case UMTX_OP_WAKE:
-		error = kern_umtx_wake(td, uap->umtx, uap->id);
-		break;
-	default:
-		error = EINVAL;
-		break;
+	/* This should be done in userland */
+	if ((owner & UMUTEX_CONTESTED) == 0) {
+		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
+		if (old == -1)
+			return (EFAULT);
+		if (old == owner)
+			return (0);
+		owner = old;
 	}
-	return (error);
+
+	/* We should only ever be in here for contested locks */
+	if ((error = umtx_key_get(m, TYPE_NORMAL_UMUTEX, GET_SHARE(flags),
+	    &key)) != 0)
+		return (error);
+
+	umtxq_lock(&key);
+	umtxq_busy(&key);
+	count = umtxq_count(&key);
+	umtxq_unlock(&key);
+
+	/*
+	 * When unlocking the umtx, it must be marked as unowned if
+	 * there is zero or one thread only waiting for it.
+	 * Otherwise, it must be marked as contested.
+	 */
+	old = casuword32(&m->m_owner, owner,
+		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
+	umtxq_lock(&key);
+	umtxq_signal(&key, 1);
+	umtxq_unbusy(&key);
+	umtxq_unlock(&key);
+	umtx_key_release(&key);
+	if (old == -1)
+		return (EFAULT);
+	if (old != owner)
+		return (EINVAL);
+	return (0);
+}
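
The contested-bit handshake implemented by _do_lock_normal() and do_unlock_normal() above assumes a userland fast path roughly like the sketch below. This is an editorial illustration, not part of the patch: the helper names are hypothetical, a bare lock word stands in for struct umutex, and the constants mirror UMUTEX_UNOWNED/UMUTEX_CONTESTED (the CONTESTED value is assumed for illustration).

#include <stdint.h>

#define UNOWNED		0U		/* mirrors UMUTEX_UNOWNED */
#define CONTESTED	0x80000000U	/* mirrors UMUTEX_CONTESTED (assumed value) */

/* Returns 1 on success; on failure the caller must enter the kernel. */
static int
fastpath_lock(volatile uint32_t *owner, uint32_t tid)
{
	/* Uncontested case: claim the word with a single CAS. */
	return (__sync_bool_compare_and_swap(owner, UNOWNED, tid));
}

static int
fastpath_unlock(volatile uint32_t *owner, uint32_t tid)
{
	/*
	 * Succeeds only while the contested bit is clear.  Once a waiter
	 * has set it (see the casuword32() calls above), unlock must go
	 * through the kernel so that a sleeping thread is woken and the
	 * word is rewritten to UNOWNED or CONTESTED.
	 */
	return (__sync_bool_compare_and_swap(owner, tid, UNOWNED));
}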
+
+static inline struct umtx_pi *
+umtx_pi_alloc(int flags)
+{
+	struct umtx_pi *pi;
+
+	pi = uma_zalloc(umtx_pi_zone, M_ZERO | flags);
+	TAILQ_INIT(&pi->pi_blocked);
+	atomic_add_int(&umtx_pi_allocated, 1);
+	return (pi);
+}
+
+static inline void
+umtx_pi_free(struct umtx_pi *pi)
+{
+	uma_zfree(umtx_pi_zone, pi);
+	atomic_add_int(&umtx_pi_allocated, -1);
+}
+
+/*
+ * Adjust the thread's position on a pi_state after its priority has been
+ * changed.
+ */
+static int
+umtx_pi_adjust_thread(struct umtx_pi *pi, struct thread *td)
+{
+	struct umtx_q *uq, *uq1, *uq2;
+	struct thread *td1;
+
+	mtx_assert(&umtx_lock, MA_OWNED);
+	if (pi == NULL)
+		return (0);
+
+	uq = td->td_umtxq;
+
+	/*
+	 * Check if the thread needs to be moved on the blocked chain.
+	 * It needs to be moved if either its priority is lower than
+	 * the previous thread or higher than the next thread.
+	 */
+	uq1 = TAILQ_PREV(uq, umtxq_head, uq_lockq);
+	uq2 = TAILQ_NEXT(uq, uq_lockq);
+	if ((uq1 != NULL && UPRI(td) < UPRI(uq1->uq_thread)) ||
+	    (uq2 != NULL && UPRI(td) > UPRI(uq2->uq_thread))) {
+		/*
+		 * Remove thread from blocked chain and determine where
+		 * it should be moved to.
+		 */
+		TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
+		TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
+			td1 = uq1->uq_thread;
+			MPASS(td1->td_proc->p_magic == P_MAGIC);
+			if (UPRI(td1) > UPRI(td))
+				break;
+		}
+
+		if (uq1 == NULL)
+			TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
+		else
+			TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
+	}
+	return (1);
+}
+
+/*
+ * Propagate priority when a thread is blocked on POSIX
+ * PI mutex.
+ */ 
+static void
+umtx_propagate_priority(struct thread *td)
+{
+	struct umtx_q *uq;
+	struct umtx_pi *pi;
+	int pri;
+
+	mtx_assert(&umtx_lock, MA_OWNED);
+	pri = UPRI(td);
+	uq = td->td_umtxq;
+	pi = uq->uq_pi_blocked;
+	if (pi == NULL)
+		return;
+
+	for (;;) {
+		td = pi->pi_owner;
+		if (td == NULL)
+			return;
+
+		MPASS(td->td_proc != NULL);
+		MPASS(td->td_proc->p_magic == P_MAGIC);
+
+		if (UPRI(td) <= pri)
+			return;
+
+		thread_lock(td);
+		sched_lend_user_prio(td, pri);
+		thread_unlock(td);
+
+		/*
+		 * Pick up the lock that td is blocked on.
+		 */
+		uq = td->td_umtxq;
+		pi = uq->uq_pi_blocked;
+		/* Resort td on the list if needed. */
+		if (!umtx_pi_adjust_thread(pi, td))
+			break;
+	}
+}
+
+/*
+ * Undo priority propagation for a PI mutex when a thread blocked on
+ * it is interrupted by a signal or resumed by another thread.
+ */
+static void
+umtx_unpropagate_priority(struct umtx_pi *pi)
+{
+	struct umtx_q *uq, *uq_owner;
+	struct umtx_pi *pi2;
+	int pri, oldpri;
+
+	mtx_assert(&umtx_lock, MA_OWNED);
+
+	while (pi != NULL && pi->pi_owner != NULL) {
+		pri = PRI_MAX;
+		uq_owner = pi->pi_owner->td_umtxq;
+
+		TAILQ_FOREACH(pi2, &uq_owner->uq_pi_contested, pi_link) {
+			uq = TAILQ_FIRST(&pi2->pi_blocked);
+			if (uq != NULL) {
+				if (pri > UPRI(uq->uq_thread))
+					pri = UPRI(uq->uq_thread);
+			}
+		}
+
+		if (pri > uq_owner->uq_inherited_pri)
+			pri = uq_owner->uq_inherited_pri;
+		thread_lock(pi->pi_owner);
+		oldpri = pi->pi_owner->td_user_pri;
+		sched_unlend_user_prio(pi->pi_owner, pri);
+		thread_unlock(pi->pi_owner);
+		umtx_pi_adjust_locked(pi->pi_owner, oldpri);
+		pi = uq_owner->uq_pi_blocked;
+	}
+}
+
+/*
+ * Insert a PI mutex into owned list.
+ */
+static void
+umtx_pi_setowner(struct umtx_pi *pi, struct thread *owner)
+{
+	struct umtx_q *uq_owner;
+
+	uq_owner = owner->td_umtxq;
+	mtx_assert(&umtx_lock, MA_OWNED);
+	if (pi->pi_owner != NULL)
+		panic("pi_owner != NULL");
+	pi->pi_owner = owner;
+	TAILQ_INSERT_TAIL(&uq_owner->uq_pi_contested, pi, pi_link);
+}
+
+/*
+ * Claim ownership of a PI mutex.
+ */
+static int
+umtx_pi_claim(struct umtx_pi *pi, struct thread *owner)
+{
+	struct umtx_q *uq, *uq_owner;
+
+	uq_owner = owner->td_umtxq;
+	mtx_lock_spin(&umtx_lock);
+	if (pi->pi_owner == owner) {
+		mtx_unlock_spin(&umtx_lock);
+		return (0);
+	}
+
+	if (pi->pi_owner != NULL) {
+		/*
+		 * userland may have already messed up the mutex, sigh.
+		 */
+		mtx_unlock_spin(&umtx_lock);
+		return (EPERM);
+	}
+	umtx_pi_setowner(pi, owner);
+	uq = TAILQ_FIRST(&pi->pi_blocked);
+	if (uq != NULL) {
+		int pri;
+
+		pri = UPRI(uq->uq_thread);
+		thread_lock(owner);
+		if (pri < UPRI(owner))
+			sched_lend_user_prio(owner, pri);
+		thread_unlock(owner);
+	}
+	mtx_unlock_spin(&umtx_lock);
+	return (0);
+}
+
+static void
+umtx_pi_adjust_locked(struct thread *td, u_char oldpri)
+{
+	struct umtx_q *uq;
+	struct umtx_pi *pi;
+
+	uq = td->td_umtxq;
+
+	mtx_assert(&umtx_lock, MA_OWNED);
+	MPASS(TD_ON_UPILOCK(td));
+
+	/*
+	 * Pick up the lock that td is blocked on.
+	 */
+	pi = uq->uq_pi_blocked;
+	MPASS(pi != NULL);
+
+	/* Resort the turnstile on the list. */
+	if (!umtx_pi_adjust_thread(pi, td))
+		return;
+
+	/*
+	 * If our priority was lowered and we are at the head of the
+	 * turnstile, then propagate our new priority up the chain.
+	 */
+	if (uq == TAILQ_FIRST(&pi->pi_blocked) && UPRI(td) < oldpri)
+		umtx_propagate_priority(td);
+}
+
+/*
+ * Adjust a thread's position on the blocked list of the PI mutex it is
+ * blocked on; this may trigger another round of priority propagation.
+ */
+void
+umtx_pi_adjust(struct thread *td, u_char oldpri)
+{
+	struct umtx_q *uq;
+	struct umtx_pi *pi;
+
+	uq = td->td_umtxq;
+	mtx_lock_spin(&umtx_lock);
+	/*
+	 * Pick up the lock that td is blocked on.
+	 */
+	pi = uq->uq_pi_blocked;
+	if (pi != NULL)
+		umtx_pi_adjust_locked(td, oldpri);
+	mtx_unlock_spin(&umtx_lock);
+}
+
+/*
+ * Sleep on a PI mutex.
+ */
+static int
+umtxq_sleep_pi(struct umtx_q *uq, struct umtx_pi *pi,
+	uint32_t owner, const char *wmesg, int timo)
+{
+	struct umtxq_chain *uc;
+	struct thread *td, *td1;
+	struct umtx_q *uq1;
+	int pri;
+	int error = 0;
+
+	td = uq->uq_thread;
+	KASSERT(td == curthread, ("inconsistent uq_thread"));
+	uc = umtxq_getchain(&uq->uq_key);
+	UMTXQ_LOCKED_ASSERT(uc);
+	umtxq_insert(uq);
+	if (pi->pi_owner == NULL) {
+		/* XXX
+		 * Currently, we only support process-private PI mutexes;
+		 * non-contended PI mutexes are locked in userland.
+		 * Process-shared PI mutexes should always be initialized
+		 * and registered by the kernel, and locking should always
+		 * be done by the kernel to avoid security problems.
+		 * For a process-private PI mutex, we can find the owner
+		 * thread and boost its priority safely.
+		 */
+		PROC_LOCK(curproc);
+		td1 = thread_find(curproc, owner);
+		mtx_lock_spin(&umtx_lock);
+		if (td1 != NULL && pi->pi_owner == NULL) {
+			uq1 = td1->td_umtxq;
+			umtx_pi_setowner(pi, td1);
+		}
+		PROC_UNLOCK(curproc);
+	} else {
+		mtx_lock_spin(&umtx_lock);
+	}
+
+	TAILQ_FOREACH(uq1, &pi->pi_blocked, uq_lockq) {
+		pri = UPRI(uq1->uq_thread);
+		if (pri > UPRI(td))
+			break;
+	}
+
+	if (uq1 != NULL)
+		TAILQ_INSERT_BEFORE(uq1, uq, uq_lockq);
+	else
+		TAILQ_INSERT_TAIL(&pi->pi_blocked, uq, uq_lockq);
+
+	uq->uq_pi_blocked = pi;
+	td->td_flags |= TDF_UPIBLOCKED;
+	mtx_unlock_spin(&umtx_lock);
+	umtxq_unlock(&uq->uq_key);
+
+	mtx_lock_spin(&umtx_lock);
+	umtx_propagate_priority(td);
+	mtx_unlock_spin(&umtx_lock);
+
+	umtxq_lock(&uq->uq_key);
+	if (uq->uq_flags & UQF_UMTXQ) {
+		error = msleep(uq, &uc->uc_lock, PCATCH, wmesg, timo);
+		if (error == EWOULDBLOCK)
+			error = ETIMEDOUT;
+		if (uq->uq_flags & UQF_UMTXQ) {
+			umtxq_busy(&uq->uq_key);
+			umtxq_remove(uq);
+			umtxq_unbusy(&uq->uq_key);
+		}
+	}
+	umtxq_unlock(&uq->uq_key);
+
+	mtx_lock_spin(&umtx_lock);
+	uq->uq_pi_blocked = NULL;
+	thread_lock(td);
+	td->td_flags &= ~TDF_UPIBLOCKED;
+	thread_unlock(td);
+	TAILQ_REMOVE(&pi->pi_blocked, uq, uq_lockq);
+	umtx_unpropagate_priority(pi);
+	mtx_unlock_spin(&umtx_lock);
+
+	umtxq_lock(&uq->uq_key);
+
+	return (error);
+}
+
+/*
+ * Add reference count for a PI mutex.
+ */
+static void
+umtx_pi_ref(struct umtx_pi *pi)
+{
+	struct umtxq_chain *uc;
+
+	uc = umtxq_getchain(&pi->pi_key);
+	UMTXQ_LOCKED_ASSERT(uc);
+	pi->pi_refcount++;
+}
+
+/*
+ * Decrease the reference count of a PI mutex; when the count
+ * drops to zero, its memory is freed.
+ */ 
+static void
+umtx_pi_unref(struct umtx_pi *pi)
+{
+	struct umtxq_chain *uc;
+	int free = 0;
+
+	uc = umtxq_getchain(&pi->pi_key);
+	UMTXQ_LOCKED_ASSERT(uc);
+	KASSERT(pi->pi_refcount > 0, ("invalid reference count"));
+	if (--pi->pi_refcount == 0) {
+		mtx_lock_spin(&umtx_lock);
+		if (pi->pi_owner != NULL) {
+			TAILQ_REMOVE(&pi->pi_owner->td_umtxq->uq_pi_contested,
+				pi, pi_link);
+			pi->pi_owner = NULL;
+		}
+		KASSERT(TAILQ_EMPTY(&pi->pi_blocked),
+			("blocked queue not empty"));
+		mtx_unlock_spin(&umtx_lock);
+		TAILQ_REMOVE(&uc->uc_pi_list, pi, pi_hashlink);
+		free = 1;
+	}
+	if (free)
+		umtx_pi_free(pi);
+}
+
+/*
+ * Find a PI mutex in hash table.
+ */
+static struct umtx_pi *
+umtx_pi_lookup(struct umtx_key *key)
+{
+	struct umtxq_chain *uc;
+	struct umtx_pi *pi;
+
+	uc = umtxq_getchain(key);
+	UMTXQ_LOCKED_ASSERT(uc);
+
+	TAILQ_FOREACH(pi, &uc->uc_pi_list, pi_hashlink) {
+		if (umtx_key_match(&pi->pi_key, key)) {
+			return (pi);
+		}
+	}
+	return (NULL);
+}
+
+/*
+ * Insert a PI mutex into hash table.
+ */
+static inline void
+umtx_pi_insert(struct umtx_pi *pi)
+{
+	struct umtxq_chain *uc;
+
+	uc = umtxq_getchain(&pi->pi_key);
+	UMTXQ_LOCKED_ASSERT(uc);
+	TAILQ_INSERT_TAIL(&uc->uc_pi_list, pi, pi_hashlink);
+}
+
+/*
+ * Lock a PI mutex.
+ */
+static int
+_do_lock_pi(struct thread *td, struct umutex *m, uint32_t flags, int timo,
+	int try)
+{
+	struct umtx_q *uq;
+	struct umtx_pi *pi, *new_pi;
+	uint32_t id, owner, old;
+	int error;
+
+	id = td->td_tid;
+	uq = td->td_umtxq;
+
+	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
+	    &uq->uq_key)) != 0)
+		return (error);
+	umtxq_lock(&uq->uq_key);
+	pi = umtx_pi_lookup(&uq->uq_key);
+	if (pi == NULL) {
+		new_pi = umtx_pi_alloc(M_NOWAIT);
+		if (new_pi == NULL) {
+			umtxq_unlock(&uq->uq_key);
+			new_pi = umtx_pi_alloc(M_WAITOK);
+			new_pi->pi_key = uq->uq_key;
+			umtxq_lock(&uq->uq_key);
+			pi = umtx_pi_lookup(&uq->uq_key);
+			if (pi != NULL) {
+				umtx_pi_free(new_pi);
+				new_pi = NULL;
+			}
+		}
+		if (new_pi != NULL) {
+			new_pi->pi_key = uq->uq_key;
+			umtx_pi_insert(new_pi);
+			pi = new_pi;
+		}
+	}
+	umtx_pi_ref(pi);
+	umtxq_unlock(&uq->uq_key);
+
+	/*
+	 * Care must be exercised when dealing with umtx structure.  It
+	 * can fault on any access.
+	 */
+	for (;;) {
+		/*
+		 * Try the uncontested case.  This should be done in userland.
+		 */
+		owner = casuword32(&m->m_owner, UMUTEX_UNOWNED, id);
+
+		/* The acquire succeeded. */
+		if (owner == UMUTEX_UNOWNED) {
+			error = 0;
+			break;
+		}
+
+		/* The address was invalid. */
+		if (owner == -1) {
+			error = EFAULT;
+			break;
+		}
+
+		/* If no one owns it but it is contested try to acquire it. */
+		if (owner == UMUTEX_CONTESTED) {
+			owner = casuword32(&m->m_owner,
+			    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
+
+			if (owner == UMUTEX_CONTESTED) {
+				umtxq_lock(&uq->uq_key);
+				error = umtx_pi_claim(pi, td);
+				umtxq_unlock(&uq->uq_key);
+				break;
+			}
+
+			/* The address was invalid. */
+			if (owner == -1) {
+				error = EFAULT;
+				break;
+			}
+
+			/* If this failed the lock has changed, restart. */
+			continue;
+		}
+
+		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
+		    (owner & ~UMUTEX_CONTESTED) == id) {
+			error = EDEADLK;
+			break;
+		}
+
+		if (try != 0) {
+			error = EBUSY;
+			break;
+		}
+
+		/*
+		 * If we caught a signal, we have retried and now
+		 * exit immediately.
+		 */
+		if (error != 0)
+			break;
+			
+		umtxq_lock(&uq->uq_key);
+		umtxq_busy(&uq->uq_key);
+		umtxq_unlock(&uq->uq_key);
+
+		/*
+		 * Set the contested bit so that a release in user space
+		 * knows to use the system call for unlock.  If this fails
+		 * either some one else has acquired the lock or it has been
+		 * released.
+		 */
+		old = casuword32(&m->m_owner, owner, owner | UMUTEX_CONTESTED);
+
+		/* The address was invalid. */
+		if (old == -1) {
+			umtxq_lock(&uq->uq_key);
+			umtxq_unbusy(&uq->uq_key);
+			umtxq_unlock(&uq->uq_key);
+			error = EFAULT;
+			break;
+		}
+
+		umtxq_lock(&uq->uq_key);
+		umtxq_unbusy(&uq->uq_key);
+		/*
+		 * We set the contested bit, sleep. Otherwise the lock changed
+		 * and we need to retry or we lost a race to the thread
+		 * unlocking the umtx.
+		 */
+		if (old == owner)
+			error = umtxq_sleep_pi(uq, pi, owner & ~UMUTEX_CONTESTED,
+				 "umtxpi", timo);
+		umtxq_unlock(&uq->uq_key);
+	}
+
+	umtxq_lock(&uq->uq_key);
+	umtx_pi_unref(pi);
+	umtxq_unlock(&uq->uq_key);
+
+	umtx_key_release(&uq->uq_key);
+	return (error);
+}
+
+/*
+ * Unlock a PI mutex.
+ */
+static int
+do_unlock_pi(struct thread *td, struct umutex *m, uint32_t flags)
+{
+	struct umtx_key key;
+	struct umtx_q *uq_first, *uq_first2, *uq_me;
+	struct umtx_pi *pi, *pi2;
+	uint32_t owner, old, id;
+	int error;
+	int count;
+	int pri;
+
+	id = td->td_tid;
+	/*
+	 * Make sure we own this mtx.
+	 */
+	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
+	if (owner == -1)
+		return (EFAULT);
+
+	if ((owner & ~UMUTEX_CONTESTED) != id)
+		return (EPERM);
+
+	/* This should be done in userland */
+	if ((owner & UMUTEX_CONTESTED) == 0) {
+		old = casuword32(&m->m_owner, owner, UMUTEX_UNOWNED);
+		if (old == -1)
+			return (EFAULT);
+		if (old == owner)
+			return (0);
+		owner = old;
+	}
+
+	/* We should only ever be in here for contested locks */
+	if ((error = umtx_key_get(m, TYPE_PI_UMUTEX, GET_SHARE(flags),
+	    &key)) != 0)
+		return (error);
+
+	umtxq_lock(&key);
+	umtxq_busy(&key);
+	count = umtxq_count_pi(&key, &uq_first);
+	if (uq_first != NULL) {
+		pi = uq_first->uq_pi_blocked;
+		if (pi->pi_owner != curthread) {
+			umtxq_unbusy(&key);
+			umtxq_unlock(&key);
+			/* userland messed up the mutex */
+			return (EPERM);
+		}
+		uq_me = curthread->td_umtxq;
+		mtx_lock_spin(&umtx_lock);
+		pi->pi_owner = NULL;
+		TAILQ_REMOVE(&uq_me->uq_pi_contested, pi, pi_link);
+		uq_first = TAILQ_FIRST(&pi->pi_blocked);
+		pri = PRI_MAX;
+		TAILQ_FOREACH(pi2, &uq_me->uq_pi_contested, pi_link) {
+			uq_first2 = TAILQ_FIRST(&pi2->pi_blocked);
+			if (uq_first2 != NULL) {
+				if (pri > UPRI(uq_first2->uq_thread))
+					pri = UPRI(uq_first2->uq_thread);
+			}
+		}
+		thread_lock(curthread);
+		sched_unlend_user_prio(curthread, pri);
+		thread_unlock(curthread);
+		mtx_unlock_spin(&umtx_lock);
+	}
+	umtxq_unlock(&key);
+
+	/*
+	 * When unlocking the umtx, it must be marked as unowned if
+	 * there is zero or one thread only waiting for it.
+	 * Otherwise, it must be marked as contested.
+	 */
+	old = casuword32(&m->m_owner, owner,
+		count <= 1 ? UMUTEX_UNOWNED : UMUTEX_CONTESTED);
+
+	umtxq_lock(&key);
+	if (uq_first != NULL)
+		umtxq_signal_thread(uq_first);
+	umtxq_unbusy(&key);
+	umtxq_unlock(&key);
+	umtx_key_release(&key);
+	if (old == -1)
+		return (EFAULT);
+	if (old != owner)
+		return (EINVAL);
+	return (0);
+}
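
For context, the PI paths above are normally reached through PTHREAD_PRIO_INHERIT mutexes. The fragment below is an editorial illustration using only standard pthread calls, not part of the patch; error handling is omitted.

#include <pthread.h>

static pthread_mutex_t pi_mtx;

static void
init_pi_mutex(void)
{
	pthread_mutexattr_t attr;

	pthread_mutexattr_init(&attr);
	/* Selects the priority-inheritance protocol (the _do_lock_pi() path). */
	pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_INHERIT);
	pthread_mutex_init(&pi_mtx, &attr);
	pthread_mutexattr_destroy(&attr);
}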
+
+/*
+ * Lock a PP mutex.
+ */
+static int
+_do_lock_pp(struct thread *td, struct umutex *m, uint32_t flags, int timo,
+	int try)
+{
+	struct umtx_q *uq, *uq2;
+	struct umtx_pi *pi;
+	uint32_t ceiling;
+	uint32_t owner, id;
+	int error, pri, old_inherited_pri, su;
+
+	id = td->td_tid;
+	uq = td->td_umtxq;
+	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
+	    &uq->uq_key)) != 0)
+		return (error);
+	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
+	for (;;) {
+		old_inherited_pri = uq->uq_inherited_pri;
+		umtxq_lock(&uq->uq_key);
+		umtxq_busy(&uq->uq_key);
+		umtxq_unlock(&uq->uq_key);
+
+		ceiling = RTP_PRIO_MAX - fuword32(&m->m_ceilings[0]);
+		if (ceiling > RTP_PRIO_MAX) {
+			error = EINVAL;
+			goto out;
+		}
+
+		mtx_lock_spin(&umtx_lock);
+		if (UPRI(td) < PRI_MIN_REALTIME + ceiling) {
+			mtx_unlock_spin(&umtx_lock);
+			error = EINVAL;
+			goto out;
+		}
+		if (su && PRI_MIN_REALTIME + ceiling < uq->uq_inherited_pri) {
+			uq->uq_inherited_pri = PRI_MIN_REALTIME + ceiling;
+			thread_lock(td);
+			if (uq->uq_inherited_pri < UPRI(td))
+				sched_lend_user_prio(td, uq->uq_inherited_pri);
+			thread_unlock(td);
+		}
+		mtx_unlock_spin(&umtx_lock);
+
+		owner = casuword32(&m->m_owner,
+		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
+
+		if (owner == UMUTEX_CONTESTED) {
+			error = 0;
+			break;
+		}
+
+		/* The address was invalid. */
+		if (owner == -1) {
+			error = EFAULT;
+			break;
+		}
+
+		if ((flags & UMUTEX_ERROR_CHECK) != 0 &&
+		    (owner & ~UMUTEX_CONTESTED) == id) {
+			error = EDEADLK;
+			break;
+		}
+
+		if (try != 0) {
+			error = EBUSY;
+			break;
+		}
+
+		/*
+		 * If we caught a signal, we have retried and now
+		 * exit immediately.
+		 */
+		if (error != 0)
+			break;
+
+		umtxq_lock(&uq->uq_key);
+		umtxq_insert(uq);
+		umtxq_unbusy(&uq->uq_key);
+		error = umtxq_sleep(uq, "umtxpp", timo);
+		umtxq_remove(uq);
+		umtxq_unlock(&uq->uq_key);
+
+		mtx_lock_spin(&umtx_lock);
+		uq->uq_inherited_pri = old_inherited_pri;
+		pri = PRI_MAX;
+		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
+			uq2 = TAILQ_FIRST(&pi->pi_blocked);
+			if (uq2 != NULL) {
+				if (pri > UPRI(uq2->uq_thread))
+					pri = UPRI(uq2->uq_thread);
+			}
+		}
+		if (pri > uq->uq_inherited_pri)
+			pri = uq->uq_inherited_pri;
+		thread_lock(td);
+		sched_unlend_user_prio(td, pri);
+		thread_unlock(td);
+		mtx_unlock_spin(&umtx_lock);
+	}
+
+	if (error != 0) {
+		mtx_lock_spin(&umtx_lock);
+		uq->uq_inherited_pri = old_inherited_pri;
+		pri = PRI_MAX;
+		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
+			uq2 = TAILQ_FIRST(&pi->pi_blocked);
+			if (uq2 != NULL) {
+				if (pri > UPRI(uq2->uq_thread))
+					pri = UPRI(uq2->uq_thread);
+			}
+		}
+		if (pri > uq->uq_inherited_pri)
+			pri = uq->uq_inherited_pri;
+		thread_lock(td);
+		sched_unlend_user_prio(td, pri);
+		thread_unlock(td);
+		mtx_unlock_spin(&umtx_lock);
+	}
+
+out:
+	umtxq_lock(&uq->uq_key);
+	umtxq_unbusy(&uq->uq_key);
+	umtxq_unlock(&uq->uq_key);
+	umtx_key_release(&uq->uq_key);
+	return (error);
+}
+
+/*
+ * Unlock a PP mutex.
+ */
+static int
+do_unlock_pp(struct thread *td, struct umutex *m, uint32_t flags)
+{
+	struct umtx_key key;
+	struct umtx_q *uq, *uq2;
+	struct umtx_pi *pi;
+	uint32_t owner, id;
+	uint32_t rceiling;
+	int error, pri, new_inherited_pri, su;
+
+	id = td->td_tid;
+	uq = td->td_umtxq;
+	su = (priv_check(td, PRIV_SCHED_RTPRIO) == 0);
+
+	/*
+	 * Make sure we own this mtx.
+	 */
+	owner = fuword32(__DEVOLATILE(uint32_t *, &m->m_owner));
+	if (owner == -1)
+		return (EFAULT);
+
+	if ((owner & ~UMUTEX_CONTESTED) != id)
+		return (EPERM);
+
+	error = copyin(&m->m_ceilings[1], &rceiling, sizeof(uint32_t));
+	if (error != 0)
+		return (error);
+
+	if (rceiling == -1)
+		new_inherited_pri = PRI_MAX;
+	else {
+		rceiling = RTP_PRIO_MAX - rceiling;
+		if (rceiling > RTP_PRIO_MAX)
+			return (EINVAL);
+		new_inherited_pri = PRI_MIN_REALTIME + rceiling;
+	}
+
+	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
+	    &key)) != 0)
+		return (error);
+	umtxq_lock(&key);
+	umtxq_busy(&key);
+	umtxq_unlock(&key);
+	/*
+	 * For a priority-protected mutex, always set the unlocked state
+	 * to UMUTEX_CONTESTED so that userland always enters the kernel
+	 * to lock the mutex.  This is necessary because the thread
+	 * priority has to be adjusted for such a mutex.
+	 */
+	error = suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
+		UMUTEX_CONTESTED);
+
+	umtxq_lock(&key);
+	if (error == 0)
+		umtxq_signal(&key, 1);
+	umtxq_unbusy(&key);
+	umtxq_unlock(&key);
+
+	if (error == -1)
+		error = EFAULT;
+	else {
+		mtx_lock_spin(&umtx_lock);
+		if (su != 0)
+			uq->uq_inherited_pri = new_inherited_pri;
+		pri = PRI_MAX;
+		TAILQ_FOREACH(pi, &uq->uq_pi_contested, pi_link) {
+			uq2 = TAILQ_FIRST(&pi->pi_blocked);
+			if (uq2 != NULL) {
+				if (pri > UPRI(uq2->uq_thread))
+					pri = UPRI(uq2->uq_thread);
+			}
+		}
+		if (pri > uq->uq_inherited_pri)
+			pri = uq->uq_inherited_pri;
+		thread_lock(td);
+		sched_unlend_user_prio(td, pri);
+		thread_unlock(td);
+		mtx_unlock_spin(&umtx_lock);
+	}
+	umtx_key_release(&key);
+	return (error);
+}
+
+static int
+do_set_ceiling(struct thread *td, struct umutex *m, uint32_t ceiling,
+	uint32_t *old_ceiling)
+{
+	struct umtx_q *uq;
+	uint32_t save_ceiling;
+	uint32_t owner, id;
+	uint32_t flags;
+	int error;
+
+	flags = fuword32(&m->m_flags);
+	if ((flags & UMUTEX_PRIO_PROTECT) == 0)
+		return (EINVAL);
+	if (ceiling > RTP_PRIO_MAX)
+		return (EINVAL);
+	id = td->td_tid;
+	uq = td->td_umtxq;
+	if ((error = umtx_key_get(m, TYPE_PP_UMUTEX, GET_SHARE(flags),
+	   &uq->uq_key)) != 0)
+		return (error);
+	for (;;) {
+		umtxq_lock(&uq->uq_key);
+		umtxq_busy(&uq->uq_key);
+		umtxq_unlock(&uq->uq_key);
+
+		save_ceiling = fuword32(&m->m_ceilings[0]);
+
+		owner = casuword32(&m->m_owner,
+		    UMUTEX_CONTESTED, id | UMUTEX_CONTESTED);
+
+		if (owner == UMUTEX_CONTESTED) {
+			suword32(&m->m_ceilings[0], ceiling);
+			suword32(__DEVOLATILE(uint32_t *, &m->m_owner),
+				UMUTEX_CONTESTED);
+			error = 0;
+			break;
+		}
+
+		/* The address was invalid. */
+		if (owner == -1) {
+			error = EFAULT;
+			break;
+		}
+
+		if ((owner & ~UMUTEX_CONTESTED) == id) {
+			suword32(&m->m_ceilings[0], ceiling);
+			error = 0;
+			break;
+		}
+
+		/*
+		 * If we caught a signal, we have retried and now
+		 * exit immediately.
+		 */
+		if (error != 0)
+			break;
+
+		/*
+		 * We set the contested bit, sleep. Otherwise the lock changed
+		 * and we need to retry or we lost a race to the thread
+		 * unlocking the umtx.
+		 */
+		umtxq_lock(&uq->uq_key);
+		umtxq_insert(uq);
+		umtxq_unbusy(&uq->uq_key);
+		error = umtxq_sleep(uq, "umtxpp", 0);
+		umtxq_remove(uq);
+		umtxq_unlock(&uq->uq_key);
+	}
+	umtxq_lock(&uq->uq_key);
+	if (error == 0)
+		umtxq_signal(&uq->uq_key, INT_MAX);
+	umtxq_unbusy(&uq->uq_key);
+	umtxq_unlock(&uq->uq_key);
+	umtx_key_release(&uq->uq_key);
+	if (error == 0 && old_ceiling != NULL)
+		suword32(old_ceiling, save_ceiling);
+	return (error);
+}
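
Similarly, do_set_ceiling() and the PP lock/unlock paths back the POSIX priority-ceiling interface. The editorial sketch below uses standard pthread calls and is not part of the patch; the ceiling values are arbitrary and error handling is omitted.

#include <pthread.h>

static pthread_mutex_t pp_mtx;

static void
init_pp_mutex(void)
{
	pthread_mutexattr_t attr;
	int old;

	pthread_mutexattr_init(&attr);
	/* Priority-protect protocol (the _do_lock_pp()/do_unlock_pp() path). */
	pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_PROTECT);
	pthread_mutexattr_setprioceiling(&attr, 20);
	pthread_mutex_init(&pp_mtx, &attr);
	pthread_mutexattr_destroy(&attr);

	/* Raising the ceiling later ends up in do_set_ceiling(). */
	pthread_mutex_setprioceiling(&pp_mtx, 25, &old);
}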
+
+static int
+_do_lock_umutex(struct thread *td, struct umutex *m, int flags, int timo,
+	int try)
+{
+	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
+	case 0:
+		return (_do_lock_normal(td, m, flags, timo, try));
+	case UMUTEX_PRIO_INHERIT:
+		return (_do_lock_pi(td, m, flags, timo, try));
+	case UMUTEX_PRIO_PROTECT:
+		return (_do_lock_pp(td, m, flags, timo, try));
+	}
+	return (EINVAL);
+}
+
+/*
+ * Lock a userland POSIX mutex.
+ */
+static int
+do_lock_umutex(struct thread *td, struct umutex *m,
+	struct timespec *timeout, int try)
+{
+	struct timespec ts, ts2, ts3;
+	struct timeval tv;
+	uint32_t flags;
+	int error;
+
+	flags = fuword32(&m->m_flags);
+	if (flags == -1)
+		return (EFAULT);
+
+	if (timeout == NULL) {
+		error = _do_lock_umutex(td, m, flags, 0, try);
+		/* Mutex locking is restarted if it is interrupted. */
+		if (error == EINTR)
+			error = ERESTART;
+	} else {
+		getnanouptime(&ts);
+		timespecadd(&ts, timeout);
+		TIMESPEC_TO_TIMEVAL(&tv, timeout);
+		for (;;) {
+			error = _do_lock_umutex(td, m, flags, tvtohz(&tv), try);
+			if (error != ETIMEDOUT)
+				break;
+			getnanouptime(&ts2);
+			if (timespeccmp(&ts2, &ts, >=)) {
+				error = ETIMEDOUT;
+				break;
+			}
+			ts3 = ts;
+			timespecsub(&ts3, &ts2);
+			TIMESPEC_TO_TIMEVAL(&tv, &ts3);
+		}
+		/* Timed-locking is not restarted. */
+		if (error == ERESTART)
+			error = EINTR;
+	}
+	return (error);
+}
+
+/*
+ * Unlock a userland POSIX mutex.
+ */
+static int
+do_unlock_umutex(struct thread *td, struct umutex *m)
+{
+	uint32_t flags;
+
+	flags = fuword32(&m->m_flags);
+	if (flags == -1)
+		return (EFAULT);
+
+	switch(flags & (UMUTEX_PRIO_INHERIT | UMUTEX_PRIO_PROTECT)) {
+	case 0:
+		return (do_unlock_normal(td, m, flags));
+	case UMUTEX_PRIO_INHERIT:
+		return (do_unlock_pi(td, m, flags));
+	case UMUTEX_PRIO_PROTECT:
+		return (do_unlock_pp(td, m, flags));
+	}
+
+	return (EINVAL);
+}
+
+static int
+do_cv_wait(struct thread *td, struct ucond *cv, struct umutex *m,
+	struct timespec *timeout, u_long wflags)
+{
+	struct umtx_q *uq;
+	struct timeval tv;
+	struct timespec cts, ets, tts;
+	uint32_t flags;
+	int error;
+
+	uq = td->td_umtxq;
+	flags = fuword32(&cv->c_flags);
+	error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &uq->uq_key);
+	if (error != 0)
+		return (error);
+	umtxq_lock(&uq->uq_key);
+	umtxq_busy(&uq->uq_key);
+	umtxq_insert(uq);
+	umtxq_unlock(&uq->uq_key);
+
+	/*
+	 * The key point is that c_has_waiters must be set to 1 before
+	 * the user mutex is released.
+	 */
+	suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 1);
+
+	umtxq_lock(&uq->uq_key);
+	umtxq_unbusy(&uq->uq_key);
+	umtxq_unlock(&uq->uq_key);
+
+	error = do_unlock_umutex(td, m);
+	
+	umtxq_lock(&uq->uq_key);
+	if (error == 0) {
+		if ((wflags & UMTX_CHECK_UNPARKING) &&
+		    (td->td_pflags & TDP_WAKEUP)) {
+			td->td_pflags &= ~TDP_WAKEUP;
+			error = EINTR;
+		} else if (timeout == NULL) {
+			error = umtxq_sleep(uq, "ucond", 0);
+		} else {
+			getnanouptime(&ets);
+			timespecadd(&ets, timeout);
+			TIMESPEC_TO_TIMEVAL(&tv, timeout);
+			for (;;) {
+				error = umtxq_sleep(uq, "ucond", tvtohz(&tv));
+				if (error != ETIMEDOUT)
+					break;
+				getnanouptime(&cts);
+				if (timespeccmp(&cts, &ets, >=)) {
+					error = ETIMEDOUT;
+					break;
+				}
+				tts = ets;
+				timespecsub(&tts, &cts);
+				TIMESPEC_TO_TIMEVAL(&tv, &tts);
+			}
+		}
+	}
+
+	if (error != 0) {
+		if ((uq->uq_flags & UQF_UMTXQ) == 0) {
+			/*
+			 * If we were concurrently woken by do_cv_signal()
+			 * but also got an error, a UNIX signal or a timeout,
+			 * perform another umtxq_signal() to avoid consuming
+			 * the wakeup.  This may cause a spurious wakeup for
+			 * another thread which was just queued, but SUSv3
+			 * explicitly allows spurious wakeups to occur, and
+			 * indeed a kernel-based implementation cannot avoid
+			 * them.
+			 */ 
+			if (!umtxq_signal(&uq->uq_key, 1))
+				error = 0;
+		}
+		if (error == ERESTART)
+			error = EINTR;
+	}
+	umtxq_remove(uq);
+	umtxq_unlock(&uq->uq_key);
+	umtx_key_release(&uq->uq_key);
+	return (error);
+}
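
Because do_cv_wait() may deliver spurious wakeups, as the comment above explains, userland condition waits must recheck their predicate in a loop. A standard POSIX illustration follows; it is editorial and not part of the patch.

#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int ready;

static void
wait_for_ready(void)
{
	pthread_mutex_lock(&lock);
	while (!ready)			/* tolerate spurious wakeups */
		pthread_cond_wait(&cond, &lock);
	pthread_mutex_unlock(&lock);
}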
+
+/*
+ * Signal a userland condition variable.
+ */
+static int
+do_cv_signal(struct thread *td, struct ucond *cv)
+{
+	struct umtx_key key;
+	int error, cnt, nwake;
+	uint32_t flags;
+
+	flags = fuword32(&cv->c_flags);
+	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
+		return (error);	
+	umtxq_lock(&key);
+	umtxq_busy(&key);
+	cnt = umtxq_count(&key);
+	nwake = umtxq_signal(&key, 1);
+	if (cnt <= nwake) {
+		umtxq_unlock(&key);
+		error = suword32(
+		    __DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
+		umtxq_lock(&key);
+	}
+	umtxq_unbusy(&key);
+	umtxq_unlock(&key);
+	umtx_key_release(&key);
+	return (error);
+}
+
+static int
+do_cv_broadcast(struct thread *td, struct ucond *cv)
+{
+	struct umtx_key key;
+	int error;
+	uint32_t flags;
+
+	flags = fuword32(&cv->c_flags);
+	if ((error = umtx_key_get(cv, TYPE_CV, GET_SHARE(flags), &key)) != 0)
+		return (error);	
+
+	umtxq_lock(&key);
+	umtxq_busy(&key);
+	umtxq_signal(&key, INT_MAX);
+	umtxq_unlock(&key);
+
+	error = suword32(__DEVOLATILE(uint32_t *, &cv->c_has_waiters), 0);
+
+	umtxq_lock(&key);
+	umtxq_unbusy(&key);
+	umtxq_unlock(&key);
+
+	umtx_key_release(&key);
+	return (error);
+}
+
+int
+_umtx_lock(struct thread *td, struct _umtx_lock_args *uap)
+    /* struct umtx *umtx */
+{
+	return _do_lock_umtx(td, uap->umtx, td->td_tid, 0);
+}
+
+int
+_umtx_unlock(struct thread *td, struct _umtx_unlock_args *uap)
+    /* struct umtx *umtx */
+{
+	return do_unlock_umtx(td, uap->umtx, td->td_tid);
+}
+
+static int
+__umtx_op_lock_umtx(struct thread *td, struct _umtx_op_args *uap)
+{
+	struct timespec *ts, timeout;
+	int error;
+
+	/* Allow a null timespec (wait forever). */
+	if (uap->uaddr2 == NULL)
+		ts = NULL;
+	else {
+		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
+		if (error != 0)
+			return (error);
+		if (timeout.tv_nsec >= 1000000000 ||
+		    timeout.tv_nsec < 0) {
+			return (EINVAL);
+		}
+		ts = &timeout;
+	}
+	return (do_lock_umtx(td, uap->obj, uap->val, ts));
+}
+
+static int
+__umtx_op_unlock_umtx(struct thread *td, struct _umtx_op_args *uap)
+{
+	return (do_unlock_umtx(td, uap->obj, uap->val));
+}
+
+static int
+__umtx_op_wait(struct thread *td, struct _umtx_op_args *uap)
+{
+	struct timespec *ts, timeout;
+	int error;
+
+	if (uap->uaddr2 == NULL)
+		ts = NULL;
+	else {
+		error = copyin(uap->uaddr2, &timeout, sizeof(timeout));
+		if (error != 0)
+			return (error);
+		if (timeout.tv_nsec >= 1000000000 ||
+		    timeout.tv_nsec < 0)
+			return (EINVAL);
+		ts = &timeout;
+	}
+	return do_wait(td, uap->obj, uap->val, ts, 0);
+}
+
+static int
+__umtx_op_wake(struct thread *td, struct _umtx_op_args *uap)
+{
+	return (kern_umtx_wake(td, uap->obj, uap->val));
+}
+
+static int
+__umtx_op_lock_umutex(struct thread *td, struct _umtx_op_args *uap)
+{
+	struct timespec *ts, timeout;
+	int error;
+
+	/* Allow a null timespec (wait forever). */
+	if (uap->uaddr2 == NULL)
+		ts = NULL;
+	else {
+		error = copyin(uap->uaddr2, &timeout,
+		    sizeof(timeout));
+		if (error != 0)
+			return (error);
+		if (timeout.tv_nsec >= 1000000000 ||
+		    timeout.tv_nsec < 0) {
+			return (EINVAL);
+		}
+		ts = &timeout;
+	}
+	return do_lock_umutex(td, uap->obj, ts, 0);
+}
+
+static int
+__umtx_op_trylock_umutex(struct thread *td, struct _umtx_op_args *uap)
+{
+	return do_lock_umutex(td, uap->obj, NULL, 1);
+}
+
+static int
+__umtx_op_unlock_umutex(struct thread *td, struct _umtx_op_args *uap)
+{
+	return do_unlock_umutex(td, uap->obj);
+}
+
+static int
+__umtx_op_set_ceiling(struct thread *td, struct _umtx_op_args *uap)
+{
+	return do_set_ceiling(td, uap->obj, uap->val, uap->uaddr1);
+}
+
+static int
+__umtx_op_cv_wait(struct thread *td, struct _umtx_op_args *uap)
+{
+	struct timespec *ts, timeout;
+	int error;
+
+	/* Allow a null timespec (wait forever). */
+	if (uap->uaddr2 == NULL)
+		ts = NULL;
+	else {
+		error = copyin(uap->uaddr2, &timeout,
+		    sizeof(timeout));
+		if (error != 0)
+			return (error);
+		if (timeout.tv_nsec >= 1000000000 ||
+		    timeout.tv_nsec < 0) {
+			return (EINVAL);
+		}
+		ts = &timeout;
+	}
+	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
+}
+
+static int
+__umtx_op_cv_signal(struct thread *td, struct _umtx_op_args *uap)
+{
+	return do_cv_signal(td, uap->obj);
+}
+
+static int
+__umtx_op_cv_broadcast(struct thread *td, struct _umtx_op_args *uap)
+{
+	return do_cv_broadcast(td, uap->obj);
+}
+
+typedef int (*_umtx_op_func)(struct thread *td, struct _umtx_op_args *uap);
+
+static _umtx_op_func op_table[] = {
+	__umtx_op_lock_umtx,		/* UMTX_OP_LOCK */
+	__umtx_op_unlock_umtx,		/* UMTX_OP_UNLOCK */
+	__umtx_op_wait,			/* UMTX_OP_WAIT */
+	__umtx_op_wake,			/* UMTX_OP_WAKE */
+	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
+	__umtx_op_lock_umutex,		/* UMTX_OP_MUTEX_LOCK */
+	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK */
+	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
+	__umtx_op_cv_wait,		/* UMTX_OP_CV_WAIT*/
+	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
+	__umtx_op_cv_broadcast		/* UMTX_OP_CV_BROADCAST */
+};
+
+int
+_umtx_op(struct thread *td, struct _umtx_op_args *uap)
+{
+	if ((unsigned)uap->op < UMTX_OP_MAX)
+		return (*op_table[uap->op])(td, uap);
+	return (EINVAL);
+}
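
To make the dispatch table concrete, here is a minimal futex-style wait/wake pair built on the _umtx_op() system call. This is an editorial sketch, not part of the patch; it assumes the userland prototype of _umtx_op() and the UMTX_OP_* constants exported by <sys/umtx.h>, and it ignores errors.

#include <sys/types.h>
#include <sys/umtx.h>

static void
futex_style_wait(u_long *addr, u_long expected)
{
	/* Sleeps only while *addr still equals 'expected' (see do_wait()). */
	_umtx_op(addr, UMTX_OP_WAIT, expected, NULL, NULL);
}

static void
futex_style_wake(u_long *addr, int nwake)
{
	/* Wakes up to 'nwake' threads sleeping on addr (kern_umtx_wake()). */
	_umtx_op(addr, UMTX_OP_WAKE, nwake, NULL, NULL);
}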
+
+#ifdef COMPAT_IA32
+int
+freebsd32_umtx_lock(struct thread *td, struct freebsd32_umtx_lock_args *uap)
+    /* struct umtx *umtx */
+{
+	return (do_lock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid, NULL));
+}
+
+int
+freebsd32_umtx_unlock(struct thread *td, struct freebsd32_umtx_unlock_args *uap)
+    /* struct umtx *umtx */
+{
+	return (do_unlock_umtx32(td, (uint32_t *)uap->umtx, td->td_tid));
+}
+
+struct timespec32 {
+	u_int32_t tv_sec;
+	u_int32_t tv_nsec;
+};
+
+static inline int
+copyin_timeout32(void *addr, struct timespec *tsp)
+{
+	struct timespec32 ts32;
+	int error;
+
+	error = copyin(addr, &ts32, sizeof(struct timespec32));
+	if (error == 0) {
+		tsp->tv_sec = ts32.tv_sec;
+		tsp->tv_nsec = ts32.tv_nsec;
+	}
+	return (error);
+}
+
+static int
+__umtx_op_lock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
+{
+	struct timespec *ts, timeout;
+	int error;
+
+	/* Allow a null timespec (wait forever). */
+	if (uap->uaddr2 == NULL)
+		ts = NULL;
+	else {
+		error = copyin_timeout32(uap->uaddr2, &timeout);
+		if (error != 0)
+			return (error);
+		if (timeout.tv_nsec >= 1000000000 ||
+		    timeout.tv_nsec < 0) {
+			return (EINVAL);
+		}
+		ts = &timeout;
+	}
+	return (do_lock_umtx32(td, uap->obj, uap->val, ts));
+}
+
+static int
+__umtx_op_unlock_umtx_compat32(struct thread *td, struct _umtx_op_args *uap)
+{
+	return (do_unlock_umtx32(td, uap->obj, (uint32_t)uap->val));
+}
+
+static int
+__umtx_op_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
+{
+	struct timespec *ts, timeout;
+	int error;
+
+	if (uap->uaddr2 == NULL)
+		ts = NULL;
+	else {
+		error = copyin_timeout32(uap->uaddr2, &timeout);
+		if (error != 0)
+			return (error);
+		if (timeout.tv_nsec >= 1000000000 ||
+		    timeout.tv_nsec < 0)
+			return (EINVAL);
+		ts = &timeout;
+	}
+	return do_wait(td, uap->obj, uap->val, ts, 1);
+}
+
+static int
+__umtx_op_lock_umutex_compat32(struct thread *td, struct _umtx_op_args *uap)
+{
+	struct timespec *ts, timeout;
+	int error;
+
+	/* Allow a null timespec (wait forever). */
+	if (uap->uaddr2 == NULL)
+		ts = NULL;
+	else {
+		error = copyin_timeout32(uap->uaddr2, &timeout);
+		if (error != 0)
+			return (error);
+		if (timeout.tv_nsec >= 1000000000 ||
+		    timeout.tv_nsec < 0)
+			return (EINVAL);
+		ts = &timeout;
+	}
+	return do_lock_umutex(td, uap->obj, ts, 0);
+}
+
+static int
+__umtx_op_cv_wait_compat32(struct thread *td, struct _umtx_op_args *uap)
+{
+	struct timespec *ts, timeout;
+	int error;
+
+	/* Allow a null timespec (wait forever). */
+	if (uap->uaddr2 == NULL)
+		ts = NULL;
+	else {
+		error = copyin_timeout32(uap->uaddr2, &timeout);
+		if (error != 0)
+			return (error);
+		if (timeout.tv_nsec >= 1000000000 ||
+		    timeout.tv_nsec < 0)
+			return (EINVAL);
+		ts = &timeout;
+	}
+	return (do_cv_wait(td, uap->obj, uap->uaddr1, ts, uap->val));
+}
+
+static _umtx_op_func op_table_compat32[] = {
+	__umtx_op_lock_umtx_compat32,	/* UMTX_OP_LOCK */
+	__umtx_op_unlock_umtx_compat32,	/* UMTX_OP_UNLOCK */
+	__umtx_op_wait_compat32,	/* UMTX_OP_WAIT */
+	__umtx_op_wake,			/* UMTX_OP_WAKE */
+	__umtx_op_trylock_umutex,	/* UMTX_OP_MUTEX_TRYLOCK */
+	__umtx_op_lock_umutex_compat32,	/* UMTX_OP_MUTEX_LOCK */
+	__umtx_op_unlock_umutex,	/* UMTX_OP_MUTEX_UNLOCK	*/
+	__umtx_op_set_ceiling,		/* UMTX_OP_SET_CEILING */
+	__umtx_op_cv_wait_compat32,	/* UMTX_OP_CV_WAIT*/
+	__umtx_op_cv_signal,		/* UMTX_OP_CV_SIGNAL */
+	__umtx_op_cv_broadcast		/* UMTX_OP_CV_BROADCAST */
+};
+
+int
+freebsd32_umtx_op(struct thread *td, struct freebsd32_umtx_op_args *uap)
+{
+	if ((unsigned)uap->op < UMTX_OP_MAX)
+		return (*op_table_compat32[uap->op])(td,
+			(struct _umtx_op_args *)uap);
+	return (EINVAL);
+}
+#endif
+
+void
+umtx_thread_init(struct thread *td)
+{
+	td->td_umtxq = umtxq_alloc();
+	td->td_umtxq->uq_thread = td;
+}
+
+void
+umtx_thread_fini(struct thread *td)
+{
+	umtxq_free(td->td_umtxq);
+}
+
+/*
+ * Called when a new thread is created, e.g. by fork().
+ */
+void
+umtx_thread_alloc(struct thread *td)
+{
+	struct umtx_q *uq;
+
+	uq = td->td_umtxq;
+	uq->uq_inherited_pri = PRI_MAX;
+
+	KASSERT(uq->uq_flags == 0, ("uq_flags != 0"));
+	KASSERT(uq->uq_thread == td, ("uq_thread != td"));
+	KASSERT(uq->uq_pi_blocked == NULL, ("uq_pi_blocked != NULL"));
+	KASSERT(TAILQ_EMPTY(&uq->uq_pi_contested), ("uq_pi_contested is not empty"));
+}
+
+/*
+ * exec() hook.
+ */
+static void
+umtx_exec_hook(void *arg __unused, struct proc *p __unused,
+	struct image_params *imgp __unused)
+{
+	umtx_thread_cleanup(curthread);
+}
+
+/*
+ * thread_exit() hook.
+ */
+void
+umtx_thread_exit(struct thread *td)
+{
+	umtx_thread_cleanup(td);
+}
+
+/*
+ * clean up umtx data.
+ */
+static void
+umtx_thread_cleanup(struct thread *td)
+{
+	struct umtx_q *uq;
+	struct umtx_pi *pi;
+
+	if ((uq = td->td_umtxq) == NULL)
+		return;
+
+	mtx_lock_spin(&umtx_lock);
+	uq->uq_inherited_pri = PRI_MAX;
+	while ((pi = TAILQ_FIRST(&uq->uq_pi_contested)) != NULL) {
+		pi->pi_owner = NULL;
+		TAILQ_REMOVE(&uq->uq_pi_contested, pi, pi_link);
+	}
+	thread_lock(td);
+	td->td_flags &= ~TDF_UBORROWING;
+	thread_unlock(td);
+	mtx_unlock_spin(&umtx_lock);
 }
Index: kern_sig.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_sig.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/kern_sig.c -L sys/kern/kern_sig.c -u -r1.2 -r1.3
--- sys/kern/kern_sig.c
+++ sys/kern/kern_sig.c
@@ -35,7 +35,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: /repoman/r/ncvs/src/sys/kern/kern_sig.c,v 1.306.2.2 2006/02/27 00:19:40 davidxu Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_sig.c,v 1.349.2.1.2.1 2008/01/19 18:15:05 kib Exp $");
 
 #include "opt_compat.h"
 #include "opt_ktrace.h"
@@ -57,9 +57,9 @@
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
+#include <sys/posix4.h>
 #include <sys/pioctl.h>
 #include <sys/resourcevar.h>
-#include <sys/sched.h>
 #include <sys/sleepqueue.h>
 #include <sys/smp.h>
 #include <sys/stat.h>
@@ -69,14 +69,16 @@
 #include <sys/sysent.h>
 #include <sys/syslog.h>
 #include <sys/sysproto.h>
+#include <sys/timers.h>
 #include <sys/unistd.h>
 #include <sys/wait.h>
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#include <vm/uma.h>
 
 #include <machine/cpu.h>
 
-#if defined (__alpha__) && !defined(COMPAT_43)
-#error "You *really* need COMPAT_43 on the alpha for longjmp(3)"
-#endif
+#include <security/audit/audit.h>
 
 #define	ONSIG	32		/* NSIG for osig* syscalls.  XXX. */
 
@@ -91,18 +93,45 @@
 static void	filt_sigdetach(struct knote *kn);
 static int	filt_signal(struct knote *kn, long hint);
 static struct thread *sigtd(struct proc *p, int sig, int prop);
-static int	kern_sigtimedwait(struct thread *td, sigset_t set,
-				siginfo_t *info, struct timespec *timeout);
-static void	do_tdsignal(struct thread *td, int sig, sigtarget_t target);
+#ifdef KSE
+static int	do_tdsignal(struct proc *, struct thread *, int, ksiginfo_t *);
+#endif
+static void	sigqueue_start(void);
 
+static uma_zone_t	ksiginfo_zone = NULL;
 struct filterops sig_filtops =
 	{ 0, filt_sigattach, filt_sigdetach, filt_signal };
 
-static int	kern_logsigexit = 1;
+int	kern_logsigexit = 1;
 SYSCTL_INT(_kern, KERN_LOGSIGEXIT, logsigexit, CTLFLAG_RW, 
     &kern_logsigexit, 0, 
     "Log processes quitting on abnormal signals to syslog(3)");
 
+static int	kern_forcesigexit = 1;
+SYSCTL_INT(_kern, OID_AUTO, forcesigexit, CTLFLAG_RW,
+    &kern_forcesigexit, 0, "Force trap signal to be handled");
+
+SYSCTL_NODE(_kern, OID_AUTO, sigqueue, CTLFLAG_RW, 0, "POSIX real time signal");
+
+static int	max_pending_per_proc = 128;
+SYSCTL_INT(_kern_sigqueue, OID_AUTO, max_pending_per_proc, CTLFLAG_RW,
+    &max_pending_per_proc, 0, "Max pending signals per proc");
+
+static int	preallocate_siginfo = 1024;
+TUNABLE_INT("kern.sigqueue.preallocate", &preallocate_siginfo);
+SYSCTL_INT(_kern_sigqueue, OID_AUTO, preallocate, CTLFLAG_RD,
+    &preallocate_siginfo, 0, "Preallocated signal memory size");
+
+static int	signal_overflow = 0;
+SYSCTL_INT(_kern_sigqueue, OID_AUTO, overflow, CTLFLAG_RD,
+    &signal_overflow, 0, "Number of signals overflew");
+
+static int	signal_alloc_fail = 0;
+SYSCTL_INT(_kern_sigqueue, OID_AUTO, alloc_fail, CTLFLAG_RD,
+    &signal_alloc_fail, 0, "signals failed to be allocated");
+
+SYSINIT(signal, SI_SUB_P1003_1B, SI_ORDER_FIRST+3, sigqueue_start, NULL);
+
 /*
  * Policy -- Can ucred cr1 send SIGIO to process cr2?
  * Should use cr_cansignal() once cr_cansignal() allows SIGIO and SIGURG
@@ -175,33 +204,372 @@
         SA_KILL|SA_PROC,		/* SIGUSR2 */
 };
 
+static void
+sigqueue_start(void)
+{
+	ksiginfo_zone = uma_zcreate("ksiginfo", sizeof(ksiginfo_t),
+		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+	uma_prealloc(ksiginfo_zone, preallocate_siginfo);
+	p31b_setcfg(CTL_P1003_1B_REALTIME_SIGNALS, _POSIX_REALTIME_SIGNALS);
+	p31b_setcfg(CTL_P1003_1B_RTSIG_MAX, SIGRTMAX - SIGRTMIN + 1);
+	p31b_setcfg(CTL_P1003_1B_SIGQUEUE_MAX, max_pending_per_proc);
+}
+
+ksiginfo_t *
+ksiginfo_alloc(int wait)
+{
+	int flags;
+
+	flags = M_ZERO;
+	if (! wait)
+		flags |= M_NOWAIT;
+	if (ksiginfo_zone != NULL)
+		return ((ksiginfo_t *)uma_zalloc(ksiginfo_zone, flags));
+	return (NULL);
+}
+
+void
+ksiginfo_free(ksiginfo_t *ksi)
+{
+	uma_zfree(ksiginfo_zone, ksi);
+}
+
+static __inline int
+ksiginfo_tryfree(ksiginfo_t *ksi)
+{
+	if (!(ksi->ksi_flags & KSI_EXT)) {
+		uma_zfree(ksiginfo_zone, ksi);
+		return (1);
+	}
+	return (0);
+}
+
+void
+sigqueue_init(sigqueue_t *list, struct proc *p)
+{
+	SIGEMPTYSET(list->sq_signals);
+	SIGEMPTYSET(list->sq_kill);
+	TAILQ_INIT(&list->sq_list);
+	list->sq_proc = p;
+	list->sq_flags = SQ_INIT;
+}
+
+/*
+ * Get a signal's ksiginfo.
+ * Return:
+ * 	0	-	signal not found
+ *	others	-	signal number
+ */ 
+int
+sigqueue_get(sigqueue_t *sq, int signo, ksiginfo_t *si)
+{
+	struct proc *p = sq->sq_proc;
+	struct ksiginfo *ksi, *next;
+	int count = 0;
+
+	KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited"));
+
+	if (!SIGISMEMBER(sq->sq_signals, signo))
+		return (0);
+
+	if (SIGISMEMBER(sq->sq_kill, signo)) {
+		count++;
+		SIGDELSET(sq->sq_kill, signo);
+	}
+
+	TAILQ_FOREACH_SAFE(ksi, &sq->sq_list, ksi_link, next) {
+		if (ksi->ksi_signo == signo) {
+			if (count == 0) {
+				TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link);
+				ksi->ksi_sigq = NULL;
+				ksiginfo_copy(ksi, si);
+				if (ksiginfo_tryfree(ksi) && p != NULL)
+					p->p_pendingcnt--;
+			}
+			if (++count > 1)
+				break;
+		}
+	}
+
+	if (count <= 1)
+		SIGDELSET(sq->sq_signals, signo);
+	si->ksi_signo = signo;
+	return (signo);
+}
+
+void
+sigqueue_take(ksiginfo_t *ksi)
+{
+	struct ksiginfo *kp;
+	struct proc	*p;
+	sigqueue_t	*sq;
+
+	if (ksi == NULL || (sq = ksi->ksi_sigq) == NULL)
+		return;
+
+	p = sq->sq_proc;
+	TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link);
+	ksi->ksi_sigq = NULL;
+	if (!(ksi->ksi_flags & KSI_EXT) && p != NULL)
+		p->p_pendingcnt--;
+
+	for (kp = TAILQ_FIRST(&sq->sq_list); kp != NULL;
+	     kp = TAILQ_NEXT(kp, ksi_link)) {
+		if (kp->ksi_signo == ksi->ksi_signo)
+			break;
+	}
+	if (kp == NULL && !SIGISMEMBER(sq->sq_kill, ksi->ksi_signo))
+		SIGDELSET(sq->sq_signals, ksi->ksi_signo);
+}
+
+int
+sigqueue_add(sigqueue_t *sq, int signo, ksiginfo_t *si)
+{
+	struct proc *p = sq->sq_proc;
+	struct ksiginfo *ksi;
+	int ret = 0;
+
+	KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited"));
+	
+	if (signo == SIGKILL || signo == SIGSTOP || si == NULL) {
+		SIGADDSET(sq->sq_kill, signo);
+		goto out_set_bit;
+	}
+
+	/* directly insert the ksi, don't copy it */
+	if (si->ksi_flags & KSI_INS) {
+		TAILQ_INSERT_TAIL(&sq->sq_list, si, ksi_link);
+		si->ksi_sigq = sq;
+		goto out_set_bit;
+	}
+
+	if (__predict_false(ksiginfo_zone == NULL)) {
+		SIGADDSET(sq->sq_kill, signo);
+		goto out_set_bit;
+	}
+	
+	if (p != NULL && p->p_pendingcnt >= max_pending_per_proc) {
+		signal_overflow++;
+		ret = EAGAIN;
+	} else if ((ksi = ksiginfo_alloc(0)) == NULL) {
+		signal_alloc_fail++;
+		ret = EAGAIN;
+	} else {
+		if (p != NULL)
+			p->p_pendingcnt++;
+		ksiginfo_copy(si, ksi);
+		ksi->ksi_signo = signo;
+		TAILQ_INSERT_TAIL(&sq->sq_list, ksi, ksi_link);
+		ksi->ksi_sigq = sq;
+	}
+
+	if ((si->ksi_flags & KSI_TRAP) != 0) {
+		if (ret != 0)
+			SIGADDSET(sq->sq_kill, signo);
+		ret = 0;
+		goto out_set_bit;
+	}
+
+	if (ret != 0)
+		return (ret);
+	
+out_set_bit:
+	SIGADDSET(sq->sq_signals, signo);
+	return (ret);
+}
+
+void
+sigqueue_flush(sigqueue_t *sq)
+{
+	struct proc *p = sq->sq_proc;
+	ksiginfo_t *ksi;
+
+	KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited"));
+
+	if (p != NULL)
+		PROC_LOCK_ASSERT(p, MA_OWNED);
+
+	while ((ksi = TAILQ_FIRST(&sq->sq_list)) != NULL) {
+		TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link);
+		ksi->ksi_sigq = NULL;
+		if (ksiginfo_tryfree(ksi) && p != NULL)
+			p->p_pendingcnt--;
+	}
+
+	SIGEMPTYSET(sq->sq_signals);
+	SIGEMPTYSET(sq->sq_kill);
+}
+
+void
+sigqueue_collect_set(sigqueue_t *sq, sigset_t *set)
+{
+	ksiginfo_t *ksi;
+
+	KASSERT(sq->sq_flags & SQ_INIT, ("sigqueue not inited"));
+
+	TAILQ_FOREACH(ksi, &sq->sq_list, ksi_link)
+		SIGADDSET(*set, ksi->ksi_signo);
+	SIGSETOR(*set, sq->sq_kill);
+}
+
+void
+sigqueue_move_set(sigqueue_t *src, sigqueue_t *dst, sigset_t *setp)
+{
+	sigset_t tmp, set;
+	struct proc *p1, *p2;
+	ksiginfo_t *ksi, *next;
+
+	KASSERT(src->sq_flags & SQ_INIT, ("src sigqueue not inited"));
+	KASSERT(dst->sq_flags & SQ_INIT, ("dst sigqueue not inited"));
+	/*
+	 * make a copy, this allows setp to point to src or dst
+	 * sq_signals without trouble.
+	 */
+	set = *setp;
+	p1 = src->sq_proc;
+	p2 = dst->sq_proc;
+	/* Move siginfo to target list */
+	TAILQ_FOREACH_SAFE(ksi, &src->sq_list, ksi_link, next) {
+		if (SIGISMEMBER(set, ksi->ksi_signo)) {
+			TAILQ_REMOVE(&src->sq_list, ksi, ksi_link);
+			if (p1 != NULL)
+				p1->p_pendingcnt--;
+			TAILQ_INSERT_TAIL(&dst->sq_list, ksi, ksi_link);
+			ksi->ksi_sigq = dst;
+			if (p2 != NULL)
+				p2->p_pendingcnt++;
+		}
+	}
+
+	/* Move pending bits to target list */
+	tmp = src->sq_kill;
+	SIGSETAND(tmp, set);
+	SIGSETOR(dst->sq_kill, tmp);
+	SIGSETNAND(src->sq_kill, tmp);
+
+	tmp = src->sq_signals;
+	SIGSETAND(tmp, set);
+	SIGSETOR(dst->sq_signals, tmp);
+	SIGSETNAND(src->sq_signals, tmp);
+
+	/* Finally, rescan src queue and set pending bits for it */
+	sigqueue_collect_set(src, &src->sq_signals);
+}
+
+void
+sigqueue_move(sigqueue_t *src, sigqueue_t *dst, int signo)
+{
+	sigset_t set;
+
+	SIGEMPTYSET(set);
+	SIGADDSET(set, signo);
+	sigqueue_move_set(src, dst, &set);
+}
+
+void
+sigqueue_delete_set(sigqueue_t *sq, sigset_t *set)
+{
+	struct proc *p = sq->sq_proc;
+	ksiginfo_t *ksi, *next;
+
+	KASSERT(sq->sq_flags & SQ_INIT, ("src sigqueue not inited"));
+
+	/* Remove siginfo queue */
+	TAILQ_FOREACH_SAFE(ksi, &sq->sq_list, ksi_link, next) {
+		if (SIGISMEMBER(*set, ksi->ksi_signo)) {
+			TAILQ_REMOVE(&sq->sq_list, ksi, ksi_link);
+			ksi->ksi_sigq = NULL;
+			if (ksiginfo_tryfree(ksi) && p != NULL)
+				p->p_pendingcnt--;
+		}
+	}
+	SIGSETNAND(sq->sq_kill, *set);
+	SIGSETNAND(sq->sq_signals, *set);
+	/* Finally, rescan queue and set pending bits for it */
+	sigqueue_collect_set(sq, &sq->sq_signals);
+}
+
+void
+sigqueue_delete(sigqueue_t *sq, int signo)
+{
+	sigset_t set;
+
+	SIGEMPTYSET(set);
+	SIGADDSET(set, signo);
+	sigqueue_delete_set(sq, &set);
+}
+
+/* Remove a set of signals for a process */
+void
+sigqueue_delete_set_proc(struct proc *p, sigset_t *set)
+{
+	sigqueue_t worklist;
+	struct thread *td0;
+
+	PROC_LOCK_ASSERT(p, MA_OWNED);
+
+	sigqueue_init(&worklist, NULL);
+	sigqueue_move_set(&p->p_sigqueue, &worklist, set);
+
+	PROC_SLOCK(p);
+	FOREACH_THREAD_IN_PROC(p, td0)
+		sigqueue_move_set(&td0->td_sigqueue, &worklist, set);
+	PROC_SUNLOCK(p);
+
+	sigqueue_flush(&worklist);
+}
+
+void
+sigqueue_delete_proc(struct proc *p, int signo)
+{
+	sigset_t set;
+
+	SIGEMPTYSET(set);
+	SIGADDSET(set, signo);
+	sigqueue_delete_set_proc(p, &set);
+}
+
+void
+sigqueue_delete_stopmask_proc(struct proc *p)
+{
+	sigset_t set;
+
+	SIGEMPTYSET(set);
+	SIGADDSET(set, SIGSTOP);
+	SIGADDSET(set, SIGTSTP);
+	SIGADDSET(set, SIGTTIN);
+	SIGADDSET(set, SIGTTOU);
+	sigqueue_delete_set_proc(p, &set);
+}
+
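
The sigqueue machinery added above is what backs POSIX queued (real-time) signals. For orientation, a typical userland pairing follows; this is an editorial illustration, not part of the patch, and error handling is omitted (SIGRTMIN is assumed to be blocked by the receiving thread).

#include <sys/types.h>
#include <signal.h>
#include <time.h>

static void
send_queued(pid_t target)
{
	union sigval sv;

	sv.sival_int = 42;
	sigqueue(target, SIGRTMIN, sv);		/* queued, carries a payload */
}

static void
receive_queued(void)
{
	sigset_t set;
	siginfo_t info;
	struct timespec ts = { 1, 0 };		/* wait at most one second */

	sigemptyset(&set);
	sigaddset(&set, SIGRTMIN);
	/* Blocks until a queued SIGRTMIN arrives or the timeout expires. */
	sigtimedwait(&set, &info, &ts);
}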
 /*
  * Determine signal that should be delivered to process p, the current
  * process, 0 if none.  If there is a pending stop signal with default
  * action, the process stops in issignal().
- * XXXKSE   the check for a pending stop is not done under KSE
- *
- * MP SAFE.
  */
 int
 cursig(struct thread *td)
 {
 	PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);
 	mtx_assert(&td->td_proc->p_sigacts->ps_mtx, MA_OWNED);
-	mtx_assert(&sched_lock, MA_NOTOWNED);
+	THREAD_LOCK_ASSERT(td, MA_NOTOWNED);
 	return (SIGPENDING(td) ? issignal(td) : 0);
 }
 
 /*
  * Arrange for ast() to handle unmasked pending signals on return to user
- * mode.  This must be called whenever a signal is added to td_siglist or
+ * mode.  This must be called whenever a signal is added to td_sigqueue or
  * unmasked in td_sigmask.
  */
 void
 signotify(struct thread *td)
 {
 	struct proc *p;
+#ifdef KSE
 	sigset_t set, saved;
+#else
+	sigset_t set;
+#endif
 
 	p = td->td_proc;
 
@@ -209,27 +577,30 @@
 
 	/*
 	 * If our mask changed we may have to move signal that were
-	 * previously masked by all threads to our siglist.
+	 * previously masked by all threads to our sigqueue.
 	 */
-	set = p->p_siglist;
+	set = p->p_sigqueue.sq_signals;
+#ifdef KSE
 	if (p->p_flag & P_SA)
-		saved = p->p_siglist;
+		saved = p->p_sigqueue.sq_signals;
+#endif
 	SIGSETNAND(set, td->td_sigmask);
-	SIGSETNAND(p->p_siglist, set);
-	SIGSETOR(td->td_siglist, set);
-
+	if (! SIGISEMPTY(set))
+		sigqueue_move_set(&p->p_sigqueue, &td->td_sigqueue, &set);
 	if (SIGPENDING(td)) {
-		mtx_lock_spin(&sched_lock);
+		thread_lock(td);
 		td->td_flags |= TDF_NEEDSIGCHK | TDF_ASTPENDING;
-		mtx_unlock_spin(&sched_lock);
+		thread_unlock(td);
 	}
+#ifdef KSE
 	if ((p->p_flag & P_SA) && !(p->p_flag & P_SIGEVENT)) {
-		if (!SIGSETEQ(saved, p->p_siglist)) {
+		if (!SIGSETEQ(saved, p->p_sigqueue.sq_signals)) {
 			/* pending set changed */
 			p->p_flag |= P_SIGEVENT;
 			wakeup(&p->p_siglist);
 		}
 	}
+#endif
 }
 
 int
@@ -273,8 +644,6 @@
  * sigaction
  * freebsd4_sigaction
  * osigaction
- *
- * MPSAFE
  */
 int
 kern_sigaction(td, sig, act, oact, flags)
@@ -284,7 +653,6 @@
 	int flags;
 {
 	struct sigacts *ps;
-	struct thread *td0;
 	struct proc *p = td->td_proc;
 
 	if (!_SIG_VALID(sig))
@@ -382,17 +750,17 @@
 		if (ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN ||
 		    (sigprop(sig) & SA_IGNORE &&
 		     ps->ps_sigact[_SIG_IDX(sig)] == SIG_DFL)) {
+#ifdef KSE
 			if ((p->p_flag & P_SA) &&
-			     SIGISMEMBER(p->p_siglist, sig)) {
+			     SIGISMEMBER(p->p_sigqueue.sq_signals, sig)) {
 				p->p_flag |= P_SIGEVENT;
 				wakeup(&p->p_siglist);
 			}
+#endif
 			/* never to be seen again */
-			SIGDELSET(p->p_siglist, sig);
-			mtx_lock_spin(&sched_lock);
-			FOREACH_THREAD_IN_PROC(p, td0)
-				SIGDELSET(td0->td_siglist, sig);
-			mtx_unlock_spin(&sched_lock);
+			PROC_SLOCK(p);
+			sigqueue_delete_proc(p, sig);
+			PROC_SUNLOCK(p);
 			if (sig != SIGCONT)
 				/* easier in psignal */
 				SIGADDSET(ps->ps_sigignore, sig);
@@ -433,9 +801,6 @@
 	struct	sigaction *oact;
 };
 #endif
-/*
- * MPSAFE
- */
 int
 sigaction(td, uap)
 	struct thread *td;
@@ -466,9 +831,6 @@
 	struct	sigaction *oact;
 };
 #endif
-/*
- * MPSAFE
- */
 int
 freebsd4_sigaction(td, uap)
 	struct thread *td;
@@ -501,9 +863,6 @@
 	struct	osigaction *osa;
 };
 #endif
-/*
- * MPSAFE
- */
 int
 osigaction(td, uap)
 	struct thread *td;
@@ -538,7 +897,7 @@
 	return (error);
 }
 
-#if !defined(__i386__) && !defined(__alpha__)
+#if !defined(__i386__)
 /* Avoid replicating the same stub everywhere */
 int
 osigreturn(td, uap)
@@ -597,11 +956,9 @@
 		if (sigprop(sig) & SA_IGNORE) {
 			if (sig != SIGCONT)
 				SIGADDSET(ps->ps_sigignore, sig);
-			SIGDELSET(p->p_siglist, sig);
-			/*
-			 * There is only one thread at this point.
-			 */
-			SIGDELSET(td->td_siglist, sig);
+			PROC_SLOCK(p);
+			sigqueue_delete_proc(p, sig);
+			PROC_SUNLOCK(p);
 		}
 		ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL;
 	}
@@ -668,10 +1025,6 @@
 	return (error);
 }
 
-/*
- * sigprocmask() - MP SAFE
- */
-
 #ifndef _SYS_SYSPROTO_H_
 struct sigprocmask_args {
 	int	how;
@@ -703,9 +1056,6 @@
 }
 
 #ifdef COMPAT_43	/* XXX - COMPAT_FBSD3 */
-/*
- * osigprocmask() - MP SAFE
- */
 #ifndef _SYS_SYSPROTO_H_
 struct osigprocmask_args {
 	int	how;
@@ -727,18 +1077,10 @@
 }
 #endif /* COMPAT_43 */
 
-#ifndef _SYS_SYSPROTO_H_
-struct sigpending_args {
-	sigset_t	*set;
-};
-#endif
-/*
- * MPSAFE
- */
 int
 sigwait(struct thread *td, struct sigwait_args *uap)
 {
-	siginfo_t info;
+	ksiginfo_t ksi;
 	sigset_t set;
 	int error;
 
@@ -748,7 +1090,7 @@
 		return (0);
 	}
 
-	error = kern_sigtimedwait(td, set, &info, NULL);
+	error = kern_sigtimedwait(td, set, &ksi, NULL);
 	if (error) {
 		if (error == ERESTART)
 			return (error);
@@ -756,26 +1098,18 @@
 		return (0);
 	}
 
-	error = copyout(&info.si_signo, uap->sig, sizeof(info.si_signo));
-	/* Repost if we got an error. */
-	if (error && info.si_signo) {
-		PROC_LOCK(td->td_proc);
-		tdsignal(td, info.si_signo, SIGTARGET_TD);
-		PROC_UNLOCK(td->td_proc);
-	}
+	error = copyout(&ksi.ksi_signo, uap->sig, sizeof(ksi.ksi_signo));
 	td->td_retval[0] = error;
 	return (0);
 }
-/*
- * MPSAFE
- */
+
 int
 sigtimedwait(struct thread *td, struct sigtimedwait_args *uap)
 {
 	struct timespec ts;
 	struct timespec *timeout;
 	sigset_t set;
-	siginfo_t info;
+	ksiginfo_t ksi;
 	int error;
 
 	if (uap->timeout) {
@@ -791,30 +1125,22 @@
 	if (error)
 		return (error);
 
-	error = kern_sigtimedwait(td, set, &info, timeout);
+	error = kern_sigtimedwait(td, set, &ksi, timeout);
 	if (error)
 		return (error);
 
 	if (uap->info)
-		error = copyout(&info, uap->info, sizeof(info));
-	/* Repost if we got an error. */
-	if (error && info.si_signo) {
-		PROC_LOCK(td->td_proc);
-		tdsignal(td, info.si_signo, SIGTARGET_TD);
-		PROC_UNLOCK(td->td_proc);
-	} else {
-		td->td_retval[0] = info.si_signo; 
-	}
+		error = copyout(&ksi.ksi_info, uap->info, sizeof(siginfo_t));
+
+	if (error == 0)
+		td->td_retval[0] = ksi.ksi_signo;
 	return (error);
 }
 
-/*
- * MPSAFE
- */
 int
 sigwaitinfo(struct thread *td, struct sigwaitinfo_args *uap)
 {
-	siginfo_t info;
+	ksiginfo_t ksi;
 	sigset_t set;
 	int error;
 
@@ -822,26 +1148,21 @@
 	if (error)
 		return (error);
 
-	error = kern_sigtimedwait(td, set, &info, NULL);
+	error = kern_sigtimedwait(td, set, &ksi, NULL);
 	if (error)
 		return (error);
 
 	if (uap->info)
-		error = copyout(&info, uap->info, sizeof(info));
-	/* Repost if we got an error. */
-	if (error && info.si_signo) {
-		PROC_LOCK(td->td_proc);
-		tdsignal(td, info.si_signo, SIGTARGET_TD);
-		PROC_UNLOCK(td->td_proc);
-	} else {
-		td->td_retval[0] = info.si_signo;
-	}
+		error = copyout(&ksi.ksi_info, uap->info, sizeof(siginfo_t));
+	
+	if (error == 0)
+		td->td_retval[0] = ksi.ksi_signo;
 	return (error);
 }
 
-static int
-kern_sigtimedwait(struct thread *td, sigset_t waitset, siginfo_t *info,
-    struct timespec *timeout)
+int
+kern_sigtimedwait(struct thread *td, sigset_t waitset, ksiginfo_t *ksi,
+	struct timespec *timeout)
 {
 	struct sigacts *ps;
 	sigset_t savedmask;
@@ -853,6 +1174,8 @@
 	p = td->td_proc;
 	error = 0;
 	sig = 0;
+	ets.tv_sec = 0;
+	ets.tv_nsec = 0;
 	SIG_CANTMASK(waitset);
 
 	PROC_LOCK(p);
@@ -867,36 +1190,42 @@
 		}
 	}
 
-again:
+restart:
 	for (i = 1; i <= _SIG_MAXSIG; ++i) {
 		if (!SIGISMEMBER(waitset, i))
 			continue;
-		if (SIGISMEMBER(td->td_siglist, i)) {
-			SIGFILLSET(td->td_sigmask);
-			SIG_CANTMASK(td->td_sigmask);
-			SIGDELSET(td->td_sigmask, i);
-			mtx_lock(&ps->ps_mtx);
-			sig = cursig(td);
-			i = 0;
-			mtx_unlock(&ps->ps_mtx);
-		} else if (SIGISMEMBER(p->p_siglist, i)) {
-			if (p->p_flag & P_SA) {
-				p->p_flag |= P_SIGEVENT;
-				wakeup(&p->p_siglist);
-			}
-			SIGDELSET(p->p_siglist, i);
-			SIGADDSET(td->td_siglist, i);
-			SIGFILLSET(td->td_sigmask);
-			SIG_CANTMASK(td->td_sigmask);
-			SIGDELSET(td->td_sigmask, i);
-			mtx_lock(&ps->ps_mtx);
-			sig = cursig(td);
-			i = 0;
-			mtx_unlock(&ps->ps_mtx);
+		if (!SIGISMEMBER(td->td_sigqueue.sq_signals, i)) {
+			if (SIGISMEMBER(p->p_sigqueue.sq_signals, i)) {
+#ifdef KSE
+				if (p->p_flag & P_SA) {
+					p->p_flag |= P_SIGEVENT;
+					wakeup(&p->p_siglist);
+				}
+#endif
+				sigqueue_move(&p->p_sigqueue,
+					&td->td_sigqueue, i);
+			} else
+				continue;
 		}
+
+		SIGFILLSET(td->td_sigmask);
+		SIG_CANTMASK(td->td_sigmask);
+		SIGDELSET(td->td_sigmask, i);
+		mtx_lock(&ps->ps_mtx);
+		sig = cursig(td);
+		mtx_unlock(&ps->ps_mtx);
 		if (sig)
 			goto out;
+		else {
+			/*
+			 * cursig() may have stopped the current thread; by the
+			 * time it resumes, the pending signal state may have
+			 * changed, so rescan for pending signals.
+			 */
+			goto restart;
+		}
 	}
+
 	if (error)
 		goto out;
 
@@ -934,49 +1263,54 @@
 			error = 0;
 		}
 	}
-	goto again;
+	goto restart;
 
 out:
 	td->td_sigmask = savedmask;
 	signotify(td);
 	if (sig) {
-		sig_t action;
-
+		ksiginfo_init(ksi);
+		sigqueue_get(&td->td_sigqueue, sig, ksi);
+		ksi->ksi_signo = sig;
+		if (ksi->ksi_code == SI_TIMER)
+			itimer_accept(p, ksi->ksi_timerid, ksi);
 		error = 0;
-		mtx_lock(&ps->ps_mtx);
-		action = ps->ps_sigact[_SIG_IDX(sig)];
-		mtx_unlock(&ps->ps_mtx);
+
 #ifdef KTRACE
-		if (KTRPOINT(td, KTR_PSIG))
+		if (KTRPOINT(td, KTR_PSIG)) {
+			sig_t action;
+
+			mtx_lock(&ps->ps_mtx);
+			action = ps->ps_sigact[_SIG_IDX(sig)];
+			mtx_unlock(&ps->ps_mtx);
 			ktrpsig(sig, action, &td->td_sigmask, 0);
+		}
 #endif
-		_STOPEVENT(p, S_SIG, sig);
-
-		SIGDELSET(td->td_siglist, sig);
-		bzero(info, sizeof(*info));
-		info->si_signo = sig;
-		info->si_code = 0;
+		if (sig == SIGKILL)
+			sigexit(td, sig);
 	}
 	PROC_UNLOCK(p);
 	return (error);
 }
 
-/*
- * MPSAFE
- */
+#ifndef _SYS_SYSPROTO_H_
+struct sigpending_args {
+	sigset_t	*set;
+};
+#endif
 int
 sigpending(td, uap)
 	struct thread *td;
 	struct sigpending_args *uap;
 {
 	struct proc *p = td->td_proc;
-	sigset_t siglist;
+	sigset_t pending;
 
 	PROC_LOCK(p);
-	siglist = p->p_siglist;
-	SIGSETOR(siglist, td->td_siglist);
+	pending = p->p_sigqueue.sq_signals;
+	SIGSETOR(pending, td->td_sigqueue.sq_signals);
 	PROC_UNLOCK(p);
-	return (copyout(&siglist, uap->set, sizeof(sigset_t)));
+	return (copyout(&pending, uap->set, sizeof(sigset_t)));
 }
 
 #ifdef COMPAT_43	/* XXX - COMPAT_FBSD3 */
@@ -985,22 +1319,19 @@
 	int	dummy;
 };
 #endif
-/*
- * MPSAFE
- */
 int
 osigpending(td, uap)
 	struct thread *td;
 	struct osigpending_args *uap;
 {
 	struct proc *p = td->td_proc;
-	sigset_t siglist;
+	sigset_t pending;
 
 	PROC_LOCK(p);
-	siglist = p->p_siglist;
-	SIGSETOR(siglist, td->td_siglist);
+	pending = p->p_sigqueue.sq_signals;
+	SIGSETOR(pending, td->td_sigqueue.sq_signals);
 	PROC_UNLOCK(p);
-	SIG2OSIG(siglist, td->td_retval[0]);
+	SIG2OSIG(pending, td->td_retval[0]);
 	return (0);
 }
 #endif /* COMPAT_43 */
@@ -1016,9 +1347,6 @@
 	struct	sigvec *osv;
 };
 #endif
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 osigvec(td, uap)
@@ -1060,9 +1388,6 @@
 	int	mask;
 };
 #endif
-/*
- * MPSAFE
- */
 int
 osigblock(td, uap)
 	register struct thread *td;
@@ -1085,9 +1410,6 @@
 	int	mask;
 };
 #endif
-/*
- * MPSAFE
- */
 int
 osigsetmask(td, uap)
 	struct thread *td;
@@ -1108,20 +1430,14 @@
 #endif /* COMPAT_43 */
 
 /*
- * Suspend process until signal, providing mask to be set
- * in the meantime. 
- ***** XXXKSE this doesn't make sense under KSE.
- ***** Do we suspend the thread or all threads in the process?
- ***** How do we suspend threads running NOW on another processor?
+ * Suspend calling thread until signal, providing mask to be set in the
+ * meantime. 
  */
 #ifndef _SYS_SYSPROTO_H_
 struct sigsuspend_args {
 	const sigset_t *sigmask;
 };
 #endif
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 sigsuspend(td, uap)
@@ -1172,9 +1488,6 @@
 	osigset_t mask;
 };
 #endif
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 osigsuspend(td, uap)
@@ -1206,9 +1519,6 @@
 	struct	sigstack *oss;
 };
 #endif
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 osigstack(td, uap)
@@ -1244,9 +1554,6 @@
 	stack_t	*oss;
 };
 #endif
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 sigaltstack(td, uap)
@@ -1290,9 +1597,9 @@
 		if ((ss->ss_flags & ~SS_DISABLE) != 0)
 			return (EINVAL);
 		if (!(ss->ss_flags & SS_DISABLE)) {
-			if (ss->ss_size < p->p_sysent->sv_minsigstksz) {
+			if (ss->ss_size < p->p_sysent->sv_minsigstksz)
 				return (ENOMEM);
-			}
+
 			td->td_sigstk = *ss;
 			td->td_pflags |= TDP_ALTSTACK;
 		} else {
@@ -1320,10 +1627,10 @@
 		 * broadcast
 		 */
 		sx_slock(&allproc_lock);
-		LIST_FOREACH(p, &allproc, p_list) {
+		FOREACH_PROC_IN_SYSTEM(p) {
 			PROC_LOCK(p);
 			if (p->p_pid <= 1 || p->p_flag & P_SYSTEM ||
-			    p == td->td_proc) {
+			    p == td->td_proc || p->p_state == PRS_NEW) {
 				PROC_UNLOCK(p);
 				continue;
 			}
@@ -1353,7 +1660,8 @@
 		sx_sunlock(&proctree_lock);
 		LIST_FOREACH(p, &pgrp->pg_members, p_pglist) {
 			PROC_LOCK(p);	      
-			if (p->p_pid <= 1 || p->p_flag & P_SYSTEM) {
+			if (p->p_pid <= 1 || p->p_flag & P_SYSTEM ||
+				p->p_state == PRS_NEW ) {
 				PROC_UNLOCK(p);
 				continue;
 			}
@@ -1375,9 +1683,6 @@
 	int	signum;
 };
 #endif
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 kill(td, uap)
@@ -1387,6 +1692,8 @@
 	register struct proc *p;
 	int error;
 
+	AUDIT_ARG(signum, uap->signum);
+	AUDIT_ARG(pid, uap->pid);
 	if ((u_int)uap->signum > _SIG_MAXSIG)
 		return (EINVAL);
 
@@ -1396,6 +1703,7 @@
 			if ((p = zpfind(uap->pid)) == NULL)
 				return (ESRCH);
 		}
+		AUDIT_ARG(process, p);
 		error = p_cansignal(td, p, uap->signum);
 		if (error == 0 && uap->signum)
 			psignal(p, uap->signum);
@@ -1420,9 +1728,6 @@
 	int	signum;
 };
 #endif
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 okillpg(td, uap)
@@ -1430,12 +1735,57 @@
 	register struct okillpg_args *uap;
 {
 
+	AUDIT_ARG(signum, uap->signum);
+	AUDIT_ARG(pid, uap->pgid);
 	if ((u_int)uap->signum > _SIG_MAXSIG)
 		return (EINVAL);
+
 	return (killpg1(td, uap->signum, uap->pgid, 0));
 }
 #endif /* COMPAT_43 */
 
+#ifndef _SYS_SYSPROTO_H_
+struct sigqueue_args {
+	pid_t pid;
+	int signum;
+	/* union sigval */ void *value;
+};
+#endif
+int
+sigqueue(struct thread *td, struct sigqueue_args *uap)
+{
+	ksiginfo_t ksi;
+	struct proc *p;
+	int error;
+
+	if ((u_int)uap->signum > _SIG_MAXSIG)
+		return (EINVAL);
+
+	/*
+	 * The specification says sigqueue can only send a signal
+	 * to a single process.
+	 */
+	if (uap->pid <= 0)
+		return (EINVAL);
+
+	if ((p = pfind(uap->pid)) == NULL) {
+		if ((p = zpfind(uap->pid)) == NULL)
+			return (ESRCH);
+	}
+	error = p_cansignal(td, p, uap->signum);
+	if (error == 0 && uap->signum != 0) {
+		ksiginfo_init(&ksi);
+		ksi.ksi_signo = uap->signum;
+		ksi.ksi_code = SI_QUEUE;
+		ksi.ksi_pid = td->td_proc->p_pid;
+		ksi.ksi_uid = td->td_ucred->cr_ruid;
+		ksi.ksi_value.sival_ptr = uap->value;
+		error = tdsignal(p, NULL, ksi.ksi_signo, &ksi);
+	}
+	PROC_UNLOCK(p);
+	return (error);
+}
+
 /*
  * Send a signal to a process group.
  */
@@ -1479,27 +1829,33 @@
 }
 
 /*
- * Send a signal caused by a trap to the current thread.
- * If it will be caught immediately, deliver it with correct code.
- * Otherwise, post it normally.
- *
- * MPSAFE
+ * Send a signal caused by a trap to the current thread.  If it will be
+ * caught immediately, deliver it with correct code.  Otherwise, post it
+ * normally.
  */
 void
-trapsignal(struct thread *td, int sig, u_long code)
+trapsignal(struct thread *td, ksiginfo_t *ksi)
 {
 	struct sigacts *ps;
 	struct proc *p;
-	siginfo_t siginfo;
+#ifdef KSE
 	int error;
+#endif
+	int sig;
+	int code;
 
 	p = td->td_proc;
+	sig = ksi->ksi_signo;
+	code = ksi->ksi_code;
+	KASSERT(_SIG_VALID(sig), ("invalid signal"));
+
+#ifdef KSE
 	if (td->td_pflags & TDP_SA) {
 		if (td->td_mailbox == NULL)
 			thread_user_enter(td);
 		PROC_LOCK(p);
 		SIGDELSET(td->td_sigmask, sig);
-		mtx_lock_spin(&sched_lock);
+		thread_lock(td);
 		/*
 		 * Force scheduling an upcall, so UTS has chance to
 		 * process the signal before thread runs again in
@@ -1507,24 +1863,32 @@
 		 */
 		if (td->td_upcall)
 			td->td_upcall->ku_flags |= KUF_DOUPCALL;
-		mtx_unlock_spin(&sched_lock);
+		thread_unlock(td);
 	} else {
 		PROC_LOCK(p);
 	}
+#else
+	PROC_LOCK(p);
+#endif
 	ps = p->p_sigacts;
 	mtx_lock(&ps->ps_mtx);
 	if ((p->p_flag & P_TRACED) == 0 && SIGISMEMBER(ps->ps_sigcatch, sig) &&
 	    !SIGISMEMBER(td->td_sigmask, sig)) {
-		p->p_stats->p_ru.ru_nsignals++;
+		td->td_ru.ru_nsignals++;
 #ifdef KTRACE
 		if (KTRPOINT(curthread, KTR_PSIG))
 			ktrpsig(sig, ps->ps_sigact[_SIG_IDX(sig)],
 			    &td->td_sigmask, code);
 #endif
+#ifdef KSE
 		if (!(td->td_pflags & TDP_SA))
-			(*p->p_sysent->sv_sendsig)(
-				ps->ps_sigact[_SIG_IDX(sig)], sig,
-				&td->td_sigmask, code);
+			(*p->p_sysent->sv_sendsig)(ps->ps_sigact[_SIG_IDX(sig)], 
+				ksi, &td->td_sigmask);
+#else
+		(*p->p_sysent->sv_sendsig)(ps->ps_sigact[_SIG_IDX(sig)], 
+				ksi, &td->td_sigmask);
+#endif
+#ifdef KSE
 		else if (td->td_mailbox == NULL) {
 			mtx_unlock(&ps->ps_mtx);
 			/* UTS caused a sync signal */
@@ -1532,18 +1896,18 @@
 			p->p_sig = sig;		/* XXX to verify code */
 			sigexit(td, sig);
 		} else {
-			cpu_thread_siginfo(sig, code, &siginfo);
 			mtx_unlock(&ps->ps_mtx);
 			SIGADDSET(td->td_sigmask, sig);
 			PROC_UNLOCK(p);
-			error = copyout(&siginfo, &td->td_mailbox->tm_syncsig,
-			    sizeof(siginfo));
+			error = copyout(&ksi->ksi_info, &td->td_mailbox->tm_syncsig,
+			    sizeof(siginfo_t));
 			PROC_LOCK(p);
 			/* UTS memory corrupted */
 			if (error)
 				sigexit(td, SIGSEGV);
 			mtx_lock(&ps->ps_mtx);
 		}
+#endif
 		SIGSETOR(td->td_sigmask, ps->ps_catchmask[_SIG_IDX(sig)]);
 		if (!SIGISMEMBER(ps->ps_signodefer, sig))
 			SIGADDSET(td->td_sigmask, sig);
@@ -1559,10 +1923,23 @@
 		}
 		mtx_unlock(&ps->ps_mtx);
 	} else {
+		/*
+		 * Avoid a possible infinite loop if the thread
+		 * masking the signal or process is ignoring the
+		 * signal.
+		 */
+		if (kern_forcesigexit &&
+		    (SIGISMEMBER(td->td_sigmask, sig) ||
+		     ps->ps_sigact[_SIG_IDX(sig)] == SIG_IGN)) {
+			SIGDELSET(td->td_sigmask, sig);
+			SIGDELSET(ps->ps_sigcatch, sig);
+			SIGDELSET(ps->ps_sigignore, sig);
+			ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL;
+		}
 		mtx_unlock(&ps->ps_mtx);
 		p->p_code = code;	/* XXX for core dump/debugger */
 		p->p_sig = sig;		/* XXX to verify code */
-		tdsignal(td, sig, SIGTARGET_TD);
+		tdsignal(p, td, sig, ksi);
 	}
 	PROC_UNLOCK(p);
 }
@@ -1581,7 +1958,7 @@
 	if (curproc == p && !SIGISMEMBER(curthread->td_sigmask, sig))
 		return (curthread);
 	signal_td = NULL;
-	mtx_lock_spin(&sched_lock);
+	PROC_SLOCK(p);
 	FOREACH_THREAD_IN_PROC(p, td) {
 		if (!SIGISMEMBER(td->td_sigmask, sig)) {
 			signal_td = td;
@@ -1590,7 +1967,7 @@
 	}
 	if (signal_td == NULL)
 		signal_td = FIRST_THREAD_IN_PROC(p);
-	mtx_unlock_spin(&sched_lock);
+	PROC_SUNLOCK(p);
 	return (signal_td);
 }
 
@@ -1606,75 +1983,97 @@
  *     regardless of the signal action (eg, blocked or ignored).
  *
  * Other ignored signals are discarded immediately.
- *
- * MPSAFE
+ * 
+ * NB: This function may be entered from the debugger via the "kill" DDB
+ * command.  There is little that can be done to mitigate the possibly messy
+ * side effects of this unwise possibility.
  */
 void
 psignal(struct proc *p, int sig)
 {
-	struct thread *td;
-	int prop;
+	(void) tdsignal(p, NULL, sig, NULL);
+}
 
-	if (!_SIG_VALID(sig))
-		panic("psignal(): invalid signal");
+int
+psignal_event(struct proc *p, struct sigevent *sigev, ksiginfo_t *ksi)
+{
+	struct thread *td = NULL;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
-	/*
-	 * IEEE Std 1003.1-2001: return success when killing a zombie.
-	 */
-	if (p->p_state == PRS_ZOMBIE)
-		return;
-	prop = sigprop(sig);
+
+	KASSERT(!KSI_ONQ(ksi), ("psignal_event: ksi on queue"));
 
 	/*
-	 * Find a thread to deliver the signal to.
+	 * ksi_code and other fields should be set before
+	 * calling this function.
 	 */
-	td = sigtd(p, sig, prop);
-
-	tdsignal(td, sig, SIGTARGET_P);
+	ksi->ksi_signo = sigev->sigev_signo;
+	ksi->ksi_value = sigev->sigev_value;
+	if (sigev->sigev_notify == SIGEV_THREAD_ID) {
+		td = thread_find(p, sigev->sigev_notify_thread_id);
+		if (td == NULL)
+			return (ESRCH);
+	}
+	return (tdsignal(p, td, ksi->ksi_signo, ksi));
 }
 
-/*
- * MPSAFE
- */
-void
-tdsignal(struct thread *td, int sig, sigtarget_t target)
+int
+tdsignal(struct proc *p, struct thread *td, int sig, ksiginfo_t *ksi)
 {
+#ifdef KSE
 	sigset_t saved;
-	struct proc *p = td->td_proc;
+	int ret;
 
 	if (p->p_flag & P_SA)
-		saved = p->p_siglist;
-	do_tdsignal(td, sig, target);
+		saved = p->p_sigqueue.sq_signals;
+	ret = do_tdsignal(p, td, sig, ksi);
 	if ((p->p_flag & P_SA) && !(p->p_flag & P_SIGEVENT)) {
-		if (!SIGSETEQ(saved, p->p_siglist)) {
+		if (!SIGSETEQ(saved, p->p_sigqueue.sq_signals)) {
 			/* pending set changed */
 			p->p_flag |= P_SIGEVENT;
 			wakeup(&p->p_siglist);
 		}
 	}
+	return (ret);
 }
 
-static void
-do_tdsignal(struct thread *td, int sig, sigtarget_t target)
+static int
+do_tdsignal(struct proc *p, struct thread *td, int sig, ksiginfo_t *ksi)
 {
-	struct proc *p;
-	register sig_t action;
-	sigset_t *siglist;
-	struct thread *td0;
-	register int prop;
+#endif
+	sig_t action;
+	sigqueue_t *sigqueue;
+	int prop;
 	struct sigacts *ps;
 	int intrval;
+	int ret = 0;
+
+	PROC_LOCK_ASSERT(p, MA_OWNED);
 
 	if (!_SIG_VALID(sig))
-		panic("do_tdsignal(): invalid signal");
+#ifdef KSE
+		panic("do_tdsignal(): invalid signal %d", sig);
+#else
+		panic("tdsignal(): invalid signal %d", sig);
+#endif
 
-	p = td->td_proc;
-	ps = p->p_sigacts;
+#ifdef KSE
+	KASSERT(ksi == NULL || !KSI_ONQ(ksi), ("do_tdsignal: ksi on queue"));
+#else
+	KASSERT(ksi == NULL || !KSI_ONQ(ksi), ("tdsignal: ksi on queue"));
+#endif
 
-	PROC_LOCK_ASSERT(p, MA_OWNED);
-	KNOTE_LOCKED(&p->p_klist, NOTE_SIGNAL | sig);
+	/*
+	 * IEEE Std 1003.1-2001: return success when killing a zombie.
+	 */
+	if (p->p_state == PRS_ZOMBIE) {
+		if (ksi && (ksi->ksi_flags & KSI_INS))
+			ksiginfo_tryfree(ksi);
+		return (ret);
+	}
 
+	ps = p->p_sigacts;
+	KNOTE_LOCKED(&p->p_klist, NOTE_SIGNAL | sig);
 	prop = sigprop(sig);
 
 	/*
@@ -1682,13 +2081,15 @@
 	 * assign it to the process so that we can find it later in the first
 	 * thread that unblocks it.  Otherwise, assign it to this thread now.
 	 */
-	if (target == SIGTARGET_TD) {
-		siglist = &td->td_siglist;
-	} else {
-		if (!SIGISMEMBER(td->td_sigmask, sig))
-			siglist = &td->td_siglist;
+	if (td == NULL) {
+		td = sigtd(p, sig, prop);
+		if (SIGISMEMBER(td->td_sigmask, sig))
+			sigqueue = &p->p_sigqueue;
 		else
-			siglist = &p->p_siglist;
+			sigqueue = &td->td_sigqueue;
+	} else {
+		KASSERT(td->td_proc == p, ("invalid thread"));
+		sigqueue = &td->td_sigqueue;
 	}
 
 	/*
@@ -1699,10 +2100,11 @@
 	 * action will be SIG_DFL here.)
 	 */
 	mtx_lock(&ps->ps_mtx);
-	if (SIGISMEMBER(ps->ps_sigignore, sig) ||
-	    (p->p_flag & P_WEXIT)) {
+	if (SIGISMEMBER(ps->ps_sigignore, sig)) {
 		mtx_unlock(&ps->ps_mtx);
-		return;
+		if (ksi && (ksi->ksi_flags & KSI_INS))
+			ksiginfo_tryfree(ksi);
+		return (ret);
 	}
 	if (SIGISMEMBER(td->td_sigmask, sig))
 		action = SIG_HOLD;
@@ -1716,19 +2118,9 @@
 		intrval = ERESTART;
 	mtx_unlock(&ps->ps_mtx);
 
-	if (prop & SA_CONT) {
-		SIG_STOPSIGMASK(p->p_siglist);
-		/*
-		 * XXX Should investigate leaving STOP and CONT sigs only in
-		 * the proc's siglist.
-		 */
-		mtx_lock_spin(&sched_lock);
-		FOREACH_THREAD_IN_PROC(p, td0)
-			SIG_STOPSIGMASK(td0->td_siglist);
-		mtx_unlock_spin(&sched_lock);
-	}
-
-	if (prop & SA_STOP) {
+	if (prop & SA_CONT)
+		sigqueue_delete_stopmask_proc(p);
+	else if (prop & SA_STOP) {
 		/*
 		 * If sending a tty stop signal to a member of an orphaned
 		 * process group, discard the signal here if the action
@@ -1737,25 +2129,33 @@
 		 */
 		if ((prop & SA_TTYSTOP) &&
 		    (p->p_pgrp->pg_jobc == 0) &&
-		    (action == SIG_DFL))
-		        return;
-		SIG_CONTSIGMASK(p->p_siglist);
-		mtx_lock_spin(&sched_lock);
-		FOREACH_THREAD_IN_PROC(p, td0)
-			SIG_CONTSIGMASK(td0->td_siglist);
-		mtx_unlock_spin(&sched_lock);
-		p->p_flag &= ~P_CONTINUED;
+		    (action == SIG_DFL)) {
+			if (ksi && (ksi->ksi_flags & KSI_INS))
+				ksiginfo_tryfree(ksi);
+			return (ret);
+		}
+		PROC_SLOCK(p);
+		sigqueue_delete_proc(p, SIGCONT);
+		PROC_SUNLOCK(p);
+		if (p->p_flag & P_CONTINUED) {
+			p->p_flag &= ~P_CONTINUED;
+			PROC_LOCK(p->p_pptr);
+			sigqueue_take(p->p_ksi);
+			PROC_UNLOCK(p->p_pptr);
+		}
 	}
 
-	SIGADDSET(*siglist, sig);
-	signotify(td);			/* uses schedlock */
+	ret = sigqueue_add(sigqueue, sig, ksi);
+	if (ret != 0)
+		return (ret);
+	signotify(td);
 	/*
 	 * Defer further processing for signals which are held,
 	 * except that stopped processes must be continued by SIGCONT.
 	 */
 	if (action == SIG_HOLD &&
 	    !((prop & SA_CONT) && (p->p_flag & P_STOPPED_SIG)))
-		return;
+		return (ret);
 	/*
 	 * SIGKILL: Remove procfs STOPEVENTs.
 	 */
@@ -1774,6 +2174,7 @@
 	 * waking up threads so that they can cross the user boundary.
 	 * We try do the per-process part here.
 	 */
+	PROC_SLOCK(p);
 	if (P_SHOULDSTOP(p)) {
 		/*
 		 * The process is in stopped mode. All the threads should be
@@ -1785,6 +2186,7 @@
 			 * so no further action is necessary.
 			 * No signal can restart us.
 			 */
+			PROC_SUNLOCK(p);
 			goto out;
 		}
 
@@ -1801,19 +2203,32 @@
 		if (prop & SA_CONT) {
 			/*
 			 * If SIGCONT is default (or ignored), we continue the
-			 * process but don't leave the signal in siglist as
+			 * process but don't leave the signal in sigqueue as
 			 * it has no further action.  If SIGCONT is held, we
 			 * continue the process and leave the signal in
-			 * siglist.  If the process catches SIGCONT, let it
+			 * sigqueue.  If the process catches SIGCONT, let it
 			 * handle the signal itself.  If it isn't waiting on
 			 * an event, it goes back to run state.
 			 * Otherwise, process goes back to sleep state.
 			 */
 			p->p_flag &= ~P_STOPPED_SIG;
-			p->p_flag |= P_CONTINUED;
+			if (p->p_numthreads == p->p_suspcount) {
+				PROC_SUNLOCK(p);
+				p->p_flag |= P_CONTINUED;
+				p->p_xstat = SIGCONT;
+				PROC_LOCK(p->p_pptr);
+				childproc_continued(p);
+				PROC_UNLOCK(p->p_pptr);
+				PROC_SLOCK(p);
+			}
 			if (action == SIG_DFL) {
-				SIGDELSET(*siglist, sig);
-			} else if (action == SIG_CATCH) {
+				thread_unsuspend(p);
+				PROC_SUNLOCK(p);
+				sigqueue_delete(sigqueue, sig);
+				goto out;
+			}
+			if (action == SIG_CATCH) {
+#ifdef KSE
 				/*
 				 * The process wants to catch it so it needs
 				 * to run at least one thread, but which one?
@@ -1824,14 +2239,18 @@
 				 * single thread is runnable asap.
 				 * XXXKSE for now however, make them all run.
 				 */
+#endif
+				/*
+				 * The process wants to catch it so it needs
+				 * to run at least one thread, but which one?
+				 */
 				goto runfast;
 			}
 			/*
 			 * The signal is not ignored or caught.
 			 */
-			mtx_lock_spin(&sched_lock);
 			thread_unsuspend(p);
-			mtx_unlock_spin(&sched_lock);
+			PROC_SUNLOCK(p);
 			goto out;
 		}
 
@@ -1841,8 +2260,9 @@
 			 * (If we did the shell could get confused).
 			 * Just make sure the signal STOP bit set.
 			 */
+			PROC_SUNLOCK(p);
 			p->p_flag |= P_STOPPED_SIG;
-			SIGDELSET(*siglist, sig);
+			sigqueue_delete(sigqueue, sig);
 			goto out;
 		}
 
@@ -1854,10 +2274,11 @@
 		 * the PROCESS runnable, leave it stopped.
 		 * It may run a bit until it hits a thread_suspend_check().
 		 */
-		mtx_lock_spin(&sched_lock);
+		thread_lock(td);
 		if (TD_ON_SLEEPQ(td) && (td->td_flags & TDF_SINTR))
 			sleepq_abort(td, intrval);
-		mtx_unlock_spin(&sched_lock);
+		thread_unlock(td);
+		PROC_SUNLOCK(p);
 		goto out;
 		/*
 		 * Mutexes are short lived. Threads waiting on them will
@@ -1865,28 +2286,36 @@
 		 */
 	} else if (p->p_state == PRS_NORMAL) {
 		if (p->p_flag & P_TRACED || action == SIG_CATCH) {
-			mtx_lock_spin(&sched_lock);
+			thread_lock(td);
 			tdsigwakeup(td, sig, action, intrval);
-			mtx_unlock_spin(&sched_lock);
+			thread_unlock(td);
+			PROC_SUNLOCK(p);
 			goto out;
 		}
 
 		MPASS(action == SIG_DFL);
 
 		if (prop & SA_STOP) {
-			if (p->p_flag & P_PPWAIT)
+			if (p->p_flag & P_PPWAIT) {
+				PROC_SUNLOCK(p);
 				goto out;
+			}
 			p->p_flag |= P_STOPPED_SIG;
 			p->p_xstat = sig;
-			mtx_lock_spin(&sched_lock);
 			sig_suspend_threads(td, p, 1);
-			thread_stopped(p);
 			if (p->p_numthreads == p->p_suspcount) {
-				SIGDELSET(p->p_siglist, p->p_xstat);
-				FOREACH_THREAD_IN_PROC(p, td0)
-					SIGDELSET(td0->td_siglist, p->p_xstat);
-			}
-			mtx_unlock_spin(&sched_lock);
+				/*
+				 * Only a thread sending a signal to another
+				 * process can reach here: a thread signalling
+				 * its own process does not suspend itself
+				 * here, so in that case p_numthreads can
+				 * never equal p_suspcount.
+				 */
+				thread_stopped(p);
+				PROC_SUNLOCK(p);
+				sigqueue_delete_proc(p, p->p_xstat);
+			} else
+				PROC_SUNLOCK(p);
 			goto out;
 		} 
 		else
@@ -1894,7 +2323,8 @@
 		/* NOTREACHED */
 	} else {
 		/* Not in "NORMAL" state. discard the signal. */
-		SIGDELSET(*siglist, sig);
+		PROC_SUNLOCK(p);
+		sigqueue_delete(sigqueue, sig);
 		goto out;
 	}
 
@@ -1904,13 +2334,15 @@
 	 */
 
 runfast:
-	mtx_lock_spin(&sched_lock);
+	thread_lock(td);
 	tdsigwakeup(td, sig, action, intrval);
+	thread_unlock(td);
 	thread_unsuspend(p);
-	mtx_unlock_spin(&sched_lock);
+	PROC_SUNLOCK(p);
 out:
-	/* If we jump here, sched_lock should not be owned. */
-	mtx_assert(&sched_lock, MA_NOTOWNED);
+	/* If we jump here, proc slock should not be owned. */
+	PROC_SLOCK_ASSERT(p, MA_NOTOWNED);
+	return (ret);
 }
 
 /*
@@ -1925,19 +2357,16 @@
 	register int prop;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
-	mtx_assert(&sched_lock, MA_OWNED);
+	PROC_SLOCK_ASSERT(p, MA_OWNED);
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	prop = sigprop(sig);
 
 	/*
 	 * Bring the priority of a thread up if we want it to get
 	 * killed in this lifetime.
 	 */
-	if (action == SIG_DFL && (prop & SA_KILL)) {
-		if (p->p_nice > 0)
-			sched_nice(td->td_proc, 0);
-		if (td->td_priority > PUSER)
-			sched_prio(td, PUSER);
-	}
+	if (action == SIG_DFL && (prop & SA_KILL) && td->td_priority > PUSER)
+		sched_prio(td, PUSER);
 
 	if (TD_ON_SLEEPQ(td)) {
 		/*
@@ -1954,12 +2383,16 @@
 		 * be awakened.
 		 */
 		if ((prop & SA_CONT) && action == SIG_DFL) {
-			SIGDELSET(p->p_siglist, sig);
+			thread_unlock(td);
+			PROC_SUNLOCK(p);
+			sigqueue_delete(&p->p_sigqueue, sig);
 			/*
 			 * It may be on either list in this state.
 			 * Remove from both for now.
 			 */
-			SIGDELSET(td->td_siglist, sig);
+			sigqueue_delete(&td->td_sigqueue, sig);
+			PROC_SLOCK(p);
+			thread_lock(td);
 			return;
 		}
 
@@ -1989,9 +2422,10 @@
 	struct thread *td2;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
-	mtx_assert(&sched_lock, MA_OWNED);
+	PROC_SLOCK_ASSERT(p, MA_OWNED);
 
 	FOREACH_THREAD_IN_PROC(p, td2) {
+		thread_lock(td2);
 		if ((TD_IS_SLEEPING(td2) || TD_IS_SWAPPED(td2)) &&
 		    (td2->td_flags & TDF_SINTR) &&
 		    !TD_IS_SUSPENDED(td2)) {
@@ -2004,6 +2438,7 @@
 				forward_signal(td2);
 #endif
 		}
+		thread_unlock(td2);
 	}
 }
 
@@ -2014,17 +2449,19 @@
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK,
-	    &p->p_mtx.mtx_object, "Stopping for traced signal");
+	    &p->p_mtx.lock_object, "Stopping for traced signal");
 
-	mtx_lock_spin(&sched_lock);
+	thread_lock(td);
 	td->td_flags |= TDF_XSIG;
-	mtx_unlock_spin(&sched_lock);
+	thread_unlock(td);
 	td->td_xsig = sig;
+	PROC_SLOCK(p);
 	while ((p->p_flag & P_TRACED) && (td->td_flags & TDF_XSIG)) {
 		if (p->p_flag & P_SINGLE_EXIT) {
-			mtx_lock_spin(&sched_lock);
+			thread_lock(td);
 			td->td_flags &= ~TDF_XSIG;
-			mtx_unlock_spin(&sched_lock);
+			thread_unlock(td);
+			PROC_SUNLOCK(p);
 			return (sig);
 		}
 		/*
@@ -2034,26 +2471,19 @@
 		p->p_xstat = sig;
 		p->p_xthread = td;
 		p->p_flag |= (P_STOPPED_SIG|P_STOPPED_TRACE);
-		mtx_lock_spin(&sched_lock);
 		sig_suspend_threads(td, p, 0);
 stopme:
-		thread_stopped(p);
-		thread_suspend_one(td);
-		PROC_UNLOCK(p);
-		DROP_GIANT();
-		mi_switch(SW_VOL, NULL);
-		mtx_unlock_spin(&sched_lock);
-		PICKUP_GIANT();
-		PROC_LOCK(p);
-		if (!(p->p_flag & P_TRACED))
+		thread_suspend_switch(td);
+		if (!(p->p_flag & P_TRACED)) {
 			break;
+		}
 		if (td->td_flags & TDF_DBSUSPEND) {
 			if (p->p_flag & P_SINGLE_EXIT)
 				break;
-			mtx_lock_spin(&sched_lock);
 			goto stopme;
 		}
 	}
+	PROC_SUNLOCK(p);
 	return (td->td_xsig);
 }
 
@@ -2085,7 +2515,7 @@
 	for (;;) {
 		int traced = (p->p_flag & P_TRACED) || (p->p_stops & S_SIG);
 
-		sigpending = td->td_siglist;
+		sigpending = td->td_sigqueue.sq_signals;
 		SIGSETNAND(sigpending, td->td_sigmask);
 
 		if (p->p_flag & P_PPWAIT)
@@ -2105,9 +2535,11 @@
 		 * only if P_TRACED was on when they were posted.
 		 */
 		if (SIGISMEMBER(ps->ps_sigignore, sig) && (traced == 0)) {
-			SIGDELSET(td->td_siglist, sig);
+			sigqueue_delete(&td->td_sigqueue, sig);
+#ifdef KSE
 			if (td->td_pflags & TDP_SA)
 				SIGADDSET(td->td_sigmask, sig);
+#endif
 			continue;
 		}
 		if (p->p_flag & P_TRACED && (p->p_flag & P_PPWAIT) == 0) {
@@ -2118,17 +2550,43 @@
 			newsig = ptracestop(td, sig);
 			mtx_lock(&ps->ps_mtx);
 
-			/*
-			 * If parent wants us to take the signal,
-			 * then it will leave it in p->p_xstat;
-			 * otherwise we just look for signals again.
-			 */
-			SIGDELSET(td->td_siglist, sig);	/* clear old signal */
+#ifdef KSE
 			if (td->td_pflags & TDP_SA)
 				SIGADDSET(td->td_sigmask, sig);
-			if (newsig == 0)
-				continue;
-			sig = newsig;
+
+#endif
+			if (sig != newsig) {
+				ksiginfo_t ksi;
+				/*
+				 * clear old signal.
+				 * XXX shrug off debugger, it causes siginfo to
+				 * be thrown away.
+				 */
+				sigqueue_get(&td->td_sigqueue, sig, &ksi);
+
+				/*
+				 * If parent wants us to take the signal,
+				 * then it will leave it in p->p_xstat;
+				 * otherwise we just look for signals again.
+				 */
+				if (newsig == 0)
+					continue;
+				sig = newsig;
+
+				/*
+				 * Put the new signal into td_sigqueue. If the
+				 * signal is being masked, look for other signals.
+				 */
+				SIGADDSET(td->td_sigqueue.sq_signals, sig);
+#ifdef KSE
+				if (td->td_pflags & TDP_SA)
+					SIGDELSET(td->td_sigmask, sig);
+#endif
+				if (SIGISMEMBER(td->td_sigmask, sig))
+					continue;
+				signotify(td);
+			}
+
 			/*
 			 * If the traced bit got turned off, go back up
 			 * to the top to rescan signals.  This ensures
@@ -2136,17 +2594,6 @@
 			 */
 			if ((p->p_flag & P_TRACED) == 0)
 				continue;
-
-			/*
-			 * Put the new signal into td_siglist.  If the
-			 * signal is being masked, look for other signals.
-			 */
-			SIGADDSET(td->td_siglist, sig);
-			if (td->td_pflags & TDP_SA)
-				SIGDELSET(td->td_sigmask, sig);
-			if (SIGISMEMBER(td->td_sigmask, sig))
-				continue;
-			signotify(td);
 		}
 
 		prop = sigprop(sig);
@@ -2187,19 +2634,13 @@
 					break;	/* == ignore */
 				mtx_unlock(&ps->ps_mtx);
 				WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK,
-				    &p->p_mtx.mtx_object, "Catching SIGSTOP");
+				    &p->p_mtx.lock_object, "Catching SIGSTOP");
 				p->p_flag |= P_STOPPED_SIG;
 				p->p_xstat = sig;
-				mtx_lock_spin(&sched_lock);
+				PROC_SLOCK(p);
 				sig_suspend_threads(td, p, 0);
-				thread_stopped(p);
-				thread_suspend_one(td);
-				PROC_UNLOCK(p);
-				DROP_GIANT();
-				mi_switch(SW_INVOL, NULL);
-				mtx_unlock_spin(&sched_lock);
-				PICKUP_GIANT();
-				PROC_LOCK(p);
+				thread_suspend_switch(td);
+				PROC_SUNLOCK(p);
 				mtx_lock(&ps->ps_mtx);
 				break;
 			} else if (prop & SA_IGNORE) {
@@ -2230,47 +2671,29 @@
 			 */
 			return (sig);
 		}
-		SIGDELSET(td->td_siglist, sig);		/* take the signal! */
+		sigqueue_delete(&td->td_sigqueue, sig);		/* take the signal! */
 	}
 	/* NOTREACHED */
 }
 
-/*
- * MPSAFE
- */
 void
 thread_stopped(struct proc *p)
 {
-	struct proc *p1 = curthread->td_proc;
-	struct sigacts *ps;
 	int n;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
-	mtx_assert(&sched_lock, MA_OWNED);
+	PROC_SLOCK_ASSERT(p, MA_OWNED);
 	n = p->p_suspcount;
-	if (p == p1)
+	if (p == curproc)
 		n++;
 	if ((p->p_flag & P_STOPPED_SIG) && (n == p->p_numthreads)) {
-		mtx_unlock_spin(&sched_lock);
+		PROC_SUNLOCK(p);
 		p->p_flag &= ~P_WAITED;
 		PROC_LOCK(p->p_pptr);
-		/*
-		 * Wake up parent sleeping in kern_wait(), also send
-		 * SIGCHLD to parent, but SIGCHLD does not guarantee
-		 * that parent will awake, because parent may masked
-		 * the signal.
-		 */
-		p->p_pptr->p_flag |= P_STATCHILD;
-		wakeup(p->p_pptr);
-		ps = p->p_pptr->p_sigacts;
-		mtx_lock(&ps->ps_mtx);
-		if ((ps->ps_flag & PS_NOCLDSTOP) == 0) {
-			mtx_unlock(&ps->ps_mtx);
-			psignal(p->p_pptr, SIGCHLD);
-		} else
-			mtx_unlock(&ps->ps_mtx);
+		childproc_stopped(p, (p->p_flag & P_TRACED) ?
+			CLD_TRAPPED : CLD_STOPPED);
 		PROC_UNLOCK(p->p_pptr);
-		mtx_lock_spin(&sched_lock);
+		PROC_SLOCK(p);
 	}
 }
  
@@ -2286,6 +2709,7 @@
 	register struct proc *p = td->td_proc;
 	struct sigacts *ps;
 	sig_t action;
+	ksiginfo_t ksi;
 	sigset_t returnmask;
 	int code;
 
@@ -2294,7 +2718,11 @@
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	ps = p->p_sigacts;
 	mtx_assert(&ps->ps_mtx, MA_OWNED);
-	SIGDELSET(td->td_siglist, sig);
+	ksiginfo_init(&ksi);
+	sigqueue_get(&td->td_sigqueue, sig, &ksi);
+	ksi.ksi_signo = sig;
+	if (ksi.ksi_code == SI_TIMER)
+		itimer_accept(p, ksi.ksi_timerid, &ksi);
 	action = ps->ps_sigact[_SIG_IDX(sig)];
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_PSIG))
@@ -2307,7 +2735,11 @@
 		mtx_lock(&ps->ps_mtx);
 	}
 
+#ifdef KSE
 	if (!(td->td_pflags & TDP_SA) && action == SIG_DFL) {
+#else
+	if (action == SIG_DFL) {
+#endif
 		/*
 		 * Default action, where the default is to kill
 		 * the process.  (Other cases were ignored above.)
@@ -2316,6 +2748,7 @@
 		sigexit(td, sig);
 		/* NOTREACHED */
 	} else {
+#ifdef KSE
 		if (td->td_pflags & TDP_SA) {
 			if (sig == SIGKILL) {
 				mtx_unlock(&ps->ps_mtx);
@@ -2323,6 +2756,7 @@
 			}
 		}
 
+#endif
 		/*
 		 * If we get here, the signal must be caught.
 		 */
@@ -2357,7 +2791,7 @@
 				SIGADDSET(ps->ps_sigignore, sig);
 			ps->ps_sigact[_SIG_IDX(sig)] = SIG_DFL;
 		}
-		p->p_stats->p_ru.ru_nsignals++;
+		td->td_ru.ru_nsignals++;
 		if (p->p_sig != sig) {
 			code = 0;
 		} else {
@@ -2365,11 +2799,14 @@
 			p->p_code = 0;
 			p->p_sig = 0;
 		}
+#ifdef KSE
 		if (td->td_pflags & TDP_SA)
-			thread_signal_add(curthread, sig);
+			thread_signal_add(curthread, &ksi);
 		else
-			(*p->p_sysent->sv_sendsig)(action, sig,
-			    &returnmask, code);
+			(*p->p_sysent->sv_sendsig)(action, &ksi, &returnmask);
+#else
+		(*p->p_sysent->sv_sendsig)(action, &ksi, &returnmask);
+#endif
 	}
 }
 
@@ -2397,8 +2834,6 @@
  * signal state.  Mark the accounting record with the signal termination.
  * If dumping core, save the signal number for the debugger.  Calls exit and
  * does not return.
- *
- * MPSAFE
  */
 void
 sigexit(td, sig)
@@ -2442,6 +2877,84 @@
 	/* NOTREACHED */
 }
 
+/*
+ * Send queued SIGCHLD to parent when child process's state
+ * is changed.
+ */
+static void
+sigparent(struct proc *p, int reason, int status)
+{
+	PROC_LOCK_ASSERT(p, MA_OWNED);
+	PROC_LOCK_ASSERT(p->p_pptr, MA_OWNED);
+
+	if (p->p_ksi != NULL) {
+		p->p_ksi->ksi_signo  = SIGCHLD;
+		p->p_ksi->ksi_code   = reason;
+		p->p_ksi->ksi_status = status;
+		p->p_ksi->ksi_pid    = p->p_pid;
+		p->p_ksi->ksi_uid    = p->p_ucred->cr_ruid;
+		if (KSI_ONQ(p->p_ksi))
+			return;
+	}
+	tdsignal(p->p_pptr, NULL, SIGCHLD, p->p_ksi);
+}
+
+static void
+childproc_jobstate(struct proc *p, int reason, int status)
+{
+	struct sigacts *ps;
+
+	PROC_LOCK_ASSERT(p, MA_OWNED);
+	PROC_LOCK_ASSERT(p->p_pptr, MA_OWNED);
+
+	/*
+	 * Wake up the parent sleeping in kern_wait() and also send
+	 * SIGCHLD to it; SIGCHLD alone does not guarantee that the
+	 * parent will wake up, because the parent may have masked
+	 * the signal.
+	 */
+	p->p_pptr->p_flag |= P_STATCHILD;
+	wakeup(p->p_pptr);
+
+	ps = p->p_pptr->p_sigacts;
+	mtx_lock(&ps->ps_mtx);
+	if ((ps->ps_flag & PS_NOCLDSTOP) == 0) {
+		mtx_unlock(&ps->ps_mtx);
+		sigparent(p, reason, status);
+	} else
+		mtx_unlock(&ps->ps_mtx);
+}
+
+void
+childproc_stopped(struct proc *p, int reason)
+{
+	childproc_jobstate(p, reason, p->p_xstat);
+}
+
+void
+childproc_continued(struct proc *p)
+{
+	childproc_jobstate(p, CLD_CONTINUED, SIGCONT);
+}
+
+void
+childproc_exited(struct proc *p)
+{
+	int reason;
+	int status = p->p_xstat; /* convert to int */
+
+	reason = CLD_EXITED;
+	if (WCOREDUMP(status))
+		reason = CLD_DUMPED;
+	else if (WIFSIGNALED(status))
+		reason = CLD_KILLED;
+	/*
+	 * XXX avoid calling wakeup(p->p_pptr), the work is
+	 * done in exit1().
+	 */
+	sigparent(p, reason, status);
+}
+
 static char corefilename[MAXPATHLEN] = {"%N.core"};
 SYSCTL_STRING(_kern, OID_AUTO, corefile, CTLFLAG_RW, corefilename,
 	      sizeof(corefilename), "process corefile name format string");
@@ -2539,6 +3052,7 @@
 	struct mount *mp;
 	char *name;			/* name of corefile */
 	off_t limit;
+	int vfslocked;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	MPASS((p->p_flag & P_HADTHREADS) == 0 || p->p_singlethread == td);
@@ -2562,21 +3076,17 @@
 	if (limit == 0)
 		return (EFBIG);
 
-	mtx_lock(&Giant);
 restart:
 	name = expand_name(p->p_comm, td->td_ucred->cr_uid, p->p_pid);
-	if (name == NULL) {
-		mtx_unlock(&Giant);
+	if (name == NULL)
 		return (EINVAL);
-	}
-	NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, name, td); /* XXXKSE */
+	NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_SYSSPACE, name, td);
 	flags = O_CREAT | FWRITE | O_NOFOLLOW;
-	error = vn_open(&nd, &flags, S_IRUSR | S_IWUSR, -1);
+	error = vn_open(&nd, &flags, S_IRUSR | S_IWUSR, NULL);
 	free(name, M_TEMP);
-	if (error) {
-		mtx_unlock(&Giant);		
+	if (error)
 		return (error);
-	}
+	vfslocked = NDHASGIANT(&nd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	vp = nd.ni_vp;
 
@@ -2585,7 +3095,7 @@
 	    VOP_GETATTR(vp, &vattr, cred, td) || vattr.va_nlink != 1) {
 		VOP_UNLOCK(vp, 0, td);
 		error = EFAULT;
-		goto out;
+		goto close;
 	}
 
 	VOP_UNLOCK(vp, 0, td);
@@ -2600,9 +3110,10 @@
 		if (locked)
 			VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK);
 		if ((error = vn_close(vp, FWRITE, cred, td)) != 0)
-			return (error);
+			goto out;
 		if ((error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH)) != 0)
-			return (error);
+			goto out;
+		VFS_UNLOCK_GIANT(vfslocked);
 		goto restart;
 	}
 
@@ -2614,6 +3125,7 @@
 	VOP_LEASE(vp, td, cred, LEASE_WRITE);
 	VOP_SETATTR(vp, &vattr, cred, td);
 	VOP_UNLOCK(vp, 0, td);
+	vn_finished_write(mp);
 	PROC_LOCK(p);
 	p->p_acflag |= ACORE;
 	PROC_UNLOCK(p);
@@ -2626,27 +3138,24 @@
 		lf.l_type = F_UNLCK;
 		VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_FLOCK);
 	}
-	vn_finished_write(mp);
-out:
+close:
 	error1 = vn_close(vp, FWRITE, cred, td);
-	mtx_unlock(&Giant);
 	if (error == 0)
 		error = error1;
+out:
+	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
 /*
- * Nonexistent system call-- signal process (may want to handle it).
- * Flag error in case process won't see signal immediately (blocked or ignored).
+ * Nonexistent system call-- signal process (may want to handle it).  Flag
+ * error in case process won't see signal immediately (blocked or ignored).
  */
 #ifndef _SYS_SYSPROTO_H_
 struct nosys_args {
 	int	dummy;
 };
 #endif
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 nosys(td, args)
@@ -2662,8 +3171,8 @@
 }
 
 /*
- * Send a SIGIO or SIGURG signal to a process or process group using
- * stored credentials rather than those of the current process.
+ * Send a SIGIO or SIGURG signal to a process or process group using stored
+ * credentials rather than those of the current process.
  */
 void
 pgsigio(sigiop, sig, checkctty)
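
The kern_sig.c changes above replace the per-process and per-thread sigset
bits (p_siglist/td_siglist) with ksiginfo_t queues and add a sigqueue()
system call, so a queued signal can carry a value through
kern_sigtimedwait() out to sigwaitinfo()/sigtimedwait().  A minimal
userland sketch of that interface (plain POSIX usage, not code from this
commit) would look roughly like:

#include <signal.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	sigset_t set;
	siginfo_t info;
	union sigval sv;

	/* Block SIGUSR1 so it stays queued until sigwaitinfo() collects it. */
	sigemptyset(&set);
	sigaddset(&set, SIGUSR1);
	sigprocmask(SIG_BLOCK, &set, NULL);

	/* Queue a signal to ourselves with a payload value. */
	sv.sival_int = 42;
	if (sigqueue(getpid(), SIGUSR1, sv) != 0) {
		perror("sigqueue");
		return (1);
	}

	/* Collect it; si_code is SI_QUEUE and si_value carries the payload. */
	if (sigwaitinfo(&set, &info) == SIGUSR1 && info.si_code == SI_QUEUE)
		printf("got value %d from pid %d\n",
		    info.si_value.sival_int, (int)info.si_pid);
	return (0);
}

On the kernel side the same value travels as ksi_value through tdsignal()
and is copied out from ksi_info in the sigtimedwait()/sigwaitinfo() hunks
above.
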
Index: kern_environment.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_environment.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/kern_environment.c -L sys/kern/kern_environment.c -u -r1.1.1.1 -r1.2
--- sys/kern/kern_environment.c
+++ sys/kern/kern_environment.c
@@ -35,7 +35,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_environment.c,v 1.39.2.2 2005/10/09 03:29:03 delphij Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_environment.c,v 1.47 2007/03/05 13:10:57 rwatson Exp $");
 
 #include "opt_mac.h"
 
@@ -44,17 +44,18 @@
 #include <sys/proc.h>
 #include <sys/queue.h>
 #include <sys/lock.h>
-#include <sys/mac.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
+#include <sys/priv.h>
 #include <sys/kernel.h>
-#include <sys/sx.h>
 #include <sys/systm.h>
 #include <sys/sysent.h>
 #include <sys/sysproto.h>
 #include <sys/libkern.h>
 #include <sys/kenv.h>
 
+#include <security/mac/mac_framework.h>
+
 static MALLOC_DEFINE(M_KENV, "kenv", "kernel environment");
 
 #define KENV_SIZE	512	/* Maximum number of environment strings */
@@ -65,11 +66,10 @@
 
 /* dynamic environment variables */
 char		**kenvp;
-struct sx	kenv_lock;
+struct mtx	kenv_lock;
 
 /*
- * No need to protect this with a mutex
- * since SYSINITS are single threaded.
+ * No need to protect this with a mutex since SYSINITS are single threaded.
  */
 int	dynamic_kenv = 0;
 
@@ -86,7 +86,7 @@
 		int len;
 	} */ *uap;
 {
-	char *name, *value;
+	char *name, *value, *buffer = NULL;
 	size_t len, done, needed;
 	int error, i;
 
@@ -100,7 +100,9 @@
 			return (error);
 #endif
 		done = needed = 0;
-		sx_slock(&kenv_lock);
+		if (uap->len > 0 && uap->value != NULL)
+			buffer = malloc(uap->len, M_TEMP, M_WAITOK|M_ZERO);
+		mtx_lock(&kenv_lock);
 		for (i = 0; kenvp[i] != NULL; i++) {
 			len = strlen(kenvp[i]) + 1;
 			needed += len;
@@ -109,24 +111,32 @@
 			 * If called with a NULL or insufficiently large
 			 * buffer, just keep computing the required size.
 			 */
-			if (uap->value != NULL && len > 0) {
-				error = copyout(kenvp[i], uap->value + done,
-				    len);
-				if (error)
-					break;
+			if (uap->value != NULL && buffer != NULL && len > 0) {
+				bcopy(kenvp[i], buffer + done, len);
 				done += len;
 			}
 		}
-		sx_sunlock(&kenv_lock);
+		mtx_unlock(&kenv_lock);
+		if (buffer != NULL) {
+			error = copyout(buffer, uap->value, done);
+			free(buffer, M_TEMP);
+		}
 		td->td_retval[0] = ((done == needed) ? 0 : needed);
 		return (error);
 	}
 
-	if ((uap->what == KENV_SET) ||
-	    (uap->what == KENV_UNSET)) {
-		error = suser(td);
+	switch (uap->what) {
+	case KENV_SET:
+		error = priv_check(td, PRIV_KENV_SET);
+		if (error)
+			return (error);
+		break;
+
+	case KENV_UNSET:
+		error = priv_check(td, PRIV_KENV_UNSET);
 		if (error)
 			return (error);
+		break;
 	}
 
 	name = malloc(KENV_MNAMELEN, M_TEMP, M_WAITOK);
@@ -210,12 +220,17 @@
 	i = 0;
 	for (cp = kern_envp; cp != NULL; cp = kernenv_next(cp)) {
 		len = strlen(cp) + 1;
-		kenvp[i] = malloc(len, M_KENV, M_WAITOK);
-		strcpy(kenvp[i++], cp);
+		if (i < KENV_SIZE) {
+			kenvp[i] = malloc(len, M_KENV, M_WAITOK);
+			strcpy(kenvp[i++], cp);
+		} else
+			printf(
+			    "WARNING: too many kenv strings, ignoring %s\n",
+			    cp);
 	}
 	kenvp[i] = NULL;
 
-	sx_init(&kenv_lock, "kernel environment");
+	mtx_init(&kenv_lock, "kernel environment", NULL, MTX_DEF);
 	dynamic_kenv = 1;
 }
 SYSINIT(kenv, SI_SUB_KMEM, SI_ORDER_ANY, init_dynamic_kenv, NULL);
@@ -237,7 +252,7 @@
 	char *cp;
 	int len, i;
 
-	sx_assert(&kenv_lock, SX_LOCKED);
+	mtx_assert(&kenv_lock, MA_OWNED);
 	len = strlen(name);
 	for (cp = kenvp[0], i = 0; cp != NULL; cp = kenvp[++i]) {
 		if ((strncmp(cp, name, len) == 0) &&
@@ -283,16 +298,16 @@
 	int len;
 
 	if (dynamic_kenv) {
-		sx_slock(&kenv_lock);
+		mtx_lock(&kenv_lock);
 		cp = _getenv_dynamic(name, NULL);
 		if (cp != NULL) {
 			strcpy(buf, cp);
-			sx_sunlock(&kenv_lock);
+			mtx_unlock(&kenv_lock);
 			len = strlen(buf) + 1;
 			ret = malloc(len, M_KENV, M_WAITOK);
 			strcpy(ret, buf);
 		} else {
-			sx_sunlock(&kenv_lock);
+			mtx_unlock(&kenv_lock);
 			ret = NULL;
 		}
 	} else
@@ -309,9 +324,9 @@
 	char *cp;
 
 	if (dynamic_kenv) {
-		sx_slock(&kenv_lock);
+		mtx_lock(&kenv_lock);
 		cp = _getenv_dynamic(name, NULL);
-		sx_sunlock(&kenv_lock);
+		mtx_unlock(&kenv_lock);
 	} else
 		cp = _getenv_static(name);
 	if (cp != NULL)
@@ -339,12 +354,12 @@
 	buf = malloc(namelen + vallen, M_KENV, M_WAITOK);
 	sprintf(buf, "%s=%s", name, value);
 
-	sx_xlock(&kenv_lock);
+	mtx_lock(&kenv_lock);
 	cp = _getenv_dynamic(name, &i);
 	if (cp != NULL) {
 		oldenv = kenvp[i];
 		kenvp[i] = buf;
-		sx_xunlock(&kenv_lock);
+		mtx_unlock(&kenv_lock);
 		free(oldenv, M_KENV);
 	} else {
 		/* We add the option if it wasn't found */
@@ -354,13 +369,13 @@
 		/* Bounds checking */
 		if (i < 0 || i >= KENV_SIZE) {
 			free(buf, M_KENV);
-			sx_xunlock(&kenv_lock);
+			mtx_unlock(&kenv_lock);
 			return (-1);
 		}
 
 		kenvp[i] = buf;
 		kenvp[i + 1] = NULL;
-		sx_xunlock(&kenv_lock);
+		mtx_unlock(&kenv_lock);
 	}
 	return (0);
 }
@@ -376,18 +391,18 @@
 
 	KENV_CHECK;
 
-	sx_xlock(&kenv_lock);
+	mtx_lock(&kenv_lock);
 	cp = _getenv_dynamic(name, &i);
 	if (cp != NULL) {
 		oldenv = kenvp[i];
 		for (j = i + 1; kenvp[j] != NULL; j++)
 			kenvp[i++] = kenvp[j];
 		kenvp[i] = NULL;
-		sx_xunlock(&kenv_lock);
+		mtx_unlock(&kenv_lock);
 		free(oldenv, M_KENV);
 		return (0);
 	}
-	sx_xunlock(&kenv_lock);
+	mtx_unlock(&kenv_lock);
 	return (-1);
 }
 
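The kern_environment.c hunk above converts kenv_lock from an sx lock to a
mutex and stops calling copyout() while it is held: KENV_DUMP now stages
the strings into a malloc'd buffer under kenv_lock and does a single
copyout() after the lock is dropped.  A rough userland analogy of that
pattern, with purely illustrative names (envp_demo, dump_env), might be:

#include <pthread.h>
#include <string.h>
#include <unistd.h>

static pthread_mutex_t env_lock = PTHREAD_MUTEX_INITIALIZER;
static const char *envp_demo[] = {
	"hw.model=demo", "kern.hostname=example", NULL
};

/*
 * Copy the shared strings into the caller's private buffer while the
 * (non-sleepable) lock is held, then do the potentially blocking transfer
 * after dropping the lock.  Returns the total size needed.
 */
static ssize_t
dump_env(int fd, char *buf, size_t len)
{
	size_t done = 0, need = 0, n;
	int i;

	pthread_mutex_lock(&env_lock);
	for (i = 0; envp_demo[i] != NULL; i++) {
		n = strlen(envp_demo[i]) + 1;
		need += n;
		/* With a NULL or too-small buffer, only compute the size. */
		if (buf != NULL && done + n <= len) {
			memcpy(buf + done, envp_demo[i], n);
			done += n;
		}
	}
	pthread_mutex_unlock(&env_lock);

	/* The slow, possibly blocking step happens without the lock held. */
	if (buf != NULL && write(fd, buf, done) < 0)
		return (-1);
	return ((ssize_t)need);
}

int
main(void)
{
	char buf[256];

	if (dump_env(STDOUT_FILENO, buf, sizeof(buf)) < 0)
		return (1);
	return (0);
}

The design point is the same as in the kernel change: the step that can
block or fault happens only after the non-sleepable lock has been
released.
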
Index: kern_descrip.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_descrip.c,v
retrieving revision 1.5
retrieving revision 1.6
diff -L sys/kern/kern_descrip.c -L sys/kern/kern_descrip.c -u -r1.5 -r1.6
--- sys/kern/kern_descrip.c
+++ sys/kern/kern_descrip.c
@@ -35,7 +35,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_descrip.c,v 1.279.2.5 2005/11/17 13:11:36 rwatson Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_descrip.c,v 1.313.4.1 2008/02/14 11:45:41 simon Exp $");
 
 #include "opt_compat.h"
 #include "opt_ddb.h"
@@ -54,8 +54,10 @@
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
+#include <sys/mqueue.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
+#include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
 #include <sys/signalvar.h>
@@ -68,12 +70,14 @@
 #include <sys/unistd.h>
 #include <sys/vnode.h>
 
+#include <security/audit/audit.h>
+
 #include <vm/uma.h>
 
 #include <ddb/ddb.h>
 
-static MALLOC_DEFINE(M_FILEDESC, "file desc", "Open file descriptor table");
-static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "file desc to leader",
+static MALLOC_DEFINE(M_FILEDESC, "filedesc", "Open file descriptor table");
+static MALLOC_DEFINE(M_FILEDESC_TO_LEADER, "filedesc_to_leader",
 		     "file desc to leader structures");
 static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures");
 
@@ -134,6 +138,7 @@
 int openfiles;			/* actual number of open files */
 struct sx filelist_lock;	/* sx to protect filelist */
 struct mtx sigio_lock;		/* mtx to protect pointers to sigio */
+void	(*mq_fdclose)(struct thread *td, int fd, struct file *fp);
 
 /* A mutex to protect the association between a proc and filedesc. */
 static struct mtx	fdesc_mtx;
@@ -206,9 +211,11 @@
 static void
 fdused(struct filedesc *fdp, int fd)
 {
-	FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);
+
+	FILEDESC_XLOCK_ASSERT(fdp);
 	KASSERT(!fdisused(fdp, fd),
 	    ("fd already used"));
+
 	fdp->fd_map[NDSLOT(fd)] |= NDBIT(fd);
 	if (fd > fdp->fd_lastfile)
 		fdp->fd_lastfile = fd;
@@ -222,11 +229,13 @@
 static void
 fdunused(struct filedesc *fdp, int fd)
 {
-	FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);
+
+	FILEDESC_XLOCK_ASSERT(fdp);
 	KASSERT(fdisused(fdp, fd),
 	    ("fd is already unused"));
 	KASSERT(fdp->fd_ofiles[fd] == NULL,
 	    ("fd is still in use"));
+
 	fdp->fd_map[NDSLOT(fd)] &= ~NDBIT(fd);
 	if (fd < fdp->fd_freefile)
 		fdp->fd_freefile = fd;
@@ -242,9 +251,6 @@
 	int	dummy;
 };
 #endif
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 getdtablesize(struct thread *td, struct getdtablesize_args *uap)
@@ -261,7 +267,7 @@
 /*
  * Duplicate a file descriptor to a particular value.
  *
- * note: keep in mind that a potential race condition exists when closing
+ * Note: keep in mind that a potential race condition exists when closing
  * descriptors from a shared descriptor table (via rfork).
  */
 #ifndef _SYS_SYSPROTO_H_
@@ -270,9 +276,6 @@
 	u_int	to;
 };
 #endif
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 dup2(struct thread *td, struct dup2_args *uap)
@@ -290,9 +293,6 @@
 	u_int	fd;
 };
 #endif
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 dup(struct thread *td, struct dup_args *uap)
@@ -311,9 +311,6 @@
 	long	arg;
 };
 #endif
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 fcntl(struct thread *td, struct fcntl_args *uap)
@@ -344,6 +341,18 @@
 	return (error);
 }
 
+static inline struct file *
+fdtofp(int fd, struct filedesc *fdp)
+{
+	struct file *fp;
+
+	FILEDESC_LOCK_ASSERT(fdp);
+	if ((unsigned)fd >= fdp->fd_nfiles ||
+	    (fp = fdp->fd_ofiles[fd]) == NULL)
+		return (NULL);
+	return (fp);
+}
+
 int
 kern_fcntl(struct thread *td, int fd, int cmd, intptr_t arg)
 {
@@ -355,42 +364,23 @@
 	struct vnode *vp;
 	u_int newmin;
 	int error, flg, tmp;
-	int giant_locked;
-
-	/*
-	 * XXXRW: Some fcntl() calls require Giant -- others don't.  Try to
-	 * avoid grabbing Giant for calls we know don't need it.
-	 */
-	switch (cmd) {
-	case F_DUPFD:
-	case F_GETFD:
-	case F_SETFD:
-	case F_GETFL:
-		giant_locked = 0;
-		break;
-
-	default:
-		giant_locked = 1;
-		mtx_lock(&Giant);
-	}
+	int vfslocked;
 
+	vfslocked = 0;
 	error = 0;
 	flg = F_POSIX;
 	p = td->td_proc;
 	fdp = p->p_fd;
-	FILEDESC_LOCK(fdp);
-	if ((unsigned)fd >= fdp->fd_nfiles ||
-	    (fp = fdp->fd_ofiles[fd]) == NULL) {
-		FILEDESC_UNLOCK(fdp);
-		error = EBADF;
-		goto done2;
-	}
-	pop = &fdp->fd_ofileflags[fd];
 
 	switch (cmd) {
 	case F_DUPFD:
-		/* mtx_assert(&Giant, MA_NOTOWNED); */
-		FILEDESC_UNLOCK(fdp);
+		FILEDESC_SLOCK(fdp);
+		if ((fp = fdtofp(fd, fdp)) == NULL) {
+			FILEDESC_SUNLOCK(fdp);
+			error = EBADF;
+			break;
+		}
+		FILEDESC_SUNLOCK(fdp);
 		newmin = arg;
 		PROC_LOCK(p);
 		if (newmin >= lim_cur(p, RLIMIT_NOFILE) ||
@@ -404,34 +394,56 @@
 		break;
 
 	case F_GETFD:
-		/* mtx_assert(&Giant, MA_NOTOWNED); */
+		FILEDESC_SLOCK(fdp);
+		if ((fp = fdtofp(fd, fdp)) == NULL) {
+			FILEDESC_SUNLOCK(fdp);
+			error = EBADF;
+			break;
+		}
+		pop = &fdp->fd_ofileflags[fd];
 		td->td_retval[0] = (*pop & UF_EXCLOSE) ? FD_CLOEXEC : 0;
-		FILEDESC_UNLOCK(fdp);
+		FILEDESC_SUNLOCK(fdp);
 		break;
 
 	case F_SETFD:
-		/* mtx_assert(&Giant, MA_NOTOWNED); */
+		FILEDESC_XLOCK(fdp);
+		if ((fp = fdtofp(fd, fdp)) == NULL) {
+			FILEDESC_XUNLOCK(fdp);
+			error = EBADF;
+			break;
+		}
+		pop = &fdp->fd_ofileflags[fd];
 		*pop = (*pop &~ UF_EXCLOSE) |
 		    (arg & FD_CLOEXEC ? UF_EXCLOSE : 0);
-		FILEDESC_UNLOCK(fdp);
+		FILEDESC_XUNLOCK(fdp);
 		break;
 
 	case F_GETFL:
-		/* mtx_assert(&Giant, MA_NOTOWNED); */
+		FILEDESC_SLOCK(fdp);
+		if ((fp = fdtofp(fd, fdp)) == NULL) {
+			FILEDESC_SUNLOCK(fdp);
+			error = EBADF;
+			break;
+		}
 		FILE_LOCK(fp);
 		td->td_retval[0] = OFLAGS(fp->f_flag);
 		FILE_UNLOCK(fp);
-		FILEDESC_UNLOCK(fdp);
+		FILEDESC_SUNLOCK(fdp);
 		break;
 
 	case F_SETFL:
-		mtx_assert(&Giant, MA_OWNED);
+		FILEDESC_SLOCK(fdp);
+		if ((fp = fdtofp(fd, fdp)) == NULL) {
+			FILEDESC_SUNLOCK(fdp);
+			error = EBADF;
+			break;
+		}
 		FILE_LOCK(fp);
 		fhold_locked(fp);
 		fp->f_flag &= ~FCNTLFLAGS;
 		fp->f_flag |= FFLAGS(arg & ~O_ACCMODE) & FCNTLFLAGS;
 		FILE_UNLOCK(fp);
-		FILEDESC_UNLOCK(fdp);
+		FILEDESC_SUNLOCK(fdp);
 		tmp = fp->f_flag & FNONBLOCK;
 		error = fo_ioctl(fp, FIONBIO, &tmp, td->td_ucred, td);
 		if (error) {
@@ -453,9 +465,14 @@
 		break;
 
 	case F_GETOWN:
-		mtx_assert(&Giant, MA_OWNED);
+		FILEDESC_SLOCK(fdp);
+		if ((fp = fdtofp(fd, fdp)) == NULL) {
+			FILEDESC_SUNLOCK(fdp);
+			error = EBADF;
+			break;
+		}
 		fhold(fp);
-		FILEDESC_UNLOCK(fdp);
+		FILEDESC_SUNLOCK(fdp);
 		error = fo_ioctl(fp, FIOGETOWN, &tmp, td->td_ucred, td);
 		if (error == 0)
 			td->td_retval[0] = tmp;
@@ -463,33 +480,41 @@
 		break;
 
 	case F_SETOWN:
-		mtx_assert(&Giant, MA_OWNED);
+		FILEDESC_SLOCK(fdp);
+		if ((fp = fdtofp(fd, fdp)) == NULL) {
+			FILEDESC_SUNLOCK(fdp);
+			error = EBADF;
+			break;
+		}
 		fhold(fp);
-		FILEDESC_UNLOCK(fdp);
+		FILEDESC_SUNLOCK(fdp);
 		tmp = arg;
 		error = fo_ioctl(fp, FIOSETOWN, &tmp, td->td_ucred, td);
 		fdrop(fp, td);
 		break;
 
 	case F_SETLKW:
-		mtx_assert(&Giant, MA_OWNED);
 		flg |= F_WAIT;
 		/* FALLTHROUGH F_SETLK */
 
 	case F_SETLK:
-		mtx_assert(&Giant, MA_OWNED);
+		FILEDESC_SLOCK(fdp);
+		if ((fp = fdtofp(fd, fdp)) == NULL) {
+			FILEDESC_SUNLOCK(fdp);
+			error = EBADF;
+			break;
+		}
 		if (fp->f_type != DTYPE_VNODE) {
-			FILEDESC_UNLOCK(fdp);
+			FILEDESC_SUNLOCK(fdp);
 			error = EBADF;
 			break;
 		}
-
 		flp = (struct flock *)arg;
 		if (flp->l_whence == SEEK_CUR) {
 			if (fp->f_offset < 0 ||
 			    (flp->l_start > 0 &&
 			     fp->f_offset > OFF_MAX - flp->l_start)) {
-				FILEDESC_UNLOCK(fdp);
+				FILEDESC_SUNLOCK(fdp);
 				error = EOVERFLOW;
 				break;
 			}
@@ -500,9 +525,9 @@
 		 * VOP_ADVLOCK() may block.
 		 */
 		fhold(fp);
-		FILEDESC_UNLOCK(fdp);
+		FILEDESC_SUNLOCK(fdp);
 		vp = fp->f_vnode;
-
+		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 		switch (flp->l_type) {
 		case F_RDLCK:
 			if ((fp->f_flag & FREAD) == 0) {
@@ -534,33 +559,43 @@
 			error = EINVAL;
 			break;
 		}
+		VFS_UNLOCK_GIANT(vfslocked);
+		vfslocked = 0;
 		/* Check for race with close */
-		FILEDESC_LOCK_FAST(fdp);
+		FILEDESC_SLOCK(fdp);
 		if ((unsigned) fd >= fdp->fd_nfiles ||
 		    fp != fdp->fd_ofiles[fd]) {
-			FILEDESC_UNLOCK_FAST(fdp);
+			FILEDESC_SUNLOCK(fdp);
 			flp->l_whence = SEEK_SET;
 			flp->l_start = 0;
 			flp->l_len = 0;
 			flp->l_type = F_UNLCK;
+			vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 			(void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
 					   F_UNLCK, flp, F_POSIX);
+			VFS_UNLOCK_GIANT(vfslocked);
+			vfslocked = 0;
 		} else
-			FILEDESC_UNLOCK_FAST(fdp);
+			FILEDESC_SUNLOCK(fdp);
 		fdrop(fp, td);
 		break;
 
 	case F_GETLK:
-		mtx_assert(&Giant, MA_OWNED);
+		FILEDESC_SLOCK(fdp);
+		if ((fp = fdtofp(fd, fdp)) == NULL) {
+			FILEDESC_SUNLOCK(fdp);
+			error = EBADF;
+			break;
+		}
 		if (fp->f_type != DTYPE_VNODE) {
-			FILEDESC_UNLOCK(fdp);
+			FILEDESC_SUNLOCK(fdp);
 			error = EBADF;
 			break;
 		}
 		flp = (struct flock *)arg;
 		if (flp->l_type != F_RDLCK && flp->l_type != F_WRLCK &&
 		    flp->l_type != F_UNLCK) {
-			FILEDESC_UNLOCK(fdp);
+			FILEDESC_SUNLOCK(fdp);
 			error = EINVAL;
 			break;
 		}
@@ -569,7 +604,7 @@
 			    fp->f_offset > OFF_MAX - flp->l_start) ||
 			    (flp->l_start < 0 &&
 			     fp->f_offset < OFF_MIN - flp->l_start)) {
-				FILEDESC_UNLOCK(fdp);
+				FILEDESC_SUNLOCK(fdp);
 				error = EOVERFLOW;
 				break;
 			}
@@ -579,20 +614,20 @@
 		 * VOP_ADVLOCK() may block.
 		 */
 		fhold(fp);
-		FILEDESC_UNLOCK(fdp);
+		FILEDESC_SUNLOCK(fdp);
 		vp = fp->f_vnode;
+		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 		error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK, flp,
 		    F_POSIX);
+		VFS_UNLOCK_GIANT(vfslocked);
+		vfslocked = 0;
 		fdrop(fp, td);
 		break;
 	default:
-		FILEDESC_UNLOCK(fdp);
 		error = EINVAL;
 		break;
 	}
-done2:
-	if (giant_locked)
-		mtx_unlock(&Giant);
+	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
@@ -600,7 +635,8 @@
  * Common code for dup, dup2, and fcntl(F_DUPFD).
  */
 static int
-do_dup(struct thread *td, enum dup_type type, int old, int new, register_t *retval)
+do_dup(struct thread *td, enum dup_type type, int old, int new,
+    register_t *retval)
 {
 	struct filedesc *fdp;
 	struct proc *p;
@@ -626,14 +662,14 @@
 	if (new >= maxfd)
 		return (EMFILE);
 
-	FILEDESC_LOCK(fdp);
+	FILEDESC_XLOCK(fdp);
 	if (old >= fdp->fd_nfiles || fdp->fd_ofiles[old] == NULL) {
-		FILEDESC_UNLOCK(fdp);
+		FILEDESC_XUNLOCK(fdp);
 		return (EBADF);
 	}
 	if (type == DUP_FIXED && old == new) {
 		*retval = new;
-		FILEDESC_UNLOCK(fdp);
+		FILEDESC_XUNLOCK(fdp);
 		return (0);
 	}
 	fp = fdp->fd_ofiles[old];
@@ -653,7 +689,7 @@
 			fdused(fdp, new);
 	} else {
 		if ((error = fdalloc(td, new, &new)) != 0) {
-			FILEDESC_UNLOCK(fdp);
+			FILEDESC_XUNLOCK(fdp);
 			fdrop(fp, td);
 			return (error);
 		}
@@ -668,7 +704,7 @@
 		/* we've allocated a descriptor which we won't use */
 		if (fdp->fd_ofiles[new] == NULL)
 			fdunused(fdp, new);
-		FILEDESC_UNLOCK(fdp);
+		FILEDESC_XUNLOCK(fdp);
 		fdrop(fp, td);
 		return (EBADF);
 	}
@@ -713,20 +749,22 @@
 	 */
 	if (delfp != NULL) {
 		knote_fdclose(td, new);
-		FILEDESC_UNLOCK(fdp);
+		if (delfp->f_type == DTYPE_MQUEUE)
+			mq_fdclose(td, new, delfp);
+		FILEDESC_XUNLOCK(fdp);
 		(void) closef(delfp, td);
 		if (holdleaders) {
-			FILEDESC_LOCK_FAST(fdp);
+			FILEDESC_XLOCK(fdp);
 			fdp->fd_holdleaderscount--;
 			if (fdp->fd_holdleaderscount == 0 &&
 			    fdp->fd_holdleaderswakeup != 0) {
 				fdp->fd_holdleaderswakeup = 0;
 				wakeup(&fdp->fd_holdleaderscount);
 			}
-			FILEDESC_UNLOCK_FAST(fdp);
+			FILEDESC_XUNLOCK(fdp);
 		}
 	} else {
-		FILEDESC_UNLOCK(fdp);
+		FILEDESC_XUNLOCK(fdp);
 	}
 	return (0);
 }
@@ -958,28 +996,36 @@
 	int     fd;
 };
 #endif
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 close(td, uap)
 	struct thread *td;
 	struct close_args *uap;
 {
+
+	return (kern_close(td, uap->fd));
+}
+
+int
+kern_close(td, fd)
+	struct thread *td;
+	int fd;
+{
 	struct filedesc *fdp;
 	struct file *fp;
-	int fd, error;
+	int error;
 	int holdleaders;
 
-	fd = uap->fd;
 	error = 0;
 	holdleaders = 0;
 	fdp = td->td_proc->p_fd;
-	FILEDESC_LOCK(fdp);
+
+	AUDIT_SYSCLOSE(td, fd);
+
+	FILEDESC_XLOCK(fdp);
 	if ((unsigned)fd >= fdp->fd_nfiles ||
 	    (fp = fdp->fd_ofiles[fd]) == NULL) {
-		FILEDESC_UNLOCK(fdp);
+		FILEDESC_XUNLOCK(fdp);
 		return (EBADF);
 	}
 	fdp->fd_ofiles[fd] = NULL;
@@ -995,25 +1041,26 @@
 	}
 
 	/*
-	 * We now hold the fp reference that used to be owned by the descriptor
-	 * array.
-	 * We have to unlock the FILEDESC *AFTER* knote_fdclose to prevent a
-	 * race of the fd getting opened, a knote added, and deleteing a knote
-	 * for the new fd.
+	 * We now hold the fp reference that used to be owned by the
+	 * descriptor array.  We have to unlock the FILEDESC *AFTER*
+	 * knote_fdclose to prevent a race of the fd getting opened, a knote
+	 * added, and deleting a knote for the new fd.
 	 */
 	knote_fdclose(td, fd);
-	FILEDESC_UNLOCK(fdp);
+	if (fp->f_type == DTYPE_MQUEUE)
+		mq_fdclose(td, fd, fp);
+	FILEDESC_XUNLOCK(fdp);
 
 	error = closef(fp, td);
 	if (holdleaders) {
-		FILEDESC_LOCK_FAST(fdp);
+		FILEDESC_XLOCK(fdp);
 		fdp->fd_holdleaderscount--;
 		if (fdp->fd_holdleaderscount == 0 &&
 		    fdp->fd_holdleaderswakeup != 0) {
 			fdp->fd_holdleaderswakeup = 0;
 			wakeup(&fdp->fd_holdleaderscount);
 		}
-		FILEDESC_UNLOCK_FAST(fdp);
+		FILEDESC_XUNLOCK(fdp);
 	}
 	return (error);
 }
@@ -1028,9 +1075,6 @@
 	struct	ostat *sb;
 };
 #endif
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 ofstat(struct thread *td, struct ofstat_args *uap)
@@ -1057,9 +1101,6 @@
 	struct	stat *sb;
 };
 #endif
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 fstat(struct thread *td, struct fstat_args *uap)
@@ -1079,8 +1120,13 @@
 	struct file *fp;
 	int error;
 
+	AUDIT_ARG(fd, fd);
+
 	if ((error = fget(td, fd, &fp)) != 0)
 		return (error);
+
+	AUDIT_ARG(file, td->td_proc, fp);
+
 	error = fo_stat(fp, sbp, td->td_ucred, td);
 	fdrop(fp, td);
 	return (error);
@@ -1095,9 +1141,6 @@
 	struct	nstat *sb;
 };
 #endif
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 nfstat(struct thread *td, struct nfstat_args *uap)
@@ -1123,9 +1166,6 @@
 	int	name;
 };
 #endif
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 fpathconf(struct thread *td, struct fpathconf_args *uap)
@@ -1178,7 +1218,7 @@
 	int nnfiles, onfiles;
 	NDSLOTTYPE *nmap;
 
-	FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);
+	FILEDESC_XLOCK_ASSERT(fdp);
 
 	KASSERT(fdp->fd_nfiles > 0,
 	    ("zero-length file table"));
@@ -1191,7 +1231,7 @@
 		return;
 
 	/* allocate a new table and (if required) new bitmaps */
-	FILEDESC_UNLOCK(fdp);
+	FILEDESC_XUNLOCK(fdp);
 	MALLOC(ntable, struct file **, nnfiles * OFILESIZE,
 	    M_FILEDESC, M_ZERO | M_WAITOK);
 	nfileflags = (char *)&ntable[nnfiles];
@@ -1200,7 +1240,7 @@
 		    M_FILEDESC, M_ZERO | M_WAITOK);
 	else
 		nmap = NULL;
-	FILEDESC_LOCK(fdp);
+	FILEDESC_XLOCK(fdp);
 
 	/*
 	 * We now have new tables ready to go.  Since we dropped the
@@ -1239,7 +1279,7 @@
 	struct filedesc *fdp = p->p_fd;
 	int fd = -1, maxfd;
 
-	FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);
+	FILEDESC_XLOCK_ASSERT(fdp);
 
 	if (fdp->fd_freefile > minfd)
 		minfd = fdp->fd_freefile;	   
@@ -1278,8 +1318,8 @@
 }
 
 /*
- * Check to see whether n user file descriptors
- * are available to the process p.
+ * Check to see whether n user file descriptors are available to the process
+ * p.
  */
 int
 fdavail(struct thread *td, int n)
@@ -1289,7 +1329,7 @@
 	struct file **fpp;
 	int i, lim, last;
 
-	FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);
+	FILEDESC_LOCK_ASSERT(fdp);
 
 	PROC_LOCK(p);
 	lim = min((int)lim_cur(p, RLIMIT_NOFILE), maxfilesperproc);
@@ -1306,12 +1346,11 @@
 }
 
 /*
- * Create a new open file structure and allocate
- * a file decriptor for the process that refers to it.
- * We add one reference to the file for the descriptor table
- * and one reference for resultfp. This is to prevent us being
- * preempted and the entry in the descriptor table closed after
- * we release the FILEDESC lock.
+ * Create a new open file structure and allocate a file descriptor for the
+ * process that refers to it.  We add one reference to the file for the
+ * descriptor table and one reference for resultfp. This is to prevent us
+ * being preempted and the entry in the descriptor table closed after we
+ * release the FILEDESC lock.
  */
 int
 falloc(struct thread *td, struct file **resultfp, int *resultfd)
@@ -1325,8 +1364,10 @@
 
 	fp = uma_zalloc(file_zone, M_WAITOK | M_ZERO);
 	sx_xlock(&filelist_lock);
-	if ((openfiles >= maxuserfiles && (td->td_ucred->cr_ruid != 0 ||
-	   jailed(td->td_ucred))) || openfiles >= maxfiles) {
+
+	if ((openfiles >= maxuserfiles &&
+	    priv_check(td, PRIV_MAXFILES) != 0) ||
+	    openfiles >= maxfiles) {
 		if (ppsratecheck(&lastfail, &curfail, 1)) {
 			printf("kern.maxfiles limit exceeded by uid %i, please see tuning(7).\n",
 				td->td_ucred->cr_ruid);
@@ -1350,7 +1391,7 @@
 	fp->f_ops = &badfileops;
 	fp->f_data = NULL;
 	fp->f_vnode = NULL;
-	FILEDESC_LOCK(p->p_fd);
+	FILEDESC_XLOCK(p->p_fd);
 	if ((fq = p->p_fd->fd_ofiles[0])) {
 		LIST_INSERT_AFTER(fq, fp, f_list);
 	} else {
@@ -1358,14 +1399,14 @@
 	}
 	sx_xunlock(&filelist_lock);
 	if ((error = fdalloc(td, 0, &i))) {
-		FILEDESC_UNLOCK(p->p_fd);
+		FILEDESC_XUNLOCK(p->p_fd);
 		fdrop(fp, td);
 		if (resultfp)
 			fdrop(fp, td);
 		return (error);
 	}
 	p->p_fd->fd_ofiles[i] = fp;
-	FILEDESC_UNLOCK(p->p_fd);
+	FILEDESC_XUNLOCK(p->p_fd);
 	if (resultfp)
 		*resultfp = fp;
 	if (resultfd)
@@ -1383,9 +1424,9 @@
 	struct filedesc0 *newfdp;
 
 	newfdp = malloc(sizeof *newfdp, M_FILEDESC, M_WAITOK | M_ZERO);
-	mtx_init(&newfdp->fd_fd.fd_mtx, FILEDESC_LOCK_DESC, NULL, MTX_DEF);
+	FILEDESC_LOCK_INIT(&newfdp->fd_fd);
 	if (fdp != NULL) {
-		FILEDESC_LOCK(fdp);
+		FILEDESC_XLOCK(fdp);
 		newfdp->fd_fd.fd_cdir = fdp->fd_cdir;
 		if (newfdp->fd_fd.fd_cdir)
 			VREF(newfdp->fd_fd.fd_cdir);
@@ -1395,7 +1436,7 @@
 		newfdp->fd_fd.fd_jdir = fdp->fd_jdir;
 		if (newfdp->fd_fd.fd_jdir)
 			VREF(newfdp->fd_fd.fd_jdir);
-		FILEDESC_UNLOCK(fdp);
+		FILEDESC_XUNLOCK(fdp);
 	}
 
 	/* Create the file descriptor table. */
@@ -1434,7 +1475,7 @@
 	if (i > 0)
 		return;
 
-	mtx_destroy(&fdp->fd_mtx);
+	FILEDESC_LOCK_DESTROY(fdp);
 	FREE(fdp, M_FILEDESC);
 }
 
@@ -1444,9 +1485,10 @@
 struct filedesc *
 fdshare(struct filedesc *fdp)
 {
-	FILEDESC_LOCK_FAST(fdp);
+
+	FILEDESC_XLOCK(fdp);
 	fdp->fd_refcnt++;
-	FILEDESC_UNLOCK_FAST(fdp);
+	FILEDESC_XUNLOCK(fdp);
 	return (fdp);
 }
 
@@ -1457,22 +1499,21 @@
 fdunshare(struct proc *p, struct thread *td)
 {
 
-	FILEDESC_LOCK_FAST(p->p_fd);
+	FILEDESC_XLOCK(p->p_fd);
 	if (p->p_fd->fd_refcnt > 1) {
 		struct filedesc *tmp;
 
-		FILEDESC_UNLOCK_FAST(p->p_fd);
+		FILEDESC_XUNLOCK(p->p_fd);
 		tmp = fdcopy(p->p_fd);
 		fdfree(td);
 		p->p_fd = tmp;
 	} else
-		FILEDESC_UNLOCK_FAST(p->p_fd);
+		FILEDESC_XUNLOCK(p->p_fd);
 }
 
 /*
- * Copy a filedesc structure.
- * A NULL pointer in returns a NULL reference, this is to ease callers,
- * not catch errors.
+ * Copy a filedesc structure.  A NULL pointer in returns a NULL reference,
+ * this is to ease callers, not catch errors.
  */
 struct filedesc *
 fdcopy(struct filedesc *fdp)
@@ -1485,13 +1526,13 @@
 		return (NULL);
 
 	newfdp = fdinit(fdp);
-	FILEDESC_LOCK_FAST(fdp);
+	FILEDESC_SLOCK(fdp);
 	while (fdp->fd_lastfile >= newfdp->fd_nfiles) {
-		FILEDESC_UNLOCK_FAST(fdp);
-		FILEDESC_LOCK(newfdp);
+		FILEDESC_SUNLOCK(fdp);
+		FILEDESC_XLOCK(newfdp);
 		fdgrowtable(newfdp, fdp->fd_lastfile + 1);
-		FILEDESC_UNLOCK(newfdp);
-		FILEDESC_LOCK_FAST(fdp);
+		FILEDESC_XUNLOCK(newfdp);
+		FILEDESC_SLOCK(fdp);
 	}
 	/* copy everything except kqueue descriptors */
 	newfdp->fd_freefile = -1;
@@ -1507,17 +1548,17 @@
 				newfdp->fd_freefile = i;
 		}
 	}
-	FILEDESC_UNLOCK_FAST(fdp);
-	FILEDESC_LOCK(newfdp);
+	FILEDESC_SUNLOCK(fdp);
+	FILEDESC_XLOCK(newfdp);
 	for (i = 0; i <= newfdp->fd_lastfile; ++i)
 		if (newfdp->fd_ofiles[i] != NULL)
 			fdused(newfdp, i);
-	FILEDESC_UNLOCK(newfdp);
-	FILEDESC_LOCK_FAST(fdp);
+	FILEDESC_XUNLOCK(newfdp);
+	FILEDESC_SLOCK(fdp);
 	if (newfdp->fd_freefile == -1)
 		newfdp->fd_freefile = i;
 	newfdp->fd_cmask = fdp->fd_cmask;
-	FILEDESC_UNLOCK_FAST(fdp);
+	FILEDESC_SUNLOCK(fdp);
 	return (newfdp);
 }
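
The loop in fdcopy() above shows the discipline the new FILEDESC_SLOCK/FILEDESC_XLOCK
macros impose: the shared lock is dropped before the table is grown under an exclusive
lock, and the size check is repeated afterwards.  A loose userland analogue of the same
pattern, using a POSIX rwlock and invented names (read_prefix, table_size), not the
kernel code itself:

/*
 * Loose analogue of the shared/exclusive growth loop in fdcopy(): readers
 * take the lock shared, and a table that is too small is grown under the
 * exclusive lock before re-checking.
 */
#include <pthread.h>
#include <stdlib.h>

static pthread_rwlock_t table_lock = PTHREAD_RWLOCK_INITIALIZER;
static int *table;
static size_t table_size;

static void
read_prefix(size_t need)
{
	int *p;

	pthread_rwlock_rdlock(&table_lock);
	while (need > table_size) {
		/* Cannot upgrade a shared lock; drop it and grow exclusively. */
		pthread_rwlock_unlock(&table_lock);
		pthread_rwlock_wrlock(&table_lock);
		if (need > table_size) {
			p = realloc(table, need * sizeof(*p));
			if (p == NULL)
				abort();	/* allocation failure */
			table = p;
			table_size = need;
		}
		pthread_rwlock_unlock(&table_lock);
		pthread_rwlock_rdlock(&table_lock);	/* and re-check */
	}
	/* ... safe to read table[0 .. need - 1] here ... */
	pthread_rwlock_unlock(&table_lock);
}

int
main(void)
{
	read_prefix(16);
	return (0);
}
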
 
@@ -1543,7 +1584,7 @@
 	/* Check for special need to clear POSIX style locks */
 	fdtol = td->td_proc->p_fdtol;
 	if (fdtol != NULL) {
-		FILEDESC_LOCK(fdp);
+		FILEDESC_XLOCK(fdp);
 		KASSERT(fdtol->fdl_refcount > 0,
 			("filedesc_to_refcount botch: fdl_refcount=%d",
 			 fdtol->fdl_refcount));
@@ -1557,7 +1598,7 @@
 					continue;
 				fp = *fpp;
 				fhold(fp);
-				FILEDESC_UNLOCK(fdp);
+				FILEDESC_XUNLOCK(fdp);
 				lf.l_whence = SEEK_SET;
 				lf.l_start = 0;
 				lf.l_len = 0;
@@ -1571,7 +1612,7 @@
 						   &lf,
 						   F_POSIX);
 				VFS_UNLOCK_GIANT(locked);
-				FILEDESC_LOCK(fdp);
+				FILEDESC_XLOCK(fdp);
 				fdrop(fp, td);
 				fpp = fdp->fd_ofiles + i;
 			}
@@ -1585,18 +1626,18 @@
 				 * in a shared file descriptor table.
 				 */
 				fdp->fd_holdleaderswakeup = 1;
-				msleep(&fdp->fd_holdleaderscount, &fdp->fd_mtx,
-				       PLOCK, "fdlhold", 0);
+				sx_sleep(&fdp->fd_holdleaderscount,
+				    FILEDESC_LOCK(fdp), PLOCK, "fdlhold", 0);
 				goto retry;
 			}
 			if (fdtol->fdl_holdcount > 0) {
 				/*
-				 * Ensure that fdtol->fdl_leader
-				 * remains valid in closef().
+				 * Ensure that fdtol->fdl_leader remains
+				 * valid in closef().
 				 */
 				fdtol->fdl_wakeup = 1;
-				msleep(fdtol, &fdp->fd_mtx,
-				       PLOCK, "fdlhold", 0);
+				sx_sleep(fdtol, FILEDESC_LOCK(fdp), PLOCK,
+				    "fdlhold", 0);
 				goto retry;
 			}
 		}
@@ -1608,13 +1649,13 @@
 		} else
 			fdtol = NULL;
 		td->td_proc->p_fdtol = NULL;
-		FILEDESC_UNLOCK(fdp);
+		FILEDESC_XUNLOCK(fdp);
 		if (fdtol != NULL)
 			FREE(fdtol, M_FILEDESC_TO_LEADER);
 	}
-	FILEDESC_LOCK_FAST(fdp);
+	FILEDESC_XLOCK(fdp);
 	i = --fdp->fd_refcnt;
-	FILEDESC_UNLOCK_FAST(fdp);
+	FILEDESC_XUNLOCK(fdp);
 	if (i > 0)
 		return;
 	/*
@@ -1626,7 +1667,7 @@
 		if (*fpp)
 			(void) closef(*fpp, td);
 	}
-	FILEDESC_LOCK(fdp);
+	FILEDESC_XLOCK(fdp);
 
 	/* XXX This should happen earlier. */
 	mtx_lock(&fdesc_mtx);
@@ -1646,7 +1687,7 @@
 	fdp->fd_rdir = NULL;
 	jdir = fdp->fd_jdir;
 	fdp->fd_jdir = NULL;
-	FILEDESC_UNLOCK(fdp);
+	FILEDESC_XUNLOCK(fdp);
 
 	if (cdir) {
 		locked = VFS_LOCK_GIANT(cdir->v_mount);
@@ -1706,7 +1747,7 @@
 	 * Note: fdp->fd_ofiles may be reallocated out from under us while
 	 * we are blocked in a close.  Be careful!
 	 */
-	FILEDESC_LOCK(fdp);
+	FILEDESC_XLOCK(fdp);
 	for (i = 0; i <= fdp->fd_lastfile; i++) {
 		if (i > 2)
 			break;
@@ -1722,27 +1763,33 @@
 			fdp->fd_ofiles[i] = NULL;
 			fdp->fd_ofileflags[i] = 0;
 			fdunused(fdp, i);
-			FILEDESC_UNLOCK(fdp);
+			FILEDESC_XUNLOCK(fdp);
 			(void) closef(fp, td);
-			FILEDESC_LOCK(fdp);
+			FILEDESC_XLOCK(fdp);
 		}
 	}
-	FILEDESC_UNLOCK(fdp);
+	FILEDESC_XUNLOCK(fdp);
 }
 
+/*
+ * If a specific file object occupies a specific file descriptor, close the
+ * file descriptor entry and drop a reference on the file object.  This is a
+ * convenience function for handling a subsequent error in a function that
+ * called falloc(); it copes with the race in which another thread closed the
+ * file descriptor out from under the thread creating the file object.
+ */
 void
 fdclose(struct filedesc *fdp, struct file *fp, int idx, struct thread *td)
 {
 
-	FILEDESC_LOCK(fdp);
+	FILEDESC_XLOCK(fdp);
 	if (fdp->fd_ofiles[idx] == fp) {
 		fdp->fd_ofiles[idx] = NULL;
 		fdunused(fdp, idx);
-		FILEDESC_UNLOCK(fdp);
+		FILEDESC_XUNLOCK(fdp);
 		fdrop(fp, td);
-	} else {
-		FILEDESC_UNLOCK(fdp);
-	}
+	} else
+		FILEDESC_XUNLOCK(fdp);
 }
 
 /*
@@ -1759,7 +1806,7 @@
 	if (fdp == NULL)
 		return;
 
-	FILEDESC_LOCK(fdp);
+	FILEDESC_XLOCK(fdp);
 
 	/*
 	 * We cannot cache fd_ofiles or fd_ofileflags since operations
@@ -1767,7 +1814,8 @@
 	 */
 	for (i = 0; i <= fdp->fd_lastfile; i++) {
 		if (fdp->fd_ofiles[i] != NULL &&
-		    (fdp->fd_ofileflags[i] & UF_EXCLOSE)) {
+		    (fdp->fd_ofiles[i]->f_type == DTYPE_MQUEUE ||
+		    (fdp->fd_ofileflags[i] & UF_EXCLOSE))) {
 			struct file *fp;
 
 			knote_fdclose(td, i);
@@ -1779,12 +1827,14 @@
 			fdp->fd_ofiles[i] = NULL;
 			fdp->fd_ofileflags[i] = 0;
 			fdunused(fdp, i);
-			FILEDESC_UNLOCK(fdp);
+			if (fp->f_type == DTYPE_MQUEUE)
+				mq_fdclose(td, i, fp);
+			FILEDESC_XUNLOCK(fdp);
 			(void) closef(fp, td);
-			FILEDESC_LOCK(fdp);
+			FILEDESC_XLOCK(fdp);
 		}
 	}
-	FILEDESC_UNLOCK(fdp);
+	FILEDESC_XUNLOCK(fdp);
 }
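
fdcloseexec() is what honours the UF_EXCLOSE flag (and, with the change above, also
force-closes POSIX message queue descriptors) at execve() time.  From userland the
flag is controlled with fcntl(F_SETFD, FD_CLOEXEC); a minimal demonstration:

#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fd, flags;

	fd = open("/etc/passwd", O_RDONLY);
	if (fd == -1)
		err(1, "open");

	/* Mark the descriptor close-on-exec (UF_EXCLOSE in the kernel). */
	flags = fcntl(fd, F_GETFD);
	if (flags == -1 || fcntl(fd, F_SETFD, flags | FD_CLOEXEC) == -1)
		err(1, "fcntl");

	printf("fd %d will be closed across execve()\n", fd);

	/* In the new image, fcntl(fd, F_GETFD) would fail with EBADF. */
	execl("/usr/bin/true", "true", (char *)NULL);
	err(1, "execl");
}
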
 
 /*
@@ -1797,11 +1847,9 @@
 int
 fdcheckstd(struct thread *td)
 {
-	struct nameidata nd;
 	struct filedesc *fdp;
-	struct file *fp;
-	register_t retval;
-	int fd, i, error, flags, devnull;
+	register_t retval, save;
+	int i, error, devnull;
 
 	fdp = td->td_proc->p_fd;
 	if (fdp == NULL)
@@ -1813,45 +1861,14 @@
 		if (fdp->fd_ofiles[i] != NULL)
 			continue;
 		if (devnull < 0) {
-			int vfslocked;
-			error = falloc(td, &fp, &fd);
-			if (error != 0)
-				break;
-			/* Note extra ref on `fp' held for us by falloc(). */
-			KASSERT(fd == i, ("oof, we didn't get our fd"));
-			NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE,
-			    "/dev/null", td);
-			flags = FREAD | FWRITE;
-			error = vn_open(&nd, &flags, 0, fd);
-			if (error != 0) {
-				/*
-				 * Someone may have closed the entry in the
-				 * file descriptor table, so check it hasn't
-				 * changed before dropping the reference count.
-				 */
-				FILEDESC_LOCK(fdp);
-				KASSERT(fdp->fd_ofiles[fd] == fp,
-				    ("table not shared, how did it change?"));
-				fdp->fd_ofiles[fd] = NULL;
-				fdunused(fdp, fd);
-				FILEDESC_UNLOCK(fdp);
-				fdrop(fp, td);
-				fdrop(fp, td);
+			save = td->td_retval[0];
+			error = kern_open(td, "/dev/null", UIO_SYSSPACE,
+			    O_RDWR, 0);
+			devnull = td->td_retval[0];
+			KASSERT(devnull == i, ("oof, we didn't get our fd"));
+			td->td_retval[0] = save;
+			if (error)
 				break;
-			}
-			vfslocked = NDHASGIANT(&nd);
-			NDFREE(&nd, NDF_ONLY_PNBUF);
-			fp->f_flag = flags;
-			fp->f_vnode = nd.ni_vp;
-			if (fp->f_data == NULL)
-				fp->f_data = nd.ni_vp;
-			if (fp->f_ops == &badfileops)
-				fp->f_ops = &vnops;
-			fp->f_type = DTYPE_VNODE;
-			VOP_UNLOCK(nd.ni_vp, 0, td);
-			VFS_UNLOCK_GIANT(vfslocked);
-			devnull = fd;
-			fdrop(fp, td);
 		} else {
 			error = do_dup(td, DUP_FIXED, devnull, i, &retval);
 			if (error != 0)
@@ -1862,8 +1879,7 @@
 }
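
fdcheckstd() above now simply kern_open()s /dev/null for the first missing descriptor
in the 0-2 range and do_dup()s it into the rest.  That is the in-kernel counterpart of
the usual daemon idiom; a userland sketch of that idiom (the helper name is invented):

#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>

/* Ensure descriptors 0-2 are open, pointing closed slots at /dev/null. */
static void
sanitize_std_fds(void)
{
	int fd, i;

	fd = open("/dev/null", O_RDWR);
	if (fd == -1)
		err(1, "open /dev/null");
	for (i = 0; i <= 2; i++) {
		/* Only touch slots that are actually closed. */
		if (fcntl(i, F_GETFD) == -1 && errno == EBADF &&
		    dup2(fd, i) == -1)
			err(1, "dup2");
	}
	if (fd > 2)
		close(fd);
}

int
main(void)
{
	sanitize_std_fds();
	return (0);
}
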
 
 /*
- * Internal form of close.
- * Decrement reference count on file structure.
+ * Internal form of close.  Decrement reference count on file structure.
  * Note: td may be NULL when closing a file that was being passed in a
  * message.
  *
@@ -1906,11 +1922,11 @@
 		fdtol = td->td_proc->p_fdtol;
 		if (fdtol != NULL) {
 			/*
-			 * Handle special case where file descriptor table
-			 * is shared between multiple process leaders.
+			 * Handle special case where file descriptor table is
+			 * shared between multiple process leaders.
 			 */
 			fdp = td->td_proc->p_fd;
-			FILEDESC_LOCK(fdp);
+			FILEDESC_XLOCK(fdp);
 			for (fdtol = fdtol->fdl_next;
 			     fdtol != td->td_proc->p_fdtol;
 			     fdtol = fdtol->fdl_next) {
@@ -1918,7 +1934,7 @@
 				     P_ADVLOCK) == 0)
 					continue;
 				fdtol->fdl_holdcount++;
-				FILEDESC_UNLOCK(fdp);
+				FILEDESC_XUNLOCK(fdp);
 				lf.l_whence = SEEK_SET;
 				lf.l_start = 0;
 				lf.l_len = 0;
@@ -1927,7 +1943,7 @@
 				(void) VOP_ADVLOCK(vp,
 						   (caddr_t)fdtol->fdl_leader,
 						   F_UNLCK, &lf, F_POSIX);
-				FILEDESC_LOCK(fdp);
+				FILEDESC_XLOCK(fdp);
 				fdtol->fdl_holdcount--;
 				if (fdtol->fdl_holdcount == 0 &&
 				    fdtol->fdl_wakeup != 0) {
@@ -1935,7 +1951,7 @@
 					wakeup(fdtol);
 				}
 			}
-			FILEDESC_UNLOCK(fdp);
+			FILEDESC_XUNLOCK(fdp);
 		}
 		VFS_UNLOCK_GIANT(vfslocked);
 	}
@@ -1943,21 +1959,21 @@
 }
 
 /*
- * Extract the file pointer associated with the specified descriptor for
- * the current user process.
+ * Extract the file pointer associated with the specified descriptor for the
+ * current user process.
  *
  * If the descriptor doesn't exist, EBADF is returned.
  *
- * If the descriptor exists but doesn't match 'flags' then
- * return EBADF for read attempts and EINVAL for write attempts.
+ * If the descriptor exists but doesn't match 'flags' then return EBADF for
+ * read attempts and EINVAL for write attempts.
  *
  * If 'hold' is set (non-zero) the file's refcount will be bumped on return.
- * It should be dropped with fdrop().
- * If it is not set, then the refcount will not be bumped however the
- * thread's filedesc struct will be returned locked (for fgetsock).
+ * It should be dropped with fdrop().  If it is not set, then the refcount
+ * will not be bumped however the thread's filedesc struct will be returned
+ * locked (for fgetsock).
  *
- * If an error occured the non-zero error is returned and *fpp is set to NULL.
- * Otherwise *fpp is set and zero is returned.
+ * If an error occurred, the non-zero error is returned and *fpp is set to
+ * NULL.  Otherwise *fpp is set and zero is returned.
  */
 static __inline int
 _fget(struct thread *td, int fd, struct file **fpp, int flags, int hold)
@@ -1968,29 +1984,28 @@
 	*fpp = NULL;
 	if (td == NULL || (fdp = td->td_proc->p_fd) == NULL)
 		return (EBADF);
-	FILEDESC_LOCK(fdp);
+	FILEDESC_SLOCK(fdp);
 	if ((fp = fget_locked(fdp, fd)) == NULL || fp->f_ops == &badfileops) {
-		FILEDESC_UNLOCK(fdp);
+		FILEDESC_SUNLOCK(fdp);
 		return (EBADF);
 	}
 
 	/*
-	 * Note: FREAD failure returns EBADF to maintain backwards
-	 * compatibility with what routines returned before.
+	 * FREAD and FWRITE failure return EBADF as per POSIX.
 	 *
 	 * Only one flag, or 0, may be specified.
 	 */
 	if (flags == FREAD && (fp->f_flag & FREAD) == 0) {
-		FILEDESC_UNLOCK(fdp);
+		FILEDESC_SUNLOCK(fdp);
 		return (EBADF);
 	}
 	if (flags == FWRITE && (fp->f_flag & FWRITE) == 0) {
-		FILEDESC_UNLOCK(fdp);
-		return (EINVAL);
+		FILEDESC_SUNLOCK(fdp);
+		return (EBADF);
 	}
 	if (hold) {
 		fhold(fp);
-		FILEDESC_UNLOCK(fdp);
+		FILEDESC_SUNLOCK(fdp);
 	}
 	*fpp = fp;
 	return (0);
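
One user-visible consequence of the _fget() change above is that a write(2) on a
descriptor opened read-only now fails with EBADF rather than EINVAL, matching POSIX.
A few lines are enough to observe it:

#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fd;

	fd = open("/etc/passwd", O_RDONLY);
	if (fd == -1)
		err(1, "open");
	if (write(fd, "x", 1) != -1)
		errx(1, "write unexpectedly succeeded");
	/* With the change above this reports EBADF, not EINVAL. */
	printf("write on O_RDONLY fd failed with errno %d (%s)\n",
	    errno, errno == EBADF ? "EBADF" : "other");
	return (0);
}
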
@@ -2018,9 +2033,9 @@
 }
 
 /*
- * Like fget() but loads the underlying vnode, or returns an error if
- * the descriptor does not represent a vnode.  Note that pipes use vnodes
- * but never have VM objects.  The returned vnode will be vref()d.
+ * Like fget() but loads the underlying vnode, or returns an error if the
+ * descriptor does not represent a vnode.  Note that pipes use vnodes but
+ * never have VM objects.  The returned vnode will be vref()'d.
  *
  * XXX: what about the unused flags ?
  */
@@ -2039,7 +2054,7 @@
 		*vpp = fp->f_vnode;
 		vref(*vpp);
 	}
-	FILEDESC_UNLOCK(td->td_proc->p_fd);
+	FILEDESC_SUNLOCK(td->td_proc->p_fd);
 	return (error);
 }
 
@@ -2067,11 +2082,15 @@
 #endif
 
 /*
- * Like fget() but loads the underlying socket, or returns an error if
- * the descriptor does not represent a socket.
+ * Like fget() but loads the underlying socket, or returns an error if the
+ * descriptor does not represent a socket.
  *
- * We bump the ref count on the returned socket.  XXX Also obtain the SX
- * lock in the future.
+ * We bump the ref count on the returned socket.  XXX Also obtain the SX lock
+ * in the future.
+ *
+ * XXXRW: fgetsock() and fputsock() are deprecated, as consumers should rely
+ * on their file descriptor reference to prevent the socket from being free'd
+ * during use.
  */
 int
 fgetsock(struct thread *td, int fd, struct socket **spp, u_int *fflagp)
@@ -2079,8 +2098,6 @@
 	struct file *fp;
 	int error;
 
-	NET_ASSERT_GIANT();
-
 	*spp = NULL;
 	if (fflagp != NULL)
 		*fflagp = 0;
@@ -2096,19 +2113,20 @@
 		soref(*spp);
 		SOCK_UNLOCK(*spp);
 	}
-	FILEDESC_UNLOCK(td->td_proc->p_fd);
+	FILEDESC_SUNLOCK(td->td_proc->p_fd);
 	return (error);
 }
 
 /*
- * Drop the reference count on the socket and XXX release the SX lock in
- * the future.  The last reference closes the socket.
+ * Drop the reference count on the socket and XXX release the SX lock in the
+ * future.  The last reference closes the socket.
+ *
+ * XXXRW: fputsock() is deprecated, see comment for fgetsock().
  */
 void
 fputsock(struct socket *so)
 {
 
-	NET_ASSERT_GIANT();
 	ACCEPT_LOCK();
 	SOCK_LOCK(so);
 	sorele(so);
@@ -2138,6 +2156,17 @@
 		FILE_UNLOCK(fp);
 		return (0);
 	}
+
+	/*
+	 * We might have just dropped the last reference to a file
+	 * object that is for a UNIX domain socket whose message
+	 * buffers are being examined in unp_gc().  If that is the
+	 * case, FWAIT will be set in f_gcflag and we need to wait for
+	 * unp_gc() to finish its scan.
+	 */
+	while (fp->f_gcflag & FWAIT)
+		msleep(&fp->f_gcflag, fp->f_mtxp, 0, "fpdrop", 0);
+
 	/* We have the last ref so we can proceed without the file lock. */
 	FILE_UNLOCK(fp);
 	if (fp->f_count < 0)
@@ -2160,8 +2189,8 @@
 /*
  * Apply an advisory lock on a file descriptor.
  *
- * Just attempt to get a record lock of the requested type on
- * the entire file (l_whence = SEEK_SET, l_start = 0, l_len = 0).
+ * Just attempt to get a record lock of the requested type on the entire file
+ * (l_whence = SEEK_SET, l_start = 0, l_len = 0).
  */
 #ifndef _SYS_SYSPROTO_H_
 struct flock_args {
@@ -2169,9 +2198,6 @@
 	int	how;
 };
 #endif
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 flock(struct thread *td, struct flock_args *uap)
@@ -2179,6 +2205,7 @@
 	struct file *fp;
 	struct vnode *vp;
 	struct flock lf;
+	int vfslocked;
 	int error;
 
 	if ((error = fget(td, uap->fd, &fp)) != 0)
@@ -2188,8 +2215,8 @@
 		return (EOPNOTSUPP);
 	}
 
-	mtx_lock(&Giant);
 	vp = fp->f_vnode;
+	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	lf.l_whence = SEEK_SET;
 	lf.l_start = 0;
 	lf.l_len = 0;
@@ -2216,7 +2243,7 @@
 	    (uap->how & LOCK_NB) ? F_FLOCK : F_FLOCK | F_WAIT);
 done2:
 	fdrop(fp, td);
-	mtx_unlock(&Giant);
+	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
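
flock() above now brackets only the vnode operation with VFS_LOCK_GIANT() instead of
taking Giant unconditionally; the userland interface is unchanged.  A minimal
non-blocking exclusive lock attempt (the lock file path is arbitrary):

#include <sys/file.h>
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fd;

	fd = open("/tmp/flock.demo", O_RDWR | O_CREAT, 0644);
	if (fd == -1)
		err(1, "open");

	/* LOCK_NB maps to F_FLOCK without F_WAIT in the kernel. */
	if (flock(fd, LOCK_EX | LOCK_NB) == -1) {
		if (errno == EWOULDBLOCK)
			printf("already locked by another process\n");
		else
			err(1, "flock");
	} else {
		printf("got the exclusive lock\n");
		flock(fd, LOCK_UN);
	}
	close(fd);
	return (0);
}
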
 /*
@@ -2233,22 +2260,20 @@
 	 * of file descriptors, or the fd to be dup'd has already been
 	 * closed, then reject.
 	 */
-	FILEDESC_LOCK(fdp);
+	FILEDESC_XLOCK(fdp);
 	if (dfd < 0 || dfd >= fdp->fd_nfiles ||
 	    (wfp = fdp->fd_ofiles[dfd]) == NULL) {
-		FILEDESC_UNLOCK(fdp);
+		FILEDESC_XUNLOCK(fdp);
 		return (EBADF);
 	}
 
 	/*
 	 * There are two cases of interest here.
 	 *
-	 * For ENODEV simply dup (dfd) to file descriptor
-	 * (indx) and return.
+	 * For ENODEV simply dup (dfd) to file descriptor (indx) and return.
 	 *
-	 * For ENXIO steal away the file structure from (dfd) and
-	 * store it in (indx).  (dfd) is effectively closed by
-	 * this operation.
+	 * For ENXIO steal away the file structure from (dfd) and store it in
+	 * (indx).  (dfd) is effectively closed by this operation.
 	 *
 	 * Any other error code is just returned.
 	 */
@@ -2261,7 +2286,7 @@
 		FILE_LOCK(wfp);
 		if (((mode & (FREAD|FWRITE)) | wfp->f_flag) != wfp->f_flag) {
 			FILE_UNLOCK(wfp);
-			FILEDESC_UNLOCK(fdp);
+			FILEDESC_XUNLOCK(fdp);
 			return (EACCES);
 		}
 		fp = fdp->fd_ofiles[indx];
@@ -2271,15 +2296,13 @@
 			fdused(fdp, indx);
 		fhold_locked(wfp);
 		FILE_UNLOCK(wfp);
-		FILEDESC_UNLOCK(fdp);
-		if (fp != NULL) {
+		FILEDESC_XUNLOCK(fdp);
+		if (fp != NULL)
 			/*
 			 * We now own the reference to fp that the ofiles[]
 			 * array used to own.  Release it.
 			 */
-			FILE_LOCK(fp);
-			fdrop_locked(fp, td);
-		}
+			fdrop(fp, td);
 		return (0);
 
 	case ENXIO:
@@ -2294,31 +2317,26 @@
 		fdunused(fdp, dfd);
 		if (fp == NULL)
 			fdused(fdp, indx);
-		if (fp != NULL)
-			FILE_LOCK(fp);
+		FILEDESC_XUNLOCK(fdp);
 
 		/*
 		 * We now own the reference to fp that the ofiles[] array
 		 * used to own.  Release it.
 		 */
 		if (fp != NULL)
-			fdrop_locked(fp, td);
-
-		FILEDESC_UNLOCK(fdp);
-
+			fdrop(fp, td);
 		return (0);
 
 	default:
-		FILEDESC_UNLOCK(fdp);
+		FILEDESC_XUNLOCK(fdp);
 		return (error);
 	}
 	/* NOTREACHED */
 }
 
 /*
- * Scan all active processes to see if any of them have a current
- * or root directory of `olddp'. If so, replace them with the new
- * mount point.
+ * Scan all active processes to see if any of them have a current or root
+ * directory of `olddp'. If so, replace them with the new mount point.
  */
 void
 mountcheckdirs(struct vnode *olddp, struct vnode *newdp)
@@ -2330,12 +2348,12 @@
 	if (vrefcnt(olddp) == 1)
 		return;
 	sx_slock(&allproc_lock);
-	LIST_FOREACH(p, &allproc, p_list) {
+	FOREACH_PROC_IN_SYSTEM(p) {
 		fdp = fdhold(p);
 		if (fdp == NULL)
 			continue;
 		nrele = 0;
-		FILEDESC_LOCK_FAST(fdp);
+		FILEDESC_XLOCK(fdp);
 		if (fdp->fd_cdir == olddp) {
 			vref(newdp);
 			fdp->fd_cdir = newdp;
@@ -2346,7 +2364,7 @@
 			fdp->fd_rdir = newdp;
 			nrele++;
 		}
-		FILEDESC_UNLOCK_FAST(fdp);
+		FILEDESC_XUNLOCK(fdp);
 		fddrop(fdp);
 		while (nrele--)
 			vrele(olddp);
@@ -2373,12 +2391,12 @@
 	fdtol->fdl_wakeup = 0;
 	fdtol->fdl_leader = leader;
 	if (old != NULL) {
-		FILEDESC_LOCK(fdp);
+		FILEDESC_XLOCK(fdp);
 		fdtol->fdl_next = old->fdl_next;
 		fdtol->fdl_prev = old;
 		old->fdl_next = fdtol;
 		fdtol->fdl_next->fdl_prev = fdtol;
-		FILEDESC_UNLOCK(fdp);
+		FILEDESC_XUNLOCK(fdp);
 	} else {
 		fdtol->fdl_next = fdtol;
 		fdtol->fdl_prev = fdtol;
@@ -2427,7 +2445,7 @@
 	bzero(&xf, sizeof(xf));
 	xf.xf_size = sizeof(xf);
 	sx_slock(&allproc_lock);
-	LIST_FOREACH(p, &allproc, p_list) {
+	FOREACH_PROC_IN_SYSTEM(p) {
 		if (p->p_state == PRS_NEW)
 			continue;
 		PROC_LOCK(p);
@@ -2441,7 +2459,7 @@
 		fdp = fdhold(p);
 		if (fdp == NULL)
 			continue;
-		FILEDESC_LOCK_FAST(fdp);
+		FILEDESC_SLOCK(fdp);
 		for (n = 0; fdp->fd_refcnt > 0 && n < fdp->fd_nfiles; ++n) {
 			if ((fp = fdp->fd_ofiles[n]) == NULL)
 				continue;
@@ -2458,7 +2476,7 @@
 			if (error)
 				break;
 		}
-		FILEDESC_UNLOCK_FAST(fdp);
+		FILEDESC_SUNLOCK(fdp);
 		fddrop(fdp);
 		if (error)
 			break;
@@ -2490,8 +2508,12 @@
 		return ("pipe");
 	case DTYPE_FIFO:
 		return ("fifo");
+	case DTYPE_KQUEUE:
+		return ("kque");
 	case DTYPE_CRYPTO:
 		return ("crpt");
+	case DTYPE_MQUEUE:
+		return ("mque");
 	default:
 		return ("unkn");
 	}
@@ -2509,7 +2531,7 @@
 	struct proc *p;
 	int n;
 
-	LIST_FOREACH(p, &allproc, p_list) {
+	FOREACH_PROC_IN_SYSTEM(p) {
 		if (p->p_state == PRS_NEW)
 			continue;
 		fdp = p->p_fd;
@@ -2523,20 +2545,43 @@
 	return (NULL);
 }
 
+static void
+db_print_file(struct file *fp, int header)
+{
+	struct proc *p;
+
+	if (header)
+		db_printf("%8s %4s %8s %8s %4s %5s %6s %8s %5s %12s\n",
+		    "File", "Type", "Data", "Flag", "GCFl", "Count",
+		    "MCount", "Vnode", "FPID", "FCmd");
+	p = file_to_first_proc(fp);
+	db_printf("%8p %4s %8p %08x %04x %5d %6d %8p %5d %12s\n", fp,
+	    file_type_to_name(fp->f_type), fp->f_data, fp->f_flag,
+	    fp->f_gcflag, fp->f_count, fp->f_msgcount, fp->f_vnode,
+	    p != NULL ? p->p_pid : -1, p != NULL ? p->p_comm : "-");
+}
+
+DB_SHOW_COMMAND(file, db_show_file)
+{
+	struct file *fp;
+
+	if (!have_addr) {
+		db_printf("usage: show file <addr>\n");
+		return;
+	}
+	fp = (struct file *)addr;
+	db_print_file(fp, 1);
+}
+
 DB_SHOW_COMMAND(files, db_show_files)
 {
 	struct file *fp;
-	struct proc *p;
+	int header;
 
-	db_printf("%8s %4s %8s %8s %4s %5s %6s %8s %5s %12s\n", "File",
-	    "Type", "Data", "Flag", "GCFl", "Count", "MCount", "Vnode",
-	    "FPID", "FCmd");
+	header = 1;
 	LIST_FOREACH(fp, &filehead, f_list) {
-		p = file_to_first_proc(fp);
-		db_printf("%8p %4s %8p %08x %04x %5d %6d %8p %5d %12s\n", fp,
-		    file_type_to_name(fp->f_type), fp->f_data, fp->f_flag,
-		    fp->f_gcflag, fp->f_count, fp->f_msgcount, fp->f_vnode,
-		    p != NULL ? p->p_pid : -1, p != NULL ? p->p_comm : "-");
+		db_print_file(fp, header);
+		header = 0;
 	}
 }
 #endif
Index: kern_resource.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_resource.c,v
retrieving revision 1.1.1.2
retrieving revision 1.2
diff -L sys/kern/kern_resource.c -L sys/kern/kern_resource.c -u -r1.1.1.2 -r1.2
--- sys/kern/kern_resource.c
+++ sys/kern/kern_resource.c
@@ -35,7 +35,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_resource.c,v 1.148.2.1 2005/12/28 17:35:55 ps Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_resource.c,v 1.180.2.1 2007/12/20 07:15:40 davidxu Exp $");
 
 #include "opt_compat.h"
 
@@ -43,18 +43,20 @@
 #include <sys/systm.h>
 #include <sys/sysproto.h>
 #include <sys/file.h>
-#include <sys/imgact.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
+#include <sys/priv.h>
 #include <sys/proc.h>
+#include <sys/refcount.h>
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
 #include <sys/sx.h>
 #include <sys/syscallsubr.h>
 #include <sys/sysent.h>
 #include <sys/time.h>
+#include <sys/umtx.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
@@ -77,16 +79,12 @@
 /*
  * Resource controls and accounting.
  */
-
 #ifndef _SYS_SYSPROTO_H_
 struct getpriority_args {
 	int	which;
 	int	who;
 };
 #endif
-/*
- * MPSAFE
- */
 int
 getpriority(td, uap)
 	struct thread *td;
@@ -141,7 +139,10 @@
 		if (uap->who == 0)
 			uap->who = td->td_ucred->cr_uid;
 		sx_slock(&allproc_lock);
-		LIST_FOREACH(p, &allproc, p_list) {
+		FOREACH_PROC_IN_SYSTEM(p) {
+			/* Do not bother to check PRS_NEW processes */
+			if (p->p_state == PRS_NEW)
+				continue;
 			PROC_LOCK(p);
 			if (!p_cansee(td, p) &&
 			    p->p_ucred->cr_uid == uap->who) {
@@ -170,9 +171,6 @@
 	int	prio;
 };
 #endif
-/*
- * MPSAFE
- */
 int
 setpriority(td, uap)
 	struct thread *td;
@@ -264,18 +262,106 @@
 		n = PRIO_MAX;
 	if (n < PRIO_MIN)
 		n = PRIO_MIN;
- 	if (n < p->p_nice && suser(td) != 0)
+ 	if (n < p->p_nice && priv_check(td, PRIV_SCHED_SETPRIORITY) != 0)
 		return (EACCES);
-	mtx_lock_spin(&sched_lock);
+	PROC_SLOCK(p);
 	sched_nice(p, n);
-	mtx_unlock_spin(&sched_lock);
+	PROC_SUNLOCK(p);
 	return (0);
 }
 
 /*
+ * Set realtime priority for LWP.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct rtprio_thread_args {
+	int		function;
+	lwpid_t		lwpid;
+	struct rtprio	*rtp;
+};
+#endif
+int
+rtprio_thread(struct thread *td, struct rtprio_thread_args *uap)
+{
+	struct proc *curp;
+	struct proc *p;
+	struct rtprio rtp;
+	struct thread *td1;
+	int cierror, error;
+
+	/* Perform copyin before acquiring locks if needed. */
+	if (uap->function == RTP_SET)
+		cierror = copyin(uap->rtp, &rtp, sizeof(struct rtprio));
+	else
+		cierror = 0;
+
+	curp = td->td_proc;
+	/*
+	 * Though lwpid is unique, only current process is supported
+	 * since there is no efficient way to look up a LWP yet.
+	 */
+	p = curp;
+	PROC_LOCK(p);
+
+	switch (uap->function) {
+	case RTP_LOOKUP:
+		if ((error = p_cansee(td, p)))
+			break;
+		PROC_SLOCK(p);
+		if (uap->lwpid == 0 || uap->lwpid == td->td_tid)
+			td1 = td;
+		else
+			td1 = thread_find(p, uap->lwpid);
+		if (td1 != NULL)
+			pri_to_rtp(td1, &rtp);
+		else
+			error = ESRCH;
+		PROC_SUNLOCK(p);
+		PROC_UNLOCK(p);
+		return (copyout(&rtp, uap->rtp, sizeof(struct rtprio)));
+	case RTP_SET:
+		if ((error = p_cansched(td, p)) || (error = cierror))
+			break;
+
+		/* Disallow setting rtprio in most cases if not superuser. */
+/*
+ * Realtime priority has to be restricted for reasons which should be
+ * obvious.  However, for idle priority, there is a potential for
+ * system deadlock if an idleprio process gains a lock on a resource
+ * that other processes need (and the idleprio process can't run
+ * due to a CPU-bound normal process).  Fix me!  XXX
+ */
+#if 0
+ 		if (RTP_PRIO_IS_REALTIME(rtp.type)) {
+#else
+		if (rtp.type != RTP_PRIO_NORMAL) {
+#endif
+			error = priv_check(td, PRIV_SCHED_RTPRIO);
+			if (error)
+				break;
+		}
+
+		PROC_SLOCK(p);
+		if (uap->lwpid == 0 || uap->lwpid == td->td_tid)
+			td1 = td;
+		else
+			td1 = thread_find(p, uap->lwpid);
+		if (td1 != NULL)
+			error = rtp_to_pri(&rtp, td1);
+		else
+			error = ESRCH;
+		PROC_SUNLOCK(p);
+		break;
+	default:
+		error = EINVAL;
+		break;
+	}
+	PROC_UNLOCK(p);
+	return (error);
+}
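
rtprio_thread() gives per-thread (LWP) access to the same scheduling classes that
rtprio() already managed per-process; lwpid 0 means the calling thread.  Assuming the
matching userland syscall stub and the <sys/rtprio.h> definitions from this import,
usage looks like this (RTP_SET to a non-normal class needs PRIV_SCHED_RTPRIO, so the
second call may fail with EPERM for an unprivileged user):

#include <sys/types.h>
#include <sys/rtprio.h>
#include <err.h>
#include <stdio.h>

int
main(void)
{
	struct rtprio rtp;

	/* Look up the current thread's scheduling class and priority. */
	if (rtprio_thread(RTP_LOOKUP, 0, &rtp) == -1)
		err(1, "rtprio_thread(RTP_LOOKUP)");
	printf("type=%u prio=%u\n", rtp.type, rtp.prio);

	/* Try to move the calling thread into the idle class. */
	rtp.type = RTP_PRIO_IDLE;
	rtp.prio = RTP_PRIO_MAX;
	if (rtprio_thread(RTP_SET, 0, &rtp) == -1)
		err(1, "rtprio_thread(RTP_SET)");
	return (0);
}
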
+
+/*
  * Set realtime priority.
- *
- * MPSAFE
  */
 #ifndef _SYS_SYSPROTO_H_
 struct rtprio_args {
@@ -284,7 +370,6 @@
 	struct rtprio	*rtp;
 };
 #endif
-
 int
 rtprio(td, uap)
 	struct thread *td;		/* curthread */
@@ -292,7 +377,7 @@
 {
 	struct proc *curp;
 	struct proc *p;
-	struct ksegrp *kg;
+	struct thread *tdp;
 	struct rtprio rtp;
 	int cierror, error;
 
@@ -316,7 +401,7 @@
 	case RTP_LOOKUP:
 		if ((error = p_cansee(td, p)))
 			break;
-		mtx_lock_spin(&sched_lock);
+		PROC_SLOCK(p);
 		/*
 		 * Return OUR priority if no pid specified,
 		 * or if one is, report the highest priority
@@ -328,14 +413,14 @@
 		 * as leaving it zero.
 		 */
 		if (uap->pid == 0) {
-			pri_to_rtp(td->td_ksegrp, &rtp);
+			pri_to_rtp(td, &rtp);
 		} else {
 			struct rtprio rtp2;
 
 			rtp.type = RTP_PRIO_IDLE;
 			rtp.prio = RTP_PRIO_MAX;
-			FOREACH_KSEGRP_IN_PROC(p, kg) {
-				pri_to_rtp(kg, &rtp2);
+			FOREACH_THREAD_IN_PROC(p, tdp) {
+				pri_to_rtp(tdp, &rtp2);
 				if (rtp2.type <  rtp.type ||
 				    (rtp2.type == rtp.type &&
 				    rtp2.prio < rtp.prio)) {
@@ -344,7 +429,7 @@
 				}
 			}
 		}
-		mtx_unlock_spin(&sched_lock);
+		PROC_SUNLOCK(p);
 		PROC_UNLOCK(p);
 		return (copyout(&rtp, uap->rtp, sizeof(struct rtprio)));
 	case RTP_SET:
@@ -352,13 +437,6 @@
 			break;
 
 		/* Disallow setting rtprio in most cases if not superuser. */
-		if (suser(td) != 0) {
-			/* can't set someone else's */
-			if (uap->pid) {
-				error = EPERM;
-				break;
-			}
-			/* can't set realtime priority */
 /*
  * Realtime priority has to be restricted for reasons which should be
  * obvious.  However, for idle priority, there is a potential for
@@ -367,32 +445,31 @@
  * due to a CPU-bound normal process).  Fix me!  XXX
  */
 #if 0
- 			if (RTP_PRIO_IS_REALTIME(rtp.type)) {
+		if (RTP_PRIO_IS_REALTIME(rtp.type)) {
 #else
-			if (rtp.type != RTP_PRIO_NORMAL) {
+		if (rtp.type != RTP_PRIO_NORMAL) {
 #endif
-				error = EPERM;
+			error = priv_check(td, PRIV_SCHED_RTPRIO);
+			if (error)
 				break;
-			}
 		}
 
 		/*
 		 * If we are setting our own priority, set just our
-		 * KSEGRP but if we are doing another process,
-		 * do all the groups on that process. If we
+		 * thread but if we are doing another process,
+		 * do all the threads on that process. If we
 		 * specify our own pid we do the latter.
 		 */
-		mtx_lock_spin(&sched_lock);
+		PROC_SLOCK(p);
 		if (uap->pid == 0) {
-			error = rtp_to_pri(&rtp, td->td_ksegrp);
+			error = rtp_to_pri(&rtp, td);
 		} else {
-			FOREACH_KSEGRP_IN_PROC(p, kg) {
-				if ((error = rtp_to_pri(&rtp, kg)) != 0) {
+			FOREACH_THREAD_IN_PROC(p, td) {
+				if ((error = rtp_to_pri(&rtp, td)) != 0)
 					break;
-				}
 			}
 		}
-		mtx_unlock_spin(&sched_lock);
+		PROC_SUNLOCK(p);
 		break;
 	default:
 		error = EINVAL;
@@ -403,51 +480,61 @@
 }
 
 int
-rtp_to_pri(struct rtprio *rtp, struct ksegrp *kg)
+rtp_to_pri(struct rtprio *rtp, struct thread *td)
 {
+	u_char	newpri;
+	u_char	oldpri;
 
-	mtx_assert(&sched_lock, MA_OWNED);
 	if (rtp->prio > RTP_PRIO_MAX)
 		return (EINVAL);
+	thread_lock(td);
 	switch (RTP_PRIO_BASE(rtp->type)) {
 	case RTP_PRIO_REALTIME:
-		kg->kg_user_pri = PRI_MIN_REALTIME + rtp->prio;
+		newpri = PRI_MIN_REALTIME + rtp->prio;
 		break;
 	case RTP_PRIO_NORMAL:
-		kg->kg_user_pri = PRI_MIN_TIMESHARE + rtp->prio;
+		newpri = PRI_MIN_TIMESHARE + rtp->prio;
 		break;
 	case RTP_PRIO_IDLE:
-		kg->kg_user_pri = PRI_MIN_IDLE + rtp->prio;
+		newpri = PRI_MIN_IDLE + rtp->prio;
 		break;
 	default:
+		thread_unlock(td);
 		return (EINVAL);
 	}
-	sched_class(kg, rtp->type);
-	if (curthread->td_ksegrp == kg) {
-		sched_prio(curthread, kg->kg_user_pri); /* XXX dubious */
-	}
+	sched_class(td, rtp->type);	/* XXX fix */
+	oldpri = td->td_user_pri;
+	sched_user_prio(td, newpri);
+	if (curthread == td)
+		sched_prio(curthread, td->td_user_pri); /* XXX dubious */
+	if (TD_ON_UPILOCK(td) && oldpri != newpri) {
+		thread_unlock(td);
+		umtx_pi_adjust(td, oldpri);
+	} else
+		thread_unlock(td);
 	return (0);
 }
 
 void
-pri_to_rtp(struct ksegrp *kg, struct rtprio *rtp)
+pri_to_rtp(struct thread *td, struct rtprio *rtp)
 {
 
-	mtx_assert(&sched_lock, MA_OWNED);
-	switch (PRI_BASE(kg->kg_pri_class)) {
+	thread_lock(td);
+	switch (PRI_BASE(td->td_pri_class)) {
 	case PRI_REALTIME:
-		rtp->prio = kg->kg_user_pri - PRI_MIN_REALTIME;
+		rtp->prio = td->td_base_user_pri - PRI_MIN_REALTIME;
 		break;
 	case PRI_TIMESHARE:
-		rtp->prio = kg->kg_user_pri - PRI_MIN_TIMESHARE;
+		rtp->prio = td->td_base_user_pri - PRI_MIN_TIMESHARE;
 		break;
 	case PRI_IDLE:
-		rtp->prio = kg->kg_user_pri - PRI_MIN_IDLE;
+		rtp->prio = td->td_base_user_pri - PRI_MIN_IDLE;
 		break;
 	default:
 		break;
 	}
-	rtp->type = kg->kg_pri_class;
+	rtp->type = td->td_pri_class;
+	thread_unlock(td);
 }
 
 #if defined(COMPAT_43)
@@ -457,9 +544,6 @@
 	struct	orlimit *rlp;
 };
 #endif
-/*
- * MPSAFE
- */
 int
 osetrlimit(td, uap)
 	struct thread *td;
@@ -483,9 +567,6 @@
 	struct	orlimit *rlp;
 };
 #endif
-/*
- * MPSAFE
- */
 int
 ogetrlimit(td, uap)
 	struct thread *td;
@@ -525,9 +606,6 @@
 	struct	rlimit *rlp;
 };
 #endif
-/*
- * MPSAFE
- */
 int
 setrlimit(td, uap)
 	struct thread *td;
@@ -542,6 +620,41 @@
 	return (error);
 }
 
+static void
+lim_cb(void *arg)
+{
+	struct rlimit rlim;
+	struct thread *td;
+	struct proc *p;
+
+	p = arg;
+	PROC_LOCK_ASSERT(p, MA_OWNED);
+	/*
+	 * Check if the process exceeds its cpu resource allocation.  If
+	 * it reaches the max, arrange to kill the process in ast().
+	 */
+	if (p->p_cpulimit == RLIM_INFINITY)
+		return;
+	PROC_SLOCK(p);
+	FOREACH_THREAD_IN_PROC(p, td) {
+		thread_lock(td);
+		ruxagg(&p->p_rux, td);
+		thread_unlock(td);
+	}
+	PROC_SUNLOCK(p);
+	if (p->p_rux.rux_runtime > p->p_cpulimit * cpu_tickrate()) {
+		lim_rlimit(p, RLIMIT_CPU, &rlim);
+		if (p->p_rux.rux_runtime >= rlim.rlim_max * cpu_tickrate()) {
+			killproc(p, "exceeded maximum CPU limit");
+		} else {
+			if (p->p_cpulimit < rlim.rlim_max)
+				p->p_cpulimit += 5;
+			psignal(p, SIGXCPU);
+		}
+	}
+	callout_reset(&p->p_limco, hz, lim_cb, p);
+}
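
lim_cb() is the callout that makes RLIMIT_CPU enforcement work without a per-tick
scan: once a second it aggregates the threads' runtimes and either posts SIGXCPU at
the soft limit or kills the process at the hard limit.  The behaviour can be observed
from userland:

#include <sys/resource.h>
#include <err.h>
#include <signal.h>
#include <stdlib.h>
#include <unistd.h>

static void
xcpu(int sig)
{
	(void)sig;
	/* write() is async-signal-safe; printf() is not. */
	write(2, "SIGXCPU: soft CPU limit exceeded\n", 33);
	_exit(0);
}

int
main(void)
{
	struct rlimit rl = { .rlim_cur = 1, .rlim_max = 5 };	/* seconds */
	volatile unsigned long n = 0;

	if (setrlimit(RLIMIT_CPU, &rl) == -1)
		err(1, "setrlimit");
	signal(SIGXCPU, xcpu);

	/* Burn CPU until the kernel's limit callout notices. */
	for (;;)
		n++;
}
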
+
 int
 kern_setrlimit(td, which, limp)
 	struct thread *td;
@@ -551,7 +664,7 @@
 	struct plimit *newlim, *oldlim;
 	struct proc *p;
 	register struct rlimit *alimp;
-	rlim_t oldssiz;
+	struct rlimit oldssiz;
 	int error;
 
 	if (which >= RLIM_NLIMITS)
@@ -565,7 +678,7 @@
 	if (limp->rlim_max < 0)
 		limp->rlim_max = RLIM_INFINITY;
 
-	oldssiz = 0;
+	oldssiz.rlim_cur = 0;
 	p = td->td_proc;
 	newlim = lim_alloc();
 	PROC_LOCK(p);
@@ -573,7 +686,7 @@
 	alimp = &oldlim->pl_rlimit[which];
 	if (limp->rlim_cur > alimp->rlim_max ||
 	    limp->rlim_max > alimp->rlim_max)
-		if ((error = suser_cred(td->td_ucred, SUSER_ALLOWJAIL))) {
+		if ((error = priv_check(td, PRIV_PROC_SETRLIMIT))) {
 			PROC_UNLOCK(p);
 			lim_free(newlim);
 			return (error);
@@ -586,9 +699,12 @@
 	switch (which) {
 
 	case RLIMIT_CPU:
-		mtx_lock_spin(&sched_lock);
+		if (limp->rlim_cur != RLIM_INFINITY &&
+		    p->p_cpulimit == RLIM_INFINITY)
+			callout_reset(&p->p_limco, hz, lim_cb, p);
+		PROC_SLOCK(p);
 		p->p_cpulimit = limp->rlim_cur;
-		mtx_unlock_spin(&sched_lock);
+		PROC_SUNLOCK(p);
 		break;
 	case RLIMIT_DATA:
 		if (limp->rlim_cur > maxdsiz)
@@ -602,7 +718,10 @@
 			limp->rlim_cur = maxssiz;
 		if (limp->rlim_max > maxssiz)
 			limp->rlim_max = maxssiz;
-		oldssiz = alimp->rlim_cur;
+		oldssiz = *alimp;
+		if (td->td_proc->p_sysent->sv_fixlimit != NULL)
+			td->td_proc->p_sysent->sv_fixlimit(&oldssiz,
+			    RLIMIT_STACK);
 		break;
 
 	case RLIMIT_NOFILE:
@@ -623,6 +742,8 @@
 			limp->rlim_max = 1;
 		break;
 	}
+	if (td->td_proc->p_sysent->sv_fixlimit != NULL)
+		td->td_proc->p_sysent->sv_fixlimit(limp, which);
 	*alimp = *limp;
 	p->p_limit = newlim;
 	PROC_UNLOCK(p);
@@ -634,20 +755,21 @@
 		 * "rlim_cur" bytes accessible.  If stack limit is going
 		 * up make more accessible, if going down make inaccessible.
 		 */
-		if (limp->rlim_cur != oldssiz) {
+		if (limp->rlim_cur != oldssiz.rlim_cur) {
 			vm_offset_t addr;
 			vm_size_t size;
 			vm_prot_t prot;
 
-			if (limp->rlim_cur > oldssiz) {
+			if (limp->rlim_cur > oldssiz.rlim_cur) {
 				prot = p->p_sysent->sv_stackprot;
-				size = limp->rlim_cur - oldssiz;
+				size = limp->rlim_cur - oldssiz.rlim_cur;
 				addr = p->p_sysent->sv_usrstack -
 				    limp->rlim_cur;
 			} else {
 				prot = VM_PROT_NONE;
-				size = oldssiz - limp->rlim_cur;
-				addr = p->p_sysent->sv_usrstack - oldssiz;
+				size = oldssiz.rlim_cur - limp->rlim_cur;
+				addr = p->p_sysent->sv_usrstack -
+				    oldssiz.rlim_cur;
 			}
 			addr = trunc_page(addr);
 			size = round_page(size);
@@ -656,12 +778,6 @@
 		}
 	}
 
-	if (td->td_proc->p_sysent->sv_fixlimits != NULL) {
-		struct image_params imgp;
-
-		imgp.proc = td->td_proc;
-		td->td_proc->p_sysent->sv_fixlimits(&imgp);
-	}
 	return (0);
 }
 
@@ -671,9 +787,6 @@
 	struct	rlimit *rlp;
 };
 #endif
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 getrlimit(td, uap)
@@ -695,125 +808,124 @@
 }
 
 /*
- * Transform the running time and tick information in proc p into user,
- * system, and interrupt time usage.
+ * Transform the running time and tick information for children of proc p
+ * into user and system time usage.
  */
 void
-calcru(p, up, sp)
+calccru(p, up, sp)
 	struct proc *p;
 	struct timeval *up;
 	struct timeval *sp;
 {
-	struct bintime bt;
-	struct rusage_ext rux;
-	struct thread *td;
-	int bt_valid;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
-	mtx_assert(&sched_lock, MA_NOTOWNED);
-	bt_valid = 0;
-	mtx_lock_spin(&sched_lock);
-	rux = p->p_rux;
-	FOREACH_THREAD_IN_PROC(p, td) {
-		if (TD_IS_RUNNING(td)) {
-			/*
-			 * Adjust for the current time slice.  This is
-			 * actually fairly important since the error here is
-			 * on the order of a time quantum which is much
-			 * greater than the precision of binuptime().
-			 */
-			KASSERT(td->td_oncpu != NOCPU,
-			    ("%s: running thread has no CPU", __func__));
-			if (!bt_valid) {
-				binuptime(&bt);
-				bt_valid = 1;
-			}
-			bintime_add(&rux.rux_runtime, &bt);
-			bintime_sub(&rux.rux_runtime,
-			    &pcpu_find(td->td_oncpu)->pc_switchtime);
-		}
-	}
-	mtx_unlock_spin(&sched_lock);
-	calcru1(p, &rux, up, sp);
-	p->p_rux.rux_uu = rux.rux_uu;
-	p->p_rux.rux_su = rux.rux_su;
-	p->p_rux.rux_iu = rux.rux_iu;
+	calcru1(p, &p->p_crux, up, sp);
 }
 
+/*
+ * Transform the running time and tick information in proc p into user
+ * and system time usage.  If appropriate, include the current time slice
+ * on this CPU.
+ */
 void
-calccru(p, up, sp)
-	struct proc *p;
-	struct timeval *up;
-	struct timeval *sp;
+calcru(struct proc *p, struct timeval *up, struct timeval *sp)
 {
+	struct thread *td;
+	uint64_t u;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
-	calcru1(p, &p->p_crux, up, sp);
+	PROC_SLOCK_ASSERT(p, MA_OWNED);
+	/*
+	 * If we are getting stats for the current process, then add in the
+	 * stats that this thread has accumulated in its current time slice.
+	 * We reset the thread and CPU state as if we had performed a context
+	 * switch right here.
+	 */
+	td = curthread;
+	if (td->td_proc == p) {
+		u = cpu_ticks();
+		p->p_rux.rux_runtime += u - PCPU_GET(switchtime);
+		PCPU_SET(switchtime, u);
+	}
+	/* Make sure the per-thread stats are current. */
+	FOREACH_THREAD_IN_PROC(p, td) {
+		if (td->td_runtime == 0)
+			continue;
+		thread_lock(td);
+		ruxagg(&p->p_rux, td);
+		thread_unlock(td);
+	}
+	calcru1(p, &p->p_rux, up, sp);
 }
 
 static void
-calcru1(p, ruxp, up, sp)
-	struct proc *p;
-	struct rusage_ext *ruxp;
-	struct timeval *up;
-	struct timeval *sp;
+calcru1(struct proc *p, struct rusage_ext *ruxp, struct timeval *up,
+    struct timeval *sp)
 {
-	struct timeval tv;
-	/* {user, system, interrupt, total} {ticks, usec}; previous tu: */
-	u_int64_t ut, uu, st, su, it, iu, tt, tu, ptu;
+	/* {user, system, interrupt, total} {ticks, usec}: */
+	u_int64_t ut, uu, st, su, it, tt, tu;
 
 	ut = ruxp->rux_uticks;
 	st = ruxp->rux_sticks;
 	it = ruxp->rux_iticks;
 	tt = ut + st + it;
 	if (tt == 0) {
+		/* Avoid divide by zero */
 		st = 1;
 		tt = 1;
 	}
-	bintime2timeval(&ruxp->rux_runtime, &tv);
-	tu = (u_int64_t)tv.tv_sec * 1000000 + tv.tv_usec;
-	ptu = ruxp->rux_uu + ruxp->rux_su + ruxp->rux_iu;
-	if (tu < ptu) {
-		printf(
-"calcru: runtime went backwards from %ju usec to %ju usec for pid %d (%s)\n",
-		    (uintmax_t)ptu, (uintmax_t)tu, p->p_pid, p->p_comm);
-		tu = ptu;
-	}
+	tu = cputick2usec(ruxp->rux_runtime);
 	if ((int64_t)tu < 0) {
+		/* XXX: this should be an assert /phk */
 		printf("calcru: negative runtime of %jd usec for pid %d (%s)\n",
 		    (intmax_t)tu, p->p_pid, p->p_comm);
-		tu = ptu;
+		tu = ruxp->rux_tu;
 	}
 
-	/* Subdivide tu. */
-	uu = (tu * ut) / tt;
-	su = (tu * st) / tt;
-	iu = tu - uu - su;
-
-	/* Enforce monotonicity. */
-	if (uu < ruxp->rux_uu || su < ruxp->rux_su || iu < ruxp->rux_iu) {
+	if (tu >= ruxp->rux_tu) {
+		/*
+		 * The normal case, time increased.
+		 * Enforce monotonicity of bucketed numbers.
+		 */
+		uu = (tu * ut) / tt;
 		if (uu < ruxp->rux_uu)
 			uu = ruxp->rux_uu;
-		else if (uu + ruxp->rux_su + ruxp->rux_iu > tu)
-			uu = tu - ruxp->rux_su - ruxp->rux_iu;
-		if (st == 0)
+		su = (tu * st) / tt;
+		if (su < ruxp->rux_su)
 			su = ruxp->rux_su;
-		else {
-			su = ((tu - uu) * st) / (st + it);
-			if (su < ruxp->rux_su)
-				su = ruxp->rux_su;
-			else if (uu + su + ruxp->rux_iu > tu)
-				su = tu - uu - ruxp->rux_iu;
-		}
-		KASSERT(uu + su + ruxp->rux_iu <= tu,
-		    ("calcru: monotonisation botch 1"));
-		iu = tu - uu - su;
-		KASSERT(iu >= ruxp->rux_iu,
-		    ("calcru: monotonisation botch 2"));
+	} else if (tu + 3 > ruxp->rux_tu || 101 * tu > 100 * ruxp->rux_tu) {
+		/* 
+		 * When we calibrate the cputicker, it is not uncommon to
+		 * see the presumably fixed frequency increase slightly over
+		 * time as a result of thermal stabilization and NTP
+		 * discipline (of the reference clock).  We therefore ignore
+		 * a bit of backwards slop because we expect to catch up
+ 		 * shortly.  We use a 3 microsecond limit to catch low
+		 * counts and a 1% limit for high counts.
+		 */
+		uu = ruxp->rux_uu;
+		su = ruxp->rux_su;
+		tu = ruxp->rux_tu;
+	} else { /* tu < ruxp->rux_tu */
+		/*
+		 * What happened here was likely that a laptop, which ran at
+		 * a reduced clock frequency at boot, kicked into high gear.
+		 * The wisdom of spamming this message in that case is
+		 * dubious, but it might also be indicative of something
+		 * serious, so let's keep it and hope laptops can be made
+		 * more truthful about their CPU speed via ACPI.
+		 */
+		printf("calcru: runtime went backwards from %ju usec "
+		    "to %ju usec for pid %d (%s)\n",
+		    (uintmax_t)ruxp->rux_tu, (uintmax_t)tu,
+		    p->p_pid, p->p_comm);
+		uu = (tu * ut) / tt;
+		su = (tu * st) / tt;
 	}
+
 	ruxp->rux_uu = uu;
 	ruxp->rux_su = su;
-	ruxp->rux_iu = iu;
+	ruxp->rux_tu = tu;
 
 	up->tv_sec = uu / 1000000;
 	up->tv_usec = uu % 1000000;
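
The split above is simple proportional bucketing: the total runtime tu (in
microseconds, from cputick2usec()) is divided between user and system time according
to the statclock tick counts, and each bucket is clamped so it never runs backwards
between successive calls.  A worked userland version of the arithmetic, with invented
sample numbers:

/*
 * Proportional split of total runtime into user/system time, as in
 * calcru1().  The sample numbers are purely illustrative.
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t ut = 700, st = 280, it = 20;	/* user/sys/intr statclock ticks */
	uint64_t tt = ut + st + it;
	uint64_t tu = 10 * 1000 * 1000;		/* 10 s of runtime, in usec */
	uint64_t prev_uu = 6900000, prev_su = 2700000;	/* previously reported */
	uint64_t uu, su;

	if (tt == 0)			/* avoid divide by zero, as the kernel does */
		st = tt = 1;

	uu = (tu * ut) / tt;		/* 7,000,000 usec of user time */
	su = (tu * st) / tt;		/* 2,800,000 usec of system time */

	/* Enforce monotonicity against the previously reported values. */
	if (uu < prev_uu)
		uu = prev_uu;
	if (su < prev_su)
		su = prev_su;

	printf("user %ju.%06ju s, sys %ju.%06ju s\n",
	    (uintmax_t)(uu / 1000000), (uintmax_t)(uu % 1000000),
	    (uintmax_t)(su / 1000000), (uintmax_t)(su % 1000000));
	return (0);
}
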
@@ -827,9 +939,6 @@
 	struct	rusage *rusage;
 };
 #endif
-/*
- * MPSAFE
- */
 int
 getrusage(td, uap)
 	register struct thread *td;
@@ -857,8 +966,8 @@
 	switch (who) {
 
 	case RUSAGE_SELF:
-		*rup = p->p_stats->p_ru;
-		calcru(p, &rup->ru_utime, &rup->ru_stime);
+		rufetchcalc(p, rup, &rup->ru_utime,
+		    &rup->ru_stime);
 		break;
 
 	case RUSAGE_CHILDREN:
@@ -875,22 +984,11 @@
 }
 
 void
-ruadd(ru, rux, ru2, rux2)
-	struct rusage *ru;
-	struct rusage_ext *rux;
-	struct rusage *ru2;
-	struct rusage_ext *rux2;
+rucollect(struct rusage *ru, struct rusage *ru2)
 {
-	register long *ip, *ip2;
-	register int i;
+	long *ip, *ip2;
+	int i;
 
-	bintime_add(&rux->rux_runtime, &rux2->rux_runtime);
-	rux->rux_uticks += rux2->rux_uticks;
-	rux->rux_sticks += rux2->rux_sticks;
-	rux->rux_iticks += rux2->rux_iticks;
-	rux->rux_uu += rux2->rux_uu;
-	rux->rux_su += rux2->rux_su;
-	rux->rux_iu += rux2->rux_iu;
 	if (ru->ru_maxrss < ru2->ru_maxrss)
 		ru->ru_maxrss = ru2->ru_maxrss;
 	ip = &ru->ru_first;
@@ -899,6 +997,78 @@
 		*ip++ += *ip2++;
 }
 
+void
+ruadd(struct rusage *ru, struct rusage_ext *rux, struct rusage *ru2,
+    struct rusage_ext *rux2)
+{
+
+	rux->rux_runtime += rux2->rux_runtime;
+	rux->rux_uticks += rux2->rux_uticks;
+	rux->rux_sticks += rux2->rux_sticks;
+	rux->rux_iticks += rux2->rux_iticks;
+	rux->rux_uu += rux2->rux_uu;
+	rux->rux_su += rux2->rux_su;
+	rux->rux_tu += rux2->rux_tu;
+	rucollect(ru, ru2);
+}
+
+/*
+ * Aggregate tick counts into the proc's rusage_ext.
+ */
+void
+ruxagg(struct rusage_ext *rux, struct thread *td)
+{
+
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
+	PROC_SLOCK_ASSERT(td->td_proc, MA_OWNED);
+	rux->rux_runtime += td->td_runtime;
+	rux->rux_uticks += td->td_uticks;
+	rux->rux_sticks += td->td_sticks;
+	rux->rux_iticks += td->td_iticks;
+	td->td_runtime = 0;
+	td->td_uticks = 0;
+	td->td_iticks = 0;
+	td->td_sticks = 0;
+}
+
+/*
+ * Update the rusage_ext structure and fetch a valid aggregate rusage
+ * for proc p if storage for one is supplied.
+ */
+void
+rufetch(struct proc *p, struct rusage *ru)
+{
+	struct thread *td;
+
+	PROC_SLOCK_ASSERT(p, MA_OWNED);
+
+	*ru = p->p_ru;
+	if (p->p_numthreads > 0)  {
+		FOREACH_THREAD_IN_PROC(p, td) {
+			thread_lock(td);
+			ruxagg(&p->p_rux, td);
+			thread_unlock(td);
+			rucollect(ru, &td->td_ru);
+		}
+	}
+}
+
+/*
+ * Atomically perform a rufetch and a calcru together.
+ * Consumers can safely assume that calcru is executed only after
+ * rufetch has completed.
+ */
+void
+rufetchcalc(struct proc *p, struct rusage *ru, struct timeval *up,
+    struct timeval *sp)
+{
+
+	PROC_SLOCK(p);
+	rufetch(p, ru);
+	calcru(p, up, sp);
+	PROC_SUNLOCK(p);
+}
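
rufetchcalc() is what getrusage(RUSAGE_SELF) now calls to obtain a consistent
snapshot; from userland the interface is unchanged and the aggregated figures can be
inspected directly:

#include <sys/resource.h>
#include <err.h>
#include <stdio.h>

int
main(void)
{
	struct rusage ru;
	volatile unsigned long i, n = 0;

	/* Do a little work so the times are non-zero. */
	for (i = 0; i < 50000000UL; i++)
		n += i;

	if (getrusage(RUSAGE_SELF, &ru) == -1)
		err(1, "getrusage");
	printf("user %ld.%06lds sys %ld.%06lds maxrss %ld KiB\n",
	    (long)ru.ru_utime.tv_sec, (long)ru.ru_utime.tv_usec,
	    (long)ru.ru_stime.tv_sec, (long)ru.ru_stime.tv_usec,
	    ru.ru_maxrss);
	return (0);
}
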
+
 /*
  * Allocate a new resource limits structure and initialize its
  * reference count and mutex pointer.
@@ -909,8 +1079,7 @@
 	struct plimit *limp;
 
 	limp = malloc(sizeof(struct plimit), M_PLIMIT, M_WAITOK);
-	limp->pl_refcnt = 1;
-	limp->pl_mtx = mtx_pool_alloc(mtxpool_sleep);
+	refcount_init(&limp->pl_refcnt, 1);
 	return (limp);
 }
 
@@ -919,25 +1088,27 @@
 	struct plimit *limp;
 {
 
-	LIM_LOCK(limp);
-	limp->pl_refcnt++;
-	LIM_UNLOCK(limp);
+	refcount_acquire(&limp->pl_refcnt);
 	return (limp);
 }
 
 void
+lim_fork(struct proc *p1, struct proc *p2)
+{
+	p2->p_limit = lim_hold(p1->p_limit);
+	callout_init_mtx(&p2->p_limco, &p2->p_mtx, 0);
+	if (p1->p_cpulimit != RLIM_INFINITY)
+		callout_reset(&p2->p_limco, hz, lim_cb, p2);
+}
+
+void
 lim_free(limp)
 	struct plimit *limp;
 {
 
-	LIM_LOCK(limp);
 	KASSERT(limp->pl_refcnt > 0, ("plimit refcnt underflow"));
-	if (--limp->pl_refcnt == 0) {
-		LIM_UNLOCK(limp);
+	if (refcount_release(&limp->pl_refcnt))
 		free((void *)limp, M_PLIMIT);
-		return;
-	}
-	LIM_UNLOCK(limp);
 }
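
The plimit changes replace a mutex-pool-protected reference count with the refcount(9)
primitives, so lim_hold() and lim_free() no longer take a lock.  The same pattern in
portable C11 atomics, on a hypothetical plimit-like type (a sketch, not the kernel
code):

#include <stdatomic.h>
#include <stdlib.h>

struct plimit_like {
	atomic_uint	refcnt;
	/* ... the resource limit array would live here ... */
};

static struct plimit_like *
lim_alloc_like(void)
{
	struct plimit_like *limp = malloc(sizeof(*limp));

	if (limp != NULL)
		atomic_init(&limp->refcnt, 1);		/* refcount_init() */
	return (limp);
}

static struct plimit_like *
lim_hold_like(struct plimit_like *limp)
{
	atomic_fetch_add(&limp->refcnt, 1);		/* refcount_acquire() */
	return (limp);
}

static void
lim_free_like(struct plimit_like *limp)
{
	/* refcount_release(): dropping the last reference frees the object. */
	if (atomic_fetch_sub(&limp->refcnt, 1) == 1)
		free(limp);
}

int
main(void)
{
	struct plimit_like *l = lim_alloc_like();

	lim_hold_like(l);
	lim_free_like(l);
	lim_free_like(l);
	return (0);
}
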
 
 /*
@@ -991,6 +1162,8 @@
 	KASSERT(which >= 0 && which < RLIM_NLIMITS,
 	    ("request for invalid resource limit"));
 	*rlp = p->p_limit->pl_rlimit[which];
+	if (p->p_sysent->sv_fixlimit != NULL)
+		p->p_sysent->sv_fixlimit(rlp, which);
 }
 
 /*
@@ -1088,7 +1261,7 @@
  *   that we don't need to free, simply unlock and return.
  * Suboptimal case:
  *   If refcount lowering results in need to free, bump the count
- *   back up, loose the lock and aquire the locks in the proper
+ *   back up, lose the lock and acquire the locks in the proper
  *   order to try again.
  */
 void
--- /dev/null
+++ sys/kern/vfs_extattr.c
@@ -0,0 +1,785 @@
+/*-
+ * Copyright (c) 1999-2001 Robert N. M. Watson
+ * All rights reserved.
+ *
+ * This software was developed by Robert Watson for the TrustedBSD Project.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: src/sys/kern/vfs_extattr.c,v 1.431 2006/12/23 00:30:03 rwatson Exp $");
+
+#include "opt_mac.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/mount.h>
+#include <sys/mutex.h>
+#include <sys/sysproto.h>
+#include <sys/namei.h>
+#include <sys/filedesc.h>
+#include <sys/limits.h>
+#include <sys/vnode.h>
+#include <sys/proc.h>
+#include <sys/extattr.h>
+
+#include <security/audit/audit.h>
+#include <security/mac/mac_framework.h>
+
+/*
+ * Syscall to push extended attribute configuration information into the VFS.
+ * Accepts a path, which it converts to a mountpoint, as well as a command
+ * (int cmd), and attribute name and misc data.
+ *
+ * Currently this is used only by UFS1 extended attributes.
+ */
+int
+extattrctl(td, uap)
+	struct thread *td;
+	struct extattrctl_args /* {
+		const char *path;
+		int cmd;
+		const char *filename;
+		int attrnamespace;
+		const char *attrname;
+	} */ *uap;
+{
+	struct vnode *filename_vp;
+	struct nameidata nd;
+	struct mount *mp, *mp_writable;
+	char attrname[EXTATTR_MAXNAMELEN];
+	int vfslocked, fnvfslocked, error;
+
+	AUDIT_ARG(cmd, uap->cmd);
+	AUDIT_ARG(value, uap->attrnamespace);
+	/*
+	 * uap->attrname is not always defined.  We check again later when we
+	 * invoke the VFS call so as to pass in NULL there if needed.
+	 */
+	if (uap->attrname != NULL) {
+		error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN,
+		    NULL);
+		if (error)
+			return (error);
+	}
+	AUDIT_ARG(text, attrname);
+
+	vfslocked = fnvfslocked = 0;
+	/*
+	 * uap->filename is not always defined.  If it is, grab a vnode lock,
+	 * which VFS_EXTATTRCTL() will later release.
+	 */
+	filename_vp = NULL;
+	if (uap->filename != NULL) {
+		NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | LOCKLEAF |
+		    AUDITVNODE2, UIO_USERSPACE, uap->filename, td);
+		error = namei(&nd);
+		if (error)
+			return (error);
+		fnvfslocked = NDHASGIANT(&nd);
+		filename_vp = nd.ni_vp;
+		NDFREE(&nd, NDF_NO_VP_RELE | NDF_NO_VP_UNLOCK);
+	}
+
+	/* uap->path is always defined. */
+	NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | AUDITVNODE1, UIO_USERSPACE,
+	    uap->path, td);
+	error = namei(&nd);
+	if (error) {
+		if (filename_vp != NULL)
+			vput(filename_vp);
+		goto out;
+	}
+	vfslocked = NDHASGIANT(&nd);
+	mp = nd.ni_vp->v_mount;
+	error = vn_start_write(nd.ni_vp, &mp_writable, V_WAIT | PCATCH);
+	NDFREE(&nd, 0);
+	if (error) {
+		if (filename_vp != NULL)
+			vput(filename_vp);
+		goto out;
+	}
+
+	error = VFS_EXTATTRCTL(mp, uap->cmd, filename_vp, uap->attrnamespace,
+	    uap->attrname != NULL ? attrname : NULL, td);
+
+	vn_finished_write(mp_writable);
+	/*
+	 * VFS_EXTATTRCTL will have unlocked, but not de-ref'd, filename_vp,
+	 * so vrele it if it is defined.
+	 */
+	if (filename_vp != NULL)
+		vrele(filename_vp);
+out:
+	VFS_UNLOCK_GIANT(fnvfslocked);
+	VFS_UNLOCK_GIANT(vfslocked);
+	return (error);
+}
+
+/*-
+ * Set a named extended attribute on a file or directory
+ *
+ * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace",
+ *            kernelspace string pointer "attrname", userspace buffer
+ *            pointer "data", buffer length "nbytes", thread "td".
+ * Returns: 0 on success, an error number otherwise
+ * Locks: none
+ * References: vp must be a valid reference for the duration of the call
+ */
+static int
+extattr_set_vp(struct vnode *vp, int attrnamespace, const char *attrname,
+    void *data, size_t nbytes, struct thread *td)
+{
+	struct mount *mp;
+	struct uio auio;
+	struct iovec aiov;
+	ssize_t cnt;
+	int error;
+
+	VFS_ASSERT_GIANT(vp->v_mount);
+	error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
+	if (error)
+		return (error);
+	VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+
+	aiov.iov_base = data;
+	aiov.iov_len = nbytes;
+	auio.uio_iov = &aiov;
+	auio.uio_iovcnt = 1;
+	auio.uio_offset = 0;
+	if (nbytes > INT_MAX) {
+		error = EINVAL;
+		goto done;
+	}
+	auio.uio_resid = nbytes;
+	auio.uio_rw = UIO_WRITE;
+	auio.uio_segflg = UIO_USERSPACE;
+	auio.uio_td = td;
+	cnt = nbytes;
+
+#ifdef MAC
+	error = mac_check_vnode_setextattr(td->td_ucred, vp, attrnamespace,
+	    attrname, &auio);
+	if (error)
+		goto done;
+#endif
+
+	error = VOP_SETEXTATTR(vp, attrnamespace, attrname, &auio,
+	    td->td_ucred, td);
+	cnt -= auio.uio_resid;
+	td->td_retval[0] = cnt;
+
+done:
+	VOP_UNLOCK(vp, 0, td);
+	vn_finished_write(mp);
+	return (error);
+}
+
+int
+extattr_set_fd(td, uap)
+	struct thread *td;
+	struct extattr_set_fd_args /* {
+		int fd;
+		int attrnamespace;
+		const char *attrname;
+		void *data;
+		size_t nbytes;
+	} */ *uap;
+{
+	struct file *fp;
+	char attrname[EXTATTR_MAXNAMELEN];
+	int vfslocked, error;
+
+	AUDIT_ARG(fd, uap->fd);
+	AUDIT_ARG(value, uap->attrnamespace);
+	error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
+	if (error)
+		return (error);
+	AUDIT_ARG(text, attrname);
+
+	error = getvnode(td->td_proc->p_fd, uap->fd, &fp);
+	if (error)
+		return (error);
+
+	vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
+	error = extattr_set_vp(fp->f_vnode, uap->attrnamespace,
+	    attrname, uap->data, uap->nbytes, td);
+	fdrop(fp, td);
+	VFS_UNLOCK_GIANT(vfslocked);
+
+	return (error);
+}
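
extattr_set_fd() and the _file/_link variants below are thin wrappers that copy in the
attribute name and hand off to extattr_set_vp().  From userland, on a filesystem with
extended attribute support (for example UFS2), the call pair looks like this (the file
path and attribute name are arbitrary):

#include <sys/types.h>
#include <sys/extattr.h>
#include <err.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	char buf[64];
	ssize_t len;
	int fd;

	fd = open("/tmp/extattr.demo", O_RDWR | O_CREAT, 0644);
	if (fd == -1)
		err(1, "open");

	/* Store a small user-namespace attribute on the file. */
	if (extattr_set_fd(fd, EXTATTR_NAMESPACE_USER, "comment",
	    "hello", 5) == -1)
		err(1, "extattr_set_fd");

	/* Read it back; a NULL buffer would return only the length. */
	len = extattr_get_fd(fd, EXTATTR_NAMESPACE_USER, "comment",
	    buf, sizeof(buf));
	if (len == -1)
		err(1, "extattr_get_fd");
	printf("comment = %.*s\n", (int)len, buf);

	close(fd);
	return (0);
}
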
+
+int
+extattr_set_file(td, uap)
+	struct thread *td;
+	struct extattr_set_file_args /* {
+		const char *path;
+		int attrnamespace;
+		const char *attrname;
+		void *data;
+		size_t nbytes;
+	} */ *uap;
+{
+	struct nameidata nd;
+	char attrname[EXTATTR_MAXNAMELEN];
+	int vfslocked, error;
+
+	AUDIT_ARG(value, uap->attrnamespace);
+	error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
+	if (error)
+		return (error);
+	AUDIT_ARG(text, attrname);
+
+	NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | AUDITVNODE1, UIO_USERSPACE,
+	    uap->path, td);
+	error = namei(&nd);
+	if (error)
+		return (error);
+	NDFREE(&nd, NDF_ONLY_PNBUF);
+
+	vfslocked = NDHASGIANT(&nd);
+	error = extattr_set_vp(nd.ni_vp, uap->attrnamespace, attrname,
+	    uap->data, uap->nbytes, td);
+
+	vrele(nd.ni_vp);
+	VFS_UNLOCK_GIANT(vfslocked);
+	return (error);
+}
+
+int
+extattr_set_link(td, uap)
+	struct thread *td;
+	struct extattr_set_link_args /* {
+		const char *path;
+		int attrnamespace;
+		const char *attrname;
+		void *data;
+		size_t nbytes;
+	} */ *uap;
+{
+	struct nameidata nd;
+	char attrname[EXTATTR_MAXNAMELEN];
+	int vfslocked, error;
+
+	AUDIT_ARG(value, uap->attrnamespace);
+	error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
+	if (error)
+		return (error);
+	AUDIT_ARG(text, attrname);
+
+	NDINIT(&nd, LOOKUP, MPSAFE | NOFOLLOW | AUDITVNODE1, UIO_USERSPACE,
+	    uap->path, td);
+	error = namei(&nd);
+	if (error)
+		return (error);
+	NDFREE(&nd, NDF_ONLY_PNBUF);
+
+	vfslocked = NDHASGIANT(&nd);
+	error = extattr_set_vp(nd.ni_vp, uap->attrnamespace, attrname,
+	    uap->data, uap->nbytes, td);
+
+	vrele(nd.ni_vp);
+	VFS_UNLOCK_GIANT(vfslocked);
+	return (error);
+}
+
+/*-
+ * Get a named extended attribute on a file or directory
+ *
+ * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace",
+ *            kernelspace string pointer "attrname", userspace buffer
+ *            pointer "data", buffer length "nbytes", thread "td".
+ * Returns: 0 on success, an error number otherwise
+ * Locks: none
+ * References: vp must be a valid reference for the duration of the call
+ */
+static int
+extattr_get_vp(struct vnode *vp, int attrnamespace, const char *attrname,
+    void *data, size_t nbytes, struct thread *td)
+{
+	struct uio auio, *auiop;
+	struct iovec aiov;
+	ssize_t cnt;
+	size_t size, *sizep;
+	int error;
+
+	VFS_ASSERT_GIANT(vp->v_mount);
+	VOP_LEASE(vp, td, td->td_ucred, LEASE_READ);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+
+	/*
+	 * Slightly unusual semantics: if the user provides a NULL data
+	 * pointer, they don't want to receive the data, just the maximum
+	 * read length.
+	 */
+	auiop = NULL;
+	sizep = NULL;
+	cnt = 0;
+	if (data != NULL) {
+		aiov.iov_base = data;
+		aiov.iov_len = nbytes;
+		auio.uio_iov = &aiov;
+		auio.uio_iovcnt = 1;
+		auio.uio_offset = 0;
+		if (nbytes > INT_MAX) {
+			error = EINVAL;
+			goto done;
+		}
+		auio.uio_resid = nbytes;
+		auio.uio_rw = UIO_READ;
+		auio.uio_segflg = UIO_USERSPACE;
+		auio.uio_td = td;
+		auiop = &auio;
+		cnt = nbytes;
+	} else
+		sizep = &size;
+
+#ifdef MAC
+	error = mac_check_vnode_getextattr(td->td_ucred, vp, attrnamespace,
+	    attrname, &auio);
+	if (error)
+		goto done;
+#endif
+
+	error = VOP_GETEXTATTR(vp, attrnamespace, attrname, auiop, sizep,
+	    td->td_ucred, td);
+
+	if (auiop != NULL) {
+		cnt -= auio.uio_resid;
+		td->td_retval[0] = cnt;
+	} else
+		td->td_retval[0] = size;
+
+done:
+	VOP_UNLOCK(vp, 0, td);
+	return (error);
+}
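
The NULL-data convention above is what lets a userspace caller size its buffer
before fetching; a sketch using the extattr_get_file(2) wrapper (hypothetical
attribute name, error handling trimmed):

#include <sys/types.h>
#include <sys/extattr.h>
#include <err.h>
#include <stdlib.h>

static char *
read_attr(const char *path, size_t *lenp)
{
	ssize_t len;
	char *buf;

	/* NULL data pointer: the kernel reports only the attribute size. */
	len = extattr_get_file(path, EXTATTR_NAMESPACE_USER, "backup-policy",
	    NULL, 0);
	if (len < 0)
		err(1, "extattr_get_file (size)");
	if ((buf = malloc(len)) == NULL)
		err(1, "malloc");
	len = extattr_get_file(path, EXTATTR_NAMESPACE_USER, "backup-policy",
	    buf, len);
	if (len < 0)
		err(1, "extattr_get_file");
	*lenp = len;
	return (buf);
}
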
+
+int
+extattr_get_fd(td, uap)
+	struct thread *td;
+	struct extattr_get_fd_args /* {
+		int fd;
+		int attrnamespace;
+		const char *attrname;
+		void *data;
+		size_t nbytes;
+	} */ *uap;
+{
+	struct file *fp;
+	char attrname[EXTATTR_MAXNAMELEN];
+	int vfslocked, error;
+
+	AUDIT_ARG(fd, uap->fd);
+	AUDIT_ARG(value, uap->attrnamespace);
+	error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
+	if (error)
+		return (error);
+	AUDIT_ARG(text, attrname);
+
+	error = getvnode(td->td_proc->p_fd, uap->fd, &fp);
+	if (error)
+		return (error);
+
+	vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
+	error = extattr_get_vp(fp->f_vnode, uap->attrnamespace,
+	    attrname, uap->data, uap->nbytes, td);
+
+	fdrop(fp, td);
+	VFS_UNLOCK_GIANT(vfslocked);
+	return (error);
+}
+
+int
+extattr_get_file(td, uap)
+	struct thread *td;
+	struct extattr_get_file_args /* {
+		const char *path;
+		int attrnamespace;
+		const char *attrname;
+		void *data;
+		size_t nbytes;
+	} */ *uap;
+{
+	struct nameidata nd;
+	char attrname[EXTATTR_MAXNAMELEN];
+	int vfslocked, error;
+
+	AUDIT_ARG(value, uap->attrnamespace);
+	error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
+	if (error)
+		return (error);
+	AUDIT_ARG(text, attrname);
+
+	NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | AUDITVNODE1, UIO_USERSPACE,
+	    uap->path, td);
+	error = namei(&nd);
+	if (error)
+		return (error);
+	NDFREE(&nd, NDF_ONLY_PNBUF);
+
+	vfslocked = NDHASGIANT(&nd);
+	error = extattr_get_vp(nd.ni_vp, uap->attrnamespace, attrname,
+	    uap->data, uap->nbytes, td);
+
+	vrele(nd.ni_vp);
+	VFS_UNLOCK_GIANT(vfslocked);
+	return (error);
+}
+
+int
+extattr_get_link(td, uap)
+	struct thread *td;
+	struct extattr_get_link_args /* {
+		const char *path;
+		int attrnamespace;
+		const char *attrname;
+		void *data;
+		size_t nbytes;
+	} */ *uap;
+{
+	struct nameidata nd;
+	char attrname[EXTATTR_MAXNAMELEN];
+	int vfslocked, error;
+
+	AUDIT_ARG(value, uap->attrnamespace);
+	error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
+	if (error)
+		return (error);
+	AUDIT_ARG(text, attrname);
+
+	NDINIT(&nd, LOOKUP, MPSAFE | NOFOLLOW | AUDITVNODE1, UIO_USERSPACE,
+	    uap->path, td);
+	error = namei(&nd);
+	if (error)
+		return (error);
+	NDFREE(&nd, NDF_ONLY_PNBUF);
+
+	vfslocked = NDHASGIANT(&nd);
+	error = extattr_get_vp(nd.ni_vp, uap->attrnamespace, attrname,
+	    uap->data, uap->nbytes, td);
+
+	vrele(nd.ni_vp);
+	VFS_UNLOCK_GIANT(vfslocked);
+	return (error);
+}
+
+/*
+ * extattr_delete_vp(): Delete a named extended attribute on a file or
+ *                      directory
+ *
+ * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace",
+ *            kernelspace string pointer "attrname", proc "p"
+ * Returns: 0 on success, an error number otherwise
+ * Locks: none
+ * References: vp must be a valid reference for the duration of the call
+ */
+static int
+extattr_delete_vp(struct vnode *vp, int attrnamespace, const char *attrname,
+    struct thread *td)
+{
+	struct mount *mp;
+	int error;
+
+	VFS_ASSERT_GIANT(vp->v_mount);
+	error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
+	if (error)
+		return (error);
+	VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+
+#ifdef MAC
+	error = mac_check_vnode_deleteextattr(td->td_ucred, vp, attrnamespace,
+	    attrname);
+	if (error)
+		goto done;
+#endif
+
+	error = VOP_DELETEEXTATTR(vp, attrnamespace, attrname, td->td_ucred,
+	    td);
+	if (error == EOPNOTSUPP)
+		error = VOP_SETEXTATTR(vp, attrnamespace, attrname, NULL,
+		    td->td_ucred, td);
+#ifdef MAC
+done:
+#endif
+	VOP_UNLOCK(vp, 0, td);
+	vn_finished_write(mp);
+	return (error);
+}
+
+int
+extattr_delete_fd(td, uap)
+	struct thread *td;
+	struct extattr_delete_fd_args /* {
+		int fd;
+		int attrnamespace;
+		const char *attrname;
+	} */ *uap;
+{
+	struct file *fp;
+	char attrname[EXTATTR_MAXNAMELEN];
+	int vfslocked, error;
+
+	AUDIT_ARG(fd, uap->fd);
+	AUDIT_ARG(value, uap->attrnamespace);
+	error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
+	if (error)
+		return (error);
+	AUDIT_ARG(text, attrname);
+
+	error = getvnode(td->td_proc->p_fd, uap->fd, &fp);
+	if (error)
+		return (error);
+
+	vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
+	error = extattr_delete_vp(fp->f_vnode, uap->attrnamespace,
+	    attrname, td);
+	fdrop(fp, td);
+	VFS_UNLOCK_GIANT(vfslocked);
+	return (error);
+}
+
+int
+extattr_delete_file(td, uap)
+	struct thread *td;
+	struct extattr_delete_file_args /* {
+		const char *path;
+		int attrnamespace;
+		const char *attrname;
+	} */ *uap;
+{
+	struct nameidata nd;
+	char attrname[EXTATTR_MAXNAMELEN];
+	int vfslocked, error;
+
+	AUDIT_ARG(value, uap->attrnamespace);
+	error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
+	if (error)
+		return(error);
+	AUDIT_ARG(text, attrname);
+
+	NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | AUDITVNODE1, UIO_USERSPACE,
+	    uap->path, td);
+	error = namei(&nd);
+	if (error)
+		return(error);
+	NDFREE(&nd, NDF_ONLY_PNBUF);
+
+	vfslocked = NDHASGIANT(&nd);
+	error = extattr_delete_vp(nd.ni_vp, uap->attrnamespace, attrname, td);
+	vrele(nd.ni_vp);
+	VFS_UNLOCK_GIANT(vfslocked);
+	return(error);
+}
+
+int
+extattr_delete_link(td, uap)
+	struct thread *td;
+	struct extattr_delete_link_args /* {
+		const char *path;
+		int attrnamespace;
+		const char *attrname;
+	} */ *uap;
+{
+	struct nameidata nd;
+	char attrname[EXTATTR_MAXNAMELEN];
+	int vfslocked, error;
+
+	AUDIT_ARG(value, uap->attrnamespace);
+	error = copyinstr(uap->attrname, attrname, EXTATTR_MAXNAMELEN, NULL);
+	if (error)
+		return(error);
+	AUDIT_ARG(text, attrname);
+
+	NDINIT(&nd, LOOKUP, MPSAFE | NOFOLLOW | AUDITVNODE1, UIO_USERSPACE,
+	    uap->path, td);
+	error = namei(&nd);
+	if (error)
+		return(error);
+	NDFREE(&nd, NDF_ONLY_PNBUF);
+
+	vfslocked = NDHASGIANT(&nd);
+	error = extattr_delete_vp(nd.ni_vp, uap->attrnamespace, attrname, td);
+	vrele(nd.ni_vp);
+	VFS_UNLOCK_GIANT(vfslocked);
+	return(error);
+}
+
+/*-
+ * Retrieve a list of extended attributes on a file or directory.
+ *
+ * Arguments: unlocked vnode "vp", attribute namespace "attrnamespace",
+ *            userspace buffer pointer "data", buffer length "nbytes",
+ *            thread "td".
+ * Returns: 0 on success, an error number otherwise
+ * Locks: none
+ * References: vp must be a valid reference for the duration of the call
+ */
+static int
+extattr_list_vp(struct vnode *vp, int attrnamespace, void *data,
+    size_t nbytes, struct thread *td)
+{
+	struct uio auio, *auiop;
+	size_t size, *sizep;
+	struct iovec aiov;
+	ssize_t cnt;
+	int error;
+
+	VFS_ASSERT_GIANT(vp->v_mount);
+	VOP_LEASE(vp, td, td->td_ucred, LEASE_READ);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+
+	auiop = NULL;
+	sizep = NULL;
+	cnt = 0;
+	if (data != NULL) {
+		aiov.iov_base = data;
+		aiov.iov_len = nbytes;
+		auio.uio_iov = &aiov;
+		auio.uio_iovcnt = 1;
+		auio.uio_offset = 0;
+		if (nbytes > INT_MAX) {
+			error = EINVAL;
+			goto done;
+		}
+		auio.uio_resid = nbytes;
+		auio.uio_rw = UIO_READ;
+		auio.uio_segflg = UIO_USERSPACE;
+		auio.uio_td = td;
+		auiop = &auio;
+		cnt = nbytes;
+	} else
+		sizep = &size;
+
+#ifdef MAC
+	error = mac_check_vnode_listextattr(td->td_ucred, vp, attrnamespace);
+	if (error)
+		goto done;
+#endif
+
+	error = VOP_LISTEXTATTR(vp, attrnamespace, auiop, sizep,
+	    td->td_ucred, td);
+
+	if (auiop != NULL) {
+		cnt -= auio.uio_resid;
+		td->td_retval[0] = cnt;
+	} else
+		td->td_retval[0] = size;
+
+done:
+	VOP_UNLOCK(vp, 0, td);
+	return (error);
+}
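
On the consuming side, the list handed back is a sequence of entries, each a
single unsigned length byte followed by that many name characters with no NUL
terminator (as documented for extattr_list_file(2)); a sketch of walking such
a buffer:

#include <sys/types.h>
#include <stdio.h>

/* Print each attribute name from a buffer filled by extattr_list_file(2). */
static void
print_attr_names(const unsigned char *buf, ssize_t len)
{
	ssize_t pos = 0;
	int nlen;

	while (pos < len) {
		nlen = buf[pos++];	/* one-byte length prefix */
		/* The name itself is not NUL-terminated. */
		printf("%.*s\n", nlen, (const char *)(buf + pos));
		pos += nlen;
	}
}
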
+
+
+int
+extattr_list_fd(td, uap)
+	struct thread *td;
+	struct extattr_list_fd_args /* {
+		int fd;
+		int attrnamespace;
+		void *data;
+		size_t nbytes;
+	} */ *uap;
+{
+	struct file *fp;
+	int vfslocked, error;
+
+	AUDIT_ARG(fd, uap->fd);
+	AUDIT_ARG(value, uap->attrnamespace);
+	error = getvnode(td->td_proc->p_fd, uap->fd, &fp);
+	if (error)
+		return (error);
+
+	vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
+	error = extattr_list_vp(fp->f_vnode, uap->attrnamespace, uap->data,
+	    uap->nbytes, td);
+
+	fdrop(fp, td);
+	VFS_UNLOCK_GIANT(vfslocked);
+	return (error);
+}
+
+int
+extattr_list_file(td, uap)
+	struct thread *td;
+	struct extattr_list_file_args /* {
+		const char *path;
+		int attrnamespace;
+		void *data;
+		size_t nbytes;
+	} */ *uap;
+{
+	struct nameidata nd;
+	int vfslocked, error;
+
+	AUDIT_ARG(value, uap->attrnamespace);
+	NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | AUDITVNODE1, UIO_USERSPACE,
+	    uap->path, td);
+	error = namei(&nd);
+	if (error)
+		return (error);
+	NDFREE(&nd, NDF_ONLY_PNBUF);
+
+	vfslocked = NDHASGIANT(&nd);
+	error = extattr_list_vp(nd.ni_vp, uap->attrnamespace, uap->data,
+	    uap->nbytes, td);
+
+	vrele(nd.ni_vp);
+	VFS_UNLOCK_GIANT(vfslocked);
+	return (error);
+}
+
+int
+extattr_list_link(td, uap)
+	struct thread *td;
+	struct extattr_list_link_args /* {
+		const char *path;
+		int attrnamespace;
+		void *data;
+		size_t nbytes;
+	} */ *uap;
+{
+	struct nameidata nd;
+	int vfslocked, error;
+
+	AUDIT_ARG(value, uap->attrnamespace);
+	NDINIT(&nd, LOOKUP, MPSAFE | NOFOLLOW | AUDITVNODE1, UIO_USERSPACE,
+	    uap->path, td);
+	error = namei(&nd);
+	if (error)
+		return (error);
+	NDFREE(&nd, NDF_ONLY_PNBUF);
+
+	vfslocked = NDHASGIANT(&nd);
+	error = extattr_list_vp(nd.ni_vp, uap->attrnamespace, uap->data,
+	    uap->nbytes, td);
+
+	vrele(nd.ni_vp);
+	VFS_UNLOCK_GIANT(vfslocked);
+	return (error);
+}
--- /dev/null
+++ sys/kern/subr_fattime.c
@@ -0,0 +1,307 @@
+/*-
+ * Copyright (c) 2006 Poul-Henning Kamp
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: src/sys/kern/subr_fattime.c,v 1.2 2006/10/24 10:27:23 phk Exp $
+ *
+ * Convert MS-DOS FAT format timestamps to and from unix timespecs
+ *
+ * FAT file timestamps originally consisted of two 16-bit integers, encoded like
+ * this:
+ *
+ *	yyyyyyymmmmddddd (year - 1980, month, day)
+ *
+ *      hhhhhmmmmmmsssss (hour, minutes, seconds divided by two)
+ *
+ * Subsequently even Microsoft realized that files could be accessed in less
+ * than two seconds and a byte was added containing:
+ *
+ *      sfffffff	 (second mod two, 100ths of second)
+ *
+ * FAT timestamps are in the local timezone, with no indication of which
+ * timezone, much less whether daylight saving time applies.
+ *
+ * Later on again, in Windows NT, timestamps were defined relative to GMT.
+ *
+ * Purists will point out that UTC had already replaced GMT for such
+ * uses around a century earlier.  Ironically, "NT" was an abbreviation
+ * of "New Technology".  Anyway...
+ *
+ * The 'utc' argument determines whether the resulting FAT timestamp
+ * should be on the UTC or local timezone calendar.
+ *
+ * The conversion functions below cut time into four-year leap-year
+ * cycles rather than single years and use table lookups inside those
+ * cycles to get the months and years sorted out.
+ *
+ * Obviously we cannot calculate the correct table index going from
+ * a posix seconds count to Y/M/D, but we can get pretty close by
+ * dividing the daycount by 32 (giving a too low index), and then
+ * adjusting upwards a couple of steps if necessary.
+ *
+ * FAT timestamps have 7 bits for the year and start at 1980, so
+ * they can represent years up to 2107, which means that the non-leap-year
+ * 2100 must be handled.
+ *
+ * XXX: As long as time_t is 32 bits this is not relevant or easily
+ * XXX: testable.  Revisit when time_t grows bigger.
+ * XXX: grepfodder: 64 bit time_t, y2100, y2.1k, 2100, leap year
+ *
+ */
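
To make the bit layout above concrete, a decoding sketch (not part of the
file; dd, dt and dh are the two 16-bit words and the extra byte described
above):

	unsigned year  = ((dd >> 9) & 0x7f) + 1980;	/* yyyyyyy           */
	unsigned month =  (dd >> 5) & 0x0f;		/* mmmm              */
	unsigned day   =   dd       & 0x1f;		/* ddddd             */
	unsigned hour  =  (dt >> 11) & 0x1f;		/* hhhhh             */
	unsigned min   =  (dt >> 5)  & 0x3f;		/* mmmmmm            */
	unsigned sec   = ((dt & 0x1f) << 1) + dh / 100;	/* 2*sssss + odd sec */
	unsigned hsec  =   dh % 100;			/* 100ths of second  */
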
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/clock.h>
+
+#define DAY	(24 * 60 * 60)	/* Length of day in seconds */
+#define YEAR	365		/* Length of normal year */
+#define LYC	(4 * YEAR + 1)	/* Length of 4 year leap-year cycle */
+#define T1980	(10 * 365 + 2)	/* Days from 1970 to 1980 */
+
+/* End of month is N days from start of (normal) year */
+#define JAN	31
+#define FEB	(JAN + 28)
+#define MAR	(FEB + 31)
+#define APR	(MAR + 30)
+#define MAY	(APR + 31)
+#define JUN	(MAY + 30)
+#define JUL	(JUN + 31)
+#define AUG	(JUL + 31)
+#define SEP	(AUG + 30)
+#define OCT	(SEP + 31)
+#define NOV	(OCT + 30)
+#define DEC	(NOV + 31)
+
+/* Table of months in a 4 year leap-year cycle */
+
+#define ENC(y,m)	(((y) << 9) | ((m) << 5))
+
+static const struct {
+	uint16_t	days;	/* month start in days relative to cycle */
+	uint16_t	coded;	/* encoded year + month information */
+} mtab[48] = {
+	{   0 + 0 * YEAR,     ENC(0, 1)  },
+
+	{ JAN + 0 * YEAR,     ENC(0, 2)  }, { FEB + 0 * YEAR + 1, ENC(0, 3)  },
+	{ MAR + 0 * YEAR + 1, ENC(0, 4)  }, { APR + 0 * YEAR + 1, ENC(0, 5)  },
+	{ MAY + 0 * YEAR + 1, ENC(0, 6)  }, { JUN + 0 * YEAR + 1, ENC(0, 7)  },
+	{ JUL + 0 * YEAR + 1, ENC(0, 8)  }, { AUG + 0 * YEAR + 1, ENC(0, 9)  },
+	{ SEP + 0 * YEAR + 1, ENC(0, 10) }, { OCT + 0 * YEAR + 1, ENC(0, 11) },
+	{ NOV + 0 * YEAR + 1, ENC(0, 12) }, { DEC + 0 * YEAR + 1, ENC(1, 1)  },
+
+	{ JAN + 1 * YEAR + 1, ENC(1, 2)  }, { FEB + 1 * YEAR + 1, ENC(1, 3)  },
+	{ MAR + 1 * YEAR + 1, ENC(1, 4)  }, { APR + 1 * YEAR + 1, ENC(1, 5)  },
+	{ MAY + 1 * YEAR + 1, ENC(1, 6)  }, { JUN + 1 * YEAR + 1, ENC(1, 7)  },
+	{ JUL + 1 * YEAR + 1, ENC(1, 8)  }, { AUG + 1 * YEAR + 1, ENC(1, 9)  },
+	{ SEP + 1 * YEAR + 1, ENC(1, 10) }, { OCT + 1 * YEAR + 1, ENC(1, 11) },
+	{ NOV + 1 * YEAR + 1, ENC(1, 12) }, { DEC + 1 * YEAR + 1, ENC(2, 1)  },
+
+	{ JAN + 2 * YEAR + 1, ENC(2, 2)  }, { FEB + 2 * YEAR + 1, ENC(2, 3)  },
+	{ MAR + 2 * YEAR + 1, ENC(2, 4)  }, { APR + 2 * YEAR + 1, ENC(2, 5)  },
+	{ MAY + 2 * YEAR + 1, ENC(2, 6)  }, { JUN + 2 * YEAR + 1, ENC(2, 7)  },
+	{ JUL + 2 * YEAR + 1, ENC(2, 8)  }, { AUG + 2 * YEAR + 1, ENC(2, 9)  },
+	{ SEP + 2 * YEAR + 1, ENC(2, 10) }, { OCT + 2 * YEAR + 1, ENC(2, 11) },
+	{ NOV + 2 * YEAR + 1, ENC(2, 12) }, { DEC + 2 * YEAR + 1, ENC(3, 1)  },
+
+	{ JAN + 3 * YEAR + 1, ENC(3, 2)  }, { FEB + 3 * YEAR + 1, ENC(3, 3)  },
+	{ MAR + 3 * YEAR + 1, ENC(3, 4)  }, { APR + 3 * YEAR + 1, ENC(3, 5)  },
+	{ MAY + 3 * YEAR + 1, ENC(3, 6)  }, { JUN + 3 * YEAR + 1, ENC(3, 7)  },
+	{ JUL + 3 * YEAR + 1, ENC(3, 8)  }, { AUG + 3 * YEAR + 1, ENC(3, 9)  },
+	{ SEP + 3 * YEAR + 1, ENC(3, 10) }, { OCT + 3 * YEAR + 1, ENC(3, 11) },
+	{ NOV + 3 * YEAR + 1, ENC(3, 12) }
+};
+
+
+void
+timespec2fattime(struct timespec *tsp, int utc, u_int16_t *ddp, u_int16_t *dtp, u_int8_t *dhp)
+{
+	time_t t1;
+	unsigned t2, l, m;
+
+	t1 = tsp->tv_sec;
+	if (!utc)
+		t1 -= utc_offset();
+
+	if (dhp != NULL)
+		*dhp = (tsp->tv_sec & 1) * 100 + tsp->tv_nsec / 10000000;
+	if (dtp != NULL) {
+		*dtp = (t1 / 2) % 30;
+		*dtp |= ((t1 / 60) % 60) << 5;
+		*dtp |= ((t1 / 3600) % 24) << 11;
+	}
+	if (ddp != NULL) {
+		t2 = t1 / DAY;
+		if (t2 < T1980) {
+			/* Impossible date, truncate to 1980-01-01 */
+			*ddp = 0x0021;
+		} else {
+			t2 -= T1980;
+
+			/*
+			 * 2100 is not a leap year.
+			 * XXX: a 32 bit time_t can not get us here.
+			 */
+			if (t2 >= ((2100 - 1980) / 4 * LYC + FEB))
+				t2++;
+
+			/* Account for full leapyear cycles */
+			l = t2 / LYC;
+			*ddp = (l * 4) << 9;
+			t2 -= l * LYC;
+
+			/* Find approximate table entry */
+			m = t2 / 32;
+
+			/* Find correct table entry */
+			while (m < 47 && mtab[m + 1].days <= t2)
+				m++;
+
+			/* Get year + month from the table */
+			*ddp += mtab[m].coded;
+
+			/* And apply the day in the month */
+			t2 -= mtab[m].days - 1;
+			*ddp |= t2;
+		}
+	}
+}
+
+/*
+ * Table indexed by the bottom two bits of year + four bits of the month
+ * from the FAT timestamp, returning number of days into 4 year long
+ * leap-year cycle
+ */
+
+#define DCOD(m, y, l)	((m) + YEAR * (y) + (l))
+static const uint16_t daytab[64] = {
+	0, 		 DCOD(  0, 0, 0), DCOD(JAN, 0, 0), DCOD(FEB, 0, 1),
+	DCOD(MAR, 0, 1), DCOD(APR, 0, 1), DCOD(MAY, 0, 1), DCOD(JUN, 0, 1),
+	DCOD(JUL, 0, 1), DCOD(AUG, 0, 1), DCOD(SEP, 0, 1), DCOD(OCT, 0, 1),
+	DCOD(NOV, 0, 1), DCOD(DEC, 0, 1), 0,               0,
+	0, 		 DCOD(  0, 1, 1), DCOD(JAN, 1, 1), DCOD(FEB, 1, 1),
+	DCOD(MAR, 1, 1), DCOD(APR, 1, 1), DCOD(MAY, 1, 1), DCOD(JUN, 1, 1),
+	DCOD(JUL, 1, 1), DCOD(AUG, 1, 1), DCOD(SEP, 1, 1), DCOD(OCT, 1, 1),
+	DCOD(NOV, 1, 1), DCOD(DEC, 1, 1), 0,               0,
+	0,		 DCOD(  0, 2, 1), DCOD(JAN, 2, 1), DCOD(FEB, 2, 1),
+	DCOD(MAR, 2, 1), DCOD(APR, 2, 1), DCOD(MAY, 2, 1), DCOD(JUN, 2, 1),
+	DCOD(JUL, 2, 1), DCOD(AUG, 2, 1), DCOD(SEP, 2, 1), DCOD(OCT, 2, 1),
+	DCOD(NOV, 2, 1), DCOD(DEC, 2, 1), 0,               0,
+	0,		 DCOD(  0, 3, 1), DCOD(JAN, 3, 1), DCOD(FEB, 3, 1),
+	DCOD(MAR, 3, 1), DCOD(APR, 3, 1), DCOD(MAY, 3, 1), DCOD(JUN, 3, 1),
+	DCOD(JUL, 3, 1), DCOD(AUG, 3, 1), DCOD(SEP, 3, 1), DCOD(OCT, 3, 1),
+	DCOD(NOV, 3, 1), DCOD(DEC, 3, 1), 0,               0
+};
+
+void
+fattime2timespec(unsigned dd, unsigned dt, unsigned dh, int utc, struct timespec *tsp)
+{
+	unsigned day;
+
+	/* Unpack time fields */
+	tsp->tv_sec = (dt & 0x1f) << 1;
+	tsp->tv_sec += ((dt & 0x7e0) >> 5) * 60;
+	tsp->tv_sec += ((dt & 0xf800) >> 11) * 3600;
+	tsp->tv_sec += dh / 100;
+	tsp->tv_nsec = (dh % 100) * 10000000;
+
+	/* Day of month */
+	day = (dd & 0x1f) - 1;
+
+	/* Full leap-year cycles */
+	day += LYC * ((dd >> 11) & 0x1f);
+
+	/* Month offset from leap-year cycle */
+	day += daytab[(dd >> 5) & 0x3f];
+
+	/*
+	 * 2100 is not a leap year.
+	 * XXX: a 32 bit time_t can not get us here.
+	 */
+	if (day >= ((2100 - 1980) / 4 * LYC + FEB))
+		day--;
+
+	/* Align with time_t epoch */
+	day += T1980;
+
+	tsp->tv_sec += DAY * day;
+	if (!utc)
+		tsp->tv_sec += utc_offset();
+}
+
+#ifdef TEST_DRIVER
+
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+
+int
+main(int argc __unused, char **argv __unused)
+{
+	int i;
+	struct timespec ts;
+	struct tm tm;
+	double a;
+	u_int16_t d, t;
+	u_int8_t p;
+	char buf[100];
+
+	for (i = 0; i < 10000; i++) {
+		do {
+			ts.tv_sec = random();
+		} while (ts.tv_sec < T1980 * 86400);
+		ts.tv_nsec = random() % 1000000000;
+
+		printf("%10d.%03ld -- ", ts.tv_sec, ts.tv_nsec / 1000000);
+
+		gmtime_r(&ts.tv_sec, &tm);
+		strftime(buf, sizeof buf, "%Y %m %d %H %M %S", &tm);
+		printf("%s -- ", buf);
+
+		a = ts.tv_sec + ts.tv_nsec * 1e-9;
+		d = t = p = 0;
+		timespec2fattime(&ts, 1, &d, &t, &p);
+		printf("%04x %04x %02x -- ", d, t, p);
+		printf("%3d %02d %02d %02d %02d %02d -- ",
+		    ((d >> 9)  & 0x7f) + 1980,
+		    (d >> 5)  & 0x0f,
+		    (d >> 0)  & 0x1f,
+		    (t >> 11) & 0x1f,
+		    (t >> 5)  & 0x3f,
+		    ((t >> 0)  & 0x1f) * 2);
+
+		ts.tv_sec = ts.tv_nsec = 0;
+		fattime2timespec(d, t, p, 1, &ts);
+		printf("%10d.%03ld == ", ts.tv_sec, ts.tv_nsec / 1000000);
+		gmtime_r(&ts.tv_sec, &tm);
+		strftime(buf, sizeof buf, "%Y %m %d %H %M %S", &tm);
+		printf("%s -- ", buf);
+		a -= ts.tv_sec + ts.tv_nsec * 1e-9;
+		printf("%.3f", a);
+		printf("\n");
+	}
+	return (0);
+}
+
+#endif /* TEST_DRIVER */
Index: subr_stack.c
===================================================================
RCS file: /home/cvs/src/sys/kern/subr_stack.c,v
retrieving revision 1.1
retrieving revision 1.2
diff -L sys/kern/subr_stack.c -L sys/kern/subr_stack.c -u -r1.1 -r1.2
--- sys/kern/subr_stack.c
+++ sys/kern/subr_stack.c
@@ -25,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/subr_stack.c,v 1.2.2.1 2006/03/13 03:05:58 jeff Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/subr_stack.c,v 1.3 2006/05/28 22:15:28 kris Exp $");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
@@ -90,7 +90,7 @@
 	long offset;
 	int i;
 
-	KASSERT(st->depth <= STACK_MAX, ("bogous stack"));
+	KASSERT(st->depth <= STACK_MAX, ("bogus stack"));
 	for (i = 0; i < st->depth; i++) {
 		stack_symbol(st->pcs[i], &name, &offset);
 		printf("#%d %p at %s+%#lx\n", i, (void *)st->pcs[i],
@@ -105,7 +105,7 @@
 	long offset;
 	int i;
 
-	KASSERT(st->depth <= STACK_MAX, ("bogous stack"));
+	KASSERT(st->depth <= STACK_MAX, ("bogus stack"));
 	for (i = 0; i < st->depth; i++) {
 		stack_symbol(st->pcs[i], &name, &offset);
 		sbuf_printf(sb, "#%d %p at %s+%#lx\n", i, (void *)st->pcs[i],
@@ -122,7 +122,7 @@
 	long offset;
 	int i;
 
-	KASSERT(st->depth <= STACK_MAX, ("bogous stack"));
+	KASSERT(st->depth <= STACK_MAX, ("bogus stack"));
 	if (cheap) {
 		ktr_tracepoint(mask, file, line, "#0 %p %p %p %p %p %p",
 		    st->pcs[0], st->pcs[1], st->pcs[2], st->pcs[3],
Index: kern_lockf.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_lockf.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/kern_lockf.c -L sys/kern/kern_lockf.c -u -r1.1.1.1 -r1.2
--- sys/kern/kern_lockf.c
+++ sys/kern/kern_lockf.c
@@ -33,7 +33,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_lockf.c,v 1.54 2005/03/29 08:13:01 phk Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_lockf.c,v 1.57 2007/08/07 09:04:50 kib Exp $");
 
 #include "opt_debug_lockf.h"
 
@@ -73,14 +73,14 @@
 #define NOLOCKF (struct lockf *)0
 #define SELF	0x1
 #define OTHERS	0x2
-static int	 lf_clearlock(struct lockf *);
+static int	 lf_clearlock(struct lockf *, struct lockf **);
 static int	 lf_findoverlap(struct lockf *,
 	    struct lockf *, int, struct lockf ***, struct lockf **);
 static struct lockf *
 	 lf_getblock(struct lockf *);
 static int	 lf_getlock(struct lockf *, struct flock *);
-static int	 lf_setlock(struct lockf *);
-static void	 lf_split(struct lockf *, struct lockf *);
+static int	 lf_setlock(struct lockf *, struct vnode *, struct lockf **);
+static void	 lf_split(struct lockf *, struct lockf *, struct lockf **);
 static void	 lf_wakelock(struct lockf *);
 #ifdef LOCKF_DEBUG
 static void	 lf_print(char *, struct lockf *);
@@ -102,12 +102,13 @@
 	struct lockf **head;
 	u_quad_t size;
 {
-	register struct flock *fl = ap->a_fl;
-	register struct lockf *lock;
+	struct flock *fl = ap->a_fl;
+	struct lockf *lock;
+	struct vnode *vp = ap->a_vp;
 	off_t start, end, oadd;
+	struct lockf *clean, *n;
 	int error;
 
-	mtx_lock(&Giant);
 	/*
 	 * Convert the flock structure into a start and end.
 	 */
@@ -124,40 +125,29 @@
 
 	case SEEK_END:
 		if (size > OFF_MAX ||
-		    (fl->l_start > 0 && size > OFF_MAX - fl->l_start)) {
-			error = EOVERFLOW;
-			goto out;
-		}
+		    (fl->l_start > 0 && size > OFF_MAX - fl->l_start))
+			return (EOVERFLOW);
 		start = size + fl->l_start;
 		break;
 
 	default:
-		error = EINVAL;
-		goto out;
-	}
-	if (start < 0) {
-		error = EINVAL;
-		goto out;
+		return (EINVAL);
 	}
+	if (start < 0)
+		return (EINVAL);
 	if (fl->l_len < 0) {
-		if (start == 0) {
-			error = EINVAL;
-			goto out;
-		}
+		if (start == 0)
+			return (EINVAL);
 		end = start - 1;
 		start += fl->l_len;
-		if (start < 0) {
-			error = EINVAL;
-			goto out;
-		}
+		if (start < 0)
+			return (EINVAL);
 	} else if (fl->l_len == 0)
 		end = -1;
 	else {
 		oadd = fl->l_len - 1;
-		if (oadd > OFF_MAX - start) {
-			error = EOVERFLOW;
-			goto out;
-		}
+		if (oadd > OFF_MAX - start)
+			return (EOVERFLOW);
 		end = start + oadd;
 	}
 	/*
@@ -166,11 +156,18 @@
 	if (*head == (struct lockf *)0) {
 		if (ap->a_op != F_SETLK) {
 			fl->l_type = F_UNLCK;
-			error = 0;
-			goto out;
+			return (0);
 		}
 	}
 	/*
+	 * Allocate a spare structure in case we have to split.
+	 */
+	clean = NULL;
+	if (ap->a_op == F_SETLK || ap->a_op == F_UNLCK) {
+		MALLOC(clean, struct lockf *, sizeof *lock, M_LOCKF, M_WAITOK);
+		clean->lf_next = NULL;
+	}
+	/*
 	 * Create the lockf structure
 	 */
 	MALLOC(lock, struct lockf *, sizeof *lock, M_LOCKF, M_WAITOK);
@@ -192,29 +189,36 @@
 	/*
 	 * Do the requested operation.
 	 */
+	VI_LOCK(vp);
 	switch(ap->a_op) {
 	case F_SETLK:
-		error = lf_setlock(lock);
-		goto out;
+		error = lf_setlock(lock, vp, &clean);
+		break;
 
 	case F_UNLCK:
-		error = lf_clearlock(lock);
-		FREE(lock, M_LOCKF);
-		goto out;
+		error = lf_clearlock(lock, &clean);
+		lock->lf_next = clean;
+		clean = lock;
+		break;
 
 	case F_GETLK:
 		error = lf_getlock(lock, fl);
-		FREE(lock, M_LOCKF);
-		goto out;
+		lock->lf_next = clean;
+		clean = lock;
+		break;
 
 	default:
-		free(lock, M_LOCKF);
+		lock->lf_next = clean;
+		clean = lock;
 		error = EINVAL;
-		goto out;
+		break;
+	}
+	VI_UNLOCK(vp);
+	for (lock = clean; lock != NULL; ) {
+		n = lock->lf_next;
+		free(lock, M_LOCKF);
+		lock = n;
 	}
-	/* NOTREACHED */
-out:
-	mtx_unlock(&Giant);
 	return (error);
 }
 
@@ -222,10 +226,12 @@
  * Set a byte-range lock.
  */
 static int
-lf_setlock(lock)
-	register struct lockf *lock;
+lf_setlock(lock, vp, clean)
+	struct lockf *lock;
+	struct vnode *vp;
+	struct lockf **clean;
 {
-	register struct lockf *block;
+	struct lockf *block;
 	struct lockf **head = lock->lf_head;
 	struct lockf **prev, *overlap, *ltmp;
 	static char lockstr[] = "lockf";
@@ -251,7 +257,8 @@
 		 * Free the structure and return if nonblocking.
 		 */
 		if ((lock->lf_flags & F_WAIT) == 0) {
-			FREE(lock, M_LOCKF);
+			lock->lf_next = *clean;
+			*clean = lock;
 			return (EAGAIN);
 		}
 		/*
@@ -266,16 +273,19 @@
 		 */
 		if ((lock->lf_flags & F_POSIX) &&
 		    (block->lf_flags & F_POSIX)) {
-			register struct proc *wproc;
+			struct proc *wproc;
+			struct proc *nproc;
 			struct thread *td;
-			register struct lockf *waitblock;
+			struct lockf *waitblock;
 			int i = 0;
 
 			/* The block is waiting on something */
-			/* XXXKSE this is not complete under threads */
 			wproc = (struct proc *)block->lf_id;
-			mtx_lock_spin(&sched_lock);
+restart:
+			nproc = NULL;
+			PROC_SLOCK(wproc);
 			FOREACH_THREAD_IN_PROC(wproc, td) {
+				thread_lock(td);
 				while (td->td_wchan &&
 				    (td->td_wmesg == lockstr) &&
 				    (i++ < maxlockdepth)) {
@@ -284,15 +294,21 @@
 					waitblock = waitblock->lf_next;
 					if ((waitblock->lf_flags & F_POSIX) == 0)
 						break;
-					wproc = (struct proc *)waitblock->lf_id;
-					if (wproc == (struct proc *)lock->lf_id) {
-						mtx_unlock_spin(&sched_lock);
-						free(lock, M_LOCKF);
+					nproc = (struct proc *)waitblock->lf_id;
+					if (nproc == (struct proc *)lock->lf_id) {
+						PROC_SUNLOCK(wproc);
+						thread_unlock(td);
+						lock->lf_next = *clean;
+						*clean = lock;
 						return (EDEADLK);
 					}
 				}
+				thread_unlock(td);
 			}
-			mtx_unlock_spin(&sched_lock);
+			PROC_SUNLOCK(wproc);
+			wproc = nproc;
+			if (wproc)
+				goto restart;
 		}
 		/*
 		 * For flock type locks, we must first remove
@@ -302,7 +318,7 @@
 		if ((lock->lf_flags & F_FLOCK) &&
 		    lock->lf_type == F_WRLCK) {
 			lock->lf_type = F_UNLCK;
-			(void) lf_clearlock(lock);
+			(void) lf_clearlock(lock, clean);
 			lock->lf_type = F_WRLCK;
 		}
 		/*
@@ -317,7 +333,7 @@
 			lf_printlist("lf_setlock", block);
 		}
 #endif /* LOCKF_DEBUG */
-		error = tsleep(lock, priority, lockstr, 0);
+		error = msleep(lock, VI_MTX(vp), priority, lockstr, 0);
 		/*
 		 * We may have been awakened by a signal and/or by a
 		 * debugger continuing us (in which cases we must remove
@@ -331,7 +347,8 @@
 			lock->lf_next = NOLOCKF;
 		}
 		if (error) {
-			free(lock, M_LOCKF);
+			lock->lf_next = *clean;
+			*clean = lock;
 			return (error);
 		}
 	}
@@ -376,7 +393,8 @@
 			    overlap->lf_type == F_WRLCK)
 				lf_wakelock(overlap);
 			overlap->lf_type = lock->lf_type;
-			FREE(lock, M_LOCKF);
+			lock->lf_next = *clean;
+			*clean = lock;
 			lock = overlap; /* for debug output below */
 			break;
 
@@ -385,7 +403,8 @@
 			 * Check for common starting point and different types.
 			 */
 			if (overlap->lf_type == lock->lf_type) {
-				free(lock, M_LOCKF);
+				lock->lf_next = *clean;
+				*clean = lock;
 				lock = overlap; /* for debug output below */
 				break;
 			}
@@ -394,7 +413,7 @@
 				lock->lf_next = overlap;
 				overlap->lf_start = lock->lf_end + 1;
 			} else
-				lf_split(overlap, lock);
+				lf_split(overlap, lock, clean);
 			lf_wakelock(overlap);
 			break;
 
@@ -426,7 +445,8 @@
 				needtolink = 0;
 			} else
 				*prev = overlap->lf_next;
-			free(overlap, M_LOCKF);
+			overlap->lf_next = *clean;
+			*clean = overlap;
 			continue;
 
 		case 4: /* overlap starts before lock */
@@ -471,8 +491,9 @@
  * and remove it (or shrink it), then wakeup anyone we can.
  */
 static int
-lf_clearlock(unlock)
-	register struct lockf *unlock;
+lf_clearlock(unlock, clean)
+	struct lockf *unlock;
+	struct lockf **clean;
 {
 	struct lockf **head = unlock->lf_head;
 	register struct lockf *lf = *head;
@@ -498,7 +519,8 @@
 
 		case 1: /* overlap == lock */
 			*prev = overlap->lf_next;
-			FREE(overlap, M_LOCKF);
+			overlap->lf_next = *clean;
+			*clean = overlap;
 			break;
 
 		case 2: /* overlap contains lock: split it */
@@ -506,14 +528,15 @@
 				overlap->lf_start = unlock->lf_end + 1;
 				break;
 			}
-			lf_split(overlap, unlock);
+			lf_split(overlap, unlock, clean);
 			overlap->lf_next = unlock->lf_next;
 			break;
 
 		case 3: /* lock contains overlap */
 			*prev = overlap->lf_next;
 			lf = overlap->lf_next;
-			free(overlap, M_LOCKF);
+			overlap->lf_next = *clean;
+			*clean = overlap;
 			continue;
 
 		case 4: /* overlap starts before lock */
@@ -714,11 +737,12 @@
  * two or three locks as necessary.
  */
 static void
-lf_split(lock1, lock2)
-	register struct lockf *lock1;
-	register struct lockf *lock2;
+lf_split(lock1, lock2, split)
+	struct lockf *lock1;
+	struct lockf *lock2;
+	struct lockf **split;
 {
-	register struct lockf *splitlock;
+	struct lockf *splitlock;
 
 #ifdef LOCKF_DEBUG
 	if (lockf_debug & 2) {
@@ -742,9 +766,12 @@
 	}
 	/*
 	 * Make a new lock consisting of the last part of
-	 * the encompassing lock
+	 * the encompassing lock.  We use the preallocated
+	 * splitlock so we don't have to block.
 	 */
-	MALLOC(splitlock, struct lockf *, sizeof *splitlock, M_LOCKF, M_WAITOK);
+	splitlock = *split;
+	KASSERT(splitlock != NULL, ("no split"));
+	*split = splitlock->lf_next;
 	bcopy(lock1, splitlock, sizeof *splitlock);
 	splitlock->lf_start = lock2->lf_end + 1;
 	TAILQ_INIT(&splitlock->lf_blkhd);
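
The pattern introduced here, preallocating a spare lockf ("clean") before
taking the vnode interlock and chaining discarded structures onto it for a
single free pass afterwards, generalizes to any case where a sleeping
allocator must not be called under a lock; a schematic sketch with
hypothetical names (foo, must_split, insert_half, sc):

	struct foo *spare;

	/* Allocate while sleeping is still allowed ... */
	spare = malloc(sizeof(*spare), M_TEMP, M_WAITOK);

	mtx_lock(&sc->sc_lock);
	/* ... so the locked section only ever consumes the spare. */
	if (must_split(sc)) {
		insert_half(sc, spare);
		spare = NULL;
	}
	mtx_unlock(&sc->sc_lock);

	/* Release the spare if it turned out to be unneeded. */
	if (spare != NULL)
		free(spare, M_TEMP);
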
--- /dev/null
+++ sys/kern/vfs_acl.c
@@ -0,0 +1,431 @@
+/*-
+ * Copyright (c) 1999-2006 Robert N. M. Watson
+ * All rights reserved.
+ *
+ * This software was developed by Robert Watson for the TrustedBSD Project.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ * Developed by the TrustedBSD Project.
+ *
+ * ACL system calls and other functions common across different ACL types.
+ * Type-specific routines go into subr_acl_<type>.c.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: src/sys/kern/vfs_acl.c,v 1.53 2007/03/05 13:26:07 rwatson Exp $");
+
+#include "opt_mac.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/vnode.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/proc.h>
+#include <sys/sysent.h>
+#include <sys/acl.h>
+
+#include <security/mac/mac_framework.h>
+
+#include <vm/uma.h>
+
+uma_zone_t	acl_zone;
+static int	vacl_set_acl(struct thread *td, struct vnode *vp,
+		    acl_type_t type, struct acl *aclp);
+static int	vacl_get_acl(struct thread *td, struct vnode *vp,
+		    acl_type_t type, struct acl *aclp);
+static int	vacl_aclcheck(struct thread *td, struct vnode *vp,
+		    acl_type_t type, struct acl *aclp);
+
+/*
+ * These calls wrap the real vnode operations, and are called by the syscall
+ * code once the syscall has converted the path or file descriptor to a vnode
+ * (unlocked).  The aclp pointer is assumed still to point to userland, so
+ * this should not be consumed within the kernel except by syscall code.
+ * Other code should directly invoke VOP_{SET,GET}ACL.
+ */
+
+/*
+ * Given a vnode, set its ACL.
+ */
+static int
+vacl_set_acl(struct thread *td, struct vnode *vp, acl_type_t type,
+    struct acl *aclp)
+{
+	struct acl inkernacl;
+	struct mount *mp;
+	int error;
+
+	error = copyin(aclp, &inkernacl, sizeof(struct acl));
+	if (error)
+		return(error);
+	error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
+	if (error != 0)
+		return (error);
+	VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+#ifdef MAC
+	error = mac_check_vnode_setacl(td->td_ucred, vp, type, &inkernacl);
+	if (error != 0)
+		goto out;
+#endif
+	error = VOP_SETACL(vp, type, &inkernacl, td->td_ucred, td);
+#ifdef MAC
+out:
+#endif
+	VOP_UNLOCK(vp, 0, td);
+	vn_finished_write(mp);
+	return(error);
+}
+
+/*
+ * Given a vnode, get its ACL.
+ */
+static int
+vacl_get_acl(struct thread *td, struct vnode *vp, acl_type_t type,
+    struct acl *aclp)
+{
+	struct acl inkernelacl;
+	int error;
+
+	VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+#ifdef MAC
+	error = mac_check_vnode_getacl(td->td_ucred, vp, type);
+	if (error != 0)
+		goto out;
+#endif
+	error = VOP_GETACL(vp, type, &inkernelacl, td->td_ucred, td);
+#ifdef MAC
+out:
+#endif
+	VOP_UNLOCK(vp, 0, td);
+	if (error == 0)
+		error = copyout(&inkernelacl, aclp, sizeof(struct acl));
+	return (error);
+}
+
+/*
+ * Given a vnode, delete its ACL.
+ */
+static int
+vacl_delete(struct thread *td, struct vnode *vp, acl_type_t type)
+{
+	struct mount *mp;
+	int error;
+
+	error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
+	if (error)
+		return (error);
+	VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+#ifdef MAC
+	error = mac_check_vnode_deleteacl(td->td_ucred, vp, type);
+	if (error)
+		goto out;
+#endif
+	error = VOP_SETACL(vp, type, 0, td->td_ucred, td);
+#ifdef MAC
+out:
+#endif
+	VOP_UNLOCK(vp, 0, td);
+	vn_finished_write(mp);
+	return (error);
+}
+
+/*
+ * Given a vnode, check whether an ACL is appropriate for it
+ */
+static int
+vacl_aclcheck(struct thread *td, struct vnode *vp, acl_type_t type,
+    struct acl *aclp)
+{
+	struct acl inkernelacl;
+	int error;
+
+	error = copyin(aclp, &inkernelacl, sizeof(struct acl));
+	if (error)
+		return(error);
+	error = VOP_ACLCHECK(vp, type, &inkernelacl, td->td_ucred, td);
+	return (error);
+}
+
+/*
+ * syscalls -- convert the path/fd to a vnode, and call vacl_whatever.  Don't
+ * need to lock, as the vacl_ code will get/release any locks required.
+ */
+
+/*
+ * Given a file path, get an ACL for it
+ */
+int
+__acl_get_file(struct thread *td, struct __acl_get_file_args *uap)
+{
+	struct nameidata nd;
+	int vfslocked, error;
+
+	NDINIT(&nd, LOOKUP, MPSAFE|FOLLOW, UIO_USERSPACE, uap->path, td);
+	error = namei(&nd);
+	vfslocked = NDHASGIANT(&nd);
+	if (error == 0) {
+		error = vacl_get_acl(td, nd.ni_vp, uap->type, uap->aclp);
+		NDFREE(&nd, 0);
+	}
+	VFS_UNLOCK_GIANT(vfslocked);
+	return (error);
+}
+
+/*
+ * Given a file path, get an ACL for it; don't follow links.
+ */
+int
+__acl_get_link(struct thread *td, struct __acl_get_link_args *uap)
+{
+	struct nameidata nd;
+	int vfslocked, error;
+
+	NDINIT(&nd, LOOKUP, MPSAFE|NOFOLLOW, UIO_USERSPACE, uap->path, td);
+	error = namei(&nd);
+	vfslocked = NDHASGIANT(&nd);
+	if (error == 0) {
+		error = vacl_get_acl(td, nd.ni_vp, uap->type, uap->aclp);
+		NDFREE(&nd, 0);
+	}
+	VFS_UNLOCK_GIANT(vfslocked);
+	return (error);
+}
+
+/*
+ * Given a file path, set an ACL for it.
+ */
+int
+__acl_set_file(struct thread *td, struct __acl_set_file_args *uap)
+{
+	struct nameidata nd;
+	int vfslocked, error;
+
+	NDINIT(&nd, LOOKUP, MPSAFE|FOLLOW, UIO_USERSPACE, uap->path, td);
+	error = namei(&nd);
+	vfslocked = NDHASGIANT(&nd);
+	if (error == 0) {
+		error = vacl_set_acl(td, nd.ni_vp, uap->type, uap->aclp);
+		NDFREE(&nd, 0);
+	}
+	VFS_UNLOCK_GIANT(vfslocked);
+	return (error);
+}
+
+/*
+ * Given a file path, set an ACL for it; don't follow links.
+ */
+int
+__acl_set_link(struct thread *td, struct __acl_set_link_args *uap)
+{
+	struct nameidata nd;
+	int vfslocked, error;
+
+	NDINIT(&nd, LOOKUP, MPSAFE|NOFOLLOW, UIO_USERSPACE, uap->path, td);
+	error = namei(&nd);
+	vfslocked = NDHASGIANT(&nd);
+	if (error == 0) {
+		error = vacl_set_acl(td, nd.ni_vp, uap->type, uap->aclp);
+		NDFREE(&nd, 0);
+	}
+	VFS_UNLOCK_GIANT(vfslocked);
+	return (error);
+}
+
+/*
+ * Given a file descriptor, get an ACL for it.
+ */
+int
+__acl_get_fd(struct thread *td, struct __acl_get_fd_args *uap)
+{
+	struct file *fp;
+	int vfslocked, error;
+
+	error = getvnode(td->td_proc->p_fd, uap->filedes, &fp);
+	if (error == 0) {
+		vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
+		error = vacl_get_acl(td, fp->f_vnode, uap->type, uap->aclp);
+		fdrop(fp, td);
+		VFS_UNLOCK_GIANT(vfslocked);
+	}
+	return (error);
+}
+
+/*
+ * Given a file descriptor, set an ACL for it.
+ */
+int
+__acl_set_fd(struct thread *td, struct __acl_set_fd_args *uap)
+{
+	struct file *fp;
+	int vfslocked, error;
+
+	error = getvnode(td->td_proc->p_fd, uap->filedes, &fp);
+	if (error == 0) {
+		vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
+		error = vacl_set_acl(td, fp->f_vnode, uap->type, uap->aclp);
+		fdrop(fp, td);
+		VFS_UNLOCK_GIANT(vfslocked);
+	}
+	return (error);
+}
+
+/*
+ * Given a file path, delete an ACL from it.
+ */
+int
+__acl_delete_file(struct thread *td, struct __acl_delete_file_args *uap)
+{
+	struct nameidata nd;
+	int vfslocked, error;
+
+	NDINIT(&nd, LOOKUP, MPSAFE|FOLLOW, UIO_USERSPACE, uap->path, td);
+	error = namei(&nd);
+	vfslocked = NDHASGIANT(&nd);
+	if (error == 0) {
+		error = vacl_delete(td, nd.ni_vp, uap->type);
+		NDFREE(&nd, 0);
+	}
+	VFS_UNLOCK_GIANT(vfslocked);
+	return (error);
+}
+
+/*
+ * Given a file path, delete an ACL from it; don't follow links.
+ */
+int
+__acl_delete_link(struct thread *td, struct __acl_delete_link_args *uap)
+{
+	struct nameidata nd;
+	int vfslocked, error;
+
+	NDINIT(&nd, LOOKUP, MPSAFE|NOFOLLOW, UIO_USERSPACE, uap->path, td);
+	error = namei(&nd);
+	vfslocked = NDHASGIANT(&nd);
+	if (error == 0) {
+		error = vacl_delete(td, nd.ni_vp, uap->type);
+		NDFREE(&nd, 0);
+	}
+	VFS_UNLOCK_GIANT(vfslocked);
+	return (error);
+}
+
+/*
+ * Given a file path, delete an ACL from it.
+ */
+int
+__acl_delete_fd(struct thread *td, struct __acl_delete_fd_args *uap)
+{
+	struct file *fp;
+	int vfslocked, error;
+
+	error = getvnode(td->td_proc->p_fd, uap->filedes, &fp);
+	if (error == 0) {
+		vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
+		error = vacl_delete(td, fp->f_vnode, uap->type);
+		fdrop(fp, td);
+		VFS_UNLOCK_GIANT(vfslocked);
+	}
+	return (error);
+}
+
+/*
+ * Given a file path, check an ACL for it.
+ */
+int
+__acl_aclcheck_file(struct thread *td, struct __acl_aclcheck_file_args *uap)
+{
+	struct nameidata	nd;
+	int vfslocked, error;
+
+	NDINIT(&nd, LOOKUP, MPSAFE|FOLLOW, UIO_USERSPACE, uap->path, td);
+	error = namei(&nd);
+	vfslocked = NDHASGIANT(&nd);
+	if (error == 0) {
+		error = vacl_aclcheck(td, nd.ni_vp, uap->type, uap->aclp);
+		NDFREE(&nd, 0);
+	}
+	VFS_UNLOCK_GIANT(vfslocked);
+	return (error);
+}
+
+/*
+ * Given a file path, check an ACL for it; don't follow links.
+ */
+int
+__acl_aclcheck_link(struct thread *td, struct __acl_aclcheck_link_args *uap)
+{
+	struct nameidata	nd;
+	int vfslocked, error;
+
+	NDINIT(&nd, LOOKUP, MPSAFE|NOFOLLOW, UIO_USERSPACE, uap->path, td);
+	error = namei(&nd);
+	vfslocked = NDHASGIANT(&nd);
+	if (error == 0) {
+		error = vacl_aclcheck(td, nd.ni_vp, uap->type, uap->aclp);
+		NDFREE(&nd, 0);
+	}
+	VFS_UNLOCK_GIANT(vfslocked);
+	return (error);
+}
+
+/*
+ * Given a file descriptor, check an ACL for it.
+ */
+int
+__acl_aclcheck_fd(struct thread *td, struct __acl_aclcheck_fd_args *uap)
+{
+	struct file *fp;
+	int vfslocked, error;
+
+	error = getvnode(td->td_proc->p_fd, uap->filedes, &fp);
+	if (error == 0) {
+		vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
+		error = vacl_aclcheck(td, fp->f_vnode, uap->type, uap->aclp);
+		fdrop(fp, td);
+		VFS_UNLOCK_GIANT(vfslocked);
+	}
+	return (error);
+}
+
+/* ARGSUSED */
+
+static void
+aclinit(void *dummy __unused)
+{
+
+	acl_zone = uma_zcreate("ACL UMA zone", sizeof(struct acl),
+	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+}
+SYSINIT(acls, SI_SUB_ACL, SI_ORDER_FIRST, aclinit, NULL)
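
From userspace these syscalls are normally reached through the POSIX.1e
wrappers in libc; a minimal sketch that prints a file's access ACL (error
handling trimmed):

#include <sys/types.h>
#include <sys/acl.h>
#include <err.h>
#include <stdio.h>

static void
show_acl(const char *path)
{
	acl_t acl;
	char *text;

	/* acl_get_file() ends up in the __acl_get_file() syscall above. */
	acl = acl_get_file(path, ACL_TYPE_ACCESS);
	if (acl == NULL)
		err(1, "acl_get_file");
	text = acl_to_text(acl, NULL);
	printf("%s:\n%s", path, text);
	acl_free(text);
	acl_free(acl);
}
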
Index: kern_intr.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_intr.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/kern_intr.c -L sys/kern/kern_intr.c -u -r1.2 -r1.3
--- sys/kern/kern_intr.c
+++ sys/kern/kern_intr.c
@@ -25,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_intr.c,v 1.124.2.3.2.2 2006/04/15 20:08:33 scottl Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_intr.c,v 1.147 2007/06/05 00:00:54 jeff Exp $");
 
 #include "opt_ddb.h"
 
@@ -83,7 +83,7 @@
 
 static MALLOC_DEFINE(M_ITHREAD, "ithread", "Interrupt Threads");
 
-static int intr_storm_threshold = 500;
+static int intr_storm_threshold = 1000;
 TUNABLE_INT("hw.intr_storm_threshold", &intr_storm_threshold);
 SYSCTL_INT(_hw, OID_AUTO, intr_storm_threshold, CTLFLAG_RW,
     &intr_storm_threshold, 0,
@@ -92,13 +92,24 @@
     TAILQ_HEAD_INITIALIZER(event_list);
 
 static void	intr_event_update(struct intr_event *ie);
+#ifdef INTR_FILTER
+static struct intr_thread *ithread_create(const char *name,
+			      struct intr_handler *ih);
+#else
 static struct intr_thread *ithread_create(const char *name);
-static void	ithread_destroy2(struct intr_thread *ithread);
-static void	ithread_execute_handlers(struct proc *p, struct intr_event *ie);
+#endif
+static void	ithread_destroy(struct intr_thread *ithread);
+static void	ithread_execute_handlers(struct proc *p, 
+		    struct intr_event *ie);
+#ifdef INTR_FILTER
+static void	priv_ithread_execute_handler(struct proc *p, 
+		    struct intr_handler *ih);
+#endif
 static void	ithread_loop(void *);
 static void	ithread_update(struct intr_thread *ithd);
 static void	start_softintr(void *);
 
+/* Map an interrupt type to an ithread priority. */
 u_char
 intr_priority(enum intr_type flags)
 {
@@ -162,9 +173,9 @@
 	/* Update name and priority. */
 	strlcpy(td->td_proc->p_comm, ie->ie_fullname,
 	    sizeof(td->td_proc->p_comm));
-	mtx_lock_spin(&sched_lock);
+	thread_lock(td);
 	sched_prio(td, pri);
-	mtx_unlock_spin(&sched_lock);
+	thread_unlock(td);
 }
 
 /*
@@ -226,6 +237,7 @@
 	CTR2(KTR_INTR, "%s: updated %s", __func__, ie->ie_fullname);
 }
 
+#ifndef INTR_FILTER
 int
 intr_event_create(struct intr_event **event, void *source, int flags,
     void (*enable)(void *), const char *fmt, ...)
@@ -255,6 +267,40 @@
 	CTR2(KTR_INTR, "%s: created %s", __func__, ie->ie_name);
 	return (0);
 }
+#else
+int
+intr_event_create(struct intr_event **event, void *source, int flags,
+    void (*enable)(void *), void (*eoi)(void *), void (*disab)(void *), 
+    const char *fmt, ...)
+{
+	struct intr_event *ie;
+	va_list ap;
+
+	/* The only valid flag during creation is IE_SOFT. */
+	if ((flags & ~IE_SOFT) != 0)
+		return (EINVAL);
+	ie = malloc(sizeof(struct intr_event), M_ITHREAD, M_WAITOK | M_ZERO);
+	ie->ie_source = source;
+	ie->ie_enable = enable;
+	ie->ie_eoi = eoi;
+	ie->ie_disab = disab;
+	ie->ie_flags = flags;
+	TAILQ_INIT(&ie->ie_handlers);
+	mtx_init(&ie->ie_lock, "intr event", NULL, MTX_DEF);
+
+	va_start(ap, fmt);
+	vsnprintf(ie->ie_name, sizeof(ie->ie_name), fmt, ap);
+	va_end(ap);
+	strlcpy(ie->ie_fullname, ie->ie_name, sizeof(ie->ie_fullname));
+	mtx_pool_lock(mtxpool_sleep, &event_list);
+	TAILQ_INSERT_TAIL(&event_list, ie, ie_list);
+	mtx_pool_unlock(mtxpool_sleep, &event_list);
+	if (event != NULL)
+		*event = ie;
+	CTR2(KTR_INTR, "%s: created %s", __func__, ie->ie_name);
+	return (0);
+}
+#endif
 
 int
 intr_event_destroy(struct intr_event *ie)
@@ -270,7 +316,7 @@
 	mtx_pool_unlock(mtxpool_sleep, &event_list);
 #ifndef notyet
 	if (ie->ie_thread != NULL) {
-		ithread_destroy2(ie->ie_thread);
+		ithread_destroy(ie->ie_thread);
 		ie->ie_thread = NULL;
 	}
 #endif
@@ -280,6 +326,7 @@
 	return (0);
 }
 
+#ifndef INTR_FILTER
 static struct intr_thread *
 ithread_create(const char *name)
 {
@@ -295,53 +342,79 @@
 	if (error)
 		panic("kthread_create() failed with %d", error);
 	td = FIRST_THREAD_IN_PROC(p);	/* XXXKSE */
-	mtx_lock_spin(&sched_lock);
-	td->td_ksegrp->kg_pri_class = PRI_ITHD;
+	thread_lock(td);
+	sched_class(td, PRI_ITHD);
+	TD_SET_IWAIT(td);
+	thread_unlock(td);
+	td->td_pflags |= TDP_ITHREAD;
+	ithd->it_thread = td;
+	CTR2(KTR_INTR, "%s: created %s", __func__, name);
+	return (ithd);
+}
+#else
+static struct intr_thread *
+ithread_create(const char *name, struct intr_handler *ih)
+{
+	struct intr_thread *ithd;
+	struct thread *td;
+	struct proc *p;
+	int error;
+
+	ithd = malloc(sizeof(struct intr_thread), M_ITHREAD, M_WAITOK | M_ZERO);
+
+	error = kthread_create(ithread_loop, ih, &p, RFSTOPPED | RFHIGHPID,
+	    0, "%s", name);
+	if (error)
+		panic("kthread_create() failed with %d", error);
+	td = FIRST_THREAD_IN_PROC(p);	/* XXXKSE */
+	thread_lock(td);
+	sched_class(td, PRI_ITHD);
 	TD_SET_IWAIT(td);
-	mtx_unlock_spin(&sched_lock);
+	thread_unlock(td);
 	td->td_pflags |= TDP_ITHREAD;
 	ithd->it_thread = td;
 	CTR2(KTR_INTR, "%s: created %s", __func__, name);
 	return (ithd);
 }
+#endif
 
 static void
-ithread_destroy2(struct intr_thread *ithread)
+ithread_destroy(struct intr_thread *ithread)
 {
 	struct thread *td;
 
 	CTR2(KTR_INTR, "%s: killing %s", __func__, ithread->it_event->ie_name);
 	td = ithread->it_thread;
-	mtx_lock_spin(&sched_lock);
+	thread_lock(td);
 	ithread->it_flags |= IT_DEAD;
 	if (TD_AWAITING_INTR(td)) {
 		TD_CLR_IWAIT(td);
-		setrunqueue(td, SRQ_INTR);
+		sched_add(td, SRQ_INTR);
 	}
-	mtx_unlock_spin(&sched_lock);
+	thread_unlock(td);
 }
 
+#ifndef INTR_FILTER
 int
 intr_event_add_handler(struct intr_event *ie, const char *name,
-    driver_intr_t handler, void *arg, u_char pri, enum intr_type flags,
-    void **cookiep)
+    driver_filter_t filter, driver_intr_t handler, void *arg, u_char pri,
+    enum intr_type flags, void **cookiep)
 {
 	struct intr_handler *ih, *temp_ih;
 	struct intr_thread *it;
 
-	if (ie == NULL || name == NULL || handler == NULL)
+	if (ie == NULL || name == NULL || (handler == NULL && filter == NULL))
 		return (EINVAL);
 
 	/* Allocate and populate an interrupt handler structure. */
 	ih = malloc(sizeof(struct intr_handler), M_ITHREAD, M_WAITOK | M_ZERO);
+	ih->ih_filter = filter;
 	ih->ih_handler = handler;
 	ih->ih_argument = arg;
 	ih->ih_name = name;
 	ih->ih_event = ie;
 	ih->ih_pri = pri;
-	if (flags & INTR_FAST)
-		ih->ih_flags = IH_FAST;
-	else if (flags & INTR_EXCL)
+	if (flags & INTR_EXCL)
 		ih->ih_flags = IH_EXCLUSIVE;
 	if (flags & INTR_MPSAFE)
 		ih->ih_flags |= IH_MPSAFE;
@@ -371,10 +444,9 @@
 	intr_event_update(ie);
 
 	/* Create a thread if we need one. */
-	while (ie->ie_thread == NULL && !(flags & INTR_FAST)) {
+	while (ie->ie_thread == NULL && handler != NULL) {
 		if (ie->ie_flags & IE_ADDING_THREAD)
-			msleep(ie, &ie->ie_lock, curthread->td_priority,
-			    "ithread", 0);
+			msleep(ie, &ie->ie_lock, 0, "ithread", 0);
 		else {
 			ie->ie_flags |= IE_ADDING_THREAD;
 			mtx_unlock(&ie->ie_lock);
@@ -395,7 +467,111 @@
 		*cookiep = ih;
 	return (0);
 }
+#else
+int
+intr_event_add_handler(struct intr_event *ie, const char *name,
+    driver_filter_t filter, driver_intr_t handler, void *arg, u_char pri,
+    enum intr_type flags, void **cookiep)
+{
+	struct intr_handler *ih, *temp_ih;
+	struct intr_thread *it;
+
+	if (ie == NULL || name == NULL || (handler == NULL && filter == NULL))
+		return (EINVAL);
+
+	/* Allocate and populate an interrupt handler structure. */
+	ih = malloc(sizeof(struct intr_handler), M_ITHREAD, M_WAITOK | M_ZERO);
+	ih->ih_filter = filter;
+	ih->ih_handler = handler;
+	ih->ih_argument = arg;
+	ih->ih_name = name;
+	ih->ih_event = ie;
+	ih->ih_pri = pri;
+	if (flags & INTR_EXCL)
+		ih->ih_flags = IH_EXCLUSIVE;
+	if (flags & INTR_MPSAFE)
+		ih->ih_flags |= IH_MPSAFE;
+	if (flags & INTR_ENTROPY)
+		ih->ih_flags |= IH_ENTROPY;
+
+	/* We can only have one exclusive handler in a event. */
+	mtx_lock(&ie->ie_lock);
+	if (!TAILQ_EMPTY(&ie->ie_handlers)) {
+		if ((flags & INTR_EXCL) ||
+		    (TAILQ_FIRST(&ie->ie_handlers)->ih_flags & IH_EXCLUSIVE)) {
+			mtx_unlock(&ie->ie_lock);
+			free(ih, M_ITHREAD);
+			return (EINVAL);
+		}
+	}
+
+	/* Add the new handler to the event in priority order. */
+	TAILQ_FOREACH(temp_ih, &ie->ie_handlers, ih_next) {
+		if (temp_ih->ih_pri > ih->ih_pri)
+			break;
+	}
+	if (temp_ih == NULL)
+		TAILQ_INSERT_TAIL(&ie->ie_handlers, ih, ih_next);
+	else
+		TAILQ_INSERT_BEFORE(temp_ih, ih, ih_next);
+	intr_event_update(ie);
 
+	/* For filtered handlers, create a private ithread to run on. */
+	if (filter != NULL && handler != NULL) { 
+		mtx_unlock(&ie->ie_lock);
+		it = ithread_create("intr: newborn", ih);		
+		mtx_lock(&ie->ie_lock);
+		it->it_event = ie; 
+		ih->ih_thread = it;
+		ithread_update(it); // XXX - do we really need this?!?!?
+	} else { /* Create the global per-event thread if we need one. */
+		while (ie->ie_thread == NULL && handler != NULL) {
+			if (ie->ie_flags & IE_ADDING_THREAD)
+				msleep(ie, &ie->ie_lock, 0, "ithread", 0);
+			else {
+				ie->ie_flags |= IE_ADDING_THREAD;
+				mtx_unlock(&ie->ie_lock);
+				it = ithread_create("intr: newborn", ih);
+				mtx_lock(&ie->ie_lock);
+				ie->ie_flags &= ~IE_ADDING_THREAD;
+				ie->ie_thread = it;
+				it->it_event = ie;
+				ithread_update(it);
+				wakeup(ie);
+			}
+		}
+	}
+	CTR3(KTR_INTR, "%s: added %s to %s", __func__, ih->ih_name,
+	    ie->ie_name);
+	mtx_unlock(&ie->ie_lock);
+
+	if (cookiep != NULL)
+		*cookiep = ih;
+	return (0);
+}
+#endif
+
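
A hypothetical driver fragment showing how the filter/handler pair added above
would be registered through bus_setup_intr(9) with its split filter/handler
arguments (foo_softc, foo_intr and the FOO_* macros are assumptions, not part
of this change):

static int
foo_filter(void *arg)
{
	struct foo_softc *sc = arg;

	if (!FOO_INTR_PENDING(sc))
		return (FILTER_STRAY);
	FOO_MASK_INTR(sc);			/* quiesce the source ... */
	return (FILTER_SCHEDULE_THREAD);	/* ... then run foo_intr() in the private ithread */
}

/* In foo_attach(): */
error = bus_setup_intr(dev, sc->irq_res, INTR_TYPE_NET | INTR_MPSAFE,
    foo_filter, foo_intr, sc, &sc->intrhand);
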
+/*
+ * Return the ie_source field from the intr_event an intr_handler is
+ * associated with.
+ */
+void *
+intr_handler_source(void *cookie)
+{
+	struct intr_handler *ih;
+	struct intr_event *ie;
+
+	ih = (struct intr_handler *)cookie;
+	if (ih == NULL)
+		return (NULL);
+	ie = ih->ih_event;
+	KASSERT(ie != NULL,
+	    ("interrupt handler \"%s\" has a NULL interrupt event",
+	    ih->ih_name));
+	return (ie->ie_source);
+}
+
+#ifndef INTR_FILTER
 int
 intr_event_remove_handler(void *cookie)
 {
@@ -413,7 +589,7 @@
 	ie = handler->ih_event;
 	KASSERT(ie != NULL,
 	    ("interrupt handler \"%s\" has a NULL interrupt event",
-		handler->ih_name));
+	    handler->ih_name));
 	mtx_lock(&ie->ie_lock);
 	CTR3(KTR_INTR, "%s: removing %s from %s", __func__, handler->ih_name,
 	    ie->ie_name);
@@ -446,7 +622,7 @@
 	 * so we have to remove the handler here rather than letting the
 	 * thread do it.
 	 */
-	mtx_lock_spin(&sched_lock);
+	thread_lock(ie->ie_thread->it_thread);
 	if (!TD_AWAITING_INTR(ie->ie_thread->it_thread) && !cold) {
 		handler->ih_flags |= IH_DEAD;
 
@@ -458,10 +634,9 @@
 		ie->ie_thread->it_need = 1;
 	} else
 		TAILQ_REMOVE(&ie->ie_handlers, handler, ih_next);
-	mtx_unlock_spin(&sched_lock);
+	thread_unlock(ie->ie_thread->it_thread);
 	while (handler->ih_flags & IH_DEAD)
-		msleep(handler, &ie->ie_lock, curthread->td_priority, "iev_rmh",
-		    0);
+		msleep(handler, &ie->ie_lock, 0, "iev_rmh", 0);
 	intr_event_update(ie);
 #ifdef notyet
 	/*
@@ -477,7 +652,7 @@
 		}
 	}
 	if (dead) {
-		ithread_destroy2(ie->ie_thread);
+		ithread_destroy(ie->ie_thread);
 		ie->ie_thread = NULL;
 	}
 #endif
@@ -524,24 +699,179 @@
 
 	/*
 	 * Set it_need to tell the thread to keep running if it is already
-	 * running.  Then, grab sched_lock and see if we actually need to
-	 * put this thread on the runqueue.
+	 * running.  Then, lock the thread and see if we actually need to
+	 * put it on the runqueue.
+	 */
+	it->it_need = 1;
+	thread_lock(td);
+	if (TD_AWAITING_INTR(td)) {
+		CTR3(KTR_INTR, "%s: schedule pid %d (%s)", __func__, p->p_pid,
+		    p->p_comm);
+		TD_CLR_IWAIT(td);
+		sched_add(td, SRQ_INTR);
+	} else {
+		CTR5(KTR_INTR, "%s: pid %d (%s): it_need %d, state %d",
+		    __func__, p->p_pid, p->p_comm, it->it_need, td->td_state);
+	}
+	thread_unlock(td);
+
+	return (0);
+}
+#else
+int
+intr_event_remove_handler(void *cookie)
+{
+	struct intr_handler *handler = (struct intr_handler *)cookie;
+	struct intr_event *ie;
+	struct intr_thread *it;
+#ifdef INVARIANTS
+	struct intr_handler *ih;
+#endif
+#ifdef notyet
+	int dead;
+#endif
+
+	if (handler == NULL)
+		return (EINVAL);
+	ie = handler->ih_event;
+	KASSERT(ie != NULL,
+	    ("interrupt handler \"%s\" has a NULL interrupt event",
+	    handler->ih_name));
+	mtx_lock(&ie->ie_lock);
+	CTR3(KTR_INTR, "%s: removing %s from %s", __func__, handler->ih_name,
+	    ie->ie_name);
+#ifdef INVARIANTS
+	TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next)
+		if (ih == handler)
+			goto ok;
+	mtx_unlock(&ie->ie_lock);
+	panic("interrupt handler \"%s\" not found in interrupt event \"%s\"",
+	    ih->ih_name, ie->ie_name);
+ok:
+#endif
+	/*
+	 * If there are no ithreads (per event and per handler), then
+	 * just remove the handler and return.  
+	 * XXX: Note that an INTR_FAST handler might be running on another CPU!
+	 */
+	if (ie->ie_thread == NULL && handler->ih_thread == NULL) {
+		TAILQ_REMOVE(&ie->ie_handlers, handler, ih_next);
+		mtx_unlock(&ie->ie_lock);
+		free(handler, M_ITHREAD);
+		return (0);
+	}
+
+	/* Private or global ithread? */
+	it = (handler->ih_thread) ? handler->ih_thread : ie->ie_thread;
+	/*
+	 * If the interrupt thread is already running, then just mark this
+	 * handler as being dead and let the ithread do the actual removal.
+	 *
+	 * During a cold boot while cold is set, msleep() does not sleep,
+	 * so we have to remove the handler here rather than letting the
+	 * thread do it.
+	 */
+	thread_lock(it->it_thread);
+	if (!TD_AWAITING_INTR(it->it_thread) && !cold) {
+		handler->ih_flags |= IH_DEAD;
+
+		/*
+		 * Ensure that the thread will process the handler list
+		 * again and remove this handler if it has already passed
+		 * it on the list.
+		 */
+		it->it_need = 1;
+	} else
+		TAILQ_REMOVE(&ie->ie_handlers, handler, ih_next);
+	thread_unlock(it->it_thread);
+	while (handler->ih_flags & IH_DEAD)
+		msleep(handler, &ie->ie_lock, 0, "iev_rmh", 0);
+	/* 
+	 * At this point, the handler has been disconnected from the event,
+	 * so we can kill the private ithread if any.
+	 */
+	if (handler->ih_thread) {
+		ithread_destroy(handler->ih_thread);
+		handler->ih_thread = NULL;
+	}
+	intr_event_update(ie);
+#ifdef notyet
+	/*
+	 * XXX: This could be bad in the case of ppbus(8).  Also, I think
+	 * this could lead to races of stale data when servicing an
+	 * interrupt.
+	 */
+	dead = 1;
+	TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) {
+		if (handler != NULL) {
+			dead = 0;
+			break;
+		}
+	}
+	if (dead) {
+		ithread_destroy(ie->ie_thread);
+		ie->ie_thread = NULL;
+	}
+#endif
+	mtx_unlock(&ie->ie_lock);
+	free(handler, M_ITHREAD);
+	return (0);
+}
+
+int
+intr_event_schedule_thread(struct intr_event *ie, struct intr_thread *it)
+{
+	struct intr_entropy entropy;
+	struct thread *td;
+	struct thread *ctd;
+	struct proc *p;
+
+	/*
+	 * If no ithread or no handlers, then we have a stray interrupt.
+	 */
+	if (ie == NULL || TAILQ_EMPTY(&ie->ie_handlers) || it == NULL)
+		return (EINVAL);
+
+	ctd = curthread;
+	td = it->it_thread;
+	p = td->td_proc;
+
+	/*
+	 * If any of the handlers for this ithread claim to be good
+	 * sources of entropy, then gather some.
+	 */
+	if (harvest.interrupt && ie->ie_flags & IE_ENTROPY) {
+		CTR3(KTR_INTR, "%s: pid %d (%s) gathering entropy", __func__,
+		    p->p_pid, p->p_comm);
+		entropy.event = (uintptr_t)ie;
+		entropy.td = ctd;
+		random_harvest(&entropy, sizeof(entropy), 2, 0,
+		    RANDOM_INTERRUPT);
+	}
+
+	KASSERT(p != NULL, ("ithread %s has no process", ie->ie_name));
+
+	/*
+	 * Set it_need to tell the thread to keep running if it is already
+	 * running.  Then, lock the thread and see if we actually need to
+	 * put it on the runqueue.
 	 */
 	it->it_need = 1;
-	mtx_lock_spin(&sched_lock);
+	thread_lock(td);
 	if (TD_AWAITING_INTR(td)) {
 		CTR3(KTR_INTR, "%s: schedule pid %d (%s)", __func__, p->p_pid,
 		    p->p_comm);
 		TD_CLR_IWAIT(td);
-		setrunqueue(td, SRQ_INTR);
+		sched_add(td, SRQ_INTR);
 	} else {
 		CTR5(KTR_INTR, "%s: pid %d (%s): it_need %d, state %d",
 		    __func__, p->p_pid, p->p_comm, it->it_need, td->td_state);
 	}
-	mtx_unlock_spin(&sched_lock);
+	thread_unlock(td);
 
 	return (0);
 }
+#endif
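
Both versions of intr_event_schedule_thread() follow the same handshake: post it_need first, then lock the thread and only put it on the run queue if it is actually parked in IWAIT, so a wakeup can never be lost.  A rough pthreads sketch of that pattern follows; every name below is invented, and the kernel of course uses thread_lock()/sched_add() rather than a condition variable:

/* Sketch of the "set need flag, then wake only if idle" handshake. */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
static int need;		/* analogue of it_need */
static int idle;		/* analogue of TD_AWAITING_INTR() */

static void *
worker(void *arg)
{
	for (;;) {
		pthread_mutex_lock(&lock);
		while (!need) {		/* re-check before sleeping */
			idle = 1;
			pthread_cond_wait(&cv, &lock);
			idle = 0;
		}
		need = 0;
		pthread_mutex_unlock(&lock);
		printf("servicing work\n");	/* run the handlers */
	}
	return (NULL);
}

static void
schedule_work(void)
{
	pthread_mutex_lock(&lock);
	need = 1;		/* always post the work first */
	if (idle)		/* wake the thread only if it is asleep */
		pthread_cond_signal(&cv);
	pthread_mutex_unlock(&lock);
}

int
main(void)
{
	pthread_t td;

	pthread_create(&td, NULL, worker, NULL);
	schedule_work();
	sleep(1);
	return (0);
}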
 
 /*
  * Add a software interrupt handler to a specified event.  If a given event
@@ -554,7 +884,7 @@
 	struct intr_event *ie;
 	int error;
 
-	if (flags & (INTR_FAST | INTR_ENTROPY))
+	if (flags & INTR_ENTROPY)
 		return (EINVAL);
 
 	ie = (eventp != NULL) ? *eventp : NULL;
@@ -563,14 +893,19 @@
 		if (!(ie->ie_flags & IE_SOFT))
 			return (EINVAL);
 	} else {
-		error = intr_event_create(&ie, NULL, IE_SOFT, NULL,
-		    "swi%d:", pri);
+#ifdef INTR_FILTER
+		error = intr_event_create(&ie, NULL, IE_SOFT,
+		    NULL, NULL, NULL, "swi%d:", pri);
+#else
+		error = intr_event_create(&ie, NULL, IE_SOFT,
+		    NULL, "swi%d:", pri);
+#endif
 		if (error)
 			return (error);
 		if (eventp != NULL)
 			*eventp = ie;
 	}
-	return (intr_event_add_handler(ie, name, handler, arg,
+	return (intr_event_add_handler(ie, name, NULL, handler, arg,
 		    (pri * RQ_PPQ) + PI_SOFT, flags, cookiep));
 		    /* XXKSE.. think of a better way to get separate queues */
 }
@@ -585,8 +920,6 @@
 	struct intr_event *ie = ih->ih_event;
 	int error;
 
-	PCPU_LAZY_INC(cnt.v_intr);
-
 	CTR3(KTR_INTR, "swi_sched: %s %s need=%d", ie->ie_name, ih->ih_name,
 	    ih->ih_need);
 
@@ -596,8 +929,14 @@
 	 * it will execute it the next time it runs.
 	 */
 	atomic_store_rel_int(&ih->ih_need, 1);
+
 	if (!(flags & SWI_DELAY)) {
+		PCPU_INC(cnt.v_soft);
+#ifdef INTR_FILTER
+		error = intr_event_schedule_thread(ie, ie->ie_thread);
+#else
 		error = intr_event_schedule_thread(ie);
+#endif
 		KASSERT(error == 0, ("stray software interrupt"));
 	}
 }
@@ -615,25 +954,38 @@
 	return (intr_event_remove_handler(cookie));
 }
 
-/* ABI compatibility shims. */
-#undef ithread_remove_handler
-#undef ithread_destroy
-int	ithread_remove_handler(void *);
-int	ithread_destroy(struct ithd *);
-
-int
-ithread_remove_handler(void *cookie)
-{
-
-	return (intr_event_remove_handler(cookie));
-}
-
-int
-ithread_destroy(struct ithd *ithread)
+#ifdef INTR_FILTER
+static void
+priv_ithread_execute_handler(struct proc *p, struct intr_handler *ih)
 {
+	struct intr_event *ie;
 
-	return (intr_event_destroy(ithread));
+	ie = ih->ih_event;
+	/*
+	 * If this handler is marked for death, remove it from
+	 * the list of handlers and wake up the sleeper.
+	 */
+	if (ih->ih_flags & IH_DEAD) {
+		mtx_lock(&ie->ie_lock);
+		TAILQ_REMOVE(&ie->ie_handlers, ih, ih_next);
+		ih->ih_flags &= ~IH_DEAD;
+		wakeup(ih);
+		mtx_unlock(&ie->ie_lock);
+		return;
+	}
+	
+	/* Execute this handler. */
+	CTR6(KTR_INTR, "%s: pid %d exec %p(%p) for %s flg=%x",
+	     __func__, p->p_pid, (void *)ih->ih_handler, ih->ih_argument,
+	     ih->ih_name, ih->ih_flags);
+	
+	if (!(ih->ih_flags & IH_MPSAFE))
+		mtx_lock(&Giant);
+	ih->ih_handler(ih->ih_argument);
+	if (!(ih->ih_flags & IH_MPSAFE))
+		mtx_unlock(&Giant);
 }
+#endif
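
priv_ithread_execute_handler(), like ithread_execute_handlers() below it, brackets any handler that is not marked IH_MPSAFE with the Giant mutex.  The sketch below shows the same "serialize legacy callbacks behind one global lock" idea in plain pthreads; the struct, flag, and function names are made up for illustration:

/* Serialize callbacks that do not do their own locking. */
#include <pthread.h>
#include <stdio.h>

#define CB_MPSAFE	0x01	/* callback handles its own locking */

struct callback {
	void	(*fn)(void *);
	void	*arg;
	int	flags;
};

static pthread_mutex_t giant = PTHREAD_MUTEX_INITIALIZER;

static void
run_callback(struct callback *cb)
{
	if (!(cb->flags & CB_MPSAFE))
		pthread_mutex_lock(&giant);	/* legacy: one big lock */
	cb->fn(cb->arg);
	if (!(cb->flags & CB_MPSAFE))
		pthread_mutex_unlock(&giant);
}

static void
say_hello(void *arg)
{
	printf("%s\n", (const char *)arg);
}

int
main(void)
{
	struct callback legacy = { say_hello, "legacy handler", 0 };
	struct callback safe = { say_hello, "mpsafe handler", CB_MPSAFE };

	run_callback(&legacy);
	run_callback(&safe);
	return (0);
}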
 
 static void
 ithread_execute_handlers(struct proc *p, struct intr_event *ie)
@@ -658,6 +1010,10 @@
 			continue;
 		}
 
+		/* Skip filter only handlers */
+		if (ih->ih_handler == NULL)
+			continue;
+
 		/*
 		 * For software interrupt threads, we only execute
 		 * handlers that have their need flag set.  Hardware
@@ -670,14 +1026,10 @@
 				atomic_store_rel_int(&ih->ih_need, 0);
 		}
 
-		/* Fast handlers are handled in primary interrupt context. */
-		if (ih->ih_flags & IH_FAST)
-			continue;
-
 		/* Execute this handler. */
 		CTR6(KTR_INTR, "%s: pid %d exec %p(%p) for %s flg=%x",
-		    __func__, p->p_pid, (void *)ih->ih_handler, ih->ih_argument,
-		    ih->ih_name, ih->ih_flags);
+		    __func__, p->p_pid, (void *)ih->ih_handler, 
+		    ih->ih_argument, ih->ih_name, ih->ih_flags);
 
 		if (!(ih->ih_flags & IH_MPSAFE))
 			mtx_lock(&Giant);
@@ -698,14 +1050,15 @@
 	 * number of back to back interrupts exceeds the storm threshold,
 	 * then enter storming mode.
 	 */
-	if (intr_storm_threshold != 0 && ie->ie_count >= intr_storm_threshold) {
-		if (ie->ie_warned == 0) {
+	if (intr_storm_threshold != 0 && ie->ie_count >= intr_storm_threshold &&
+	    !(ie->ie_flags & IE_SOFT)) {
+		/* Report the message only once every second. */
+		if (ppsratecheck(&ie->ie_warntm, &ie->ie_warncnt, 1)) {
 			printf(
-	"Interrupt storm detected on \"%s\"; throttling interrupt source\n",
+	"interrupt storm detected on \"%s\"; throttling interrupt source\n",
 			    ie->ie_name);
-			ie->ie_warned = 1;
 		}
-		tsleep(&ie->ie_count, curthread->td_priority, "istorm", 1);
+		pause("istorm", 1);
 	} else
 		ie->ie_count++;
 
@@ -717,6 +1070,7 @@
 		ie->ie_enable(ie->ie_source);
 }
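
The storm hunk above drops the one-shot ie_warned flag in favour of ppsratecheck(), so the warning repeats at most once per second while the source is throttled with pause().  As a sketch, the same once-per-second gating can be written in a few lines of portable C; ratecheck_pps() below is an invented stand-in, not the kernel's ppsratecheck():

/* Allow an action at most maxpps times per second. */
#include <stdio.h>
#include <time.h>

static int
ratecheck_pps(struct timespec *last, int *curpps, int maxpps)
{
	struct timespec now;

	clock_gettime(CLOCK_MONOTONIC, &now);
	if (now.tv_sec != last->tv_sec) {	/* new one-second window */
		*last = now;
		*curpps = 0;
	}
	if (*curpps >= maxpps)
		return (0);			/* suppressed */
	(*curpps)++;
	return (1);				/* allowed */
}

int
main(void)
{
	struct timespec last = { 0, 0 };
	int curpps = 0;

	for (int i = 0; i < 1000000; i++) {
		if (ratecheck_pps(&last, &curpps, 1))
			printf("storm warning (at most once per second)\n");
	}
	return (0);
}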
 
+#ifndef INTR_FILTER
 /*
  * This is the main code for interrupt threads.
  */
@@ -774,15 +1128,221 @@
 		 * lock.  This may take a while and it_need may get
 		 * set again, so we have to check it again.
 		 */
-		mtx_lock_spin(&sched_lock);
+		thread_lock(td);
 		if (!ithd->it_need && !(ithd->it_flags & IT_DEAD)) {
 			TD_SET_IWAIT(td);
 			ie->ie_count = 0;
 			mi_switch(SW_VOL, NULL);
 		}
-		mtx_unlock_spin(&sched_lock);
+		thread_unlock(td);
 	}
 }
+#else
+/*
+ * This is the main code for interrupt threads.
+ */
+static void
+ithread_loop(void *arg)
+{
+	struct intr_thread *ithd;
+	struct intr_handler *ih;
+	struct intr_event *ie;
+	struct thread *td;
+	struct proc *p;
+	int priv;
+
+	td = curthread;
+	p = td->td_proc;
+	ih = (struct intr_handler *)arg;
+	priv = (ih->ih_thread != NULL) ? 1 : 0;
+	ithd = (priv) ? ih->ih_thread : ih->ih_event->ie_thread;
+	KASSERT(ithd->it_thread == td,
+	    ("%s: ithread and proc linkage out of sync", __func__));
+	ie = ithd->it_event;
+	ie->ie_count = 0;
+
+	/*
+	 * As long as we have interrupts outstanding, go through the
+	 * list of handlers, giving each one a go at it.
+	 */
+	for (;;) {
+		/*
+		 * If we are an orphaned thread, then just die.
+		 */
+		if (ithd->it_flags & IT_DEAD) {
+			CTR3(KTR_INTR, "%s: pid %d (%s) exiting", __func__,
+			    p->p_pid, p->p_comm);
+			free(ithd, M_ITHREAD);
+			kthread_exit(0);
+		}
+
+		/*
+		 * Service interrupts.  If another interrupt arrives while
+		 * we are running, it will set it_need to note that we
+		 * should make another pass.
+		 */
+		while (ithd->it_need) {
+			/*
+			 * This might need a full read and write barrier
+			 * to make sure that this write posts before any
+			 * of the memory or device accesses in the
+			 * handlers.
+			 */
+			atomic_store_rel_int(&ithd->it_need, 0);
+			if (priv)
+				priv_ithread_execute_handler(p, ih);
+			else 
+				ithread_execute_handlers(p, ie);
+		}
+		WITNESS_WARN(WARN_PANIC, NULL, "suspending ithread");
+		mtx_assert(&Giant, MA_NOTOWNED);
+
+		/*
+		 * Processed all our interrupts.  Now get the sched
+		 * lock.  This may take a while and it_need may get
+		 * set again, so we have to check it again.
+		 */
+		thread_lock(td);
+		if (!ithd->it_need && !(ithd->it_flags & IT_DEAD)) {
+			TD_SET_IWAIT(td);
+			ie->ie_count = 0;
+			mi_switch(SW_VOL, NULL);
+		}
+		thread_unlock(td);
+	}
+}
+
+/* 
+ * Main loop for interrupt filter.
+ *
+ * Some architectures (i386, amd64 and arm) require the optional frame 
+ * parameter, and use it as the main argument for fast handler execution
+ * when ih_argument == NULL.
+ *
+ * Return value:
+ * o FILTER_STRAY:              No filter recognized the event, and no
+ *                              filter-less handler is registered on this 
+ *                              line.
+ * o FILTER_HANDLED:            A filter claimed the event and served it.
+ * o FILTER_SCHEDULE_THREAD:    No filter claimed the event, but there's at
+ *                              least one filter-less handler on this line.
+ * o FILTER_HANDLED | 
+ *   FILTER_SCHEDULE_THREAD:    A filter claimed the event, and asked for
+ *                              scheduling the per-handler ithread.
+ *
+ * In case an ithread has to be scheduled, in *ithd there will be a 
+ * pointer to a struct intr_thread containing the thread to be
+ * scheduled.
+ */
+
+int
+intr_filter_loop(struct intr_event *ie, struct trapframe *frame, 
+		 struct intr_thread **ithd) 
+{
+	struct intr_handler *ih;
+	void *arg;
+	int ret, thread_only;
+
+	ret = 0;
+	thread_only = 0;
+	TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) {
+		/*
+		 * Execute fast interrupt handlers directly.
+		 * To support clock handlers, if a handler registers
+		 * with a NULL argument, then we pass it a pointer to
+		 * a trapframe as its argument.
+		 */
+		arg = ((ih->ih_argument == NULL) ? frame : ih->ih_argument);
+		
+		CTR5(KTR_INTR, "%s: exec %p/%p(%p) for %s", __func__,
+		     ih->ih_filter, ih->ih_handler, arg, ih->ih_name);
+
+		if (ih->ih_filter != NULL)
+			ret = ih->ih_filter(arg);
+		else {
+			thread_only = 1;
+			continue;
+		}
+
+		if (ret & FILTER_STRAY)
+			continue;
+		else { 
+			*ithd = ih->ih_thread;
+			return (ret);
+		}
+	}
+
+	/*
+	 * No filters handled the interrupt and we have at least
+	 * one handler without a filter.  In this case, we schedule
+	 * all of the filter-less handlers to run in the ithread.
+	 */	
+	if (thread_only) {
+		*ithd = ie->ie_thread;
+		return (FILTER_SCHEDULE_THREAD);
+	}
+	return (FILTER_STRAY);
+}
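
intr_filter_loop() runs each registered filter directly in interrupt context and only asks for FILTER_SCHEDULE_THREAD when it meets handlers that have no filter at all.  The decision logic can be modelled in ordinary C; the F_* constants and the struct below are local stand-ins for the kernel's FILTER_* values, defined here only so the sketch compiles on its own:

/* Toy model of the filter-vs-ithread dispatch decision. */
#include <stdio.h>

#define F_STRAY		0x01	/* nobody claimed the event */
#define F_HANDLED	0x02	/* a filter fully served it */
#define F_SCHED		0x04	/* defer to the interrupt thread */

struct handler {
	int	(*filter)(void *);	/* may be NULL: thread-only handler */
	void	*arg;
};

static int
dispatch(struct handler *h, int n)
{
	int thread_only = 0, ret;

	for (int i = 0; i < n; i++) {
		if (h[i].filter == NULL) {
			thread_only = 1;	/* needs the ithread */
			continue;
		}
		ret = h[i].filter(h[i].arg);
		if (ret & F_STRAY)
			continue;		/* not ours, try the next one */
		return (ret);			/* claimed: handled/scheduled */
	}
	return (thread_only ? F_SCHED : F_STRAY);
}

static int
claims_it(void *arg)
{
	return (F_HANDLED);
}

int
main(void)
{
	struct handler h[] = { { NULL, NULL }, { claims_it, NULL } };

	printf("result=%#x\n", dispatch(h, 2));	/* 0x2: a filter handled it */
	return (0);
}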
+
+/*
+ * Main interrupt handling body.
+ *
+ * Input:
+ * o ie:                        the event connected to this interrupt.
+ * o frame:                     some archs (e.g. i386) pass a frame to some
+ *                              handlers as their main argument.
+ * Return value:
+ * o 0:                         everything ok.
+ * o EINVAL:                    stray interrupt.
+ */
+int
+intr_event_handle(struct intr_event *ie, struct trapframe *frame)
+{
+	struct intr_thread *ithd;
+	struct thread *td;
+	int thread;
+
+	ithd = NULL;
+	td = curthread;
+
+	if (ie == NULL || TAILQ_EMPTY(&ie->ie_handlers))
+		return (EINVAL);
+
+	td->td_intr_nesting_level++;
+	thread = 0;
+	critical_enter();
+	thread = intr_filter_loop(ie, frame, &ithd);
+	
+	/*
+	 * If the interrupt was fully served, send it an EOI but leave
+	 * it unmasked. Otherwise, mask the source as well as sending
+	 * it an EOI.
+	 */
+	if (thread & FILTER_HANDLED) {
+		if (ie->ie_eoi != NULL)
+			ie->ie_eoi(ie->ie_source);
+	} else {
+		if (ie->ie_disab != NULL)
+			ie->ie_disab(ie->ie_source);
+	}
+	critical_exit();
+	
+	/* Interrupt storm logic */
+	if (thread & FILTER_STRAY) {
+		ie->ie_count++;
+		if (ie->ie_count < intr_storm_threshold)
+			printf("Interrupt stray detection not present\n");
+	}
+
+	/* Schedule an ithread if needed. */
+	if (thread & FILTER_SCHEDULE_THREAD) {
+		if (intr_event_schedule_thread(ie, ithd) != 0)
+			panic("%s: impossible stray interrupt", __func__);
+	}
+	td->td_intr_nesting_level--;
+	return (0);
+}
+#endif
 
 #ifdef DDB
 /*
@@ -829,14 +1389,10 @@
 	db_printsym((uintptr_t)ih->ih_handler, DB_STGY_PROC);
 	db_printf("(%p)", ih->ih_argument);
 	if (ih->ih_need ||
-	    (ih->ih_flags & (IH_FAST | IH_EXCLUSIVE | IH_ENTROPY | IH_DEAD |
+	    (ih->ih_flags & (IH_EXCLUSIVE | IH_ENTROPY | IH_DEAD |
 	    IH_MPSAFE)) != 0) {
 		db_printf(" {");
 		comma = 0;
-		if (ih->ih_flags & IH_FAST) {
-			db_printf("FAST");
-			comma = 1;
-		}
 		if (ih->ih_flags & IH_EXCLUSIVE) {
 			if (comma)
 				db_printf(", ");
@@ -927,16 +1483,16 @@
 DB_SHOW_COMMAND(intr, db_show_intr)
 {
 	struct intr_event *ie;
-	int quit, all, verbose;
+	int all, verbose;
 
-	quit = 0;
 	verbose = index(modif, 'v') != NULL;
 	all = index(modif, 'a') != NULL;
-	db_setup_paging(db_simple_pager, &quit, db_lines_per_page);
 	TAILQ_FOREACH(ie, &event_list, ie_list) {
 		if (!all && TAILQ_EMPTY(&ie->ie_handlers))
 			continue;
 		db_dump_intr_event(ie, verbose);
+		if (db_pager_quit)
+			break;
 	}
 }
 #endif /* DDB */
@@ -998,11 +1554,9 @@
 {
 	u_long *i;
 	char *cp;
-	int quit;
 
 	cp = intrnames;
-	db_setup_paging(db_simple_pager, &quit, db_lines_per_page);
-	for (i = intrcnt, quit = 0; i != eintrcnt && !quit; i++) {
+	for (i = intrcnt; i != eintrcnt && !db_pager_quit; i++) {
 		if (*cp == '\0')
 			break;
 		if (*i != 0)
Index: vfs_bio.c
===================================================================
RCS file: /home/cvs/src/sys/kern/vfs_bio.c,v
retrieving revision 1.6
retrieving revision 1.7
diff -L sys/kern/vfs_bio.c -L sys/kern/vfs_bio.c -u -r1.6 -r1.7
--- sys/kern/vfs_bio.c
+++ sys/kern/vfs_bio.c
@@ -39,7 +39,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/vfs_bio.c,v 1.491.2.7 2006/03/13 03:06:09 jeff Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/vfs_bio.c,v 1.528 2007/09/26 11:22:23 ru Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -48,6 +48,7 @@
 #include <sys/buf.h>
 #include <sys/devicestat.h>
 #include <sys/eventhandler.h>
+#include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
@@ -71,7 +72,7 @@
 #include "opt_directio.h"
 #include "opt_swap.h"
 
-static MALLOC_DEFINE(M_BIOBUF, "BIO buffer", "BIO buffer");
+static MALLOC_DEFINE(M_BIOBUF, "biobuf", "BIO buffer");
 
 struct	bio_ops bioops;		/* I/O operation notification */
 
@@ -80,6 +81,7 @@
 	.bop_write	=	bufwrite,
 	.bop_strategy	=	bufstrategy,
 	.bop_sync	=	bufsync,
+	.bop_bdflush	=	bufbdflush,
 };
 
 /*
@@ -99,10 +101,11 @@
 			       int pageno, vm_page_t m);
 static void vfs_clean_pages(struct buf *bp);
 static void vfs_setdirty(struct buf *bp);
+static void vfs_setdirty_locked_object(struct buf *bp);
 static void vfs_vmio_release(struct buf *bp);
 static int vfs_bio_clcheck(struct vnode *vp, int size,
 		daddr_t lblkno, daddr_t blkno);
-static int flushbufqueues(int flushdeps);
+static int flushbufqueues(int, int);
 static void buf_daemon(void);
 static void bremfreel(struct buf *bp);
 
@@ -145,10 +148,13 @@
 static int hirunningspace;
 SYSCTL_INT(_vfs, OID_AUTO, hirunningspace, CTLFLAG_RW, &hirunningspace, 0,
     "Maximum amount of space to use for in-progress I/O");
-static int dirtybufferflushes;
+int dirtybufferflushes;
 SYSCTL_INT(_vfs, OID_AUTO, dirtybufferflushes, CTLFLAG_RW, &dirtybufferflushes,
     0, "Number of bdwrite to bawrite conversions to limit dirty buffers");
-static int altbufferflushes;
+int bdwriteskip;
+SYSCTL_INT(_vfs, OID_AUTO, bdwriteskip, CTLFLAG_RW, &bdwriteskip,
+    0, "Number of buffers supplied to bdwrite with snapshot deadlock risk");
+int altbufferflushes;
 SYSCTL_INT(_vfs, OID_AUTO, altbufferflushes, CTLFLAG_RW, &altbufferflushes,
     0, "Number of fsync flushes to limit dirty buffers");
 static int recursiveflushes;
@@ -163,7 +169,7 @@
 static int hidirtybuffers;
 SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW, &hidirtybuffers, 0,
     "When the number of dirty buffers is considered severe");
-static int dirtybufthresh;
+int dirtybufthresh;
 SYSCTL_INT(_vfs, OID_AUTO, dirtybufthresh, CTLFLAG_RW, &dirtybufthresh,
     0, "Number of bdwrite to bawrite conversions to clear dirty buffers");
 static int numfreebuffers;
@@ -237,15 +243,21 @@
 static struct mtx bdonelock;
 
 /*
+ * Lock that protects against bwait()/bdone()/B_DONE races.
+ */
+static struct mtx bpinlock;
+
+/*
  * Definitions for the buffer free lists.
  */
-#define BUFFER_QUEUES	5	/* number of free buffer queues */
+#define BUFFER_QUEUES	6	/* number of free buffer queues */
 
 #define QUEUE_NONE	0	/* on no queue */
 #define QUEUE_CLEAN	1	/* non-B_DELWRI buffers */
 #define QUEUE_DIRTY	2	/* B_DELWRI buffers */
-#define QUEUE_EMPTYKVA	3	/* empty buffer headers w/KVA assignment */
-#define QUEUE_EMPTY	4	/* empty buffer headers */
+#define QUEUE_DIRTY_GIANT 3	/* B_DELWRI buffers that need giant */
+#define QUEUE_EMPTYKVA	4	/* empty buffer headers w/KVA assignment */
+#define QUEUE_EMPTY	5	/* empty buffer headers */
 
 /* Queues for free buffers with various properties */
 static TAILQ_HEAD(bqueues, buf) bufqueues[BUFFER_QUEUES] = { { 0 } };
@@ -408,7 +420,7 @@
 	}
 }
 
-/* Wake up the buffer deamon if necessary */
+/* Wake up the buffer daemon if necessary */
 static __inline
 void
 bd_wakeup(int dirtybuflevel)
@@ -443,6 +455,7 @@
 caddr_t
 kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est)
 {
+	int maxbuf;
 
 	/*
 	 * physmem_est is in pages.  Convert it to kilobytes (assumes
@@ -454,7 +467,7 @@
 	 * The nominal buffer size (and minimum KVA allocation) is BKVASIZE.
 	 * For the first 64MB of ram nominally allocate sufficient buffers to
 	 * cover 1/4 of our ram.  Beyond the first 64MB allocate additional
-	 * buffers to cover 1/20 of our ram over 64MB.  When auto-sizing
+	 * buffers to cover 1/10 of our ram over 64MB.  When auto-sizing
 	 * the buffer cache we limit the eventual kva reservation to
 	 * maxbcache bytes.
 	 *
@@ -472,6 +485,11 @@
 
 		if (maxbcache && nbuf > maxbcache / BKVASIZE)
 			nbuf = maxbcache / BKVASIZE;
+
+		/* XXX Avoid integer overflows later on with maxbufspace. */
+		maxbuf = (INT_MAX / 3) / BKVASIZE;
+		if (nbuf > maxbuf)
+			nbuf = maxbuf;
 	}
 
 #if 0
@@ -523,6 +541,7 @@
 	mtx_init(&nblock, "needsbuffer lock", NULL, MTX_DEF);
 	mtx_init(&bdlock, "buffer daemon lock", NULL, MTX_DEF);
 	mtx_init(&bdonelock, "bdone lock", NULL, MTX_DEF);
+	mtx_init(&bpinlock, "bpin lock", NULL, MTX_DEF);
 
 	/* next, make a null set of free lists */
 	for (i = 0; i < BUFFER_QUEUES; i++)
@@ -621,12 +640,8 @@
 	if (bp->b_kvasize) {
 		atomic_add_int(&buffreekvacnt, 1);
 		atomic_subtract_int(&bufspace, bp->b_kvasize);
-		vm_map_lock(buffer_map);
-		vm_map_delete(buffer_map,
-		    (vm_offset_t) bp->b_kvabase,
-		    (vm_offset_t) bp->b_kvabase + bp->b_kvasize
-		);
-		vm_map_unlock(buffer_map);
+		vm_map_remove(buffer_map, (vm_offset_t) bp->b_kvabase,
+		    (vm_offset_t) bp->b_kvabase + bp->b_kvasize);
 		bp->b_kvasize = 0;
 		bufspacewakeup();
 	}
@@ -720,18 +735,51 @@
 }
 
 /*
+ * Attempt to initiate asynchronous I/O on read-ahead blocks.  We must
+ * clear BIO_ERROR and B_INVAL prior to initiating I/O.  If B_CACHE is set,
+ * the buffer is valid and we do not have to do anything.
+ */
+void
+breada(struct vnode * vp, daddr_t * rablkno, int * rabsize,
+    int cnt, struct ucred * cred)
+{
+	struct buf *rabp;
+	int i;
+
+	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
+		if (inmem(vp, *rablkno))
+			continue;
+		rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0);
+
+		if ((rabp->b_flags & B_CACHE) == 0) {
+			if (!TD_IS_IDLETHREAD(curthread))
+				curthread->td_ru.ru_inblock++;
+			rabp->b_flags |= B_ASYNC;
+			rabp->b_flags &= ~B_INVAL;
+			rabp->b_ioflags &= ~BIO_ERROR;
+			rabp->b_iocmd = BIO_READ;
+			if (rabp->b_rcred == NOCRED && cred != NOCRED)
+				rabp->b_rcred = crhold(cred);
+			vfs_busy_pages(rabp, 0);
+			BUF_KERNPROC(rabp);
+			rabp->b_iooffset = dbtob(rabp->b_blkno);
+			bstrategy(rabp);
+		} else {
+			brelse(rabp);
+		}
+	}
+}
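
breada() is split out of breadn() so that read-ahead of blocks not already cached can be fired off asynchronously while the caller waits only on the block it actually needs now.  Loosely the same idea is available to userland through posix_fadvise(2) with POSIX_FADV_WILLNEED, sketched below; this is only an analogy (and the call may not exist on older systems), not the buffer-cache interface itself:

/* Hint the kernel to read ahead before the data is actually needed. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	char buf[4096];
	int fd;

	fd = open("/etc/services", O_RDONLY);	/* any regular file */
	if (fd < 0)
		return (1);

	/* Ask for asynchronous read-ahead of the next 1 MB ... */
	posix_fadvise(fd, 0, 1024 * 1024, POSIX_FADV_WILLNEED);

	/* ... then read only what we need right now. */
	if (read(fd, buf, sizeof(buf)) < 0)
		perror("read");
	close(fd);
	return (0);
}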
+
+/*
  * Operates like bread, but also starts asynchronous I/O on
- * read-ahead blocks.  We must clear BIO_ERROR and B_INVAL prior
- * to initiating I/O . If B_CACHE is set, the buffer is valid 
- * and we do not have to do anything.
+ * read-ahead blocks.
  */
 int
 breadn(struct vnode * vp, daddr_t blkno, int size,
     daddr_t * rablkno, int *rabsize,
     int cnt, struct ucred * cred, struct buf **bpp)
 {
-	struct buf *bp, *rabp;
-	int i;
+	struct buf *bp;
 	int rv = 0, readwait = 0;
 
 	CTR3(KTR_BUF, "breadn(%p, %jd, %d)", vp, blkno, size);
@@ -739,8 +787,8 @@
 
 	/* if not found in cache, do some I/O */
 	if ((bp->b_flags & B_CACHE) == 0) {
-		if (curthread != PCPU_GET(idlethread))
-			curthread->td_proc->p_stats->p_ru.ru_inblock++;
+		if (!TD_IS_IDLETHREAD(curthread))
+			curthread->td_ru.ru_inblock++;
 		bp->b_iocmd = BIO_READ;
 		bp->b_flags &= ~B_INVAL;
 		bp->b_ioflags &= ~BIO_ERROR;
@@ -752,28 +800,7 @@
 		++readwait;
 	}
 
-	for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
-		if (inmem(vp, *rablkno))
-			continue;
-		rabp = getblk(vp, *rablkno, *rabsize, 0, 0, 0);
-
-		if ((rabp->b_flags & B_CACHE) == 0) {
-			if (curthread != PCPU_GET(idlethread))
-				curthread->td_proc->p_stats->p_ru.ru_inblock++;
-			rabp->b_flags |= B_ASYNC;
-			rabp->b_flags &= ~B_INVAL;
-			rabp->b_ioflags &= ~BIO_ERROR;
-			rabp->b_iocmd = BIO_READ;
-			if (rabp->b_rcred == NOCRED && cred != NOCRED)
-				rabp->b_rcred = crhold(cred);
-			vfs_busy_pages(rabp, 0);
-			BUF_KERNPROC(rabp);
-			rabp->b_iooffset = dbtob(rabp->b_blkno);
-			bstrategy(rabp);
-		} else {
-			brelse(rabp);
-		}
-	}
+	breada(vp, rablkno, rabsize, cnt, cred);
 
 	if (readwait) {
 		rv = bufwait(bp);
@@ -796,6 +823,8 @@
 bufwrite(struct buf *bp)
 {
 	int oldflags;
+	struct vnode *vp;
+	int vp_md;
 
 	CTR3(KTR_BUF, "bufwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 	if (bp->b_flags & B_INVAL) {
@@ -807,9 +836,19 @@
 
 	if (BUF_REFCNT(bp) == 0)
 		panic("bufwrite: buffer is not busy???");
+
+	if (bp->b_pin_count > 0)
+		bunpin_wait(bp);
+
 	KASSERT(!(bp->b_vflags & BV_BKGRDINPROG),
 	    ("FFS background buffer should not get here %p", bp));
 
+	vp = bp->b_vp;
+	if (vp)
+		vp_md = vp->v_vflag & VV_MD;
+	else
+		vp_md = 0;
+
 	/* Mark the buffer clean */
 	bundirty(bp);
 
@@ -827,8 +866,8 @@
 	bp->b_runningbufspace = bp->b_bufsize;
 	atomic_add_int(&runningbufspace, bp->b_runningbufspace);
 
-	if (curthread != PCPU_GET(idlethread))
-		curthread->td_proc->p_stats->p_ru.ru_oublock++;
+	if (!TD_IS_IDLETHREAD(curthread))
+		curthread->td_ru.ru_oublock++;
 	if (oldflags & B_ASYNC)
 		BUF_KERNPROC(bp);
 	bp->b_iooffset = dbtob(bp->b_blkno);
@@ -847,13 +886,54 @@
 		 * or syncer daemon trying to clean up as that can lead
 		 * to deadlock.
 		 */
-		if ((curthread->td_pflags & TDP_NORUNNINGBUF) == 0)
+		if ((curthread->td_pflags & TDP_NORUNNINGBUF) == 0 && !vp_md)
 			waitrunningbufspace();
 	}
 
 	return (0);
 }
 
+void
+bufbdflush(struct bufobj *bo, struct buf *bp)
+{
+	struct buf *nbp;
+
+	if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10) {
+		(void) VOP_FSYNC(bp->b_vp, MNT_NOWAIT, curthread);
+		altbufferflushes++;
+	} else if (bo->bo_dirty.bv_cnt > dirtybufthresh) {
+		BO_LOCK(bo);
+		/*
+		 * Try to find a buffer to flush.
+		 */
+		TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) {
+			if ((nbp->b_vflags & BV_BKGRDINPROG) ||
+			    BUF_LOCK(nbp,
+				     LK_EXCLUSIVE | LK_NOWAIT, NULL))
+				continue;
+			if (bp == nbp)
+				panic("bdwrite: found ourselves");
+			BO_UNLOCK(bo);
+			/* Don't countdeps with the bo lock held. */
+			if (buf_countdeps(nbp, 0)) {
+				BO_LOCK(bo);
+				BUF_UNLOCK(nbp);
+				continue;
+			}
+			if (nbp->b_flags & B_CLUSTEROK) {
+				vfs_bio_awrite(nbp);
+			} else {
+				bremfree(nbp);
+				bawrite(nbp);
+			}
+			dirtybufferflushes++;
+			break;
+		}
+		if (nbp == NULL)
+			BO_UNLOCK(bo);
+	}
+}
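
bufbdflush() is the default implementation behind the new BO_BDFLUSH hook called from bdwrite() below: once a vnode has more than dirtybufthresh delayed-write buffers, pick one and push it out so the amount of dirty data stays bounded.  A very rough userland analogue is a write-behind wrapper that forces a flush when pending bytes cross a threshold; everything below (names, sizes, the use of stdio) is invented for the sketch:

/* Write-behind with a dirty threshold: flush once too much is pending. */
#include <stdio.h>
#include <string.h>

#define DIRTY_THRESH	8192	/* bytes we allow to sit unflushed */

static size_t dirty_bytes;

static void
delayed_write(FILE *fp, const char *buf, size_t len)
{
	fwrite(buf, 1, len, fp);	/* stays buffered for now */
	dirty_bytes += len;
	if (dirty_bytes > DIRTY_THRESH) {
		fflush(fp);		/* bound the amount of dirty data */
		dirty_bytes = 0;
	}
}

int
main(void)
{
	FILE *fp = fopen("/tmp/example.out", "w");
	char chunk[1024];

	if (fp == NULL)
		return (1);
	memset(chunk, 'x', sizeof(chunk));
	for (int i = 0; i < 100; i++)
		delayed_write(fp, chunk, sizeof(chunk));
	fclose(fp);
	return (0);
}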
+
 /*
  * Delayed write. (Buffer is marked dirty).  Do not bother writing
  * anything if the buffer is marked invalid.
@@ -868,7 +948,6 @@
 {
 	struct thread *td = curthread;
 	struct vnode *vp;
-	struct buf *nbp;
 	struct bufobj *bo;
 
 	CTR3(KTR_BUF, "bdwrite(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
@@ -889,43 +968,10 @@
 	 */
 	vp = bp->b_vp;
 	bo = bp->b_bufobj;
-	if ((td->td_pflags & TDP_COWINPROGRESS) == 0) {
-		BO_LOCK(bo);
-		if (bo->bo_dirty.bv_cnt > dirtybufthresh + 10) {
-			BO_UNLOCK(bo);
-			(void) VOP_FSYNC(vp, MNT_NOWAIT, td);
-			altbufferflushes++;
-		} else if (bo->bo_dirty.bv_cnt > dirtybufthresh) {
-			/*
-			 * Try to find a buffer to flush.
-			 */
-			TAILQ_FOREACH(nbp, &bo->bo_dirty.bv_hd, b_bobufs) {
-				if ((nbp->b_vflags & BV_BKGRDINPROG) ||
-				    BUF_LOCK(nbp,
-				    LK_EXCLUSIVE | LK_NOWAIT, NULL))
-					continue;
-				if (bp == nbp)
-					panic("bdwrite: found ourselves");
-				BO_UNLOCK(bo);
-				/* Don't countdeps with the bo lock held. */
-				if (buf_countdeps(nbp, 0)) {
-					BO_LOCK(bo);
-					BUF_UNLOCK(nbp);
-					continue;
-				}
-				if (nbp->b_flags & B_CLUSTEROK) {
-					vfs_bio_awrite(nbp);
-				} else {
-					bremfree(nbp);
-					bawrite(nbp);
-				}
-				dirtybufferflushes++;
-				break;
-			}
-			if (nbp == NULL)
-				BO_UNLOCK(bo);
-		} else
-			BO_UNLOCK(bo);
+	if ((td->td_pflags & (TDP_COWINPROGRESS|TDP_INBDFLUSH)) == 0) {
+		td->td_pflags |= TDP_INBDFLUSH;
+		BO_BDFLUSH(bo, bp);
+		td->td_pflags &= ~TDP_INBDFLUSH;
 	} else
 		recursiveflushes++;
 
@@ -1117,6 +1163,11 @@
 	KASSERT(!(bp->b_flags & (B_CLUSTER|B_PAGING)),
 	    ("brelse: inappropriate B_PAGING or B_CLUSTER bp %p", bp));
 
+	if (bp->b_flags & B_MANAGED) {
+		bqrelse(bp);
+		return;
+	}
+
 	if (bp->b_iocmd == BIO_WRITE &&
 	    (bp->b_ioflags & BIO_ERROR) &&
 	    !(bp->b_flags & B_INVAL)) {
@@ -1136,7 +1187,7 @@
 		 * cache the buffer.
 		 */
 		bp->b_flags |= B_INVAL;
-		if (LIST_FIRST(&bp->b_dep) != NULL)
+		if (!LIST_EMPTY(&bp->b_dep))
 			buf_deallocate(bp);
 		if (bp->b_flags & B_DELWRI) {
 			atomic_subtract_int(&numdirtybuffers, 1);
@@ -1329,6 +1380,9 @@
 		TAILQ_INSERT_HEAD(&bufqueues[QUEUE_CLEAN], bp, b_freelist);
 	/* remaining buffers */
 	} else {
+		if ((bp->b_flags & (B_DELWRI|B_NEEDSGIANT)) ==
+		    (B_DELWRI|B_NEEDSGIANT))
+			bp->b_qindex = QUEUE_DIRTY_GIANT;
 		if (bp->b_flags & B_DELWRI)
 			bp->b_qindex = QUEUE_DIRTY;
 		else
@@ -1399,6 +1453,18 @@
 		BUF_UNLOCK(bp);
 		return;
 	}
+
+	if (bp->b_flags & B_MANAGED) {
+		if (bp->b_flags & B_REMFREE) {
+			mtx_lock(&bqlock);
+			bremfreel(bp);
+			mtx_unlock(&bqlock);
+		}
+		bp->b_flags &= ~(B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
+		BUF_UNLOCK(bp);
+		return;
+	}
+
 	mtx_lock(&bqlock);
 	/* Handle delayed bremfree() processing. */
 	if (bp->b_flags & B_REMFREE)
@@ -1407,8 +1473,11 @@
 		panic("bqrelse: free buffer onto another queue???");
 	/* buffers with stale but valid contents */
 	if (bp->b_flags & B_DELWRI) {
-		bp->b_qindex = QUEUE_DIRTY;
-		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY], bp, b_freelist);
+		if (bp->b_flags & B_NEEDSGIANT)
+			bp->b_qindex = QUEUE_DIRTY_GIANT;
+		else
+			bp->b_qindex = QUEUE_DIRTY;
+		TAILQ_INSERT_TAIL(&bufqueues[bp->b_qindex], bp, b_freelist);
 	} else {
 		/*
 		 * XXX This lock may not be necessary since BKGRDINPROG
@@ -1473,7 +1542,7 @@
 		 * the responsibility of the process that
 		 * busied the pages to deal with them.
 		 */
-		if ((m->flags & PG_BUSY) || (m->busy != 0))
+		if ((m->oflags & VPO_BUSY) || (m->busy != 0))
 			continue;
 			
 		if (m->wire_count == 0) {
@@ -1484,7 +1553,6 @@
 			 */
 			if ((bp->b_flags & B_ASYNC) == 0 && !m->valid &&
 			    m->hold_count == 0) {
-				pmap_remove_all(m);
 				vm_page_free(m);
 			} else if (bp->b_flags & B_DIRECT) {
 				vm_page_try_to_free(m);
@@ -1798,7 +1866,7 @@
 			crfree(bp->b_wcred);
 			bp->b_wcred = NOCRED;
 		}
-		if (LIST_FIRST(&bp->b_dep) != NULL)
+		if (!LIST_EMPTY(&bp->b_dep))
 			buf_deallocate(bp);
 		if (bp->b_vflags & BV_BKGRDINPROG)
 			panic("losing buffer 3");
@@ -1826,6 +1894,10 @@
 		bp->b_npages = 0;
 		bp->b_dirtyoff = bp->b_dirtyend = 0;
 		bp->b_bufobj = NULL;
+		bp->b_pin_count = 0;
+		bp->b_fsprivate1 = NULL;
+		bp->b_fsprivate2 = NULL;
+		bp->b_fsprivate3 = NULL;
 
 		LIST_INIT(&bp->b_dep);
 
@@ -1841,6 +1913,17 @@
 		}
 
 		/*
+		 * Notify any waiters for the buffer lock about
+		 * identity change by freeing the buffer.
+		 */
+		if (qindex == QUEUE_CLEAN && BUF_LOCKWAITERS(bp) > 0) {
+			bp->b_flags |= B_INVAL;
+			bfreekva(bp);
+			brelse(bp);
+			goto restart;
+		}
+
+		/*
 		 * If we are overcomitted then recover the buffer and its
 		 * KVM space.  This occurs in rare situations when multiple
 		 * processes are blocked in getnewbuf() or allocbuf().
@@ -1959,7 +2042,6 @@
 static void
 buf_daemon()
 {
-	mtx_lock(&Giant);
 
 	/*
 	 * This process needs to be suspended prior to shutdown sync.
@@ -1985,13 +2067,28 @@
 		 * normally would so they can run in parallel with our drain.
 		 */
 		while (numdirtybuffers > lodirtybuffers) {
-			if (flushbufqueues(0) == 0) {
+			int flushed;
+
+			flushed = flushbufqueues(QUEUE_DIRTY, 0);
+			/* The list empty check here is slightly racy */
+			if (!TAILQ_EMPTY(&bufqueues[QUEUE_DIRTY_GIANT])) {
+				mtx_lock(&Giant);
+				flushed += flushbufqueues(QUEUE_DIRTY_GIANT, 0);
+				mtx_unlock(&Giant);
+			}
+			if (flushed == 0) {
 				/*
 				 * Could not find any buffers without rollback
 				 * dependencies, so just write the first one
 				 * in the hopes of eventually making progress.
 				 */
-				flushbufqueues(1);
+				flushbufqueues(QUEUE_DIRTY, 1);
+				if (!TAILQ_EMPTY(
+				    &bufqueues[QUEUE_DIRTY_GIANT])) {
+					mtx_lock(&Giant);
+					flushbufqueues(QUEUE_DIRTY_GIANT, 1);
+					mtx_unlock(&Giant);
+				}
 				break;
 			}
 			uio_yield();
@@ -2039,7 +2136,7 @@
     0, "Number of buffers flushed with dependecies that require rollbacks");
 
 static int
-flushbufqueues(int flushdeps)
+flushbufqueues(int queue, int flushdeps)
 {
 	struct thread *td = curthread;
 	struct buf sentinel;
@@ -2056,16 +2153,20 @@
 	flushed = 0;
 	bp = NULL;
 	mtx_lock(&bqlock);
-	TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY], &sentinel, b_freelist);
+	TAILQ_INSERT_TAIL(&bufqueues[queue], &sentinel, b_freelist);
 	while (flushed != target) {
-		bp = TAILQ_FIRST(&bufqueues[QUEUE_DIRTY]);
+		bp = TAILQ_FIRST(&bufqueues[queue]);
 		if (bp == &sentinel)
 			break;
-		TAILQ_REMOVE(&bufqueues[QUEUE_DIRTY], bp, b_freelist);
-		TAILQ_INSERT_TAIL(&bufqueues[QUEUE_DIRTY], bp, b_freelist);
+		TAILQ_REMOVE(&bufqueues[queue], bp, b_freelist);
+		TAILQ_INSERT_TAIL(&bufqueues[queue], bp, b_freelist);
 
 		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT, NULL) != 0)
 			continue;
+		if (bp->b_pin_count > 0) {
+			BUF_UNLOCK(bp);
+			continue;
+		}
 		BO_LOCK(bp->b_bufobj);
 		if ((bp->b_vflags & BV_BKGRDINPROG) != 0 ||
 		    (bp->b_flags & B_DELWRI) == 0) {
@@ -2084,7 +2185,7 @@
 			continue;
 		}
 
-		if (LIST_FIRST(&bp->b_dep) != NULL && buf_countdeps(bp, 0)) {
+		if (!LIST_EMPTY(&bp->b_dep) && buf_countdeps(bp, 0)) {
 			if (flushdeps == 0) {
 				BUF_UNLOCK(bp);
 				continue;
@@ -2124,7 +2225,7 @@
 		vn_finished_write(mp);
 		BUF_UNLOCK(bp);
 	}
-	TAILQ_REMOVE(&bufqueues[QUEUE_DIRTY], &sentinel, b_freelist);
+	TAILQ_REMOVE(&bufqueues[queue], &sentinel, b_freelist);
 	mtx_unlock(&bqlock);
 	return (flushed);
 }
@@ -2206,8 +2307,6 @@
 static void
 vfs_setdirty(struct buf *bp) 
 {
-	int i;
-	vm_object_t object;
 
 	/*
 	 * Degenerate case - empty buffer
@@ -2218,20 +2317,25 @@
 
 	/*
 	 * We qualify the scan for modified pages on whether the
-	 * object has been flushed yet.  The OBJ_WRITEABLE flag
-	 * is not cleared simply by protecting pages off.
+	 * object has been flushed yet.
 	 */
 
 	if ((bp->b_flags & B_VMIO) == 0)
 		return;
 
-	object = bp->b_pages[0]->object;
-	VM_OBJECT_LOCK(object);
-	if ((object->flags & OBJ_WRITEABLE) && !(object->flags & OBJ_MIGHTBEDIRTY))
-		printf("Warning: object %p writeable but not mightbedirty\n", object);
-	if (!(object->flags & OBJ_WRITEABLE) && (object->flags & OBJ_MIGHTBEDIRTY))
-		printf("Warning: object %p mightbedirty but not writeable\n", object);
+	VM_OBJECT_LOCK(bp->b_bufobj->bo_object);
+	vfs_setdirty_locked_object(bp);
+	VM_OBJECT_UNLOCK(bp->b_bufobj->bo_object);
+}
 
+static void
+vfs_setdirty_locked_object(struct buf *bp)
+{
+	vm_object_t object;
+	int i;
+
+	object = bp->b_bufobj->bo_object;
+	VM_OBJECT_LOCK_ASSERT(object, MA_OWNED);
 	if (object->flags & (OBJ_MIGHTBEDIRTY|OBJ_CLEANING)) {
 		vm_offset_t boffset;
 		vm_offset_t eoffset;
@@ -2282,7 +2386,6 @@
 				bp->b_dirtyend = eoffset;
 		}
 	}
-	VM_OBJECT_UNLOCK(object);
 }
 
 /*
@@ -2347,14 +2450,14 @@
 	 * XXX remove if 0 sections (clean this up after its proven)
          */
 	if (numfreebuffers == 0) {
-		if (curthread == PCPU_GET(idlethread))
+		if (TD_IS_IDLETHREAD(curthread))
 			return NULL;
 		mtx_lock(&nblock);
 		needsbuffer |= VFS_BIO_NEED_ANY;
 		mtx_unlock(&nblock);
 	}
 
-	VI_LOCK(vp);
+	BO_LOCK(bo);
 	bp = gbincore(bo, blkno);
 	if (bp != NULL) {
 		int lockflags;
@@ -2400,10 +2503,23 @@
 			if ((bp->b_flags & B_VMIO) == 0 ||
 			    (size > bp->b_kvasize)) {
 				if (bp->b_flags & B_DELWRI) {
+					/*
+					 * If buffer is pinned and caller does
+					 * not want sleep  waiting for it to be
+					 * unpinned, bail out
+					 * */
+					if (bp->b_pin_count > 0) {
+						if (flags & GB_LOCK_NOWAIT) {
+							bqrelse(bp);
+							return (NULL);
+						} else {
+							bunpin_wait(bp);
+						}
+					}
 					bp->b_flags |= B_NOCACHE;
 					bwrite(bp);
 				} else {
-					if (LIST_FIRST(&bp->b_dep) == NULL) {
+					if (LIST_EMPTY(&bp->b_dep)) {
 						bp->b_flags |= B_RELBUF;
 						brelse(bp);
 					} else {
@@ -2470,7 +2586,7 @@
 		 * returned by getnewbuf() is locked.  Note that the returned
 		 * buffer is also considered valid (not marked B_INVAL).
 		 */
-		VI_UNLOCK(vp);
+		BO_UNLOCK(bo);
 		/*
 		 * If the user does not want us to create the buffer, bail out
 		 * here.
@@ -2516,7 +2632,6 @@
 		 */
 		bp->b_blkno = bp->b_lblkno = blkno;
 		bp->b_offset = offset;
-
 		bgetvp(vp, bp);
 		BO_UNLOCK(bo);
 
@@ -2783,7 +2898,8 @@
 						VM_WAIT;
 						VM_OBJECT_LOCK(obj);
 					} else {
-						bp->b_flags &= ~B_CACHE;
+						if (m->valid == 0)
+							bp->b_flags &= ~B_CACHE;
 						bp->b_pages[bp->b_npages] = m;
 						++bp->b_npages;
 					}
@@ -2795,26 +2911,19 @@
 				 * retry because it might have gotten freed out
 				 * from under us.
 				 *
-				 * We can only test PG_BUSY here.  Blocking on
+				 * We can only test VPO_BUSY here.  Blocking on
 				 * m->busy might lead to a deadlock:
 				 *
 				 *  vm_fault->getpages->cluster_read->allocbuf
 				 *
 				 */
-				vm_page_lock_queues();
 				if (vm_page_sleep_if_busy(m, FALSE, "pgtblk"))
 					continue;
 
 				/*
-				 * We have a good page.  Should we wakeup the
-				 * page daemon?
+				 * We have a good page.
 				 */
-				if ((curproc != pageproc) &&
-				    ((m->queue - m->pc) == PQ_CACHE) &&
-				    ((cnt.v_free_count + cnt.v_cache_count) <
-					(cnt.v_free_min + cnt.v_cache_min))) {
-					pagedaemon_wakeup();
-				}
+				vm_page_lock_queues();
 				vm_page_wire(m);
 				vm_page_unlock_queues();
 				bp->b_pages[bp->b_npages] = m;
@@ -3041,11 +3150,11 @@
 	struct bufobj *dropobj;
 	void    (*biodone)(struct buf *);
 
-
 	CTR3(KTR_BUF, "bufdone(%p) vp %p flags %X", bp, bp->b_vp, bp->b_flags);
 	dropobj = NULL;
 
-	KASSERT(BUF_REFCNT(bp) > 0, ("biodone: bp %p not busy %d", bp, BUF_REFCNT(bp)));
+	KASSERT(BUF_REFCNT(bp) > 0, ("biodone: bp %p not busy %d", bp,
+	    BUF_REFCNT(bp)));
 	KASSERT(!(bp->b_flags & B_DONE), ("biodone: bp %p already done", bp));
 
 	runningbufwakeup(bp);
@@ -3060,7 +3169,20 @@
 			bufobj_wdrop(dropobj);
 		return;
 	}
-	if (LIST_FIRST(&bp->b_dep) != NULL)
+
+	bufdone_finish(bp);
+
+	if (dropobj)
+		bufobj_wdrop(dropobj);
+}
+
+void
+bufdone_finish(struct buf *bp)
+{
+	KASSERT(BUF_REFCNT(bp) > 0, ("biodone: bp %p not busy %d", bp,
+	    BUF_REFCNT(bp)));
+
+	if (!LIST_EMPTY(&bp->b_dep))
 		buf_complete(bp);
 
 	if (bp->b_flags & B_VMIO) {
@@ -3070,6 +3192,7 @@
 		vm_object_t obj;
 		int iosize;
 		struct vnode *vp = bp->b_vp;
+		boolean_t are_queues_locked;
 
 		obj = bp->b_bufobj->bo_object;
 
@@ -3106,7 +3229,11 @@
 		    !(bp->b_ioflags & BIO_ERROR)) {
 			bp->b_flags |= B_CACHE;
 		}
-		vm_page_lock_queues();
+		if (bp->b_iocmd == BIO_READ) {
+			vm_page_lock_queues();
+			are_queues_locked = TRUE;
+		} else
+			are_queues_locked = FALSE;
 		for (i = 0; i < bp->b_npages; i++) {
 			int bogusflag = 0;
 			int resid;
@@ -3125,7 +3252,8 @@
 				if (m == NULL)
 					panic("biodone: page disappeared!");
 				bp->b_pages[i] = m;
-				pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages);
+				pmap_qenter(trunc_page((vm_offset_t)bp->b_data),
+				    bp->b_pages, bp->b_npages);
 			}
 #if defined(VFS_BIO_DEBUG)
 			if (OFF_TO_IDX(foff) != m->pindex) {
@@ -3174,7 +3302,8 @@
 			foff = (foff + PAGE_SIZE) & ~(off_t)PAGE_MASK;
 			iosize -= resid;
 		}
-		vm_page_unlock_queues();
+		if (are_queues_locked)
+			vm_page_unlock_queues();
 		vm_object_pip_wakeupn(obj, 0);
 		VM_OBJECT_UNLOCK(obj);
 	}
@@ -3192,8 +3321,6 @@
 			bqrelse(bp);
 	} else
 		bdone(bp);
-	if (dropobj)
-		bufobj_wdrop(dropobj);
 }
 
 /*
@@ -3214,7 +3341,6 @@
 
 	obj = bp->b_bufobj->bo_object;
 	VM_OBJECT_LOCK(obj);
-	vm_page_lock_queues();
 	for (i = 0; i < bp->b_npages; i++) {
 		m = bp->b_pages[i];
 		if (m == bogus_page) {
@@ -3228,7 +3354,6 @@
 		vm_object_pip_subtract(obj, 1);
 		vm_page_io_finish(m);
 	}
-	vm_page_unlock_queues();
 	vm_object_pip_wakeupn(obj, 0);
 	VM_OBJECT_UNLOCK(obj);
 }
@@ -3275,7 +3400,7 @@
  * This routine is called before a device strategy routine.
  * It is used to tell the VM system that paging I/O is in
  * progress, and treat the pages associated with the buffer
- * almost as being PG_BUSY.  Also the object paging_in_progress
+ * almost as being VPO_BUSY.  Also the object paging_in_progress
  * flag is handled to make sure that the object doesn't become
  * inconsistant.
  *
@@ -3298,10 +3423,10 @@
 	foff = bp->b_offset;
 	KASSERT(bp->b_offset != NOOFFSET,
 	    ("vfs_busy_pages: no buffer offset"));
-	vfs_setdirty(bp);
 	VM_OBJECT_LOCK(obj);
+	if (bp->b_bufsize != 0)
+		vfs_setdirty_locked_object(bp);
 retry:
-	vm_page_lock_queues();
 	for (i = 0; i < bp->b_npages; i++) {
 		m = bp->b_pages[i];
 
@@ -3309,6 +3434,7 @@
 			goto retry;
 	}
 	bogus = 0;
+	vm_page_lock_queues();
 	for (i = 0; i < bp->b_npages; i++) {
 		m = bp->b_pages[i];
 
@@ -3749,6 +3875,32 @@
 	return (error);
 }
 
+void
+bpin(struct buf *bp)
+{
+	mtx_lock(&bpinlock);
+	bp->b_pin_count++;
+	mtx_unlock(&bpinlock);
+}
+
+void
+bunpin(struct buf *bp)
+{
+	mtx_lock(&bpinlock);
+	if (--bp->b_pin_count == 0)
+		wakeup(bp);
+	mtx_unlock(&bpinlock);
+}
+
+void
+bunpin_wait(struct buf *bp)
+{
+	mtx_lock(&bpinlock);
+	while (bp->b_pin_count > 0)
+		msleep(bp, &bpinlock, PRIBIO, "bwunpin", 0);
+	mtx_unlock(&bpinlock);
+}
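
bpin()/bunpin()/bunpin_wait() implement a small pin-count protocol guarded by bpinlock: bumps and drops are cheap, the last drop does a wakeup(), and waiters msleep() until the count drains to zero.  The same protocol in stand-alone pthreads form, with invented names:

/* Pin-count protocol: wait until every pin has been released. */
#include <pthread.h>

static pthread_mutex_t pinlock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t pincv = PTHREAD_COND_INITIALIZER;
static int pin_count;

static void
pin(void)
{
	pthread_mutex_lock(&pinlock);
	pin_count++;
	pthread_mutex_unlock(&pinlock);
}

static void
unpin(void)
{
	pthread_mutex_lock(&pinlock);
	if (--pin_count == 0)
		pthread_cond_broadcast(&pincv);	/* wakeup(bp) analogue */
	pthread_mutex_unlock(&pinlock);
}

static void
unpin_wait(void)
{
	pthread_mutex_lock(&pinlock);
	while (pin_count > 0)			/* msleep() loop analogue */
		pthread_cond_wait(&pincv, &pinlock);
	pthread_mutex_unlock(&pinlock);
}

int
main(void)
{
	pin();
	unpin();
	unpin_wait();		/* returns immediately: count is zero */
	return (0);
}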
+
 #include "opt_ddb.h"
 #ifdef DDB
 #include <ddb/ddb.h>
Index: subr_sbuf.c
===================================================================
RCS file: /home/cvs/src/sys/kern/subr_sbuf.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/subr_sbuf.c -L sys/kern/subr_sbuf.c -u -r1.1.1.1 -r1.2
--- sys/kern/subr_sbuf.c
+++ sys/kern/subr_sbuf.c
@@ -27,7 +27,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/subr_sbuf.c,v 1.29 2005/02/10 12:02:37 phk Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/subr_sbuf.c,v 1.30 2005/12/23 11:49:53 phk Exp $");
 
 #include <sys/param.h>
 
@@ -379,7 +379,7 @@
 		return (-1);	/* XXX */
 	}
 
-	return (0);
+	return (done);
 }
 #endif
 
Index: uipc_socket.c
===================================================================
RCS file: /home/cvs/src/sys/kern/uipc_socket.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/uipc_socket.c -L sys/kern/uipc_socket.c -u -r1.2 -r1.3
--- sys/kern/uipc_socket.c
+++ sys/kern/uipc_socket.c
@@ -1,8 +1,9 @@
 /*-
- * Copyright (c) 2004 The FreeBSD Foundation
- * Copyright (c) 2004-2005 Robert N. M. Watson
  * Copyright (c) 1982, 1986, 1988, 1990, 1993
- *	The Regents of the University of California.  All rights reserved.
+ *	The Regents of the University of California.
+ * Copyright (c) 2004 The FreeBSD Foundation
+ * Copyright (c) 2004-2007 Robert N. M. Watson
+ * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -31,8 +32,70 @@
  *	@(#)uipc_socket.c	8.3 (Berkeley) 4/15/94
  */
 
+/*
+ * Comments on the socket life cycle:
+ *
+ * soalloc() sets up socket layer state for a socket, called only by
+ * socreate() and sonewconn().  Socket layer private.
+ *
+ * sodealloc() tears down socket layer state for a socket, called only by
+ * sofree() and sonewconn().  Socket layer private.
+ *
+ * pru_attach() associates protocol layer state with an allocated socket;
+ * called only once, may fail, aborting socket allocation.  This is called
+ * from socreate() and sonewconn().  Socket layer private.
+ *
+ * pru_detach() disassociates protocol layer state from an attached socket,
+ * and will be called exactly once for sockets in which pru_attach() has
+ * been successfully called.  If pru_attach() returned an error,
+ * pru_detach() will not be called.  Socket layer private.
+ *
+ * pru_abort() and pru_close() notify the protocol layer that the last
+ * consumer of a socket is starting to tear down the socket, and that the
+ * protocol should terminate the connection.  Historically, pru_abort() also
+ * detached protocol state from the socket state, but this is no longer the
+ * case.
+ *
+ * socreate() creates a socket and attaches protocol state.  This is a public
+ * interface that may be used by socket layer consumers to create new
+ * sockets.
+ *
+ * sonewconn() creates a socket and attaches protocol state.  This is a
+ * public interface  that may be used by protocols to create new sockets when
+ * a new connection is received and will be available for accept() on a
+ * listen socket.
+ *
+ * soclose() destroys a socket after possibly waiting for it to disconnect.
+ * This is a public interface that socket consumers should use to close and
+ * release a socket when done with it.
+ *
+ * soabort() destroys a socket without waiting for it to disconnect (used
+ * only for incoming connections that are already partially or fully
+ * connected).  This is used internally by the socket layer when clearing
+ * listen socket queues (due to overflow or close on the listen socket), but
+ * is also a public interface protocols may use to abort connections in
+ * their incomplete listen queues should they no longer be required.  Sockets
+ * placed in completed connection listen queues should not be aborted for
+ * reasons described in the comment above the soclose() implementation.  This
+ * is not a general purpose close routine, and except in the specific
+ * circumstances described here, should not be used.
+ *
+ * sofree() will free a socket and its protocol state if all references on
+ * the socket have been released, and is the public interface to attempt to
+ * free a socket when a reference is removed.  This is a socket layer private
+ * interface.
+ *
+ * NOTE: In addition to socreate() and soclose(), which provide a single
+ * socket reference to the consumer to be managed as required, there are two
+ * calls to explicitly manage socket references, soref(), and sorele().
+ * Currently, these are generally required only when transitioning a socket
+ * from a listen queue to a file descriptor, in order to prevent garbage
+ * collection of the socket at an untimely moment.  For a number of reasons,
+ * these interfaces are not preferred, and should be avoided.
+ */
+
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/uipc_socket.c,v 1.242.2.4 2005/12/28 18:05:13 ps Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/uipc_socket.c,v 1.302.4.1 2008/02/02 12:44:13 rwatson Exp $");
 
 #include "opt_inet.h"
 #include "opt_mac.h"
@@ -52,6 +115,7 @@
 #include <sys/file.h>			/* for struct knote */
 #include <sys/kernel.h>
 #include <sys/event.h>
+#include <sys/eventhandler.h>
 #include <sys/poll.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
@@ -59,10 +123,14 @@
 #include <sys/socketvar.h>
 #include <sys/resourcevar.h>
 #include <sys/signalvar.h>
+#include <sys/stat.h>
+#include <sys/sx.h>
 #include <sys/sysctl.h>
 #include <sys/uio.h>
 #include <sys/jail.h>
 
+#include <security/mac/mac_framework.h>
+
 #include <vm/uma.h>
 
 #ifdef COMPAT_IA32
@@ -91,16 +159,16 @@
 uma_zone_t socket_zone;
 so_gen_t	so_gencnt;	/* generation count for sockets */
 
+int	maxsockets;
+
 MALLOC_DEFINE(M_SONAME, "soname", "socket name");
 MALLOC_DEFINE(M_PCB, "pcb", "protocol control block");
 
-SYSCTL_DECL(_kern_ipc);
-
 static int somaxconn = SOMAXCONN;
-static int somaxconn_sysctl(SYSCTL_HANDLER_ARGS);
+static int sysctl_somaxconn(SYSCTL_HANDLER_ARGS);
 /* XXX: we dont have SYSCTL_USHORT */
 SYSCTL_PROC(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLTYPE_UINT | CTLFLAG_RW,
-    0, sizeof(int), somaxconn_sysctl, "I", "Maximum pending socket connection "
+    0, sizeof(int), sysctl_somaxconn, "I", "Maximum pending socket connection "
     "queue size");
 static int numopensockets;
 SYSCTL_INT(_kern_ipc, OID_AUTO, numopensockets, CTLFLAG_RD,
@@ -132,57 +200,135 @@
 MTX_SYSINIT(so_global_mtx, &so_global_mtx, "so_glabel", MTX_DEF);
 
 /*
- * Socket operation routines.
- * These routines are called by the routines in
- * sys_socket.c or from a system process, and
- * implement the semantics of socket operations by
- * switching out to the protocol specific routines.
+ * General IPC sysctl name space, used by sockets and a variety of other IPC
+ * types.
  */
+SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");
 
 /*
- * Get a socket structure from our zone, and initialize it.
- * Note that it would probably be better to allocate socket
- * and PCB at the same time, but I'm not convinced that all
- * the protocols can be easily modified to do this.
+ * Sysctl to get and set the maximum global sockets limit.  Notify protocols
+ * of the change so that they can update their dependent limits as required.
+ */
+static int
+sysctl_maxsockets(SYSCTL_HANDLER_ARGS)
+{
+	int error, newmaxsockets;
+
+	newmaxsockets = maxsockets;
+	error = sysctl_handle_int(oidp, &newmaxsockets, 0, req);
+	if (error == 0 && req->newptr) {
+		if (newmaxsockets > maxsockets) {
+			maxsockets = newmaxsockets;
+			if (maxsockets > ((maxfiles / 4) * 3)) {
+				maxfiles = (maxsockets * 5) / 4;
+				maxfilesperproc = (maxfiles * 9) / 10;
+			}
+			EVENTHANDLER_INVOKE(maxsockets_change);
+		} else
+			error = EINVAL;
+	}
+	return (error);
+}
+
+SYSCTL_PROC(_kern_ipc, OID_AUTO, maxsockets, CTLTYPE_INT|CTLFLAG_RW,
+    &maxsockets, 0, sysctl_maxsockets, "IU",
+    "Maximum number of sockets available");
+
+/*
+ * Initialise maxsockets.
+ */
+static void init_maxsockets(void *ignored)
+{
+	TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
+	maxsockets = imax(maxsockets, imax(maxfiles, nmbclusters));
+}
+SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);
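
sysctl_maxsockets() above exports the limit as kern.ipc.maxsockets, rejects attempts to lower it, and scales maxfiles/maxfilesperproc when it is raised.  From userland the current value can be read with sysctlbyname(3), for example (error handling kept minimal):

/* Read kern.ipc.maxsockets from userland on a BSD system. */
#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int maxsockets;
	size_t len = sizeof(maxsockets);

	if (sysctlbyname("kern.ipc.maxsockets", &maxsockets, &len,
	    NULL, 0) == -1) {
		perror("sysctlbyname");
		return (1);
	}
	printf("kern.ipc.maxsockets = %d\n", maxsockets);
	return (0);
}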
+
+/*
+ * Socket operation routines.  These routines are called by the routines in
+ * sys_socket.c or from a system process, and implement the semantics of
+ * socket operations by switching out to the protocol specific routines.
+ */
+
+/*
+ * Get a socket structure from our zone, and initialize it.  Note that it
+ * would probably be better to allocate socket and PCB at the same time, but
+ * I'm not convinced that all the protocols can be easily modified to do
+ * this.
  *
  * soalloc() returns a socket with a ref count of 0.
  */
-struct socket *
-soalloc(int mflags)
+static struct socket *
+soalloc(void)
 {
 	struct socket *so;
 
-	so = uma_zalloc(socket_zone, mflags | M_ZERO);
-	if (so != NULL) {
+	so = uma_zalloc(socket_zone, M_NOWAIT | M_ZERO);
+	if (so == NULL)
+		return (NULL);
 #ifdef MAC
-		if (mac_init_socket(so, mflags) != 0) {
-			uma_zfree(socket_zone, so);
-			return (NULL);
-		}
-#endif
-		SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
-		SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
-		TAILQ_INIT(&so->so_aiojobq);
-		mtx_lock(&so_global_mtx);
-		so->so_gencnt = ++so_gencnt;
-		++numopensockets;
-		mtx_unlock(&so_global_mtx);
+	if (mac_init_socket(so, M_NOWAIT) != 0) {
+		uma_zfree(socket_zone, so);
+		return (NULL);
 	}
+#endif
+	SOCKBUF_LOCK_INIT(&so->so_snd, "so_snd");
+	SOCKBUF_LOCK_INIT(&so->so_rcv, "so_rcv");
+	sx_init(&so->so_snd.sb_sx, "so_snd_sx");
+	sx_init(&so->so_rcv.sb_sx, "so_rcv_sx");
+	TAILQ_INIT(&so->so_aiojobq);
+	mtx_lock(&so_global_mtx);
+	so->so_gencnt = ++so_gencnt;
+	++numopensockets;
+	mtx_unlock(&so_global_mtx);
 	return (so);
 }
 
 /*
+ * Free the storage associated with a socket at the socket layer, tear down
+ * locks, labels, etc.  All protocol state is assumed already to have been
+ * torn down (and possibly never set up) by the caller.
+ */
+static void
+sodealloc(struct socket *so)
+{
+
+	KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
+	KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL"));
+
+	mtx_lock(&so_global_mtx);
+	so->so_gencnt = ++so_gencnt;
+	--numopensockets;	/* Could be below, but faster here. */
+	mtx_unlock(&so_global_mtx);
+	if (so->so_rcv.sb_hiwat)
+		(void)chgsbsize(so->so_cred->cr_uidinfo,
+		    &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
+	if (so->so_snd.sb_hiwat)
+		(void)chgsbsize(so->so_cred->cr_uidinfo,
+		    &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
+#ifdef INET
+	/* remove accept filter if one is present. */
+	if (so->so_accf != NULL)
+		do_setopt_accept_filter(so, NULL);
+#endif
+#ifdef MAC
+	mac_destroy_socket(so);
+#endif
+	crfree(so->so_cred);
+	sx_destroy(&so->so_snd.sb_sx);
+	sx_destroy(&so->so_rcv.sb_sx);
+	SOCKBUF_LOCK_DESTROY(&so->so_snd);
+	SOCKBUF_LOCK_DESTROY(&so->so_rcv);
+	uma_zfree(socket_zone, so);
+}
+
+/*
  * socreate returns a socket with a ref count of 1.  The socket should be
  * closed with soclose().
  */
 int
-socreate(dom, aso, type, proto, cred, td)
-	int dom;
-	struct socket **aso;
-	int type;
-	int proto;
-	struct ucred *cred;
-	struct thread *td;
+socreate(int dom, struct socket **aso, int type, int proto,
+    struct ucred *cred, struct thread *td)
 {
 	struct protosw *prp;
 	struct socket *so;
@@ -206,7 +352,7 @@
 
 	if (prp->pr_type != type)
 		return (EPROTOTYPE);
-	so = soalloc(M_WAITOK);
+	so = soalloc();
 	if (so == NULL)
 		return (ENOBUFS);
 
@@ -229,55 +375,120 @@
 	 */
 	error = (*prp->pr_usrreqs->pru_attach)(so, proto, td);
 	if (error) {
-		ACCEPT_LOCK();
-		SOCK_LOCK(so);
-		so->so_state |= SS_NOFDREF;
-		sorele(so);
+		KASSERT(so->so_count == 1, ("socreate: so_count %d",
+		    so->so_count));
+		so->so_count = 0;
+		sodealloc(so);
 		return (error);
 	}
 	*aso = so;
 	return (0);
 }
 
-int
-sobind(so, nam, td)
-	struct socket *so;
-	struct sockaddr *nam;
-	struct thread *td;
-{
-
-	return ((*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td));
-}
+#ifdef REGRESSION
+static int regression_sonewconn_earlytest = 1;
+SYSCTL_INT(_regression, OID_AUTO, sonewconn_earlytest, CTLFLAG_RW,
+    &regression_sonewconn_earlytest, 0, "Perform early sonewconn limit test");
+#endif
 
-void
-sodealloc(struct socket *so)
+/*
+ * When an attempt at a new connection is noted on a socket which accepts
+ * connections, sonewconn is called.  If the connection is possible (subject
+ * to space constraints, etc.) then we allocate a new structure, properly
+ * linked into the data structure of the original socket, and return this.
+ * Connstatus may be 0, or SO_ISCONFIRMING, or SO_ISCONNECTED.
+ *
+ * Note: the ref count on the socket is 0 on return.
+ */
+struct socket *
+sonewconn(struct socket *head, int connstatus)
 {
+	struct socket *so;
+	int over;
 
-	KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count));
-	mtx_lock(&so_global_mtx);
-	so->so_gencnt = ++so_gencnt;
-	mtx_unlock(&so_global_mtx);
-	if (so->so_rcv.sb_hiwat)
-		(void)chgsbsize(so->so_cred->cr_uidinfo,
-		    &so->so_rcv.sb_hiwat, 0, RLIM_INFINITY);
-	if (so->so_snd.sb_hiwat)
-		(void)chgsbsize(so->so_cred->cr_uidinfo,
-		    &so->so_snd.sb_hiwat, 0, RLIM_INFINITY);
-#ifdef INET
-	/* remove acccept filter if one is present. */
-	if (so->so_accf != NULL)
-		do_setopt_accept_filter(so, NULL);
+	ACCEPT_LOCK();
+	over = (head->so_qlen > 3 * head->so_qlimit / 2);
+	ACCEPT_UNLOCK();
+#ifdef REGRESSION
+	if (regression_sonewconn_earlytest && over)
+#else
+	if (over)
 #endif
+		return (NULL);
+	so = soalloc();
+	if (so == NULL)
+		return (NULL);
+	if ((head->so_options & SO_ACCEPTFILTER) != 0)
+		connstatus = 0;
+	so->so_head = head;
+	so->so_type = head->so_type;
+	so->so_options = head->so_options &~ SO_ACCEPTCONN;
+	so->so_linger = head->so_linger;
+	so->so_state = head->so_state | SS_NOFDREF;
+	so->so_proto = head->so_proto;
+	so->so_cred = crhold(head->so_cred);
 #ifdef MAC
-	mac_destroy_socket(so);
+	SOCK_LOCK(head);
+	mac_create_socket_from_socket(head, so);
+	SOCK_UNLOCK(head);
 #endif
-	crfree(so->so_cred);
-	SOCKBUF_LOCK_DESTROY(&so->so_snd);
-	SOCKBUF_LOCK_DESTROY(&so->so_rcv);
-	uma_zfree(socket_zone, so);
-	mtx_lock(&so_global_mtx);
-	--numopensockets;
-	mtx_unlock(&so_global_mtx);
+	knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv),
+	    NULL, NULL, NULL);
+	knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd),
+	    NULL, NULL, NULL);
+	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat) ||
+	    (*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
+		sodealloc(so);
+		return (NULL);
+	}
+	so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
+	so->so_snd.sb_lowat = head->so_snd.sb_lowat;
+	so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
+	so->so_snd.sb_timeo = head->so_snd.sb_timeo;
+	so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
+	so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
+	so->so_state |= connstatus;
+	ACCEPT_LOCK();
+	if (connstatus) {
+		TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
+		so->so_qstate |= SQ_COMP;
+		head->so_qlen++;
+	} else {
+		/*
+		 * Keep removing sockets from the head until there's room for
+		 * us to insert on the tail.  In pre-locking revisions, this
+		 * was a simple if(), but as we could be racing with other
+		 * threads and soabort() requires dropping locks, we must
+		 * loop waiting for the condition to be true.
+		 */
+		while (head->so_incqlen > head->so_qlimit) {
+			struct socket *sp;
+			sp = TAILQ_FIRST(&head->so_incomp);
+			TAILQ_REMOVE(&head->so_incomp, sp, so_list);
+			head->so_incqlen--;
+			sp->so_qstate &= ~SQ_INCOMP;
+			sp->so_head = NULL;
+			ACCEPT_UNLOCK();
+			soabort(sp);
+			ACCEPT_LOCK();
+		}
+		TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
+		so->so_qstate |= SQ_INCOMP;
+		head->so_incqlen++;
+	}
+	ACCEPT_UNLOCK();
+	if (connstatus) {
+		sorwakeup(head);
+		wakeup_one(&head->so_timeo);
+	}
+	return (so);
+}
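For illustration only (not part of this diff), a minimal sketch of how a protocol's input path typically drives sonewconn() when a connection request arrives on a listening socket; the setup step and error handling are placeholders:

	/*
	 * Hedged sketch: hypothetical protocol handling a new connection
	 * request on listening socket 'head'.
	 */
	struct socket *so;

	so = sonewconn(head, 0);	/* queued on so_incomp, refcount 0 */
	if (so == NULL)
		return;			/* queue overflow or allocation failure */
	/* ... set up per-connection protocol state on 'so' ... */
	soisconnected(so);		/* moves it to so_comp and wakes accept() */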
+
+int
+sobind(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+
+	return ((*so->so_proto->pr_usrreqs->pru_bind)(so, nam, td));
 }
 
 /*
@@ -293,31 +504,14 @@
  * socket-layer test and set to avoid races at the socket layer.
  */
 int
-solisten(so, backlog, td)
-	struct socket *so;
-	int backlog;
-	struct thread *td;
+solisten(struct socket *so, int backlog, struct thread *td)
 {
-	int error;
-
-	error = (*so->so_proto->pr_usrreqs->pru_listen)(so, td);
-	if (error)
-		return (error);
 
-	/*
-	 * XXXRW: The following state adjustment should occur in
-	 * solisten_proto(), but we don't currently pass the backlog request
-	 * to the protocol via pru_listen().
-	 */
-	if (backlog < 0 || backlog > somaxconn)
-		backlog = somaxconn;
-	so->so_qlimit = backlog;
-	return (0);
+	return ((*so->so_proto->pr_usrreqs->pru_listen)(so, backlog, td));
 }
 
 int
-solisten_proto_check(so)
-	struct socket *so;
+solisten_proto_check(struct socket *so)
 {
 
 	SOCK_LOCK_ASSERT(so);
@@ -329,37 +523,48 @@
 }
 
 void
-solisten_proto(so)
-	struct socket *so;
+solisten_proto(struct socket *so, int backlog)
 {
 
 	SOCK_LOCK_ASSERT(so);
 
+	if (backlog < 0 || backlog > somaxconn)
+		backlog = somaxconn;
+	so->so_qlimit = backlog;
 	so->so_options |= SO_ACCEPTCONN;
 }
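Since pru_listen now receives the backlog directly, a protocol's listen handler is expected to call solisten_proto_check() and solisten_proto() itself while holding the socket lock. A hedged sketch with hypothetical foo_* names (real protocols also take their own pcb locks here):

	static int
	foo_listen(struct socket *so, int backlog, struct thread *td)
	{
		int error;

		SOCK_LOCK(so);
		error = solisten_proto_check(so);
		if (error == 0)
			solisten_proto(so, backlog);	/* records qlimit, sets SO_ACCEPTCONN */
		SOCK_UNLOCK(so);
		return (error);
	}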
 
 /*
  * Attempt to free a socket.  This should really be sotryfree().
  *
- * We free the socket if the protocol is no longer interested in the socket,
- * there's no file descriptor reference, and the refcount is 0.  While the
- * calling macro sotryfree() tests the refcount, sofree() has to test it
- * again as it's possible to race with an accept()ing thread if the socket is
- * in an listen queue of a listen socket, as being in the listen queue
- * doesn't elevate the reference count.  sofree() acquires the accept mutex
- * early for this test in order to avoid that race.
+ * sofree() will succeed if:
+ *
+ * - There are no outstanding file descriptor references or related consumers
+ *   (so_count == 0).
+ *
+ * - The socket has been closed by user space, if ever open (SS_NOFDREF).
+ *
+ * - The protocol does not have an outstanding strong reference on the socket
+ *   (SS_PROTOREF).
+ *
+ * - The socket is not in a completed connection queue, where a process may
+ *   already have been notified that it is present.  If it were removed, the
+ *   user process might block in accept() despite select() saying the socket
+ *   was ready.
+ *
+ * Otherwise, it will quietly abort so that a future call to sofree(), when
+ * conditions are right, can succeed.
  */
 void
-sofree(so)
-	struct socket *so;
+sofree(struct socket *so)
 {
+	struct protosw *pr = so->so_proto;
 	struct socket *head;
 
 	ACCEPT_LOCK_ASSERT();
 	SOCK_LOCK_ASSERT(so);
 
-	if (so->so_pcb != NULL || (so->so_state & SS_NOFDREF) == 0 ||
-	    so->so_count != 0) {
+	if ((so->so_state & SS_NOFDREF) == 0 || so->so_count != 0 ||
+	    (so->so_state & SS_PROTOREF) || (so->so_qstate & SQ_COMP)) {
 		SOCK_UNLOCK(so);
 		ACCEPT_UNLOCK();
 		return;
@@ -374,22 +579,6 @@
 		KASSERT((so->so_qstate & SQ_COMP) == 0 ||
 		    (so->so_qstate & SQ_INCOMP) == 0,
 		    ("sofree: so->so_qstate is SQ_COMP and also SQ_INCOMP"));
-		/*
-		 * accept(2) is responsible draining the completed
-		 * connection queue and freeing those sockets, so
-		 * we just return here if this socket is currently
-		 * on the completed connection queue.  Otherwise,
-		 * accept(2) may hang after select(2) has indicating
-		 * that a listening socket was ready.  If it's an
-		 * incomplete connection, we remove it from the queue
-		 * and free it; otherwise, it won't be released until
-		 * the listening socket is closed.
-		 */
-		if ((so->so_qstate & SQ_COMP) != 0) {
-			SOCK_UNLOCK(so);
-			ACCEPT_UNLOCK();
-			return;
-		}
 		TAILQ_REMOVE(&head->so_incomp, so, so_list);
 		head->so_incqlen--;
 		so->so_qstate &= ~SQ_INCOMP;
@@ -399,45 +588,77 @@
 	    (so->so_qstate & SQ_INCOMP) == 0,
 	    ("sofree: so_head == NULL, but still SQ_COMP(%d) or SQ_INCOMP(%d)",
 	    so->so_qstate & SQ_COMP, so->so_qstate & SQ_INCOMP));
+	if (so->so_options & SO_ACCEPTCONN) {
+		KASSERT((TAILQ_EMPTY(&so->so_comp)), ("sofree: so_comp populated"));
+		KASSERT((TAILQ_EMPTY(&so->so_incomp)), ("sofree: so_incomp populated"));
+	}
 	SOCK_UNLOCK(so);
 	ACCEPT_UNLOCK();
-	SOCKBUF_LOCK(&so->so_snd);
-	so->so_snd.sb_flags |= SB_NOINTR;
-	(void)sblock(&so->so_snd, M_WAITOK);
+
+	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
+		(*pr->pr_domain->dom_dispose)(so->so_rcv.sb_mb);
+	if (pr->pr_usrreqs->pru_detach != NULL)
+		(*pr->pr_usrreqs->pru_detach)(so);
+
 	/*
-	 * socantsendmore_locked() drops the socket buffer mutex so that it
-	 * can safely perform wakeups.  Re-acquire the mutex before
-	 * continuing.
+	 * From this point on, we assume that no other references to this
+	 * socket exist anywhere else in the stack.  Therefore, no locks need
+	 * to be acquired or held.
+	 *
+	 * We used to do a lot of socket buffer and socket locking here, as
+	 * well as invoke sorflush() and perform wakeups.  The direct call to
+	 * dom_dispose() and sbrelease_internal() are an inlining of what was
+	 * necessary from sorflush().
+	 *
+	 * Notice that the socket buffer and kqueue state are torn down
+	 * before calling pru_detach.  This means that protocols should not
+	 * assume they can perform socket wakeups, etc., in their detach code.
 	 */
-	socantsendmore_locked(so);
-	SOCKBUF_LOCK(&so->so_snd);
-	sbunlock(&so->so_snd);
-	sbrelease_locked(&so->so_snd, so);
-	SOCKBUF_UNLOCK(&so->so_snd);
-	sorflush(so);
+	sbdestroy(&so->so_snd, so);
+	sbdestroy(&so->so_rcv, so);
 	knlist_destroy(&so->so_rcv.sb_sel.si_note);
 	knlist_destroy(&so->so_snd.sb_sel.si_note);
 	sodealloc(so);
 }
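For reference, callers normally reach sofree() indirectly through sorele() while holding both the accept and socket locks; a hedged sketch of the common pattern (essentially what soclose() does below):

	ACCEPT_LOCK();
	SOCK_LOCK(so);
	so->so_state |= SS_NOFDREF;
	sorele(so);	/* drops so_count; calls sofree() and releases the locks */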
 
 /*
- * Close a socket on last file table reference removal.
- * Initiate disconnect if connected.
- * Free socket when disconnect complete.
+ * Close a socket on last file table reference removal.  Initiate disconnect
+ * if connected.  Free socket when disconnect complete.
  *
- * This function will sorele() the socket.  Note that soclose() may be
- * called prior to the ref count reaching zero.  The actual socket
- * structure will not be freed until the ref count reaches zero.
+ * This function will sorele() the socket.  Note that soclose() may be called
+ * prior to the ref count reaching zero.  The actual socket structure will
+ * not be freed until the ref count reaches zero.
  */
 int
-soclose(so)
-	struct socket *so;
+soclose(struct socket *so)
 {
 	int error = 0;
 
 	KASSERT(!(so->so_state & SS_NOFDREF), ("soclose: SS_NOFDREF on enter"));
 
 	funsetown(&so->so_sigio);
+	if (so->so_state & SS_ISCONNECTED) {
+		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
+			error = sodisconnect(so);
+			if (error)
+				goto drop;
+		}
+		if (so->so_options & SO_LINGER) {
+			if ((so->so_state & SS_ISDISCONNECTING) &&
+			    (so->so_state & SS_NBIO))
+				goto drop;
+			while (so->so_state & SS_ISCONNECTED) {
+				error = tsleep(&so->so_timeo,
+				    PSOCK | PCATCH, "soclos", so->so_linger * hz);
+				if (error)
+					break;
+			}
+		}
+	}
+
+drop:
+	if (so->so_proto->pr_usrreqs->pru_close != NULL)
+		(*so->so_proto->pr_usrreqs->pru_close)(so);
 	if (so->so_options & SO_ACCEPTCONN) {
 		struct socket *sp;
 		ACCEPT_LOCK();
@@ -447,7 +668,7 @@
 			sp->so_qstate &= ~SQ_INCOMP;
 			sp->so_head = NULL;
 			ACCEPT_UNLOCK();
-			(void) soabort(sp);
+			soabort(sp);
 			ACCEPT_LOCK();
 		}
 		while ((sp = TAILQ_FIRST(&so->so_comp)) != NULL) {
@@ -456,38 +677,11 @@
 			sp->so_qstate &= ~SQ_COMP;
 			sp->so_head = NULL;
 			ACCEPT_UNLOCK();
-			(void) soabort(sp);
+			soabort(sp);
 			ACCEPT_LOCK();
 		}
 		ACCEPT_UNLOCK();
 	}
-	if (so->so_pcb == NULL)
-		goto discard;
-	if (so->so_state & SS_ISCONNECTED) {
-		if ((so->so_state & SS_ISDISCONNECTING) == 0) {
-			error = sodisconnect(so);
-			if (error)
-				goto drop;
-		}
-		if (so->so_options & SO_LINGER) {
-			if ((so->so_state & SS_ISDISCONNECTING) &&
-			    (so->so_state & SS_NBIO))
-				goto drop;
-			while (so->so_state & SS_ISCONNECTED) {
-				error = tsleep(&so->so_timeo,
-				    PSOCK | PCATCH, "soclos", so->so_linger * hz);
-				if (error)
-					break;
-			}
-		}
-	}
-drop:
-	if (so->so_pcb != NULL) {
-		int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
-		if (error == 0)
-			error = error2;
-	}
-discard:
 	ACCEPT_LOCK();
 	SOCK_LOCK(so);
 	KASSERT((so->so_state & SS_NOFDREF) == 0, ("soclose: NOFDREF"));
@@ -497,31 +691,44 @@
 }
 
 /*
- * soabort() must not be called with any socket locks held, as it calls
- * into the protocol, which will call back into the socket code causing
- * it to acquire additional socket locks that may cause recursion or lock
- * order reversals.
+ * soabort() is used to abruptly tear down a connection, such as when a
+ * resource limit is reached (listen queue depth exceeded), or if a listen
+ * socket is closed while there are sockets waiting to be accepted.
+ *
+ * This interface is tricky, because it is called on an unreferenced socket,
+ * and must be called only by a thread that has actually removed the socket
+ * from the listen queue it was on, or races with other threads are risked.
+ *
+ * This interface will call into the protocol code, so must not be called
+ * with any socket locks held.  Protocols do call it while holding their own
+ * recursible protocol mutexes, but this is something that should be subject
+ * to review in the future.
  */
-int
-soabort(so)
-	struct socket *so;
+void
+soabort(struct socket *so)
 {
-	int error;
 
-	error = (*so->so_proto->pr_usrreqs->pru_abort)(so);
-	if (error) {
-		ACCEPT_LOCK();
-		SOCK_LOCK(so);
-		sotryfree(so);	/* note: does not decrement the ref count */
-		return error;
-	}
-	return (0);
+	/*
+	 * Insofar as possible, assert that no references to this
+	 * socket are held.  This is not quite the same as asserting that the
+	 * current thread is responsible for arranging for no references, but
+	 * is as close as we can get for now.
+	 */
+	KASSERT(so->so_count == 0, ("soabort: so_count"));
+	KASSERT((so->so_state & SS_PROTOREF) == 0, ("soabort: SS_PROTOREF"));
+	KASSERT(so->so_state & SS_NOFDREF, ("soabort: !SS_NOFDREF"));
+	KASSERT((so->so_state & SQ_COMP) == 0, ("soabort: SQ_COMP"));
+	KASSERT((so->so_state & SQ_INCOMP) == 0, ("soabort: SQ_INCOMP"));
+
+	if (so->so_proto->pr_usrreqs->pru_abort != NULL)
+		(*so->so_proto->pr_usrreqs->pru_abort)(so);
+	ACCEPT_LOCK();
+	SOCK_LOCK(so);
+	sofree(so);
 }
 
 int
-soaccept(so, nam)
-	struct socket *so;
-	struct sockaddr **nam;
+soaccept(struct socket *so, struct sockaddr **nam)
 {
 	int error;
 
@@ -534,10 +741,7 @@
 }
 
 int
-soconnect(so, nam, td)
-	struct socket *so;
-	struct sockaddr *nam;
-	struct thread *td;
+soconnect(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
 	int error;
 
@@ -545,9 +749,8 @@
 		return (EOPNOTSUPP);
 	/*
 	 * If protocol is connection-based, can only connect once.
-	 * Otherwise, if connected, try to disconnect first.
-	 * This allows user to disconnect by connecting to, e.g.,
-	 * a null address.
+	 * Otherwise, if connected, try to disconnect first.  This allows
+	 * user to disconnect by connecting to, e.g., a null address.
 	 */
 	if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) &&
 	    ((so->so_proto->pr_flags & PR_CONNREQUIRED) ||
@@ -555,8 +758,8 @@
 		error = EISCONN;
 	} else {
 		/*
-		 * Prevent accumulated error from previous connection
-		 * from biting us.
+		 * Prevent accumulated error from previous connection from
+		 * biting us.
 		 */
 		so->so_error = 0;
 		error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, td);
@@ -566,17 +769,14 @@
 }
 
 int
-soconnect2(so1, so2)
-	struct socket *so1;
-	struct socket *so2;
+soconnect2(struct socket *so1, struct socket *so2)
 {
 
 	return ((*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2));
 }
 
 int
-sodisconnect(so)
-	struct socket *so;
+sodisconnect(struct socket *so)
 {
 	int error;
 
@@ -588,25 +788,6 @@
 	return (error);
 }
 
-#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK)
-/*
- * Send on a socket.
- * If send must go all at once and message is larger than
- * send buffering, then hard error.
- * Lock against other senders.
- * If must go all at once and not enough room now, then
- * inform user that this would block and do nothing.
- * Otherwise, if nonblocking, send as much as possible.
- * The data to be sent is described by "uio" if nonzero,
- * otherwise by the mbuf chain "top" (which must be null
- * if uio is not).  Data provided in mbuf chain must be small
- * enough to send all at once.
- *
- * Returns nonzero on error, timeout or signal; callers
- * must check for short counts if EINTR/ERESTART are returned.
- * Data and control buffers are freed on return.
- */
-
 #ifdef ZERO_COPY_SOCKETS
 struct so_zerocopy_stats{
 	int size_ok;
@@ -620,37 +801,315 @@
 #include <vm/vm.h>
 #include <vm/vm_page.h>
 #include <vm/vm_object.h>
+
+/*
+ * sosend_copyin() is only used if zero copy sockets are enabled.  Otherwise
+ * sosend_dgram() and sosend_generic() use m_uiotombuf().
+ * 
+ * sosend_copyin() accepts a uio and prepares an mbuf chain holding part or
+ * all of the data referenced by the uio.  If desired, it uses zero-copy.
+ * *space will be updated to reflect data copied in.
+ *
+ * NB: If atomic I/O is requested, the caller must already have checked that
+ * space can hold resid bytes.
+ *
+ * NB: In the event of an error, the caller may need to free the partial
+ * chain pointed to by *mpp.  The contents of both *uio and *space may be
+ * modified even in the case of an error.
+ */
+static int
+sosend_copyin(struct uio *uio, struct mbuf **retmp, int atomic, long *space,
+    int flags)
+{
+	struct mbuf *m, **mp, *top;
+	long len, resid;
+	int error;
+#ifdef ZERO_COPY_SOCKETS
+	int cow_send;
+#endif
+
+	*retmp = top = NULL;
+	mp = ⊤
+	len = 0;
+	resid = uio->uio_resid;
+	error = 0;
+	do {
+#ifdef ZERO_COPY_SOCKETS
+		cow_send = 0;
+#endif /* ZERO_COPY_SOCKETS */
+		if (resid >= MINCLSIZE) {
+#ifdef ZERO_COPY_SOCKETS
+			if (top == NULL) {
+				m = m_gethdr(M_WAITOK, MT_DATA);
+				m->m_pkthdr.len = 0;
+				m->m_pkthdr.rcvif = NULL;
+			} else
+				m = m_get(M_WAITOK, MT_DATA);
+			if (so_zero_copy_send &&
+			    resid>=PAGE_SIZE &&
+			    *space>=PAGE_SIZE &&
+			    uio->uio_iov->iov_len>=PAGE_SIZE) {
+				so_zerocp_stats.size_ok++;
+				so_zerocp_stats.align_ok++;
+				cow_send = socow_setup(m, uio);
+				len = cow_send;
+			}
+			if (!cow_send) {
+				m_clget(m, M_WAITOK);
+				len = min(min(MCLBYTES, resid), *space);
+			}
+#else /* ZERO_COPY_SOCKETS */
+			if (top == NULL) {
+				m = m_getcl(M_TRYWAIT, MT_DATA, M_PKTHDR);
+				m->m_pkthdr.len = 0;
+				m->m_pkthdr.rcvif = NULL;
+			} else
+				m = m_getcl(M_TRYWAIT, MT_DATA, 0);
+			len = min(min(MCLBYTES, resid), *space);
+#endif /* ZERO_COPY_SOCKETS */
+		} else {
+			if (top == NULL) {
+				m = m_gethdr(M_TRYWAIT, MT_DATA);
+				m->m_pkthdr.len = 0;
+				m->m_pkthdr.rcvif = NULL;
+
+				len = min(min(MHLEN, resid), *space);
+				/*
+				 * For datagram protocols, leave room
+				 * for protocol headers in first mbuf.
+				 */
+				if (atomic && m && len < MHLEN)
+					MH_ALIGN(m, len);
+			} else {
+				m = m_get(M_TRYWAIT, MT_DATA);
+				len = min(min(MLEN, resid), *space);
+			}
+		}
+		if (m == NULL) {
+			error = ENOBUFS;
+			goto out;
+		}
+
+		*space -= len;
+#ifdef ZERO_COPY_SOCKETS
+		if (cow_send)
+			error = 0;
+		else
+#endif /* ZERO_COPY_SOCKETS */
+		error = uiomove(mtod(m, void *), (int)len, uio);
+		resid = uio->uio_resid;
+		m->m_len = len;
+		*mp = m;
+		top->m_pkthdr.len += len;
+		if (error)
+			goto out;
+		mp = &m->m_next;
+		if (resid <= 0) {
+			if (flags & MSG_EOR)
+				top->m_flags |= M_EOR;
+			break;
+		}
+	} while (*space > 0 && atomic);
+out:
+	*retmp = top;
+	return (error);
+}
 #endif /*ZERO_COPY_SOCKETS*/
 
+#define	SBLOCKWAIT(f)	(((f) & MSG_DONTWAIT) ? 0 : SBL_WAIT)
+
 int
-sosend(so, addr, uio, top, control, flags, td)
-	struct socket *so;
-	struct sockaddr *addr;
-	struct uio *uio;
-	struct mbuf *top;
-	struct mbuf *control;
-	int flags;
-	struct thread *td;
+sosend_dgram(struct socket *so, struct sockaddr *addr, struct uio *uio,
+    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
 {
-	struct mbuf **mp;
-	struct mbuf *m;
-	long space, len = 0, resid;
+	long space, resid;
 	int clen = 0, error, dontroute;
+#ifdef ZERO_COPY_SOCKETS
 	int atomic = sosendallatonce(so) || top;
+#endif
+
+	KASSERT(so->so_type == SOCK_DGRAM, ("sosend_dgram: !SOCK_DGRAM"));
+	KASSERT(so->so_proto->pr_flags & PR_ATOMIC,
+	    ("sosend_dgram: !PR_ATOMIC"));
+
+	if (uio != NULL)
+		resid = uio->uio_resid;
+	else
+		resid = top->m_pkthdr.len;
+	/*
+	 * In theory resid should be unsigned.  However, space must be
+	 * signed, as it might be less than 0 if we over-committed, and we
+	 * must use a signed comparison of space and resid.  On the other
+	 * hand, a negative resid causes us to loop sending 0-length
+	 * segments to the protocol.
+	 *
+	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
+	 * type sockets since that's an error.
+	 */
+	if (resid < 0) {
+		error = EINVAL;
+		goto out;
+	}
+
+	dontroute =
+	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0;
+	if (td != NULL)
+		td->td_ru.ru_msgsnd++;
+	if (control != NULL)
+		clen = control->m_len;
+
+	SOCKBUF_LOCK(&so->so_snd);
+	if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
+		SOCKBUF_UNLOCK(&so->so_snd);
+		error = EPIPE;
+		goto out;
+	}
+	if (so->so_error) {
+		error = so->so_error;
+		so->so_error = 0;
+		SOCKBUF_UNLOCK(&so->so_snd);
+		goto out;
+	}
+	if ((so->so_state & SS_ISCONNECTED) == 0) {
+		/*
+		 * `sendto' and `sendmsg' are allowed on a connection-based
+		 * socket if it supports implied connect.  Return ENOTCONN if
+		 * not connected and no address is supplied.
+		 */
+		if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
+		    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
+			if ((so->so_state & SS_ISCONFIRMING) == 0 &&
+			    !(resid == 0 && clen != 0)) {
+				SOCKBUF_UNLOCK(&so->so_snd);
+				error = ENOTCONN;
+				goto out;
+			}
+		} else if (addr == NULL) {
+			if (so->so_proto->pr_flags & PR_CONNREQUIRED)
+				error = ENOTCONN;
+			else
+				error = EDESTADDRREQ;
+			SOCKBUF_UNLOCK(&so->so_snd);
+			goto out;
+		}
+	}
+
+	/*
+	 * Do we need MSG_OOB support in SOCK_DGRAM?  Signs here may be a
+	 * problem and need fixing.
+	 */
+	space = sbspace(&so->so_snd);
+	if (flags & MSG_OOB)
+		space += 1024;
+	space -= clen;
+	SOCKBUF_UNLOCK(&so->so_snd);
+	if (resid > space) {
+		error = EMSGSIZE;
+		goto out;
+	}
+	if (uio == NULL) {
+		resid = 0;
+		if (flags & MSG_EOR)
+			top->m_flags |= M_EOR;
+	} else {
 #ifdef ZERO_COPY_SOCKETS
-	int cow_send;
-#endif /* ZERO_COPY_SOCKETS */
+		error = sosend_copyin(uio, &top, atomic, &space, flags);
+		if (error)
+			goto out;
+#else
+		/*
+		 * Copy the data from userland into a mbuf chain.
+		 * If no data is to be copied in, a single empty mbuf
+		 * is returned.
+		 */
+		top = m_uiotombuf(uio, M_WAITOK, space, max_hdr,
+		    (M_PKTHDR | ((flags & MSG_EOR) ? M_EOR : 0)));
+		if (top == NULL) {
+			error = EFAULT;	/* only possible error */
+			goto out;
+		}
+		space -= resid - uio->uio_resid;
+#endif
+		resid = uio->uio_resid;
+	}
+	KASSERT(resid == 0, ("sosend_dgram: resid != 0"));
+	/*
+	 * XXXRW: Frobbing SO_DONTROUTE here is even worse without sblock
+	 * than with.
+	 */
+	if (dontroute) {
+		SOCK_LOCK(so);
+		so->so_options |= SO_DONTROUTE;
+		SOCK_UNLOCK(so);
+	}
+	/*
+	 * XXX all the SBS_CANTSENDMORE checks previously done could be out
+	 * of date.  We could have recieved a reset packet in an interrupt or
+	 * maybe we slept while doing page faults in uiomove() etc.  We could
+	 * probably recheck again inside the locking protection here, but
+	 * there are probably other places that this also happens.  We must
+	 * rethink this.
+	 */
+	error = (*so->so_proto->pr_usrreqs->pru_send)(so,
+	    (flags & MSG_OOB) ? PRUS_OOB :
+	/*
+	 * If the user set MSG_EOF, the protocol understands this flag, and
+	 * there is nothing left to send, then use PRU_SEND_EOF instead of
+	 * PRU_SEND.
+	 */
+	    ((flags & MSG_EOF) &&
+	     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
+	     (resid <= 0)) ?
+		PRUS_EOF :
+		/* If there is more to send set PRUS_MORETOCOME */
+		(resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
+		top, addr, control, td);
+	if (dontroute) {
+		SOCK_LOCK(so);
+		so->so_options &= ~SO_DONTROUTE;
+		SOCK_UNLOCK(so);
+	}
+	clen = 0;
+	control = NULL;
+	top = NULL;
+out:
+	if (top != NULL)
+		m_freem(top);
+	if (control != NULL)
+		m_freem(control);
+	return (error);
+}
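sosend_dgram() is not called directly by consumers; a datagram protocol that satisfies the SOCK_DGRAM/PR_ATOMIC assertions above would point its pru_sosend hook at it. Hedged sketch with a hypothetical protocol name:

	static struct pr_usrreqs foodg_usrreqs = {
		/* ... other entry points ... */
		.pru_sosend = sosend_dgram,
	};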
+
+/*
+ * Send on a socket.  If send must go all at once and message is larger than
+ * send buffering, then hard error.  Lock against other senders.  If must go
+ * all at once and not enough room now, then inform user that this would
+ * block and do nothing.  Otherwise, if nonblocking, send as much as
+ * possible.  The data to be sent is described by "uio" if nonzero, otherwise
+ * by the mbuf chain "top" (which must be null if uio is not).  Data provided
+ * in mbuf chain must be small enough to send all at once.
+ *
+ * Returns nonzero on error, timeout or signal; callers must check for short
+ * counts if EINTR/ERESTART are returned.  Data and control buffers are freed
+ * on return.
+ */
+int
+sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio,
+    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
+{
+	long space, resid;
+	int clen = 0, error, dontroute;
+	int atomic = sosendallatonce(so) || top;
 
 	if (uio != NULL)
 		resid = uio->uio_resid;
 	else
 		resid = top->m_pkthdr.len;
 	/*
-	 * In theory resid should be unsigned.
-	 * However, space must be signed, as it might be less than 0
-	 * if we over-committed, and we must use a signed comparison
-	 * of space and resid.  On the other hand, a negative resid
-	 * causes us to loop sending 0-length segments to the protocol.
+	 * In theory resid should be unsigned.  However, space must be
+	 * signed, as it might be less than 0 if we over-committed, and we
+	 * must use a signed comparison of space and resid.  On the other
+	 * hand, a negative resid causes us to loop sending 0-length
+	 * segments to the protocol.
 	 *
 	 * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM
 	 * type sockets since that's an error.
@@ -664,24 +1123,26 @@
 	    (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 &&
 	    (so->so_proto->pr_flags & PR_ATOMIC);
 	if (td != NULL)
-		td->td_proc->p_stats->p_ru.ru_msgsnd++;
+		td->td_ru.ru_msgsnd++;
 	if (control != NULL)
 		clen = control->m_len;
-#define	snderr(errno)	{ error = (errno); goto release; }
 
-	SOCKBUF_LOCK(&so->so_snd);
-restart:
-	SOCKBUF_LOCK_ASSERT(&so->so_snd);
 	error = sblock(&so->so_snd, SBLOCKWAIT(flags));
 	if (error)
-		goto out_locked;
+		goto out;
+
+restart:
 	do {
-		SOCKBUF_LOCK_ASSERT(&so->so_snd);
-		if (so->so_snd.sb_state & SBS_CANTSENDMORE)
-			snderr(EPIPE);
+		SOCKBUF_LOCK(&so->so_snd);
+		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
+			SOCKBUF_UNLOCK(&so->so_snd);
+			error = EPIPE;
+			goto release;
+		}
 		if (so->so_error) {
 			error = so->so_error;
 			so->so_error = 0;
+			SOCKBUF_UNLOCK(&so->so_snd);
 			goto release;
 		}
 		if ((so->so_state & SS_ISCONNECTED) == 0) {
@@ -694,186 +1155,117 @@
 			if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
 			    (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
 				if ((so->so_state & SS_ISCONFIRMING) == 0 &&
-				    !(resid == 0 && clen != 0))
-					snderr(ENOTCONN);
-			} else if (addr == NULL)
-			    snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
-				   ENOTCONN : EDESTADDRREQ);
+				    !(resid == 0 && clen != 0)) {
+					SOCKBUF_UNLOCK(&so->so_snd);
+					error = ENOTCONN;
+					goto release;
+				}
+			} else if (addr == NULL) {
+				SOCKBUF_UNLOCK(&so->so_snd);
+				if (so->so_proto->pr_flags & PR_CONNREQUIRED)
+					error = ENOTCONN;
+				else
+					error = EDESTADDRREQ;
+				goto release;
+			}
 		}
 		space = sbspace(&so->so_snd);
 		if (flags & MSG_OOB)
 			space += 1024;
 		if ((atomic && resid > so->so_snd.sb_hiwat) ||
-		    clen > so->so_snd.sb_hiwat)
-			snderr(EMSGSIZE);
+		    clen > so->so_snd.sb_hiwat) {
+			SOCKBUF_UNLOCK(&so->so_snd);
+			error = EMSGSIZE;
+			goto release;
+		}
 		if (space < resid + clen &&
 		    (atomic || space < so->so_snd.sb_lowat || space < clen)) {
-			if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO))
-				snderr(EWOULDBLOCK);
-			sbunlock(&so->so_snd);
+			if ((so->so_state & SS_NBIO) || (flags & MSG_NBIO)) {
+				SOCKBUF_UNLOCK(&so->so_snd);
+				error = EWOULDBLOCK;
+				goto release;
+			}
 			error = sbwait(&so->so_snd);
+			SOCKBUF_UNLOCK(&so->so_snd);
 			if (error)
-				goto out_locked;
+				goto release;
 			goto restart;
 		}
 		SOCKBUF_UNLOCK(&so->so_snd);
-		mp = ⊤
 		space -= clen;
 		do {
-		    if (uio == NULL) {
-			/*
-			 * Data is prepackaged in "top".
-			 */
-			resid = 0;
-			if (flags & MSG_EOR)
-				top->m_flags |= M_EOR;
-		    } else do {
-#ifdef ZERO_COPY_SOCKETS
-			cow_send = 0;
-#endif /* ZERO_COPY_SOCKETS */
-			if (resid >= MINCLSIZE) {
-#ifdef ZERO_COPY_SOCKETS
-				if (top == NULL) {
-					MGETHDR(m, M_TRYWAIT, MT_DATA);
-					if (m == NULL) {
-						error = ENOBUFS;
-						SOCKBUF_LOCK(&so->so_snd);
-						goto release;
-					}
-					m->m_pkthdr.len = 0;
-					m->m_pkthdr.rcvif = NULL; 
-				} else {
-					MGET(m, M_TRYWAIT, MT_DATA);
-					if (m == NULL) {
-						error = ENOBUFS;
-						SOCKBUF_LOCK(&so->so_snd);
-						goto release;
-					}
-				}
-				if (so_zero_copy_send &&
-				    resid>=PAGE_SIZE &&
-				    space>=PAGE_SIZE &&
-				    uio->uio_iov->iov_len>=PAGE_SIZE) {
-					so_zerocp_stats.size_ok++;
-					so_zerocp_stats.align_ok++;
-					cow_send = socow_setup(m, uio);
-					len = cow_send;
-				}
-				if (!cow_send) {
-					MCLGET(m, M_TRYWAIT);
-					if ((m->m_flags & M_EXT) == 0) {
-						m_free(m);
-						m = NULL;
-					} else {
-						len = min(min(MCLBYTES, resid), space);
-					}
-				}
-#else /* ZERO_COPY_SOCKETS */
-				if (top == NULL) {
-					m = m_getcl(M_TRYWAIT, MT_DATA, M_PKTHDR);
-					m->m_pkthdr.len = 0;
-					m->m_pkthdr.rcvif = NULL;
-				} else
-					m = m_getcl(M_TRYWAIT, MT_DATA, 0);
-				len = min(min(MCLBYTES, resid), space);
-#endif /* ZERO_COPY_SOCKETS */
+			if (uio == NULL) {
+				resid = 0;
+				if (flags & MSG_EOR)
+					top->m_flags |= M_EOR;
 			} else {
+#ifdef ZERO_COPY_SOCKETS
+				error = sosend_copyin(uio, &top, atomic,
+				    &space, flags);
+				if (error != 0)
+					goto release;
+#else
+				/*
+				 * Copy the data from userland into a mbuf
+				 * chain.  If no data is to be copied in,
+				 * a single empty mbuf is returned.
+				 */
+				top = m_uiotombuf(uio, M_WAITOK, space,
+				    (atomic ? max_hdr : 0),
+				    (atomic ? M_PKTHDR : 0) |
+				    ((flags & MSG_EOR) ? M_EOR : 0));
 				if (top == NULL) {
-					m = m_gethdr(M_TRYWAIT, MT_DATA);
-					m->m_pkthdr.len = 0;
-					m->m_pkthdr.rcvif = NULL;
-
-					len = min(min(MHLEN, resid), space);
-					/*
-					 * For datagram protocols, leave room
-					 * for protocol headers in first mbuf.
-					 */
-					if (atomic && m && len < MHLEN)
-						MH_ALIGN(m, len);
-				} else {
-					m = m_get(M_TRYWAIT, MT_DATA);
-					len = min(min(MLEN, resid), space);
+					error = EFAULT; /* only possible error */
+					goto release;
 				}
+				space -= resid - uio->uio_resid;
+#endif
+				resid = uio->uio_resid;
 			}
-			if (m == NULL) {
-				error = ENOBUFS;
-				SOCKBUF_LOCK(&so->so_snd);
-				goto release;
-			}
-
-			space -= len;
-#ifdef ZERO_COPY_SOCKETS
-			if (cow_send)
-				error = 0;
-			else
-#endif /* ZERO_COPY_SOCKETS */
-			error = uiomove(mtod(m, void *), (int)len, uio);
-			resid = uio->uio_resid;
-			m->m_len = len;
-			*mp = m;
-			top->m_pkthdr.len += len;
-			if (error) {
-				SOCKBUF_LOCK(&so->so_snd);
-				goto release;
-			}
-			mp = &m->m_next;
-			if (resid <= 0) {
-				if (flags & MSG_EOR)
-					top->m_flags |= M_EOR;
-				break;
+			if (dontroute) {
+				SOCK_LOCK(so);
+				so->so_options |= SO_DONTROUTE;
+				SOCK_UNLOCK(so);
 			}
-		    } while (space > 0 && atomic);
-		    if (dontroute) {
-			    SOCK_LOCK(so);
-			    so->so_options |= SO_DONTROUTE;
-			    SOCK_UNLOCK(so);
-		    }
-		    /*
-		     * XXX all the SBS_CANTSENDMORE checks previously
-		     * done could be out of date.  We could have recieved
-		     * a reset packet in an interrupt or maybe we slept
-		     * while doing page faults in uiomove() etc. We could
-		     * probably recheck again inside the locking protection
-		     * here, but there are probably other places that this
-		     * also happens.  We must rethink this.
-		     */
-		    error = (*so->so_proto->pr_usrreqs->pru_send)(so,
-			(flags & MSG_OOB) ? PRUS_OOB :
 			/*
-			 * If the user set MSG_EOF, the protocol
-			 * understands this flag and nothing left to
-			 * send then use PRU_SEND_EOF instead of PRU_SEND.
+			 * XXX all the SBS_CANTSENDMORE checks previously
+			 * done could be out of date.  We could have recieved
+			 * a reset packet in an interrupt or maybe we slept
+			 * while doing page faults in uiomove() etc.  We
+			 * could probably recheck again inside the locking
+			 * protection here, but there are probably other
+			 * places that this also happens.  We must rethink
+			 * this.
+			 */
+			error = (*so->so_proto->pr_usrreqs->pru_send)(so,
+			    (flags & MSG_OOB) ? PRUS_OOB :
+			/*
+			 * If the user set MSG_EOF, the protocol understands
+			 * this flag, and there is nothing left to send, then
+			 * use PRU_SEND_EOF instead of PRU_SEND.
 			 */
-			((flags & MSG_EOF) &&
-			 (so->so_proto->pr_flags & PR_IMPLOPCL) &&
-			 (resid <= 0)) ?
+			    ((flags & MSG_EOF) &&
+			     (so->so_proto->pr_flags & PR_IMPLOPCL) &&
+			     (resid <= 0)) ?
 				PRUS_EOF :
-			/* If there is more to send set PRUS_MORETOCOME */
-			(resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
-			top, addr, control, td);
-		    if (dontroute) {
-			    SOCK_LOCK(so);
-			    so->so_options &= ~SO_DONTROUTE;
-			    SOCK_UNLOCK(so);
-		    }
-		    clen = 0;
-		    control = NULL;
-		    top = NULL;
-		    mp = ⊤
-		    if (error) {
-			SOCKBUF_LOCK(&so->so_snd);
-			goto release;
-		    }
+			/* If there is more to send set PRUS_MORETOCOME. */
+			    (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0,
+			    top, addr, control, td);
+			if (dontroute) {
+				SOCK_LOCK(so);
+				so->so_options &= ~SO_DONTROUTE;
+				SOCK_UNLOCK(so);
+			}
+			clen = 0;
+			control = NULL;
+			top = NULL;
+			if (error)
+				goto release;
 		} while (resid && space > 0);
-		SOCKBUF_LOCK(&so->so_snd);
 	} while (resid);
 
 release:
-	SOCKBUF_LOCK_ASSERT(&so->so_snd);
 	sbunlock(&so->so_snd);
-out_locked:
-	SOCKBUF_LOCK_ASSERT(&so->so_snd);
-	SOCKBUF_UNLOCK(&so->so_snd);
 out:
 	if (top != NULL)
 		m_freem(top);
@@ -882,6 +1274,19 @@
 	return (error);
 }
 
+int
+sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
+    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
+{
+
+	/* XXXRW: Temporary debugging. */
+	KASSERT(so->so_proto->pr_usrreqs->pru_sosend != sosend,
+	    ("sosend: protocol calls sosend"));
+
+	return (so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top,
+	    control, flags, td));
+}
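A hedged sketch of an in-kernel caller handing a buffer to sosend() via a uio, as kernel consumers typically do; buf, len, so and td are assumed to be provided by the caller:

	struct uio auio;
	struct iovec aiov;
	int error;

	aiov.iov_base = buf;
	aiov.iov_len = len;
	auio.uio_iov = &aiov;
	auio.uio_iovcnt = 1;
	auio.uio_offset = 0;
	auio.uio_resid = len;
	auio.uio_segflg = UIO_SYSSPACE;	/* data lives in kernel memory */
	auio.uio_rw = UIO_WRITE;
	auio.uio_td = td;
	error = sosend(so, NULL, &auio, NULL, NULL, 0, td);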
+
 /*
  * The part of soreceive() that implements reading non-inline out-of-band
  * data from a socket.  For more complete comments, see soreceive(), from
@@ -891,10 +1296,7 @@
  * unable to return an mbuf chain to the caller.
  */
 static int
-soreceive_rcvoob(so, uio, flags)
-	struct socket *so;
-	struct uio *uio;
-	int flags;
+soreceive_rcvoob(struct socket *so, struct uio *uio, int flags)
 {
 	struct protosw *pr = so->so_proto;
 	struct mbuf *m;
@@ -971,29 +1373,24 @@
 
 
 /*
- * Implement receive operations on a socket.
- * We depend on the way that records are added to the sockbuf
- * by sbappend*.  In particular, each record (mbufs linked through m_next)
- * must begin with an address if the protocol so specifies,
- * followed by an optional mbuf or mbufs containing ancillary data,
- * and then zero or more mbufs of data.
- * In order to avoid blocking network interrupts for the entire time here,
- * we splx() while doing the actual copy to user space.
- * Although the sockbuf is locked, new data may still be appended,
- * and thus we must maintain consistency of the sockbuf during that time.
- *
- * The caller may receive the data as a single mbuf chain by supplying
- * an mbuf **mp0 for use in returning the chain.  The uio is then used
- * only for the count in uio_resid.
+ * Implement receive operations on a socket.  We depend on the way that
+ * records are added to the sockbuf by sbappend.  In particular, each record
+ * (mbufs linked through m_next) must begin with an address if the protocol
+ * so specifies, followed by an optional mbuf or mbufs containing ancillary
+ * data, and then zero or more mbufs of data.  In order to allow parallelism
+ * between network receive and copying to user space, as well as avoid
+ * sleeping with a mutex held, we release the socket buffer mutex during the
+ * user space copy.  Although the sockbuf is locked, new data may still be
+ * appended, and thus we must maintain consistency of the sockbuf during that
+ * time.
+ *
+ * The caller may receive the data as a single mbuf chain by supplying an
+ * mbuf **mp0 for use in returning the chain.  The uio is then used only for
+ * the count in uio_resid.
  */
 int
-soreceive(so, psa, uio, mp0, controlp, flagsp)
-	struct socket *so;
-	struct sockaddr **psa;
-	struct uio *uio;
-	struct mbuf **mp0;
-	struct mbuf **controlp;
-	int *flagsp;
+soreceive_generic(struct socket *so, struct sockaddr **psa, struct uio *uio,
+    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
 {
 	struct mbuf *m, **mp;
 	int flags, len, error, offset;
@@ -1019,24 +1416,23 @@
 	    && uio->uio_resid)
 		(*pr->pr_usrreqs->pru_rcvd)(so, 0);
 
-	SOCKBUF_LOCK(&so->so_rcv);
-restart:
-	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 	error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
 	if (error)
-		goto out;
+		return (error);
 
+restart:
+	SOCKBUF_LOCK(&so->so_rcv);
 	m = so->so_rcv.sb_mb;
 	/*
-	 * If we have less data than requested, block awaiting more
-	 * (subject to any timeout) if:
+	 * If we have less data than requested, block awaiting more (subject
+	 * to any timeout) if:
 	 *   1. the current count is less than the low water mark, or
 	 *   2. MSG_WAITALL is set, and it is possible to do the entire
 	 *	receive operation at once if we block (resid <= hiwat).
 	 *   3. MSG_DONTWAIT is not set
 	 * If MSG_WAITALL is set but resid is larger than the receive buffer,
-	 * we have to do the receive in sections, and thus risk returning
-	 * a short count if a timeout or signal occurs after we start.
+	 * we have to do the receive in sections, and thus risk returning a
+	 * short count if a timeout or signal occurs after we start.
 	 */
 	if (m == NULL || (((flags & MSG_DONTWAIT) == 0 &&
 	    so->so_rcv.sb_cc < uio->uio_resid) &&
@@ -1052,14 +1448,16 @@
 			error = so->so_error;
 			if ((flags & MSG_PEEK) == 0)
 				so->so_error = 0;
+			SOCKBUF_UNLOCK(&so->so_rcv);
 			goto release;
 		}
 		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 		if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
-			if (m)
-				goto dontblock;
-			else
+			if (m == NULL) {
+				SOCKBUF_UNLOCK(&so->so_rcv);
 				goto release;
+			} else
+				goto dontblock;
 		}
 		for (; m != NULL; m = m->m_next)
 			if (m->m_type == MT_OOBDATA  || (m->m_flags & M_EOR)) {
@@ -1068,22 +1466,26 @@
 			}
 		if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 &&
 		    (so->so_proto->pr_flags & PR_CONNREQUIRED)) {
+			SOCKBUF_UNLOCK(&so->so_rcv);
 			error = ENOTCONN;
 			goto release;
 		}
-		if (uio->uio_resid == 0)
+		if (uio->uio_resid == 0) {
+			SOCKBUF_UNLOCK(&so->so_rcv);
 			goto release;
+		}
 		if ((so->so_state & SS_NBIO) ||
 		    (flags & (MSG_DONTWAIT|MSG_NBIO))) {
+			SOCKBUF_UNLOCK(&so->so_rcv);
 			error = EWOULDBLOCK;
 			goto release;
 		}
 		SBLASTRECORDCHK(&so->so_rcv);
 		SBLASTMBUFCHK(&so->so_rcv);
-		sbunlock(&so->so_rcv);
 		error = sbwait(&so->so_rcv);
+		SOCKBUF_UNLOCK(&so->so_rcv);
 		if (error)
-			goto out;
+			goto release;
 		goto restart;
 	}
 dontblock:
@@ -1104,7 +1506,7 @@
 	 */
 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 	if (uio->uio_td)
-		uio->uio_td->td_proc->p_stats->p_ru.ru_msgrcv++;
+		uio->uio_td->td_ru.ru_msgrcv++;
 	KASSERT(m == so->so_rcv.sb_mb, ("soreceive: m != so->so_rcv.sb_mb"));
 	SBLASTRECORDCHK(&so->so_rcv);
 	SBLASTMBUFCHK(&so->so_rcv);
@@ -1173,7 +1575,10 @@
 			}
 			cm = cmn;
 		}
-		nextrecord = so->so_rcv.sb_mb->m_nextpkt;
+		if (m != NULL)
+			nextrecord = so->so_rcv.sb_mb->m_nextpkt;
+		else
+			nextrecord = so->so_rcv.sb_mb;
 		orig_resid = 0;
 	}
 	if (m != NULL) {
@@ -1226,7 +1631,7 @@
 		} else if (type == MT_OOBDATA)
 			break;
 		else
-		    KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER,
+		    KASSERT(m->m_type == MT_DATA,
 			("m->m_type == %d", m->m_type));
 		so->so_rcv.sb_state &= ~SBS_RCVATMARK;
 		len = uio->uio_resid;
@@ -1235,12 +1640,11 @@
 		if (len > m->m_len - moff)
 			len = m->m_len - moff;
 		/*
-		 * If mp is set, just pass back the mbufs.
-		 * Otherwise copy them out via the uio, then free.
-		 * Sockbuf must be consistent here (points to current mbuf,
-		 * it points to next record) when we drop priority;
-		 * we must note any additions to the sockbuf when we
-		 * block interrupts again.
+		 * If mp is set, just pass back the mbufs.  Otherwise copy
+		 * them out via the uio, then free.  Sockbuf must be
+		 * consistent here (points to current mbuf, it points to next
+		 * record) when we drop priority; we must note any additions
+		 * to the sockbuf when we block interrupts again.
 		 */
 		if (mp == NULL) {
 			SOCKBUF_LOCK_ASSERT(&so->so_rcv);
@@ -1264,8 +1668,21 @@
 #endif /* ZERO_COPY_SOCKETS */
 			error = uiomove(mtod(m, char *) + moff, (int)len, uio);
 			SOCKBUF_LOCK(&so->so_rcv);
-			if (error)
+			if (error) {
+				/*
+				 * The MT_SONAME mbuf has already been removed
+				 * from the record, so it is necessary to
+				 * remove the data mbufs, if any, to preserve
+				 * the invariant in the case of PR_ADDR that
+				 * requires MT_SONAME mbufs at the head of
+				 * each record.
+				 */
+				if (m && pr->pr_flags & PR_ATOMIC &&
+				    ((flags & MSG_PEEK) == 0))
+					(void)sbdroprecord_locked(&so->so_rcv);
+				SOCKBUF_UNLOCK(&so->so_rcv);
 				goto release;
+			}
 		} else
 			uio->uio_resid -= len;
 		SOCKBUF_LOCK_ASSERT(&so->so_rcv);
@@ -1287,14 +1704,7 @@
 					so->so_rcv.sb_mb = m_free(m);
 					m = so->so_rcv.sb_mb;
 				}
-				if (m != NULL) {
-					m->m_nextpkt = nextrecord;
-					if (nextrecord == NULL)
-						so->so_rcv.sb_lastrecord = m;
-				} else {
-					so->so_rcv.sb_mb = nextrecord;
-					SB_EMPTY_FIXUP(&so->so_rcv);
-				}
+				sockbuf_pushsync(&so->so_rcv, nextrecord);
 				SBLASTRECORDCHK(&so->so_rcv);
 				SBLASTMBUFCHK(&so->so_rcv);
 			}
@@ -1316,9 +1726,11 @@
 						SOCKBUF_LOCK(&so->so_rcv);
  					if (*mp == NULL) {
  						/*
- 						 * m_copym() couldn't allocate an mbuf. 
-						 * Adjust uio_resid back (it was adjusted 
-						 * down by len bytes, which we didn't end 
+ 						 * m_copym() couldn't
+						 * allocate an mbuf.  Adjust
+						 * uio_resid back (it was
+						 * adjusted down by len
+						 * bytes, which we didn't end
 						 * up "copying" over).
  						 */
  						uio->uio_resid += len;
@@ -1347,11 +1759,11 @@
 		if (flags & MSG_EOR)
 			break;
 		/*
-		 * If the MSG_WAITALL flag is set (for non-atomic socket),
-		 * we must not quit until "uio->uio_resid == 0" or an error
-		 * termination.  If a signal/timeout occurs, return
-		 * with a short count but without error.
-		 * Keep sockbuf locked against other readers.
+		 * If the MSG_WAITALL flag is set (for non-atomic socket), we
+		 * must not quit until "uio->uio_resid == 0" or an error
+		 * termination.  If a signal/timeout occurs, return with a
+		 * short count but without error.  Keep sockbuf locked
+		 * against other readers.
 		 */
 		while (flags & MSG_WAITALL && m == NULL && uio->uio_resid > 0 &&
 		    !sosendallatonce(so) && nextrecord == NULL) {
@@ -1362,7 +1774,7 @@
 			 * Notify the protocol that some data has been
 			 * drained before blocking.
 			 */
-			if (pr->pr_flags & PR_WANTRCVD && so->so_pcb != NULL) {
+			if (pr->pr_flags & PR_WANTRCVD) {
 				SOCKBUF_UNLOCK(&so->so_rcv);
 				(*pr->pr_usrreqs->pru_rcvd)(so, flags);
 				SOCKBUF_LOCK(&so->so_rcv);
@@ -1370,8 +1782,10 @@
 			SBLASTRECORDCHK(&so->so_rcv);
 			SBLASTMBUFCHK(&so->so_rcv);
 			error = sbwait(&so->so_rcv);
-			if (error)
+			if (error) {
+				SOCKBUF_UNLOCK(&so->so_rcv);
 				goto release;
+			}
 			m = so->so_rcv.sb_mb;
 			if (m != NULL)
 				nextrecord = m->m_nextpkt;
@@ -1401,12 +1815,12 @@
 		SBLASTRECORDCHK(&so->so_rcv);
 		SBLASTMBUFCHK(&so->so_rcv);
 		/*
-		 * If soreceive() is being done from the socket callback, then 
-		 * don't need to generate ACK to peer to update window, since 
-		 * ACK will be generated on return to TCP.
+		 * If soreceive() is being done from the socket callback,
+		 * then don't need to generate ACK to peer to update window,
+		 * since ACK will be generated on return to TCP.
 		 */
-		if (!(flags & MSG_SOCALLBCK) && 
-		    (pr->pr_flags & PR_WANTRCVD) && so->so_pcb) {
+		if (!(flags & MSG_SOCALLBCK) &&
+		    (pr->pr_flags & PR_WANTRCVD)) {
 			SOCKBUF_UNLOCK(&so->so_rcv);
 			(*pr->pr_usrreqs->pru_rcvd)(so, flags);
 			SOCKBUF_LOCK(&so->so_rcv);
@@ -1415,25 +1829,33 @@
 	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 	if (orig_resid == uio->uio_resid && orig_resid &&
 	    (flags & MSG_EOR) == 0 && (so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0) {
-		sbunlock(&so->so_rcv);
+		SOCKBUF_UNLOCK(&so->so_rcv);
 		goto restart;
 	}
+	SOCKBUF_UNLOCK(&so->so_rcv);
 
 	if (flagsp != NULL)
 		*flagsp |= flags;
 release:
-	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
 	sbunlock(&so->so_rcv);
-out:
-	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
-	SOCKBUF_UNLOCK(&so->so_rcv);
 	return (error);
 }
 
 int
-soshutdown(so, how)
-	struct socket *so;
-	int how;
+soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio,
+    struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
+{
+
+	/* XXXRW: Temporary debugging. */
+	KASSERT(so->so_proto->pr_usrreqs->pru_soreceive != soreceive,
+	    ("soreceive: protocol calls soreceive"));
+
+	return (so->so_proto->pr_usrreqs->pru_soreceive(so, psa, uio, mp0,
+	    controlp, flagsp));
+}
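A hedged sketch of the mp0 mode described above: a kernel consumer that wants the data back as an mbuf chain supplies mp0 and uses the uio only to bound the byte count (so and td assumed provided):

	struct uio auio;
	struct mbuf *mp = NULL;
	int error, flags;

	flags = MSG_DONTWAIT;
	bzero(&auio, sizeof(auio));
	auio.uio_resid = 1024 * 1024;	/* upper bound on bytes wanted */
	auio.uio_td = td;
	error = soreceive(so, NULL, &auio, &mp, NULL, &flags);
	/* On success 'mp' holds the received chain; only uio_resid was consulted. */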
+
+int
+soshutdown(struct socket *so, int how)
 {
 	struct protosw *pr = so->so_proto;
 
@@ -1448,8 +1870,7 @@
 }
 
 void
-sorflush(so)
-	struct socket *so;
+sorflush(struct socket *so)
 {
 	struct sockbuf *sb = &so->so_rcv;
 	struct protosw *pr = so->so_proto;
@@ -1463,27 +1884,28 @@
 	 * however, we have to initialize and destroy the mutex in the copy
 	 * so that dom_dispose() and sbrelease() can lock it as needed.
 	 */
-	SOCKBUF_LOCK(sb);
-	sb->sb_flags |= SB_NOINTR;
-	(void) sblock(sb, M_WAITOK);
+
 	/*
-	 * socantrcvmore_locked() drops the socket buffer mutex so that it
-	 * can safely perform wakeups.  Re-acquire the mutex before
-	 * continuing.
+	 * Dislodge threads currently blocked in receive and wait to acquire
+	 * a lock against other simultaneous readers before clearing the
+	 * socket buffer.  Don't let our acquire be interrupted by a signal
+	 * despite any existing socket disposition on interruptible waiting.
 	 */
-	socantrcvmore_locked(so);
-	SOCKBUF_LOCK(sb);
-	sbunlock(sb);
+	socantrcvmore(so);
+	(void) sblock(sb, SBL_WAIT | SBL_NOINTR);
+
 	/*
-	 * Invalidate/clear most of the sockbuf structure, but leave
-	 * selinfo and mutex data unchanged.
+	 * Invalidate/clear most of the sockbuf structure, but leave selinfo
+	 * and mutex data unchanged.
 	 */
+	SOCKBUF_LOCK(sb);
 	bzero(&asb, offsetof(struct sockbuf, sb_startzero));
 	bcopy(&sb->sb_startzero, &asb.sb_startzero,
 	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
 	bzero(&sb->sb_startzero,
 	    sizeof(*sb) - offsetof(struct sockbuf, sb_startzero));
 	SOCKBUF_UNLOCK(sb);
+	sbunlock(sb);
 
 	SOCKBUF_LOCK_INIT(&asb, "so_rcv");
 	if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose != NULL)
@@ -1493,26 +1915,22 @@
 }
 
 /*
- * Perhaps this routine, and sooptcopyout(), below, ought to come in
- * an additional variant to handle the case where the option value needs
- * to be some kind of integer, but not a specific size.
- * In addition to their use here, these functions are also called by the
- * protocol-level pr_ctloutput() routines.
+ * Perhaps this routine, and sooptcopyout(), below, ought to come in an
+ * additional variant to handle the case where the option value needs to be
+ * some kind of integer, but not a specific size.  In addition to their use
+ * here, these functions are also called by the protocol-level pr_ctloutput()
+ * routines.
  */
 int
-sooptcopyin(sopt, buf, len, minlen)
-	struct	sockopt *sopt;
-	void	*buf;
-	size_t	len;
-	size_t	minlen;
+sooptcopyin(struct sockopt *sopt, void *buf, size_t len, size_t minlen)
 {
 	size_t	valsize;
 
 	/*
-	 * If the user gives us more than we wanted, we ignore it,
-	 * but if we don't get the minimum length the caller
-	 * wants, we return EINVAL.  On success, sopt->sopt_valsize
-	 * is set to however much we actually retrieved.
+	 * If the user gives us more than we wanted, we ignore it, but if we
+	 * don't get the minimum length the caller wants, we return EINVAL.
+	 * On success, sopt->sopt_valsize is set to however much we actually
+	 * retrieved.
 	 */
 	if ((valsize = sopt->sopt_valsize) < minlen)
 		return EINVAL;
@@ -1523,11 +1941,12 @@
 		return (copyin(sopt->sopt_val, buf, valsize));
 
 	bcopy(sopt->sopt_val, buf, valsize);
-	return 0;
+	return (0);
 }
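A hedged sketch of the usual pr_ctloutput() SOPT_SET idiom built on sooptcopyin(), for an integer-sized option value:

	int error, optval;

	error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval));
	if (error)
		return (error);
	/* ... validate and apply optval to the protocol's state ... */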
 
 /*
- * Kernel version of setsockopt(2)/
+ * Kernel version of setsockopt(2).
+ *
  * XXX: optlen is size_t, not socklen_t
  */
 int
@@ -1546,9 +1965,7 @@
 }
 
 int
-sosetopt(so, sopt)
-	struct socket *so;
-	struct sockopt *sopt;
+sosetopt(struct socket *so, struct sockopt *sopt)
 {
 	int	error, optval;
 	struct	linger l;
@@ -1620,8 +2037,8 @@
 				goto bad;
 
 			/*
-			 * Values < 1 make no sense for any of these
-			 * options, so disallow them.
+			 * Values < 1 make no sense for any of these options,
+			 * so disallow them.
 			 */
 			if (optval < 1) {
 				error = EINVAL;
@@ -1642,8 +2059,8 @@
 				break;
 
 			/*
-			 * Make sure the low-water is never greater than
-			 * the high-water.
+			 * Make sure the low-water is never greater than the
+			 * high-water.
 			 */
 			case SO_SNDLOWAT:
 				SOCKBUF_LOCK(&so->so_snd);
@@ -1732,7 +2149,9 @@
 	return (error);
 }
 
-/* Helper routine for getsockopt */
+/*
+ * Helper routine for getsockopt.
+ */
 int
 sooptcopyout(struct sockopt *sopt, const void *buf, size_t len)
 {
@@ -1742,13 +2161,12 @@
 	error = 0;
 
 	/*
-	 * Documented get behavior is that we always return a value,
-	 * possibly truncated to fit in the user's buffer.
-	 * Traditional behavior is that we always tell the user
-	 * precisely how much we copied, rather than something useful
-	 * like the total amount we had available for her.
-	 * Note that this interface is not idempotent; the entire answer must
-	 * generated ahead of time.
+	 * Documented get behavior is that we always return a value, possibly
+	 * truncated to fit in the user's buffer.  Traditional behavior is
+	 * that we always tell the user precisely how much we copied, rather
+	 * than something useful like the total amount we had available for
+	 * her.  Note that this interface is not idempotent; the entire
+	 * answer must be generated ahead of time.
 	 */
 	valsize = min(len, sopt->sopt_valsize);
 	sopt->sopt_valsize = valsize;
@@ -1758,13 +2176,11 @@
 		else
 			bcopy(buf, sopt->sopt_val, valsize);
 	}
-	return error;
+	return (error);
 }
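And the matching hedged SOPT_GET sketch using sooptcopyout(); the value source here is a stand-in:

	int optval;

	optval = 0;	/* stand-in for the protocol's current setting */
	return (sooptcopyout(sopt, &optval, sizeof(optval)));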
 
 int
-sogetopt(so, sopt)
-	struct socket *so;
-	struct sockopt *sopt;
+sogetopt(struct socket *so, struct sockopt *sopt)
 {
 	int	error, optval;
 	struct	linger l;
@@ -1817,8 +2233,10 @@
 			goto integer;
 
 		case SO_ERROR:
+			SOCK_LOCK(so);
 			optval = so->so_error;
 			so->so_error = 0;
+			SOCK_UNLOCK(so);
 			goto integer;
 
 		case SO_SNDBUF:
@@ -1954,7 +2372,7 @@
 		m_prev->m_next = m;
 		m_prev = m;
 	}
-	return 0;
+	return (0);
 }
 
 /* XXX; copyin sopt data into mbuf chain for (__FreeBSD__ < 3) routines. */
@@ -1964,7 +2382,7 @@
 	struct mbuf *m0 = m;
 
 	if (sopt->sopt_val == NULL)
-		return 0;
+		return (0);
 	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
 		if (sopt->sopt_td != NULL) {
 			int error;
@@ -1983,7 +2401,7 @@
 	}
 	if (m != NULL) /* should be allocated sufficiently at ip6_sooptmcopyin() */
 		panic("ip6_sooptmcopyin");
-	return 0;
+	return (0);
 }
 
 /* XXX; copyout mbuf chain data into soopt for (__FreeBSD__ < 3) routines. */
@@ -1994,7 +2412,7 @@
 	size_t valsize = 0;
 
 	if (sopt->sopt_val == NULL)
-		return 0;
+		return (0);
 	while (m != NULL && sopt->sopt_valsize >= m->m_len) {
 		if (sopt->sopt_td != NULL) {
 			int error;
@@ -2018,13 +2436,17 @@
 		return(EINVAL);
 	}
 	sopt->sopt_valsize = valsize;
-	return 0;
+	return (0);
 }
 
+/*
+ * sohasoutofband(): protocol notifies socket layer of the arrival of new
+ * out-of-band data, which will then notify socket consumers.
+ */
 void
-sohasoutofband(so)
-	struct socket *so;
+sohasoutofband(struct socket *so)
 {
+
 	if (so->so_sigio != NULL)
 		pgsigio(&so->so_sigio, SIGURG, 0);
 	selwakeuppri(&so->so_rcv.sb_sel, PSOCK);
@@ -2034,6 +2456,19 @@
 sopoll(struct socket *so, int events, struct ucred *active_cred,
     struct thread *td)
 {
+
+	/* XXXRW: Temporary debugging. */
+	KASSERT(so->so_proto->pr_usrreqs->pru_sopoll != sopoll,
+	    ("sopoll: protocol calls sopoll"));
+
+	return (so->so_proto->pr_usrreqs->pru_sopoll(so, events, active_cred,
+	    td));
+}
+
+int
+sopoll_generic(struct socket *so, int events, struct ucred *active_cred,
+    struct thread *td)
+{
 	int revents = 0;
 
 	SOCKBUF_LOCK(&so->so_snd);
@@ -2103,6 +2538,146 @@
 	return (0);
 }
 
+/*
+ * Some routines that return EOPNOTSUPP for entry points that are not
+ * supported by a protocol.  Fill in as needed.
+ */
+int
+pru_accept_notsupp(struct socket *so, struct sockaddr **nam)
+{
+
+	return EOPNOTSUPP;
+}
+
+int
+pru_attach_notsupp(struct socket *so, int proto, struct thread *td)
+{
+
+	return EOPNOTSUPP;
+}
+
+int
+pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+
+	return EOPNOTSUPP;
+}
+
+int
+pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
+{
+
+	return EOPNOTSUPP;
+}
+
+int
+pru_connect2_notsupp(struct socket *so1, struct socket *so2)
+{
+
+	return EOPNOTSUPP;
+}
+
+int
+pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data,
+    struct ifnet *ifp, struct thread *td)
+{
+
+	return EOPNOTSUPP;
+}
+
+int
+pru_disconnect_notsupp(struct socket *so)
+{
+
+	return EOPNOTSUPP;
+}
+
+int
+pru_listen_notsupp(struct socket *so, int backlog, struct thread *td)
+{
+
+	return EOPNOTSUPP;
+}
+
+int
+pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam)
+{
+
+	return EOPNOTSUPP;
+}
+
+int
+pru_rcvd_notsupp(struct socket *so, int flags)
+{
+
+	return EOPNOTSUPP;
+}
+
+int
+pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
+{
+
+	return EOPNOTSUPP;
+}
+
+int
+pru_send_notsupp(struct socket *so, int flags, struct mbuf *m,
+    struct sockaddr *addr, struct mbuf *control, struct thread *td)
+{
+
+	return EOPNOTSUPP;
+}
+
+/*
+ * This isn't really a ``null'' operation, but it's the default one and
+ * doesn't do anything destructive.
+ */
+int
+pru_sense_null(struct socket *so, struct stat *sb)
+{
+
+	sb->st_blksize = so->so_snd.sb_hiwat;
+	return 0;
+}
+
+int
+pru_shutdown_notsupp(struct socket *so)
+{
+
+	return EOPNOTSUPP;
+}
+
+int
+pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam)
+{
+
+	return EOPNOTSUPP;
+}
+
+int
+pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio,
+    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
+{
+
+	return EOPNOTSUPP;
+}
+
+int
+pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr,
+    struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp)
+{
+
+	return EOPNOTSUPP;
+}
+
+int
+pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred,
+    struct thread *td)
+{
+
+	return EOPNOTSUPP;
+}
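These stubs let a protocol fill every pr_usrreqs slot it does not implement instead of leaving it NULL. Hedged sketch with hypothetical foo_* handlers:

	static struct pr_usrreqs foo_usrreqs = {
		.pru_attach	= foo_attach,
		.pru_detach	= foo_detach,
		.pru_send	= foo_send,
		.pru_accept	= pru_accept_notsupp,
		.pru_connect	= pru_connect_notsupp,
		.pru_listen	= pru_listen_notsupp,
		.pru_rcvoob	= pru_rcvoob_notsupp,
		.pru_sense	= pru_sense_null,
	};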
+
 static void
 filt_sordetach(struct knote *kn)
 {
@@ -2195,13 +2770,13 @@
 }
 
 static int
-somaxconn_sysctl(SYSCTL_HANDLER_ARGS)
+sysctl_somaxconn(SYSCTL_HANDLER_ARGS)
 {
 	int error;
 	int val;
 
 	val = somaxconn;
-	error = sysctl_handle_int(oidp, &val, sizeof(int), req);
+	error = sysctl_handle_int(oidp, &val, 0, req);
 	if (error || !req->newptr )
 		return (error);
 
@@ -2211,3 +2786,172 @@
 	somaxconn = val;
 	return (0);
 }
+
+/*
+ * These functions are used by protocols to notify the socket layer (and its
+ * consumers) of state changes in the sockets driven by protocol-side events.
+ */
+
+/*
+ * Procedures to manipulate state flags of socket and do appropriate wakeups.
+ *
+ * Normal sequence from the active (originating) side is that
+ * soisconnecting() is called during processing of connect() call, resulting
+ * in an eventual call to soisconnected() if/when the connection is
+ * established.  When the connection is torn down soisdisconnecting() is
+ * called during processing of disconnect() call, and soisdisconnected() is
+ * called when the connection to the peer is totally severed.  The semantics
+ * of these routines are such that connectionless protocols can call
+ * soisconnected() and soisdisconnected() only, bypassing the in-progress
+ * calls when setting up a ``connection'' takes no time.
+ *
+ * From the passive side, a socket is created with two queues of sockets:
+ * so_incomp for connections in progress and so_comp for connections already
+ * made and awaiting user acceptance.  As a protocol is preparing incoming
+ * connections, it creates a socket structure queued on so_incomp by calling
+ * sonewconn().  When the connection is established, soisconnected() is
+ * called, and transfers the socket structure to so_comp, making it available
+ * to accept().
+ *
+ * If a socket is closed with sockets on either so_incomp or so_comp, these
+ * sockets are dropped.
+ *
+ * If higher-level protocols are implemented in the kernel, the wakeups done
+ * here will sometimes cause software-interrupt process scheduling.
+ */
+void
+soisconnecting(struct socket *so)
+{
+
+	SOCK_LOCK(so);
+	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
+	so->so_state |= SS_ISCONNECTING;
+	SOCK_UNLOCK(so);
+}
+
+void
+soisconnected(struct socket *so)
+{
+	struct socket *head;
+
+	ACCEPT_LOCK();
+	SOCK_LOCK(so);
+	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
+	so->so_state |= SS_ISCONNECTED;
+	head = so->so_head;
+	if (head != NULL && (so->so_qstate & SQ_INCOMP)) {
+		if ((so->so_options & SO_ACCEPTFILTER) == 0) {
+			SOCK_UNLOCK(so);
+			TAILQ_REMOVE(&head->so_incomp, so, so_list);
+			head->so_incqlen--;
+			so->so_qstate &= ~SQ_INCOMP;
+			TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
+			head->so_qlen++;
+			so->so_qstate |= SQ_COMP;
+			ACCEPT_UNLOCK();
+			sorwakeup(head);
+			wakeup_one(&head->so_timeo);
+		} else {
+			ACCEPT_UNLOCK();
+			so->so_upcall =
+			    head->so_accf->so_accept_filter->accf_callback;
+			so->so_upcallarg = head->so_accf->so_accept_filter_arg;
+			so->so_rcv.sb_flags |= SB_UPCALL;
+			so->so_options &= ~SO_ACCEPTFILTER;
+			SOCK_UNLOCK(so);
+			so->so_upcall(so, so->so_upcallarg, M_DONTWAIT);
+		}
+		return;
+	}
+	SOCK_UNLOCK(so);
+	ACCEPT_UNLOCK();
+	wakeup(&so->so_timeo);
+	sorwakeup(so);
+	sowwakeup(so);
+}
+
+void
+soisdisconnecting(struct socket *so)
+{
+
+	/*
+	 * Note: This code assumes that SOCK_LOCK(so) and
+	 * SOCKBUF_LOCK(&so->so_rcv) are the same.
+	 */
+	SOCKBUF_LOCK(&so->so_rcv);
+	so->so_state &= ~SS_ISCONNECTING;
+	so->so_state |= SS_ISDISCONNECTING;
+	so->so_rcv.sb_state |= SBS_CANTRCVMORE;
+	sorwakeup_locked(so);
+	SOCKBUF_LOCK(&so->so_snd);
+	so->so_snd.sb_state |= SBS_CANTSENDMORE;
+	sowwakeup_locked(so);
+	wakeup(&so->so_timeo);
+}
+
+void
+soisdisconnected(struct socket *so)
+{
+
+	/*
+	 * Note: This code assumes that SOCK_LOCK(so) and
+	 * SOCKBUF_LOCK(&so->so_rcv) are the same.
+	 */
+	SOCKBUF_LOCK(&so->so_rcv);
+	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
+	so->so_state |= SS_ISDISCONNECTED;
+	so->so_rcv.sb_state |= SBS_CANTRCVMORE;
+	sorwakeup_locked(so);
+	SOCKBUF_LOCK(&so->so_snd);
+	so->so_snd.sb_state |= SBS_CANTSENDMORE;
+	sbdrop_locked(&so->so_snd, so->so_snd.sb_cc);
+	sowwakeup_locked(so);
+	wakeup(&so->so_timeo);
+}
+
+/*
+ * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
+ */
+struct sockaddr *
+sodupsockaddr(const struct sockaddr *sa, int mflags)
+{
+	struct sockaddr *sa2;
+
+	sa2 = malloc(sa->sa_len, M_SONAME, mflags);
+	if (sa2)
+		bcopy(sa, sa2, sa->sa_len);
+	return sa2;
+}
+
+/*
+ * Create an external-format (``xsocket'') structure using the information in
+ * the kernel-format socket structure pointed to by so.  This is done to
+ * reduce the spew of irrelevant information over this interface, to isolate
+ * user code from changes in the kernel structure, and potentially to provide
+ * information-hiding if we decide that some of this information should be
+ * hidden from users.
+ */
+void
+sotoxsocket(struct socket *so, struct xsocket *xso)
+{
+
+	xso->xso_len = sizeof *xso;
+	xso->xso_so = so;
+	xso->so_type = so->so_type;
+	xso->so_options = so->so_options;
+	xso->so_linger = so->so_linger;
+	xso->so_state = so->so_state;
+	xso->so_pcb = so->so_pcb;
+	xso->xso_protocol = so->so_proto->pr_protocol;
+	xso->xso_family = so->so_proto->pr_domain->dom_family;
+	xso->so_qlen = so->so_qlen;
+	xso->so_incqlen = so->so_incqlen;
+	xso->so_qlimit = so->so_qlimit;
+	xso->so_timeo = so->so_timeo;
+	xso->so_error = so->so_error;
+	xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0;
+	xso->so_oobmark = so->so_oobmark;
+	sbtoxsockbuf(&so->so_snd, &xso->so_snd);
+	sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
+	xso->so_uid = so->so_cred->cr_uid;
+}
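
As the state-transition comment above describes, a connectionless protocol can skip the
in-progress states and call soisconnected() directly.  A minimal sketch of such a
protocol-side connect routine (the xproto_* name is hypothetical and not part of this
change):

static int
xproto_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
{
	/*
	 * No handshake to wait for: report the "connection" as established
	 * at once instead of going through soisconnecting() first.
	 */
	soisconnected(so);
	return (0);
}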
Index: kern_condvar.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_condvar.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/kern_condvar.c -L sys/kern/kern_condvar.c -u -r1.2 -r1.3
--- sys/kern/kern_condvar.c
+++ sys/kern/kern_condvar.c
@@ -25,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: /repoman/r/ncvs/src/sys/kern/kern_condvar.c,v 1.52.2.1 2006/02/27 00:19:40 davidxu Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_condvar.c,v 1.62 2007/06/04 23:50:56 jeff Exp $");
 
 #include "opt_ktrace.h"
 
@@ -49,12 +49,11 @@
 /*
  * Common sanity checks for cv_wait* functions.
  */
-#define	CV_ASSERT(cvp, mp, td) do {					\
+#define	CV_ASSERT(cvp, lock, td) do {					\
 	KASSERT((td) != NULL, ("%s: curthread NULL", __func__));	\
 	KASSERT(TD_IS_RUNNING(td), ("%s: not TDS_RUNNING", __func__));	\
 	KASSERT((cvp) != NULL, ("%s: cvp NULL", __func__));		\
-	KASSERT((mp) != NULL, ("%s: mp NULL", __func__));		\
-	mtx_assert((mp), MA_OWNED | MA_NOTRECURSED);			\
+	KASSERT((lock) != NULL, ("%s: lock NULL", __func__));		\
 } while (0)
 
 /*
@@ -93,20 +92,23 @@
  * held when cv_signal or cv_broadcast are called.
  */
 void
-cv_wait(struct cv *cvp, struct mtx *mp)
+_cv_wait(struct cv *cvp, struct lock_object *lock)
 {
+	WITNESS_SAVE_DECL(lock_witness);
+	struct lock_class *class;
 	struct thread *td;
-	WITNESS_SAVE_DECL(mp);
+	int lock_state;
 
 	td = curthread;
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_CSW))
 		ktrcsw(1, 0);
 #endif
-	CV_ASSERT(cvp, mp, td);
-	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &mp->mtx_object,
+	CV_ASSERT(cvp, lock, td);
+	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock,
 	    "Waiting on \"%s\"", cvp->cv_description);
-	WITNESS_SAVE(&mp->mtx_object, mp);
+	WITNESS_SAVE(lock, lock_witness);
+	class = LOCK_CLASS(lock);
 
 	if (cold || panicstr) {
 		/*
@@ -122,9 +124,66 @@
 
 	cvp->cv_waiters++;
 	DROP_GIANT();
-	mtx_unlock(mp);
 
-	sleepq_add(cvp, mp, cvp->cv_description, SLEEPQ_CONDVAR);
+	sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR, 0);
+	if (class->lc_flags & LC_SLEEPABLE)
+		sleepq_release(cvp);
+	lock_state = class->lc_unlock(lock);
+	if (class->lc_flags & LC_SLEEPABLE)
+		sleepq_lock(cvp);
+	sleepq_wait(cvp);
+
+#ifdef KTRACE
+	if (KTRPOINT(td, KTR_CSW))
+		ktrcsw(0, 0);
+#endif
+	PICKUP_GIANT();
+	class->lc_lock(lock, lock_state);
+	WITNESS_RESTORE(lock, lock_witness);
+}
+
+/*
+ * Wait on a condition variable.  This function differs from cv_wait by
+ * not acquiring the lock after the condition variable was signaled.
+ */
+void
+_cv_wait_unlock(struct cv *cvp, struct lock_object *lock)
+{
+	struct lock_class *class;
+	struct thread *td;
+
+	td = curthread;
+#ifdef KTRACE
+	if (KTRPOINT(td, KTR_CSW))
+		ktrcsw(1, 0);
+#endif
+	CV_ASSERT(cvp, lock, td);
+	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock,
+	    "Waiting on \"%s\"", cvp->cv_description);
+	class = LOCK_CLASS(lock);
+
+	if (cold || panicstr) {
+		/*
+		 * During autoconfiguration, just give interrupts
+		 * a chance, then just return.  Don't run any other
+		 * thread or panic below, in case this is the idle
+		 * process and already asleep.
+		 */
+		class->lc_unlock(lock);
+		return;
+	}
+
+	sleepq_lock(cvp);
+
+	cvp->cv_waiters++;
+	DROP_GIANT();
+
+	sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR, 0);
+	if (class->lc_flags & LC_SLEEPABLE)
+		sleepq_release(cvp);
+	class->lc_unlock(lock);
+	if (class->lc_flags & LC_SLEEPABLE)
+		sleepq_lock(cvp);
 	sleepq_wait(cvp);
 
 #ifdef KTRACE
@@ -132,8 +191,6 @@
 		ktrcsw(0, 0);
 #endif
 	PICKUP_GIANT();
-	mtx_lock(mp);
-	WITNESS_RESTORE(&mp->mtx_object, mp);
 }
 
 /*
@@ -143,12 +200,13 @@
  * restarted if possible.
  */
 int
-cv_wait_sig(struct cv *cvp, struct mtx *mp)
+_cv_wait_sig(struct cv *cvp, struct lock_object *lock)
 {
+	WITNESS_SAVE_DECL(lock_witness);
+	struct lock_class *class;
 	struct thread *td;
 	struct proc *p;
-	int rval;
-	WITNESS_SAVE_DECL(mp);
+	int lock_state, rval;
 
 	td = curthread;
 	p = td->td_proc;
@@ -156,10 +214,11 @@
 	if (KTRPOINT(td, KTR_CSW))
 		ktrcsw(1, 0);
 #endif
-	CV_ASSERT(cvp, mp, td);
-	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &mp->mtx_object,
+	CV_ASSERT(cvp, lock, td);
+	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock,
 	    "Waiting on \"%s\"", cvp->cv_description);
-	WITNESS_SAVE(&mp->mtx_object, mp);
+	WITNESS_SAVE(lock, lock_witness);
+	class = LOCK_CLASS(lock);
 
 	if (cold || panicstr) {
 		/*
@@ -175,10 +234,14 @@
 
 	cvp->cv_waiters++;
 	DROP_GIANT();
-	mtx_unlock(mp);
 
-	sleepq_add(cvp, mp, cvp->cv_description, SLEEPQ_CONDVAR |
-	    SLEEPQ_INTERRUPTIBLE);
+	sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR |
+	    SLEEPQ_INTERRUPTIBLE, 0);
+	if (class->lc_flags & LC_SLEEPABLE)
+		sleepq_release(cvp);
+	lock_state = class->lc_unlock(lock);
+	if (class->lc_flags & LC_SLEEPABLE)
+		sleepq_lock(cvp);
 	rval = sleepq_wait_sig(cvp);
 
 #ifdef KTRACE
@@ -186,8 +249,8 @@
 		ktrcsw(0, 0);
 #endif
 	PICKUP_GIANT();
-	mtx_lock(mp);
-	WITNESS_RESTORE(&mp->mtx_object, mp);
+	class->lc_lock(lock, lock_state);
+	WITNESS_RESTORE(lock, lock_witness);
 
 	return (rval);
 }
@@ -198,11 +261,12 @@
  * expires.
  */
 int
-cv_timedwait(struct cv *cvp, struct mtx *mp, int timo)
+_cv_timedwait(struct cv *cvp, struct lock_object *lock, int timo)
 {
+	WITNESS_SAVE_DECL(lock_witness);
+	struct lock_class *class;
 	struct thread *td;
-	int rval;
-	WITNESS_SAVE_DECL(mp);
+	int lock_state, rval;
 
 	td = curthread;
 	rval = 0;
@@ -210,10 +274,11 @@
 	if (KTRPOINT(td, KTR_CSW))
 		ktrcsw(1, 0);
 #endif
-	CV_ASSERT(cvp, mp, td);
-	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &mp->mtx_object,
+	CV_ASSERT(cvp, lock, td);
+	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock,
 	    "Waiting on \"%s\"", cvp->cv_description);
-	WITNESS_SAVE(&mp->mtx_object, mp);
+	WITNESS_SAVE(lock, lock_witness);
+	class = LOCK_CLASS(lock);
 
 	if (cold || panicstr) {
 		/*
@@ -229,10 +294,14 @@
 
 	cvp->cv_waiters++;
 	DROP_GIANT();
-	mtx_unlock(mp);
 
-	sleepq_add(cvp, mp, cvp->cv_description, SLEEPQ_CONDVAR);
+	sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR, 0);
 	sleepq_set_timeout(cvp, timo);
+	if (class->lc_flags & LC_SLEEPABLE)
+		sleepq_release(cvp);
+	lock_state = class->lc_unlock(lock);
+	if (class->lc_flags & LC_SLEEPABLE)
+		sleepq_lock(cvp);
 	rval = sleepq_timedwait(cvp);
 
 #ifdef KTRACE
@@ -240,8 +309,8 @@
 		ktrcsw(0, 0);
 #endif
 	PICKUP_GIANT();
-	mtx_lock(mp);
-	WITNESS_RESTORE(&mp->mtx_object, mp);
+	class->lc_lock(lock, lock_state);
+	WITNESS_RESTORE(lock, lock_witness);
 
 	return (rval);
 }
@@ -253,12 +322,13 @@
  * a signal was caught.
  */
 int
-cv_timedwait_sig(struct cv *cvp, struct mtx *mp, int timo)
+_cv_timedwait_sig(struct cv *cvp, struct lock_object *lock, int timo)
 {
+	WITNESS_SAVE_DECL(lock_witness);
+	struct lock_class *class;
 	struct thread *td;
 	struct proc *p;
-	int rval;
-	WITNESS_SAVE_DECL(mp);
+	int lock_state, rval;
 
 	td = curthread;
 	p = td->td_proc;
@@ -267,10 +337,11 @@
 	if (KTRPOINT(td, KTR_CSW))
 		ktrcsw(1, 0);
 #endif
-	CV_ASSERT(cvp, mp, td);
-	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, &mp->mtx_object,
+	CV_ASSERT(cvp, lock, td);
+	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock,
 	    "Waiting on \"%s\"", cvp->cv_description);
-	WITNESS_SAVE(&mp->mtx_object, mp);
+	WITNESS_SAVE(lock, lock_witness);
+	class = LOCK_CLASS(lock);
 
 	if (cold || panicstr) {
 		/*
@@ -286,11 +357,15 @@
 
 	cvp->cv_waiters++;
 	DROP_GIANT();
-	mtx_unlock(mp);
 
-	sleepq_add(cvp, mp, cvp->cv_description, SLEEPQ_CONDVAR |
-	    SLEEPQ_INTERRUPTIBLE);
+	sleepq_add(cvp, lock, cvp->cv_description, SLEEPQ_CONDVAR |
+	    SLEEPQ_INTERRUPTIBLE, 0);
 	sleepq_set_timeout(cvp, timo);
+	if (class->lc_flags & LC_SLEEPABLE)
+		sleepq_release(cvp);
+	lock_state = class->lc_unlock(lock);
+	if (class->lc_flags & LC_SLEEPABLE)
+		sleepq_lock(cvp);
 	rval = sleepq_timedwait_sig(cvp);
 
 #ifdef KTRACE
@@ -298,8 +373,8 @@
 		ktrcsw(0, 0);
 #endif
 	PICKUP_GIANT();
-	mtx_lock(mp);
-	WITNESS_RESTORE(&mp->mtx_object, mp);
+	class->lc_lock(lock, lock_state);
+	WITNESS_RESTORE(lock, lock_witness);
 
 	return (rval);
 }
@@ -318,9 +393,9 @@
 	sleepq_lock(cvp);
 	if (cvp->cv_waiters > 0) {
 		cvp->cv_waiters--;
-		sleepq_signal(cvp, SLEEPQ_CONDVAR, -1);
-	} else
-		sleepq_release(cvp);
+		sleepq_signal(cvp, SLEEPQ_CONDVAR, -1, 0);
+	}
+	sleepq_release(cvp);
 }
 
 /*
@@ -334,7 +409,7 @@
 	sleepq_lock(cvp);
 	if (cvp->cv_waiters > 0) {
 		cvp->cv_waiters = 0;
-		sleepq_broadcast(cvp, SLEEPQ_CONDVAR, pri);
+		sleepq_broadcast(cvp, SLEEPQ_CONDVAR, pri, 0);
 	} else
 		sleepq_release(cvp);
 }
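
The condition-variable rework above replaces the struct mtx argument with a generic
struct lock_object and uses the lock class's lc_unlock/lc_lock hooks to drop and
reacquire the interlock around the sleep (releasing the sleep queue first for
LC_SLEEPABLE lock classes).  Presumably the public cv_wait*() names stay
source-compatible as thin macros over the new _cv_wait*() entry points, roughly along
these lines (a sketch; the lock_object member name is assumed here):

#define	cv_wait(cvp, lock)						\
	_cv_wait((cvp), &(lock)->lock_object)
#define	cv_wait_sig(cvp, lock)						\
	_cv_wait_sig((cvp), &(lock)->lock_object)
#define	cv_timedwait(cvp, lock, timo)					\
	_cv_timedwait((cvp), &(lock)->lock_object, (timo))

The apparent intent is that a caller holding a lock type other than a mutex (the
LC_SLEEPABLE handling suggests sx-style locks) can wait on a condvar the same way a
mutex holder always could.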
Index: subr_mchain.c
===================================================================
RCS file: /home/cvs/src/sys/kern/subr_mchain.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/subr_mchain.c -L sys/kern/subr_mchain.c -u -r1.1.1.1 -r1.2
--- sys/kern/subr_mchain.c
+++ sys/kern/subr_mchain.c
@@ -28,7 +28,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/subr_mchain.c,v 1.17 2005/01/06 23:35:39 imp Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/subr_mchain.c,v 1.18 2005/07/29 13:22:36 imura Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -183,6 +183,7 @@
 	caddr_t dst;
 	c_caddr_t src;
 	int cplen, error, mleft, count;
+	size_t srclen, dstlen;
 
 	m = mbp->mb_cur;
 	mleft = mbp->mb_mleft;
@@ -199,10 +200,13 @@
 			continue;
 		}
 		cplen = mleft > size ? size : mleft;
+		srclen = dstlen = cplen;
 		dst = mtod(m, caddr_t) + m->m_len;
 		switch (type) {
 		    case MB_MCUSTOM:
-			error = mbp->mb_copy(mbp, source, dst, cplen);
+			srclen = size;
+			dstlen = mleft;
+			error = mbp->mb_copy(mbp, source, dst, &srclen, &dstlen);
 			if (error)
 				return error;
 			break;
@@ -222,11 +226,11 @@
 			bzero(dst, cplen);
 			break;
 		}
-		size -= cplen;
-		source += cplen;
-		m->m_len += cplen;
-		mleft -= cplen;
-		mbp->mb_count += cplen;
+		size -= srclen;
+		source += srclen;
+		m->m_len += dstlen;
+		mleft -= dstlen;
+		mbp->mb_count += dstlen;
 	}
 	mbp->mb_cur = m;
 	mbp->mb_mleft = mleft;
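
With this change the MB_MCUSTOM copy callback takes separate source and destination
lengths by reference, so a custom copier can consume and produce different byte counts
(e.g. a character-set conversion) and report both back; the main loop then advances the
source pointer and the mbuf fill level independently.  A hypothetical callback under the
new signature (illustrative only, not taken from this change):

static int
example_mb_copy(struct mbchain *mbp, c_caddr_t src, caddr_t dst,
    size_t *srclen, size_t *dstlen)
{
	size_t n;

	/* A plain copy consumes and produces the same number of bytes. */
	n = MIN(*srclen, *dstlen);
	bcopy(src, dst, n);
	*srclen = n;		/* bytes consumed from the source */
	*dstlen = n;		/* bytes written into the mbuf */
	return (0);
}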
Index: uipc_syscalls.c
===================================================================
RCS file: /home/cvs/src/sys/kern/uipc_syscalls.c,v
retrieving revision 1.3
retrieving revision 1.4
diff -L sys/kern/uipc_syscalls.c -L sys/kern/uipc_syscalls.c -u -r1.3 -r1.4
--- sys/kern/uipc_syscalls.c
+++ sys/kern/uipc_syscalls.c
@@ -33,8 +33,9 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/uipc_syscalls.c,v 1.221.2.1 2005/12/28 19:30:41 ps Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/uipc_syscalls.c,v 1.259.4.2 2008/02/14 11:45:41 simon Exp $");
 
+#include "opt_sctp.h"
 #include "opt_compat.h"
 #include "opt_ktrace.h"
 #include "opt_mac.h"
@@ -43,7 +44,6 @@
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
-#include <sys/mac.h>
 #include <sys/mutex.h>
 #include <sys/sysproto.h>
 #include <sys/malloc.h>
@@ -68,6 +68,8 @@
 #include <sys/ktrace.h>
 #endif
 
+#include <security/mac/mac_framework.h>
+
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
@@ -75,6 +77,11 @@
 #include <vm/vm_kern.h>
 #include <vm/vm_extern.h>
 
+#ifdef SCTP
+#include <netinet/sctp.h>
+#include <netinet/sctp_peeloff.h>
+#endif /* SCTP */
+
 static int sendit(struct thread *td, int s, struct msghdr *mp, int flags);
 static int recvit(struct thread *td, int s, struct msghdr *mp, void *namelenp);
 
@@ -92,7 +99,6 @@
 int nsfbufspeak;
 int nsfbufsused;
 
-SYSCTL_DECL(_kern_ipc);
 SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufs, CTLFLAG_RDTUN, &nsfbufs, 0,
     "Maximum number of sendfile(2) sf_bufs available");
 SYSCTL_INT(_kern_ipc, OID_AUTO, nsfbufspeak, CTLFLAG_RD, &nsfbufspeak, 0,
@@ -105,10 +111,11 @@
  * file entry is held upon returning.  This is lighter weight than
  * fgetsock(), which bumps the socket reference and drops the file reference
  * count instead, as this approach avoids several additional mutex operations
- * associated with the additional reference count.
+ * associated with the additional reference count.  If requested, return the
+ * open file flags.
  */
 static int
-getsock(struct filedesc *fdp, int fd, struct file **fpp)
+getsock(struct filedesc *fdp, int fd, struct file **fpp, u_int *fflagp)
 {
 	struct file *fp;
 	int error;
@@ -117,7 +124,7 @@
 	if (fdp == NULL)
 		error = EBADF;
 	else {
-		FILEDESC_LOCK_FAST(fdp);
+		FILEDESC_SLOCK(fdp);
 		fp = fget_locked(fdp, fd);
 		if (fp == NULL)
 			error = EBADF;
@@ -126,9 +133,11 @@
 			error = ENOTSOCK;
 		} else {
 			fhold(fp);
+			if (fflagp != NULL)
+				*fflagp = fp->f_flag;
 			error = 0;
 		}
-		FILEDESC_UNLOCK_FAST(fdp);
+		FILEDESC_SUNLOCK(fdp);
 	}
 	*fpp = fp;
 	return (error);
@@ -141,13 +150,10 @@
 #define COMPAT_OLDSOCK
 #endif
 
-/*
- * MPSAFE
- */
 int
 socket(td, uap)
 	struct thread *td;
-	register struct socket_args /* {
+	struct socket_args /* {
 		int	domain;
 		int	type;
 		int	protocol;
@@ -169,33 +175,28 @@
 	if (error)
 		return (error);
 	/* An extra reference on `fp' has been held for us by falloc(). */
-	NET_LOCK_GIANT();
 	error = socreate(uap->domain, &so, uap->type, uap->protocol,
 	    td->td_ucred, td);
-	NET_UNLOCK_GIANT();
 	if (error) {
 		fdclose(fdp, fp, fd, td);
 	} else {
-		FILEDESC_LOCK_FAST(fdp);
+		FILE_LOCK(fp);
 		fp->f_data = so;	/* already has ref count */
 		fp->f_flag = FREAD|FWRITE;
-		fp->f_ops = &socketops;
 		fp->f_type = DTYPE_SOCKET;
-		FILEDESC_UNLOCK_FAST(fdp);
+		fp->f_ops = &socketops;
+		FILE_UNLOCK(fp);
 		td->td_retval[0] = fd;
 	}
 	fdrop(fp, td);
 	return (error);
 }
 
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 bind(td, uap)
 	struct thread *td;
-	register struct bind_args /* {
+	struct bind_args /* {
 		int	s;
 		caddr_t	name;
 		int	namelen;
@@ -207,7 +208,9 @@
 	if ((error = getsockaddr(&sa, uap->name, uap->namelen)) != 0)
 		return (error);
 
-	return (kern_bind(td, uap->s, sa));
+	error = kern_bind(td, uap->s, sa);
+	free(sa, M_SONAME);
+	return (error);
 }
 
 int
@@ -220,37 +223,30 @@
 	struct file *fp;
 	int error;
 
-	NET_LOCK_GIANT();
-	error = getsock(td->td_proc->p_fd, fd, &fp);
+	error = getsock(td->td_proc->p_fd, fd, &fp, NULL);
 	if (error)
-		goto done2;
+		return (error);
 	so = fp->f_data;
 #ifdef MAC
 	SOCK_LOCK(so);
 	error = mac_check_socket_bind(td->td_ucred, so, sa);
 	SOCK_UNLOCK(so);
 	if (error)
-		goto done1;
+		goto done;
 #endif
 	error = sobind(so, sa, td);
 #ifdef MAC
-done1:
+done:
 #endif
 	fdrop(fp, td);
-done2:
-	NET_UNLOCK_GIANT();
-	FREE(sa, M_SONAME);
 	return (error);
 }
 
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 listen(td, uap)
 	struct thread *td;
-	register struct listen_args /* {
+	struct listen_args /* {
 		int	s;
 		int	backlog;
 	} */ *uap;
@@ -259,8 +255,7 @@
 	struct file *fp;
 	int error;
 
-	NET_LOCK_GIANT();
-	error = getsock(td->td_proc->p_fd, uap->s, &fp);
+	error = getsock(td->td_proc->p_fd, uap->s, &fp, NULL);
 	if (error == 0) {
 		so = fp->f_data;
 #ifdef MAC
@@ -276,28 +271,71 @@
 #endif
 		fdrop(fp, td);
 	}
-	NET_UNLOCK_GIANT();
 	return(error);
 }
 
 /*
  * accept1()
- * MPSAFE
  */
 static int
 accept1(td, uap, compat)
 	struct thread *td;
-	register struct accept_args /* {
+	struct accept_args /* {
 		int	s;
 		struct sockaddr	* __restrict name;
 		socklen_t	* __restrict anamelen;
 	} */ *uap;
 	int compat;
 {
+	struct sockaddr *name;
+	socklen_t namelen;
+	struct file *fp;
+	int error;
+
+	if (uap->name == NULL)
+		return (kern_accept(td, uap->s, NULL, NULL, NULL));
+
+	error = copyin(uap->anamelen, &namelen, sizeof (namelen));
+	if (error)
+		return (error);
+
+	error = kern_accept(td, uap->s, &name, &namelen, &fp);
+
+	/*
+	 * return a namelen of zero for older code which might
+	 * ignore the return value from accept.
+	 */
+	if (error) {
+		(void) copyout(&namelen,
+		    uap->anamelen, sizeof(*uap->anamelen));
+		return (error);
+	}
+
+	if (error == 0 && name != NULL) {
+#ifdef COMPAT_OLDSOCK
+		if (compat)
+			((struct osockaddr *)name)->sa_family =
+			    name->sa_family;
+#endif
+		error = copyout(name, uap->name, namelen);
+	}
+	if (error == 0)
+		error = copyout(&namelen, uap->anamelen,
+		    sizeof(namelen));
+	if (error)
+		fdclose(td->td_proc->p_fd, fp, td->td_retval[0], td);
+	fdrop(fp, td);
+	free(name, M_SONAME);
+	return (error);
+}
+
+int
+kern_accept(struct thread *td, int s, struct sockaddr **name,
+    socklen_t *namelen, struct file **fp)
+{
 	struct filedesc *fdp;
-	struct file *nfp = NULL;
+	struct file *headfp, *nfp = NULL;
 	struct sockaddr *sa = NULL;
-	socklen_t namelen;
 	int error;
 	struct socket *head, *so;
 	int fd;
@@ -305,18 +343,17 @@
 	pid_t pgid;
 	int tmp;
 
-	fdp = td->td_proc->p_fd;
-	if (uap->name) {
-		error = copyin(uap->anamelen, &namelen, sizeof (namelen));
-		if(error)
-			return (error);
-		if (namelen < 0)
+	if (name) {
+		*name = NULL;
+		if (*namelen < 0)
 			return (EINVAL);
 	}
-	NET_LOCK_GIANT();
-	error = fgetsock(td, uap->s, &head, &fflag);
+
+	fdp = td->td_proc->p_fd;
+	error = getsock(fdp, s, &headfp, &fflag);
 	if (error)
-		goto done2;
+		return (error);
+	head = headfp->f_data;
 	if ((head->so_options & SO_ACCEPTCONN) == 0) {
 		error = EINVAL;
 		goto done;
@@ -389,8 +426,8 @@
 	FILE_LOCK(nfp);
 	nfp->f_data = so;	/* nfp has ref count from falloc */
 	nfp->f_flag = fflag;
-	nfp->f_ops = &socketops;
 	nfp->f_type = DTYPE_SOCKET;
+	nfp->f_ops = &socketops;
 	FILE_UNLOCK(nfp);
 	/* Sync socket nonblocking/async state with file flags */
 	tmp = fflag & FNONBLOCK;
@@ -404,34 +441,21 @@
 		 * return a namelen of zero for older code which might
 		 * ignore the return value from accept.
 		 */
-		if (uap->name != NULL) {
-			namelen = 0;
-			(void) copyout(&namelen,
-			    uap->anamelen, sizeof(*uap->anamelen));
-		}
+		if (name)
+			*namelen = 0;
 		goto noconnection;
 	}
 	if (sa == NULL) {
-		namelen = 0;
-		if (uap->name)
-			goto gotnoname;
-		error = 0;
+		if (name)
+			*namelen = 0;
 		goto done;
 	}
-	if (uap->name) {
+	if (name) {
 		/* check sa_len before it is destroyed */
-		if (namelen > sa->sa_len)
-			namelen = sa->sa_len;
-#ifdef COMPAT_OLDSOCK
-		if (compat)
-			((struct osockaddr *)sa)->sa_family =
-			    sa->sa_family;
-#endif
-		error = copyout(sa, uap->name, (u_int)namelen);
-		if (!error)
-gotnoname:
-			error = copyout(&namelen,
-			    uap->anamelen, sizeof (*uap->anamelen));
+		if (*namelen > sa->sa_len)
+			*namelen = sa->sa_len;
+		*name = sa;
+		sa = NULL;
 	}
 noconnection:
 	if (sa)
@@ -445,20 +469,23 @@
 		fdclose(fdp, nfp, fd, td);
 
 	/*
-	 * Release explicitly held references before returning.
+	 * Release explicitly held references before returning.  We return
+	 * a reference on nfp to the caller on success if they request it.
 	 */
 done:
+	if (fp != NULL) {
+		if (error == 0) {
+			*fp = nfp;
+			nfp = NULL;
+		} else
+			*fp = NULL;
+	}
 	if (nfp != NULL)
 		fdrop(nfp, td);
-	fputsock(head);
-done2:
-	NET_UNLOCK_GIANT();
+	fdrop(headfp, td);
 	return (error);
 }
 
-/*
- * MPSAFE (accept1() is MPSAFE)
- */
 int
 accept(td, uap)
 	struct thread *td;
@@ -469,9 +496,6 @@
 }
 
 #ifdef COMPAT_OLDSOCK
-/*
- * MPSAFE (accept1() is MPSAFE)
- */
 int
 oaccept(td, uap)
 	struct thread *td;
@@ -482,14 +506,11 @@
 }
 #endif /* COMPAT_OLDSOCK */
 
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 connect(td, uap)
 	struct thread *td;
-	register struct connect_args /* {
+	struct connect_args /* {
 		int	s;
 		caddr_t	name;
 		int	namelen;
@@ -502,7 +523,9 @@
 	if (error)
 		return (error);
 
-	return (kern_connect(td, uap->s, sa));
+	error = kern_connect(td, uap->s, sa);
+	free(sa, M_SONAME);
+	return (error);
 }
 
 
@@ -517,10 +540,9 @@
 	int error;
 	int interrupted = 0;
 
-	NET_LOCK_GIANT();
-	error = getsock(td->td_proc->p_fd, fd, &fp);
+	error = getsock(td->td_proc->p_fd, fd, &fp, NULL);
 	if (error)
-		goto done2;
+		return (error);
 	so = fp->f_data;
 	if (so->so_state & SS_ISCONNECTING) {
 		error = EALREADY;
@@ -562,26 +584,20 @@
 		error = EINTR;
 done1:
 	fdrop(fp, td);
-done2:
-	NET_UNLOCK_GIANT();
-	FREE(sa, M_SONAME);
 	return (error);
 }
 
-/*
- * MPSAFE
- */
 int
 socketpair(td, uap)
 	struct thread *td;
-	register struct socketpair_args /* {
+	struct socketpair_args /* {
 		int	domain;
 		int	type;
 		int	protocol;
 		int	*rsv;
 	} */ *uap;
 {
-	register struct filedesc *fdp = td->td_proc->p_fd;
+	struct filedesc *fdp = td->td_proc->p_fd;
 	struct file *fp1, *fp2;
 	struct socket *so1, *so2;
 	int fd, error, sv[2];
@@ -594,11 +610,10 @@
 		return (error);
 #endif
 
-	NET_LOCK_GIANT();
 	error = socreate(uap->domain, &so1, uap->type, uap->protocol,
 	    td->td_ucred, td);
 	if (error)
-		goto done2;
+		return (error);
 	error = socreate(uap->domain, &so2, uap->type, uap->protocol,
 	    td->td_ucred, td);
 	if (error)
@@ -627,18 +642,21 @@
 	}
 	FILE_LOCK(fp1);
 	fp1->f_flag = FREAD|FWRITE;
-	fp1->f_ops = &socketops;
 	fp1->f_type = DTYPE_SOCKET;
+	fp1->f_ops = &socketops;
 	FILE_UNLOCK(fp1);
 	FILE_LOCK(fp2);
 	fp2->f_flag = FREAD|FWRITE;
-	fp2->f_ops = &socketops;
 	fp2->f_type = DTYPE_SOCKET;
+	fp2->f_ops = &socketops;
 	FILE_UNLOCK(fp2);
+	so1 = so2 = NULL;
 	error = copyout(sv, uap->rsv, 2 * sizeof (int));
+	if (error)
+		goto free4;
 	fdrop(fp1, td);
 	fdrop(fp2, td);
-	goto done2;
+	return (0);
 free4:
 	fdclose(fdp, fp2, sv[1], td);
 	fdrop(fp2, td);
@@ -646,19 +664,19 @@
 	fdclose(fdp, fp1, sv[0], td);
 	fdrop(fp1, td);
 free2:
-	(void)soclose(so2);
+	if (so2 != NULL)
+		(void)soclose(so2);
 free1:
-	(void)soclose(so1);
-done2:
-	NET_UNLOCK_GIANT();
+	if (so1 != NULL)
+		(void)soclose(so1);
 	return (error);
 }
 
 static int
 sendit(td, s, mp, flags)
-	register struct thread *td;
+	struct thread *td;
 	int s;
-	register struct msghdr *mp;
+	struct msghdr *mp;
 	int flags;
 {
 	struct mbuf *control;
@@ -691,7 +709,7 @@
 			goto bad;
 #ifdef COMPAT_OLDSOCK
 		if (mp->msg_flags == MSG_COMPAT) {
-			register struct cmsghdr *cm;
+			struct cmsghdr *cm;
 
 			M_PREPEND(control, sizeof(*cm), M_TRYWAIT);
 			if (control == 0) {
@@ -736,10 +754,9 @@
 	struct uio *ktruio = NULL;
 #endif
 
-	NET_LOCK_GIANT();
-	error = getsock(td->td_proc->p_fd, s, &fp);
+	error = getsock(td->td_proc->p_fd, s, &fp, NULL);
 	if (error)
-		goto bad2;
+		return (error);
 	so = (struct socket *)fp->f_data;
 
 #ifdef MAC
@@ -769,8 +786,7 @@
 		ktruio = cloneuio(&auio);
 #endif
 	len = auio.uio_resid;
-	error = so->so_proto->pr_usrreqs->pru_sosend(so, mp->msg_name, &auio,
-	    0, control, flags, td);
+	error = sosend(so, mp->msg_name, &auio, 0, control, flags, td);
 	if (error) {
 		if (auio.uio_resid != len && (error == ERESTART ||
 		    error == EINTR || error == EWOULDBLOCK))
@@ -793,18 +809,13 @@
 #endif
 bad:
 	fdrop(fp, td);
-bad2:
-	NET_UNLOCK_GIANT();
 	return (error);
 }
 
-/*
- * MPSAFE
- */
 int
 sendto(td, uap)
 	struct thread *td;
-	register struct sendto_args /* {
+	struct sendto_args /* {
 		int	s;
 		caddr_t	buf;
 		size_t	len;
@@ -832,13 +843,10 @@
 }
 
 #ifdef COMPAT_OLDSOCK
-/*
- * MPSAFE
- */
 int
 osend(td, uap)
 	struct thread *td;
-	register struct osend_args /* {
+	struct osend_args /* {
 		int	s;
 		caddr_t	buf;
 		int	len;
@@ -861,9 +869,6 @@
 	return (error);
 }
 
-/*
- * MPSAFE
- */
 int
 osendmsg(td, uap)
 	struct thread *td;
@@ -891,9 +896,6 @@
 }
 #endif
 
-/*
- * MPSAFE
- */
 int
 sendmsg(td, uap)
 	struct thread *td;
@@ -923,12 +925,11 @@
 }
 
 int
-kern_recvit(td, s, mp, namelenp, segflg, controlp)
+kern_recvit(td, s, mp, fromseg, controlp)
 	struct thread *td;
 	int s;
 	struct msghdr *mp;
-	void *namelenp;
-	enum uio_seg segflg;
+	enum uio_seg fromseg;
 	struct mbuf **controlp;
 {
 	struct uio auio;
@@ -948,12 +949,9 @@
 	if(controlp != NULL)
 		*controlp = 0;
 
-	NET_LOCK_GIANT();
-	error = getsock(td->td_proc->p_fd, s, &fp);
-	if (error) {
-		NET_UNLOCK_GIANT();
+	error = getsock(td->td_proc->p_fd, s, &fp, NULL);
+	if (error)
 		return (error);
-	}
 	so = fp->f_data;
 
 #ifdef MAC
@@ -962,14 +960,13 @@
 	SOCK_UNLOCK(so);
 	if (error) {
 		fdrop(fp, td);
-		NET_UNLOCK_GIANT();
 		return (error);
 	}
 #endif
 
 	auio.uio_iov = mp->msg_iov;
 	auio.uio_iovcnt = mp->msg_iovlen;
-	auio.uio_segflg = segflg;
+	auio.uio_segflg = UIO_USERSPACE;
 	auio.uio_rw = UIO_READ;
 	auio.uio_td = td;
 	auio.uio_offset = 0;			/* XXX */
@@ -978,7 +975,6 @@
 	for (i = 0; i < mp->msg_iovlen; i++, iov++) {
 		if ((auio.uio_resid += iov->iov_len) < 0) {
 			fdrop(fp, td);
-			NET_UNLOCK_GIANT();
 			return (EINVAL);
 		}
 	}
@@ -987,8 +983,8 @@
 		ktruio = cloneuio(&auio);
 #endif
 	len = auio.uio_resid;
-	error = so->so_proto->pr_usrreqs->pru_soreceive(so, &fromsa, &auio,
-	    (struct mbuf **)0, (mp->msg_control || controlp) ? &control : (struct mbuf **)0,
+	error = soreceive(so, &fromsa, &auio, (struct mbuf **)0,
+	    (mp->msg_control || controlp) ? &control : (struct mbuf **)0,
 	    &mp->msg_flags);
 	if (error) {
 		if (auio.uio_resid != (int)len && (error == ERESTART ||
@@ -1016,20 +1012,15 @@
 				((struct osockaddr *)fromsa)->sa_family =
 				    fromsa->sa_family;
 #endif
-			error = copyout(fromsa, mp->msg_name, (unsigned)len);
-			if (error)
-				goto out;
+			if (fromseg == UIO_USERSPACE) {
+				error = copyout(fromsa, mp->msg_name,
+				    (unsigned)len);
+				if (error)
+					goto out;
+			} else
+				bcopy(fromsa, mp->msg_name, len);
 		}
 		mp->msg_namelen = len;
-		if (namelenp &&
-		    (error = copyout(&len, namelenp, sizeof (socklen_t)))) {
-#ifdef COMPAT_OLDSOCK
-			if (mp->msg_flags & MSG_COMPAT)
-				error = 0;	/* old recvfrom didn't check */
-			else
-#endif
-			goto out;
-		}
 	}
 	if (mp->msg_control && controlp == NULL) {
 #ifdef COMPAT_OLDSOCK
@@ -1079,7 +1070,6 @@
 	}
 out:
 	fdrop(fp, td);
-	NET_UNLOCK_GIANT();
 	if (fromsa)
 		FREE(fromsa, M_SONAME);
 
@@ -1098,17 +1088,25 @@
 	struct msghdr *mp;
 	void *namelenp;
 {
+	int error;
 
-	return (kern_recvit(td, s, mp, namelenp, UIO_USERSPACE, NULL));
+	error = kern_recvit(td, s, mp, UIO_USERSPACE, NULL);
+	if (error)
+		return (error);
+	if (namelenp) {
+		error = copyout(&mp->msg_namelen, namelenp, sizeof (socklen_t));
+#ifdef COMPAT_OLDSOCK
+		if (mp->msg_flags & MSG_COMPAT)
+			error = 0;	/* old recvfrom didn't check */
+#endif
+	}
+	return (error);
 }
 
-/*
- * MPSAFE
- */
 int
 recvfrom(td, uap)
 	struct thread *td;
-	register struct recvfrom_args /* {
+	struct recvfrom_args /* {
 		int	s;
 		caddr_t	buf;
 		size_t	len;
@@ -1142,9 +1140,6 @@
 }
 
 #ifdef COMPAT_OLDSOCK
-/*
- * MPSAFE
- */
 int
 orecvfrom(td, uap)
 	struct thread *td;
@@ -1156,15 +1151,11 @@
 }
 #endif
 
-
 #ifdef COMPAT_OLDSOCK
-/*
- * MPSAFE
- */
 int
 orecv(td, uap)
 	struct thread *td;
-	register struct orecv_args /* {
+	struct orecv_args /* {
 		int	s;
 		caddr_t	buf;
 		int	len;
@@ -1191,8 +1182,6 @@
  * Old recvmsg.  This code takes advantage of the fact that the old msghdr
  * overlays the new one, missing only the flags, and with the (old) access
  * rights where the control fields are now.
- *
- * MPSAFE
  */
 int
 orecvmsg(td, uap)
@@ -1224,9 +1213,6 @@
 }
 #endif
 
-/*
- * MPSAFE
- */
 int
 recvmsg(td, uap)
 	struct thread *td;
@@ -1261,14 +1247,11 @@
 	return (error);
 }
 
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 shutdown(td, uap)
 	struct thread *td;
-	register struct shutdown_args /* {
+	struct shutdown_args /* {
 		int	s;
 		int	how;
 	} */ *uap;
@@ -1277,25 +1260,20 @@
 	struct file *fp;
 	int error;
 
-	NET_LOCK_GIANT();
-	error = getsock(td->td_proc->p_fd, uap->s, &fp);
+	error = getsock(td->td_proc->p_fd, uap->s, &fp, NULL);
 	if (error == 0) {
 		so = fp->f_data;
 		error = soshutdown(so, uap->how);
 		fdrop(fp, td);
 	}
-	NET_UNLOCK_GIANT();
 	return (error);
 }
 
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 setsockopt(td, uap)
 	struct thread *td;
-	register struct setsockopt_args /* {
+	struct setsockopt_args /* {
 		int	s;
 		int	level;
 		int	name;
@@ -1344,25 +1322,20 @@
 		panic("kern_setsockopt called with bad valseg");
 	}
 
-	NET_LOCK_GIANT();
-	error = getsock(td->td_proc->p_fd, s, &fp);
+	error = getsock(td->td_proc->p_fd, s, &fp, NULL);
 	if (error == 0) {
 		so = fp->f_data;
 		error = sosetopt(so, &sopt);
 		fdrop(fp, td);
 	}
-	NET_UNLOCK_GIANT();
 	return(error);
 }
 
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 getsockopt(td, uap)
 	struct thread *td;
-	register struct getsockopt_args /* {
+	struct getsockopt_args /* {
 		int	s;
 		int	level;
 		int	name;
@@ -1427,83 +1400,89 @@
 		panic("kern_getsockopt called with bad valseg");
 	}
 
-	NET_LOCK_GIANT();
-	error = getsock(td->td_proc->p_fd, s, &fp);
+	error = getsock(td->td_proc->p_fd, s, &fp, NULL);
 	if (error == 0) {
 		so = fp->f_data;
 		error = sogetopt(so, &sopt);
 		*valsize = sopt.sopt_valsize;
 		fdrop(fp, td);
 	}
-	NET_UNLOCK_GIANT();
 	return (error);
 }
 
 /*
  * getsockname1() - Get socket name.
- *
- * MPSAFE
  */
 /* ARGSUSED */
 static int
 getsockname1(td, uap, compat)
 	struct thread *td;
-	register struct getsockname_args /* {
+	struct getsockname_args /* {
 		int	fdes;
 		struct sockaddr * __restrict asa;
 		socklen_t * __restrict alen;
 	} */ *uap;
 	int compat;
 {
-	struct socket *so;
 	struct sockaddr *sa;
-	struct file *fp;
 	socklen_t len;
 	int error;
 
-	NET_LOCK_GIANT();
-	error = getsock(td->td_proc->p_fd, uap->fdes, &fp);
+	error = copyin(uap->alen, &len, sizeof(len));
 	if (error)
-		goto done2;
-	so = fp->f_data;
-	error = copyin(uap->alen, &len, sizeof (len));
-	if (error)
-		goto done1;
-	if (len < 0) {
-		error = EINVAL;
-		goto done1;
-	}
-	sa = 0;
-	error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &sa);
+		return (error);
+
+	error = kern_getsockname(td, uap->fdes, &sa, &len);
 	if (error)
-		goto bad;
-	if (sa == 0) {
-		len = 0;
-		goto gotnothing;
-	}
+		return (error);
 
-	len = MIN(len, sa->sa_len);
+	if (len != 0) {
 #ifdef COMPAT_OLDSOCK
-	if (compat)
-		((struct osockaddr *)sa)->sa_family = sa->sa_family;
+		if (compat)
+			((struct osockaddr *)sa)->sa_family = sa->sa_family;
 #endif
-	error = copyout(sa, uap->asa, (u_int)len);
+		error = copyout(sa, uap->asa, (u_int)len);
+	}
+	free(sa, M_SONAME);
 	if (error == 0)
-gotnothing:
-		error = copyout(&len, uap->alen, sizeof (len));
+		error = copyout(&len, uap->alen, sizeof(len));
+	return (error);
+}
+
+int
+kern_getsockname(struct thread *td, int fd, struct sockaddr **sa,
+    socklen_t *alen)
+{
+	struct socket *so;
+	struct file *fp;
+	socklen_t len;
+	int error;
+
+	if (*alen < 0)
+		return (EINVAL);
+
+	error = getsock(td->td_proc->p_fd, fd, &fp, NULL);
+	if (error)
+		return (error);
+	so = fp->f_data;
+	*sa = NULL;
+	error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, sa);
+	if (error)
+		goto bad;
+	if (*sa == NULL)
+		len = 0;
+	else
+		len = MIN(*alen, (*sa)->sa_len);
+	*alen = len;
 bad:
-	if (sa)
-		FREE(sa, M_SONAME);
-done1:
 	fdrop(fp, td);
-done2:
-	NET_UNLOCK_GIANT();
+	if (error && *sa) {
+		free(*sa, M_SONAME);
+		*sa = NULL;
+	}
 	return (error);
 }
 
-/*
- * MPSAFE
- */
 int
 getsockname(td, uap)
 	struct thread *td;
@@ -1514,9 +1493,6 @@
 }
 
 #ifdef COMPAT_OLDSOCK
-/*
- * MPSAFE
- */
 int
 ogetsockname(td, uap)
 	struct thread *td;
@@ -1529,74 +1505,82 @@
 
 /*
  * getpeername1() - Get name of peer for connected socket.
- *
- * MPSAFE
  */
 /* ARGSUSED */
 static int
 getpeername1(td, uap, compat)
 	struct thread *td;
-	register struct getpeername_args /* {
+	struct getpeername_args /* {
 		int	fdes;
 		struct sockaddr * __restrict	asa;
 		socklen_t * __restrict	alen;
 	} */ *uap;
 	int compat;
 {
-	struct socket *so;
 	struct sockaddr *sa;
+	socklen_t len;
+	int error;
+
+	error = copyin(uap->alen, &len, sizeof (len));
+	if (error)
+		return (error);
+
+	error = kern_getpeername(td, uap->fdes, &sa, &len);
+	if (error)
+		return (error);
+
+	if (len != 0) {
+#ifdef COMPAT_OLDSOCK
+		if (compat)
+			((struct osockaddr *)sa)->sa_family = sa->sa_family;
+#endif
+		error = copyout(sa, uap->asa, (u_int)len);
+	}
+	free(sa, M_SONAME);
+	if (error == 0)
+		error = copyout(&len, uap->alen, sizeof(len));
+	return (error);
+}
+
+int
+kern_getpeername(struct thread *td, int fd, struct sockaddr **sa,
+    socklen_t *alen)
+{
+	struct socket *so;
 	struct file *fp;
 	socklen_t len;
 	int error;
 
-	NET_LOCK_GIANT();
-	error = getsock(td->td_proc->p_fd, uap->fdes, &fp);
+	if (*alen < 0)
+		return (EINVAL);
+
+	error = getsock(td->td_proc->p_fd, fd, &fp, NULL);
 	if (error)
-		goto done2;
+		return (error);
 	so = fp->f_data;
 	if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) {
 		error = ENOTCONN;
-		goto done1;
-	}
-	error = copyin(uap->alen, &len, sizeof (len));
-	if (error)
-		goto done1;
-	if (len < 0) {
-		error = EINVAL;
-		goto done1;
+		goto done;
 	}
-	sa = 0;
-	error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, &sa);
+	*sa = NULL;
+	error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, sa);
 	if (error)
 		goto bad;
-	if (sa == 0) {
+	if (*sa == NULL)
 		len = 0;
-		goto gotnothing;
-	}
-	len = MIN(len, sa->sa_len);
-#ifdef COMPAT_OLDSOCK
-	if (compat)
-		((struct osockaddr *)sa)->sa_family =
-		    sa->sa_family;
-#endif
-	error = copyout(sa, uap->asa, (u_int)len);
-	if (error)
-		goto bad;
-gotnothing:
-	error = copyout(&len, uap->alen, sizeof (len));
+	else
+		len = MIN(*alen, (*sa)->sa_len);
+	*alen = len;
 bad:
-	if (sa)
-		FREE(sa, M_SONAME);
-done1:
+	if (error && *sa) {
+		free(*sa, M_SONAME);
+		*sa = NULL;
+	}
+done:
 	fdrop(fp, td);
-done2:
-	NET_UNLOCK_GIANT();
 	return (error);
 }
 
-/*
- * MPSAFE
- */
 int
 getpeername(td, uap)
 	struct thread *td;
@@ -1607,9 +1591,6 @@
 }
 
 #ifdef COMPAT_OLDSOCK
-/*
- * MPSAFE
- */
 int
 ogetpeername(td, uap)
 	struct thread *td;
@@ -1627,8 +1608,8 @@
 	caddr_t buf;
 	int buflen, type;
 {
-	register struct sockaddr *sa;
-	register struct mbuf *m;
+	struct sockaddr *sa;
+	struct mbuf *m;
 	int error;
 
 	if ((u_int)buflen > MLEN) {
@@ -1722,16 +1703,13 @@
 /*
  * sendfile(2)
  *
- * MPSAFE
- *
  * int sendfile(int fd, int s, off_t offset, size_t nbytes,
  *	 struct sf_hdtr *hdtr, off_t *sbytes, int flags)
  *
  * Send a file specified by 'fd' and starting at 'offset' to a socket
- * specified by 's'. Send only 'nbytes' of the file or until EOF if
- * nbytes == 0. Optionally add a header and/or trailer to the socket
- * output. If specified, write the total number of bytes sent into *sbytes.
- *
+ * specified by 's'. Send only 'nbytes' of the file or until EOF if nbytes ==
+ * 0.  Optionally add a header and/or trailer to the socket output.  If
+ * specified, write the total number of bytes sent into *sbytes.
  */
 int
 sendfile(struct thread *td, struct sendfile_args *uap)
@@ -1740,399 +1718,477 @@
 	return (do_sendfile(td, uap, 0));
 }
 
-#ifdef COMPAT_FREEBSD4
-int
-freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap)
-{
-	struct sendfile_args args;
-
-	args.fd = uap->fd;
-	args.s = uap->s;
-	args.offset = uap->offset;
-	args.nbytes = uap->nbytes;
-	args.hdtr = uap->hdtr;
-	args.sbytes = uap->sbytes;
-	args.flags = uap->flags;
-
-	return (do_sendfile(td, &args, 1));
-}
-#endif /* COMPAT_FREEBSD4 */
-
 static int
 do_sendfile(struct thread *td, struct sendfile_args *uap, int compat)
 {
-	struct vnode *vp;
-	struct vm_object *obj = NULL;
-	struct socket *so = NULL;
-	struct mbuf *m, *m_header = NULL;
-	struct sf_buf *sf;
-	struct vm_page *pg;
-	struct writev_args nuap;
 	struct sf_hdtr hdtr;
-	struct uio *hdr_uio = NULL;
-	off_t off, xfsize, hdtr_size, sbytes = 0;
-	int error, headersize = 0, headersent = 0;
+	struct uio *hdr_uio, *trl_uio;
+	int error;
+
+	hdr_uio = trl_uio = NULL;
+
+	if (uap->hdtr != NULL) {
+		error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
+		if (error)
+			goto out;
+		if (hdtr.headers != NULL) {
+			error = copyinuio(hdtr.headers, hdtr.hdr_cnt, &hdr_uio);
+			if (error)
+				goto out;
+		}
+		if (hdtr.trailers != NULL) {
+			error = copyinuio(hdtr.trailers, hdtr.trl_cnt, &trl_uio);
+			if (error)
+				goto out;
+
+		}
+	}
 
-	mtx_lock(&Giant);
+	error = kern_sendfile(td, uap, hdr_uio, trl_uio, compat);
+out:
+	if (hdr_uio)
+		free(hdr_uio, M_IOV);
+	if (trl_uio)
+		free(trl_uio, M_IOV);
+	return (error);
+}
 
-	hdtr_size = 0;
+#ifdef COMPAT_FREEBSD4
+int
+freebsd4_sendfile(struct thread *td, struct freebsd4_sendfile_args *uap)
+{
+	struct sendfile_args args;
+
+	args.fd = uap->fd;
+	args.s = uap->s;
+	args.offset = uap->offset;
+	args.nbytes = uap->nbytes;
+	args.hdtr = uap->hdtr;
+	args.sbytes = uap->sbytes;
+	args.flags = uap->flags;
+
+	return (do_sendfile(td, &args, 1));
+}
+#endif /* COMPAT_FREEBSD4 */
+
+int
+kern_sendfile(struct thread *td, struct sendfile_args *uap,
+    struct uio *hdr_uio, struct uio *trl_uio, int compat)
+{
+	struct file *sock_fp;
+	struct vnode *vp;
+	struct vm_object *obj = NULL;
+	struct socket *so = NULL;
+	struct mbuf *m = NULL;
+	struct sf_buf *sf;
+	struct vm_page *pg;
+	off_t off, xfsize, fsbytes = 0, sbytes = 0, rem = 0;
+	int error, hdrlen = 0, mnw = 0;
+	int vfslocked;
 
 	/*
-	 * The descriptor must be a regular file and have a backing VM object.
+	 * The file descriptor must be a regular file and have a
+	 * backing VM object.
+	 * File offset must be positive.  If it goes beyond EOF
+	 * we send only the header/trailer and no payload data.
 	 */
 	if ((error = fgetvp_read(td, uap->fd, &vp)) != 0)
-		goto done;
+		goto out;
+	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
-	if (vp->v_type == VREG)
+	if (vp->v_type == VREG) {
 		obj = vp->v_object;
+		if (obj != NULL) {
+			/*
+			 * Temporarily increase the backing VM
+			 * object's reference count so that a forced
+			 * reclamation of its vnode does not
+			 * immediately destroy it.
+			 */
+			VM_OBJECT_LOCK(obj);
+			if ((obj->flags & OBJ_DEAD) == 0) {
+				vm_object_reference_locked(obj);
+				VM_OBJECT_UNLOCK(obj);
+			} else {
+				VM_OBJECT_UNLOCK(obj);
+				obj = NULL;
+			}
+		}
+	}
 	VOP_UNLOCK(vp, 0, td);
+	VFS_UNLOCK_GIANT(vfslocked);
 	if (obj == NULL) {
 		error = EINVAL;
-		goto done;
+		goto out;
 	}
-	if ((error = fgetsock(td, uap->s, &so, NULL)) != 0)
-		goto done;
+	if (uap->offset < 0) {
+		error = EINVAL;
+		goto out;
+	}
+
+	/*
+	 * The socket must be a stream socket and connected.
+	 * Remember if it a blocking or non-blocking socket.
+	 */
+	if ((error = getsock(td->td_proc->p_fd, uap->s, &sock_fp,
+	    NULL)) != 0)
+		goto out;
+	so = sock_fp->f_data;
 	if (so->so_type != SOCK_STREAM) {
 		error = EINVAL;
-		goto done;
+		goto out;
 	}
 	if ((so->so_state & SS_ISCONNECTED) == 0) {
 		error = ENOTCONN;
-		goto done;
-	}
-	if (uap->offset < 0) {
-		error = EINVAL;
-		goto done;
+		goto out;
 	}
+	/*
+	 * Do not wait on memory allocations but return EAGAIN for the
+	 * caller to retry later.
+	 * XXX: Experimental.
+	 */
+	if (uap->flags & SF_MNOWAIT)
+		mnw = 1;
 
 #ifdef MAC
 	SOCK_LOCK(so);
 	error = mac_check_socket_send(td->td_ucred, so);
 	SOCK_UNLOCK(so);
 	if (error)
-		goto done;
+		goto out;
 #endif
 
-	/*
-	 * If specified, get the pointer to the sf_hdtr struct for
-	 * any headers/trailers.
-	 */
-	if (uap->hdtr != NULL) {
-		error = copyin(uap->hdtr, &hdtr, sizeof(hdtr));
-		if (error)
-			goto done;
-		/*
-		 * Send any headers.
-		 */
-		if (hdtr.headers != NULL) {
-			error = copyinuio(hdtr.headers, hdtr.hdr_cnt, &hdr_uio);
-			if (error)
-				goto done;
-			hdr_uio->uio_td = td;
-			hdr_uio->uio_rw = UIO_WRITE;
-			if (hdr_uio->uio_resid > 0) {
-				m_header = m_uiotombuf(hdr_uio, M_DONTWAIT, 0, 0);
-				if (m_header == NULL)
-					goto done;
-				headersize = m_header->m_pkthdr.len;
-				if (compat)
-					sbytes += headersize;
+	/* If headers are specified copy them into mbufs. */
+	if (hdr_uio != NULL) {
+		hdr_uio->uio_td = td;
+		hdr_uio->uio_rw = UIO_WRITE;
+		if (hdr_uio->uio_resid > 0) {
+			/*
+			 * In FBSD < 5.0 the nbytes to send also included
+			 * the header.  If compat is specified subtract the
+			 * header size from nbytes.
+			 */
+			if (compat) {
+				if (uap->nbytes > hdr_uio->uio_resid)
+					uap->nbytes -= hdr_uio->uio_resid;
+				else
+					uap->nbytes = 0;
 			}
+			m = m_uiotombuf(hdr_uio, (mnw ? M_NOWAIT : M_WAITOK),
+			    0, 0, 0);
+			if (m == NULL) {
+				error = mnw ? EAGAIN : ENOBUFS;
+				goto out;
+			}
+			hdrlen = m_length(m, NULL);
 		}
 	}
 
 	/*
 	 * Protect against multiple writers to the socket.
+	 *
+	 * XXXRW: Historically this has assumed non-interruptibility, so now
+	 * we implement that, but possibly shouldn't.
 	 */
-	SOCKBUF_LOCK(&so->so_snd);
-	(void) sblock(&so->so_snd, M_WAITOK);
-	SOCKBUF_UNLOCK(&so->so_snd);
+	(void)sblock(&so->so_snd, SBL_WAIT | SBL_NOINTR);
 
 	/*
-	 * Loop through the pages in the file, starting with the requested
+	 * Loop through the pages of the file, starting with the requested
 	 * offset. Get a file page (do I/O if necessary), map the file page
 	 * into an sf_buf, attach an mbuf header to the sf_buf, and queue
 	 * it on the socket.
+	 * This is done in two loops.  The inner loop turns as many pages
+	 * as it can into mbufs, up to the available socket buffer space,
+	 * without blocking, so they can be bulk delivered into the socket
+	 * send buffer.
+	 * The outer loop checks the state and available space of the socket
+	 * and takes care of the overall progress.
 	 */
-	for (off = uap->offset; ; off += xfsize, sbytes += xfsize) {
-		vm_pindex_t pindex;
-		vm_offset_t pgoff;
-
-		pindex = OFF_TO_IDX(off);
-		VM_OBJECT_LOCK(obj);
-retry_lookup:
-		/*
-		 * Calculate the amount to transfer. Not to exceed a page,
-		 * the EOF, or the passed in nbytes.
-		 */
-		xfsize = obj->un_pager.vnp.vnp_size - off;
-		VM_OBJECT_UNLOCK(obj);
-		if (xfsize > PAGE_SIZE)
-			xfsize = PAGE_SIZE;
-		pgoff = (vm_offset_t)(off & PAGE_MASK);
-		if (PAGE_SIZE - pgoff < xfsize)
-			xfsize = PAGE_SIZE - pgoff;
-		if (uap->nbytes && xfsize > (uap->nbytes - sbytes))
-			xfsize = uap->nbytes - sbytes;
-		if (xfsize <= 0) {
-			if (m_header != NULL) {
-				m = m_header;
-				m_header = NULL;
-				SOCKBUF_LOCK(&so->so_snd);
-				goto retry_space;
-			} else
-				break;
-		}
+	for (off = uap->offset, rem = uap->nbytes; ; ) {
+		int loopbytes = 0;
+		int space = 0;
+		int done = 0;
+
 		/*
-		 * Optimize the non-blocking case by looking at the socket space
-		 * before going to the extra work of constituting the sf_buf.
+		 * Check the socket state for ongoing connection,
+		 * no errors and space in socket buffer.
+		 * If space is low allow for the remainder of the
+		 * file to be processed if it fits the socket buffer.
+		 * Otherwise block in waiting for sufficient space
+		 * to proceed, or if the socket is nonblocking, return
+		 * to userland with EAGAIN while reporting how far
+		 * we've come.
+		 * We wait until the socket buffer has significant free
+		 * space to do bulk sends.  This makes good use of file
+		 * system read ahead and allows packet segmentation
+		 * offloading hardware to take over lots of work.  If
+		 * we were not careful here we would send off only one
+		 * sfbuf at a time.
 		 */
 		SOCKBUF_LOCK(&so->so_snd);
-		if ((so->so_state & SS_NBIO) && sbspace(&so->so_snd) <= 0) {
-			if (so->so_snd.sb_state & SBS_CANTSENDMORE)
-				error = EPIPE;
-			else
-				error = EAGAIN;
-			sbunlock(&so->so_snd);
+		if (so->so_snd.sb_lowat < so->so_snd.sb_hiwat / 2)
+			so->so_snd.sb_lowat = so->so_snd.sb_hiwat / 2;
+retry_space:
+		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
+			error = EPIPE;
+			SOCKBUF_UNLOCK(&so->so_snd);
+			goto done;
+		} else if (so->so_error) {
+			error = so->so_error;
+			so->so_error = 0;
 			SOCKBUF_UNLOCK(&so->so_snd);
 			goto done;
 		}
-		SOCKBUF_UNLOCK(&so->so_snd);
-		VM_OBJECT_LOCK(obj);
-		/*
-		 * Attempt to look up the page.
-		 *
-		 *	Allocate if not found
-		 *
-		 *	Wait and loop if busy.
-		 */
-		pg = vm_page_lookup(obj, pindex);
-
-		if (pg == NULL) {
-			pg = vm_page_alloc(obj, pindex, VM_ALLOC_NOBUSY |
-			    VM_ALLOC_NORMAL | VM_ALLOC_WIRED);
-			if (pg == NULL) {
-				VM_OBJECT_UNLOCK(obj);
-				VM_WAIT;
-				VM_OBJECT_LOCK(obj);
-				goto retry_lookup;
+		space = sbspace(&so->so_snd);
+		if (space < rem &&
+		    (space <= 0 ||
+		     space < so->so_snd.sb_lowat)) {
+			if (so->so_state & SS_NBIO) {
+				SOCKBUF_UNLOCK(&so->so_snd);
+				error = EAGAIN;
+				goto done;
 			}
-			vm_page_lock_queues();
-		} else {
-			vm_page_lock_queues();
-			if (vm_page_sleep_if_busy(pg, TRUE, "sfpbsy"))
-				goto retry_lookup;
 			/*
-			 * Wire the page so it does not get ripped out from
-			 * under us.
+			 * sbwait drops the lock while sleeping.
+			 * When we loop back to retry_space the
+			 * state may have changed and we retest
+			 * for it.
+			 */
+			error = sbwait(&so->so_snd);
+			/*
+			 * An error from sbwait usually indicates that we've
+			 * been interrupted by a signal. If we've sent anything
+			 * then return bytes sent, otherwise return the error.
 			 */
-			vm_page_wire(pg);
+			if (error) {
+				SOCKBUF_UNLOCK(&so->so_snd);
+				goto done;
+			}
+			goto retry_space;
 		}
+		SOCKBUF_UNLOCK(&so->so_snd);
 
 		/*
-		 * If page is not valid for what we need, initiate I/O
+		 * Reduce space in the socket buffer by the size of
+		 * the header mbuf chain.
+		 * hdrlen is set to 0 after the first loop.
 		 */
+		space -= hdrlen;
 
-		if (pg->valid && vm_page_is_valid(pg, pgoff, xfsize)) {
-			VM_OBJECT_UNLOCK(obj);
-		} else if (uap->flags & SF_NODISKIO) {
-			error = EBUSY;
-		} else {
-			int bsize, resid;
+		/*
+		 * Loop and construct maximum sized mbuf chain to be bulk
+		 * dumped into socket buffer.
+		 */
+		while(space > loopbytes) {
+			vm_pindex_t pindex;
+			vm_offset_t pgoff;
+			struct mbuf *m0;
 
+			VM_OBJECT_LOCK(obj);
+			/*
+			 * Calculate the amount to transfer.
+			 * Not to exceed a page, the EOF,
+			 * or the passed in nbytes.
+			 */
+			pgoff = (vm_offset_t)(off & PAGE_MASK);
+			xfsize = omin(PAGE_SIZE - pgoff,
+			    obj->un_pager.vnp.vnp_size - uap->offset -
+			    fsbytes - loopbytes);
+			if (uap->nbytes)
+				rem = (uap->nbytes - fsbytes - loopbytes);
+			else
+				rem = obj->un_pager.vnp.vnp_size -
+				    uap->offset - fsbytes - loopbytes;
+			xfsize = omin(rem, xfsize);
+			if (xfsize <= 0) {
+				VM_OBJECT_UNLOCK(obj);
+				done = 1;		/* all data sent */
+				break;
+			}
 			/*
-			 * Ensure that our page is still around when the I/O
-			 * completes.
+			 * Don't overflow the send buffer.
+			 * Stop here and send out what we've
+			 * already got.
 			 */
-			vm_page_io_start(pg);
-			vm_page_unlock_queues();
-			VM_OBJECT_UNLOCK(obj);
+			if (space < loopbytes + xfsize) {
+				VM_OBJECT_UNLOCK(obj);
+				break;
+			}
 
 			/*
-			 * Get the page from backing store.
+			 * Attempt to look up the page.  Allocate
+			 * if not found or wait and loop if busy.
 			 */
-			bsize = vp->v_mount->mnt_stat.f_iosize;
-			vn_lock(vp, LK_SHARED | LK_RETRY, td);
+			pindex = OFF_TO_IDX(off);
+			pg = vm_page_grab(obj, pindex, VM_ALLOC_NOBUSY |
+			    VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_RETRY);
+
 			/*
-			 * XXXMAC: Because we don't have fp->f_cred here,
-			 * we pass in NOCRED.  This is probably wrong, but
-			 * is consistent with our original implementation.
+			 * Check if page is valid for what we need,
+			 * otherwise initiate I/O.
+			 * If we already turned some pages into mbufs,
+			 * send them off before we come here again and
+			 * block.
 			 */
-			error = vn_rdwr(UIO_READ, vp, NULL, MAXBSIZE,
-			    trunc_page(off), UIO_NOCOPY, IO_NODELOCKED |
-			    IO_VMIO | ((MAXBSIZE / bsize) << IO_SEQSHIFT),
-			    td->td_ucred, NOCRED, &resid, td);
-			VOP_UNLOCK(vp, 0, td);
-			VM_OBJECT_LOCK(obj);
-			vm_page_lock_queues();
-			vm_page_io_finish(pg);
-			if (!error)
+			if (pg->valid && vm_page_is_valid(pg, pgoff, xfsize))
 				VM_OBJECT_UNLOCK(obj);
-			mbstat.sf_iocnt++;
-		}
-	
-		if (error) {
-			vm_page_unwire(pg, 0);
+			else if (m != NULL)
+				error = EAGAIN;	/* send what we already got */
+			else if (uap->flags & SF_NODISKIO)
+				error = EBUSY;
+			else {
+				int bsize, resid;
+
+				/*
+				 * Ensure that our page is still around
+				 * when the I/O completes.
+				 */
+				vm_page_io_start(pg);
+				VM_OBJECT_UNLOCK(obj);
+
+				/*
+				 * Get the page from backing store.
+				 */
+				bsize = vp->v_mount->mnt_stat.f_iosize;
+				vfslocked = VFS_LOCK_GIANT(vp->v_mount);
+				vn_lock(vp, LK_SHARED | LK_RETRY, td);
+
+				/*
+				 * XXXMAC: Because we don't have fp->f_cred
+				 * here, we pass in NOCRED.  This is probably
+				 * wrong, but is consistent with our original
+				 * implementation.
+				 */
+				error = vn_rdwr(UIO_READ, vp, NULL, MAXBSIZE,
+				    trunc_page(off), UIO_NOCOPY, IO_NODELOCKED |
+				    IO_VMIO | ((MAXBSIZE / bsize) << IO_SEQSHIFT),
+				    td->td_ucred, NOCRED, &resid, td);
+				VOP_UNLOCK(vp, 0, td);
+				VFS_UNLOCK_GIANT(vfslocked);
+				VM_OBJECT_LOCK(obj);
+				vm_page_io_finish(pg);
+				if (!error)
+					VM_OBJECT_UNLOCK(obj);
+				mbstat.sf_iocnt++;
+			}
+			if (error) {
+				vm_page_lock_queues();
+				vm_page_unwire(pg, 0);
+				/*
+				 * See if anyone else might know about
+				 * this page.  If not and it is not valid,
+				 * then free it.
+				 */
+				if (pg->wire_count == 0 && pg->valid == 0 &&
+				    pg->busy == 0 && !(pg->oflags & VPO_BUSY) &&
+				    pg->hold_count == 0) {
+					vm_page_free(pg);
+				}
+				vm_page_unlock_queues();
+				VM_OBJECT_UNLOCK(obj);
+				if (error == EAGAIN)
+					error = 0;	/* not a real error */
+				break;
+			}
+
 			/*
-			 * See if anyone else might know about this page.
-			 * If not and it is not valid, then free it.
+			 * Get a sendfile buf.  We usually wait as long
+			 * as necessary, but this wait can be interrupted.
 			 */
-			if (pg->wire_count == 0 && pg->valid == 0 &&
-			    pg->busy == 0 && !(pg->flags & PG_BUSY) &&
-			    pg->hold_count == 0) {
-				vm_page_free(pg);
+			if ((sf = sf_buf_alloc(pg,
+			    (mnw ? SFB_NOWAIT : SFB_CATCH))) == NULL) {
+				mbstat.sf_allocfail++;
+				vm_page_lock_queues();
+				vm_page_unwire(pg, 0);
+				/*
+				 * XXX: Not same check as above!?
+				 */
+				if (pg->wire_count == 0 && pg->object == NULL)
+					vm_page_free(pg);
+				vm_page_unlock_queues();
+				error = (mnw ? EAGAIN : EINTR);
+				break;
 			}
-			vm_page_unlock_queues();
-			VM_OBJECT_UNLOCK(obj);
-			SOCKBUF_LOCK(&so->so_snd);
-			sbunlock(&so->so_snd);
-			SOCKBUF_UNLOCK(&so->so_snd);
-			goto done;
-		}
-		vm_page_unlock_queues();
 
-		/*
-		 * Get a sendfile buf. We usually wait as long as necessary,
-		 * but this wait can be interrupted.
-		 */
-		if ((sf = sf_buf_alloc(pg, SFB_CATCH)) == NULL) {
-			mbstat.sf_allocfail++;
-			vm_page_lock_queues();
-			vm_page_unwire(pg, 0);
-			if (pg->wire_count == 0 && pg->object == NULL)
-				vm_page_free(pg);
-			vm_page_unlock_queues();
-			SOCKBUF_LOCK(&so->so_snd);
-			sbunlock(&so->so_snd);
-			SOCKBUF_UNLOCK(&so->so_snd);
-			error = EINTR;
-			goto done;
-		}
+			/*
+			 * Get an mbuf and set it up as having
+			 * external storage.
+			 */
+			m0 = m_get((mnw ? M_NOWAIT : M_WAITOK), MT_DATA);
+			if (m0 == NULL) {
+				error = (mnw ? EAGAIN : ENOBUFS);
+				sf_buf_mext((void *)sf_buf_kva(sf), sf);
+				break;
+			}
+			MEXTADD(m0, sf_buf_kva(sf), PAGE_SIZE, sf_buf_mext,
+			    sf, M_RDONLY, EXT_SFBUF);
+			m0->m_data = (char *)sf_buf_kva(sf) + pgoff;
+			m0->m_len = xfsize;
+
+			/* Append to mbuf chain. */
+			if (m != NULL)
+				m_cat(m, m0);
+			else
+				m = m0;
 
-		/*
-		 * Get an mbuf header and set it up as having external storage.
-		 */
-		if (m_header)
-			MGET(m, M_TRYWAIT, MT_DATA);
-		else
-			MGETHDR(m, M_TRYWAIT, MT_DATA);
-		if (m == NULL) {
-			error = ENOBUFS;
-			sf_buf_mext((void *)sf_buf_kva(sf), sf);
-			SOCKBUF_LOCK(&so->so_snd);
-			sbunlock(&so->so_snd);
-			SOCKBUF_UNLOCK(&so->so_snd);
-			goto done;
-		}
-		/*
-		 * Setup external storage for mbuf.
-		 */
-		MEXTADD(m, sf_buf_kva(sf), PAGE_SIZE, sf_buf_mext, sf, M_RDONLY,
-		    EXT_SFBUF);
-		m->m_data = (char *)sf_buf_kva(sf) + pgoff;
-		m->m_pkthdr.len = m->m_len = xfsize;
-
-		if (m_header) {
-			m_cat(m_header, m);
-			m = m_header;
-			m_header = NULL;
-			m_fixhdr(m);
+			/* Keep track of bits processed. */
+			loopbytes += xfsize;
+			off += xfsize;
 		}
 
-		/*
-		 * Add the buffer to the socket buffer chain.
-		 */
-		SOCKBUF_LOCK(&so->so_snd);
-retry_space:
-		/*
-		 * Make sure that the socket is still able to take more data.
-		 * CANTSENDMORE being true usually means that the connection
-		 * was closed. so_error is true when an error was sensed after
-		 * a previous send.
-		 * The state is checked after the page mapping and buffer
-		 * allocation above since those operations may block and make
-		 * any socket checks stale. From this point forward, nothing
-		 * blocks before the pru_send (or more accurately, any blocking
-		 * results in a loop back to here to re-check).
-		 */
-		SOCKBUF_LOCK_ASSERT(&so->so_snd);
-		if ((so->so_snd.sb_state & SBS_CANTSENDMORE) || so->so_error) {
+		/* Add the buffer chain to the socket buffer. */
+		if (m != NULL) {
+			int mlen, err;
+
+			mlen = m_length(m, NULL);
+			SOCKBUF_LOCK(&so->so_snd);
 			if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
 				error = EPIPE;
-			} else {
-				error = so->so_error;
-				so->so_error = 0;
-			}
-			m_freem(m);
-			sbunlock(&so->so_snd);
-			SOCKBUF_UNLOCK(&so->so_snd);
-			goto done;
-		}
-		/*
-		 * Wait for socket space to become available. We do this just
-		 * after checking the connection state above in order to avoid
-		 * a race condition with sbwait().
-		 */
-		if (sbspace(&so->so_snd) < so->so_snd.sb_lowat) {
-			if (so->so_state & SS_NBIO) {
-				m_freem(m);
-				sbunlock(&so->so_snd);
 				SOCKBUF_UNLOCK(&so->so_snd);
-				error = EAGAIN;
 				goto done;
 			}
-			error = sbwait(&so->so_snd);
-			/*
-			 * An error from sbwait usually indicates that we've
-			 * been interrupted by a signal. If we've sent anything
-			 * then return bytes sent, otherwise return the error.
-			 */
-			if (error) {
-				m_freem(m);
-				sbunlock(&so->so_snd);
-				SOCKBUF_UNLOCK(&so->so_snd);
-				goto done;
-			}
-			goto retry_space;
-		}
-		SOCKBUF_UNLOCK(&so->so_snd);
-		error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, m, 0, 0, td);
-		if (error) {
-			SOCKBUF_LOCK(&so->so_snd);
-			sbunlock(&so->so_snd);
 			SOCKBUF_UNLOCK(&so->so_snd);
-			goto done;
+			/* Avoid error aliasing. */
+			err = (*so->so_proto->pr_usrreqs->pru_send)
+				    (so, 0, m, NULL, NULL, td);
+			if (err == 0) {
+				/*
+				 * We need two counters to get the
+				 * file offset and nbytes to send
+				 * right:
+				 * - sbytes contains the total amount
+				 *   of bytes sent, including headers.
+				 * - fsbytes contains the total amount
+				 *   of bytes sent from the file.
+				 */
+				sbytes += mlen;
+				fsbytes += mlen;
+				if (hdrlen) {
+					fsbytes -= hdrlen;
+					hdrlen = 0;
+				}
+			} else if (error == 0)
+				error = err;
+			m = NULL;	/* pru_send always consumes */
 		}
-		headersent = 1;
+
+		/* Quit outer loop on error or when we're done. */
+		if (error || done)
+			goto done;
 	}
-	SOCKBUF_LOCK(&so->so_snd);
-	sbunlock(&so->so_snd);
-	SOCKBUF_UNLOCK(&so->so_snd);
 
 	/*
 	 * Send trailers. Wimp out and use writev(2).
 	 */
-	if (uap->hdtr != NULL && hdtr.trailers != NULL) {
-			nuap.fd = uap->s;
-			nuap.iovp = hdtr.trailers;
-			nuap.iovcnt = hdtr.trl_cnt;
-			error = writev(td, &nuap);
-			if (error)
-				goto done;
-			if (compat)
-				sbytes += td->td_retval[0];
-			else
-				hdtr_size += td->td_retval[0];
+	if (trl_uio != NULL) {
+		error = kern_writev(td, uap->s, trl_uio);
+		if (error)
+			goto done;
+		sbytes += td->td_retval[0];
 	}
 
 done:
-	if (headersent) {
-		if (!compat)
-			hdtr_size += headersize;
-	} else {
-		if (compat)
-			sbytes -= headersize;
-	}
+	sbunlock(&so->so_snd);
+out:
 	/*
 	 * If there was no error we have to clear td->td_retval[0]
 	 * because it may have been set by writev.
@@ -2141,23 +2197,464 @@
 		td->td_retval[0] = 0;
 	}
 	if (uap->sbytes != NULL) {
-		if (!compat)
-			sbytes += hdtr_size;
 		copyout(&sbytes, uap->sbytes, sizeof(off_t));
 	}
-	if (vp)
+	if (obj != NULL)
+		vm_object_deallocate(obj);
+	if (vp != NULL) {
+		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 		vrele(vp);
+		VFS_UNLOCK_GIANT(vfslocked);
+	}
 	if (so)
-		fputsock(so);
-	if (hdr_uio != NULL)
-		free(hdr_uio, M_IOV);
-	if (m_header)
-		m_freem(m_header);
-
-	mtx_unlock(&Giant);
+		fdrop(sock_fp, td);
+	if (m)
+		m_freem(m);
 
 	if (error == ERESTART)
 		error = EINTR;
 
 	return (error);
 }
+
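From userland the interface this code implements is still the familiar FreeBSD sendfile(2): an optional struct sf_hdtr supplies header and trailer iovecs, and the total number of bytes pushed out (headers, file data and trailers) is reported through *sbytes, which is what the copyout near the end of the function fills in.  A minimal, hedged usage sketch follows; it assumes the standard prototype from <sys/socket.h>, a connected stream socket in sock, an open regular file in fd, and it trims error handling to the essentials.

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Sketch: send a small header followed by a whole file over a socket. */
static int
send_file_with_header(int fd, int sock)
{
	char hdr[] = "HTTP/1.0 200 OK\r\nContent-Type: text/plain\r\n\r\n";
	struct iovec hiov = { .iov_base = hdr, .iov_len = strlen(hdr) };
	struct sf_hdtr hdtr = { .headers = &hiov, .hdr_cnt = 1,
				.trailers = NULL, .trl_cnt = 0 };
	off_t sbytes = 0;

	/* offset 0, nbytes 0 == "until end of file". */
	if (sendfile(fd, sock, 0, 0, &hdtr, &sbytes, 0) == -1) {
		perror("sendfile");
		return (-1);
	}
	printf("sent %jd bytes, %zu of them header\n",
	    (intmax_t)sbytes, strlen(hdr));
	return (0);
}

Passing nbytes == 0 asks for the remainder of the file, which matches the outer loop above that keeps appending pages until the file or the request is exhausted.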
+/*
+ * SCTP syscalls.
+ * Functionality only compiled in if SCTP is defined in the kernel Makefile,
+ * otherwise all return EOPNOTSUPP.
+ * XXX: We should make this loadable one day.
+ */
+int
+sctp_peeloff(td, uap)
+	struct thread *td;
+	struct sctp_peeloff_args /* {
+		int	sd;
+		caddr_t	name;
+	} */ *uap;
+{
+#ifdef SCTP
+	struct filedesc *fdp;
+	struct file *nfp = NULL;
+	int error;
+	struct socket *head, *so;
+	int fd;
+	u_int fflag;
+
+	fdp = td->td_proc->p_fd;
+	error = fgetsock(td, uap->sd, &head, &fflag);
+	if (error)
+		goto done2;
+	error = sctp_can_peel_off(head, (sctp_assoc_t)uap->name);
+	if (error)
+		goto done2;
+	/*
+	 * At this point we know we do have an assoc to pull;
+	 * we proceed to get the fd set up.  This may block,
+	 * but that is ok.
+	 */
+
+	error = falloc(td, &nfp, &fd);
+	if (error)
+		goto done;
+	td->td_retval[0] = fd;
+
+	so = sonewconn(head, SS_ISCONNECTED);
+	if (so == NULL) 
+		goto noconnection;
+	/*
+	 * Before changing the flags on the socket, we have to bump the
+	 * reference count.  Otherwise, if the protocol calls sofree(),
+	 * the socket will be released due to a zero refcount.
+	 */
+	SOCK_LOCK(so);
+	soref(so);			/* file descriptor reference */
+	SOCK_UNLOCK(so);
+
+	ACCEPT_LOCK();
+
+	TAILQ_REMOVE(&head->so_comp, so, so_list);
+	head->so_qlen--;
+	so->so_state |= (head->so_state & SS_NBIO);
+	so->so_state &= ~SS_NOFDREF;
+	so->so_qstate &= ~SQ_COMP;
+	so->so_head = NULL;
+	ACCEPT_UNLOCK();
+	FILE_LOCK(nfp);
+	nfp->f_data = so;
+	nfp->f_flag = fflag;
+	nfp->f_type = DTYPE_SOCKET;
+	nfp->f_ops = &socketops;
+	FILE_UNLOCK(nfp);
+	error = sctp_do_peeloff(head, so, (sctp_assoc_t)uap->name);
+	if (error)
+		goto noconnection;
+	if (head->so_sigio != NULL)
+		fsetown(fgetown(&head->so_sigio), &so->so_sigio);
+
+noconnection:
+	/*
+	 * close the new descriptor, assuming someone hasn't ripped it
+	 * out from under us.
+	 */
+	if (error)
+		fdclose(fdp, nfp, fd, td);
+
+	/*
+	 * Release explicitly held references before returning.
+	 */
+done:
+	if (nfp != NULL)
+		fdrop(nfp, td);
+	fputsock(head);
+done2:
+	return (error);
+#else  /* SCTP */
+	return (EOPNOTSUPP);
+#endif /* SCTP */
+}
+
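For completeness, this is how the new syscall is normally reached from userland.  The sketch assumes the usual sctp_peeloff() libc wrapper and the sctp_assoc_t type from <netinet/sctp.h>; treat those names as assumptions and check the headers that ship with the tree.

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/sctp.h>
#include <stdio.h>

/*
 * Sketch: peel a single association off a one-to-many SCTP socket so it
 * can be handled like an ordinary one-to-one socket.  assoc_id would
 * normally come from an SCTP_ASSOC_CHANGE notification or a sndrcvinfo.
 */
static int
peel_off_assoc(int one_to_many_sd, sctp_assoc_t assoc_id)
{
	int peeled = sctp_peeloff(one_to_many_sd, assoc_id);

	if (peeled == -1) {
		perror("sctp_peeloff");
		return (-1);
	}
	/* 'peeled' now behaves like a connected one-to-one SCTP fd. */
	return (peeled);
}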
+int
+sctp_generic_sendmsg (td, uap)
+	struct thread *td;
+	struct sctp_generic_sendmsg_args /* {
+		int sd, 
+		caddr_t msg, 
+		int mlen, 
+		caddr_t to, 
+		__socklen_t tolen, 
+		struct sctp_sndrcvinfo *sinfo, 
+		int flags
+	} */ *uap;
+{
+#ifdef SCTP
+	struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL;
+	struct socket *so;
+	struct file *fp = NULL;
+	int use_rcvinfo = 1;
+	int error = 0, len;
+	struct sockaddr *to = NULL;
+#ifdef KTRACE
+	struct uio *ktruio = NULL;
+#endif
+	struct uio auio;
+	struct iovec iov[1];
+
+	if (uap->sinfo) {
+		error = copyin(uap->sinfo, &sinfo, sizeof (sinfo));
+		if (error)
+			return (error);
+		u_sinfo = &sinfo;
+	}
+	if (uap->tolen) {
+		error = getsockaddr(&to, uap->to, uap->tolen);
+		if (error) {
+			to = NULL;
+			goto sctp_bad2;
+		}
+	}
+
+	error = getsock(td->td_proc->p_fd, uap->sd, &fp, NULL);
+	if (error)
+		goto sctp_bad;
+
+	iov[0].iov_base = uap->msg;
+	iov[0].iov_len = uap->mlen;
+
+	so = (struct socket *)fp->f_data;
+#ifdef MAC
+	SOCK_LOCK(so);
+	error = mac_check_socket_send(td->td_ucred, so);
+	SOCK_UNLOCK(so);
+	if (error)
+		goto sctp_bad;
+#endif /* MAC */
+
+	auio.uio_iov =  iov;
+	auio.uio_iovcnt = 1;
+	auio.uio_segflg = UIO_USERSPACE;
+	auio.uio_rw = UIO_WRITE;
+	auio.uio_td = td;
+	auio.uio_offset = 0;			/* XXX */
+	auio.uio_resid = 0;
+	len = auio.uio_resid = uap->mlen;
+	error = sctp_lower_sosend(so, to, &auio,
+		    (struct mbuf *)NULL, (struct mbuf *)NULL,
+		    uap->flags, use_rcvinfo, u_sinfo, td);
+	if (error) {
+		if (auio.uio_resid != len && (error == ERESTART ||
+		    error == EINTR || error == EWOULDBLOCK))
+			error = 0;
+		/* Generation of SIGPIPE can be controlled per socket. */
+		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
+		    !(uap->flags & MSG_NOSIGNAL)) {
+			PROC_LOCK(td->td_proc);
+			psignal(td->td_proc, SIGPIPE);
+			PROC_UNLOCK(td->td_proc);
+		}
+	}
+	if (error == 0)
+		td->td_retval[0] = len - auio.uio_resid;
+#ifdef KTRACE
+	if (ktruio != NULL) {
+		ktruio->uio_resid = td->td_retval[0];
+		ktrgenio(uap->sd, UIO_WRITE, ktruio, error);
+	}
+#endif /* KTRACE */
+sctp_bad:
+	if (fp)
+		fdrop(fp, td);
+sctp_bad2:
+	if (to)
+		free(to, M_SONAME);
+	return (error);
+#else  /* SCTP */
+	return (EOPNOTSUPP);
+#endif /* SCTP */
+}
+
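A corresponding userland sketch for the send path, assuming the conventional sctp_sendmsg() wrapper (which is what funnels into sctp_generic_sendmsg()); the wrapper name and argument order are the commonly documented ones, not something taken from this diff.

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/sctp.h>
#include <stdio.h>
#include <string.h>

/* Sketch: send one message on stream 0 of an SCTP association. */
static int
send_on_stream(int sd, struct sockaddr_in *to, const char *msg)
{
	ssize_t n;

	n = sctp_sendmsg(sd, msg, strlen(msg),
	    (struct sockaddr *)to, sizeof(*to),
	    0 /* ppid */, 0 /* flags */, 0 /* stream */,
	    0 /* ttl */, 0 /* context */);
	if (n == -1) {
		perror("sctp_sendmsg");
		return (-1);
	}
	printf("queued %zd bytes\n", n);
	return (0);
}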
+int
+sctp_generic_sendmsg_iov(td, uap)
+	struct thread *td;
+	struct sctp_generic_sendmsg_iov_args /* {
+		int sd, 
+		struct iovec *iov, 
+		int iovlen, 
+		caddr_t to, 
+		__socklen_t tolen, 
+		struct sctp_sndrcvinfo *sinfo, 
+		int flags
+	} */ *uap;
+{
+#ifdef SCTP
+	struct sctp_sndrcvinfo sinfo, *u_sinfo = NULL;
+	struct socket *so;
+	struct file *fp = NULL;
+	int use_rcvinfo = 1;
+	int error=0, len, i;
+	struct sockaddr *to = NULL;
+#ifdef KTRACE
+	struct uio *ktruio = NULL;
+#endif
+	struct uio auio;
+	struct iovec *iov, *tiov;
+
+	if (uap->sinfo) {
+		error = copyin(uap->sinfo, &sinfo, sizeof (sinfo));
+		if (error)
+			return (error);
+		u_sinfo = &sinfo;
+	}
+	if (uap->tolen) {
+		error = getsockaddr(&to, uap->to, uap->tolen);
+		if (error) {
+			to = NULL;
+			goto sctp_bad2;
+		}
+	}
+
+	error = getsock(td->td_proc->p_fd, uap->sd, &fp, NULL);
+	if (error)
+		goto sctp_bad1;
+
+	error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE);
+	if (error)
+		goto sctp_bad1;
+
+	so = (struct socket *)fp->f_data;
+#ifdef MAC
+	SOCK_LOCK(so);
+	error = mac_check_socket_send(td->td_ucred, so);
+	SOCK_UNLOCK(so);
+	if (error)
+		goto sctp_bad;
+#endif /* MAC */
+
+	auio.uio_iov =  iov;
+	auio.uio_iovcnt = uap->iovlen;
+	auio.uio_segflg = UIO_USERSPACE;
+	auio.uio_rw = UIO_WRITE;
+	auio.uio_td = td;
+	auio.uio_offset = 0;			/* XXX */
+	auio.uio_resid = 0;
+	tiov = iov;
+	for (i = 0; i <uap->iovlen; i++, tiov++) {
+		if ((auio.uio_resid += tiov->iov_len) < 0) {
+			error = EINVAL;
+			goto sctp_bad;
+		}
+	}
+	len = auio.uio_resid;
+	error = sctp_lower_sosend(so, to, &auio,
+		    (struct mbuf *)NULL, (struct mbuf *)NULL,
+		    uap->flags, use_rcvinfo, u_sinfo, td);
+	if (error) {
+		if (auio.uio_resid != len && (error == ERESTART ||
+		    error == EINTR || error == EWOULDBLOCK))
+			error = 0;
+		/* Generation of SIGPIPE can be controlled per socket */
+		if (error == EPIPE && !(so->so_options & SO_NOSIGPIPE) &&
+		    !(uap->flags & MSG_NOSIGNAL)) {
+			PROC_LOCK(td->td_proc);
+			psignal(td->td_proc, SIGPIPE);
+			PROC_UNLOCK(td->td_proc);
+		}
+	}
+	if (error == 0)
+		td->td_retval[0] = len - auio.uio_resid;
+#ifdef KTRACE
+	if (ktruio != NULL) {
+		ktruio->uio_resid = td->td_retval[0];
+		ktrgenio(uap->sd, UIO_WRITE, ktruio, error);
+	}
+#endif /* KTRACE */
+sctp_bad:
+	free(iov, M_IOV);
+sctp_bad1:
+	if (fp)
+		fdrop(fp, td);
+sctp_bad2:
+	if (to)
+		free(to, M_SONAME);
+	return (error);
+#else  /* SCTP */
+	return (EOPNOTSUPP);
+#endif /* SCTP */
+}
+
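The iovec walk above repeats a guard used throughout these handlers: segment lengths are accumulated into a signed residual and the request is rejected with EINVAL if the running total wraps negative.  A detached, compilable sketch of that check follows (the kernel's uio_resid was a plain int at this point; ssize_t below is only for the standalone version).

#include <sys/uio.h>
#include <errno.h>
#include <stddef.h>

/*
 * Sketch of the accumulation guard used by the sctp_generic_* handlers:
 * sum the iovec lengths into a signed total and reject the request if
 * the total overflows (goes negative).
 */
static int
total_iov_len(const struct iovec *iov, int iovcnt, ssize_t *totalp)
{
	ssize_t total = 0;
	int i;

	for (i = 0; i < iovcnt; i++) {
		total += (ssize_t)iov[i].iov_len;
		if (total < 0)
			return (EINVAL);	/* wrapped: request too large */
	}
	*totalp = total;
	return (0);
}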
+int
+sctp_generic_recvmsg(td, uap)
+	struct thread *td;
+	struct sctp_generic_recvmsg_args /* {
+		int sd, 
+		struct iovec *iov, 
+		int iovlen,
+		struct sockaddr *from, 
+		__socklen_t *fromlenaddr,
+		struct sctp_sndrcvinfo *sinfo, 
+		int *msg_flags
+	} */ *uap;
+{
+#ifdef SCTP
+	u_int8_t sockbufstore[256];
+	struct uio auio;
+	struct iovec *iov, *tiov;
+	struct sctp_sndrcvinfo sinfo;
+	struct socket *so;
+	struct file *fp = NULL;
+	struct sockaddr *fromsa;
+	int fromlen;
+	int len, i, msg_flags;
+	int error = 0;
+#ifdef KTRACE
+	struct uio *ktruio = NULL;
+#endif
+	error = getsock(td->td_proc->p_fd, uap->sd, &fp, NULL);
+	if (error) {
+		return (error);
+	}
+	error = copyiniov(uap->iov, uap->iovlen, &iov, EMSGSIZE);
+	if (error) {
+		goto out1;
+	}
+
+	so = fp->f_data;
+#ifdef MAC
+	SOCK_LOCK(so);
+	error = mac_check_socket_receive(td->td_ucred, so);
+	SOCK_UNLOCK(so);
+	if (error) {
+		goto out;
+		return (error);
+	}
+#endif /* MAC */
+
+	if (uap->fromlenaddr) {
+		error = copyin(uap->fromlenaddr,
+		    &fromlen, sizeof (fromlen));
+		if (error) {
+			goto out;
+		}
+	} else {
+		fromlen = 0;
+	}
+	if(uap->msg_flags) {
+		error = copyin(uap->msg_flags, &msg_flags, sizeof (int));
+		if (error) {
+			goto out;
+		}
+	} else {
+		msg_flags = 0;
+	}
+	auio.uio_iov = iov;
+	auio.uio_iovcnt = uap->iovlen;
+  	auio.uio_segflg = UIO_USERSPACE;
+	auio.uio_rw = UIO_READ;
+	auio.uio_td = td;
+	auio.uio_offset = 0;			/* XXX */
+	auio.uio_resid = 0;
+	tiov = iov;
+	for (i = 0; i <uap->iovlen; i++, tiov++) {
+		if ((auio.uio_resid += tiov->iov_len) < 0) {
+			error = EINVAL;
+			goto out;
+		}
+	}
+	len = auio.uio_resid;
+	fromsa = (struct sockaddr *)sockbufstore;
+
+#ifdef KTRACE
+	if (KTRPOINT(td, KTR_GENIO))
+		ktruio = cloneuio(&auio);
+#endif /* KTRACE */
+	error = sctp_sorecvmsg(so, &auio, (struct mbuf **)NULL,
+		    fromsa, fromlen, &msg_flags,
+		    (struct sctp_sndrcvinfo *)&sinfo, 1);
+	if (error) {
+		if (auio.uio_resid != (int)len && (error == ERESTART ||
+		    error == EINTR || error == EWOULDBLOCK))
+			error = 0;
+	} else {
+		if (uap->sinfo)
+			error = copyout(&sinfo, uap->sinfo, sizeof (sinfo));
+	}
+#ifdef KTRACE
+	if (ktruio != NULL) {
+		ktruio->uio_resid = (int)len - auio.uio_resid;
+		ktrgenio(uap->sd, UIO_READ, ktruio, error);
+	}
+#endif /* KTRACE */
+	if (error)
+		goto out;
+	td->td_retval[0] = (int)len - auio.uio_resid;
+
+	if (fromlen && uap->from) {
+		len = fromlen;
+		if (len <= 0 || fromsa == 0)
+			len = 0;
+		else {
+			len = MIN(len, fromsa->sa_len);
+			error = copyout(fromsa, uap->from, (unsigned)len);
+			if (error)
+				goto out;
+		}
+		error = copyout(&len, uap->fromlenaddr, sizeof (socklen_t));
+		if (error) {
+			goto out;
+		}
+	}
+	if (uap->msg_flags) {
+		error = copyout(&msg_flags, uap->msg_flags, sizeof (int));
+		if (error) {
+			goto out;
+		}
+	}
+out:
+	free(iov, M_IOV);
+out1:
+	if (fp) 
+		fdrop(fp, td);
+
+	return (error);
+#else  /* SCTP */
+	return (EOPNOTSUPP);
+#endif /* SCTP */
+}
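
And the matching receive-side sketch, assuming the customary sctp_recvmsg() wrapper over sctp_generic_recvmsg(); it shows the three things this handler copies back out: the peer address, the sctp_sndrcvinfo block, and the final MSG_* flags.

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/sctp.h>
#include <stdio.h>

/* Sketch: receive one SCTP message and report which stream it arrived on. */
static ssize_t
recv_one(int sd, char *buf, size_t buflen)
{
	struct sockaddr_in from;
	socklen_t fromlen = sizeof(from);
	struct sctp_sndrcvinfo sinfo;
	int msg_flags = 0;
	ssize_t n;

	n = sctp_recvmsg(sd, buf, buflen, (struct sockaddr *)&from,
	    &fromlen, &sinfo, &msg_flags);
	if (n == -1) {
		perror("sctp_recvmsg");
		return (-1);
	}
	printf("%zd bytes on stream %u%s\n", n,
	    (unsigned)sinfo.sinfo_stream,
	    (msg_flags & MSG_EOR) ? " (complete message)" : "");
	return (n);
}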
Index: subr_turnstile.c
===================================================================
RCS file: /home/cvs/src/sys/kern/subr_turnstile.c,v
retrieving revision 1.1.1.2
retrieving revision 1.2
diff -L sys/kern/subr_turnstile.c -L sys/kern/subr_turnstile.c -u -r1.1.1.2 -r1.2
--- sys/kern/subr_turnstile.c
+++ sys/kern/subr_turnstile.c
@@ -46,27 +46,28 @@
  * chain.  Each chain contains a spin mutex that protects all of the
  * turnstiles in the chain.
  *
- * Each time a thread is created, a turnstile is malloc'd and attached to
- * that thread.  When a thread blocks on a lock, if it is the first thread
- * to block, it lends its turnstile to the lock.  If the lock already has
- * a turnstile, then it gives its turnstile to the lock's turnstile's free
- * list.  When a thread is woken up, it takes a turnstile from the free list
- * if there are any other waiters.  If it is the only thread blocked on the
- * lock, then it reclaims the turnstile associated with the lock and removes
- * it from the hash table.
+ * Each time a thread is created, a turnstile is allocated from a UMA zone
+ * and attached to that thread.  When a thread blocks on a lock, if it is the
+ * first thread to block, it lends its turnstile to the lock.  If the lock
+ * already has a turnstile, then it gives its turnstile to the lock's
+ * turnstile's free list.  When a thread is woken up, it takes a turnstile from
+ * the free list if there are any other waiters.  If it is the only thread
+ * blocked on the lock, then it reclaims the turnstile associated with the lock
+ * and removes it from the hash table.
  */
 
-#include "opt_turnstile_profiling.h"
-
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/subr_turnstile.c,v 1.152.2.1 2005/10/09 03:25:37 delphij Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/subr_turnstile.c,v 1.169 2007/06/12 23:27:31 jeff Exp $");
+
+#include "opt_ddb.h"
+#include "opt_turnstile_profiling.h"
+#include "opt_sched.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
-#include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/queue.h>
@@ -74,6 +75,15 @@
 #include <sys/sysctl.h>
 #include <sys/turnstile.h>
 
+#include <vm/uma.h>
+
+#ifdef DDB
+#include <sys/kdb.h>
+#include <ddb/ddb.h>
+#include <sys/lockmgr.h>
+#include <sys/sx.h>
+#endif
+
 /*
  * Constants for the hash table of turnstile chains.  TC_SHIFT is a magic
  * number chosen because the sleep queue's use the same value for the
@@ -95,8 +105,9 @@
  * when it is attached to a lock.  The second list to use ts_hash is the
  * free list hung off of a turnstile that is attached to a lock.
  *
- * Each turnstile contains two lists of threads.  The ts_blocked list is
- * a linked list of threads blocked on the turnstile's lock.  The
+ * Each turnstile contains three lists of threads.  The two ts_blocked lists
+ * are linked lists of threads blocked on the turnstile's lock.  One list is
+ * for exclusive waiters, and the other is for shared waiters.  The
  * ts_pending list is a linked list of threads previously awakened by
  * turnstile_signal() or turnstile_wait() that are waiting to be put on
  * the run queue.
@@ -106,8 +117,9 @@
  *  q - td_contested lock
  */
 struct turnstile {
-	TAILQ_HEAD(, thread) ts_blocked;	/* (c + q) Blocked threads. */
-	TAILQ_HEAD(, thread) ts_pending;	/* (c) Pending threads. */
+	struct mtx ts_lock;			/* Spin lock for self. */
+	struct threadqueue ts_blocked[2];	/* (c + q) Blocked threads. */
+	struct threadqueue ts_pending;		/* (c) Pending threads. */
 	LIST_ENTRY(turnstile) ts_hash;		/* (c) Chain and free list. */
 	LIST_ENTRY(turnstile) ts_link;		/* (q) Contested locks. */
 	LIST_HEAD(, turnstile) ts_free;		/* (c) Free turnstiles. */
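
For orientation while reading the rest of this file: turnstiles are kept in a small hash table of chains keyed on the lock address, and each chain's spin mutex covers every turnstile that hashes to it.  A toy userland model of that lookup is below; the table size, shift and macro names are representative assumptions, not values copied from this diff.

#include <stdint.h>
#include <stdio.h>

/*
 * Toy model of the turnstile chain hash: a lock address is hashed into a
 * fixed-size table of chains, and every turnstile whose lock hashes to the
 * same slot hangs off that chain (protected, in the kernel, by the chain's
 * spin mutex).  The size and shift here are representative guesses only.
 */
#define	MODEL_TC_TABLESIZE	128		/* must be a power of two */
#define	MODEL_TC_MASK		(MODEL_TC_TABLESIZE - 1)
#define	MODEL_TC_SHIFT		8		/* drop the always-zero low bits */
#define	MODEL_TC_HASH(lock)	\
	((((uintptr_t)(lock)) >> MODEL_TC_SHIFT) & MODEL_TC_MASK)

int
main(void)
{
	int a, b;

	printf("lock at %p -> chain %ju\n", (void *)&a,
	    (uintmax_t)MODEL_TC_HASH(&a));
	printf("lock at %p -> chain %ju\n", (void *)&b,
	    (uintmax_t)MODEL_TC_HASH(&b));
	return (0);
}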
@@ -134,8 +146,7 @@
 #endif
 static struct mtx td_contested_lock;
 static struct turnstile_chain turnstile_chains[TC_TABLESIZE];
-
-static MALLOC_DEFINE(M_TURNSTILE, "turnstiles", "turnstiles");
+static uma_zone_t turnstile_zone;
 
 /*
  * Prototypes for non-exported routines.
@@ -147,7 +158,13 @@
 static void	propagate_priority(struct thread *td);
 static int	turnstile_adjust_thread(struct turnstile *ts,
 		    struct thread *td);
+static struct thread *turnstile_first_waiter(struct turnstile *ts);
 static void	turnstile_setowner(struct turnstile *ts, struct thread *owner);
+#ifdef INVARIANTS
+static void	turnstile_dtor(void *mem, int size, void *arg);
+#endif
+static int	turnstile_init(void *mem, int size, int flags);
+static void	turnstile_fini(void *mem, int size);
 
 /*
  * Walks the chain of turnstiles and their owners to propagate the priority
@@ -157,45 +174,62 @@
 static void
 propagate_priority(struct thread *td)
 {
-	struct turnstile_chain *tc;
 	struct turnstile *ts;
 	int pri;
 
-	mtx_assert(&sched_lock, MA_OWNED);
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	pri = td->td_priority;
 	ts = td->td_blocked;
+	MPASS(td->td_lock == &ts->ts_lock);
+	/*
+	 * Grab a recursive lock on this turnstile chain so it stays locked
+	 * for the whole operation.  The caller expects us to return with
+	 * the original lock held.  We only ever lock down the chain so
+	 * the lock order is constant.
+	 */
+	mtx_lock_spin(&ts->ts_lock);
 	for (;;) {
 		td = ts->ts_owner;
 
 		if (td == NULL) {
 			/*
-			 * This really isn't quite right. Really
-			 * ought to bump priority of thread that
-			 * next acquires the lock.
+			 * This might be a read lock with no owner.  There's
+			 * not much we can do, so just bail.
 			 */
+			mtx_unlock_spin(&ts->ts_lock);
 			return;
 		}
 
+		thread_lock_flags(td, MTX_DUPOK);
+		mtx_unlock_spin(&ts->ts_lock);
 		MPASS(td->td_proc != NULL);
 		MPASS(td->td_proc->p_magic == P_MAGIC);
 
 		/*
-		 * XXX: The owner of a turnstile can be stale if it is the
-		 * first thread to grab a slock of a sx lock.  In that case
-		 * it is possible for us to be at SSLEEP or some other
-		 * weird state.  We should probably just return if the state
-		 * isn't SRUN or SLOCK.
+		 * If the thread is asleep, then we are probably about
+		 * to deadlock.  To make debugging this easier, just
+		 * panic and tell the user which thread misbehaved so
+		 * they can hopefully get a stack trace from the truly
+		 * misbehaving thread.
 		 */
-		KASSERT(!TD_IS_SLEEPING(td),
-		    ("sleeping thread (tid %d) owns a non-sleepable lock",
-		    td->td_tid));
+		if (TD_IS_SLEEPING(td)) {
+			printf(
+		"Sleeping thread (tid %d, pid %d) owns a non-sleepable lock\n",
+			    td->td_tid, td->td_proc->p_pid);
+#ifdef DDB
+			db_trace_thread(td, -1);
+#endif
+			panic("sleeping thread");
+		}
 
 		/*
 		 * If this thread already has higher priority than the
 		 * thread that is being blocked, we are finished.
 		 */
-		if (td->td_priority <= pri)
+		if (td->td_priority <= pri) {
+			thread_unlock(td);
 			return;
+		}
 
 		/*
 		 * Bump this thread's priority.
@@ -208,6 +242,7 @@
 		 */
 		if (TD_IS_RUNNING(td) || TD_ON_RUNQ(td)) {
 			MPASS(td->td_blocked == NULL);
+			thread_unlock(td);
 			return;
 		}
 
@@ -232,15 +267,13 @@
 		 */
 		ts = td->td_blocked;
 		MPASS(ts != NULL);
-		tc = TC_LOOKUP(ts->ts_lockobj);
-		mtx_lock_spin(&tc->tc_lock);
-
+		MPASS(td->td_lock == &ts->ts_lock);
 		/* Resort td on the list if needed. */
 		if (!turnstile_adjust_thread(ts, td)) {
-			mtx_unlock_spin(&tc->tc_lock);
+			mtx_unlock_spin(&ts->ts_lock);
 			return;
 		}
-		mtx_unlock_spin(&tc->tc_lock);
+		/* The thread lock is released as ts lock above. */
 	}
 }
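
To make the loop above concrete: FreeBSD priorities are "lower number wins", so when a priority-80 thread blocks behind an owner at 120 that is itself blocked behind an owner at 130, both owners end up lent priority 80.  A standalone toy model of that arithmetic (no locking, and it ignores the running/on-run-queue early exits):

#include <stdio.h>

/* Toy model: a lock owner may itself be blocked behind another owner. */
struct toy_thread {
	const char		*name;
	int			 prio;	/* smaller number == more urgent */
	struct toy_thread	*blocked_on_owner;	/* NULL if runnable */
};

/* Lend 'pri' up the ownership chain, mimicking propagate_priority(). */
static void
toy_propagate(struct toy_thread *owner, int pri)
{
	while (owner != NULL && owner->prio > pri) {
		printf("lend prio %d to %s (was %d)\n",
		    pri, owner->name, owner->prio);
		owner->prio = pri;
		owner = owner->blocked_on_owner;
	}
}

int
main(void)
{
	struct toy_thread c = { "C", 130, NULL };
	struct toy_thread b = { "B", 120, &c };

	toy_propagate(&b, 80);	/* a prio-80 thread just blocked behind B */
	printf("B=%d C=%d\n", b.prio, c.prio);	/* both now 80 */
	return (0);
}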
 
@@ -251,16 +284,16 @@
 static int
 turnstile_adjust_thread(struct turnstile *ts, struct thread *td)
 {
-	struct turnstile_chain *tc;
 	struct thread *td1, *td2;
+	int queue;
 
-	mtx_assert(&sched_lock, MA_OWNED);
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	MPASS(TD_ON_LOCK(td));
 
 	/*
 	 * This thread may not be blocked on this turnstile anymore
 	 * but instead might already be woken up on another CPU
-	 * that is waiting on sched_lock in turnstile_unpend() to
+	 * that is waiting on the thread lock in turnstile_unpend() to
 	 * finish waking this thread up.  We can detect this case
 	 * by checking to see if this thread has been given a
 	 * turnstile by either turnstile_signal() or
@@ -275,8 +308,7 @@
 	 * It needs to be moved if either its priority is lower than
 	 * the previous thread or higher than the next thread.
 	 */
-	tc = TC_LOOKUP(ts->ts_lockobj);
-	mtx_assert(&tc->tc_lock, MA_OWNED);
+	MPASS(td->td_lock == &ts->ts_lock);
 	td1 = TAILQ_PREV(td, threadqueue, td_lockq);
 	td2 = TAILQ_NEXT(td, td_lockq);
 	if ((td1 != NULL && td->td_priority < td1->td_priority) ||
@@ -286,16 +318,18 @@
 		 * Remove thread from blocked chain and determine where
 		 * it should be moved to.
 		 */
+		queue = td->td_tsqueue;
+		MPASS(queue == TS_EXCLUSIVE_QUEUE || queue == TS_SHARED_QUEUE);
 		mtx_lock_spin(&td_contested_lock);
-		TAILQ_REMOVE(&ts->ts_blocked, td, td_lockq);
-		TAILQ_FOREACH(td1, &ts->ts_blocked, td_lockq) {
+		TAILQ_REMOVE(&ts->ts_blocked[queue], td, td_lockq);
+		TAILQ_FOREACH(td1, &ts->ts_blocked[queue], td_lockq) {
 			MPASS(td1->td_proc->p_magic == P_MAGIC);
 			if (td1->td_priority > td->td_priority)
 				break;
 		}
 
 		if (td1 == NULL)
-			TAILQ_INSERT_TAIL(&ts->ts_blocked, td, td_lockq);
+			TAILQ_INSERT_TAIL(&ts->ts_blocked[queue], td, td_lockq);
 		else
 			TAILQ_INSERT_BEFORE(td1, td, td_lockq);
 		mtx_unlock_spin(&td_contested_lock);
@@ -328,6 +362,7 @@
 		    NULL, MTX_SPIN);
 	}
 	mtx_init(&td_contested_lock, "td_contested", NULL, MTX_SPIN);
+	LIST_INIT(&thread0.td_contested);
 	thread0.td_turnstile = NULL;
 }
 
@@ -360,6 +395,13 @@
 init_turnstile0(void *dummy)
 {
 
+	turnstile_zone = uma_zcreate("TURNSTILE", sizeof(struct turnstile),
+#ifdef INVARIANTS
+	    NULL, turnstile_dtor, turnstile_init, turnstile_fini,
+	    UMA_ALIGN_CACHE, 0);
+#else
+	    NULL, NULL, turnstile_init, turnstile_fini, UMA_ALIGN_CACHE, 0);
+#endif
 	thread0.td_turnstile = turnstile_alloc();
 }
 SYSINIT(turnstile0, SI_SUB_LOCK, SI_ORDER_ANY, init_turnstile0, NULL);
@@ -371,10 +413,8 @@
 void
 turnstile_adjust(struct thread *td, u_char oldpri)
 {
-	struct turnstile_chain *tc;
 	struct turnstile *ts;
 
-	mtx_assert(&sched_lock, MA_OWNED);
 	MPASS(TD_ON_LOCK(td));
 
 	/*
@@ -382,26 +422,24 @@
 	 */
 	ts = td->td_blocked;
 	MPASS(ts != NULL);
-	tc = TC_LOOKUP(ts->ts_lockobj);
-	mtx_lock_spin(&tc->tc_lock);
+	MPASS(td->td_lock == &ts->ts_lock);
+	mtx_assert(&ts->ts_lock, MA_OWNED);
 
 	/* Resort the turnstile on the list. */
-	if (!turnstile_adjust_thread(ts, td)) {
-		mtx_unlock_spin(&tc->tc_lock);
+	if (!turnstile_adjust_thread(ts, td))
 		return;
-	}
-
 	/*
 	 * If our priority was lowered and we are at the head of the
 	 * turnstile, then propagate our new priority up the chain.
 	 * Note that we currently don't try to revoke lent priorities
 	 * when our priority goes up.
 	 */
-	if (td == TAILQ_FIRST(&ts->ts_blocked) && td->td_priority < oldpri) {
-		mtx_unlock_spin(&tc->tc_lock);
+	MPASS(td->td_tsqueue == TS_EXCLUSIVE_QUEUE ||
+	    td->td_tsqueue == TS_SHARED_QUEUE);
+	if (td == TAILQ_FIRST(&ts->ts_blocked[td->td_tsqueue]) &&
+	    td->td_priority < oldpri) {
 		propagate_priority(td);
-	} else
-		mtx_unlock_spin(&tc->tc_lock);
+	}
 }
 
 /*
@@ -412,25 +450,68 @@
 {
 
 	mtx_assert(&td_contested_lock, MA_OWNED);
-	MPASS(owner->td_proc->p_magic == P_MAGIC);
 	MPASS(ts->ts_owner == NULL);
+
+	/* A shared lock might not have an owner. */
+	if (owner == NULL)
+		return;
+
+	MPASS(owner->td_proc->p_magic == P_MAGIC);
 	ts->ts_owner = owner;
 	LIST_INSERT_HEAD(&owner->td_contested, ts, ts_link);
 }
 
+#ifdef INVARIANTS
 /*
- * Malloc a turnstile for a new thread, initialize it and return it.
+ * UMA zone item deallocator.
  */
-struct turnstile *
-turnstile_alloc(void)
+static void
+turnstile_dtor(void *mem, int size, void *arg)
 {
 	struct turnstile *ts;
 
-	ts = malloc(sizeof(struct turnstile), M_TURNSTILE, M_WAITOK | M_ZERO);
-	TAILQ_INIT(&ts->ts_blocked);
+	ts = mem;
+	MPASS(TAILQ_EMPTY(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]));
+	MPASS(TAILQ_EMPTY(&ts->ts_blocked[TS_SHARED_QUEUE]));
+	MPASS(TAILQ_EMPTY(&ts->ts_pending));
+}
+#endif
+
+/*
+ * UMA zone item initializer.
+ */
+static int
+turnstile_init(void *mem, int size, int flags)
+{
+	struct turnstile *ts;
+
+	bzero(mem, size);
+	ts = mem;
+	TAILQ_INIT(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]);
+	TAILQ_INIT(&ts->ts_blocked[TS_SHARED_QUEUE]);
 	TAILQ_INIT(&ts->ts_pending);
 	LIST_INIT(&ts->ts_free);
-	return (ts);
+	mtx_init(&ts->ts_lock, "turnstile lock", NULL, MTX_SPIN | MTX_RECURSE);
+	return (0);
+}
+
+static void
+turnstile_fini(void *mem, int size)
+{
+	struct turnstile *ts;
+
+	ts = mem;
+	mtx_destroy(&ts->ts_lock);
+}
+
+/*
+ * Get a turnstile for a new thread.
+ */
+struct turnstile *
+turnstile_alloc(void)
+{
+
+	return (uma_zalloc(turnstile_zone, M_WAITOK));
 }
 
 /*
@@ -440,22 +521,58 @@
 turnstile_free(struct turnstile *ts)
 {
 
-	MPASS(ts != NULL);
-	MPASS(TAILQ_EMPTY(&ts->ts_blocked));
-	MPASS(TAILQ_EMPTY(&ts->ts_pending));
-	free(ts, M_TURNSTILE);
+	uma_zfree(turnstile_zone, ts);
 }
 
 /*
  * Lock the turnstile chain associated with the specified lock.
  */
 void
-turnstile_lock(struct lock_object *lock)
+turnstile_chain_lock(struct lock_object *lock)
+{
+	struct turnstile_chain *tc;
+
+	tc = TC_LOOKUP(lock);
+	mtx_lock_spin(&tc->tc_lock);
+}
+
+struct turnstile *
+turnstile_trywait(struct lock_object *lock)
 {
 	struct turnstile_chain *tc;
+	struct turnstile *ts;
 
 	tc = TC_LOOKUP(lock);
 	mtx_lock_spin(&tc->tc_lock);
+	LIST_FOREACH(ts, &tc->tc_turnstiles, ts_hash)
+		if (ts->ts_lockobj == lock) {
+			mtx_lock_spin(&ts->ts_lock);
+			return (ts);
+		}
+
+	ts = curthread->td_turnstile;
+	MPASS(ts != NULL);
+	mtx_lock_spin(&ts->ts_lock);
+	KASSERT(ts->ts_lockobj == NULL, ("stale ts_lockobj pointer"));
+	ts->ts_lockobj = lock;
+
+	return (ts);
+}
+
+void
+turnstile_cancel(struct turnstile *ts)
+{
+	struct turnstile_chain *tc;
+	struct lock_object *lock;
+
+	mtx_assert(&ts->ts_lock, MA_OWNED);
+
+	mtx_unlock_spin(&ts->ts_lock);
+	lock = ts->ts_lockobj;
+	if (ts == curthread->td_turnstile)
+		ts->ts_lockobj = NULL;
+	tc = TC_LOOKUP(lock);
+	mtx_unlock_spin(&tc->tc_lock);
 }
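
The turnstile_trywait()/turnstile_cancel() pair added here lets a lock pin its turnstile (and take ts_lock) before re-testing the lock word, instead of holding only the chain lock as the old turnstile_lock()/turnstile_release() interface did.  Below is a hedged sketch of the acquire-side pattern; try_acquire() and lock_owner() are hypothetical placeholders for the lock-specific fast path, and the whole thing is modelled on, not copied from, the mutex code.

/*
 * Sketch only: how a sleep lock's slow acquire path might drive the new
 * interface.  try_acquire() and lock_owner() are hypothetical placeholders
 * for the lock-specific atomic fast path; the turnstile_* calls are the
 * ones introduced in this change.
 */
#include <sys/param.h>
#include <sys/lock.h>
#include <sys/turnstile.h>

void
example_lock_hard(struct lock_object *lo)
{
	struct turnstile *ts;
	struct thread *owner;

	for (;;) {
		if (try_acquire(lo))			/* placeholder */
			return;
		/* Pin (or designate) the turnstile and lock its chain. */
		ts = turnstile_trywait(lo);
		/* Re-test now that the turnstile lock is held. */
		if (try_acquire(lo)) {			/* placeholder */
			turnstile_cancel(ts);
			return;
		}
		owner = lock_owner(lo);			/* placeholder */
		/* Block; turnstile_wait() releases the turnstile lock. */
		turnstile_wait(ts, owner, TS_EXCLUSIVE_QUEUE);
	}
}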
 
 /*
@@ -472,8 +589,10 @@
 	tc = TC_LOOKUP(lock);
 	mtx_assert(&tc->tc_lock, MA_OWNED);
 	LIST_FOREACH(ts, &tc->tc_turnstiles, ts_hash)
-		if (ts->ts_lockobj == lock)
+		if (ts->ts_lockobj == lock) {
+			mtx_lock_spin(&ts->ts_lock);
 			return (ts);
+		}
 	return (NULL);
 }
 
@@ -481,7 +600,7 @@
  * Unlock the turnstile chain associated with a given lock.
  */
 void
-turnstile_release(struct lock_object *lock)
+turnstile_chain_unlock(struct lock_object *lock)
 {
 	struct turnstile_chain *tc;
 
@@ -490,38 +609,54 @@
 }
 
 /*
+ * Return a pointer to the thread waiting on this turnstile with the
+ * most important priority or NULL if the turnstile has no waiters.
+ */
+static struct thread *
+turnstile_first_waiter(struct turnstile *ts)
+{
+	struct thread *std, *xtd;
+
+	std = TAILQ_FIRST(&ts->ts_blocked[TS_SHARED_QUEUE]);
+	xtd = TAILQ_FIRST(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]);
+	if (xtd == NULL || (std != NULL && std->td_priority < xtd->td_priority))
+		return (std);
+	return (xtd);
+}
+
+/*
  * Take ownership of a turnstile and adjust the priority of the new
  * owner appropriately.
  */
 void
-turnstile_claim(struct lock_object *lock)
+turnstile_claim(struct turnstile *ts)
 {
-	struct turnstile_chain *tc;
-	struct turnstile *ts;
 	struct thread *td, *owner;
+	struct turnstile_chain *tc;
 
-	tc = TC_LOOKUP(lock);
-	mtx_assert(&tc->tc_lock, MA_OWNED);
-	ts = turnstile_lookup(lock);
-	MPASS(ts != NULL);
+	mtx_assert(&ts->ts_lock, MA_OWNED);
+	MPASS(ts != curthread->td_turnstile);
 
 	owner = curthread;
 	mtx_lock_spin(&td_contested_lock);
 	turnstile_setowner(ts, owner);
 	mtx_unlock_spin(&td_contested_lock);
 
-	td = TAILQ_FIRST(&ts->ts_blocked);
+	td = turnstile_first_waiter(ts);
 	MPASS(td != NULL);
 	MPASS(td->td_proc->p_magic == P_MAGIC);
-	mtx_unlock_spin(&tc->tc_lock);
+	MPASS(td->td_lock == &ts->ts_lock);
 
 	/*
 	 * Update the priority of the new owner if needed.
 	 */
-	mtx_lock_spin(&sched_lock);
+	thread_lock(owner);
 	if (td->td_priority < owner->td_priority)
 		sched_lend_prio(owner, td->td_priority);
-	mtx_unlock_spin(&sched_lock);
+	thread_unlock(owner);
+	tc = TC_LOOKUP(ts->ts_lockobj);
+	mtx_unlock_spin(&ts->ts_lock);
+	mtx_unlock_spin(&tc->tc_lock);
 }
 
 /*
@@ -531,28 +666,28 @@
  * turnstile chain locked and will return with it unlocked.
  */
 void
-turnstile_wait(struct lock_object *lock, struct thread *owner)
+turnstile_wait(struct turnstile *ts, struct thread *owner, int queue)
 {
 	struct turnstile_chain *tc;
-	struct turnstile *ts;
 	struct thread *td, *td1;
+	struct lock_object *lock;
 
 	td = curthread;
-	tc = TC_LOOKUP(lock);
-	mtx_assert(&tc->tc_lock, MA_OWNED);
-	MPASS(td->td_turnstile != NULL);
-	MPASS(owner != NULL);
-	MPASS(owner->td_proc->p_magic == P_MAGIC);
-
-	/* Look up the turnstile associated with the lock 'lock'. */
-	ts = turnstile_lookup(lock);
+	mtx_assert(&ts->ts_lock, MA_OWNED);
+	if (queue == TS_SHARED_QUEUE)
+		MPASS(owner != NULL);
+	if (owner)
+		MPASS(owner->td_proc->p_magic == P_MAGIC);
+	MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE);
 
 	/*
 	 * If the lock does not already have a turnstile, use this thread's
 	 * turnstile.  Otherwise insert the current thread into the
 	 * turnstile already in use by this lock.
 	 */
-	if (ts == NULL) {
+	tc = TC_LOOKUP(ts->ts_lockobj);
+	if (ts == td->td_turnstile) {
+		mtx_assert(&tc->tc_lock, MA_OWNED);
 #ifdef TURNSTILE_PROFILING
 		tc->tc_depth++;
 		if (tc->tc_depth > tc->tc_max_depth) {
@@ -561,83 +696,60 @@
 				turnstile_max_depth = tc->tc_max_depth;
 		}
 #endif
-		ts = td->td_turnstile;
+		tc = TC_LOOKUP(ts->ts_lockobj);
 		LIST_INSERT_HEAD(&tc->tc_turnstiles, ts, ts_hash);
 		KASSERT(TAILQ_EMPTY(&ts->ts_pending),
 		    ("thread's turnstile has pending threads"));
-		KASSERT(TAILQ_EMPTY(&ts->ts_blocked),
-		    ("thread's turnstile has a non-empty queue"));
+		KASSERT(TAILQ_EMPTY(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]),
+		    ("thread's turnstile has exclusive waiters"));
+		KASSERT(TAILQ_EMPTY(&ts->ts_blocked[TS_SHARED_QUEUE]),
+		    ("thread's turnstile has shared waiters"));
 		KASSERT(LIST_EMPTY(&ts->ts_free),
 		    ("thread's turnstile has a non-empty free list"));
-		KASSERT(ts->ts_lockobj == NULL, ("stale ts_lockobj pointer"));
-		ts->ts_lockobj = lock;
+		MPASS(ts->ts_lockobj != NULL);
 		mtx_lock_spin(&td_contested_lock);
-		TAILQ_INSERT_TAIL(&ts->ts_blocked, td, td_lockq);
+		TAILQ_INSERT_TAIL(&ts->ts_blocked[queue], td, td_lockq);
 		turnstile_setowner(ts, owner);
 		mtx_unlock_spin(&td_contested_lock);
 	} else {
-		TAILQ_FOREACH(td1, &ts->ts_blocked, td_lockq)
+		TAILQ_FOREACH(td1, &ts->ts_blocked[queue], td_lockq)
 			if (td1->td_priority > td->td_priority)
 				break;
 		mtx_lock_spin(&td_contested_lock);
 		if (td1 != NULL)
 			TAILQ_INSERT_BEFORE(td1, td, td_lockq);
 		else
-			TAILQ_INSERT_TAIL(&ts->ts_blocked, td, td_lockq);
+			TAILQ_INSERT_TAIL(&ts->ts_blocked[queue], td, td_lockq);
+		MPASS(owner == ts->ts_owner);
 		mtx_unlock_spin(&td_contested_lock);
 		MPASS(td->td_turnstile != NULL);
 		LIST_INSERT_HEAD(&ts->ts_free, td->td_turnstile, ts_hash);
-		MPASS(owner == ts->ts_owner);
 	}
+	thread_lock(td);
+	thread_lock_set(td, &ts->ts_lock);
 	td->td_turnstile = NULL;
-	mtx_unlock_spin(&tc->tc_lock);
-
-	mtx_lock_spin(&sched_lock);
-	/*
-	 * Handle race condition where a thread on another CPU that owns
-	 * lock 'lock' could have woken us in between us dropping the
-	 * turnstile chain lock and acquiring the sched_lock.
-	 */
-	if (td->td_flags & TDF_TSNOBLOCK) {
-		td->td_flags &= ~TDF_TSNOBLOCK;
-		mtx_unlock_spin(&sched_lock);
-		return;
-	}
-		
-#ifdef notyet
-	/*
-	 * If we're borrowing an interrupted thread's VM context, we
-	 * must clean up before going to sleep.
-	 */
-	if (td->td_ithd != NULL) {
-		struct ithd *it = td->td_ithd;
-
-		if (it->it_interrupted) {
-			if (LOCK_LOG_TEST(lock, 0))
-				CTR3(KTR_LOCK, "%s: %p interrupted %p",
-				    __func__, it, it->it_interrupted);
-			intr_thd_fixup(it);
-		}
-	}
-#endif
 
 	/* Save who we are blocked on and switch. */
+	lock = ts->ts_lockobj;
+	td->td_tsqueue = queue;
 	td->td_blocked = ts;
 	td->td_lockname = lock->lo_name;
 	TD_SET_LOCK(td);
+	mtx_unlock_spin(&tc->tc_lock);
 	propagate_priority(td);
 
 	if (LOCK_LOG_TEST(lock, 0))
 		CTR4(KTR_LOCK, "%s: td %d blocked on [%p] %s", __func__,
 		    td->td_tid, lock, lock->lo_name);
 
+	MPASS(td->td_lock == &ts->ts_lock);
+	SCHED_STAT_INC(switch_turnstile);
 	mi_switch(SW_VOL, NULL);
 
 	if (LOCK_LOG_TEST(lock, 0))
 		CTR4(KTR_LOCK, "%s: td %d free from blocked on [%p] %s",
 		    __func__, td->td_tid, lock, lock->lo_name);
-
-	mtx_unlock_spin(&sched_lock);
+	thread_unlock(td);
 }
 
 /*
@@ -645,26 +757,27 @@
  * pending list.  This must be called with the turnstile chain locked.
  */
 int
-turnstile_signal(struct turnstile *ts)
+turnstile_signal(struct turnstile *ts, int queue)
 {
 	struct turnstile_chain *tc;
 	struct thread *td;
 	int empty;
 
 	MPASS(ts != NULL);
+	mtx_assert(&ts->ts_lock, MA_OWNED);
 	MPASS(curthread->td_proc->p_magic == P_MAGIC);
-	MPASS(ts->ts_owner == curthread);
-	tc = TC_LOOKUP(ts->ts_lockobj);
-	mtx_assert(&tc->tc_lock, MA_OWNED);
+	MPASS(ts->ts_owner == curthread ||
+	    (queue == TS_EXCLUSIVE_QUEUE && ts->ts_owner == NULL));
+	MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE);
 
 	/*
 	 * Pick the highest priority thread blocked on this lock and
 	 * move it to the pending list.
 	 */
-	td = TAILQ_FIRST(&ts->ts_blocked);
+	td = TAILQ_FIRST(&ts->ts_blocked[queue]);
 	MPASS(td->td_proc->p_magic == P_MAGIC);
 	mtx_lock_spin(&td_contested_lock);
-	TAILQ_REMOVE(&ts->ts_blocked, td, td_lockq);
+	TAILQ_REMOVE(&ts->ts_blocked[queue], td, td_lockq);
 	mtx_unlock_spin(&td_contested_lock);
 	TAILQ_INSERT_TAIL(&ts->ts_pending, td, td_lockq);
 
@@ -673,8 +786,11 @@
 	 * give it to the about-to-be-woken thread.  Otherwise take a
 	 * turnstile from the free list and give it to the thread.
 	 */
-	empty = TAILQ_EMPTY(&ts->ts_blocked);
+	empty = TAILQ_EMPTY(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]) &&
+	    TAILQ_EMPTY(&ts->ts_blocked[TS_SHARED_QUEUE]);
 	if (empty) {
+		tc = TC_LOOKUP(ts->ts_lockobj);
+		mtx_assert(&tc->tc_lock, MA_OWNED);
 		MPASS(LIST_EMPTY(&ts->ts_free));
 #ifdef TURNSTILE_PROFILING
 		tc->tc_depth--;
@@ -693,28 +809,35 @@
  * the turnstile chain locked.
  */
 void
-turnstile_broadcast(struct turnstile *ts)
+turnstile_broadcast(struct turnstile *ts, int queue)
 {
 	struct turnstile_chain *tc;
 	struct turnstile *ts1;
 	struct thread *td;
 
 	MPASS(ts != NULL);
+	mtx_assert(&ts->ts_lock, MA_OWNED);
 	MPASS(curthread->td_proc->p_magic == P_MAGIC);
-	MPASS(ts->ts_owner == curthread);
+	MPASS(ts->ts_owner == curthread ||
+	    (queue == TS_EXCLUSIVE_QUEUE && ts->ts_owner == NULL));
+	/*
+	 * We must have the chain locked so that we can remove the empty
+	 * turnstile from the hash queue.
+	 */
 	tc = TC_LOOKUP(ts->ts_lockobj);
 	mtx_assert(&tc->tc_lock, MA_OWNED);
+	MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE);
 
 	/*
 	 * Transfer the blocked list to the pending list.
 	 */
 	mtx_lock_spin(&td_contested_lock);
-	TAILQ_CONCAT(&ts->ts_pending, &ts->ts_blocked, td_lockq);
+	TAILQ_CONCAT(&ts->ts_pending, &ts->ts_blocked[queue], td_lockq);
 	mtx_unlock_spin(&td_contested_lock);
 
 	/*
 	 * Give a turnstile to each thread.  The last thread gets
-	 * this turnstile.
+	 * this turnstile if the turnstile is empty.
 	 */
 	TAILQ_FOREACH(td, &ts->ts_pending, td_lockq) {
 		if (LIST_EMPTY(&ts->ts_free)) {
@@ -737,17 +860,17 @@
  * chain locked.
  */
 void
-turnstile_unpend(struct turnstile *ts)
+turnstile_unpend(struct turnstile *ts, int owner_type)
 {
 	TAILQ_HEAD( ,thread) pending_threads;
-	struct turnstile_chain *tc;
+	struct turnstile *nts;
 	struct thread *td;
 	u_char cp, pri;
 
 	MPASS(ts != NULL);
-	MPASS(ts->ts_owner == curthread);
-	tc = TC_LOOKUP(ts->ts_lockobj);
-	mtx_assert(&tc->tc_lock, MA_OWNED);
+	mtx_assert(&ts->ts_lock, MA_OWNED);
+	MPASS(ts->ts_owner == curthread ||
+	    (owner_type == TS_SHARED_LOCK && ts->ts_owner == NULL));
 	MPASS(!TAILQ_EMPTY(&ts->ts_pending));
 
 	/*
@@ -757,9 +880,81 @@
 	TAILQ_INIT(&pending_threads);
 	TAILQ_CONCAT(&pending_threads, &ts->ts_pending, td_lockq);
 #ifdef INVARIANTS
-	if (TAILQ_EMPTY(&ts->ts_blocked))
+	if (TAILQ_EMPTY(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]) &&
+	    TAILQ_EMPTY(&ts->ts_blocked[TS_SHARED_QUEUE]))
 		ts->ts_lockobj = NULL;
 #endif
+	/*
+	 * Adjust the priority of curthread based on other contested
+	 * locks it owns.  Don't lower the priority below the base
+	 * priority however.
+	 */
+	td = curthread;
+	pri = PRI_MAX;
+	thread_lock(td);
+	mtx_lock_spin(&td_contested_lock);
+	/*
+	 * Remove the turnstile from this thread's list of contested locks
+	 * since this thread doesn't own it anymore.  New threads will
+	 * not be blocking on the turnstile until it is claimed by a new
+	 * owner.  There might not be a current owner if this is a shared
+	 * lock.
+	 */
+	if (ts->ts_owner != NULL) {
+		ts->ts_owner = NULL;
+		LIST_REMOVE(ts, ts_link);
+	}
+	LIST_FOREACH(nts, &td->td_contested, ts_link) {
+		cp = turnstile_first_waiter(nts)->td_priority;
+		if (cp < pri)
+			pri = cp;
+	}
+	mtx_unlock_spin(&td_contested_lock);
+	sched_unlend_prio(td, pri);
+	thread_unlock(td);
+	/*
+	 * Wake up all the pending threads.  If a thread is not blocked
+	 * on a lock, then it is currently executing on another CPU in
+	 * turnstile_wait() or sitting on a run queue waiting to resume
+	 * in turnstile_wait().  Set a flag to force it to try to acquire
+	 * the lock again instead of blocking.
+	 */
+	while (!TAILQ_EMPTY(&pending_threads)) {
+		td = TAILQ_FIRST(&pending_threads);
+		TAILQ_REMOVE(&pending_threads, td, td_lockq);
+		thread_lock(td);
+		MPASS(td->td_lock == &ts->ts_lock);
+		MPASS(td->td_proc->p_magic == P_MAGIC);
+		MPASS(TD_ON_LOCK(td));
+		TD_CLR_LOCK(td);
+		MPASS(TD_CAN_RUN(td));
+		td->td_blocked = NULL;
+		td->td_lockname = NULL;
+#ifdef INVARIANTS
+		td->td_tsqueue = 0xff;
+#endif
+		sched_add(td, SRQ_BORING);
+		thread_unlock(td);
+	}
+	mtx_unlock_spin(&ts->ts_lock);
+}
+
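The release side is the mirror image: take the chain lock, look the turnstile up (which now also takes ts_lock), move the waiters to the pending list, then let turnstile_unpend() shed the lent priority and put them on the run queue.  Another hedged sketch; release_lock_word() is a placeholder, and TS_EXCLUSIVE_LOCK is assumed to be the exclusive owner type paired with the TS_SHARED_LOCK value referenced in the assertions above.

/*
 * Sketch only: the corresponding slow release path.
 */
#include <sys/param.h>
#include <sys/lock.h>
#include <sys/turnstile.h>

void
example_unlock_hard(struct lock_object *lo)
{
	struct turnstile *ts;

	turnstile_chain_lock(lo);
	ts = turnstile_lookup(lo);		/* also takes ts->ts_lock */
	if (ts == NULL) {			/* nobody is waiting */
		release_lock_word(lo);		/* placeholder */
		turnstile_chain_unlock(lo);
		return;
	}
	/* Move every exclusive waiter to the pending list... */
	turnstile_broadcast(ts, TS_EXCLUSIVE_QUEUE);
	release_lock_word(lo);			/* placeholder */
	/* ...then shed the lent priority and make them runnable. */
	turnstile_unpend(ts, TS_EXCLUSIVE_LOCK);
	turnstile_chain_unlock(lo);
}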
+/*
+ * Give up ownership of a turnstile.  This must be called with the
+ * turnstile chain locked.
+ */
+void
+turnstile_disown(struct turnstile *ts)
+{
+	struct thread *td;
+	u_char cp, pri;
+
+	MPASS(ts != NULL);
+	mtx_assert(&ts->ts_lock, MA_OWNED);
+	MPASS(ts->ts_owner == curthread);
+	MPASS(TAILQ_EMPTY(&ts->ts_pending));
+	MPASS(!TAILQ_EMPTY(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE]) ||
+	    !TAILQ_EMPTY(&ts->ts_blocked[TS_SHARED_QUEUE]));
 
 	/*
 	 * Remove the turnstile from this thread's list of contested locks
@@ -771,8 +966,6 @@
 	ts->ts_owner = NULL;
 	LIST_REMOVE(ts, ts_link);
 	mtx_unlock_spin(&td_contested_lock);
-	critical_enter();
-	mtx_unlock_spin(&tc->tc_lock);
 
 	/*
 	 * Adjust the priority of curthread based on other contested
@@ -781,70 +974,330 @@
 	 */
 	td = curthread;
 	pri = PRI_MAX;
-	mtx_lock_spin(&sched_lock);
+	thread_lock(td);
+	mtx_unlock_spin(&ts->ts_lock);
 	mtx_lock_spin(&td_contested_lock);
 	LIST_FOREACH(ts, &td->td_contested, ts_link) {
-		cp = TAILQ_FIRST(&ts->ts_blocked)->td_priority;
+		cp = turnstile_first_waiter(ts)->td_priority;
 		if (cp < pri)
 			pri = cp;
 	}
 	mtx_unlock_spin(&td_contested_lock);
 	sched_unlend_prio(td, pri);
-
-	/*
-	 * Wake up all the pending threads.  If a thread is not blocked
-	 * on a lock, then it is currently executing on another CPU in
-	 * turnstile_wait() or sitting on a run queue waiting to resume
-	 * in turnstile_wait().  Set a flag to force it to try to acquire
-	 * the lock again instead of blocking.
-	 */
-	while (!TAILQ_EMPTY(&pending_threads)) {
-		td = TAILQ_FIRST(&pending_threads);
-		TAILQ_REMOVE(&pending_threads, td, td_lockq);
-		MPASS(td->td_proc->p_magic == P_MAGIC);
-		if (TD_ON_LOCK(td)) {
-			td->td_blocked = NULL;
-			td->td_lockname = NULL;
-			TD_CLR_LOCK(td);
-			MPASS(TD_CAN_RUN(td));
-			setrunqueue(td, SRQ_BORING);
-		} else {
-			td->td_flags |= TDF_TSNOBLOCK;
-			MPASS(TD_IS_RUNNING(td) || TD_ON_RUNQ(td));
-		}
-	}
-	critical_exit();
-	mtx_unlock_spin(&sched_lock);
+	thread_unlock(td);
 }
 
 /*
  * Return the first thread in a turnstile.
  */
 struct thread *
-turnstile_head(struct turnstile *ts)
+turnstile_head(struct turnstile *ts, int queue)
 {
 #ifdef INVARIANTS
-	struct turnstile_chain *tc;
 
 	MPASS(ts != NULL);
-	tc = TC_LOOKUP(ts->ts_lockobj);
-	mtx_assert(&tc->tc_lock, MA_OWNED);
+	MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE);
+	mtx_assert(&ts->ts_lock, MA_OWNED);
 #endif
-	return (TAILQ_FIRST(&ts->ts_blocked));
+	return (TAILQ_FIRST(&ts->ts_blocked[queue]));
 }
 
 /*
- * Returns true if a turnstile is empty.
+ * Returns true if a sub-queue of a turnstile is empty.
  */
 int
-turnstile_empty(struct turnstile *ts)
+turnstile_empty(struct turnstile *ts, int queue)
 {
 #ifdef INVARIANTS
-	struct turnstile_chain *tc;
 
 	MPASS(ts != NULL);
-	tc = TC_LOOKUP(ts->ts_lockobj);
-	mtx_assert(&tc->tc_lock, MA_OWNED);
+	MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE);
+	mtx_assert(&ts->ts_lock, MA_OWNED);
 #endif
-	return (TAILQ_EMPTY(&ts->ts_blocked));
+	return (TAILQ_EMPTY(&ts->ts_blocked[queue]));
 }
+
+#ifdef DDB
+static void
+print_thread(struct thread *td, const char *prefix)
+{
+
+	db_printf("%s%p (tid %d, pid %d, \"%s\")\n", prefix, td, td->td_tid,
+	    td->td_proc->p_pid, td->td_name[0] != '\0' ? td->td_name :
+	    td->td_proc->p_comm);
+}
+
+static void
+print_queue(struct threadqueue *queue, const char *header, const char *prefix)
+{
+	struct thread *td;
+
+	db_printf("%s:\n", header);
+	if (TAILQ_EMPTY(queue)) {
+		db_printf("%sempty\n", prefix);
+		return;
+	}
+	TAILQ_FOREACH(td, queue, td_lockq) {
+		print_thread(td, prefix);
+	}
+}
+
+DB_SHOW_COMMAND(turnstile, db_show_turnstile)
+{
+	struct turnstile_chain *tc;
+	struct turnstile *ts;
+	struct lock_object *lock;
+	int i;
+
+	if (!have_addr)
+		return;
+
+	/*
+	 * First, see if there is an active turnstile for the lock indicated
+	 * by the address.
+	 */
+	lock = (struct lock_object *)addr;
+	tc = TC_LOOKUP(lock);
+	LIST_FOREACH(ts, &tc->tc_turnstiles, ts_hash)
+		if (ts->ts_lockobj == lock)
+			goto found;
+
+	/*
+	 * Second, see if there is an active turnstile at the address
+	 * indicated.
+	 */
+	for (i = 0; i < TC_TABLESIZE; i++)
+		LIST_FOREACH(ts, &turnstile_chains[i].tc_turnstiles, ts_hash) {
+			if (ts == (struct turnstile *)addr)
+				goto found;
+		}
+
+	db_printf("Unable to locate a turnstile via %p\n", (void *)addr);
+	return;
+found:
+	lock = ts->ts_lockobj;
+	db_printf("Lock: %p - (%s) %s\n", lock, LOCK_CLASS(lock)->lc_name,
+	    lock->lo_name);
+	if (ts->ts_owner)
+		print_thread(ts->ts_owner, "Lock Owner: ");
+	else
+		db_printf("Lock Owner: none\n");
+	print_queue(&ts->ts_blocked[TS_SHARED_QUEUE], "Shared Waiters", "\t");
+	print_queue(&ts->ts_blocked[TS_EXCLUSIVE_QUEUE], "Exclusive Waiters",
+	    "\t");
+	print_queue(&ts->ts_pending, "Pending Threads", "\t");
+	
+}
+
+/*
+ * Show all the threads a particular thread is waiting on based on
+ * non-sleepable and non-spin locks.
+ */
+static void
+print_lockchain(struct thread *td, const char *prefix)
+{
+	struct lock_object *lock;
+	struct lock_class *class;
+	struct turnstile *ts;
+
+	/*
+	 * Follow the chain.  We keep walking as long as the thread is
+	 * blocked on a turnstile that has an owner.
+	 */
+	while (!db_pager_quit) {
+		db_printf("%sthread %d (pid %d, %s) ", prefix, td->td_tid,
+		    td->td_proc->p_pid, td->td_name[0] != '\0' ? td->td_name :
+		    td->td_proc->p_comm);
+		switch (td->td_state) {
+		case TDS_INACTIVE:
+			db_printf("is inactive\n");
+			return;
+		case TDS_CAN_RUN:
+			db_printf("can run\n");
+			return;
+		case TDS_RUNQ:
+			db_printf("is on a run queue\n");
+			return;
+		case TDS_RUNNING:
+			db_printf("running on CPU %d\n", td->td_oncpu);
+			return;
+		case TDS_INHIBITED:
+			if (TD_ON_LOCK(td)) {
+				ts = td->td_blocked;
+				lock = ts->ts_lockobj;
+				class = LOCK_CLASS(lock);
+				db_printf("blocked on lock %p (%s) \"%s\"\n",
+				    lock, class->lc_name, lock->lo_name);
+				if (ts->ts_owner == NULL)
+					return;
+				td = ts->ts_owner;
+				break;
+			}
+			db_printf("inhibited\n");
+			return;
+		default:
+			db_printf("??? (%#x)\n", td->td_state);
+			return;
+		}
+	}
+}
+
+DB_SHOW_COMMAND(lockchain, db_show_lockchain)
+{
+	struct thread *td;
+
+	/* Figure out which thread to start with. */
+	if (have_addr)
+		td = db_lookup_thread(addr, TRUE);
+	else
+		td = kdb_thread;
+
+	print_lockchain(td, "");
+}
+
+DB_SHOW_COMMAND(allchains, db_show_allchains)
+{
+	struct thread *td;
+	struct proc *p;
+	int i;
+
+	i = 1;
+	FOREACH_PROC_IN_SYSTEM(p) {
+		FOREACH_THREAD_IN_PROC(p, td) {
+			if (TD_ON_LOCK(td) && LIST_EMPTY(&td->td_contested)) {
+				db_printf("chain %d:\n", i++);
+				print_lockchain(td, " ");
+			}
+			if (db_pager_quit)
+				return;
+		}
+	}
+}
+
+/*
+ * Show all the threads a particular thread is waiting on based on
+ * sleepable locks.
+ */
+static void
+print_sleepchain(struct thread *td, const char *prefix)
+{
+	struct thread *owner;
+
+	/*
+	 * Follow the chain.  We keep walking as long as the thread is
+	 * blocked on a sleep lock that has an owner.
+	 */
+	while (!db_pager_quit) {
+		db_printf("%sthread %d (pid %d, %s) ", prefix, td->td_tid,
+		    td->td_proc->p_pid, td->td_name[0] != '\0' ? td->td_name :
+		    td->td_proc->p_comm);
+		switch (td->td_state) {
+		case TDS_INACTIVE:
+			db_printf("is inactive\n");
+			return;
+		case TDS_CAN_RUN:
+			db_printf("can run\n");
+			return;
+		case TDS_RUNQ:
+			db_printf("is on a run queue\n");
+			return;
+		case TDS_RUNNING:
+			db_printf("running on CPU %d\n", td->td_oncpu);
+			return;
+		case TDS_INHIBITED:
+			if (TD_ON_SLEEPQ(td)) {
+				if (lockmgr_chain(td, &owner) ||
+				    sx_chain(td, &owner)) {
+					if (owner == NULL)
+						return;
+					td = owner;
+					break;
+				}
+				db_printf("sleeping on %p \"%s\"\n",
+				    td->td_wchan, td->td_wmesg);
+				return;
+			}
+			db_printf("inhibited\n");
+			return;
+		default:
+			db_printf("??? (%#x)\n", td->td_state);
+			return;
+		}
+	}
+}
+
+DB_SHOW_COMMAND(sleepchain, db_show_sleepchain)
+{
+	struct thread *td;
+
+	/* Figure out which thread to start with. */
+	if (have_addr)
+		td = db_lookup_thread(addr, TRUE);
+	else
+		td = kdb_thread;
+
+	print_sleepchain(td, "");
+}
+
+static void	print_waiters(struct turnstile *ts, int indent);
+	
+static void
+print_waiter(struct thread *td, int indent)
+{
+	struct turnstile *ts;
+	int i;
+
+	if (db_pager_quit)
+		return;
+	for (i = 0; i < indent; i++)
+		db_printf(" ");
+	print_thread(td, "thread ");
+	LIST_FOREACH(ts, &td->td_contested, ts_link)
+		print_waiters(ts, indent + 1);
+}
+
+static void
+print_waiters(struct turnstile *ts, int indent)
+{
+	struct lock_object *lock;
+	struct lock_class *class;
+	struct thread *td;
+	int i;
+
+	if (db_pager_quit)
+		return;
+	lock = ts->ts_lockobj;
+	class = LOCK_CLASS(lock);
+	for (i = 0; i < indent; i++)
+		db_printf(" ");
+	db_printf("lock %p (%s) \"%s\"\n", lock, class->lc_name, lock->lo_name);
+	TAILQ_FOREACH(td, &ts->ts_blocked[TS_EXCLUSIVE_QUEUE], td_lockq)
+		print_waiter(td, indent + 1);
+	TAILQ_FOREACH(td, &ts->ts_blocked[TS_SHARED_QUEUE], td_lockq)
+		print_waiter(td, indent + 1);
+	TAILQ_FOREACH(td, &ts->ts_pending, td_lockq)
+		print_waiter(td, indent + 1);
+}
+
+DB_SHOW_COMMAND(locktree, db_show_locktree)
+{
+	struct lock_object *lock;
+	struct lock_class *class;
+	struct turnstile_chain *tc;
+	struct turnstile *ts;
+
+	if (!have_addr)
+		return;
+	lock = (struct lock_object *)addr;
+	tc = TC_LOOKUP(lock);
+	LIST_FOREACH(ts, &tc->tc_turnstiles, ts_hash)
+		if (ts->ts_lockobj == lock)
+			break;
+	if (ts == NULL) {
+		class = LOCK_CLASS(lock);
+		db_printf("lock %p (%s) \"%s\"\n", lock, class->lc_name,
+		    lock->lo_name);
+	} else
+		print_waiters(ts, 0);
+}
+#endif
Index: tty_pty.c
===================================================================
RCS file: /home/cvs/src/sys/kern/tty_pty.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/tty_pty.c -L sys/kern/tty_pty.c -u -r1.2 -r1.3
--- sys/kern/tty_pty.c
+++ sys/kern/tty_pty.c
@@ -30,7 +30,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/tty_pty.c,v 1.137.2.2 2006/03/30 16:46:56 csjp Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/tty_pty.c,v 1.152.2.2.2.2 2008/01/28 12:47:56 kib Exp $");
 
 /*
  * Pseudo-teletype Driver
@@ -40,14 +40,14 @@
 #include "opt_tty.h"
 #include <sys/param.h>
 #include <sys/systm.h>
+#include <sys/libkern.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/sx.h>
-#ifndef BURN_BRIDGES
-#if defined(COMPAT_43)
+#if defined(COMPAT_43TTY)
 #include <sys/ioctl_compat.h>
 #endif
-#endif
+#include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/tty.h>
 #include <sys/conf.h>
@@ -109,6 +109,7 @@
 	u_char	pt_ucntl;
 	struct tty *pt_tty;
 	struct cdev *devs, *devc;
+	int	pt_devs_open, pt_devc_open;
 	struct	prison *pt_prison;
 };
 
@@ -121,43 +122,80 @@
 #define	TSA_PTC_WRITE(tp)	((void *)&(tp)->t_rawq.c_cl)
 #define	TSA_PTS_READ(tp)	((void *)&(tp)->t_canq)
 
-static char *names = "pqrsPQRS";
+static const char names[] = "pqrsPQRSlmnoLMNO";
 /*
  * This function creates and initializes a pts/ptc pair
  *
- * pts == /dev/tty[pqrsPQRS][0123456789abcdefghijklmnopqrstuv]
- * ptc == /dev/pty[pqrsPQRS][0123456789abcdefghijklmnopqrstuv]
- *
- * XXX: define and add mapping of upper minor bits to allow more
- *      than 256 ptys.
+ * pts == /dev/tty[pqrsPQRSlmnoLMNO][0123456789abcdefghijklmnopqrstuv]
+ * ptc == /dev/pty[pqrsPQRSlmnoLMNO][0123456789abcdefghijklmnopqrstuv]
  */
 static struct cdev *
 ptyinit(struct cdev *devc, struct thread *td)
 {
-	struct cdev *devs;
 	struct ptsc *pt;
 	int n;
 
-	n = minor(devc);
-	/* For now we only map the lower 8 bits of the minor */
-	if (n & ~0xff)
+	n = minor2unit(minor(devc));
+
+	/* We only allow for up to 32 ptys per char in "names". */
+	if (n >= 32 * (sizeof(names) - 1))
 		return (NULL);
 
 	devc->si_flags &= ~SI_CHEAPCLONE;
 
+	/*
+	 * Initially do not create a slave endpoint.
+	 */
 	pt = malloc(sizeof(*pt), M_PTY, M_WAITOK | M_ZERO);
-	pt->devs = devs = make_dev_cred(&pts_cdevsw, n, td->td_ucred,
-	    UID_ROOT, GID_WHEEL, 0666, "tty%c%r", names[n / 32], n % 32);
 	pt->devc = devc;
 
-	pt->pt_tty = ttymalloc(pt->pt_tty);
+	pt->pt_tty = ttyalloc();
 	pt->pt_tty->t_sc = pt;
-	devs->si_drv1 = devc->si_drv1 = pt;
-	devs->si_tty = devc->si_tty = pt->pt_tty;
-	pt->pt_tty->t_dev = devs;
+	devc->si_drv1 = pt;
+	devc->si_tty = pt->pt_tty;
 	return (devc);
 }
 
+static void
+pty_create_slave(struct ucred *cred, struct ptsc *pt, int m)
+{
+	int n;
+
+	n = minor2unit(m);
+	KASSERT(n >= 0 && n / 32 < sizeof(names),
+	    ("pty_create_slave: n %d ptsc %p", n, pt));
+	pt->devs = make_dev_cred(&pts_cdevsw, m, cred, UID_ROOT, GID_WHEEL,
+	    0666, "tty%c%r", names[n / 32], n % 32);
+	pt->devs->si_drv1 = pt;
+	pt->devs->si_tty = pt->pt_tty;
+	pt->pt_tty->t_dev = pt->devs;
+}
+
+static void
+pty_destroy_slave(struct ptsc *pt)
+{
+
+	if (pt->pt_tty->t_refcnt > 1)
+		return;
+	pt->pt_tty->t_dev = NULL;
+	ttyrel(pt->pt_tty);
+	pt->pt_tty = NULL;
+	destroy_dev(pt->devs);
+	pt->devs = NULL;
+}
+
+static void
+pty_maybe_destroy_slave(struct ptsc *pt)
+{
+
+	/*
+	 * VFS bugs and complications near revoke() currently make it
+	 * impossible to destroy the struct cdev.
+	 */
+	if (0 && pt->pt_devc_open == 0 && pt->pt_devs_open == 0)
+		pty_destroy_slave(pt);
+}
+
 /*ARGSUSED*/
 static	int
 ptsopen(struct cdev *dev, int flag, int devtype, struct thread *td)
@@ -170,11 +208,14 @@
 		return(ENXIO);
 	pt = dev->si_drv1;
 	tp = dev->si_tty;
+
 	if ((tp->t_state & TS_ISOPEN) == 0) {
 		ttyinitmode(tp, 1, 0);
-	} else if (tp->t_state & TS_XCLUDE && suser(td))
+	} else if (tp->t_state & TS_XCLUDE && priv_check(td,
+	    PRIV_TTY_EXCLUSIVE))
 		return (EBUSY);
-	else if (pt->pt_prison != td->td_ucred->cr_prison && suser(td))
+	else if (pt->pt_prison != td->td_ucred->cr_prison &&
+	    priv_check(td, PRIV_TTY_PRISON))
 		return (EBUSY);
 	if (tp->t_oproc)			/* Ctrlr still around. */
 		(void)ttyld_modem(tp, 1);
@@ -187,20 +228,32 @@
 			return (error);
 	}
 	error = ttyld_open(tp, dev);
-	if (error == 0)
+	if (error == 0) {
 		ptcwakeup(tp, FREAD|FWRITE);
+		pt->pt_devs_open = 1;
+	} else
+		pty_maybe_destroy_slave(pt);
 	return (error);
 }
 
 static	int
 ptsclose(struct cdev *dev, int flag, int mode, struct thread *td)
 {
+	struct ptsc *pti;
 	struct tty *tp;
 	int err;
 
 	tp = dev->si_tty;
+	pti = dev->si_drv1;
+
+	KASSERT(dev == pti->devs, ("ptsclose: dev != pti->devs"));
+
 	err = ttyld_close(tp, flag);
 	(void) tty_close(tp);
+
+	pti->pt_devs_open = 0;
+	pty_maybe_destroy_slave(pti);
+
 	return (err);
 }
 
@@ -275,7 +328,19 @@
 		ptyinit(dev, td);
 	if (!dev->si_drv1)
 		return(ENXIO);
+
+	pt = dev->si_drv1;
+	/*
+	 * In case we have destroyed the struct tty at the last connect time,
+	 * we need to recreate it.
+	 */
+	if (pt->pt_tty == NULL) {
+		pt->pt_tty = ttyalloc();
+		pt->pt_tty->t_sc = pt;
+		dev->si_tty = pt->pt_tty;
+	}
 	tp = dev->si_tty;
+
 	if (tp->t_oproc)
 		return (EIO);
 	tp->t_timeout = -1;
@@ -283,17 +348,22 @@
 	tp->t_stop = ptsstop;
 	(void)ttyld_modem(tp, 1);
 	tp->t_lflag &= ~EXTPROC;
-	pt = dev->si_drv1;
 	pt->pt_prison = td->td_ucred->cr_prison;
 	pt->pt_flags = 0;
 	pt->pt_send = 0;
 	pt->pt_ucntl = 0;
+
+	if (!pt->devs)
+		pty_create_slave(td->td_ucred, pt, minor(dev));
+	pt->pt_devc_open = 1;
+
 	return (0);
 }
 
 static	int
 ptcclose(struct cdev *dev, int flags, int fmt, struct thread *td)
 {
+	struct ptsc *pti = dev->si_drv1;
 	struct tty *tp;
 
 	tp = dev->si_tty;
@@ -314,6 +384,8 @@
 	}
 
 	tp->t_oproc = 0;		/* mark closed */
+	pti->pt_devc_open = 0;
+	pty_maybe_destroy_slave(pti);
 	return (0);
 }
 
@@ -515,6 +587,10 @@
 {
 	struct tty *tp = dev->si_tty;
 	struct ptsc *pt = dev->si_drv1;
+#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \
+    defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
+	int ival;
+#endif
 
 	switch (cmd) {
 
@@ -553,12 +629,10 @@
 		return (EAGAIN);
 
 	switch (cmd) {
-#ifndef BURN_BRIDGES
-#ifdef COMPAT_43
+#ifdef COMPAT_43TTY
 	case TIOCSETP:
 	case TIOCSETN:
 #endif
-#endif
 	case TIOCSETD:
 	case TIOCSETA:
 	case TIOCSETAW:
@@ -571,6 +645,13 @@
 		ndflush(&tp->t_outq, tp->t_outq.c_cc);
 		break;
 
+#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \
+    defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
+	case _IO('t', 95):
+		ival = IOCPARM_IVAL(data);
+		data = (caddr_t)&ival;
+		/* FALLTHROUGH */
+#endif
 	case TIOCSIG:
 		if (*(unsigned int *)data >= NSIG ||
 		    *(unsigned int *)data == 0)
@@ -642,8 +723,7 @@
 		case TIOCSETA:
 		case TIOCSETAW:
 		case TIOCSETAF:
-#ifndef BURN_BRIDGES
-#ifdef COMPAT_43
+#ifdef COMPAT_43TTY
 		case TIOCSETP:
 		case TIOCSETN:
 		case TIOCSETC:
@@ -652,7 +732,6 @@
 		case TIOCLBIC:
 		case TIOCLSET:
 #endif
-#endif
 			pt->pt_send |= TIOCPKT_IOCTL;
 			ptcwakeup(tp, FREAD);
 			break;
@@ -684,34 +763,27 @@
 pty_clone(void *arg, struct ucred *cr, char *name, int namelen,
     struct cdev **dev)
 {
+	char *cp;
 	int u;
 
 	if (*dev != NULL)
 		return;
 	if (bcmp(name, "pty", 3) != 0)
 		return;
-	if (name[5] != '\0')
+	if (name[5] != '\0' || name[3] == '\0')
 		return;
-	switch (name[3]) {
-	case 'p': u =   0; break;
-	case 'q': u =  32; break;
-	case 'r': u =  64; break;
-	case 's': u =  96; break;
-	case 'P': u = 128; break;
-	case 'Q': u = 160; break;
-	case 'R': u = 192; break;
-	case 'S': u = 224; break;
-	default: return;
-	}
+	cp = index(names, name[3]);
+	if (cp == NULL)
+		return;
+	u = (cp - names) * 32;
 	if (name[4] >= '0' && name[4] <= '9')
 		u += name[4] - '0';
 	else if (name[4] >= 'a' && name[4] <= 'v')
 		u += name[4] - 'a' + 10;
 	else
 		return;
-	*dev = make_dev_cred(&ptc_cdevsw, u, cr,
+	*dev = make_dev_credf(MAKEDEV_REF, &ptc_cdevsw, unit2minor(u), cr,
 	    UID_ROOT, GID_WHEEL, 0666, "pty%c%r", names[u / 32], u % 32);
-	dev_ref(*dev);
 	(*dev)->si_flags |= SI_CHEAPCLONE;
 	return;
 }
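
The rewritten clone handler derives the unit number generically from the names[] string: the letter selects a bank of 32 units and the trailing [0-9a-v] character selects the unit within the bank, for 16 * 32 = 512 possible ptys.  A standalone sketch of the same mapping follows (index() is spelled strchr() in userland).

#include <stdio.h>
#include <string.h>

static const char names[] = "pqrsPQRSlmnoLMNO";

/* Map the two-character suffix of ptyXY/ttyXY to its unit number. */
static int
pty_name_to_unit(char bank, char slot)
{
	const char *cp;
	int u;

	if (bank == '\0' || (cp = strchr(names, bank)) == NULL)
		return (-1);
	u = (int)(cp - names) * 32;
	if (slot >= '0' && slot <= '9')
		u += slot - '0';
	else if (slot >= 'a' && slot <= 'v')
		u += slot - 'a' + 10;
	else
		return (-1);
	return (u);
}

int
main(void)
{
	printf("ptyq3 -> unit %d\n", pty_name_to_unit('q', '3'));	/* 35 */
	printf("ptyLv -> unit %d\n", pty_name_to_unit('L', 'v'));	/* 415 */
	return (0);
}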
--- /dev/null
+++ sys/kern/systrace_args.c
@@ -0,0 +1,2878 @@
+/*
+ * System call argument to DTrace register array conversion.
+ *
+ * DO NOT EDIT-- this file is automatically generated.
+ * $FreeBSD: src/sys/kern/systrace_args.c,v 1.14 2007/08/16 05:32:26 davidxu Exp $
+ * This file is part of the DTrace syscall provider.
+ */
+
+static void
+systrace_args(int sysnum, void *params, u_int64_t *uarg, int *n_args)
+{
+	int64_t *iarg  = (int64_t *) uarg;
+	switch (sysnum) {
+	/* nosys */
+	case 0: {
+		*n_args = 0;
+		break;
+	}
+	/* sys_exit */
+	case 1: {
+		struct sys_exit_args *p = params;
+		iarg[0] = p->rval; /* int */
+		*n_args = 1;
+		break;
+	}
+	/* fork */
+	case 2: {
+		*n_args = 0;
+		break;
+	}
+	/* read */
+	case 3: {
+		struct read_args *p = params;
+		iarg[0] = p->fd; /* int */
+		uarg[1] = (intptr_t) p->buf; /* void * */
+		uarg[2] = p->nbyte; /* size_t */
+		*n_args = 3;
+		break;
+	}
+	/* write */
+	case 4: {
+		struct write_args *p = params;
+		iarg[0] = p->fd; /* int */
+		uarg[1] = (intptr_t) p->buf; /* const void * */
+		uarg[2] = p->nbyte; /* size_t */
+		*n_args = 3;
+		break;
+	}
+	/* open */
+	case 5: {
+		struct open_args *p = params;
+		uarg[0] = (intptr_t) p->path; /* char * */
+		iarg[1] = p->flags; /* int */
+		iarg[2] = p->mode; /* int */
+		*n_args = 3;
+		break;
+	}
+	/* close */
+	case 6: {
+		struct close_args *p = params;
+		iarg[0] = p->fd; /* int */
+		*n_args = 1;
+		break;
+	}
+	/* wait4 */
+	case 7: {
+		struct wait_args *p = params;
+		iarg[0] = p->pid; /* int */
+		uarg[1] = (intptr_t) p->status; /* int * */
+		iarg[2] = p->options; /* int */
+		uarg[3] = (intptr_t) p->rusage; /* struct rusage * */
+		*n_args = 4;
+		break;
+	}
+	/* link */
+	case 9: {
+		struct link_args *p = params;
+		uarg[0] = (intptr_t) p->path; /* char * */
+		uarg[1] = (intptr_t) p->link; /* char * */
+		*n_args = 2;
+		break;
+	}
+	/* unlink */
+	case 10: {
+		struct unlink_args *p = params;
+		uarg[0] = (intptr_t) p->path; /* char * */
+		*n_args = 1;
+		break;
+	}
+	/* chdir */
+	case 12: {
+		struct chdir_args *p = params;
+		uarg[0] = (intptr_t) p->path; /* char * */
+		*n_args = 1;
+		break;
+	}
+	/* fchdir */
+	case 13: {
+		struct fchdir_args *p = params;
+		iarg[0] = p->fd; /* int */
+		*n_args = 1;
+		break;
+	}
+	/* mknod */
+	case 14: {
+		struct mknod_args *p = params;
+		uarg[0] = (intptr_t) p->path; /* char * */
+		iarg[1] = p->mode; /* int */
+		iarg[2] = p->dev; /* int */
+		*n_args = 3;
+		break;
+	}
+	/* chmod */
+	case 15: {
+		struct chmod_args *p = params;
+		uarg[0] = (intptr_t) p->path; /* char * */
+		iarg[1] = p->mode; /* int */
+		*n_args = 2;
+		break;
+	}
+	/* chown */
+	case 16: {
+		struct chown_args *p = params;
+		uarg[0] = (intptr_t) p->path; /* char * */
+		iarg[1] = p->uid; /* int */
+		iarg[2] = p->gid; /* int */
+		*n_args = 3;
+		break;
+	}
+	/* obreak */
+	case 17: {
+		struct obreak_args *p = params;
+		uarg[0] = (intptr_t) p->nsize; /* char * */
+		*n_args = 1;
+		break;
+	}
+	/* getpid */
+	case 20: {
+		*n_args = 0;
+		break;
+	}
+	/* mount */
+	case 21: {
+		struct mount_args *p = params;
+		uarg[0] = (intptr_t) p->type; /* char * */
+		uarg[1] = (intptr_t) p->path; /* char * */
+		iarg[2] = p->flags; /* int */
+		uarg[3] = (intptr_t) p->data; /* caddr_t */
+		*n_args = 4;
+		break;
+	}
+	/* unmount */
+	case 22: {
+		struct unmount_args *p = params;
+		uarg[0] = (intptr_t) p->path; /* char * */
+		iarg[1] = p->flags; /* int */
+		*n_args = 2;
+		break;
+	}
+	/* setuid */
+	case 23: {
+		struct setuid_args *p = params;
+		uarg[0] = p->uid; /* uid_t */
+		*n_args = 1;
+		break;
+	}
+	/* getuid */
+	case 24: {
+		*n_args = 0;
+		break;
+	}
+	/* geteuid */
+	case 25: {
+		*n_args = 0;
+		break;
+	}
+	/* ptrace */
+	case 26: {
+		struct ptrace_args *p = params;
+		iarg[0] = p->req; /* int */
+		iarg[1] = p->pid; /* pid_t */
+		uarg[2] = (intptr_t) p->addr; /* caddr_t */
+		iarg[3] = p->data; /* int */
+		*n_args = 4;
+		break;
+	}
+	/* recvmsg */
+	case 27: {
+		struct recvmsg_args *p = params;
+		iarg[0] = p->s; /* int */
+		uarg[1] = (intptr_t) p->msg; /* struct msghdr * */
+		iarg[2] = p->flags; /* int */
+		*n_args = 3;
+		break;
+	}
+	/* sendmsg */
+	case 28: {
+		struct sendmsg_args *p = params;
+		iarg[0] = p->s; /* int */
+		uarg[1] = (intptr_t) p->msg; /* struct msghdr * */
+		iarg[2] = p->flags; /* int */
+		*n_args = 3;
+		break;
+	}
+	/* recvfrom */
+	case 29: {
+		struct recvfrom_args *p = params;
+		iarg[0] = p->s; /* int */
+		uarg[1] = (intptr_t) p->buf; /* caddr_t */
+		uarg[2] = p->len; /* size_t */
+		iarg[3] = p->flags; /* int */
+		uarg[4] = (intptr_t) p->from; /* struct sockaddr *__restrict */
+		uarg[5] = (intptr_t) p->fromlenaddr; /* __socklen_t *__restrict */
+		*n_args = 6;
+		break;
+	}
+	/* accept */
+	case 30: {
+		struct accept_args *p = params;
+		iarg[0] = p->s; /* int */
+		uarg[1] = (intptr_t) p->name; /* struct sockaddr *__restrict */
+		uarg[2] = (intptr_t) p->anamelen; /* __socklen_t *__restrict */
+		*n_args = 3;
+		break;
+	}
+	/* getpeername */
+	case 31: {
+		struct getpeername_args *p = params;
+		iarg[0] = p->fdes; /* int */
+		uarg[1] = (intptr_t) p->asa; /* struct sockaddr *__restrict */
+		uarg[2] = (intptr_t) p->alen; /* __socklen_t *__restrict */
+		*n_args = 3;
+		break;
+	}
+	/* getsockname */
+	case 32: {
+		struct getsockname_args *p = params;
+		iarg[0] = p->fdes; /* int */
+		uarg[1] = (intptr_t) p->asa; /* struct sockaddr *__restrict */
+		uarg[2] = (intptr_t) p->alen; /* __socklen_t *__restrict */
+		*n_args = 3;
+		break;
+	}
+	/* access */
+	case 33: {
+		struct access_args *p = params;
+		uarg[0] = (intptr_t) p->path; /* char * */
+		iarg[1] = p->flags; /* int */
+		*n_args = 2;
+		break;
+	}
+	/* chflags */
+	case 34: {
+		struct chflags_args *p = params;
+		uarg[0] = (intptr_t) p->path; /* char * */
+		iarg[1] = p->flags; /* int */
+		*n_args = 2;
+		break;
+	}
+	/* fchflags */
+	case 35: {
+		struct fchflags_args *p = params;
+		iarg[0] = p->fd; /* int */
+		iarg[1] = p->flags; /* int */
+		*n_args = 2;
+		break;
+	}
+	/* sync */
+	case 36: {
+		*n_args = 0;
+		break;
+	}
+	/* kill */
+	case 37: {
+		struct kill_args *p = params;
+		iarg[0] = p->pid; /* int */
+		iarg[1] = p->signum; /* int */
+		*n_args = 2;
+		break;
+	}
+	/* getppid */
+	case 39: {
+		*n_args = 0;
+		break;
+	}
+	/* dup */
+	case 41: {
+		struct dup_args *p = params;
+		uarg[0] = p->fd; /* u_int */
+		*n_args = 1;
+		break;
+	}
+	/* pipe */
+	case 42: {
+		*n_args = 0;
+		break;
+	}
+	/* getegid */
+	case 43: {
+		*n_args = 0;
+		break;
+	}
+	/* profil */
+	case 44: {
+		struct profil_args *p = params;
+		uarg[0] = (intptr_t) p->samples; /* caddr_t */
+		uarg[1] = p->size; /* size_t */
+		uarg[2] = p->offset; /* size_t */
+		uarg[3] = p->scale; /* u_int */
+		*n_args = 4;
+		break;
+	}
+	/* ktrace */
+	case 45: {
+		struct ktrace_args *p = params;
+		uarg[0] = (intptr_t) p->fname; /* const char * */
+		iarg[1] = p->ops; /* int */
+		iarg[2] = p->facs; /* int */
+		iarg[3] = p->pid; /* int */
+		*n_args = 4;
+		break;
+	}
+	/* getgid */
+	case 47: {
+		*n_args = 0;
+		break;
+	}
+	/* getlogin */
+	case 49: {
+		struct getlogin_args *p = params;
+		uarg[0] = (intptr_t) p->namebuf; /* char * */
+		uarg[1] = p->namelen; /* u_int */
+		*n_args = 2;
+		break;
+	}
+	/* setlogin */
+	case 50: {
+		struct setlogin_args *p = params;
+		uarg[0] = (intptr_t) p->namebuf; /* char * */
+		*n_args = 1;
+		break;
+	}
+	/* acct */
+	case 51: {
+		struct acct_args *p = params;
+		uarg[0] = (intptr_t) p->path; /* char * */
+		*n_args = 1;
+		break;
+	}
+	/* sigaltstack */
+	case 53: {
+		struct sigaltstack_args *p = params;
+		uarg[0] = (intptr_t) p->ss; /* stack_t * */
+		uarg[1] = (intptr_t) p->oss; /* stack_t * */
+		*n_args = 2;
+		break;
+	}
+	/* ioctl */
+	case 54: {
+		struct ioctl_args *p = params;
+		iarg[0] = p->fd; /* int */
+		uarg[1] = p->com; /* u_long */
+		uarg[2] = (intptr_t) p->data; /* caddr_t */
+		*n_args = 3;
+		break;
+	}
+	/* reboot */
+	case 55: {
+		struct reboot_args *p = params;
+		iarg[0] = p->opt; /* int */
+		*n_args = 1;
+		break;
+	}
+	/* revoke */
+	case 56: {
+		struct revoke_args *p = params;
+		uarg[0] = (intptr_t) p->path; /* char * */
+		*n_args = 1;
+		break;
+	}
+	/* symlink */
+	case 57: {
+		struct symlink_args *p = params;
+		uarg[0] = (intptr_t) p->path; /* char * */
+		uarg[1] = (intptr_t) p->link; /* char * */
+		*n_args = 2;
+		break;
+	}
+	/* readlink */
+	case 58: {
+		struct readlink_args *p = params;
+		uarg[0] = (intptr_t) p->path; /* char * */
+		uarg[1] = (intptr_t) p->buf; /* char * */
+		iarg[2] = p->count; /* int */
+		*n_args = 3;
+		break;
+	}
+	/* execve */
+	case 59: {
+		struct execve_args *p = params;
+		uarg[0] = (intptr_t) p->fname; /* char * */
+		uarg[1] = (intptr_t) p->argv; /* char ** */
+		uarg[2] = (intptr_t) p->envv; /* char ** */
+		*n_args = 3;
+		break;
+	}
+	/* umask */
+	case 60: {
+		struct umask_args *p = params;
+		iarg[0] = p->newmask; /* int */
+		*n_args = 1;
+		break;
+	}
+	/* chroot */
+	case 61: {
+		struct chroot_args *p = params;
+		uarg[0] = (intptr_t) p->path; /* char * */
+		*n_args = 1;
+		break;
+	}
+	/* msync */
+	case 65: {
+		struct msync_args *p = params;
+		uarg[0] = (intptr_t) p->addr; /* void * */
+		uarg[1] = p->len; /* size_t */
+		iarg[2] = p->flags; /* int */
+		*n_args = 3;
+		break;
+	}
+	/* vfork */
+	case 66: {
+		*n_args = 0;
+		break;
+	}
+	/* sbrk */
+	case 69: {
+		struct sbrk_args *p = params;
+		iarg[0] = p->incr; /* int */
+		*n_args = 1;
+		break;
+	}
+	/* sstk */
+	case 70: {
+		struct sstk_args *p = params;
+		iarg[0] = p->incr; /* int */
+		*n_args = 1;
+		break;
+	}
+	/* ovadvise */
+	case 72: {
+		struct ovadvise_args *p = params;
+		iarg[0] = p->anom; /* int */
+		*n_args = 1;
+		break;
+	}
+	/* munmap */
+	case 73: {
+		struct munmap_args *p = params;
+		uarg[0] = (intptr_t) p->addr; /* void * */
+		uarg[1] = p->len; /* size_t */
+		*n_args = 2;
+		break;
+	}
+	/* mprotect */
+	case 74: {
+		struct mprotect_args *p = params;
+		uarg[0] = (intptr_t) p->addr; /* const void * */
+		uarg[1] = p->len; /* size_t */
+		iarg[2] = p->prot; /* int */
+		*n_args = 3;
+		break;
+	}
+	/* madvise */
+	case 75: {
+		struct madvise_args *p = params;
+		uarg[0] = (intptr_t) p->addr; /* void * */
+		uarg[1] = p->len; /* size_t */
+		iarg[2] = p->behav; /* int */
+		*n_args = 3;
+		break;
+	}
+	/* mincore */
+	case 78: {
+		struct mincore_args *p = params;
+		uarg[0] = (intptr_t) p->addr; /* const void * */
+		uarg[1] = p->len; /* size_t */
+		uarg[2] = (intptr_t) p->vec; /* char * */
+		*n_args = 3;
+		break;
+	}
+	/* getgroups */
+	case 79: {
+		struct getgroups_args *p = params;
+		uarg[0] = p->gidsetsize; /* u_int */
+		uarg[1] = (intptr_t) p->gidset; /* gid_t * */
+		*n_args = 2;
+		break;
+	}
+	/* setgroups */
+	case 80: {
+		struct setgroups_args *p = params;
+		uarg[0] = p->gidsetsize; /* u_int */
+		uarg[1] = (intptr_t) p->gidset; /* gid_t * */
+		*n_args = 2;
+		break;
+	}
+	/* getpgrp */
+	case 81: {
+		*n_args = 0;
+		break;
+	}
+	/* setpgid */
+	case 82: {
+		struct setpgid_args *p = params;
+		iarg[0] = p->pid; /* int */
+		iarg[1] = p->pgid; /* int */
+		*n_args = 2;
+		break;
+	}
+	/* setitimer */
+	case 83: {
+		struct setitimer_args *p = params;
+		uarg[0] = p->which; /* u_int */
+		uarg[1] = (intptr_t) p->itv; /* struct itimerval * */
+		uarg[2] = (intptr_t) p->oitv; /* struct itimerval * */
+		*n_args = 3;
+		break;
+	}
+	/* swapon */
+	case 85: {
+		struct swapon_args *p = params;
+		uarg[0] = (intptr_t) p->name; /* char * */
+		*n_args = 1;
+		break;
+	}
+	/* getitimer */
+	case 86: {
+		struct getitimer_args *p = params;
+		uarg[0] = p->which; /* u_int */
+		uarg[1] = (intptr_t) p->itv; /* struct itimerval * */
+		*n_args = 2;
+		break;
+	}
+	/* getdtablesize */
+	case 89: {
+		*n_args = 0;
+		break;
+	}
+	/* dup2 */
+	case 90: {
+		struct dup2_args *p = params;
+		uarg[0] = p->from; /* u_int */
+		uarg[1] = p->to; /* u_int */
+		*n_args = 2;
+		break;
+	}
+	/* fcntl */
+	case 92: {
+		struct fcntl_args *p = params;
+		iarg[0] = p->fd; /* int */
+		iarg[1] = p->cmd; /* int */
+		iarg[2] = p->arg; /* long */
+		*n_args = 3;
+		break;
+	}
+	/* select */
+	case 93: {
+		struct select_args *p = params;
+		iarg[0] = p->nd; /* int */
+		uarg[1] = (intptr_t) p->in; /* fd_set * */
+		uarg[2] = (intptr_t) p->ou; /* fd_set * */
+		uarg[3] = (intptr_t) p->ex; /* fd_set * */
+		uarg[4] = (intptr_t) p->tv; /* struct timeval * */
+		*n_args = 5;
+		break;
+	}
+	/* fsync */
+	case 95: {
+		struct fsync_args *p = params;
+		iarg[0] = p->fd; /* int */
+		*n_args = 1;
+		break;
+	}
+	/* setpriority */
+	case 96: {
+		struct setpriority_args *p = params;
+		iarg[0] = p->which; /* int */
+		iarg[1] = p->who; /* int */
+		iarg[2] = p->prio; /* int */
+		*n_args = 3;
+		break;
+	}
+	/* socket */
+	case 97: {
+		struct socket_args *p = params;
+		iarg[0] = p->domain; /* int */
+		iarg[1] = p->type; /* int */
+		iarg[2] = p->protocol; /* int */
+		*n_args = 3;
+		break;
+	}
+	/* connect */
+	case 98: {
+		struct connect_args *p = params;
+		iarg[0] = p->s; /* int */
+		uarg[1] = (intptr_t) p->name; /* caddr_t */
+		iarg[2] = p->namelen; /* int */
+		*n_args = 3;
+		break;
+	}
+	/* getpriority */
+	case 100: {
+		struct getpriority_args *p = params;
+		iarg[0] = p->which; /* int */
+		iarg[1] = p->who; /* int */
+		*n_args = 2;
+		break;
+	}
+	/* bind */
+	case 104: {
+		struct bind_args *p = params;
+		iarg[0] = p->s; /* int */
+		uarg[1] = (intptr_t) p->name; /* caddr_t */
+		iarg[2] = p->namelen; /* int */
+		*n_args = 3;
+		break;
+	}
+	/* setsockopt */
+	case 105: {
+		struct setsockopt_args *p = params;
+		iarg[0] = p->s; /* int */
+		iarg[1] = p->level; /* int */
+		iarg[2] = p->name; /* int */
+		uarg[3] = (intptr_t) p->val; /* caddr_t */
+		iarg[4] = p->valsize; /* int */
+		*n_args = 5;
+		break;
+	}
+	/* listen */
+	case 106: {
+		struct listen_args *p = params;
+		iarg[0] = p->s; /* int */
+		iarg[1] = p->backlog; /* int */
+		*n_args = 2;
+		break;
+	}
+	/* gettimeofday */
+	case 116: {
+		struct gettimeofday_args *p = params;
+		uarg[0] = (intptr_t) p->tp; /* struct timeval * */
+		uarg[1] = (intptr_t) p->tzp; /* struct timezone * */
+		*n_args = 2;
+		break;
+	}
+	/* getrusage */
+	case 117: {
+		struct getrusage_args *p = params;
+		iarg[0] = p->who; /* int */
+		uarg[1] = (intptr_t) p->rusage; /* struct rusage * */
+		*n_args = 2;
+		break;
+	}
+	/* getsockopt */
+	case 118: {
+		struct getsockopt_args *p = params;
+		iarg[0] = p->s; /* int */
+		iarg[1] = p->level; /* int */
+		iarg[2] = p->name; /* int */
+		uarg[3] = (intptr_t) p->val; /* caddr_t */
+		uarg[4] = (intptr_t) p->avalsize; /* int * */
+		*n_args = 5;
+		break;
+	}
+	/* readv */
+	case 120: {
+		struct readv_args *p = params;
+		iarg[0] = p->fd; /* int */
+		uarg[1] = (intptr_t) p->iovp; /* struct iovec * */
+		uarg[2] = p->iovcnt; /* u_int */
+		*n_args = 3;
+		break;
+	}
+	/* writev */
+	case 121: {
+		struct writev_args *p = params;
+		iarg[0] = p->fd; /* int */
+		uarg[1] = (intptr_t) p->iovp; /* struct iovec * */
+		uarg[2] = p->iovcnt; /* u_int */
+		*n_args = 3;
+		break;
+	}
+	/* settimeofday */
+	case 122: {
+		struct settimeofday_args *p = params;
+		uarg[0] = (intptr_t) p->tv; /* struct timeval * */
+		uarg[1] = (intptr_t) p->tzp; /* struct timezone * */
+		*n_args = 2;
+		break;
+	}
+	/* fchown */
+	case 123: {
+		struct fchown_args *p = params;
+		iarg[0] = p->fd; /* int */
+		iarg[1] = p->uid; /* int */
+		iarg[2] = p->gid; /* int */
+		*n_args = 3;
+		break;
+	}
+	/* fchmod */
+	case 124: {
+		struct fchmod_args *p = params;
+		iarg[0] = p->fd; /* int */
+		iarg[1] = p->mode; /* int */
+		*n_args = 2;
+		break;
+	}
+	/* setreuid */
+	case 126: {
+		struct setreuid_args *p = params;
+		iarg[0] = p->ruid; /* int */
+		iarg[1] = p->euid; /* int */
+		*n_args = 2;
+		break;
+	}
+	/* setregid */
+	case 127: {
+		struct setregid_args *p = params;
+		iarg[0] = p->rgid; /* int */
+		iarg[1] = p->egid; /* int */
+		*n_args = 2;
+		break;
+	}
+	/* rename */
+	case 128: {
+		struct rename_args *p = params;
+		uarg[0] = (intptr_t) p->from; /* char * */
+		uarg[1] = (intptr_t) p->to; /* char * */
+		*n_args = 2;
+		break;
+	}
+	/* flock */
+	case 131: {
+		struct flock_args *p = params;
+		iarg[0] = p->fd; /* int */
+		iarg[1] = p->how; /* int */
+		*n_args = 2;
+		break;
+	}
+	/* mkfifo */
+	case 132: {
+		struct mkfifo_args *p = params;
+		uarg[0] = (intptr_t) p->path; /* char * */
+		iarg[1] = p->mode; /* int */
+		*n_args = 2;
+		break;
+	}
+	/* sendto */
+	case 133: {
+		struct sendto_args *p = params;
+		iarg[0] = p->s; /* int */
+		uarg[1] = (intptr_t) p->buf; /* caddr_t */
+		uarg[2] = p->len; /* size_t */
+		iarg[3] = p->flags; /* int */
+		uarg[4] = (intptr_t) p->to; /* caddr_t */
+		iarg[5] = p->tolen; /* int */
+		*n_args = 6;
+		break;
+	}
+	/* shutdown */
+	case 134: {
+		struct shutdown_args *p = params;
+		iarg[0] = p->s; /* int */
+		iarg[1] = p->how; /* int */
+		*n_args = 2;
+		break;
+	}
+	/* socketpair */
+	case 135: {
+		struct socketpair_args *p = params;
+		iarg[0] = p->domain; /* int */
+		iarg[1] = p->type; /* int */
+		iarg[2] = p->protocol; /* int */
+		uarg[3] = (intptr_t) p->rsv; /* int * */
+		*n_args = 4;
+		break;
+	}
+	/* mkdir */
+	case 136: {
+		struct mkdir_args *p = params;
+		uarg[0] = (intptr_t) p->path; /* char * */
+		iarg[1] = p->mode; /* int */
+		*n_args = 2;
+		break;
+	}
+	/* rmdir */
+	case 137: {
+		struct rmdir_args *p = params;
+		uarg[0] = (intptr_t) p->path; /* char * */
+		*n_args = 1;
+		break;
+	}
+	/* utimes */
+	case 138: {
+		struct utimes_args *p = params;
+		uarg[0] = (intptr_t) p->path; /* char * */
+		uarg[1] = (intptr_t) p->tptr; /* struct timeval * */
+		*n_args = 2;
+		break;
+	}
+	/* adjtime */
+	case 140: {
+		struct adjtime_args *p = params;
+		uarg[0] = (intptr_t) p->delta; /* struct timeval * */
+		uarg[1] = (intptr_t) p->olddelta; /* struct timeval * */
+		*n_args = 2;
+		break;
+	}
+	/* setsid */
+	case 147: {
+		*n_args = 0;
+		break;
+	}
+	/* quotactl */
+	case 148: {
+		struct quotactl_args *p = params;
+		uarg[0] = (intptr_t) p->path; /* char * */
+		iarg[1] = p->cmd; /* int */
+		iarg[2] = p->uid; /* int */
+		uarg[3] = (intptr_t) p->arg; /* caddr_t */
+		*n_args = 4;
+		break;
+	}
+	/* nfssvc */
+	case 155: {
+		struct nfssvc_args *p = params;
+		iarg[0] = p->flag; /* int */
+		uarg[1] = (intptr_t) p->argp; /* caddr_t */
+		*n_args = 2;
+		break;
+	}
+	/* lgetfh */
+	case 160: {
+		struct lgetfh_args *p = params;
+		uarg[0] = (intptr_t) p->fname; /* char * */
+		uarg[1] = (intptr_t) p->fhp; /* struct fhandle * */
+		*n_args = 2;
+		break;
+	}
+	/* getfh */
+	case 161: {
+		struct getfh_args *p = params;
+		uarg[0] = (intptr_t) p->fname; /* char * */
+		uarg[1] = (intptr_t) p->fhp; /* struct fhandle * */
+		*n_args = 2;
+		break;
+	}
+	/* getdomainname */
+	case 162: {
+		struct getdomainname_args *p = params;
+		uarg[0] = (intptr_t) p->domainname; /* char * */
+		iarg[1] = p->len; /* int */
+		*n_args = 2;
+		break;
+	}
+	/* setdomainname */
+	case 163: {
+		struct setdomainname_args *p = params;
+		uarg[0] = (intptr_t) p->domainname; /* char * */
+		iarg[1] = p->len; /* int */
+		*n_args = 2;
+		break;
+	}
+	/* uname */
+	case 164: {
+		struct uname_args *p = params;
+		uarg[0] = (intptr_t) p->name; /* struct utsname * */
+		*n_args = 1;
+		break;
+	}
+	/* sysarch */
+	case 165: {
+		struct sysarch_args *p = params;
+		iarg[0] = p->op; /* int */
+		uarg[1] = (intptr_t) p->parms; /* char * */
+		*n_args = 2;
+		break;
+	}
+	/* rtprio */
+	case 166: {
+		struct rtprio_args *p = params;
+		iarg[0] = p->function; /* int */
+		iarg[1] = p->pid; /* pid_t */
+		uarg[2] = (intptr_t) p->rtp; /* struct rtprio * */
+		*n_args = 3;
+		break;
+	}
+	/* semsys */
+	case 169: {
+		struct semsys_args *p = params;
+		iarg[0] = p->which; /* int */
+		iarg[1] = p->a2; /* int */
+		iarg[2] = p->a3; /* int */
+		iarg[3] = p->a4; /* int */
+		iarg[4] = p->a5; /* int */
+		*n_args = 5;
+		break;
+	}
+	/* msgsys */
+	case 170: {
+		struct msgsys_args *p = params;
+		iarg[0] = p->which; /* int */
+		iarg[1] = p->a2; /* int */
+		iarg[2] = p->a3; /* int */
+		iarg[3] = p->a4; /* int */
+		iarg[4] = p->a5; /* int */
+		iarg[5] = p->a6; /* int */
+		*n_args = 6;
+		break;
+	}
+	/* shmsys */
+	case 171: {
+		struct shmsys_args *p = params;
+		iarg[0] = p->which; /* int */
+		iarg[1] = p->a2; /* int */
+		iarg[2] = p->a3; /* int */
+		iarg[3] = p->a4; /* int */
+		*n_args = 4;
+		break;
+	}
+	/* freebsd6_pread */
+	case 173: {
+		struct freebsd6_pread_args *p = params;
+		iarg[0] = p->fd; /* int */
+		uarg[1] = (intptr_t) p->buf; /* void * */
+		uarg[2] = p->nbyte; /* size_t */
+		iarg[3] = p->pad; /* int */
+		iarg[4] = p->offset; /* off_t */
+		*n_args = 5;
+		break;
+	}
+	/* freebsd6_pwrite */
+	case 174: {
+		struct freebsd6_pwrite_args *p = params;
+		iarg[0] = p->fd; /* int */
+		uarg[1] = (intptr_t) p->buf; /* const void * */
+		uarg[2] = p->nbyte; /* size_t */
+		iarg[3] = p->pad; /* int */
+		iarg[4] = p->offset; /* off_t */
+		*n_args = 5;
+		break;
+	}
+	/* ntp_adjtime */
+	case 176: {
+		struct ntp_adjtime_args *p = params;
+		uarg[0] = (intptr_t) p->tp; /* struct timex * */
+		*n_args = 1;
+		break;
+	}
+	/* setgid */
+	case 181: {
+		struct setgid_args *p = params;
+		iarg[0] = p->gid; /* gid_t */
+		*n_args = 1;
+		break;
+	}
+	/* setegid */
+	case 182: {
+		struct setegid_args *p = params;
+		iarg[0] = p->egid; /* gid_t */
+		*n_args = 1;
+		break;
+	}
+	/* seteuid */
+	case 183: {
+		struct seteuid_args *p = params;
+		uarg[0] = p->euid; /* uid_t */
+		*n_args = 1;
+		break;
+	}
+	/* stat */
+	case 188: {
+		struct stat_args *p = params;
+		uarg[0] = (intptr_t) p->path; /* char * */
+		uarg[1] = (intptr_t) p->ub; /* struct stat * */
+		*n_args = 2;
+		break;
+	}
+	/* fstat */
+	case 189: {
+		struct fstat_args *p = params;
+		iarg[0] = p->fd; /* int */
+		uarg[1] = (intptr_t) p->sb; /* struct stat * */
+		*n_args = 2;
+		break;
+	}
+	/* lstat */
+	case 190: {
+		struct lstat_args *p = params;
+		uarg[0] = (intptr_t) p->path; /* char * */
+		uarg[1] = (intptr_t) p->ub; /* struct stat * */
+		*n_args = 2;
+		break;
+	}
+	/* pathconf */
+	case 191: {
+		struct pathconf_args *p = params;
+		uarg[0] = (intptr_t) p->path; /* char * */
+		iarg[1] = p->name; /* int */
+		*n_args = 2;
+		break;
+	}
+	/* fpathconf */
+	case 192: {
+		struct fpathconf_args *p = params;
+		iarg[0] = p->fd; /* int */
+		iarg[1] = p->name; /* int */
+		*n_args = 2;
+		break;
+	}
+	/* getrlimit */
+	case 194: {
+		struct __getrlimit_args *p = params;
+		uarg[0] = p->which; /* u_int */
+		uarg[1] = (intptr_t) p->rlp; /* struct rlimit * */
+		*n_args = 2;
+		break;
+	}
+	/* setrlimit */
+	case 195: {
+		struct __setrlimit_args *p = params;
+		uarg[0] = p->which; /* u_int */
+		uarg[1] = (intptr_t) p->rlp; /* struct rlimit * */
+		*n_args = 2;
+		break;
+	}
+	/* getdirentries */
+	case 196: {
+		struct getdirentries_args *p = params;
+		iarg[0] = p->fd; /* int */
+		uarg[1] = (intptr_t) p->buf; /* char * */
+		uarg[2] = p->count; /* u_int */
+		uarg[3] = (intptr_t) p->basep; /* long * */
+		*n_args = 4;
+		break;
+	}
+	/* freebsd6_mmap */
+	case 197: {
+		struct freebsd6_mmap_args *p = params;
+		uarg[0] = (intptr_t) p->addr; /* caddr_t */
+		uarg[1] = p->len; /* size_t */
+		iarg[2] = p->prot; /* int */
+		iarg[3] = p->flags; /* int */
+		iarg[4] = p->fd; /* int */
+		iarg[5] = p->pad; /* int */
+		iarg[6] = p->pos; /* off_t */
+		*n_args = 7;
+		break;
+	}
+	/* nosys */
+	case 198: {
+		*n_args = 0;
+		break;
+	}
+	/* freebsd6_lseek */
+	case 199: {
+		struct freebsd6_lseek_args *p = params;
+		iarg[0] = p->fd; /* int */
+		iarg[1] = p->pad; /* int */
+		iarg[2] = p->offset; /* off_t */
+		iarg[3] = p->whence; /* int */
+		*n_args = 4;
+		break;
+	}
+	/* freebsd6_truncate */
+	case 200: {
+		struct freebsd6_truncate_args *p = params;
+		uarg[0] = (intptr_t) p->path; /* char * */
+		iarg[1] = p->pad; /* int */
+		iarg[2] = p->length; /* off_t */
+		*n_args = 3;
+		break;
+	}
+	/* freebsd6_ftruncate */
+	case 201: {
+		struct freebsd6_ftruncate_args *p = params;
+		iarg[0] = p->fd; /* int */
+		iarg[1] = p->pad; /* int */
+		iarg[2] = p->length; /* off_t */
+		*n_args = 3;
+		break;
+	}
+	/* __sysctl */
+	case 202: {
+		struct sysctl_args *p = params;
+		uarg[0] = (intptr_t) p->name; /* int * */
+		uarg[1] = p->namelen; /* u_int */
+		uarg[2] = (intptr_t) p->old; /* void * */
+		uarg[3] = (intptr_t) p->oldlenp; /* size_t * */
+		uarg[4] = (intptr_t) p->new; /* void * */
+		uarg[5] = p->newlen; /* size_t */
+		*n_args = 6;
+		break;
+	}
+	/* mlock */
+	case 203: {
+		struct mlock_args *p = params;
+		uarg[0] = (intptr_t) p->addr; /* const void * */
+		uarg[1] = p->len; /* size_t */
+		*n_args = 2;
+		break;
+	}
+	/* munlock */
+	case 204: {
+		struct munlock_args *p = params;
+		uarg[0] = (intptr_t) p->addr; /* const void * */
+		uarg[1] = p->len; /* size_t */
+		*n_args = 2;
+		break;
+	}
+	/* undelete */
+	case 205: {
+		struct undelete_args *p = params;
+		uarg[0] = (intptr_t) p->path; /* char * */
+		*n_args = 1;
+		break;
+	}
+	/* futimes */
+	case 206: {
+		struct futimes_args *p = params;
+		iarg[0] = p->fd; /* int */
+		uarg[1] = (intptr_t) p->tptr; /* struct timeval * */
+		*n_args = 2;
+		break;
+	}
+	/* getpgid */
+	case 207: {
+		struct getpgid_args *p = params;
+		iarg[0] = p->pid; /* pid_t */
+		*n_args = 1;
+		break;
+	}
+	/* poll */
+	case 209: {
+		struct poll_args *p = params;
+		uarg[0] = (intptr_t) p->fds; /* struct pollfd * */
+		uarg[1] = p->nfds; /* u_int */
+		iarg[2] = p->timeout; /* int */
+		*n_args = 3;
+		break;
+	}
+	/* lkmnosys */
+	case 210: {
+		*n_args = 0;
+		break;
+	}
+	/* lkmnosys */
+	case 211: {
+		*n_args = 0;
+		break;
+	}
+	/* lkmnosys */
+	case 212: {
+		*n_args = 0;
+		break;
+	}
+	/* lkmnosys */
+	case 213: {
+		*n_args = 0;
+		break;
+	}
+	/* lkmnosys */
+	case 214: {
+		*n_args = 0;
+		break;
+	}
+	/* lkmnosys */
+	case 215: {
+		*n_args = 0;
+		break;
+	}
+	/* lkmnosys */
+	case 216: {
+		*n_args = 0;
+		break;
+	}
+	/* lkmnosys */
+	case 217: {
+		*n_args = 0;
+		break;
+	}
+	/* lkmnosys */
+	case 218: {
+		*n_args = 0;
+		break;
+	}
+	/* lkmnosys */
+	case 219: {
+		*n_args = 0;
+		break;
+	}
+	/* __semctl */
+	case 220: {
+		struct __semctl_args *p = params;
+		iarg[0] = p->semid; /* int */
+		iarg[1] = p->semnum; /* int */
+		iarg[2] = p->cmd; /* int */
+		uarg[3] = (intptr_t) p->arg; /* union semun * */
+		*n_args = 4;
+		break;
+	}
+	/* semget */
+	case 221: {
+		struct semget_args *p = params;
+		iarg[0] = p->key; /* key_t */
+		iarg[1] = p->nsems; /* int */
+		iarg[2] = p->semflg; /* int */
+		*n_args = 3;
+		break;
+	}
+	/* semop */
+	case 222: {
+		struct semop_args *p = params;
+		iarg[0] = p->semid; /* int */
+		uarg[1] = (intptr_t) p->sops; /* struct sembuf * */
+		uarg[2] = p->nsops; /* size_t */
+		*n_args = 3;
+		break;
+	}
+	/* msgctl */
+	case 224: {
+		struct msgctl_args *p = params;
+		iarg[0] = p->msqid; /* int */
+		iarg[1] = p->cmd; /* int */
+		uarg[2] = (intptr_t) p->buf; /* struct msqid_ds * */
+		*n_args = 3;
+		break;
+	}
+	/* msgget */
+	case 225: {
+		struct msgget_args *p = params;
+		iarg[0] = p->key; /* key_t */
+		iarg[1] = p->msgflg; /* int */
+		*n_args = 2;
+		break;
+	}
+	/* msgsnd */
+	case 226: {
+		struct msgsnd_args *p = params;
+		iarg[0] = p->msqid; /* int */
+		uarg[1] = (intptr_t) p->msgp; /* const void * */
+		uarg[2] = p->msgsz; /* size_t */
+		iarg[3] = p->msgflg; /* int */
+		*n_args = 4;
+		break;
+	}
+	/* msgrcv */
+	case 227: {
+		struct msgrcv_args *p = params;
+		iarg[0] = p->msqid; /* int */
+		uarg[1] = (intptr_t) p->msgp; /* void * */
+		uarg[2] = p->msgsz; /* size_t */
+		iarg[3] = p->msgtyp; /* long */
+		iarg[4] = p->msgflg; /* int */
+		*n_args = 5;
+		break;
+	}
+	/* shmat */
+	case 228: {
+		struct shmat_args *p = params;
+		iarg[0] = p->shmid; /* int */
+		uarg[1] = (intptr_t) p->shmaddr; /* const void * */
+		iarg[2] = p->shmflg; /* int */
+		*n_args = 3;
+		break;
+	}
+	/* shmctl */
+	case 229: {
+		struct shmctl_args *p = params;
+		iarg[0] = p->shmid; /* int */
+		iarg[1] = p->cmd; /* int */
+		uarg[2] = (intptr_t) p->buf; /* struct shmid_ds * */
+		*n_args = 3;
+		break;
+	}
+	/* shmdt */
+	case 230: {
+		struct shmdt_args *p = params;
+		uarg[0] = (intptr_t) p->shmaddr; /* const void * */
+		*n_args = 1;
+		break;
+	}
+	/* shmget */
+	case 231: {
+		struct shmget_args *p = params;
+		iarg[0] = p->key; /* key_t */
+		uarg[1] = p->size; /* size_t */
+		iarg[2] = p->shmflg; /* int */
+		*n_args = 3;
+		break;
+	}
+	/* clock_gettime */
+	case 232: {
+		struct clock_gettime_args *p = params;
+		iarg[0] = p->clock_id; /* clockid_t */
+		uarg[1] = (intptr_t) p->tp; /* struct timespec * */
+		*n_args = 2;
+		break;
+	}
+	/* clock_settime */
+	case 233: {
+		struct clock_settime_args *p = params;
+		iarg[0] = p->clock_id; /* clockid_t */
+		uarg[1] = (intptr_t) p->tp; /* const struct timespec * */
+		*n_args = 2;
+		break;
+	}
+	/* clock_getres */
+	case 234: {
+		struct clock_getres_args *p = params;
+		iarg[0] = p->clock_id; /* clockid_t */
+		uarg[1] = (intptr_t) p->tp; /* struct timespec * */
+		*n_args = 2;
+		break;
+	}
+	/* ktimer_create */
+	case 235: {
+		struct ktimer_create_args *p = params;
+		iarg[0] = p->clock_id; /* clockid_t */
+		uarg[1] = (intptr_t) p->evp; /* struct sigevent * */
+		uarg[2] = (intptr_t) p->timerid; /* int * */
+		*n_args = 3;
+		break;
+	}
+	/* ktimer_delete */
+	case 236: {
+		struct ktimer_delete_args *p = params;
+		iarg[0] = p->timerid; /* int */
+		*n_args = 1;
+		break;
+	}
+	/* ktimer_settime */
+	case 237: {
+		struct ktimer_settime_args *p = params;
+		iarg[0] = p->timerid; /* int */
+		iarg[1] = p->flags; /* int */
+		uarg[2] = (intptr_t) p->value; /* const struct itimerspec * */
+		uarg[3] = (intptr_t) p->ovalue; /* struct itimerspec * */
+		*n_args = 4;
+		break;
+	}
+	/* ktimer_gettime */
+	case 238: {
+		struct ktimer_gettime_args *p = params;
+		iarg[0] = p->timerid; /* int */
+		uarg[1] = (intptr_t) p->value; /* struct itimerspec * */
+		*n_args = 2;
+		break;
+	}
+	/* ktimer_getoverrun */
+	case 239: {
+		struct ktimer_getoverrun_args *p = params;
+		iarg[0] = p->timerid; /* int */
+		*n_args = 1;
+		break;
+	}
+	/* nanosleep */
+	case 240: {
+		struct nanosleep_args *p = params;
+		uarg[0] = (intptr_t) p->rqtp; /* const struct timespec * */
+		uarg[1] = (intptr_t) p->rmtp; /* struct timespec * */
+		*n_args = 2;
+		break;
+	}
+	/* ntp_gettime */
+	case 248: {
+		struct ntp_gettime_args *p = params;
+		uarg[0] = (intptr_t) p->ntvp; /* struct ntptimeval * */
+		*n_args = 1;
+		break;
+	}
+	/* minherit */
+	case 250: {
+		struct minherit_args *p = params;
+		uarg[0] = (intptr_t) p->addr; /* void * */
+		uarg[1] = p->len; /* size_t */
+		iarg[2] = p->inherit; /* int */
+		*n_args = 3;
+		break;
+	}
+	/* rfork */
+	case 251: {
+		struct rfork_args *p = params;
+		iarg[0] = p->flags; /* int */
+		*n_args = 1;
+		break;
+	}
+	/* openbsd_poll */
+	case 252: {
+		struct openbsd_poll_args *p = params;
+		uarg[0] = (intptr_t) p->fds; /* struct pollfd * */
+		uarg[1] = p->nfds; /* u_int */
+		iarg[2] = p->timeout; /* int */
+		*n_args = 3;
+		break;
+	}
+	/* issetugid */
+	case 253: {
+		*n_args = 0;
+		break;
+	}
+	/* lchown */
+	case 254: {
+		struct lchown_args *p = params;
+		uarg[0] = (intptr_t) p->path; /* char * */
+		iarg[1] = p->uid; /* int */
+		iarg[2] = p->gid; /* int */
+		*n_args = 3;
+		break;
+	}
+	/* aio_read */
+	case 255: {
+		struct aio_read_args *p = params;
+		uarg[0] = (intptr_t) p->aiocbp; /* struct aiocb * */
+		*n_args = 1;
+		break;
+	}
+	/* aio_write */
+	case 256: {
+		struct aio_write_args *p = params;
+		uarg[0] = (intptr_t) p->aiocbp; /* struct aiocb * */
+		*n_args = 1;
+		break;
+	}
+	/* lio_listio */
+	case 257: {
+		struct lio_listio_args *p = params;
+		iarg[0] = p->mode; /* int */
+		uarg[1] = (intptr_t) p->acb_list; /* struct aiocb *const * */
+		iarg[2] = p->nent; /* int */
+		uarg[3] = (intptr_t) p->sig; /* struct sigevent * */
+		*n_args = 4;
+		break;
+	}
+	/* getdents */
+	case 272: {
+		struct getdents_args *p = params;
+		iarg[0] = p->fd; /* int */
+		uarg[1] = (intptr_t) p->buf; /* char * */
+		uarg[2] = p->count; /* size_t */
+		*n_args = 3;
+		break;
+	}
+	/* lchmod */
+	case 274: {
+		struct lchmod_args *p = params;
+		uarg[0] = (intptr_t) p->path; /* char * */
+		iarg[1] = p->mode; /* mode_t */
+		*n_args = 2;
+		break;
+	}
+	/* lchown */
+	case 275: {
+		struct lchown_args *p = params;
+		uarg[0] = (intptr_t) p->path; /* char * */
+		uarg[1] = p->uid; /* uid_t */
+		iarg[2] = p->gid; /* gid_t */
+		*n_args = 3;
+		break;
+	}
+	/* lutimes */
+	case 276: {
+		struct lutimes_args *p = params;
+		uarg[0] = (intptr_t) p->path; /* char * */
+		uarg[1] = (intptr_t) p->tptr; /* struct timeval * */
+		*n_args = 2;
+		break;
+	}
+	/* msync */
+	case 277: {
+		struct msync_args *p = params;
+		uarg[0] = (intptr_t) p->addr; /* void * */
+		uarg[1] = p->len; /* size_t */
+		iarg[2] = p->flags; /* int */
+		*n_args = 3;
+		break;
+	}
+	/* nstat */
+	case 278: {
+		struct nstat_args *p = params;
+		uarg[0] = (intptr_t) p->path; /* char * */
+		uarg[1] = (intptr_t) p->ub; /* struct nstat * */
+		*n_args = 2;
+		break;
+	}
+	/* nfstat */
+	case 279: {
+		struct nfstat_args *p = params;
+		iarg[0] = p->fd; /* int */
+		uarg[1] = (intptr_t) p->sb; /* struct nstat * */
+		*n_args = 2;
+		break;
+	}
+	/* nlstat */
+	case 280: {
+		struct nlstat_args *p = params;
+		uarg[0] = (intptr_t) p->path; /* char * */
+		uarg[1] = (intptr_t) p->ub; /* struct nstat * */
+		*n_args = 2;
+		break;
+	}
+	/* preadv */
+	case 289: {
+		struct preadv_args *p = params;
+		iarg[0] = p->fd; /* int */
+		uarg[1] = (intptr_t) p->iovp; /* struct iovec * */
+		uarg[2] = p->iovcnt; /* u_int */
+		iarg[3] = p->offset; /* off_t */
+		*n_args = 4;
+		break;
+	}
+	/* pwritev */
+	case 290: {
+		struct pwritev_args *p = params;
+		iarg[0] = p->fd; /* int */
+		uarg[1] = (intptr_t) p->iovp; /* struct iovec * */
+		uarg[2] = p->iovcnt; /* u_int */
+		iarg[3] = p->offset; /* off_t */
+		*n_args = 4;
+		break;
+	}
+	/* fhopen */
+	case 298: {
+		struct fhopen_args *p = params;
+		uarg[0] = (intptr_t) p->u_fhp; /* const struct fhandle * */
+		iarg[1] = p->flags; /* int */
+		*n_args = 2;
+		break;
+	}
+	/* fhstat */
+	case 299: {
+		struct fhstat_args *p = params;
+		uarg[0] = (intptr_t) p->u_fhp; /* const struct fhandle * */
+		uarg[1] = (intptr_t) p->sb; /* struct stat * */
+		*n_args = 2;
+		break;
+	}
+	/* modnext */
+	case 300: {
+		struct modnext_args *p = params;
+		iarg[0] = p->modid; /* int */
+		*n_args = 1;
+		break;
+	}
+	/* modstat */
+	case 301: {
+		struct modstat_args *p = params;
+		iarg[0] = p->modid; /* int */
+		uarg[1] = (intptr_t) p->stat; /* struct module_stat * */
+		*n_args = 2;
+		break;
+	}
+	/* modfnext */
+	case 302: {
+		struct modfnext_args *p = params;
+		iarg[0] = p->modid; /* int */
+		*n_args = 1;
+		break;
+	}
+	/* modfind */
+	case 303: {
+		struct modfind_args *p = params;
+		uarg[0] = (intptr_t) p->name; /* const char * */
+		*n_args = 1;
+		break;
+	}
+	/* kldload */
+	case 304: {
+		struct kldload_args *p = params;
+		uarg[0] = (intptr_t) p->file; /* const char * */
+		*n_args = 1;
+		break;
+	}
+	/* kldunload */
+	case 305: {
+		struct kldunload_args *p = params;
+		iarg[0] = p->fileid; /* int */
+		*n_args = 1;
+		break;
+	}
+	/* kldfind */
+	case 306: {
+		struct kldfind_args *p = params;
+		uarg[0] = (intptr_t) p->file; /* const char * */
+		*n_args = 1;
+		break;
+	}
+	/* kldnext */
+	case 307: {
+		struct kldnext_args *p = params;
+		iarg[0] = p->fileid; /* int */
+		*n_args = 1;
+		break;
+	}
+	/* kldstat */
+	case 308: {
+		struct kldstat_args *p = params;
+		iarg[0] = p->fileid; /* int */
+		uarg[1] = (intptr_t) p->stat; /* struct kld_file_stat * */
+		*n_args = 2;
+		break;
+	}
+	/* kldfirstmod */
+	case 309: {
+		struct kldfirstmod_args *p = params;
+		iarg[0] = p->fileid; /* int */
+		*n_args = 1;
+		break;
+	}
+	/* getsid */
+	case 310: {
+		struct getsid_args *p = params;
+		iarg[0] = p->pid; /* pid_t */
+		*n_args = 1;
+		break;
+	}
+	/* setresuid */
+	case 311: {
+		struct setresuid_args *p = params;
+		uarg[0] = p->ruid; /* uid_t */
+		uarg[1] = p->euid; /* uid_t */
+		uarg[2] = p->suid; /* uid_t */
+		*n_args = 3;
+		break;
+	}
+	/* setresgid */
+	case 312: {
+		struct setresgid_args *p = params;
+		iarg[0] = p->rgid; /* gid_t */
+		iarg[1] = p->egid; /* gid_t */
+		iarg[2] = p->sgid; /* gid_t */
+		*n_args = 3;
+		break;
+	}
+	/* aio_return */
+	case 314: {
+		struct aio_return_args *p = params;
+		uarg[0] = (intptr_t) p->aiocbp; /* struct aiocb * */
+		*n_args = 1;
+		break;
+	}
+	/* aio_suspend */
+	case 315: {
+		struct aio_suspend_args *p = params;
+		uarg[0] = (intptr_t) p->aiocbp; /* struct aiocb *const * */
+		iarg[1] = p->nent; /* int */
+		uarg[2] = (intptr_t) p->timeout; /* const struct timespec * */
+		*n_args = 3;
+		break;
+	}
+	/* aio_cancel */
+	case 316: {
+		struct aio_cancel_args *p = params;
+		iarg[0] = p->fd; /* int */
+		uarg[1] = (intptr_t) p->aiocbp; /* struct aiocb * */
+		*n_args = 2;
+		break;
+	}
+	/* aio_error */
+	case 317: {
+		struct aio_error_args *p = params;
+		uarg[0] = (intptr_t) p->aiocbp; /* struct aiocb * */
+		*n_args = 1;
+		break;
+	}
+	/* oaio_read */
+	case 318: {
+		struct oaio_read_args *p = params;
+		uarg[0] = (intptr_t) p->aiocbp; /* struct oaiocb * */
+		*n_args = 1;
+		break;
+	}
+	/* oaio_write */
+	case 319: {
+		struct oaio_write_args *p = params;
+		uarg[0] = (intptr_t) p->aiocbp; /* struct oaiocb * */
+		*n_args = 1;
+		break;
+	}
+	/* olio_listio */
+	case 320: {
+		struct olio_listio_args *p = params;
+		iarg[0] = p->mode; /* int */
+		uarg[1] = (intptr_t) p->acb_list; /* struct oaiocb *const * */
+		iarg[2] = p->nent; /* int */
+		uarg[3] = (intptr_t) p->sig; /* struct osigevent * */
+		*n_args = 4;
+		break;
+	}
+	/* yield */
+	case 321: {
+		*n_args = 0;
+		break;
+	}
+	/* mlockall */
+	case 324: {
+		struct mlockall_args *p = params;
+		iarg[0] = p->how; /* int */
+		*n_args = 1;
+		break;
+	}
+	/* munlockall */
+	case 325: {
+		*n_args = 0;
+		break;
+	}
+	/* __getcwd */
+	case 326: {
+		struct __getcwd_args *p = params;
+		uarg[0] = (intptr_t) p->buf; /* u_char * */
+		uarg[1] = p->buflen; /* u_int */
+		*n_args = 2;
+		break;
+	}
+	/* sched_setparam */
+	case 327: {
+		struct sched_setparam_args *p = params;
+		iarg[0] = p->pid; /* pid_t */
+		uarg[1] = (intptr_t) p->param; /* const struct sched_param * */
+		*n_args = 2;
+		break;
+	}
+	/* sched_getparam */
+	case 328: {
+		struct sched_getparam_args *p = params;
+		iarg[0] = p->pid; /* pid_t */
+		uarg[1] = (intptr_t) p->param; /* struct sched_param * */
+		*n_args = 2;
+		break;
+	}
+	/* sched_setscheduler */
+	case 329: {
+		struct sched_setscheduler_args *p = params;
+		iarg[0] = p->pid; /* pid_t */
+		iarg[1] = p->policy; /* int */
+		uarg[2] = (intptr_t) p->param; /* const struct sched_param * */
+		*n_args = 3;
+		break;
+	}
+	/* sched_getscheduler */
+	case 330: {
+		struct sched_getscheduler_args *p = params;
+		iarg[0] = p->pid; /* pid_t */
+		*n_args = 1;
+		break;
+	}
+	/* sched_yield */
+	case 331: {
+		*n_args = 0;
+		break;
+	}
+	/* sched_get_priority_max */
+	case 332: {
+		struct sched_get_priority_max_args *p = params;
+		iarg[0] = p->policy; /* int */
+		*n_args = 1;
+		break;
+	}
+	/* sched_get_priority_min */
+	case 333: {
+		struct sched_get_priority_min_args *p = params;
+		iarg[0] = p->policy; /* int */
+		*n_args = 1;
+		break;
+	}
+	/* sched_rr_get_interval */
+	case 334: {
+		struct sched_rr_get_interval_args *p = params;
+		iarg[0] = p->pid; /* pid_t */
+		uarg[1] = (intptr_t) p->interval; /* struct timespec * */
+		*n_args = 2;
+		break;
+	}
+	/* utrace */
+	case 335: {
+		struct utrace_args *p = params;
+		uarg[0] = (intptr_t) p->addr; /* const void * */
+		uarg[1] = p->len; /* size_t */
+		*n_args = 2;
+		break;
+	}
+	/* kldsym */
+	case 337: {
+		struct kldsym_args *p = params;
+		iarg[0] = p->fileid; /* int */
+		iarg[1] = p->cmd; /* int */
+		uarg[2] = (intptr_t) p->data; /* void * */
+		*n_args = 3;
+		break;
+	}
+	/* jail */
+	case 338: {
+		struct jail_args *p = params;
+		uarg[0] = (intptr_t) p->jail; /* struct jail * */
+		*n_args = 1;
+		break;
+	}
+	/* sigprocmask */
+	case 340: {
+		struct sigprocmask_args *p = params;
+		iarg[0] = p->how; /* int */
+		uarg[1] = (intptr_t) p->set; /* const sigset_t * */
+		uarg[2] = (intptr_t) p->oset; /* sigset_t * */
+		*n_args = 3;
+		break;
+	}
+	/* sigsuspend */
+	case 341: {
+		struct sigsuspend_args *p = params;
+		uarg[0] = (intptr_t) p->sigmask; /* const sigset_t * */
+		*n_args = 1;
+		break;
+	}
+	/* sigpending */
+	case 343: {
+		struct sigpending_args *p = params;
+		uarg[0] = (intptr_t) p->set; /* sigset_t * */
+		*n_args = 1;
+		break;
+	}
+	/* sigtimedwait */
+	case 345: {
+		struct sigtimedwait_args *p = params;
+		uarg[0] = (intptr_t) p->set; /* const sigset_t * */
+		uarg[1] = (intptr_t) p->info; /* siginfo_t * */
+		uarg[2] = (intptr_t) p->timeout; /* const struct timespec * */
+		*n_args = 3;
+		break;
+	}
+	/* sigwaitinfo */
+	case 346: {
+		struct sigwaitinfo_args *p = params;
+		uarg[0] = (intptr_t) p->set; /* const sigset_t * */
+		uarg[1] = (intptr_t) p->info; /* siginfo_t * */
+		*n_args = 2;
+		break;
+	}
+	/* __acl_get_file */
+	case 347: {
+		struct __acl_get_file_args *p = params;
+		uarg[0] = (intptr_t) p->path; /* const char * */
+		iarg[1] = p->type; /* acl_type_t */
+		uarg[2] = (intptr_t) p->aclp; /* struct acl * */
+		*n_args = 3;
+		break;
+	}
+	/* __acl_set_file */
+	case 348: {
+		struct __acl_set_file_args *p = params;
+		uarg[0] = (intptr_t) p->path; /* const char * */
+		iarg[1] = p->type; /* acl_type_t */
+		uarg[2] = (intptr_t) p->aclp; /* struct acl * */
+		*n_args = 3;
+		break;
+	}
+	/* __acl_get_fd */
+	case 349: {
+		struct __acl_get_fd_args *p = params;
+		iarg[0] = p->filedes; /* int */
+		iarg[1] = p->type; /* acl_type_t */
+		uarg[2] = (intptr_t) p->aclp; /* struct acl * */
+		*n_args = 3;
+		break;
+	}
+	/* __acl_set_fd */
+	case 350: {
+		struct __acl_set_fd_args *p = params;
+		iarg[0] = p->filedes; /* int */
+		iarg[1] = p->type; /* acl_type_t */
+		uarg[2] = (intptr_t) p->aclp; /* struct acl * */
+		*n_args = 3;
+		break;
+	}
+	/* __acl_delete_file */
+	case 351: {
+		struct __acl_delete_file_args *p = params;
+		uarg[0] = (intptr_t) p->path; /* const char * */
+		iarg[1] = p->type; /* acl_type_t */
+		*n_args = 2;
+		break;
+	}
+	/* __acl_delete_fd */
+	case 352: {
+		struct __acl_delete_fd_args *p = params;
+		iarg[0] = p->filedes; /* int */
+		iarg[1] = p->type; /* acl_type_t */
+		*n_args = 2;
+		break;
+	}
+	/* __acl_aclcheck_file */
+	case 353: {
+		struct __acl_aclcheck_file_args *p = params;
+		uarg[0] = (intptr_t) p->path; /* const char * */
+		iarg[1] = p->type; /* acl_type_t */
+		uarg[2] = (intptr_t) p->aclp; /* struct acl * */
+		*n_args = 3;
+		break;
+	}
+	/* __acl_aclcheck_fd */
+	case 354: {
+		struct __acl_aclcheck_fd_args *p = params;
+		iarg[0] = p->filedes; /* int */
+		iarg[1] = p->type; /* acl_type_t */
+		uarg[2] = (intptr_t) p->aclp; /* struct acl * */
+		*n_args = 3;
+		break;
+	}
+	/* extattrctl */
+	case 355: {
+		struct extattrctl_args *p = params;
+		uarg[0] = (intptr_t) p->path; /* const char * */
+		iarg[1] = p->cmd; /* int */
+		uarg[2] = (intptr_t) p->filename; /* const char * */
+		iarg[3] = p->attrnamespace; /* int */
+		uarg[4] = (intptr_t) p->attrname; /* const char * */
+		*n_args = 5;
+		break;
+	}
+	/* extattr_set_file */
+	case 356: {
+		struct extattr_set_file_args *p = params;
+		uarg[0] = (intptr_t) p->path; /* const char * */
+		iarg[1] = p->attrnamespace; /* int */
+		uarg[2] = (intptr_t) p->attrname; /* const char * */
+		uarg[3] = (intptr_t) p->data; /* void * */
+		uarg[4] = p->nbytes; /* size_t */
+		*n_args = 5;
+		break;
+	}
+	/* extattr_get_file */
+	case 357: {
+		struct extattr_get_file_args *p = params;
+		uarg[0] = (intptr_t) p->path; /* const char * */
+		iarg[1] = p->attrnamespace; /* int */
+		uarg[2] = (intptr_t) p->attrname; /* const char * */
+		uarg[3] = (intptr_t) p->data; /* void * */
+		uarg[4] = p->nbytes; /* size_t */
+		*n_args = 5;
+		break;
+	}
+	/* extattr_delete_file */
+	case 358: {
+		struct extattr_delete_file_args *p = params;
+		uarg[0] = (intptr_t) p->path; /* const char * */
+		iarg[1] = p->attrnamespace; /* int */
+		uarg[2] = (intptr_t) p->attrname; /* const char * */
+		*n_args = 3;
+		break;
+	}
+	/* aio_waitcomplete */
+	case 359: {
+		struct aio_waitcomplete_args *p = params;
+		uarg[0] = (intptr_t) p->aiocbp; /* struct aiocb ** */
+		uarg[1] = (intptr_t) p->timeout; /* struct timespec * */
+		*n_args = 2;
+		break;
+	}
+	/* getresuid */
+	case 360: {
+		struct getresuid_args *p = params;
+		uarg[0] = (intptr_t) p->ruid; /* uid_t * */
+		uarg[1] = (intptr_t) p->euid; /* uid_t * */
+		uarg[2] = (intptr_t) p->suid; /* uid_t * */
+		*n_args = 3;
+		break;
+	}
+	/* getresgid */
+	case 361: {
+		struct getresgid_args *p = params;
+		uarg[0] = (intptr_t) p->rgid; /* gid_t * */
+		uarg[1] = (intptr_t) p->egid; /* gid_t * */
+		uarg[2] = (intptr_t) p->sgid; /* gid_t * */
+		*n_args = 3;
+		break;
+	}
+	/* kqueue */
+	case 362: {
+		*n_args = 0;
+		break;
+	}
+	/* kevent */
+	case 363: {
+		struct kevent_args *p = params;
+		iarg[0] = p->fd; /* int */
+		uarg[1] = (intptr_t) p->changelist; /* struct kevent * */
+		iarg[2] = p->nchanges; /* int */
+		uarg[3] = (intptr_t) p->eventlist; /* struct kevent * */
+		iarg[4] = p->nevents; /* int */
+		uarg[5] = (intptr_t) p->timeout; /* const struct timespec * */
+		*n_args = 6;
+		break;
+	}
+	/* lkmressys */
+	case 370: {
+		*n_args = 0;
+		break;
+	}
+	/* extattr_set_fd */
+	case 371: {
+		struct extattr_set_fd_args *p = params;
+		iarg[0] = p->fd; /* int */
+		iarg[1] = p->attrnamespace; /* int */
+		uarg[2] = (intptr_t) p->attrname; /* const char * */
+		uarg[3] = (intptr_t) p->data; /* void * */
+		uarg[4] = p->nbytes; /* size_t */
+		*n_args = 5;
+		break;
+	}
+	/* extattr_get_fd */
+	case 372: {
+		struct extattr_get_fd_args *p = params;
+		iarg[0] = p->fd; /* int */
+		iarg[1] = p->attrnamespace; /* int */
+		uarg[2] = (intptr_t) p->attrname; /* const char * */
+		uarg[3] = (intptr_t) p->data; /* void * */
+		uarg[4] = p->nbytes; /* size_t */
+		*n_args = 5;
+		break;
+	}
+	/* extattr_delete_fd */
+	case 373: {
+		struct extattr_delete_fd_args *p = params;
+		iarg[0] = p->fd; /* int */
+		iarg[1] = p->attrnamespace; /* int */
+		uarg[2] = (intptr_t) p->attrname; /* const char * */
+		*n_args = 3;
+		break;
+	}
+	/* __setugid */
+	case 374: {
+		struct __setugid_args *p = params;
+		iarg[0] = p->flag; /* int */
+		*n_args = 1;
+		break;
+	}
+	/* nfsclnt */
+	case 375: {
+		struct nfsclnt_args *p = params;
+		iarg[0] = p->flag; /* int */
+		uarg[1] = (intptr_t) p->argp; /* caddr_t */
+		*n_args = 2;
+		break;
+	}
+	/* eaccess */
+	case 376: {
+		struct eaccess_args *p = params;
+		uarg[0] = (intptr_t) p->path; /* char * */
+		iarg[1] = p->flags; /* int */
+		*n_args = 2;
+		break;
+	}
+	/* nmount */
+	case 378: {
+		struct nmount_args *p = params;
+		uarg[0] = (intptr_t) p->iovp; /* struct iovec * */
+		uarg[1] = p->iovcnt; /* unsigned int */
+		iarg[2] = p->flags; /* int */
+		*n_args = 3;
+		break;
+	}
+	/* kse_exit */
+	case 379: {
+		*n_args = 0;
+		break;
+	}
+	/* kse_wakeup */
+	case 380: {
+		struct kse_wakeup_args *p = params;
+		uarg[0] = (intptr_t) p->mbx; /* struct kse_mailbox * */
+		*n_args = 1;
+		break;
+	}
+	/* kse_create */
+	case 381: {
+		struct kse_create_args *p = params;
+		uarg[0] = (intptr_t) p->mbx; /* struct kse_mailbox * */
+		iarg[1] = p->newgroup; /* int */
+		*n_args = 2;
+		break;
+	}
+	/* kse_thr_interrupt */
+	case 382: {
+		struct kse_thr_interrupt_args *p = params;
+		uarg[0] = (intptr_t) p->tmbx; /* struct kse_thr_mailbox * */
+		iarg[1] = p->cmd; /* int */
+		iarg[2] = p->data; /* long */
+		*n_args = 3;
+		break;
+	}
+	/* kse_release */
+	case 383: {
+		struct kse_release_args *p = params;
+		uarg[0] = (intptr_t) p->timeout; /* struct timespec * */
+		*n_args = 1;
+		break;
+	}
+	/* __mac_get_proc */
+	case 384: {
+		struct __mac_get_proc_args *p = params;
+		uarg[0] = (intptr_t) p->mac_p; /* struct mac * */
+		*n_args = 1;
+		break;
+	}
+	/* __mac_set_proc */
+	case 385: {
+		struct __mac_set_proc_args *p = params;
+		uarg[0] = (intptr_t) p->mac_p; /* struct mac * */
+		*n_args = 1;
+		break;
+	}
+	/* __mac_get_fd */
+	case 386: {
+		struct __mac_get_fd_args *p = params;
+		iarg[0] = p->fd; /* int */
+		uarg[1] = (intptr_t) p->mac_p; /* struct mac * */
+		*n_args = 2;
+		break;
+	}
+	/* __mac_get_file */
+	case 387: {
+		struct __mac_get_file_args *p = params;
+		uarg[0] = (intptr_t) p->path_p; /* const char * */
+		uarg[1] = (intptr_t) p->mac_p; /* struct mac * */
+		*n_args = 2;
+		break;
+	}
+	/* __mac_set_fd */
+	case 388: {
+		struct __mac_set_fd_args *p = params;
+		iarg[0] = p->fd; /* int */
+		uarg[1] = (intptr_t) p->mac_p; /* struct mac * */
+		*n_args = 2;
+		break;
+	}
+	/* __mac_set_file */
+	case 389: {
+		struct __mac_set_file_args *p = params;
+		uarg[0] = (intptr_t) p->path_p; /* const char * */
+		uarg[1] = (intptr_t) p->mac_p; /* struct mac * */
+		*n_args = 2;
+		break;
+	}
+	/* kenv */
+	case 390: {
+		struct kenv_args *p = params;
+		iarg[0] = p->what; /* int */
+		uarg[1] = (intptr_t) p->name; /* const char * */
+		uarg[2] = (intptr_t) p->value; /* char * */
+		iarg[3] = p->len; /* int */
+		*n_args = 4;
+		break;
+	}
+	/* lchflags */
+	case 391: {
+		struct lchflags_args *p = params;
+		uarg[0] = (intptr_t) p->path; /* const char * */
+		iarg[1] = p->flags; /* int */
+		*n_args = 2;
+		break;
+	}
+	/* uuidgen */
+	case 392: {
+		struct uuidgen_args *p = params;
+		uarg[0] = (intptr_t) p->store; /* struct uuid * */
+		iarg[1] = p->count; /* int */
+		*n_args = 2;
+		break;
+	}
+	/* sendfile */
+	case 393: {
+		struct sendfile_args *p = params;
+		iarg[0] = p->fd; /* int */
+		iarg[1] = p->s; /* int */
+		iarg[2] = p->offset; /* off_t */
+		uarg[3] = p->nbytes; /* size_t */
+		uarg[4] = (intptr_t) p->hdtr; /* struct sf_hdtr * */
+		uarg[5] = (intptr_t) p->sbytes; /* off_t * */
+		iarg[6] = p->flags; /* int */
+		*n_args = 7;
+		break;
+	}
+	/* mac_syscall */
+	case 394: {
+		struct mac_syscall_args *p = params;
+		uarg[0] = (intptr_t) p->policy; /* const char * */
+		iarg[1] = p->call; /* int */
+		uarg[2] = (intptr_t) p->arg; /* void * */
+		*n_args = 3;
+		break;
+	}
+	/* getfsstat */
+	case 395: {
+		struct getfsstat_args *p = params;
+		uarg[0] = (intptr_t) p->buf; /* struct statfs * */
+		iarg[1] = p->bufsize; /* long */
+		iarg[2] = p->flags; /* int */
+		*n_args = 3;
+		break;
+	}
+	/* statfs */
+	case 396: {
+		struct statfs_args *p = params;
+		uarg[0] = (intptr_t) p->path; /* char * */
+		uarg[1] = (intptr_t) p->buf; /* struct statfs * */
+		*n_args = 2;
+		break;
+	}
+	/* fstatfs */
+	case 397: {
+		struct fstatfs_args *p = params;
+		iarg[0] = p->fd; /* int */
+		uarg[1] = (intptr_t) p->buf; /* struct statfs * */
+		*n_args = 2;
+		break;
+	}
+	/* fhstatfs */
+	case 398: {
+		struct fhstatfs_args *p = params;
+		uarg[0] = (intptr_t) p->u_fhp; /* const struct fhandle * */
+		uarg[1] = (intptr_t) p->buf; /* struct statfs * */
+		*n_args = 2;
+		break;
+	}
+	/* ksem_close */
+	case 400: {
+		struct ksem_close_args *p = params;
+		iarg[0] = p->id; /* semid_t */
+		*n_args = 1;
+		break;
+	}
+	/* ksem_post */
+	case 401: {
+		struct ksem_post_args *p = params;
+		iarg[0] = p->id; /* semid_t */
+		*n_args = 1;
+		break;
+	}
+	/* ksem_wait */
+	case 402: {
+		struct ksem_wait_args *p = params;
+		iarg[0] = p->id; /* semid_t */
+		*n_args = 1;
+		break;
+	}
+	/* ksem_trywait */
+	case 403: {
+		struct ksem_trywait_args *p = params;
+		iarg[0] = p->id; /* semid_t */
+		*n_args = 1;
+		break;
+	}
+	/* ksem_init */
+	case 404: {
+		struct ksem_init_args *p = params;
+		uarg[0] = (intptr_t) p->idp; /* semid_t * */
+		uarg[1] = p->value; /* unsigned int */
+		*n_args = 2;
+		break;
+	}
+	/* ksem_open */
+	case 405: {
+		struct ksem_open_args *p = params;
+		uarg[0] = (intptr_t) p->idp; /* semid_t * */
+		uarg[1] = (intptr_t) p->name; /* const char * */
+		iarg[2] = p->oflag; /* int */
+		iarg[3] = p->mode; /* mode_t */
+		uarg[4] = p->value; /* unsigned int */
+		*n_args = 5;
+		break;
+	}
+	/* ksem_unlink */
+	case 406: {
+		struct ksem_unlink_args *p = params;
+		uarg[0] = (intptr_t) p->name; /* const char * */
+		*n_args = 1;
+		break;
+	}
+	/* ksem_getvalue */
+	case 407: {
+		struct ksem_getvalue_args *p = params;
+		iarg[0] = p->id; /* semid_t */
+		uarg[1] = (intptr_t) p->val; /* int * */
+		*n_args = 2;
+		break;
+	}
+	/* ksem_destroy */
+	case 408: {
+		struct ksem_destroy_args *p = params;
+		iarg[0] = p->id; /* semid_t */
+		*n_args = 1;
+		break;
+	}
+	/* __mac_get_pid */
+	case 409: {
+		struct __mac_get_pid_args *p = params;
+		iarg[0] = p->pid; /* pid_t */
+		uarg[1] = (intptr_t) p->mac_p; /* struct mac * */
+		*n_args = 2;
+		break;
+	}
+	/* __mac_get_link */
+	case 410: {
+		struct __mac_get_link_args *p = params;
+		uarg[0] = (intptr_t) p->path_p; /* const char * */
+		uarg[1] = (intptr_t) p->mac_p; /* struct mac * */
+		*n_args = 2;
+		break;
+	}
+	/* __mac_set_link */
+	case 411: {
+		struct __mac_set_link_args *p = params;
+		uarg[0] = (intptr_t) p->path_p; /* const char * */
+		uarg[1] = (intptr_t) p->mac_p; /* struct mac * */
+		*n_args = 2;
+		break;
+	}
+	/* extattr_set_link */
+	case 412: {
+		struct extattr_set_link_args *p = params;
+		uarg[0] = (intptr_t) p->path; /* const char * */
+		iarg[1] = p->attrnamespace; /* int */
+		uarg[2] = (intptr_t) p->attrname; /* const char * */
+		uarg[3] = (intptr_t) p->data; /* void * */
+		uarg[4] = p->nbytes; /* size_t */
+		*n_args = 5;
+		break;
+	}
+	/* extattr_get_link */
+	case 413: {
+		struct extattr_get_link_args *p = params;
+		uarg[0] = (intptr_t) p->path; /* const char * */
+		iarg[1] = p->attrnamespace; /* int */
+		uarg[2] = (intptr_t) p->attrname; /* const char * */
+		uarg[3] = (intptr_t) p->data; /* void * */
+		uarg[4] = p->nbytes; /* size_t */
+		*n_args = 5;
+		break;
+	}
+	/* extattr_delete_link */
+	case 414: {
+		struct extattr_delete_link_args *p = params;
+		uarg[0] = (intptr_t) p->path; /* const char * */
+		iarg[1] = p->attrnamespace; /* int */
+		uarg[2] = (intptr_t) p->attrname; /* const char * */
+		*n_args = 3;
+		break;
+	}
+	/* __mac_execve */
+	case 415: {
+		struct __mac_execve_args *p = params;
+		uarg[0] = (intptr_t) p->fname; /* char * */
+		uarg[1] = (intptr_t) p->argv; /* char ** */
+		uarg[2] = (intptr_t) p->envv; /* char ** */
+		uarg[3] = (intptr_t) p->mac_p; /* struct mac * */
+		*n_args = 4;
+		break;
+	}
+	/* sigaction */
+	case 416: {
+		struct sigaction_args *p = params;
+		iarg[0] = p->sig; /* int */
+		uarg[1] = (intptr_t) p->act; /* const struct sigaction * */
+		uarg[2] = (intptr_t) p->oact; /* struct sigaction * */
+		*n_args = 3;
+		break;
+	}
+	/* sigreturn */
+	case 417: {
+		struct sigreturn_args *p = params;
+		uarg[0] = (intptr_t) p->sigcntxp; /* const struct __ucontext * */
+		*n_args = 1;
+		break;
+	}
+	/* getcontext */
+	case 421: {
+		struct getcontext_args *p = params;
+		uarg[0] = (intptr_t) p->ucp; /* struct __ucontext * */
+		*n_args = 1;
+		break;
+	}
+	/* setcontext */
+	case 422: {
+		struct setcontext_args *p = params;
+		uarg[0] = (intptr_t) p->ucp; /* const struct __ucontext * */
+		*n_args = 1;
+		break;
+	}
+	/* swapcontext */
+	case 423: {
+		struct swapcontext_args *p = params;
+		uarg[0] = (intptr_t) p->oucp; /* struct __ucontext * */
+		uarg[1] = (intptr_t) p->ucp; /* const struct __ucontext * */
+		*n_args = 2;
+		break;
+	}
+	/* swapoff */
+	case 424: {
+		struct swapoff_args *p = params;
+		uarg[0] = (intptr_t) p->name; /* const char * */
+		*n_args = 1;
+		break;
+	}
+	/* __acl_get_link */
+	case 425: {
+		struct __acl_get_link_args *p = params;
+		uarg[0] = (intptr_t) p->path; /* const char * */
+		iarg[1] = p->type; /* acl_type_t */
+		uarg[2] = (intptr_t) p->aclp; /* struct acl * */
+		*n_args = 3;
+		break;
+	}
+	/* __acl_set_link */
+	case 426: {
+		struct __acl_set_link_args *p = params;
+		uarg[0] = (intptr_t) p->path; /* const char * */
+		iarg[1] = p->type; /* acl_type_t */
+		uarg[2] = (intptr_t) p->aclp; /* struct acl * */
+		*n_args = 3;
+		break;
+	}
+	/* __acl_delete_link */
+	case 427: {
+		struct __acl_delete_link_args *p = params;
+		uarg[0] = (intptr_t) p->path; /* const char * */
+		iarg[1] = p->type; /* acl_type_t */
+		*n_args = 2;
+		break;
+	}
+	/* __acl_aclcheck_link */
+	case 428: {
+		struct __acl_aclcheck_link_args *p = params;
+		uarg[0] = (intptr_t) p->path; /* const char * */
+		iarg[1] = p->type; /* acl_type_t */
+		uarg[2] = (intptr_t) p->aclp; /* struct acl * */
+		*n_args = 3;
+		break;
+	}
+	/* sigwait */
+	case 429: {
+		struct sigwait_args *p = params;
+		uarg[0] = (intptr_t) p->set; /* const sigset_t * */
+		uarg[1] = (intptr_t) p->sig; /* int * */
+		*n_args = 2;
+		break;
+	}
+	/* thr_create */
+	case 430: {
+		struct thr_create_args *p = params;
+		uarg[0] = (intptr_t) p->ctx; /* ucontext_t * */
+		uarg[1] = (intptr_t) p->id; /* long * */
+		iarg[2] = p->flags; /* int */
+		*n_args = 3;
+		break;
+	}
+	/* thr_exit */
+	case 431: {
+		struct thr_exit_args *p = params;
+		uarg[0] = (intptr_t) p->state; /* long * */
+		*n_args = 1;
+		break;
+	}
+	/* thr_self */
+	case 432: {
+		struct thr_self_args *p = params;
+		uarg[0] = (intptr_t) p->id; /* long * */
+		*n_args = 1;
+		break;
+	}
+	/* thr_kill */
+	case 433: {
+		struct thr_kill_args *p = params;
+		iarg[0] = p->id; /* long */
+		iarg[1] = p->sig; /* int */
+		*n_args = 2;
+		break;
+	}
+	/* _umtx_lock */
+	case 434: {
+		struct _umtx_lock_args *p = params;
+		uarg[0] = (intptr_t) p->umtx; /* struct umtx * */
+		*n_args = 1;
+		break;
+	}
+	/* _umtx_unlock */
+	case 435: {
+		struct _umtx_unlock_args *p = params;
+		uarg[0] = (intptr_t) p->umtx; /* struct umtx * */
+		*n_args = 1;
+		break;
+	}
+	/* jail_attach */
+	case 436: {
+		struct jail_attach_args *p = params;
+		iarg[0] = p->jid; /* int */
+		*n_args = 1;
+		break;
+	}
+	/* extattr_list_fd */
+	case 437: {
+		struct extattr_list_fd_args *p = params;
+		iarg[0] = p->fd; /* int */
+		iarg[1] = p->attrnamespace; /* int */
+		uarg[2] = (intptr_t) p->data; /* void * */
+		uarg[3] = p->nbytes; /* size_t */
+		*n_args = 4;
+		break;
+	}
+	/* extattr_list_file */
+	case 438: {
+		struct extattr_list_file_args *p = params;
+		uarg[0] = (intptr_t) p->path; /* const char * */
+		iarg[1] = p->attrnamespace; /* int */
+		uarg[2] = (intptr_t) p->data; /* void * */
+		uarg[3] = p->nbytes; /* size_t */
+		*n_args = 4;
+		break;
+	}
+	/* extattr_list_link */
+	case 439: {
+		struct extattr_list_link_args *p = params;
+		uarg[0] = (intptr_t) p->path; /* const char * */
+		iarg[1] = p->attrnamespace; /* int */
+		uarg[2] = (intptr_t) p->data; /* void * */
+		uarg[3] = p->nbytes; /* size_t */
+		*n_args = 4;
+		break;
+	}
+	/* kse_switchin */
+	case 440: {
+		struct kse_switchin_args *p = params;
+		uarg[0] = (intptr_t) p->tmbx; /* struct kse_thr_mailbox * */
+		iarg[1] = p->flags; /* int */
+		*n_args = 2;
+		break;
+	}
+	/* ksem_timedwait */
+	case 441: {
+		struct ksem_timedwait_args *p = params;
+		iarg[0] = p->id; /* semid_t */
+		uarg[1] = (intptr_t) p->abstime; /* const struct timespec * */
+		*n_args = 2;
+		break;
+	}
+	/* thr_suspend */
+	case 442: {
+		struct thr_suspend_args *p = params;
+		uarg[0] = (intptr_t) p->timeout; /* const struct timespec * */
+		*n_args = 1;
+		break;
+	}
+	/* thr_wake */
+	case 443: {
+		struct thr_wake_args *p = params;
+		iarg[0] = p->id; /* long */
+		*n_args = 1;
+		break;
+	}
+	/* kldunloadf */
+	case 444: {
+		struct kldunloadf_args *p = params;
+		iarg[0] = p->fileid; /* int */
+		iarg[1] = p->flags; /* int */
+		*n_args = 2;
+		break;
+	}
+	/* audit */
+	case 445: {
+		struct audit_args *p = params;
+		uarg[0] = (intptr_t) p->record; /* const void * */
+		uarg[1] = p->length; /* u_int */
+		*n_args = 2;
+		break;
+	}
+	/* auditon */
+	case 446: {
+		struct auditon_args *p = params;
+		iarg[0] = p->cmd; /* int */
+		uarg[1] = (intptr_t) p->data; /* void * */
+		uarg[2] = p->length; /* u_int */
+		*n_args = 3;
+		break;
+	}
+	/* getauid */
+	case 447: {
+		struct getauid_args *p = params;
+		uarg[0] = (intptr_t) p->auid; /* uid_t * */
+		*n_args = 1;
+		break;
+	}
+	/* setauid */
+	case 448: {
+		struct setauid_args *p = params;
+		uarg[0] = (intptr_t) p->auid; /* uid_t * */
+		*n_args = 1;
+		break;
+	}
+	/* getaudit */
+	case 449: {
+		struct getaudit_args *p = params;
+		uarg[0] = (intptr_t) p->auditinfo; /* struct auditinfo * */
+		*n_args = 1;
+		break;
+	}
+	/* setaudit */
+	case 450: {
+		struct setaudit_args *p = params;
+		uarg[0] = (intptr_t) p->auditinfo; /* struct auditinfo * */
+		*n_args = 1;
+		break;
+	}
+	/* getaudit_addr */
+	case 451: {
+		struct getaudit_addr_args *p = params;
+		uarg[0] = (intptr_t) p->auditinfo_addr; /* struct auditinfo_addr * */
+		uarg[1] = p->length; /* u_int */
+		*n_args = 2;
+		break;
+	}
+	/* setaudit_addr */
+	case 452: {
+		struct setaudit_addr_args *p = params;
+		uarg[0] = (intptr_t) p->auditinfo_addr; /* struct auditinfo_addr * */
+		uarg[1] = p->length; /* u_int */
+		*n_args = 2;
+		break;
+	}
+	/* auditctl */
+	case 453: {
+		struct auditctl_args *p = params;
+		uarg[0] = (intptr_t) p->path; /* char * */
+		*n_args = 1;
+		break;
+	}
+	/* _umtx_op */
+	case 454: {
+		struct _umtx_op_args *p = params;
+		uarg[0] = (intptr_t) p->obj; /* void * */
+		iarg[1] = p->op; /* int */
+		uarg[2] = p->val; /* u_long */
+		uarg[3] = (intptr_t) p->uaddr1; /* void * */
+		uarg[4] = (intptr_t) p->uaddr2; /* void * */
+		*n_args = 5;
+		break;
+	}
+	/* thr_new */
+	case 455: {
+		struct thr_new_args *p = params;
+		uarg[0] = (intptr_t) p->param; /* struct thr_param * */
+		iarg[1] = p->param_size; /* int */
+		*n_args = 2;
+		break;
+	}
+	/* sigqueue */
+	case 456: {
+		struct sigqueue_args *p = params;
+		iarg[0] = p->pid; /* pid_t */
+		iarg[1] = p->signum; /* int */
+		uarg[2] = (intptr_t) p->value; /* void * */
+		*n_args = 3;
+		break;
+	}
+	/* kmq_open */
+	case 457: {
+		struct kmq_open_args *p = params;
+		uarg[0] = (intptr_t) p->path; /* const char * */
+		iarg[1] = p->flags; /* int */
+		iarg[2] = p->mode; /* mode_t */
+		uarg[3] = (intptr_t) p->attr; /* const struct mq_attr * */
+		*n_args = 4;
+		break;
+	}
+	/* kmq_setattr */
+	case 458: {
+		struct kmq_setattr_args *p = params;
+		iarg[0] = p->mqd; /* int */
+		uarg[1] = (intptr_t) p->attr; /* const struct mq_attr * */
+		uarg[2] = (intptr_t) p->oattr; /* struct mq_attr * */
+		*n_args = 3;
+		break;
+	}
+	/* kmq_timedreceive */
+	case 459: {
+		struct kmq_timedreceive_args *p = params;
+		iarg[0] = p->mqd; /* int */
+		uarg[1] = (intptr_t) p->msg_ptr; /* char * */
+		uarg[2] = p->msg_len; /* size_t */
+		uarg[3] = (intptr_t) p->msg_prio; /* unsigned * */
+		uarg[4] = (intptr_t) p->abs_timeout; /* const struct timespec * */
+		*n_args = 5;
+		break;
+	}
+	/* kmq_timedsend */
+	case 460: {
+		struct kmq_timedsend_args *p = params;
+		iarg[0] = p->mqd; /* int */
+		uarg[1] = (intptr_t) p->msg_ptr; /* const char * */
+		uarg[2] = p->msg_len; /* size_t */
+		uarg[3] = p->msg_prio; /* unsigned */
+		uarg[4] = (intptr_t) p->abs_timeout; /* const struct timespec * */
+		*n_args = 5;
+		break;
+	}
+	/* kmq_notify */
+	case 461: {
+		struct kmq_notify_args *p = params;
+		iarg[0] = p->mqd; /* int */
+		uarg[1] = (intptr_t) p->sigev; /* const struct sigevent * */
+		*n_args = 2;
+		break;
+	}
+	/* kmq_unlink */
+	case 462: {
+		struct kmq_unlink_args *p = params;
+		uarg[0] = (intptr_t) p->path; /* const char * */
+		*n_args = 1;
+		break;
+	}
+	/* abort2 */
+	case 463: {
+		struct abort2_args *p = params;
+		uarg[0] = (intptr_t) p->why; /* const char * */
+		iarg[1] = p->nargs; /* int */
+		uarg[2] = (intptr_t) p->args; /* void ** */
+		*n_args = 3;
+		break;
+	}
+	/* thr_set_name */
+	case 464: {
+		struct thr_set_name_args *p = params;
+		iarg[0] = p->id; /* long */
+		uarg[1] = (intptr_t) p->name; /* const char * */
+		*n_args = 2;
+		break;
+	}
+	/* aio_fsync */
+	case 465: {
+		struct aio_fsync_args *p = params;
+		iarg[0] = p->op; /* int */
+		uarg[1] = (intptr_t) p->aiocbp; /* struct aiocb * */
+		*n_args = 2;
+		break;
+	}
+	/* rtprio_thread */
+	case 466: {
+		struct rtprio_thread_args *p = params;
+		iarg[0] = p->function; /* int */
+		iarg[1] = p->lwpid; /* lwpid_t */
+		uarg[2] = (intptr_t) p->rtp; /* struct rtprio * */
+		*n_args = 3;
+		break;
+	}
+	/* sctp_peeloff */
+	case 471: {
+		struct sctp_peeloff_args *p = params;
+		iarg[0] = p->sd; /* int */
+		uarg[1] = p->name; /* uint32_t */
+		*n_args = 2;
+		break;
+	}
+	/* sctp_generic_sendmsg */
+	case 472: {
+		struct sctp_generic_sendmsg_args *p = params;
+		iarg[0] = p->sd; /* int */
+		uarg[1] = (intptr_t) p->msg; /* caddr_t */
+		iarg[2] = p->mlen; /* int */
+		uarg[3] = (intptr_t) p->to; /* caddr_t */
+		iarg[4] = p->tolen; /* __socklen_t */
+		uarg[5] = (intptr_t) p->sinfo; /* struct sctp_sndrcvinfo * */
+		iarg[6] = p->flags; /* int */
+		*n_args = 7;
+		break;
+	}
+	/* sctp_generic_sendmsg_iov */
+	case 473: {
+		struct sctp_generic_sendmsg_iov_args *p = params;
+		iarg[0] = p->sd; /* int */
+		uarg[1] = (intptr_t) p->iov; /* struct iovec * */
+		iarg[2] = p->iovlen; /* int */
+		uarg[3] = (intptr_t) p->to; /* caddr_t */
+		iarg[4] = p->tolen; /* __socklen_t */
+		uarg[5] = (intptr_t) p->sinfo; /* struct sctp_sndrcvinfo * */
+		iarg[6] = p->flags; /* int */
+		*n_args = 7;
+		break;
+	}
+	/* sctp_generic_recvmsg */
+	case 474: {
+		struct sctp_generic_recvmsg_args *p = params;
+		iarg[0] = p->sd; /* int */
+		uarg[1] = (intptr_t) p->iov; /* struct iovec * */
+		iarg[2] = p->iovlen; /* int */
+		uarg[3] = (intptr_t) p->from; /* struct sockaddr * */
+		uarg[4] = (intptr_t) p->fromlenaddr; /* __socklen_t * */
+		uarg[5] = (intptr_t) p->sinfo; /* struct sctp_sndrcvinfo * */
+		uarg[6] = (intptr_t) p->msg_flags; /* int * */
+		*n_args = 7;
+		break;
+	}
+	/* pread */
+	case 475: {
+		struct pread_args *p = params;
+		iarg[0] = p->fd; /* int */
+		uarg[1] = (intptr_t) p->buf; /* void * */
+		uarg[2] = p->nbyte; /* size_t */
+		iarg[3] = p->offset; /* off_t */
+		*n_args = 4;
+		break;
+	}
+	/* pwrite */
+	case 476: {
+		struct pwrite_args *p = params;
+		iarg[0] = p->fd; /* int */
+		uarg[1] = (intptr_t) p->buf; /* const void * */
+		uarg[2] = p->nbyte; /* size_t */
+		iarg[3] = p->offset; /* off_t */
+		*n_args = 4;
+		break;
+	}
+	/* mmap */
+	case 477: {
+		struct mmap_args *p = params;
+		uarg[0] = (intptr_t) p->addr; /* caddr_t */
+		uarg[1] = p->len; /* size_t */
+		iarg[2] = p->prot; /* int */
+		iarg[3] = p->flags; /* int */
+		iarg[4] = p->fd; /* int */
+		iarg[5] = p->pos; /* off_t */
+		*n_args = 6;
+		break;
+	}
+	/* lseek */
+	case 478: {
+		struct lseek_args *p = params;
+		iarg[0] = p->fd; /* int */
+		iarg[1] = p->offset; /* off_t */
+		iarg[2] = p->whence; /* int */
+		*n_args = 3;
+		break;
+	}
+	/* truncate */
+	case 479: {
+		struct truncate_args *p = params;
+		uarg[0] = (intptr_t) p->path; /* char * */
+		iarg[1] = p->length; /* off_t */
+		*n_args = 2;
+		break;
+	}
+	/* ftruncate */
+	case 480: {
+		struct ftruncate_args *p = params;
+		iarg[0] = p->fd; /* int */
+		iarg[1] = p->length; /* off_t */
+		*n_args = 2;
+		break;
+	}
+	/* thr_kill2 */
+	case 481: {
+		struct thr_kill2_args *p = params;
+		iarg[0] = p->pid; /* pid_t */
+		iarg[1] = p->id; /* long */
+		iarg[2] = p->sig; /* int */
+		*n_args = 3;
+		break;
+	}
+	default:
+		*n_args = 0;
+		break;
+	};
+}
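
The switch above flattens each syscall's arguments into the iarg[]/uarg[]
slots so that tracing code can treat every syscall uniformly.  As a rough
sketch only (the exact systrace_args() prototype and the dump helper below
are assumptions for illustration, not part of this patch), a consumer might
walk the decoded slots like this:

	static void
	dump_syscall_args(int sysnum, void *params)
	{
		u_int64_t uarg[8];
		int i, n_args;

		/* Decode the raw argument block for this syscall number. */
		systrace_args(sysnum, params, uarg, &n_args);
		for (i = 0; i < n_args; i++)
			printf("arg[%d] = %#jx\n", i, (uintmax_t)uarg[i]);
	}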
--- /dev/null
+++ sys/kern/subr_acl_posix1e.c
@@ -0,0 +1,637 @@
+/*-
+ * Copyright (c) 1999-2006 Robert N. M. Watson
+ * All rights reserved.
+ *
+ * This software was developed by Robert Watson for the TrustedBSD Project.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ * Developed by the TrustedBSD Project.
+ *
+ * ACL support routines specific to POSIX.1e access control lists.  These are
+ * utility routines for code common across file systems implementing POSIX.1e
+ * ACLs.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: src/sys/kern/subr_acl_posix1e.c,v 1.52 2007/06/12 00:11:59 rwatson Exp $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/mount.h>
+#include <sys/priv.h>
+#include <sys/vnode.h>
+#include <sys/errno.h>
+#include <sys/stat.h>
+#include <sys/acl.h>
+
+/*
+ * Implement a version of vaccess() that understands POSIX.1e ACL semantics;
+ * the access ACL has already been prepared for evaluation by the file system
+ * and is passed via 'uid', 'gid', and 'acl'.  Return 0 on success, else an
+ * errno value.
+ */
+int
+vaccess_acl_posix1e(enum vtype type, uid_t file_uid, gid_t file_gid,
+    struct acl *acl, mode_t acc_mode, struct ucred *cred, int *privused)
+{
+	struct acl_entry *acl_other, *acl_mask;
+	mode_t dac_granted;
+	mode_t priv_granted;
+	mode_t acl_mask_granted;
+	int group_matched, i;
+
+	/*
+	 * Look for a normal, non-privileged way to access the file/directory
+	 * as requested.  If it exists, go with that.  Otherwise, attempt to
+	 * use privileges granted via priv_granted.  In some cases, which
+	 * privileges to use may be ambiguous due to "best match", in which
+	 * case fall back on first match for the time being.
+	 */
+	if (privused != NULL)
+		*privused = 0;
+
+	/*
+	 * Determine privileges now, but don't apply until we've found a DAC
+	 * entry that matches but has failed to allow access.
+	 *
+	 * XXXRW: Ideally, we'd determine the privileges required before
+	 * asking for them.
+	 */
+	priv_granted = 0;
+
+	if (type == VDIR) {
+		if ((acc_mode & VEXEC) && !priv_check_cred(cred,
+		     PRIV_VFS_LOOKUP, 0))
+			priv_granted |= VEXEC;
+	} else {
+		if ((acc_mode & VEXEC) && !priv_check_cred(cred,
+		    PRIV_VFS_EXEC, 0))
+			priv_granted |= VEXEC;
+	}
+
+	if ((acc_mode & VREAD) && !priv_check_cred(cred, PRIV_VFS_READ, 0))
+		priv_granted |= VREAD;
+
+	if (((acc_mode & VWRITE) || (acc_mode & VAPPEND)) &&
+	    !priv_check_cred(cred, PRIV_VFS_WRITE, 0))
+		priv_granted |= (VWRITE | VAPPEND);
+
+	if ((acc_mode & VADMIN) && !priv_check_cred(cred, PRIV_VFS_ADMIN, 0))
+		priv_granted |= VADMIN;
+
+	/*
+	 * The owner matches if the effective uid associated with the
+	 * credential matches that of the ACL_USER_OBJ entry.  While we're
+	 * doing the first scan, also cache the location of the ACL_MASK and
+	 * ACL_OTHER entries, preventing some future iterations.
+	 */
+	acl_mask = acl_other = NULL;
+	for (i = 0; i < acl->acl_cnt; i++) {
+		switch (acl->acl_entry[i].ae_tag) {
+		case ACL_USER_OBJ:
+			if (file_uid != cred->cr_uid)
+				break;
+			dac_granted = 0;
+			dac_granted |= VADMIN;
+			if (acl->acl_entry[i].ae_perm & ACL_EXECUTE)
+				dac_granted |= VEXEC;
+			if (acl->acl_entry[i].ae_perm & ACL_READ)
+				dac_granted |= VREAD;
+			if (acl->acl_entry[i].ae_perm & ACL_WRITE)
+				dac_granted |= (VWRITE | VAPPEND);
+			if ((acc_mode & dac_granted) == acc_mode)
+				return (0);
+
+			/*
+			 * XXXRW: Do privilege lookup here.
+			 */
+			if ((acc_mode & (dac_granted | priv_granted)) ==
+			    acc_mode) {
+				if (privused != NULL)
+					*privused = 1;
+				return (0);
+			}
+			goto error;
+
+		case ACL_MASK:
+			acl_mask = &acl->acl_entry[i];
+			break;
+
+		case ACL_OTHER:
+			acl_other = &acl->acl_entry[i];
+			break;
+
+		default:
+			break;
+		}
+	}
+
+	/*
+	 * An ACL_OTHER entry should always exist in a valid access ACL.  If
+	 * it doesn't, then generate a serious failure.  For now, this means
+	 * a debugging message and EPERM, but in the future should probably
+	 * be a panic.
+	 */
+	if (acl_other == NULL) {
+		/*
+		 * XXX This should never happen
+		 */
+		printf("vaccess_acl_posix1e: ACL_OTHER missing\n");
+		return (EPERM);
+	}
+
+	/*
+	 * Checks against ACL_USER, ACL_GROUP_OBJ, and ACL_GROUP fields are
+	 * masked by an ACL_MASK entry, if any.  As such, first identify the
+	 * ACL_MASK field, then iterate through identifying potential user
+	 * matches, then group matches.  If there is no ACL_MASK, assume that
+	 * the mask allows all requests to succeed.
+	 */
+	if (acl_mask != NULL) {
+		acl_mask_granted = 0;
+		if (acl_mask->ae_perm & ACL_EXECUTE)
+			acl_mask_granted |= VEXEC;
+		if (acl_mask->ae_perm & ACL_READ)
+			acl_mask_granted |= VREAD;
+		if (acl_mask->ae_perm & ACL_WRITE)
+			acl_mask_granted |= (VWRITE | VAPPEND);
+	} else
+		acl_mask_granted = VEXEC | VREAD | VWRITE | VAPPEND;
+
+	/*
+	 * Check ACL_USER ACL entries.  There will either be one or no
+	 * matches; if there is one, we accept or reject based on the
+	 * match; otherwise, we continue on to groups.
+	 */
+	for (i = 0; i < acl->acl_cnt; i++) {
+		switch (acl->acl_entry[i].ae_tag) {
+		case ACL_USER:
+			if (acl->acl_entry[i].ae_id != cred->cr_uid)
+				break;
+			dac_granted = 0;
+			if (acl->acl_entry[i].ae_perm & ACL_EXECUTE)
+				dac_granted |= VEXEC;
+			if (acl->acl_entry[i].ae_perm & ACL_READ)
+				dac_granted |= VREAD;
+			if (acl->acl_entry[i].ae_perm & ACL_WRITE)
+				dac_granted |= (VWRITE | VAPPEND);
+			dac_granted &= acl_mask_granted;
+			if ((acc_mode & dac_granted) == acc_mode)
+				return (0);
+			/*
+			 * XXXRW: Do privilege lookup here.
+			 */
+			if ((acc_mode & (dac_granted | priv_granted)) !=
+			    acc_mode)
+				goto error;
+
+			if (privused != NULL)
+				*privused = 1;
+			return (0);
+		}
+	}
+
+	/*
+	 * Group match is best-match, not first-match, so find a "best"
+	 * match.  Iterate across, testing each potential group match.  Make
+	 * sure we keep track of whether we found a match or not, so that we
+	 * know if we should try again with any available privilege, or if we
+	 * should move on to ACL_OTHER.
+	 */
+	group_matched = 0;
+	for (i = 0; i < acl->acl_cnt; i++) {
+		switch (acl->acl_entry[i].ae_tag) {
+		case ACL_GROUP_OBJ:
+			if (!groupmember(file_gid, cred))
+				break;
+			dac_granted = 0;
+			if (acl->acl_entry[i].ae_perm & ACL_EXECUTE)
+				dac_granted |= VEXEC;
+			if (acl->acl_entry[i].ae_perm & ACL_READ)
+				dac_granted |= VREAD;
+			if (acl->acl_entry[i].ae_perm & ACL_WRITE)
+				dac_granted |= (VWRITE | VAPPEND);
+			dac_granted &= acl_mask_granted;
+
+			if ((acc_mode & dac_granted) == acc_mode)
+				return (0);
+
+			group_matched = 1;
+			break;
+
+		case ACL_GROUP:
+			if (!groupmember(acl->acl_entry[i].ae_id, cred))
+				break;
+			dac_granted = 0;
+			if (acl->acl_entry[i].ae_perm & ACL_EXECUTE)
+				dac_granted |= VEXEC;
+			if (acl->acl_entry[i].ae_perm & ACL_READ)
+				dac_granted |= VREAD;
+			if (acl->acl_entry[i].ae_perm & ACL_WRITE)
+				dac_granted |= (VWRITE | VAPPEND);
+			dac_granted &= acl_mask_granted;
+
+			if ((acc_mode & dac_granted) == acc_mode)
+				return (0);
+
+			group_matched = 1;
+			break;
+
+		default:
+			break;
+		}
+	}
+
+	if (group_matched == 1) {
+		/*
+		 * There was a match, but it did not grant rights via pure
+		 * DAC.  Try again, this time with privilege.
+		 */
+		for (i = 0; i < acl->acl_cnt; i++) {
+			switch (acl->acl_entry[i].ae_tag) {
+			case ACL_GROUP_OBJ:
+				if (!groupmember(file_gid, cred))
+					break;
+				dac_granted = 0;
+				if (acl->acl_entry[i].ae_perm & ACL_EXECUTE)
+					dac_granted |= VEXEC;
+				if (acl->acl_entry[i].ae_perm & ACL_READ)
+					dac_granted |= VREAD;
+				if (acl->acl_entry[i].ae_perm & ACL_WRITE)
+					dac_granted |= (VWRITE | VAPPEND);
+				dac_granted &= acl_mask_granted;
+
+				/*
+				 * XXXRW: Do privilege lookup here.
+				 */
+				if ((acc_mode & (dac_granted | priv_granted))
+				    != acc_mode)
+					break;
+
+				if (privused != NULL)
+					*privused = 1;
+				return (0);
+
+			case ACL_GROUP:
+				if (!groupmember(acl->acl_entry[i].ae_id,
+				    cred))
+					break;
+				dac_granted = 0;
+				if (acl->acl_entry[i].ae_perm & ACL_EXECUTE)
+					dac_granted |= VEXEC;
+				if (acl->acl_entry[i].ae_perm & ACL_READ)
+					dac_granted |= VREAD;
+				if (acl->acl_entry[i].ae_perm & ACL_WRITE)
+					dac_granted |= (VWRITE | VAPPEND);
+				dac_granted &= acl_mask_granted;
+
+				/*
+				 * XXXRW: Do privilege lookup here.
+				 */
+				if ((acc_mode & (dac_granted | priv_granted))
+				    != acc_mode)
+					break;
+
+				if (privused != NULL)
+					*privused = 1;
+				return (0);
+
+			default:
+				break;
+			}
+		}
+		/*
+		 * Even with privilege, group membership was not sufficient.
+		 * Return failure.
+		 */
+		goto error;
+	}
+		
+	/*
+	 * Fall back on ACL_OTHER.  ACL_MASK is not applied to ACL_OTHER.
+	 */
+	dac_granted = 0;
+	if (acl_other->ae_perm & ACL_EXECUTE)
+		dac_granted |= VEXEC;
+	if (acl_other->ae_perm & ACL_READ)
+		dac_granted |= VREAD;
+	if (acl_other->ae_perm & ACL_WRITE)
+		dac_granted |= (VWRITE | VAPPEND);
+
+	if ((acc_mode & dac_granted) == acc_mode)
+		return (0);
+	/*
+	 * XXXRW: Do privilege lookup here.
+	 */
+	if ((acc_mode & (dac_granted | priv_granted)) == acc_mode) {
+		if (privused != NULL)
+			*privused = 1;
+		return (0);
+	}
+
+error:
+	return ((acc_mode & VADMIN) ? EPERM : EACCES);
+}
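
For context, a file system that maintains POSIX.1e access ACLs would call
this routine from its access check once it has fetched the ACL for the
vnode.  The fragment below is illustrative only; the inode fields and the
'acl' pointer are hypothetical and not taken from this patch:

	/* Hypothetical VOP_ACCESS-style caller; 'acl' was read from disk. */
	error = vaccess_acl_posix1e(vp->v_type, ip->i_uid, ip->i_gid,
	    acl, acc_mode, cred, NULL);
	if (error != 0)
		return (error);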
+
+/*
+ * For the purposes of filesystems maintaining the _OBJ entries in an inode
+ * with a mode_t field, this routine converts a mode_t entry to an
+ * acl_perm_t.
+ */
+acl_perm_t
+acl_posix1e_mode_to_perm(acl_tag_t tag, mode_t mode)
+{
+	acl_perm_t	perm = 0;
+
+	switch(tag) {
+	case ACL_USER_OBJ:
+		if (mode & S_IXUSR)
+			perm |= ACL_EXECUTE;
+		if (mode & S_IRUSR)
+			perm |= ACL_READ;
+		if (mode & S_IWUSR)
+			perm |= ACL_WRITE;
+		return (perm);
+
+	case ACL_GROUP_OBJ:
+		if (mode & S_IXGRP)
+			perm |= ACL_EXECUTE;
+		if (mode & S_IRGRP)
+			perm |= ACL_READ;
+		if (mode & S_IWGRP)
+			perm |= ACL_WRITE;
+		return (perm);
+
+	case ACL_OTHER:
+		if (mode & S_IXOTH)
+			perm |= ACL_EXECUTE;
+		if (mode & S_IROTH)
+			perm |= ACL_READ;
+		if (mode & S_IWOTH)
+			perm |= ACL_WRITE;
+		return (perm);
+
+	default:
+		printf("acl_posix1e_mode_to_perm: invalid tag (%d)\n", tag);
+		return (0);
+	}
+}
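
As a quick illustration (not part of the patch), splitting a 0750 mode into
the three base entries gives:

	acl_perm_t up = acl_posix1e_mode_to_perm(ACL_USER_OBJ, 0750);	/* rwx */
	acl_perm_t gp = acl_posix1e_mode_to_perm(ACL_GROUP_OBJ, 0750);	/* r-x */
	acl_perm_t op = acl_posix1e_mode_to_perm(ACL_OTHER, 0750);	/* --- */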
+
+/*
+ * Given inode information (uid, gid, mode), return an acl entry of the
+ * appropriate type.
+ */
+struct acl_entry
+acl_posix1e_mode_to_entry(acl_tag_t tag, uid_t uid, gid_t gid, mode_t mode)
+{
+	struct acl_entry	acl_entry;
+
+	acl_entry.ae_tag = tag;
+	acl_entry.ae_perm = acl_posix1e_mode_to_perm(tag, mode);
+	switch(tag) {
+	case ACL_USER_OBJ:
+		acl_entry.ae_id = uid;
+		break;
+
+	case ACL_GROUP_OBJ:
+		acl_entry.ae_id = gid;
+		break;
+
+	case ACL_OTHER:
+		acl_entry.ae_id = ACL_UNDEFINED_ID;
+		break;
+
+	default:
+		acl_entry.ae_id = ACL_UNDEFINED_ID;
+		printf("acl_posix1e_mode_to_entry: invalid tag (%d)\n", tag);
+	}
+
+	return (acl_entry);
+}
+
+/*
+ * Utility function to generate a file mode given appropriate ACL entries.
+ */
+mode_t
+acl_posix1e_perms_to_mode(struct acl_entry *acl_user_obj_entry,
+    struct acl_entry *acl_group_obj_entry, struct acl_entry *acl_other_entry)
+{
+	mode_t	mode;
+
+	mode = 0;
+	if (acl_user_obj_entry->ae_perm & ACL_EXECUTE)
+		mode |= S_IXUSR;
+	if (acl_user_obj_entry->ae_perm & ACL_READ)
+		mode |= S_IRUSR;
+	if (acl_user_obj_entry->ae_perm & ACL_WRITE)
+		mode |= S_IWUSR;
+	if (acl_group_obj_entry->ae_perm & ACL_EXECUTE)
+		mode |= S_IXGRP;
+	if (acl_group_obj_entry->ae_perm & ACL_READ)
+		mode |= S_IRGRP;
+	if (acl_group_obj_entry->ae_perm & ACL_WRITE)
+		mode |= S_IWGRP;
+	if (acl_other_entry->ae_perm & ACL_EXECUTE)
+		mode |= S_IXOTH;
+	if (acl_other_entry->ae_perm & ACL_READ)
+		mode |= S_IROTH;
+	if (acl_other_entry->ae_perm & ACL_WRITE)
+		mode |= S_IWOTH;
+
+	return (mode);
+}
+
+/*
+ * Utility function to generate a file mode given a complete POSIX.1e access
+ * ACL.  Note that if the ACL is improperly formed, this may result in a
+ * panic.
+ */
+mode_t
+acl_posix1e_acl_to_mode(struct acl *acl)
+{
+	struct acl_entry *acl_mask, *acl_user_obj, *acl_group_obj, *acl_other;
+	int i;
+
+	/*
+	 * Find the ACL entries relevant to a POSIX permission mode.
+	 */
+	acl_user_obj = acl_group_obj = acl_other = acl_mask = NULL;
+	for (i = 0; i < acl->acl_cnt; i++) {
+		switch (acl->acl_entry[i].ae_tag) {
+		case ACL_USER_OBJ:
+			acl_user_obj = &acl->acl_entry[i];
+			break;
+
+		case ACL_GROUP_OBJ:
+			acl_group_obj = &acl->acl_entry[i];
+			break;
+
+		case ACL_OTHER:
+			acl_other = &acl->acl_entry[i];
+			break;
+
+		case ACL_MASK:
+			acl_mask = &acl->acl_entry[i];
+			break;
+
+		case ACL_USER:
+		case ACL_GROUP:
+			break;
+
+		default:
+			panic("acl_posix1e_acl_to_mode: bad ae_tag");
+		}
+	}
+
+	if (acl_user_obj == NULL || acl_group_obj == NULL || acl_other == NULL)
+		panic("acl_posix1e_acl_to_mode: missing base ae_tags");
+
+	/*
+	 * POSIX.1e specifies that if there is an ACL_MASK entry, we replace
+	 * the mode "group" bits with its permissions.  If there isn't, we
+	 * use the ACL_GROUP_OBJ permissions.
+	 */
+	if (acl_mask != NULL)
+		return (acl_posix1e_perms_to_mode(acl_user_obj, acl_mask,
+		    acl_other));
+	else
+		return (acl_posix1e_perms_to_mode(acl_user_obj, acl_group_obj,
+		    acl_other));
+}
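
To make the ACL_MASK substitution concrete (illustration only, not code from
the patch): with ACL_USER_OBJ = rw-, ACL_GROUP_OBJ = rwx, ACL_MASK = r-- and
ACL_OTHER = ---, the group bits of the result come from the mask, so
acl_posix1e_acl_to_mode() returns 0640 rather than 0670.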
+
+/*
+ * Perform a syntactic check of the ACL, sufficient to allow an implementing
+ * filesystem to determine if it should accept this and rely on the POSIX.1e
+ * ACL properties.
+ */
+int
+acl_posix1e_check(struct acl *acl)
+{
+	int num_acl_user_obj, num_acl_user, num_acl_group_obj, num_acl_group;
+	int num_acl_mask, num_acl_other, i;
+
+	/*
+	 * Verify that the number of entries does not exceed the maximum
+	 * defined for acl_t.
+	 *
+	 * Verify that the correct number of various sorts of ae_tags are
+	 * present:
+	 *   Exactly one ACL_USER_OBJ
+	 *   Exactly one ACL_GROUP_OBJ
+	 *   Exactly one ACL_OTHER
+	 *   If any ACL_USER or ACL_GROUP entries appear, then exactly one
+	 *   ACL_MASK entry must also appear.
+	 *
+	 * Verify that all ae_perm entries are in ACL_PERM_BITS.
+	 *
+	 * Verify all ae_tag entries are understood by this implementation.
+	 *
+	 * Note: Does not check for uniqueness of qualifier (ae_id) field.
+	 */
+	num_acl_user_obj = num_acl_user = num_acl_group_obj = num_acl_group =
+	    num_acl_mask = num_acl_other = 0;
+	if (acl->acl_cnt > ACL_MAX_ENTRIES || acl->acl_cnt < 0)
+		return (EINVAL);
+	for (i = 0; i < acl->acl_cnt; i++) {
+		/*
+		 * Check for a valid tag.
+		 */
+		switch(acl->acl_entry[i].ae_tag) {
+		case ACL_USER_OBJ:
+			acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */
+			if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID)
+				return (EINVAL);
+			num_acl_user_obj++;
+			break;
+		case ACL_GROUP_OBJ:
+			acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */
+			if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID)
+				return (EINVAL);
+			num_acl_group_obj++;
+			break;
+		case ACL_USER:
+			if (acl->acl_entry[i].ae_id == ACL_UNDEFINED_ID)
+				return (EINVAL);
+			num_acl_user++;
+			break;
+		case ACL_GROUP:
+			if (acl->acl_entry[i].ae_id == ACL_UNDEFINED_ID)
+				return (EINVAL);
+			num_acl_group++;
+			break;
+		case ACL_OTHER:
+			acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */
+			if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID)
+				return (EINVAL);
+			num_acl_other++;
+			break;
+		case ACL_MASK:
+			acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */
+			if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID)
+				return (EINVAL);
+			num_acl_mask++;
+			break;
+		default:
+			return (EINVAL);
+		}
+		/*
+		 * Check for valid perm entries.
+		 */
+		if ((acl->acl_entry[i].ae_perm | ACL_PERM_BITS) !=
+		    ACL_PERM_BITS)
+			return (EINVAL);
+	}
+	if ((num_acl_user_obj != 1) || (num_acl_group_obj != 1) ||
+	    (num_acl_other != 1) || (num_acl_mask != 0 && num_acl_mask != 1))
+		return (EINVAL);
+	if (((num_acl_group != 0) || (num_acl_user != 0)) &&
+	    (num_acl_mask != 1))
+		return (EINVAL);
+	return (0);
+}
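
A minimal sketch of an ACL that satisfies these rules (assuming local uid,
gid and mode variables; illustration only, not part of the patch):

	struct acl acl;

	/* The three mandatory base entries and no mask. */
	acl.acl_cnt = 3;
	acl.acl_entry[0] = acl_posix1e_mode_to_entry(ACL_USER_OBJ, uid, gid, mode);
	acl.acl_entry[1] = acl_posix1e_mode_to_entry(ACL_GROUP_OBJ, uid, gid, mode);
	acl.acl_entry[2] = acl_posix1e_mode_to_entry(ACL_OTHER, uid, gid, mode);
	KASSERT(acl_posix1e_check(&acl) == 0, ("base ACL rejected"));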
+
+/*
+ * Given a requested mode for a new object, and a default ACL, combine the
+ * two to produce a new mode.  Be careful not to clear any bits that aren't
+ * intended to be affected by the POSIX.1e ACL.  Eventually, this might also
+ * take the cmask as an argument, if we push that down into
+ * per-filesystem-code.
+ */
+mode_t
+acl_posix1e_newfilemode(mode_t cmode, struct acl *dacl)
+{
+	mode_t mode;
+
+	mode = cmode;
+	/*
+	 * The current composition policy is that a permission bit must be
+	 * set in *both* the ACL and the requested creation mode for it to
+	 * appear in the resulting mode/ACL.  First clear any possibly
+	 * affected bits, then reconstruct.
+	 */
+	mode &= ACL_PRESERVE_MASK;
+	mode |= (ACL_OVERRIDE_MASK & cmode & acl_posix1e_acl_to_mode(dacl));
+
+	return (mode);
+}
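
Concretely (illustration only): a requested creation mode of 0666 combined
with a default ACL whose base entries compute to 0750 yields 0666 & 0750 =
0640 in the permission bits, with any bits covered by ACL_PRESERVE_MASK
carried over from the requested mode unchanged.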
Index: kern_sx.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_sx.c,v
retrieving revision 1.1.1.2
retrieving revision 1.2
diff -L sys/kern/kern_sx.c -L sys/kern/kern_sx.c -u -r1.1.1.2 -r1.2
--- sys/kern/kern_sx.c
+++ sys/kern/kern_sx.c
@@ -1,12 +1,14 @@
 /*-
- * Copyright (C) 2001 Jason Evans <jasone at freebsd.org>.  All rights reserved.
+ * Copyright (c) 2007 Attilio Rao <attilio at freebsd.org>
+ * Copyright (c) 2001 Jason Evans <jasone at freebsd.org>
+ * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice(s), this list of conditions and the following disclaimer as
- *    the first lines of this file unmodified other than the possible 
+ *    the first lines of this file unmodified other than the possible
  *    addition of one or more copyright notices.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice(s), this list of conditions and the following disclaimer in the
@@ -26,40 +28,95 @@
  */
 
 /*
- * Shared/exclusive locks.  This implementation assures deterministic lock
- * granting behavior, so that slocks and xlocks are interleaved.
+ * Shared/exclusive locks.  This implementation attempts to ensure
+ * deterministic lock granting behavior, so that slocks and xlocks are
+ * interleaved.
  *
  * Priority propagation will not generally raise the priority of lock holders,
  * so should not be relied upon in combination with sx locks.
  */
 
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_sx.c,v 1.25.2.1 2005/12/20 19:28:23 jhb Exp $");
-
+#include "opt_adaptive_sx.h"
 #include "opt_ddb.h"
 
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: src/sys/kern/kern_sx.c,v 1.55 2007/10/02 14:48:48 pjd Exp $");
+
 #include <sys/param.h>
-#include <sys/systm.h>
 #include <sys/ktr.h>
-#include <sys/linker_set.h>
-#include <sys/condvar.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
+#include <sys/sleepqueue.h>
 #include <sys/sx.h>
+#include <sys/systm.h>
+
+#ifdef ADAPTIVE_SX
+#include <machine/cpu.h>
+#endif
 
+#ifdef DDB
 #include <ddb/ddb.h>
+#endif
+
+#if !defined(SMP) && defined(ADAPTIVE_SX)
+#error "You must have SMP to enable the ADAPTIVE_SX option"
+#endif
+
+CTASSERT(((SX_ADAPTIVESPIN | SX_RECURSE) & LO_CLASSFLAGS) ==
+    (SX_ADAPTIVESPIN | SX_RECURSE));
+
+/* Handy macros for sleep queues. */
+#define	SQ_EXCLUSIVE_QUEUE	0
+#define	SQ_SHARED_QUEUE		1
+
+/*
+ * Variations on DROP_GIANT()/PICKUP_GIANT() for use in this file.  We
+ * drop Giant anytime we have to sleep or if we adaptively spin.
+ */
+#define	GIANT_DECLARE							\
+	int _giantcnt = 0;						\
+	WITNESS_SAVE_DECL(Giant)					\
+
+#define	GIANT_SAVE() do {						\
+	if (mtx_owned(&Giant)) {					\
+		WITNESS_SAVE(&Giant.lock_object, Giant);		\
+		while (mtx_owned(&Giant)) {				\
+			_giantcnt++;					\
+			mtx_unlock(&Giant);				\
+		}							\
+	}								\
+} while (0)
+
+#define GIANT_RESTORE() do {						\
+	if (_giantcnt > 0) {						\
+		mtx_assert(&Giant, MA_NOTOWNED);			\
+		while (_giantcnt--)					\
+			mtx_lock(&Giant);				\
+		WITNESS_RESTORE(&Giant.lock_object, Giant);		\
+	}								\
+} while (0)
+
+/*
+ * Returns true if an exclusive lock is recursed.  It assumes
+ * curthread currently has an exclusive lock.
+ */
+#define	sx_recursed(sx)		((sx)->sx_recurse != 0)
 
 #ifdef DDB
 static void	db_show_sx(struct lock_object *lock);
 #endif
+static void	lock_sx(struct lock_object *lock, int how);
+static int	unlock_sx(struct lock_object *lock);
 
 struct lock_class lock_class_sx = {
-	"sx",
-	LC_SLEEPLOCK | LC_SLEEPABLE | LC_RECURSABLE | LC_UPGRADABLE,
+	.lc_name = "sx",
+	.lc_flags = LC_SLEEPLOCK | LC_SLEEPABLE | LC_RECURSABLE | LC_UPGRADABLE,
 #ifdef DDB
-	db_show_sx
+	.lc_ddb_show = db_show_sx,
 #endif
+	.lc_lock = lock_sx,
+	.lc_unlock = unlock_sx,
 };
 
 #ifndef INVARIANTS
@@ -67,6 +124,34 @@
 #endif
 
 void
+lock_sx(struct lock_object *lock, int how)
+{
+	struct sx *sx;
+
+	sx = (struct sx *)lock;
+	if (how)
+		sx_xlock(sx);
+	else
+		sx_slock(sx);
+}
+
+int
+unlock_sx(struct lock_object *lock)
+{
+	struct sx *sx;
+
+	sx = (struct sx *)lock;
+	sx_assert(sx, SA_LOCKED | SA_NOTRECURSED);
+	if (sx_xlocked(sx)) {
+		sx_xunlock(sx);
+		return (1);
+	} else {
+		sx_sunlock(sx);
+		return (0);
+	}
+}
+
+void
 sx_sysinit(void *arg)
 {
 	struct sx_args *sargs = arg;
@@ -75,250 +160,718 @@
 }
 
 void
-sx_init(struct sx *sx, const char *description)
+sx_init_flags(struct sx *sx, const char *description, int opts)
 {
-	struct lock_object *lock;
+	int flags;
 
-	lock = &sx->sx_object;
-	KASSERT((lock->lo_flags & LO_INITIALIZED) == 0,
-	    ("sx lock %s %p already initialized", description, sx));
-	bzero(sx, sizeof(*sx));
-	lock->lo_class = &lock_class_sx;
-	lock->lo_type = lock->lo_name = description;
-	lock->lo_flags = LO_WITNESS | LO_RECURSABLE | LO_SLEEPABLE |
-	    LO_UPGRADABLE;
-	sx->sx_lock = mtx_pool_find(mtxpool_lockbuilder, sx);
-	sx->sx_cnt = 0;
-	cv_init(&sx->sx_shrd_cv, description);
-	sx->sx_shrd_wcnt = 0;
-	cv_init(&sx->sx_excl_cv, description);
-	sx->sx_excl_wcnt = 0;
-	sx->sx_xholder = NULL;
+	MPASS((opts & ~(SX_QUIET | SX_RECURSE | SX_NOWITNESS | SX_DUPOK |
+	    SX_NOPROFILE | SX_ADAPTIVESPIN)) == 0);
 
-	LOCK_LOG_INIT(lock, 0);
-
-	WITNESS_INIT(lock);
+	flags = LO_RECURSABLE | LO_SLEEPABLE | LO_UPGRADABLE;
+	if (opts & SX_DUPOK)
+		flags |= LO_DUPOK;
+	if (opts & SX_NOPROFILE)
+		flags |= LO_NOPROFILE;
+	if (!(opts & SX_NOWITNESS))
+		flags |= LO_WITNESS;
+	if (opts & SX_QUIET)
+		flags |= LO_QUIET;
+
+	flags |= opts & (SX_ADAPTIVESPIN | SX_RECURSE);
+	sx->sx_lock = SX_LOCK_UNLOCKED;
+	sx->sx_recurse = 0;
+	lock_init(&sx->lock_object, &lock_class_sx, description, NULL, flags);
 }
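
A minimal usage sketch for the new initializer (the softc and lock name are
hypothetical, not part of this patch):

	/* Create a sleepable, recursable shared/exclusive lock. */
	sx_init_flags(&sc->sc_lock, "examplesx", SX_RECURSE);
	/* ... use the lock ... */
	sx_destroy(&sc->sc_lock);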
 
 void
 sx_destroy(struct sx *sx)
 {
 
-	LOCK_LOG_DESTROY(&sx->sx_object, 0);
+	KASSERT(sx->sx_lock == SX_LOCK_UNLOCKED, ("sx lock still held"));
+	KASSERT(sx->sx_recurse == 0, ("sx lock still recursed"));
+	sx->sx_lock = SX_LOCK_DESTROYED;
+	lock_destroy(&sx->lock_object);
+}
 
-	KASSERT((sx->sx_cnt == 0 && sx->sx_shrd_wcnt == 0 && sx->sx_excl_wcnt ==
-	    0), ("%s (%s): holders or waiters\n", __func__,
-	    sx->sx_object.lo_name));
+int
+_sx_slock(struct sx *sx, int opts, const char *file, int line)
+{
+	int error = 0;
 
-	sx->sx_lock = NULL;
-	cv_destroy(&sx->sx_shrd_cv);
-	cv_destroy(&sx->sx_excl_cv);
+	MPASS(curthread != NULL);
+	KASSERT(sx->sx_lock != SX_LOCK_DESTROYED,
+	    ("sx_slock() of destroyed sx @ %s:%d", file, line));
+	WITNESS_CHECKORDER(&sx->lock_object, LOP_NEWORDER, file, line);
+	error = __sx_slock(sx, opts, file, line);
+	if (!error) {
+		LOCK_LOG_LOCK("SLOCK", &sx->lock_object, 0, 0, file, line);
+		WITNESS_LOCK(&sx->lock_object, 0, file, line);
+		curthread->td_locks++;
+	}
 
-	WITNESS_DESTROY(&sx->sx_object);
+	return (error);
 }
 
-void
-_sx_slock(struct sx *sx, const char *file, int line)
+int
+_sx_try_slock(struct sx *sx, const char *file, int line)
 {
+	uintptr_t x;
 
-	mtx_lock(sx->sx_lock);
-	KASSERT(sx->sx_xholder != curthread,
-	    ("%s (%s): slock while xlock is held @ %s:%d\n", __func__,
-	    sx->sx_object.lo_name, file, line));
-	WITNESS_CHECKORDER(&sx->sx_object, LOP_NEWORDER, file, line);
-
-	/*
-	 * Loop in case we lose the race for lock acquisition.
-	 */
-	while (sx->sx_cnt < 0) {
-		sx->sx_shrd_wcnt++;
-		cv_wait(&sx->sx_shrd_cv, sx->sx_lock);
-		sx->sx_shrd_wcnt--;
+	for (;;) {
+		x = sx->sx_lock;
+		KASSERT(x != SX_LOCK_DESTROYED,
+		    ("sx_try_slock() of destroyed sx @ %s:%d", file, line));
+		if (!(x & SX_LOCK_SHARED))
+			break;
+		if (atomic_cmpset_acq_ptr(&sx->sx_lock, x, x + SX_ONE_SHARER)) {
+			LOCK_LOG_TRY("SLOCK", &sx->lock_object, 0, 1, file, line);
+			WITNESS_LOCK(&sx->lock_object, LOP_TRYLOCK, file, line);
+			curthread->td_locks++;
+			return (1);
+		}
 	}
 
-	/* Acquire a shared lock. */
-	sx->sx_cnt++;
+	LOCK_LOG_TRY("SLOCK", &sx->lock_object, 0, 0, file, line);
+	return (0);
+}
 
-	LOCK_LOG_LOCK("SLOCK", &sx->sx_object, 0, 0, file, line);
-	WITNESS_LOCK(&sx->sx_object, 0, file, line);
+int
+_sx_xlock(struct sx *sx, int opts, const char *file, int line)
+{
+	int error = 0;
+
+	MPASS(curthread != NULL);
+	KASSERT(sx->sx_lock != SX_LOCK_DESTROYED,
+	    ("sx_xlock() of destroyed sx @ %s:%d", file, line));
+	WITNESS_CHECKORDER(&sx->lock_object, LOP_NEWORDER | LOP_EXCLUSIVE, file,
+	    line);
+	error = __sx_xlock(sx, curthread, opts, file, line);
+	if (!error) {
+		LOCK_LOG_LOCK("XLOCK", &sx->lock_object, 0, sx->sx_recurse,
+		    file, line);
+		WITNESS_LOCK(&sx->lock_object, LOP_EXCLUSIVE, file, line);
+		curthread->td_locks++;
+	}
 
-	mtx_unlock(sx->sx_lock);
+	return (error);
 }
 
 int
-_sx_try_slock(struct sx *sx, const char *file, int line)
+_sx_try_xlock(struct sx *sx, const char *file, int line)
 {
+	int rval;
 
-	mtx_lock(sx->sx_lock);
-	if (sx->sx_cnt >= 0) {
-		sx->sx_cnt++;
-		LOCK_LOG_TRY("SLOCK", &sx->sx_object, 0, 1, file, line);
-		WITNESS_LOCK(&sx->sx_object, LOP_TRYLOCK, file, line);
-		mtx_unlock(sx->sx_lock);
-		return (1);
-	} else {
-		LOCK_LOG_TRY("SLOCK", &sx->sx_object, 0, 0, file, line);
-		mtx_unlock(sx->sx_lock);
-		return (0);
+	MPASS(curthread != NULL);
+	KASSERT(sx->sx_lock != SX_LOCK_DESTROYED,
+	    ("sx_try_xlock() of destroyed sx @ %s:%d", file, line));
+
+	if (sx_xlocked(sx) && (sx->lock_object.lo_flags & SX_RECURSE) != 0) {
+		sx->sx_recurse++;
+		atomic_set_ptr(&sx->sx_lock, SX_LOCK_RECURSED);
+		rval = 1;
+	} else
+		rval = atomic_cmpset_acq_ptr(&sx->sx_lock, SX_LOCK_UNLOCKED,
+		    (uintptr_t)curthread);
+	LOCK_LOG_TRY("XLOCK", &sx->lock_object, 0, rval, file, line);
+	if (rval) {
+		WITNESS_LOCK(&sx->lock_object, LOP_EXCLUSIVE | LOP_TRYLOCK,
+		    file, line);
+		curthread->td_locks++;
 	}
+
+	return (rval);
+}
+
+void
+_sx_sunlock(struct sx *sx, const char *file, int line)
+{
+
+	MPASS(curthread != NULL);
+	KASSERT(sx->sx_lock != SX_LOCK_DESTROYED,
+	    ("sx_sunlock() of destroyed sx @ %s:%d", file, line));
+	_sx_assert(sx, SA_SLOCKED, file, line);
+	curthread->td_locks--;
+	WITNESS_UNLOCK(&sx->lock_object, 0, file, line);
+	LOCK_LOG_LOCK("SUNLOCK", &sx->lock_object, 0, 0, file, line);
+#ifdef LOCK_PROFILING_SHARED
+	if (SX_SHARERS(sx->sx_lock) == 1)
+		lock_profile_release_lock(&sx->lock_object);
+#endif
+	__sx_sunlock(sx, file, line);
 }
 
 void
-_sx_xlock(struct sx *sx, const char *file, int line)
+_sx_xunlock(struct sx *sx, const char *file, int line)
+{
+
+	MPASS(curthread != NULL);
+	KASSERT(sx->sx_lock != SX_LOCK_DESTROYED,
+	    ("sx_xunlock() of destroyed sx @ %s:%d", file, line));
+	_sx_assert(sx, SA_XLOCKED, file, line);
+	curthread->td_locks--;
+	WITNESS_UNLOCK(&sx->lock_object, LOP_EXCLUSIVE, file, line);
+	LOCK_LOG_LOCK("XUNLOCK", &sx->lock_object, 0, sx->sx_recurse, file,
+	    line);
+	if (!sx_recursed(sx))
+		lock_profile_release_lock(&sx->lock_object);
+	__sx_xunlock(sx, curthread, file, line);
+}
+
+/*
+ * Try to do a non-blocking upgrade from a shared lock to an exclusive lock.
+ * This will only succeed if this thread holds a single shared lock.
+ * Return 1 if the upgrade succeeds, 0 otherwise.
+ */
+int
+_sx_try_upgrade(struct sx *sx, const char *file, int line)
 {
+	uintptr_t x;
+	int success;
 
-	mtx_lock(sx->sx_lock);
+	KASSERT(sx->sx_lock != SX_LOCK_DESTROYED,
+	    ("sx_try_upgrade() of destroyed sx @ %s:%d", file, line));
+	_sx_assert(sx, SA_SLOCKED, file, line);
 
 	/*
-	 * With sx locks, we're absolutely not permitted to recurse on
-	 * xlocks, as it is fatal (deadlock). Normally, recursion is handled
-	 * by WITNESS, but as it is not semantically correct to hold the
-	 * xlock while in here, we consider it API abuse and put it under
-	 * INVARIANTS.
+	 * Try to switch from one shared lock to an exclusive lock.  We need
+	 * to maintain the SX_LOCK_EXCLUSIVE_WAITERS flag if set so that
+	 * we will wake up the exclusive waiters when we drop the lock.
 	 */
-	KASSERT(sx->sx_xholder != curthread,
-	    ("%s (%s): xlock already held @ %s:%d", __func__,
-	    sx->sx_object.lo_name, file, line));
-	WITNESS_CHECKORDER(&sx->sx_object, LOP_NEWORDER | LOP_EXCLUSIVE, file,
-	    line);
+	x = sx->sx_lock & SX_LOCK_EXCLUSIVE_WAITERS;
+	success = atomic_cmpset_ptr(&sx->sx_lock, SX_SHARERS_LOCK(1) | x,
+	    (uintptr_t)curthread | x);
+	LOCK_LOG_TRY("XUPGRADE", &sx->lock_object, 0, success, file, line);
+	if (success)
+		WITNESS_UPGRADE(&sx->lock_object, LOP_EXCLUSIVE | LOP_TRYLOCK,
+		    file, line);
+	return (success);
+}
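
A sketch of the read-mostly pattern this supports (the softc and the
needs_update()/update() helpers are hypothetical, for illustration only):

	sx_slock(&sc->sc_lock);
	if (needs_update(sc)) {
		if (!sx_try_upgrade(&sc->sc_lock)) {
			/*
			 * Lost the upgrade race: drop the shared lock, take
			 * the exclusive lock, and recheck the condition.
			 */
			sx_sunlock(&sc->sc_lock);
			sx_xlock(&sc->sc_lock);
		}
		if (needs_update(sc))
			update(sc);
		sx_xunlock(&sc->sc_lock);
	} else
		sx_sunlock(&sc->sc_lock);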
 
-	/* Loop in case we lose the race for lock acquisition. */
-	while (sx->sx_cnt != 0) {
-		sx->sx_excl_wcnt++;
-		cv_wait(&sx->sx_excl_cv, sx->sx_lock);
-		sx->sx_excl_wcnt--;
-	}
+/*
+ * Downgrade an unrecursed exclusive lock into a single shared lock.
+ */
+void
+_sx_downgrade(struct sx *sx, const char *file, int line)
+{
+	uintptr_t x;
+
+	KASSERT(sx->sx_lock != SX_LOCK_DESTROYED,
+	    ("sx_downgrade() of destroyed sx @ %s:%d", file, line));
+	_sx_assert(sx, SA_XLOCKED | SA_NOTRECURSED, file, line);
+#ifndef INVARIANTS
+	if (sx_recursed(sx))
+		panic("downgrade of a recursed lock");
+#endif
 
-	MPASS(sx->sx_cnt == 0);
+	WITNESS_DOWNGRADE(&sx->lock_object, 0, file, line);
+
+	/*
+	 * Try to switch from an exclusive lock with no shared waiters
+	 * to one sharer with no shared waiters.  If there are
+	 * exclusive waiters, we don't need to lock the sleep queue so
+	 * long as we preserve the flag.  We do one quick try and if
+	 * that fails we grab the sleepq lock to keep the flags from
+	 * changing and do it the slow way.
+	 *
+	 * We have to lock the sleep queue if there are shared waiters
+	 * so we can wake them up.
+	 */
+	x = sx->sx_lock;
+	if (!(x & SX_LOCK_SHARED_WAITERS) &&
+	    atomic_cmpset_rel_ptr(&sx->sx_lock, x, SX_SHARERS_LOCK(1) |
+	    (x & SX_LOCK_EXCLUSIVE_WAITERS))) {
+		LOCK_LOG_LOCK("XDOWNGRADE", &sx->lock_object, 0, 0, file, line);
+		return;
+	}
 
-	/* Acquire an exclusive lock. */
-	sx->sx_cnt--;
-	sx->sx_xholder = curthread;
+	/*
+	 * Lock the sleep queue so we can read the waiters bits
+	 * without any races and wakeup any shared waiters.
+	 */
+	sleepq_lock(&sx->lock_object);
 
-	LOCK_LOG_LOCK("XLOCK", &sx->sx_object, 0, 0, file, line);
-	WITNESS_LOCK(&sx->sx_object, LOP_EXCLUSIVE, file, line);
+	/*
+	 * Preserve SX_LOCK_EXCLUSIVE_WAITERS while downgraded to a single
+	 * shared lock.  If there are any shared waiters, wake them up.
+	 */
+	x = sx->sx_lock;
+	atomic_store_rel_ptr(&sx->sx_lock, SX_SHARERS_LOCK(1) |
+	    (x & SX_LOCK_EXCLUSIVE_WAITERS));
+	if (x & SX_LOCK_SHARED_WAITERS)
+		sleepq_broadcast(&sx->lock_object, SLEEPQ_SX, -1,
+		    SQ_SHARED_QUEUE);
+	else
+		sleepq_release(&sx->lock_object);
 
-	mtx_unlock(sx->sx_lock);
+	LOCK_LOG_LOCK("XDOWNGRADE", &sx->lock_object, 0, 0, file, line);
 }
 
+/*
+ * This function represents the so-called 'hard case' for sx_xlock
+ * operation.  All 'easy case' failures are redirected to this.  Note
+ * that ideally this would be a static function, but it needs to be
+ * accessible from at least sx.h.
+ */
 int
-_sx_try_xlock(struct sx *sx, const char *file, int line)
+_sx_xlock_hard(struct sx *sx, uintptr_t tid, int opts, const char *file,
+    int line)
 {
-
-	mtx_lock(sx->sx_lock);
-	if (sx->sx_cnt == 0) {
-		sx->sx_cnt--;
-		sx->sx_xholder = curthread;
-		LOCK_LOG_TRY("XLOCK", &sx->sx_object, 0, 1, file, line);
-		WITNESS_LOCK(&sx->sx_object, LOP_EXCLUSIVE | LOP_TRYLOCK, file,
-		    line);
-		mtx_unlock(sx->sx_lock);
-		return (1);
-	} else {
-		LOCK_LOG_TRY("XLOCK", &sx->sx_object, 0, 0, file, line);
-		mtx_unlock(sx->sx_lock);
+	GIANT_DECLARE;
+#ifdef ADAPTIVE_SX
+	volatile struct thread *owner;
+#endif
+	uint64_t waittime = 0;
+	uintptr_t x;
+	int contested = 0, error = 0;
+
+	/* If we already hold an exclusive lock, then recurse. */
+	if (sx_xlocked(sx)) {
+		KASSERT((sx->lock_object.lo_flags & SX_RECURSE) != 0,
+	    ("_sx_xlock_hard: recursed on non-recursive sx %s @ %s:%d\n",
+		    sx->lock_object.lo_name, file, line));
+		sx->sx_recurse++;
+		atomic_set_ptr(&sx->sx_lock, SX_LOCK_RECURSED);
+		if (LOCK_LOG_TEST(&sx->lock_object, 0))
+			CTR2(KTR_LOCK, "%s: %p recursing", __func__, sx);
 		return (0);
 	}
-}
 
-void
-_sx_sunlock(struct sx *sx, const char *file, int line)
-{
+	if (LOCK_LOG_TEST(&sx->lock_object, 0))
+		CTR5(KTR_LOCK, "%s: %s contested (lock=%p) at %s:%d", __func__,
+		    sx->lock_object.lo_name, (void *)sx->sx_lock, file, line);
+
+	while (!atomic_cmpset_acq_ptr(&sx->sx_lock, SX_LOCK_UNLOCKED, tid)) {
+#ifdef ADAPTIVE_SX
+		/*
+		 * If the lock is write locked and the owner is
+		 * running on another CPU, spin until the owner stops
+		 * running or the state of the lock changes.
+		 */
+		x = sx->sx_lock;
+		if (!(x & SX_LOCK_SHARED) &&
+		    (sx->lock_object.lo_flags & SX_ADAPTIVESPIN)) {
+			x = SX_OWNER(x);
+			owner = (struct thread *)x;
+			if (TD_IS_RUNNING(owner)) {
+				if (LOCK_LOG_TEST(&sx->lock_object, 0))
+					CTR3(KTR_LOCK,
+					    "%s: spinning on %p held by %p",
+					    __func__, sx, owner);
+				GIANT_SAVE();
+				lock_profile_obtain_lock_failed(
+				    &sx->lock_object, &contested, &waittime);
+				while (SX_OWNER(sx->sx_lock) == x &&
+				    TD_IS_RUNNING(owner))
+					cpu_spinwait();
+				continue;
+			}
+		}
+#endif
 
-	_sx_assert(sx, SX_SLOCKED, file, line);
-	mtx_lock(sx->sx_lock);
+		sleepq_lock(&sx->lock_object);
+		x = sx->sx_lock;
 
-	WITNESS_UNLOCK(&sx->sx_object, 0, file, line);
+		/*
+		 * If the lock was released while spinning on the
+		 * sleep queue chain lock, try again.
+		 */
+		if (x == SX_LOCK_UNLOCKED) {
+			sleepq_release(&sx->lock_object);
+			continue;
+		}
 
-	/* Release. */
-	sx->sx_cnt--;
+#ifdef ADAPTIVE_SX
+		/*
+		 * The current lock owner might have started executing
+		 * on another CPU (or the lock could have changed
+		 * owners) while we were waiting on the sleep queue
+		 * chain lock.  If so, drop the sleep queue lock and try
+		 * again.
+		 */
+		if (!(x & SX_LOCK_SHARED) &&
+		    (sx->lock_object.lo_flags & SX_ADAPTIVESPIN)) {
+			owner = (struct thread *)SX_OWNER(x);
+			if (TD_IS_RUNNING(owner)) {
+				sleepq_release(&sx->lock_object);
+				continue;
+			}
+		}
+#endif
 
-	/*
-	 * If we just released the last shared lock, wake any waiters up, giving
-	 * exclusive lockers precedence.  In order to make sure that exclusive
-	 * lockers won't be blocked forever, don't wake shared lock waiters if
-	 * there are exclusive lock waiters.
-	 */
-	if (sx->sx_excl_wcnt > 0) {
-		if (sx->sx_cnt == 0)
-			cv_signal(&sx->sx_excl_cv);
-	} else if (sx->sx_shrd_wcnt > 0)
-		cv_broadcast(&sx->sx_shrd_cv);
+		/*
+		 * If an exclusive lock was released with both shared
+		 * and exclusive waiters and a shared waiter hasn't
+		 * woken up and acquired the lock yet, sx_lock will be
+		 * set to SX_LOCK_UNLOCKED | SX_LOCK_EXCLUSIVE_WAITERS.
+		 * If we see that value, try to acquire it once.  Note
+		 * that we have to preserve SX_LOCK_EXCLUSIVE_WAITERS
+		 * as there are other exclusive waiters still.  If we
+		 * fail, restart the loop.
+		 */
+		if (x == (SX_LOCK_UNLOCKED | SX_LOCK_EXCLUSIVE_WAITERS)) {
+			if (atomic_cmpset_acq_ptr(&sx->sx_lock,
+			    SX_LOCK_UNLOCKED | SX_LOCK_EXCLUSIVE_WAITERS,
+			    tid | SX_LOCK_EXCLUSIVE_WAITERS)) {
+				sleepq_release(&sx->lock_object);
+				CTR2(KTR_LOCK, "%s: %p claimed by new writer",
+				    __func__, sx);
+				break;
+			}
+			sleepq_release(&sx->lock_object);
+			continue;
+		}
 
-	LOCK_LOG_LOCK("SUNLOCK", &sx->sx_object, 0, 0, file, line);
+		/*
+		 * Try to set the SX_LOCK_EXCLUSIVE_WAITERS flag.  If we fail,
+		 * then loop back and retry.
+		 */
+		if (!(x & SX_LOCK_EXCLUSIVE_WAITERS)) {
+			if (!atomic_cmpset_ptr(&sx->sx_lock, x,
+			    x | SX_LOCK_EXCLUSIVE_WAITERS)) {
+				sleepq_release(&sx->lock_object);
+				continue;
+			}
+			if (LOCK_LOG_TEST(&sx->lock_object, 0))
+				CTR2(KTR_LOCK, "%s: %p set excl waiters flag",
+				    __func__, sx);
+		}
+
+		/*
+		 * Since we have been unable to acquire the exclusive
+		 * lock and the exclusive waiters flag is set, we have
+		 * to sleep.
+		 */
+		if (LOCK_LOG_TEST(&sx->lock_object, 0))
+			CTR2(KTR_LOCK, "%s: %p blocking on sleep queue",
+			    __func__, sx);
+
+		GIANT_SAVE();
+		lock_profile_obtain_lock_failed(&sx->lock_object, &contested,
+		    &waittime);
+		sleepq_add(&sx->lock_object, NULL, sx->lock_object.lo_name,
+		    SLEEPQ_SX | ((opts & SX_INTERRUPTIBLE) ?
+		    SLEEPQ_INTERRUPTIBLE : 0), SQ_EXCLUSIVE_QUEUE);
+		if (!(opts & SX_INTERRUPTIBLE))
+			sleepq_wait(&sx->lock_object);
+		else
+			error = sleepq_wait_sig(&sx->lock_object);
+
+		if (error) {
+			if (LOCK_LOG_TEST(&sx->lock_object, 0))
+				CTR2(KTR_LOCK,
+			"%s: interruptible sleep by %p suspended by signal",
+				    __func__, sx);
+			break;
+		}
+		if (LOCK_LOG_TEST(&sx->lock_object, 0))
+			CTR2(KTR_LOCK, "%s: %p resuming from sleep queue",
+			    __func__, sx);
+	}
 
-	mtx_unlock(sx->sx_lock);
+	GIANT_RESTORE();
+	if (!error)
+		lock_profile_obtain_lock_success(&sx->lock_object, contested,
+		    waittime, file, line);
+	return (error);
 }
 
+/*
+ * This function represents the so-called 'hard case' for sx_xunlock
+ * operation.  All 'easy case' failures are redirected to this.  Note
+ * that ideally this would be a static function, but it needs to be
+ * accessible from at least sx.h.
+ */
 void
-_sx_xunlock(struct sx *sx, const char *file, int line)
+_sx_xunlock_hard(struct sx *sx, uintptr_t tid, const char *file, int line)
 {
+	uintptr_t x;
+	int queue;
 
-	_sx_assert(sx, SX_XLOCKED, file, line);
-	mtx_lock(sx->sx_lock);
-	MPASS(sx->sx_cnt == -1);
-
-	WITNESS_UNLOCK(&sx->sx_object, LOP_EXCLUSIVE, file, line);
-
-	/* Release. */
-	sx->sx_cnt++;
-	sx->sx_xholder = NULL;
+	MPASS(!(sx->sx_lock & SX_LOCK_SHARED));
 
-	/*
-	 * Wake up waiters if there are any.  Give precedence to slock waiters.
-	 */
-	if (sx->sx_shrd_wcnt > 0)
-		cv_broadcast(&sx->sx_shrd_cv);
-	else if (sx->sx_excl_wcnt > 0)
-		cv_signal(&sx->sx_excl_cv);
+	/* If the lock is recursed, then unrecurse one level. */
+	if (sx_xlocked(sx) && sx_recursed(sx)) {
+		if ((--sx->sx_recurse) == 0)
+			atomic_clear_ptr(&sx->sx_lock, SX_LOCK_RECURSED);
+		if (LOCK_LOG_TEST(&sx->lock_object, 0))
+			CTR2(KTR_LOCK, "%s: %p unrecursing", __func__, sx);
+		return;
+	}
+	MPASS(sx->sx_lock & (SX_LOCK_SHARED_WAITERS |
+	    SX_LOCK_EXCLUSIVE_WAITERS));
+	if (LOCK_LOG_TEST(&sx->lock_object, 0))
+		CTR2(KTR_LOCK, "%s: %p contested", __func__, sx);
 
-	LOCK_LOG_LOCK("XUNLOCK", &sx->sx_object, 0, 0, file, line);
+	sleepq_lock(&sx->lock_object);
+	x = SX_LOCK_UNLOCKED;
 
-	mtx_unlock(sx->sx_lock);
+	/*
+	 * The wake up algorithm here is quite simple and probably not
+	 * ideal.  It gives precedence to shared waiters if they are
+	 * present.  For this condition, we have to preserve the
+	 * state of the exclusive waiters flag.
+	 */
+	if (sx->sx_lock & SX_LOCK_SHARED_WAITERS) {
+		queue = SQ_SHARED_QUEUE;
+		x |= (sx->sx_lock & SX_LOCK_EXCLUSIVE_WAITERS);
+	} else
+		queue = SQ_EXCLUSIVE_QUEUE;
+
+	/* Wake up all the waiters for the specific queue. */
+	if (LOCK_LOG_TEST(&sx->lock_object, 0))
+		CTR3(KTR_LOCK, "%s: %p waking up all threads on %s queue",
+		    __func__, sx, queue == SQ_SHARED_QUEUE ? "shared" :
+		    "exclusive");
+	atomic_store_rel_ptr(&sx->sx_lock, x);
+	sleepq_broadcast(&sx->lock_object, SLEEPQ_SX, -1, queue);
 }
 
+/*
+ * This function represents the so-called 'hard case' for sx_slock
+ * operation.  All 'easy case' failures are redirected to this.  Note
+ * that ideally this would be a static function, but it needs to be
+ * accessible from at least sx.h.
+ */
 int
-_sx_try_upgrade(struct sx *sx, const char *file, int line)
+_sx_slock_hard(struct sx *sx, int opts, const char *file, int line)
 {
+	GIANT_DECLARE;
+#ifdef ADAPTIVE_SX
+	volatile struct thread *owner;
+#endif
+#ifdef LOCK_PROFILING_SHARED
+	uint64_t waittime = 0;
+	int contested = 0;
+#endif
+	uintptr_t x;
+	int error = 0;
 
-	_sx_assert(sx, SX_SLOCKED, file, line);
-	mtx_lock(sx->sx_lock);
+	/*
+	 * As with rwlocks, we don't make any attempt to try to block
+	 * shared locks once there is an exclusive waiter.
+	 */
+	for (;;) {
+		x = sx->sx_lock;
 
-	if (sx->sx_cnt == 1) {
-		sx->sx_cnt = -1;
-		sx->sx_xholder = curthread;
+		/*
+		 * If no other thread has an exclusive lock then try to bump up
+		 * the count of sharers.  Since we have to preserve the state
+		 * of SX_LOCK_EXCLUSIVE_WAITERS, if we fail to acquire the
+		 * shared lock loop back and retry.
+		 */
+		if (x & SX_LOCK_SHARED) {
+			MPASS(!(x & SX_LOCK_SHARED_WAITERS));
+			if (atomic_cmpset_acq_ptr(&sx->sx_lock, x,
+			    x + SX_ONE_SHARER)) {
+#ifdef LOCK_PROFILING_SHARED
+				if (SX_SHARERS(x) == 0)
+					lock_profile_obtain_lock_success(
+					    &sx->lock_object, contested,
+					    waittime, file, line);
+#endif
+				if (LOCK_LOG_TEST(&sx->lock_object, 0))
+					CTR4(KTR_LOCK,
+					    "%s: %p succeed %p -> %p", __func__,
+					    sx, (void *)x,
+					    (void *)(x + SX_ONE_SHARER));
+				break;
+			}
+			continue;
+		}
 
-		LOCK_LOG_TRY("XUPGRADE", &sx->sx_object, 0, 1, file, line);
-		WITNESS_UPGRADE(&sx->sx_object, LOP_EXCLUSIVE | LOP_TRYLOCK,
-		    file, line);
+#ifdef ADAPTIVE_SX
+		/*
+		 * If the owner is running on another CPU, spin until
+		 * the owner stops running or the state of the lock
+		 * changes.
+		 */
+		else if (sx->lock_object.lo_flags & SX_ADAPTIVESPIN) {
+			x = SX_OWNER(x);
+			owner = (struct thread *)x;
+			if (TD_IS_RUNNING(owner)) {
+				if (LOCK_LOG_TEST(&sx->lock_object, 0))
+					CTR3(KTR_LOCK,
+					    "%s: spinning on %p held by %p",
+					    __func__, sx, owner);
+				GIANT_SAVE();
+#ifdef LOCK_PROFILING_SHARED
+				lock_profile_obtain_lock_failed(
+				    &sx->lock_object, &contested, &waittime);
+#endif
+				while (SX_OWNER(sx->sx_lock) == x &&
+				    TD_IS_RUNNING(owner))
+					cpu_spinwait();
+				continue;
+			}
+		}
+#endif
 
-		mtx_unlock(sx->sx_lock);
-		return (1);
-	} else {
-		LOCK_LOG_TRY("XUPGRADE", &sx->sx_object, 0, 0, file, line);
-		mtx_unlock(sx->sx_lock);
-		return (0);
+		/*
+		 * Some other thread already has an exclusive lock, so
+		 * start the process of blocking.
+		 */
+		sleepq_lock(&sx->lock_object);
+		x = sx->sx_lock;
+
+		/*
+		 * The lock could have been released while we spun.
+		 * In this case loop back and retry.
+		 */
+		if (x & SX_LOCK_SHARED) {
+			sleepq_release(&sx->lock_object);
+			continue;
+		}
+
+#ifdef ADAPTIVE_SX
+		/*
+		 * If the owner is running on another CPU, spin until
+		 * the owner stops running or the state of the lock
+		 * changes.
+		 */
+		if (!(x & SX_LOCK_SHARED) &&
+		    (sx->lock_object.lo_flags & SX_ADAPTIVESPIN)) {
+			owner = (struct thread *)SX_OWNER(x);
+			if (TD_IS_RUNNING(owner)) {
+				sleepq_release(&sx->lock_object);
+				continue;
+			}
+		}
+#endif
+
+		/*
+		 * Try to set the SX_LOCK_SHARED_WAITERS flag.  If we
+		 * fail to set it drop the sleep queue lock and loop
+		 * back.
+		 */
+		if (!(x & SX_LOCK_SHARED_WAITERS)) {
+			if (!atomic_cmpset_ptr(&sx->sx_lock, x,
+			    x | SX_LOCK_SHARED_WAITERS)) {
+				sleepq_release(&sx->lock_object);
+				continue;
+			}
+			if (LOCK_LOG_TEST(&sx->lock_object, 0))
+				CTR2(KTR_LOCK, "%s: %p set shared waiters flag",
+				    __func__, sx);
+		}
+
+		/*
+		 * Since we have been unable to acquire the shared lock,
+		 * we have to sleep.
+		 */
+		if (LOCK_LOG_TEST(&sx->lock_object, 0))
+			CTR2(KTR_LOCK, "%s: %p blocking on sleep queue",
+			    __func__, sx);
+
+		GIANT_SAVE();
+#ifdef LOCK_PROFILING_SHARED
+		lock_profile_obtain_lock_failed(&sx->lock_object, &contested,
+		    &waittime);
+#endif
+		sleepq_add(&sx->lock_object, NULL, sx->lock_object.lo_name,
+		    SLEEPQ_SX | ((opts & SX_INTERRUPTIBLE) ?
+		    SLEEPQ_INTERRUPTIBLE : 0), SQ_SHARED_QUEUE);
+		if (!(opts & SX_INTERRUPTIBLE))
+			sleepq_wait(&sx->lock_object);
+		else
+			error = sleepq_wait_sig(&sx->lock_object);
+
+		if (error) {
+			if (LOCK_LOG_TEST(&sx->lock_object, 0))
+				CTR2(KTR_LOCK,
+			"%s: interruptible sleep by %p suspended by signal",
+				    __func__, sx);
+			break;
+		}
+		if (LOCK_LOG_TEST(&sx->lock_object, 0))
+			CTR2(KTR_LOCK, "%s: %p resuming from sleep queue",
+			    __func__, sx);
 	}
+
+	GIANT_RESTORE();
+	return (error);
 }
 
+/*
+ * This function represents the so-called 'hard case' for sx_sunlock
+ * operation.  All 'easy case' failures are redirected to this.  Note
+ * that ideally this would be a static function, but it needs to be
+ * accessible from at least sx.h.
+ */
 void
-_sx_downgrade(struct sx *sx, const char *file, int line)
+_sx_sunlock_hard(struct sx *sx, const char *file, int line)
 {
+	uintptr_t x;
+
+	for (;;) {
+		x = sx->sx_lock;
 
-	_sx_assert(sx, SX_XLOCKED, file, line);
-	mtx_lock(sx->sx_lock);
-	MPASS(sx->sx_cnt == -1);
+		/*
+		 * We should never have waiting sharers while at least one
+		 * thread holds a shared lock.
+		 */
+		KASSERT(!(x & SX_LOCK_SHARED_WAITERS),
+		    ("%s: waiting sharers", __func__));
 
-	WITNESS_DOWNGRADE(&sx->sx_object, 0, file, line);
+		/*
+		 * See if there is more than one shared lock held.  If
+		 * so, just drop one and return.
+		 */
+		if (SX_SHARERS(x) > 1) {
+			if (atomic_cmpset_ptr(&sx->sx_lock, x,
+			    x - SX_ONE_SHARER)) {
+				if (LOCK_LOG_TEST(&sx->lock_object, 0))
+					CTR4(KTR_LOCK,
+					    "%s: %p succeeded %p -> %p",
+					    __func__, sx, (void *)x,
+					    (void *)(x - SX_ONE_SHARER));
+				break;
+			}
+			continue;
+		}
+
+		/*
+		 * If there aren't any waiters for an exclusive lock,
+		 * then try to drop it quickly.
+		 */
+		if (!(x & SX_LOCK_EXCLUSIVE_WAITERS)) {
+			MPASS(x == SX_SHARERS_LOCK(1));
+			if (atomic_cmpset_ptr(&sx->sx_lock, SX_SHARERS_LOCK(1),
+			    SX_LOCK_UNLOCKED)) {
+				if (LOCK_LOG_TEST(&sx->lock_object, 0))
+					CTR2(KTR_LOCK, "%s: %p last succeeded",
+					    __func__, sx);
+				break;
+			}
+			continue;
+		}
 
-	sx->sx_cnt = 1;
-	sx->sx_xholder = NULL;
-        if (sx->sx_shrd_wcnt > 0)
-                cv_broadcast(&sx->sx_shrd_cv);
+		/*
+		 * At this point, there should just be one sharer with
+		 * exclusive waiters.
+		 */
+		MPASS(x == (SX_SHARERS_LOCK(1) | SX_LOCK_EXCLUSIVE_WAITERS));
 
-	LOCK_LOG_LOCK("XDOWNGRADE", &sx->sx_object, 0, 0, file, line);
+		sleepq_lock(&sx->lock_object);
 
-	mtx_unlock(sx->sx_lock);
+		/*
+		 * The wakeup semantic here is quite simple: just wake up
+		 * all the exclusive waiters.  Note that the state of the
+		 * lock could have changed, so if the cmpset fails, loop
+		 * back and retry.
+		 */
+		if (!atomic_cmpset_ptr(&sx->sx_lock,
+		    SX_SHARERS_LOCK(1) | SX_LOCK_EXCLUSIVE_WAITERS,
+		    SX_LOCK_UNLOCKED)) {
+			sleepq_release(&sx->lock_object);
+			continue;
+		}
+		if (LOCK_LOG_TEST(&sx->lock_object, 0))
+			CTR2(KTR_LOCK, "%s: %p waking up all threads on"
+			    " exclusive queue", __func__, sx);
+		sleepq_broadcast(&sx->lock_object, SLEEPQ_SX, -1,
+		    SQ_EXCLUSIVE_QUEUE);
+		break;
+	}
 }
 
 #ifdef INVARIANT_SUPPORT
@@ -334,44 +887,76 @@
 void
 _sx_assert(struct sx *sx, int what, const char *file, int line)
 {
+#ifndef WITNESS
+	int slocked = 0;
+#endif
 
 	if (panicstr != NULL)
 		return;
 	switch (what) {
-	case SX_LOCKED:
-	case SX_SLOCKED:
+	case SA_SLOCKED:
+	case SA_SLOCKED | SA_NOTRECURSED:
+	case SA_SLOCKED | SA_RECURSED:
+#ifndef WITNESS
+		slocked = 1;
+		/* FALLTHROUGH */
+#endif
+	case SA_LOCKED:
+	case SA_LOCKED | SA_NOTRECURSED:
+	case SA_LOCKED | SA_RECURSED:
 #ifdef WITNESS
-		witness_assert(&sx->sx_object, what, file, line);
+		witness_assert(&sx->lock_object, what, file, line);
 #else
-		mtx_lock(sx->sx_lock);
-		if (sx->sx_cnt <= 0 &&
-		    (what == SX_SLOCKED || sx->sx_xholder != curthread))
+		/*
+		 * If some other thread has an exclusive lock or we
+		 * have one and are asserting a shared lock, fail.
+		 * Also, if no one has a lock at all, fail.
+		 */
+		if (sx->sx_lock == SX_LOCK_UNLOCKED ||
+		    (!(sx->sx_lock & SX_LOCK_SHARED) && (slocked ||
+		    sx_xholder(sx) != curthread)))
 			panic("Lock %s not %slocked @ %s:%d\n",
-			    sx->sx_object.lo_name, (what == SX_SLOCKED) ?
-			    "share " : "", file, line);
-		mtx_unlock(sx->sx_lock);
+			    sx->lock_object.lo_name, slocked ? "share " : "",
+			    file, line);
+
+		if (!(sx->sx_lock & SX_LOCK_SHARED)) {
+			if (sx_recursed(sx)) {
+				if (what & SA_NOTRECURSED)
+					panic("Lock %s recursed @ %s:%d\n",
+					    sx->lock_object.lo_name, file,
+					    line);
+			} else if (what & SA_RECURSED)
+				panic("Lock %s not recursed @ %s:%d\n",
+				    sx->lock_object.lo_name, file, line);
+		}
 #endif
 		break;
-	case SX_XLOCKED:
-		mtx_lock(sx->sx_lock);
-		if (sx->sx_xholder != curthread)
+	case SA_XLOCKED:
+	case SA_XLOCKED | SA_NOTRECURSED:
+	case SA_XLOCKED | SA_RECURSED:
+		if (sx_xholder(sx) != curthread)
 			panic("Lock %s not exclusively locked @ %s:%d\n",
-			    sx->sx_object.lo_name, file, line);
-		mtx_unlock(sx->sx_lock);
+			    sx->lock_object.lo_name, file, line);
+		if (sx_recursed(sx)) {
+			if (what & SA_NOTRECURSED)
+				panic("Lock %s recursed @ %s:%d\n",
+				    sx->lock_object.lo_name, file, line);
+		} else if (what & SA_RECURSED)
+			panic("Lock %s not recursed @ %s:%d\n",
+			    sx->lock_object.lo_name, file, line);
 		break;
-	case SX_UNLOCKED:
+	case SA_UNLOCKED:
 #ifdef WITNESS
-		witness_assert(&sx->sx_object, what, file, line);
+		witness_assert(&sx->lock_object, what, file, line);
 #else
 		/*
-		 * We are able to check only exclusive lock here,
-		 * we cannot assert that *this* thread owns slock.
+		 * If we hold an exclusive lock, fail.  We can't
+		 * reliably check to see if we hold a shared lock or
+		 * not.
 		 */
-		mtx_lock(sx->sx_lock);
-		if (sx->sx_xholder == curthread)
+		if (sx_xholder(sx) == curthread)
 			panic("Lock %s exclusively locked @ %s:%d\n",
-			    sx->sx_object.lo_name, file, line);
-		mtx_unlock(sx->sx_lock);
+			    sx->lock_object.lo_name, file, line);
 #endif
 		break;
 	default:
@@ -382,7 +967,7 @@
 #endif	/* INVARIANT_SUPPORT */
 
 #ifdef DDB
-void
+static void
 db_show_sx(struct lock_object *lock)
 {
 	struct thread *td;
@@ -391,15 +976,66 @@
 	sx = (struct sx *)lock;
 
 	db_printf(" state: ");
-	if (sx->sx_cnt < 0) {
-		td = sx->sx_xholder;
+	if (sx->sx_lock == SX_LOCK_UNLOCKED)
+		db_printf("UNLOCKED\n");
+	else if (sx->sx_lock == SX_LOCK_DESTROYED) {
+		db_printf("DESTROYED\n");
+		return;
+	} else if (sx->sx_lock & SX_LOCK_SHARED)
+		db_printf("SLOCK: %ju\n", (uintmax_t)SX_SHARERS(sx->sx_lock));
+	else {
+		td = sx_xholder(sx);
 		db_printf("XLOCK: %p (tid %d, pid %d, \"%s\")\n", td,
 		    td->td_tid, td->td_proc->p_pid, td->td_proc->p_comm);
-	} else if (sx->sx_cnt > 0)
-		db_printf("SLOCK: %d locks\n", sx->sx_cnt);
+		if (sx_recursed(sx))
+			db_printf(" recursed: %d\n", sx->sx_recurse);
+	}
+
+	db_printf(" waiters: ");
+	switch(sx->sx_lock &
+	    (SX_LOCK_SHARED_WAITERS | SX_LOCK_EXCLUSIVE_WAITERS)) {
+	case SX_LOCK_SHARED_WAITERS:
+		db_printf("shared\n");
+		break;
+	case SX_LOCK_EXCLUSIVE_WAITERS:
+		db_printf("exclusive\n");
+		break;
+	case SX_LOCK_SHARED_WAITERS | SX_LOCK_EXCLUSIVE_WAITERS:
+		db_printf("exclusive and shared\n");
+		break;
+	default:
+		db_printf("none\n");
+	}
+}
+
+/*
+ * Check to see if a thread that is blocked on a sleep queue is actually
+ * blocked on an sx lock.  If so, output some details and return true.
+ * If the lock has an exclusive owner, return that in *ownerp.
+ */
+int
+sx_chain(struct thread *td, struct thread **ownerp)
+{
+	struct sx *sx;
+
+	/*
+	 * Check to see if this thread is blocked on an sx lock.
+	 * First, we check the lock class.  If that is ok, then we
+	 * compare the lock name against the wait message.
+	 */
+	sx = td->td_wchan;
+	if (LOCK_CLASS(&sx->lock_object) != &lock_class_sx ||
+	    sx->lock_object.lo_name != td->td_wmesg)
+		return (0);
+
+	/* We think we have an sx lock, so output some details. */
+	db_printf("blocked on sx \"%s\" ", td->td_wmesg);
+	*ownerp = sx_xholder(sx);
+	if (sx->sx_lock & SX_LOCK_SHARED)
+		db_printf("SLOCK (count %ju)\n",
+		    (uintmax_t)SX_SHARERS(sx->sx_lock));
 	else
-		db_printf("UNLOCKED\n");
-	db_printf(" waiters: %d shared, %d exclusive\n", sx->sx_shrd_wcnt,
-	    sx->sx_excl_wcnt);
+		db_printf("XLOCK\n");
+	return (1);
 }
 #endif
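
For reference, a minimal usage sketch (not part of the patch) of how the
reworked sx(9) interface above is consumed.  The lock, the protected
variable and both functions are hypothetical; only sx_slock()/sx_sunlock(),
sx_xlock()/sx_xunlock() and the new SA_* assertion names come from the diff.

	static struct sx example_lock;	/* sx_init(&example_lock, "example") at attach time */
	static int example_data;

	static void
	example_read(void)
	{
		sx_slock(&example_lock);
		sx_assert(&example_lock, SA_SLOCKED);	/* new SA_* names replace the old SX_* ones */
		(void)example_data;
		sx_sunlock(&example_lock);	/* slow path lands in _sx_sunlock_hard() */
	}

	static void
	example_write(int v)
	{
		sx_xlock(&example_lock);
		sx_assert(&example_lock, SA_XLOCKED | SA_NOTRECURSED);
		example_data = v;
		sx_xunlock(&example_lock);
	}
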
Index: vfs_lookup.c
===================================================================
RCS file: /home/cvs/src/sys/kern/vfs_lookup.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/vfs_lookup.c -L sys/kern/vfs_lookup.c -u -r1.2 -r1.3
--- sys/kern/vfs_lookup.c
+++ sys/kern/vfs_lookup.c
@@ -35,7 +35,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/vfs_lookup.c,v 1.80.2.6.2.1 2006/04/30 03:58:12 kris Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/vfs_lookup.c,v 1.102 2007/09/21 10:16:56 pjd Exp $");
 
 #include "opt_ktrace.h"
 #include "opt_mac.h"
@@ -45,7 +45,6 @@
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
-#include <sys/mac.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/vnode.h>
@@ -58,6 +57,9 @@
 #include <sys/ktrace.h>
 #endif
 
+#include <security/audit/audit.h>
+#include <security/mac/mac_framework.h>
+
 #include <vm/uma.h>
 
 #define	NAMEI_DIAGNOSTIC 1
@@ -67,13 +69,22 @@
  * Allocation zone for namei
  */
 uma_zone_t namei_zone;
+/*
+ * Placeholder vnode for mp traversal
+ */
+static struct vnode *vp_crossmp;
 
 static void
 nameiinit(void *dummy __unused)
 {
+	int error;
+
 	namei_zone = uma_zcreate("NAMEI", MAXPATHLEN, NULL, NULL, NULL, NULL,
 	    UMA_ALIGN_PTR, 0);
-
+	error = getnewvnode("crossmp", NULL, &dead_vnodeops, &vp_crossmp);
+	if (error != 0)
+		panic("nameiinit: getnewvnode");
+	vp_crossmp->v_vnlock->lk_flags &= ~LK_NOSHARE;
 }
 SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_SECOND, nameiinit, NULL)
 
@@ -86,7 +97,7 @@
     "Enables/Disables shared locks for path name translation");
 
 /*
- * Convert a pathname into a pointer to a locked inode.
+ * Convert a pathname into a pointer to a locked vnode.
  *
  * The FOLLOW flag is set when symbolic links are to be followed
  * when they occur at the end of the name translation process.
@@ -106,12 +117,11 @@
  *	}
  */
 int
-namei(ndp)
-	register struct nameidata *ndp;
+namei(struct nameidata *ndp)
 {
-	register struct filedesc *fdp;	/* pointer to file descriptor state */
-	register char *cp;		/* pointer into pathname argument */
-	register struct vnode *dp;	/* the directory we are searching */
+	struct filedesc *fdp;	/* pointer to file descriptor state */
+	char *cp;		/* pointer into pathname argument */
+	struct vnode *dp;	/* the directory we are searching */
 	struct iovec aiov;		/* uio for reading symbolic links */
 	struct uio auio;
 	int error, linklen;
@@ -145,6 +155,12 @@
 		error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf,
 			    MAXPATHLEN, (size_t *)&ndp->ni_pathlen);
 
+	/* If we are auditing the kernel pathname, save the user pathname. */
+	if (cnp->cn_flags & AUDITVNODE1)
+		AUDIT_ARG(upath, td, cnp->cn_pnbuf, ARG_UPATH1);
+	if (cnp->cn_flags & AUDITVNODE2)
+		AUDIT_ARG(upath, td, cnp->cn_pnbuf, ARG_UPATH2);
+
 	/*
 	 * Don't allow empty pathnames.
 	 */
@@ -172,14 +188,14 @@
 	/*
 	 * Get starting point for the translation.
 	 */
-	FILEDESC_LOCK(fdp);
+	FILEDESC_SLOCK(fdp);
 	ndp->ni_rootdir = fdp->fd_rdir;
 	ndp->ni_topdir = fdp->fd_jdir;
 
 	dp = fdp->fd_cdir;
 	vfslocked = VFS_LOCK_GIANT(dp->v_mount);
 	VREF(dp);
-	FILEDESC_UNLOCK(fdp);
+	FILEDESC_SUNLOCK(fdp);
 	for (;;) {
 		/*
 		 * Check if root directory should replace current directory.
@@ -296,6 +312,17 @@
 	return (error);
 }
 
+static int
+compute_cn_lkflags(struct mount *mp, int lkflags)
+{
+	if (mp == NULL || 
+	    ((lkflags & LK_SHARED) && !(mp->mnt_kern_flag & MNTK_LOOKUP_SHARED))) {
+		lkflags &= ~LK_SHARED;
+		lkflags |= LK_EXCLUSIVE;
+	}
+	return lkflags;
+}
+
 /*
  * Search a pathname.
  * This is a very central and rather complicated routine.
@@ -335,11 +362,10 @@
  *	    if WANTPARENT set, return unlocked parent in ni_dvp
  */
 int
-lookup(ndp)
-	register struct nameidata *ndp;
+lookup(struct nameidata *ndp)
 {
-	register char *cp;		/* pointer into pathname argument */
-	register struct vnode *dp = 0;	/* the directory we are searching */
+	char *cp;		/* pointer into pathname argument */
+	struct vnode *dp = 0;	/* the directory we are searching */
 	struct vnode *tdp;		/* saved dp */
 	struct mount *mp;		/* mount table entry */
 	int docache;			/* == 0 do not cache last component */
@@ -353,7 +379,8 @@
 	int vfslocked;			/* VFS Giant state for child */
 	int dvfslocked;			/* VFS Giant state for parent */
 	int tvfslocked;
-
+	int lkflags_save;
+	
 	/*
 	 * Setup: break out flag bits into variables.
 	 */
@@ -381,7 +408,7 @@
 		cnp->cn_lkflags = LK_EXCLUSIVE;
 	dp = ndp->ni_startdir;
 	ndp->ni_startdir = NULLVP;
-	vn_lock(dp, cnp->cn_lkflags | LK_RETRY, td);
+	vn_lock(dp, compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY), td);
 
 dirloop:
 	/*
@@ -460,6 +487,12 @@
 			VREF(dp);
 		}
 		ndp->ni_vp = dp;
+
+		if (cnp->cn_flags & AUDITVNODE1)
+			AUDIT_ARG(vnode, dp, ARG_VNODE1);
+		else if (cnp->cn_flags & AUDITVNODE2)
+			AUDIT_ARG(vnode, dp, ARG_VNODE2);
+
 		if (!(cnp->cn_flags & (LOCKPARENT | LOCKLEAF)))
 			VOP_UNLOCK(dp, 0, td);
 		/* XXX This should probably move to the top of function. */
@@ -491,15 +524,16 @@
 		for (;;) {
 			if (dp == ndp->ni_rootdir || 
 			    dp == ndp->ni_topdir || 
-			    dp == rootvnode) {
+			    dp == rootvnode ||
+			    ((dp->v_vflag & VV_ROOT) != 0 &&
+			     (cnp->cn_flags & NOCROSSMOUNT) != 0)) {
 				ndp->ni_dvp = dp;
 				ndp->ni_vp = dp;
 				vfslocked = VFS_LOCK_GIANT(dp->v_mount);
 				VREF(dp);
 				goto nextname;
 			}
-			if ((dp->v_vflag & VV_ROOT) == 0 ||
-			    (cnp->cn_flags & NOCROSSMOUNT))
+			if ((dp->v_vflag & VV_ROOT) == 0)
 				break;
 			if (dp->v_iflag & VI_DOOMED) {	/* forced unmount */
 				error = EBADF;
@@ -512,7 +546,7 @@
 			VREF(dp);
 			vput(tdp);
 			VFS_UNLOCK_GIANT(tvfslocked);
-			vn_lock(dp, cnp->cn_lkflags | LK_RETRY, td);
+			vn_lock(dp, compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY), td);
 		}
 	}
 
@@ -535,7 +569,8 @@
 	 * If we have a shared lock we may need to upgrade the lock for the
 	 * last operation.
 	 */
-	if (VOP_ISLOCKED(dp, td) == LK_SHARED &&
+	if (dp != vp_crossmp &&
+	    VOP_ISLOCKED(dp, td) == LK_SHARED &&
 	    (cnp->cn_flags & ISLASTCN) && (cnp->cn_flags & LOCKPARENT))
 		vn_lock(dp, LK_UPGRADE|LK_RETRY, td);
 	/*
@@ -548,7 +583,10 @@
 #ifdef NAMEI_DIAGNOSTIC
 	vprint("lookup in", dp);
 #endif
+	lkflags_save = cnp->cn_lkflags;
+	cnp->cn_lkflags = compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags);
 	if ((error = VOP_LOOKUP(dp, &ndp->ni_vp, cnp)) != 0) {
+		cnp->cn_lkflags = lkflags_save;
 		KASSERT(ndp->ni_vp == NULL, ("leaf should be empty"));
 #ifdef NAMEI_DIAGNOSTIC
 		printf("not found\n");
@@ -563,7 +601,7 @@
 			VREF(dp);
 			vput(tdp);
 			VFS_UNLOCK_GIANT(tvfslocked);
-			vn_lock(dp, cnp->cn_lkflags | LK_RETRY, td);
+			vn_lock(dp, compute_cn_lkflags(dp->v_mount, cnp->cn_lkflags | LK_RETRY), td);
 			goto unionlookup;
 		}
 
@@ -593,14 +631,15 @@
 		/*
 		 * We return with ni_vp NULL to indicate that the entry
 		 * doesn't currently exist, leaving a pointer to the
-		 * (possibly locked) directory inode in ndp->ni_dvp.
+		 * (possibly locked) directory vnode in ndp->ni_dvp.
 		 */
 		if (cnp->cn_flags & SAVESTART) {
 			ndp->ni_startdir = ndp->ni_dvp;
 			VREF(ndp->ni_startdir);
 		}
 		goto success;
-	}
+	} else
+		cnp->cn_lkflags = lkflags_save;
 #ifdef NAMEI_DIAGNOSTIC
 	printf("found\n");
 #endif
@@ -630,10 +669,17 @@
 		VFS_UNLOCK_GIANT(vfslocked);
 		vfslocked = VFS_LOCK_GIANT(mp);
 		if (dp != ndp->ni_dvp)
-			VOP_UNLOCK(ndp->ni_dvp, 0, td);
-		error = VFS_ROOT(mp, cnp->cn_lkflags, &tdp, td);
+			vput(ndp->ni_dvp);
+		else
+			vrele(ndp->ni_dvp);
+		VFS_UNLOCK_GIANT(dvfslocked);
+		dvfslocked = 0;
+		vref(vp_crossmp);
+		ndp->ni_dvp = vp_crossmp;
+		error = VFS_ROOT(mp, compute_cn_lkflags(mp, cnp->cn_lkflags), &tdp, td);
 		vfs_unbusy(mp, td);
-		vn_lock(ndp->ni_dvp, cnp->cn_lkflags | LK_RETRY, td);
+		if (vn_lock(vp_crossmp, LK_SHARED | LK_NOWAIT, td))
+			panic("vp_crossmp exclusively locked or reclaimed");
 		if (error) {
 			dpunlocked = 1;
 			goto bad2;
@@ -718,9 +764,22 @@
 	} else if ((cnp->cn_flags & LOCKPARENT) == 0 && ndp->ni_dvp != dp)
 		VOP_UNLOCK(ndp->ni_dvp, 0, td);
 
+	if (cnp->cn_flags & AUDITVNODE1)
+		AUDIT_ARG(vnode, dp, ARG_VNODE1);
+	else if (cnp->cn_flags & AUDITVNODE2)
+		AUDIT_ARG(vnode, dp, ARG_VNODE2);
+
 	if ((cnp->cn_flags & LOCKLEAF) == 0)
 		VOP_UNLOCK(dp, 0, td);
 success:
+	/*
+	 * Because of lookup_shared we may have the vnode shared locked, but
+	 * the caller may want it to be exclusively locked.
+	 */
+	if ((cnp->cn_flags & (ISLASTCN | LOCKSHARED | LOCKLEAF)) ==
+	    (ISLASTCN | LOCKLEAF) && VOP_ISLOCKED(dp, td) != LK_EXCLUSIVE) {
+		vn_lock(dp, LK_UPGRADE | LK_RETRY, td);
+	}
 	if (vfslocked && dvfslocked)
 		VFS_UNLOCK_GIANT(dvfslocked);	/* Only need one */
 	if (vfslocked || dvfslocked)
@@ -744,12 +803,10 @@
 
 /*
  * relookup - lookup a path name component
- *    Used by lookup to re-aquire things.
+ *    Used by lookup to re-acquire things.
  */
 int
-relookup(dvp, vpp, cnp)
-	struct vnode *dvp, **vpp;
-	struct componentname *cnp;
+relookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp)
 {
 	struct thread *td = cnp->cn_thread;
 	struct vnode *dp = 0;		/* the directory we are searching */
@@ -840,10 +897,11 @@
 		/*
 		 * We return with ni_vp NULL to indicate that the entry
 		 * doesn't currently exist, leaving a pointer to the
-		 * (possibly locked) directory inode in ndp->ni_dvp.
+		 * (possibly locked) directory vnode in ndp->ni_dvp.
 		 */
 		return (0);
 	}
+
 	dp = *vpp;
 
 	/*
@@ -891,9 +949,7 @@
  * Free data allocated by namei(); see namei(9) for details.
  */
 void
-NDFREE(ndp, flags)
-     struct nameidata *ndp;
-     const u_int flags;
+NDFREE(struct nameidata *ndp, const u_int flags)
 {
 	int unlock_dvp;
 	int unlock_vp;
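
For reference, a minimal sketch (not part of the diff) of how a caller
passes the new AUDITVNODE1 flag through namei() so the audit hooks added
above fire.  The wrapper function is hypothetical; NDINIT(), namei(),
NDFREE() and the flags are the interfaces touched by this change.

	static int
	example_lookup(struct thread *td, char *path, struct vnode **vpp)
	{
		struct nameidata nd;
		int error;

		NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | MPSAFE | AUDITVNODE1,
		    UIO_SYSSPACE, path, td);
		error = namei(&nd);
		if (error != 0)
			return (error);
		NDFREE(&nd, NDF_ONLY_PNBUF);	/* free only the pathname buffer */
		*vpp = nd.ni_vp;		/* returned locked because of LOCKLEAF */
		return (0);
	}
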
--- /dev/null
+++ sys/kern/tty_pts.c
@@ -0,0 +1,917 @@
+/*
+ * Copyright (c) 2003 Networks Associates Technology, Inc.
+ * Copyright (c) 2006 Robert N. M. Watson
+ * Copyright (c) 2006 Olivier Houchard
+ * All rights reserved.
+ *
+ * This software was developed for the FreeBSD Project in part by Network
+ * Associates Laboratories, the Security Research Division of Network
+ * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"),
+ * as part of the DARPA CHATS research program.
+ *
+ * Copyright (c) 1982, 1986, 1989, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)tty_pty.c	8.4 (Berkeley) 2/20/95
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: src/sys/kern/tty_pts.c,v 1.16 2007/07/05 05:54:47 peter Exp $");
+
+/*
+ * Pseudo-teletype Driver
+ * (Actually two drivers, requiring two entries in 'cdevsw')
+ */
+#include "opt_compat.h"
+#include "opt_tty.h"
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/sx.h>
+#if defined(COMPAT_43TTY)
+#include <sys/ioctl_compat.h>
+#endif
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/queue.h>
+#include <sys/tty.h>
+#include <sys/fcntl.h>
+#include <sys/poll.h>
+#include <sys/kernel.h>
+#include <sys/vnode.h>
+#include <sys/signalvar.h>
+#include <sys/malloc.h>
+#include <sys/conf.h>
+#include <sys/sysctl.h>
+#include <sys/filio.h>
+
+static MALLOC_DEFINE(M_PTY, "ptys", "pty data structures");
+
+static void ptsstart(struct tty *tp);
+static void ptsstop(struct tty *tp, int rw);
+static void ptcwakeup(struct tty *tp, int flag);
+
+static d_open_t		ptsopen;
+static d_close_t	ptsclose;
+static d_read_t		ptsread;
+static d_write_t	ptswrite;
+static d_ioctl_t	ptsioctl;
+static d_ioctl_t	ptcioctl;
+static d_open_t		ptcopen;
+static d_close_t	ptcclose;
+static d_read_t		ptcread;
+static d_write_t	ptcwrite;
+static d_poll_t		ptcpoll;
+
+static struct cdevsw pts_cdevsw = {
+	.d_version = 	D_VERSION,
+	.d_open =	ptsopen,
+	.d_close =	ptsclose,
+	.d_read =	ptsread,
+	.d_write =	ptswrite,
+	.d_ioctl =	ptsioctl,
+	.d_poll =	ttypoll,
+	.d_name =	"pts",
+	.d_flags =	D_TTY | D_NEEDGIANT,
+	.d_kqfilter =	ttykqfilter,
+};
+
+static struct cdevsw ptc_cdevsw = {
+	.d_version = 	D_VERSION,
+	.d_open =	ptcopen,
+	.d_close =	ptcclose,
+	.d_read =	ptcread,
+	.d_write =	ptcwrite,
+	.d_ioctl =	ptcioctl,
+	.d_poll =	ptcpoll,
+	.d_name =	"ptc",
+	.d_flags =	D_TTY | D_NEEDGIANT,
+	.d_kqfilter =	ttykqfilter,
+};
+
+#define BUFSIZ 100		/* Chunk size iomoved to/from user */
+
+#define TSA_PTC_READ(tp)	((void *)&(tp)->t_outq.c_cf)
+#define TSA_PTC_WRITE(tp)	((void *)&(tp)->t_rawq.c_cl)
+#define TSA_PTS_READ(tp)	((void *)&(tp)->t_canq)
+
+#define NUM_TO_MINOR(c)		((c & 0xff) | ((c & ~0xff) << 16))
+/*-
+ * Once a tty is allocated, it cannot (currently) be freed.  As such,
+ * we keep a global list of ptys that have been used so we can recycle
+ * them.  Another list is provided for released pts, which are 
+ * not currently allocated, permitting reuse.  pt_flags holds state
+ * associated with a particular session, so isn't overloaded for this.
+ * When a pty descriptor is unused, its number is set to -1 giving
+ * more consistent and traditional allocation orders to pty numbers.
+ *
+ * Locking: (p) indicates that the field is locked by the global pt_mtx.
+ * (c) indicates the value is constant after allocation.   Other fields
+ * await tty locking generally, and are protected by Giant.
+ */
+struct	pt_desc {
+	int			 pt_num;	/* (c) pty number */
+	LIST_ENTRY(pt_desc)	 pt_list;	/* (p) global pty list */
+
+	int			 pt_flags;
+	struct selinfo		 pt_selr, pt_selw;
+	u_char			 pt_send;
+	u_char			 pt_ucntl;
+	struct tty		 *pt_tty;
+	struct cdev		 *pt_devs, *pt_devc;
+	int			 pt_pts_open, pt_ptc_open;
+	struct prison		*pt_prison;
+};
+
+static struct mtx		pt_mtx;
+static LIST_HEAD(,pt_desc)	pt_list;
+static LIST_HEAD(,pt_desc)	pt_free_list;
+
+#define	PF_PKT		0x008		/* packet mode */
+#define	PF_STOPPED	0x010		/* user told stopped */
+#define	PF_NOSTOP	0x040
+#define PF_UCNTL	0x080		/* user control mode */
+
+static unsigned int next_avail_nb;
+
+static int use_pts = 0;
+
+static unsigned int max_pts = 1000;
+
+static unsigned int nb_allocated;
+
+TUNABLE_INT("kern.pts.enable", &use_pts);
+
+SYSCTL_NODE(_kern, OID_AUTO, pts, CTLFLAG_RD, 0, "pts");
+
+SYSCTL_INT(_kern_pts, OID_AUTO, enable, CTLFLAG_RW, &use_pts, 0,
+    "enable pts");
+
+SYSCTL_INT(_kern_pts, OID_AUTO, max, CTLFLAG_RW, &max_pts, 0, "max pts");
+
+/*
+ * If there's a free pty descriptor on the free list, retrieve it.
+ * Otherwise, allocate a new one, initialize it, and hook it up.  If no
+ * pty number is available, return NULL.
+ */
+static struct pt_desc *
+pty_new(void)
+{
+	struct pt_desc *pt;
+	int nb;
+
+	mtx_lock(&pt_mtx);
+	if (nb_allocated >= max_pts || nb_allocated == 0xffffff) {
+		mtx_unlock(&pt_mtx);
+		return (NULL);
+	}
+	nb_allocated++;
+	pt = LIST_FIRST(&pt_free_list);
+	if (pt) {
+		LIST_REMOVE(pt, pt_list);
+		LIST_INSERT_HEAD(&pt_list, pt, pt_list);
+		mtx_unlock(&pt_mtx);
+	} else {
+		nb = next_avail_nb++;
+		mtx_unlock(&pt_mtx);
+		pt = malloc(sizeof(*pt), M_PTY, M_WAITOK | M_ZERO);
+		mtx_lock(&pt_mtx);
+		pt->pt_num = nb;
+		LIST_INSERT_HEAD(&pt_list, pt, pt_list);
+		mtx_unlock(&pt_mtx);
+		pt->pt_tty = ttyalloc();
+	}
+	return (pt);
+}
+
+/*
+ * Release a pty descriptor back to the pool for reuse.  The pty number
+ * remains allocated.
+ */
+static void
+pty_release(void *v)
+{
+	struct pt_desc *pt = (struct pt_desc *)v;
+
+	mtx_lock(&pt_mtx);
+	KASSERT(pt->pt_ptc_open == 0 && pt->pt_pts_open == 0,
+	    ("pty_release: pts/%d freed while open\n", pt->pt_num));
+	KASSERT(pt->pt_devs == NULL && pt->pt_devc == NULL,
+	    ("pty_release: pts/%d freed with non-null struct cdev\n", pt->pt_num));
+	nb_allocated--;
+	LIST_REMOVE(pt, pt_list);
+	LIST_INSERT_HEAD(&pt_free_list, pt, pt_list);
+	mtx_unlock(&pt_mtx);
+}
+
+/*
+ * Given a pty descriptor, if both endpoints are closed, release all
+ * resources and destroy the device nodes to flush file system level
+ * state for the tty (owner, avoid races, etc).
+ */
+static void
+pty_maybecleanup(struct pt_desc *pt)
+{
+	struct cdev *pt_devs, *pt_devc;
+
+	if (pt->pt_ptc_open || pt->pt_pts_open)
+		return;
+
+	if (pt->pt_tty->t_refcnt > 1)
+		return;
+
+	if (bootverbose)
+		printf("destroying pty %d\n", pt->pt_num);
+
+	pt_devs = pt->pt_devs;
+	pt_devc = pt->pt_devc;
+	pt->pt_devs = pt->pt_devc = NULL;
+	pt->pt_tty->t_dev = NULL;
+	pt_devc->si_drv1 = NULL;
+	ttyrel(pt->pt_tty);
+	pt->pt_tty = NULL;
+	destroy_dev_sched(pt_devs);
+	destroy_dev_sched_cb(pt_devc, pty_release, pt);
+}
+
+/*ARGSUSED*/
+static int
+ptsopen(struct cdev *dev, int flag, int devtype, struct thread *td)
+{
+	struct tty *tp;
+	int error;
+	struct pt_desc *pt;
+
+	pt = dev->si_drv1;
+	tp = dev->si_tty;
+	if ((tp->t_state & TS_ISOPEN) == 0)
+		ttyinitmode(tp, 1, 0);
+	else if (tp->t_state & TS_XCLUDE && priv_check(td,
+	    PRIV_TTY_EXCLUSIVE)) {
+		return (EBUSY);
+	} else if (pt->pt_prison != td->td_ucred->cr_prison &&
+	    priv_check(td, PRIV_TTY_PRISON)) {
+		return (EBUSY);
+	}
+	if (tp->t_oproc)			/* Ctrlr still around. */
+		ttyld_modem(tp, 1);
+	while ((tp->t_state & TS_CARR_ON) == 0) {
+		if (flag & FNONBLOCK)
+			break;
+		error = ttysleep(tp, TSA_CARR_ON(tp), TTIPRI | PCATCH,
+				 "ptsopn", 0);
+		if (error)
+			return (error);
+	}
+	error = ttyld_open(tp, dev);
+	if (error == 0) {
+		ptcwakeup(tp, FREAD|FWRITE);
+		pt->pt_pts_open = 1;
+	}
+	return (error);
+}
+
+static int
+ptsclose(struct cdev *dev, int flag, int mode, struct thread *td)
+{
+	struct pt_desc *pt = dev->si_drv1;
+	struct tty *tp;
+	int err;
+
+	tp = dev->si_tty;
+	err = ttyld_close(tp, flag);
+	ptsstop(tp, FREAD|FWRITE);
+	(void) tty_close(tp);
+	pt->pt_pts_open = 0;
+	pty_maybecleanup(pt);
+	return (err);
+}
+
+static int
+ptsread(struct cdev *dev, struct uio *uio, int flag)
+{
+	struct tty *tp = dev->si_tty;
+	int error = 0;
+
+	if (tp->t_oproc)
+		error = ttyld_read(tp, uio, flag);
+	ptcwakeup(tp, FWRITE);
+	return (error);
+}
+
+/*
+ * Write to pseudo-tty.
+ * Wakeups of controlling tty will happen
+ * indirectly, when tty driver calls ptsstart.
+ */
+static int
+ptswrite(struct cdev *dev, struct uio *uio, int flag)
+{
+	struct tty *tp;
+
+	tp = dev->si_tty;
+	if (tp->t_oproc == 0)
+		return (EIO);
+	return (ttyld_write(tp, uio, flag));
+}
+
+/*
+ * Start output on pseudo-tty.
+ * Wake up process selecting or sleeping for input from controlling tty.
+ */
+static void
+ptsstart(struct tty *tp)
+{
+	struct pt_desc *pt = tp->t_dev->si_drv1;
+
+	if (tp->t_state & TS_TTSTOP)
+		return;
+	if (pt->pt_flags & PF_STOPPED) {
+		pt->pt_flags &= ~PF_STOPPED;
+		pt->pt_send = TIOCPKT_START;
+	}
+	ptcwakeup(tp, FREAD);
+}
+
+static void
+ptcwakeup(struct tty *tp, int flag)
+{
+	struct pt_desc *pt = tp->t_dev->si_drv1;
+
+	if (flag & FREAD) {
+		selwakeup(&pt->pt_selr);
+		wakeup(TSA_PTC_READ(tp));
+	}
+	if (flag & FWRITE) {
+		selwakeup(&pt->pt_selw);
+		wakeup(TSA_PTC_WRITE(tp));
+	}
+}
+
+/*
+ * ptcopen implements exclusive access to the master/control device
+ * as well as creating the slave device based on the credential of the
+ * process opening the master.  By creating the slave here, we avoid
+ * a race to access the master in terms of having a process with access
+ * to an incorrectly owned slave, but it does create the possibility
+ * that a racing process can cause a ptmx user to get EIO if it gets
+ * there first.  Consumers of ptmx must look for EIO and retry if it
+ * happens.  VFS locking may actually prevent this from occurring due
+ * to the lookup into devfs holding the vnode lock through open, but
+ * it's better to be careful.
+ */
+static int
+ptcopen(struct cdev *dev, int flag, int devtype, struct thread *td)
+{
+	struct pt_desc *pt;
+	struct tty *tp;
+	struct cdev *devs;
+
+	pt = dev->si_drv1;
+	if (pt == NULL)
+		return (EIO);
+	/*
+	 * In case we have destroyed the struct tty at the last connect time,
+	 * we need to recreate it.
+	 */
+	if (pt->pt_tty == NULL) {
+		pt->pt_tty = ttyalloc();
+		dev->si_tty = pt->pt_tty;
+	}
+	tp = dev->si_tty;
+	if (tp->t_oproc)
+		return (EIO);
+
+	/*
+	 * XXX: Might want to make the ownership/permissions here more
+	 * configurable.
+	 */
+	if (pt->pt_devs)
+		devs = pt->pt_devs;
+	else
+		pt->pt_devs = devs = make_dev_cred(&pts_cdevsw, 
+		    NUM_TO_MINOR(pt->pt_num), 
+		    td->td_ucred, UID_ROOT, GID_WHEEL, 0666, "pts/%d",
+		    pt->pt_num);
+	devs->si_drv1 = pt;
+	devs->si_tty = pt->pt_tty;
+	pt->pt_tty->t_dev = devs;
+
+	tp->t_timeout = -1;
+	tp->t_oproc = ptsstart;
+	tp->t_stop = ptsstop;
+	ttyld_modem(tp, 1);
+	tp->t_lflag &= ~EXTPROC;
+	pt = dev->si_drv1;
+	pt->pt_prison = td->td_ucred->cr_prison;
+	pt->pt_flags = 0;
+	pt->pt_send = 0;
+	pt->pt_ucntl = 0;
+	pt->pt_ptc_open = 1;
+	return (0);
+}
+
+static int
+ptcclose(struct cdev *dev, int flags, int fmt, struct thread *td)
+{
+	struct pt_desc *pt = dev->si_drv1;
+	struct tty *tp;
+
+	tp = dev->si_tty;
+	ttyld_modem(tp, 0);
+
+	/*
+	 * XXX MDMBUF makes no sense for ptys but would inhibit the above
+	 * l_modem().  CLOCAL makes sense but isn't supported.   Special
+	 * l_modem()s that ignore carrier drop make no sense for ptys but
+	 * may be in use because other parts of the line discipline make
+	 * sense for ptys.  Recover by doing everything that a normal
+	 * ttymodem() would have done except for sending a SIGHUP.
+	 */
+	if (tp->t_state & TS_ISOPEN) {
+		tp->t_state &= ~(TS_CARR_ON | TS_CONNECTED);
+		tp->t_state |= TS_ZOMBIE;
+		ttyflush(tp, FREAD | FWRITE);
+	}
+
+	tp->t_oproc = 0;		/* mark closed */
+	pt->pt_ptc_open = 0;
+	pty_maybecleanup(pt);
+	return (0);
+}
+
+static int
+ptcread(struct cdev *dev, struct uio *uio, int flag)
+{
+	struct tty *tp = dev->si_tty;
+	struct pt_desc *pt = dev->si_drv1;
+	char buf[BUFSIZ];
+	int error = 0, cc;
+
+	/*
+	 * We want to block until the slave
+	 * is open, and there's something to read;
+	 * but if we lost the slave or we're NBIO,
+	 * then return the appropriate error instead.
+	 */
+	for (;;) {
+		if (tp->t_state&TS_ISOPEN) {
+			if (pt->pt_flags&PF_PKT && pt->pt_send) {
+				error = ureadc((int)pt->pt_send, uio);
+				if (error)
+					return (error);
+				if (pt->pt_send & TIOCPKT_IOCTL) {
+					cc = min(uio->uio_resid,
+						sizeof(tp->t_termios));
+					uiomove(&tp->t_termios, cc, uio);
+				}
+				pt->pt_send = 0;
+				return (0);
+			}
+			if (pt->pt_flags&PF_UCNTL && pt->pt_ucntl) {
+				error = ureadc((int)pt->pt_ucntl, uio);
+				if (error)
+					return (error);
+				pt->pt_ucntl = 0;
+				return (0);
+			}
+			if (tp->t_outq.c_cc && (tp->t_state&TS_TTSTOP) == 0)
+				break;
+		}
+		if ((tp->t_state & TS_CONNECTED) == 0)
+			return (0);	/* EOF */
+		if (flag & O_NONBLOCK)
+			return (EWOULDBLOCK);
+		error = tsleep(TSA_PTC_READ(tp), TTIPRI | PCATCH, "ptcin", 0);
+		if (error)
+			return (error);
+	}
+	if (pt->pt_flags & (PF_PKT|PF_UCNTL))
+		error = ureadc(0, uio);
+	while (uio->uio_resid > 0 && error == 0) {
+		cc = q_to_b(&tp->t_outq, buf, min(uio->uio_resid, BUFSIZ));
+		if (cc <= 0)
+			break;
+		error = uiomove(buf, cc, uio);
+	}
+	ttwwakeup(tp);
+	return (error);
+}
+
+static void
+ptsstop(struct tty *tp, int flush)
+{
+	struct pt_desc *pt = tp->t_dev->si_drv1;
+	int flag;
+
+	/* note: FLUSHREAD and FLUSHWRITE already ok */
+	if (flush == 0) {
+		flush = TIOCPKT_STOP;
+		pt->pt_flags |= PF_STOPPED;
+	} else
+		pt->pt_flags &= ~PF_STOPPED;
+	pt->pt_send |= flush;
+	/* change of perspective */
+	flag = 0;
+	if (flush & FREAD)
+		flag |= FWRITE;
+	if (flush & FWRITE)
+		flag |= FREAD;
+	ptcwakeup(tp, flag);
+}
+
+static int
+ptcpoll(struct cdev *dev, int events, struct thread *td)
+{
+	struct tty *tp = dev->si_tty;
+	struct pt_desc *pt = dev->si_drv1;
+	int revents = 0;
+	int s;
+
+	if ((tp->t_state & TS_CONNECTED) == 0)
+		return (events & 
+		   (POLLHUP | POLLIN | POLLRDNORM | POLLOUT | POLLWRNORM));
+
+	/*
+	 * Need to block timeouts (ttrstart).
+	 */
+	s = spltty();
+
+	if (events & (POLLIN | POLLRDNORM))
+		if ((tp->t_state & TS_ISOPEN) &&
+		    ((tp->t_outq.c_cc && (tp->t_state & TS_TTSTOP) == 0) ||
+		     ((pt->pt_flags & PF_PKT) && pt->pt_send) ||
+		     ((pt->pt_flags & PF_UCNTL) && pt->pt_ucntl)))
+			revents |= events & (POLLIN | POLLRDNORM);
+
+	if (events & (POLLOUT | POLLWRNORM))
+		if (tp->t_state & TS_ISOPEN &&
+		     (((tp->t_rawq.c_cc + tp->t_canq.c_cc < TTYHOG - 2) ||
+		      (tp->t_canq.c_cc == 0 && (tp->t_lflag & ICANON)))))
+			revents |= events & (POLLOUT | POLLWRNORM);
+
+	if (events & POLLHUP)
+		if ((tp->t_state & TS_CARR_ON) == 0)
+			revents |= POLLHUP;
+
+	if (revents == 0) {
+		if (events & (POLLIN | POLLRDNORM))
+			selrecord(td, &pt->pt_selr);
+
+		if (events & (POLLOUT | POLLWRNORM))
+			selrecord(td, &pt->pt_selw);
+	}
+	splx(s);
+
+	return (revents);
+}
+
+static int
+ptcwrite(struct cdev *dev, struct uio *uio, int flag)
+{
+	struct tty *tp = dev->si_tty;
+	u_char *cp = 0;
+	int cc = 0;
+	u_char locbuf[BUFSIZ];
+	int cnt = 0;
+	int error = 0;
+
+again:
+	if ((tp->t_state&TS_ISOPEN) == 0)
+		goto block;
+	while (uio->uio_resid > 0 || cc > 0) {
+		if (cc == 0) {
+			cc = min(uio->uio_resid, BUFSIZ);
+			cp = locbuf;
+			error = uiomove(cp, cc, uio);
+			if (error)
+				return (error);
+			/* check again for safety */
+			if ((tp->t_state & TS_ISOPEN) == 0) {
+				/* adjust for data copied in but not written */
+				uio->uio_resid += cc;
+				return (EIO);
+			}
+		}
+		while (cc > 0) {
+			if ((tp->t_rawq.c_cc + tp->t_canq.c_cc) >= TTYHOG - 2 &&
+			   (tp->t_canq.c_cc > 0 || !(tp->t_lflag&ICANON))) {
+				wakeup(TSA_HUP_OR_INPUT(tp));
+				goto block;
+			}
+			ttyld_rint(tp, *cp++);
+			cnt++;
+			cc--;
+		}
+		cc = 0;
+	}
+	return (0);
+block:
+	/*
+	 * Come here to wait for slave to open, for space
+	 * in outq, or space in rawq, or an empty canq.
+	 */
+	if ((tp->t_state & TS_CONNECTED) == 0) {
+		/* adjust for data copied in but not written */
+		uio->uio_resid += cc;
+		return (EIO);
+	}
+	if (flag & IO_NDELAY) {
+		/* adjust for data copied in but not written */
+		uio->uio_resid += cc;
+		if (cnt == 0)
+			return (EWOULDBLOCK);
+		return (0);
+	}
+	error = tsleep(TSA_PTC_WRITE(tp), TTOPRI | PCATCH, "ptcout", 0);
+	if (error) {
+		/* adjust for data copied in but not written */
+		uio->uio_resid += cc;
+		return (error);
+	}
+	goto again;
+}
+
+static int
+ptcioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag, struct thread *td)
+{
+	struct tty *tp = dev->si_tty;
+	struct pt_desc *pt = dev->si_drv1;
+#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \
+    defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
+	int ival;
+#endif
+
+	switch (cmd) {
+		
+	case TIOCGPGRP:
+		/*
+		 * We avoid calling ttioctl on the controller since,
+		 * in that case, tp must be the controlling terminal.
+		 */
+		*(int *)data = tp->t_pgrp ? tp->t_pgrp->pg_id : 0;
+		return (0);
+		
+	case TIOCPKT:
+		if (*(int *)data) {
+			if (pt->pt_flags & PF_UCNTL)
+				return (EINVAL);
+			pt->pt_flags |= PF_PKT;
+		} else
+			pt->pt_flags &= ~PF_PKT;
+		return (0);
+		
+	case TIOCUCNTL:
+		if (*(int *)data) {
+			if (pt->pt_flags & PF_PKT)
+				return (EINVAL);
+			pt->pt_flags |= PF_UCNTL;
+		} else
+			pt->pt_flags &= ~PF_UCNTL;
+		return (0);
+	case TIOCGPTN:
+		*(unsigned int *)data = pt->pt_num;
+		return (0);
+	}
+	
+	/*
+	 * The rest of the ioctls shouldn't be called until
+	 * the slave is open.
+	 */
+	if ((tp->t_state & TS_ISOPEN) == 0) {
+		if (cmd == TIOCGETA) {
+			/* 
+			 * TIOCGETA is used by isatty() to make sure it's
+			 * a tty. Linux openpty() calls isatty() very early,
+			 * before the slave is opened, so don't actually
+			 * fill the struct termios, but just let isatty()
+			 * know it's a tty.
+			 */
+			return (0);
+		}
+		if (cmd != FIONBIO && cmd != FIOASYNC)
+			return (EAGAIN);
+	}
+	
+	switch (cmd) {
+#ifdef COMPAT_43TTY
+	case TIOCSETP:
+	case TIOCSETN:
+#endif
+	case TIOCSETD:
+	case TIOCSETA:
+	case TIOCSETAW:
+	case TIOCSETAF:
+		/*
+		 * IF CONTROLLER STTY THEN MUST FLUSH TO PREVENT A HANG.
+		 * ttywflush(tp) will hang if there are characters in
+		 * the outq.
+		 */
+		ndflush(&tp->t_outq, tp->t_outq.c_cc);
+		break;
+		
+#if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \
+    defined(COMPAT_FREEBSD4) || defined(COMPAT_43)
+	case _IO('t', 95):
+		ival = IOCPARM_IVAL(data);
+		data = (caddr_t)&ival;
+		/* FALLTHROUGH */
+#endif
+	case TIOCSIG:
+		if (*(unsigned int *)data >= NSIG ||
+		    *(unsigned int *)data == 0)
+			return(EINVAL);
+		if ((tp->t_lflag&NOFLSH) == 0)
+			ttyflush(tp, FREAD|FWRITE);
+		if (tp->t_pgrp != NULL) {
+			PGRP_LOCK(tp->t_pgrp);
+			pgsignal(tp->t_pgrp, *(unsigned int *)data, 1);
+			PGRP_UNLOCK(tp->t_pgrp);
+		}
+		if ((*(unsigned int *)data == SIGINFO) &&
+		    ((tp->t_lflag&NOKERNINFO) == 0))
+			ttyinfo(tp);
+		return(0);
+	}
+	return (ptsioctl(dev, cmd, data, flag, td));
+}
+/*ARGSUSED*/
+static int
+ptsioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag, struct thread *td)
+{
+	struct tty *tp = dev->si_tty;
+	struct pt_desc *pt = dev->si_drv1;
+	u_char *cc = tp->t_cc;
+	int stop, error;
+
+	if (cmd == TIOCEXT) {
+		/*
+		 * When the EXTPROC bit is being toggled, we need
+		 * to send an TIOCPKT_IOCTL if the packet driver
+		 * is turned on.
+		 */
+		if (*(int *)data) {
+			if (pt->pt_flags & PF_PKT) {
+				pt->pt_send |= TIOCPKT_IOCTL;
+				ptcwakeup(tp, FREAD);
+			}
+			tp->t_lflag |= EXTPROC;
+		} else {
+			if ((tp->t_lflag & EXTPROC) &&
+			    (pt->pt_flags & PF_PKT)) {
+				pt->pt_send |= TIOCPKT_IOCTL;
+				ptcwakeup(tp, FREAD);
+			}
+			tp->t_lflag &= ~EXTPROC;
+		}
+		return(0);
+	}
+	error = ttioctl(tp, cmd, data, flag);
+	if (error == ENOTTY) {
+		if (pt->pt_flags & PF_UCNTL &&
+		    (cmd & ~0xff) == UIOCCMD(0)) {
+			if (cmd & 0xff) {
+				pt->pt_ucntl = (u_char)cmd;
+				ptcwakeup(tp, FREAD);
+			}
+			return (0);
+		}
+		error = ENOTTY;
+	}
+	/*
+	 * If external processing and packet mode send ioctl packet.
+	 */
+	if ((tp->t_lflag&EXTPROC) && (pt->pt_flags & PF_PKT)) {
+		switch(cmd) {
+		case TIOCSETA:
+		case TIOCSETAW:
+		case TIOCSETAF:
+#ifdef COMPAT_43TTY
+		case TIOCSETP:
+		case TIOCSETN:
+		case TIOCSETC:
+		case TIOCSLTC:
+		case TIOCLBIS:
+		case TIOCLBIC:
+		case TIOCLSET:
+#endif
+			pt->pt_send |= TIOCPKT_IOCTL;
+			ptcwakeup(tp, FREAD);
+			break;
+		default:
+			break;
+		}
+	}
+	stop = (tp->t_iflag & IXON) && CCEQ(cc[VSTOP], CTRL('s'))
+		&& CCEQ(cc[VSTART], CTRL('q'));
+	if (pt->pt_flags & PF_NOSTOP) {
+		if (stop) {
+			pt->pt_send &= ~TIOCPKT_NOSTOP;
+			pt->pt_send |= TIOCPKT_DOSTOP;
+			pt->pt_flags &= ~PF_NOSTOP;
+			ptcwakeup(tp, FREAD);
+		}
+	} else {
+		if (!stop) {
+			pt->pt_send &= ~TIOCPKT_DOSTOP;
+			pt->pt_send |= TIOCPKT_NOSTOP;
+			pt->pt_flags |= PF_NOSTOP;
+			ptcwakeup(tp, FREAD);
+		}
+	}
+	return (error);
+}
+
+/*
+ * Match lookups on /dev/ptmx, find the next free pty (if any), set up
+ * the pty descriptor, register it, and return a reference to the master.
+ *
+ * pts == /dev/pts/xxx (oldstyle: ttyp...)
+ * ptc == /dev/pty/xxx (oldstyle: ptyp...)
+ */
+static void
+pty_clone(void *arg, struct ucred *cred, char *name, int namelen,
+    struct cdev **dev)
+{
+	struct pt_desc *pt;
+	struct cdev *devc;
+
+	if (!use_pts)
+		return;
+
+	if (*dev != NULL)
+		return;
+
+	if (strcmp(name, "ptmx") != 0)
+		return;
+
+	mtx_lock(&Giant);
+	pt = pty_new();
+	if (pt == NULL) {
+		mtx_unlock(&Giant);
+		return;
+	}
+
+	/*
+	 * XXX: Lack of locking here considered worrying.  We expose the
+	 * pts/pty device nodes before they are fully initialized, although
+	 * Giant likely protects us (unless make_dev blocks...?).
+	 *
+	 * XXX: If a process performs a lookup on /dev/ptmx but never an
+	 * open, we won't GC the device node.  We should have a callout
+	 * sometime later that GC's device instances that were never
+	 * opened, or some way to tell devfs that "this had better be for
+	 * an open() or we won't create a device".
+	 */
+	pt->pt_devc = devc = make_dev_credf(MAKEDEV_REF, &ptc_cdevsw, 
+	    NUM_TO_MINOR(pt->pt_num), cred, UID_ROOT, GID_WHEEL, 0666,
+	    "pty/%d", pt->pt_num);
+
+	devc->si_drv1 = pt;
+	devc->si_tty = pt->pt_tty;
+	*dev = devc;
+	mtx_unlock(&Giant);
+
+	if (bootverbose)
+		printf("pty_clone: allocated pty %d to uid %d\n", pt->pt_num,
+	    cred->cr_ruid);
+
+	return;
+}
+
+static void
+pty_drvinit(void *unused)
+{
+
+	mtx_init(&pt_mtx, "pt_mtx", NULL, MTX_DEF);
+	LIST_INIT(&pt_list);
+	LIST_INIT(&pt_free_list);
+	EVENTHANDLER_REGISTER(dev_clone, pty_clone, 0, 1000);
+}
+
+SYSINIT(ptydev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE,pty_drvinit,NULL)
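
For reference, a minimal userland sketch (not part of the commit) of how
the cloned master added above is meant to be used once kern.pts.enable is
set.  Error handling is trimmed, the helper function is hypothetical, and
the retry-on-EIO caveat comes from the ptcopen() comment above.

	#include <sys/ioctl.h>
	#include <fcntl.h>
	#include <stdio.h>

	int
	open_pty_pair(int *masterp, char *slavepath, size_t len)
	{
		unsigned int n;
		int master;

		master = open("/dev/ptmx", O_RDWR);	/* may fail with EIO under a race; retry */
		if (master == -1)
			return (-1);
		if (ioctl(master, TIOCGPTN, &n) == -1)	/* pty number assigned by the driver */
			return (-1);
		snprintf(slavepath, len, "/dev/pts/%u", n);	/* slave node created by ptcopen() */
		*masterp = master;
		return (0);
	}
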
Index: kern_exec.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_exec.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/kern_exec.c -L sys/kern/kern_exec.c -u -r1.2 -r1.3
--- sys/kern/kern_exec.c
+++ sys/kern/kern_exec.c
@@ -25,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_exec.c,v 1.275.2.4 2006/03/13 03:05:42 jeff Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_exec.c,v 1.308.2.1.2.1 2008/01/19 18:15:05 kib Exp $");
 
 #include "opt_hwpmc_hooks.h"
 #include "opt_ktrace.h"
@@ -39,7 +39,6 @@
 #include <sys/sysproto.h>
 #include <sys/signalvar.h>
 #include <sys/kernel.h>
-#include <sys/mac.h>
 #include <sys/mount.h>
 #include <sys/filedesc.h>
 #include <sys/fcntl.h>
@@ -49,6 +48,7 @@
 #include <sys/imgact_elf.h>
 #include <sys/wait.h>
 #include <sys/malloc.h>
+#include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/pioctl.h>
 #include <sys/namei.h>
@@ -79,6 +79,9 @@
 
 #include <machine/reg.h>
 
+#include <security/audit/audit.h>
+#include <security/mac/mac_framework.h>
+
 MALLOC_DEFINE(M_PARGS, "proc-args", "Process arguments");
 
 static int sysctl_kern_ps_strings(SYSCTL_HANDLER_ARGS);
@@ -86,6 +89,7 @@
 static int sysctl_kern_stackprot(SYSCTL_HANDLER_ARGS);
 static int do_execve(struct thread *td, struct image_args *args,
     struct mac *mac_p);
+static void exec_free_args(struct image_args *);
 
 /* XXX This should be vm_size_t. */
 SYSCTL_PROC(_kern, KERN_PS_STRINGS, ps_strings, CTLTYPE_ULONG|CTLFLAG_RD,
@@ -164,9 +168,6 @@
 };
 #endif
 
-/*
- * MPSAFE
- */
 int
 execve(td, uap)
 	struct thread *td;
@@ -181,12 +182,8 @@
 
 	error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE,
 	    uap->argv, uap->envv);
-
 	if (error == 0)
 		error = kern_execve(td, &args, NULL);
-
-	exec_free_args(&args);
-
 	return (error);
 }
 
@@ -199,9 +196,6 @@
 };
 #endif
 
-/*
- * MPSAFE
- */
 int
 __mac_execve(td, uap)
 	struct thread *td;
@@ -218,12 +212,8 @@
 
 	error = exec_copyin_args(&args, uap->fname, UIO_USERSPACE,
 	    uap->argv, uap->envv);
-
 	if (error == 0)
 		error = kern_execve(td, &args, uap->mac_p);
-
-	exec_free_args(&args);
-
 	return (error);
 #else
 	return (ENOSYS);
@@ -231,11 +221,11 @@
 }
 
 /*
- * XXX: kern_execve has the astonishing property of not always
- * returning to the caller.  If sufficiently bad things happen during
- * the call to do_execve(), it can end up calling exit1(); as a result,
- * callers must avoid doing anything which they might need to undo
- * (e.g., allocating memory).
+ * XXX: kern_execve has the astonishing property of not always returning to
+ * the caller.  If sufficiently bad things happen during the call to
+ * do_execve(), it can end up calling exit1(); as a result, callers must
+ * avoid doing anything which they might need to undo (e.g., allocating
+ * memory).
  */
 int
 kern_execve(td, args, mac_p)
@@ -246,10 +236,15 @@
 	struct proc *p = td->td_proc;
 	int error;
 
+	AUDIT_ARG(argv, args->begin_argv, args->argc,
+	    args->begin_envv - args->begin_argv);
+	AUDIT_ARG(envv, args->begin_envv, args->envc,
+	    args->endp - args->begin_envv);
 	if (p->p_flag & P_HADTHREADS) {
 		PROC_LOCK(p);
 		if (thread_single(SINGLE_BOUNDARY)) {
 			PROC_UNLOCK(p);
+	       		exec_free_args(args);
 			return (ERESTART);	/* Try again later. */
 		}
 		PROC_UNLOCK(p);
@@ -276,8 +271,6 @@
 /*
  * In-kernel implementation of execve().  All arguments are assumed to be
  * userspace pointers from the passed thread.
- *
- * MPSAFE
  */
 static int
 do_execve(td, args, mac_p)
@@ -357,10 +350,13 @@
 	/*
 	 * Translate the file name. namei() returns a vnode pointer
 	 *	in ni_vp among other things.
+	 *
+	 * XXXAUDIT: It would be desirable to also audit the name of the
+	 * interpreter if this is an interpreted binary.
 	 */
 	ndp = &nd;
-	NDINIT(ndp, LOOKUP, ISOPEN | LOCKLEAF | FOLLOW | SAVENAME | MPSAFE,
-	    UIO_SYSSPACE, args->fname, td);
+	NDINIT(ndp, LOOKUP, ISOPEN | LOCKLEAF | FOLLOW | SAVENAME | MPSAFE |
+	    AUDITVNODE1, UIO_SYSSPACE, args->fname, td);
 
 interpret:
 	error = namei(ndp);
@@ -395,6 +391,7 @@
 	if (error)
 		goto exec_fail_dealloc;
 
+	imgp->proc->p_osrel = 0;
 	/*
 	 *	If the current process has a special image activator it
 	 *	wants to try first, call it.   For example, emulating shell
@@ -493,7 +490,9 @@
 	}
 
 	/* close files on exec */
+	VOP_UNLOCK(imgp->vp, 0, td);
 	fdcloseexec(td);
+	vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td);
 
 	/* Get a reference to the vnode prior to locking the proc */
 	VREF(ndp->ni_vp);
@@ -566,8 +565,10 @@
 		 * we do not regain any tracing during a possible block.
 		 */
 		setsugid(p);
+
 #ifdef KTRACE
-		if (p->p_tracevp != NULL && suser_cred(oldcred, SUSER_ALLOWJAIL)) {
+		if (p->p_tracevp != NULL &&
+		    priv_check_cred(oldcred, PRIV_DEBUG_DIFFCRED, 0)) {
 			mtx_lock(&ktrace_mtx);
 			p->p_traceflag = 0;
 			tracevp = p->p_tracevp;
@@ -588,7 +589,9 @@
 		 */
 		PROC_UNLOCK(p);
 		setugidsafety(td);
+		VOP_UNLOCK(imgp->vp, 0, td);
 		error = fdcheckstd(td);
+		vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td);
 		if (error != 0)
 			goto done1;
 		PROC_LOCK(p);
@@ -666,7 +669,7 @@
 	 * single thread mode.
 	 */
 	if (p->p_flag & P_TRACED)
-		tdsignal(td, SIGTRAP, SIGTARGET_TD);
+		tdsignal(p, td, SIGTRAP, NULL);
 
 	/* clear "fork but no exec" flag, as we _are_ execing */
 	p->p_acflag &= ~AFORK;
@@ -720,6 +723,7 @@
 		crfree(oldcred);
 	else
 		crfree(newcred);
+	VOP_UNLOCK(imgp->vp, 0, td);
 	/*
 	 * Handle deferred decrement of ref counts.
 	 */
@@ -733,11 +737,17 @@
 	if (ndp->ni_vp && error != 0)
 		vrele(ndp->ni_vp);
 #ifdef KTRACE
-	if (tracevp != NULL)
+	if (tracevp != NULL) {
+		int tvfslocked;
+
+		tvfslocked = VFS_LOCK_GIANT(tracevp->v_mount);
 		vrele(tracevp);
+		VFS_UNLOCK_GIANT(tvfslocked);
+	}
 	if (tracecred != NULL)
 		crfree(tracecred);
 #endif
+	vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td);
 	if (oldargs != NULL)
 		pargs_drop(oldargs);
 	if (newargs != NULL)
@@ -776,19 +786,6 @@
 	p->p_flag &= ~P_INEXEC;
 	PROC_UNLOCK(p);
 
-	if (imgp->vmspace_destroyed) {
-		/* sorry, no more process anymore. exit gracefully */
-#ifdef MAC
-		mac_execve_exit(imgp);
-		if (interplabel != NULL)
-			mac_vnode_label_free(interplabel);
-#endif
-		VFS_UNLOCK_GIANT(vfslocked);
-		exec_free_args(args);
-		exit1(td, W_EXITCODE(0, SIGABRT));
-		/* NOT REACHED */
-		error = 0;
-	}
 done2:
 #ifdef MAC
 	mac_execve_exit(imgp);
@@ -796,6 +793,13 @@
 		mac_vnode_label_free(interplabel);
 #endif
 	VFS_UNLOCK_GIANT(vfslocked);
+	exec_free_args(args);
+
+	if (error && imgp->vmspace_destroyed) {
+		/* sorry, no more process anymore. exit gracefully */
+		exit1(td, W_EXITCODE(0, SIGABRT));
+		/* NOT REACHED */
+	}
 	return (error);
 }
 
@@ -824,16 +828,12 @@
 			if ((ma[i] = vm_page_lookup(object, i)) != NULL) {
 				if (ma[i]->valid)
 					break;
-				vm_page_lock_queues();
-				if ((ma[i]->flags & PG_BUSY) || ma[i]->busy) {
-					vm_page_unlock_queues();
+				if ((ma[i]->oflags & VPO_BUSY) || ma[i]->busy)
 					break;
-				}
 				vm_page_busy(ma[i]);
-				vm_page_unlock_queues();
 			} else {
 				ma[i] = vm_page_alloc(object, i,
-				    VM_ALLOC_NORMAL);
+				    VM_ALLOC_NORMAL | VM_ALLOC_IFNOTCACHED);
 				if (ma[i] == NULL)
 					break;
 			}
@@ -845,7 +845,6 @@
 		    (ma[0]->valid == 0)) {
 			if (ma[0]) {
 				vm_page_lock_queues();
-				pmap_remove_all(ma[0]);
 				vm_page_free(ma[0]);
 				vm_page_unlock_queues();
 			}
@@ -855,8 +854,8 @@
 	}
 	vm_page_lock_queues();
 	vm_page_hold(ma[0]);
-	vm_page_wakeup(ma[0]);
 	vm_page_unlock_queues();
+	vm_page_wakeup(ma[0]);
 	VM_OBJECT_UNLOCK(object);
 
 	imgp->firstpage = sf_buf_alloc(ma[0], 0);
@@ -896,20 +895,13 @@
 	struct vmspace *vmspace = p->p_vmspace;
 	vm_offset_t stack_addr;
 	vm_map_t map;
+	u_long ssiz;
 
 	imgp->vmspace_destroyed = 1;
+	imgp->sysent = sv;
 
-	/* Called with Giant held, do not depend on it! */
-	EVENTHANDLER_INVOKE(process_exec, p);
-
-	/*
-	 * Here is as good a place as any to do any resource limit cleanups.
-	 * This is needed if a 64 bit binary exec's a 32 bit binary - the
-	 * data size limit may need to be changed to a value that makes
-	 * sense for the 32 bit binary.
-	 */
-	if (sv->sv_fixlimits != NULL)
-		sv->sv_fixlimits(imgp);
+	/* May be called with Giant held */
+	EVENTHANDLER_INVOKE(process_exec, p, imgp);
 
 	/*
 	 * Blow away entire process VM, if address space not shared,
@@ -920,18 +912,23 @@
 	if (vmspace->vm_refcnt == 1 && vm_map_min(map) == sv->sv_minuser &&
 	    vm_map_max(map) == sv->sv_maxuser) {
 		shmexit(vmspace);
-		pmap_remove_pages(vmspace_pmap(vmspace), vm_map_min(map),
-		    vm_map_max(map));
+		pmap_remove_pages(vmspace_pmap(vmspace));
 		vm_map_remove(map, vm_map_min(map), vm_map_max(map));
 	} else {
-		vmspace_exec(p, sv->sv_minuser, sv->sv_maxuser);
+		error = vmspace_exec(p, sv->sv_minuser, sv->sv_maxuser);
+		if (error)
+			return (error);
 		vmspace = p->p_vmspace;
 		map = &vmspace->vm_map;
 	}
 
 	/* Allocate a new stack */
-	stack_addr = sv->sv_usrstack - maxssiz;
-	error = vm_map_stack(map, stack_addr, (vm_size_t)maxssiz,
+	if (sv->sv_maxssiz != NULL)
+		ssiz = *sv->sv_maxssiz;
+	else
+		ssiz = maxssiz;
+	stack_addr = sv->sv_usrstack - ssiz;
+	error = vm_map_stack(map, stack_addr, (vm_size_t)ssiz,
 	    sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_DOWN);
 	if (error)
 		return (error);
@@ -939,7 +936,7 @@
 #ifdef __ia64__
 	/* Allocate a new register stack */
 	stack_addr = IA64_BACKINGSTORE;
-	error = vm_map_stack(map, stack_addr, (vm_size_t)maxssiz,
+	error = vm_map_stack(map, stack_addr, (vm_size_t)ssiz,
 	    sv->sv_stackprot, VM_PROT_ALL, MAP_STACK_GROWS_UP);
 	if (error)
 		return (error);
@@ -950,14 +947,14 @@
 	 * process stack so we can check the stack rlimit.
 	 */
 	vmspace->vm_ssize = sgrowsiz >> PAGE_SHIFT;
-	vmspace->vm_maxsaddr = (char *)sv->sv_usrstack - maxssiz;
+	vmspace->vm_maxsaddr = (char *)sv->sv_usrstack - ssiz;
 
 	return (0);
 }
 
 /*
- * Copy out argument and environment strings from the old process
- *	address space into the temporary string buffer.
+ * Copy out argument and environment strings from the old process address
+ * space into the temporary string buffer.
  */
 int
 exec_copyin_args(struct image_args *args, char *fname,
@@ -996,19 +993,21 @@
 	    copystr(fname, args->fname, PATH_MAX, &length) :
 	    copyinstr(fname, args->fname, PATH_MAX, &length);
 	if (error != 0)
-		return (error);
+		goto err_exit;
 
 	/*
 	 * extract arguments first
 	 */
 	while ((argp = (caddr_t) (intptr_t) fuword(argv++))) {
-		if (argp == (caddr_t) -1)
-			return (EFAULT);
+		if (argp == (caddr_t) -1) {
+			error = EFAULT;
+			goto err_exit;
+		}
 		if ((error = copyinstr(argp, args->endp,
 		    args->stringspace, &length))) {
-			if (error == ENAMETOOLONG)
-				return (E2BIG);
-			return (error);
+			if (error == ENAMETOOLONG) 
+				error = E2BIG;
+			goto err_exit;
 		}
 		args->stringspace -= length;
 		args->endp += length;
@@ -1022,13 +1021,15 @@
 	 */
 	if (envv) {
 		while ((envp = (caddr_t)(intptr_t)fuword(envv++))) {
-			if (envp == (caddr_t)-1)
-				return (EFAULT);
+			if (envp == (caddr_t)-1) {
+				error = EFAULT;
+				goto err_exit;
+			}
 			if ((error = copyinstr(envp, args->endp,
 			    args->stringspace, &length))) {
 				if (error == ENAMETOOLONG)
-					return (E2BIG);
-				return (error);
+					error = E2BIG;
+				goto err_exit;
 			}
 			args->stringspace -= length;
 			args->endp += length;
@@ -1037,9 +1038,13 @@
 	}
 
 	return (0);
+
+err_exit:
+	exec_free_args(args);
+	return (error);
 }
 
-void
+static void
 exec_free_args(struct image_args *args)
 {
 
@@ -1051,9 +1056,9 @@
 }
 
 /*
- * Copy strings out to the new process address space, constructing
- *	new arg and env vector tables. Return a pointer to the base
- *	so that it can be used as the initial stack pointer.
+ * Copy strings out to the new process address space, constructing new arg
+ * and env vector tables. Return a pointer to the base so that it can be used
+ * as the initial stack pointer.
  */
 register_t *
 exec_copyout_strings(imgp)
@@ -1231,7 +1236,7 @@
 	 * Call filesystem specific open routine (which does nothing in the
 	 * general case).
 	 */
-	error = VOP_OPEN(vp, FREAD, td->td_ucred, td, -1);
+	error = VOP_OPEN(vp, FREAD, td->td_ucred, td, NULL);
 	return (error);
 }
 
Index: kern_jail.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_jail.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/kern_jail.c -L sys/kern/kern_jail.c -u -r1.1.1.1 -r1.2
--- sys/kern/kern_jail.c
+++ sys/kern/kern_jail.c
@@ -8,7 +8,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_jail.c,v 1.50.2.1 2005/11/13 03:12:32 csjp Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_jail.c,v 1.70 2007/04/13 23:54:22 pjd Exp $");
 
 #include "opt_mac.h"
 
@@ -18,13 +18,14 @@
 #include <sys/systm.h>
 #include <sys/errno.h>
 #include <sys/sysproto.h>
-#include <sys/mac.h>
 #include <sys/malloc.h>
+#include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/taskqueue.h>
 #include <sys/jail.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
+#include <sys/sx.h>
 #include <sys/namei.h>
 #include <sys/mount.h>
 #include <sys/queue.h>
@@ -35,9 +36,10 @@
 #include <net/if.h>
 #include <netinet/in.h>
 
+#include <security/mac/mac_framework.h>
+
 MALLOC_DEFINE(M_PRISON, "prison", "Prison structures");
 
-SYSCTL_DECL(_security);
 SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW, 0,
     "Jail rules");
 
@@ -71,30 +73,48 @@
     &jail_chflags_allowed, 0,
     "Processes in jail can alter system file flags");
 
-/* allprison, lastprid, and prisoncount are protected by allprison_mtx. */
+int	jail_mount_allowed = 0;
+SYSCTL_INT(_security_jail, OID_AUTO, mount_allowed, CTLFLAG_RW,
+    &jail_mount_allowed, 0,
+    "Processes in jail can mount/unmount jail-friendly file systems");
+
+/* allprison, lastprid, and prisoncount are protected by allprison_lock. */
 struct	prisonlist allprison;
-struct	mtx allprison_mtx;
+struct	sx allprison_lock;
 int	lastprid = 0;
 int	prisoncount = 0;
 
+/*
+ * List of jail services. Protected by allprison_lock.
+ */
+TAILQ_HEAD(prison_services_head, prison_service);
+static struct prison_services_head prison_services =
+    TAILQ_HEAD_INITIALIZER(prison_services);
+static int prison_service_slots = 0;
+
+struct prison_service {
+	prison_create_t ps_create;
+	prison_destroy_t ps_destroy;
+	int		ps_slotno;
+	TAILQ_ENTRY(prison_service) ps_next;
+	char	ps_name[0];
+};
+
 static void		 init_prison(void *);
 static void		 prison_complete(void *context, int pending);
-static struct prison	*prison_find(int);
 static int		 sysctl_jail_list(SYSCTL_HANDLER_ARGS);
 
 static void
 init_prison(void *data __unused)
 {
 
-	mtx_init(&allprison_mtx, "allprison", NULL, MTX_DEF);
+	sx_init(&allprison_lock, "allprison");
 	LIST_INIT(&allprison);
 }
 
 SYSINIT(prison, SI_SUB_INTRINSIC, SI_ORDER_ANY, init_prison, NULL);
 
 /*
- * MPSAFE
- *
  * struct jail_args {
  *	struct jail *jail;
  * };
@@ -104,6 +124,7 @@
 {
 	struct nameidata nd;
 	struct prison *pr, *tpr;
+	struct prison_service *psrv;
 	struct jail j;
 	struct jail_attach_args jaa;
 	int vfslocked, error, tryprid;
@@ -136,9 +157,15 @@
 	pr->pr_ip = j.ip_number;
 	pr->pr_linux = NULL;
 	pr->pr_securelevel = securelevel;
+	if (prison_service_slots == 0)
+		pr->pr_slots = NULL;
+	else {
+		pr->pr_slots = malloc(sizeof(*pr->pr_slots) * prison_service_slots,
+		    M_PRISON, M_ZERO | M_WAITOK);
+	}
 
 	/* Determine next pr_id and add prison to allprison list. */
-	mtx_lock(&allprison_mtx);
+	sx_xlock(&allprison_lock);
 	tryprid = lastprid + 1;
 	if (tryprid == JAIL_MAX)
 		tryprid = 1;
@@ -147,7 +174,7 @@
 		if (tpr->pr_id == tryprid) {
 			tryprid++;
 			if (tryprid == JAIL_MAX) {
-				mtx_unlock(&allprison_mtx);
+				sx_xunlock(&allprison_lock);
 				error = EAGAIN;
 				goto e_dropvnref;
 			}
@@ -157,7 +184,11 @@
 	pr->pr_id = jaa.jid = lastprid = tryprid;
 	LIST_INSERT_HEAD(&allprison, pr, pr_list);
 	prisoncount++;
-	mtx_unlock(&allprison_mtx);
+	sx_downgrade(&allprison_lock);
+	TAILQ_FOREACH(psrv, &prison_services, ps_next) {
+		psrv->ps_create(psrv, pr);
+	}
+	sx_sunlock(&allprison_lock);
 
 	error = jail_attach(td, &jaa);
 	if (error)
@@ -168,10 +199,14 @@
 	td->td_retval[0] = jaa.jid;
 	return (0);
 e_dropprref:
-	mtx_lock(&allprison_mtx);
+	sx_xlock(&allprison_lock);
 	LIST_REMOVE(pr, pr_list);
 	prisoncount--;
-	mtx_unlock(&allprison_mtx);
+	sx_downgrade(&allprison_lock);
+	TAILQ_FOREACH(psrv, &prison_services, ps_next) {
+		psrv->ps_destroy(psrv, pr);
+	}
+	sx_sunlock(&allprison_lock);
 e_dropvnref:
 	vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount);
 	vrele(pr->pr_root);
@@ -183,8 +218,6 @@
 }
 
 /*
- * MPSAFE
- *
  * struct jail_attach_args {
  *	int jid;
  * };
@@ -196,7 +229,7 @@
 	struct ucred *newcred, *oldcred;
 	struct prison *pr;
 	int vfslocked, error;
-	
+
 	/*
 	 * XXX: Note that there is a slight race here if two threads
 	 * in the same privileged process attempt to attach to two
@@ -205,20 +238,20 @@
 	 * a process root from one prison, but attached to the jail
 	 * of another.
 	 */
-	error = suser(td);
+	error = priv_check(td, PRIV_JAIL_ATTACH);
 	if (error)
 		return (error);
 
 	p = td->td_proc;
-	mtx_lock(&allprison_mtx);
+	sx_slock(&allprison_lock);
 	pr = prison_find(uap->jid);
 	if (pr == NULL) {
-		mtx_unlock(&allprison_mtx);
+		sx_sunlock(&allprison_lock);
 		return (EINVAL);
 	}
 	pr->pr_ref++;
 	mtx_unlock(&pr->pr_mtx);
-	mtx_unlock(&allprison_mtx);
+	sx_sunlock(&allprison_lock);
 
 	vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount);
 	vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY, td);
@@ -254,15 +287,19 @@
 /*
  * Returns a locked prison instance, or NULL on failure.
  */
-static struct prison *
+struct prison *
 prison_find(int prid)
 {
 	struct prison *pr;
 
-	mtx_assert(&allprison_mtx, MA_OWNED);
+	sx_assert(&allprison_lock, SX_LOCKED);
 	LIST_FOREACH(pr, &allprison, pr_list) {
 		if (pr->pr_id == prid) {
 			mtx_lock(&pr->pr_mtx);
+			if (pr->pr_ref == 0) {
+				mtx_unlock(&pr->pr_mtx);
+				break;
+			}
 			return (pr);
 		}
 	}
@@ -273,31 +310,35 @@
 prison_free(struct prison *pr)
 {
 
-	mtx_lock(&allprison_mtx);
 	mtx_lock(&pr->pr_mtx);
 	pr->pr_ref--;
 	if (pr->pr_ref == 0) {
-		LIST_REMOVE(pr, pr_list);
 		mtx_unlock(&pr->pr_mtx);
-		prisoncount--;
-		mtx_unlock(&allprison_mtx);
-
 		TASK_INIT(&pr->pr_task, 0, prison_complete, pr);
 		taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
 		return;
 	}
 	mtx_unlock(&pr->pr_mtx);
-	mtx_unlock(&allprison_mtx);
 }
 
 static void
 prison_complete(void *context, int pending)
 {
+	struct prison_service *psrv;
 	struct prison *pr;
 	int vfslocked;
 
 	pr = (struct prison *)context;
 
+	sx_xlock(&allprison_lock);
+	LIST_REMOVE(pr, pr_list);
+	prisoncount--;
+	sx_downgrade(&allprison_lock);
+	TAILQ_FOREACH(psrv, &prison_services, ps_next) {
+		psrv->ps_destroy(psrv, pr);
+	}
+	sx_sunlock(&allprison_lock);
+
 	vfslocked = VFS_LOCK_GIANT(pr->pr_root->v_mount);
 	vrele(pr->pr_root);
 	VFS_UNLOCK_GIANT(vfslocked);
@@ -313,6 +354,8 @@
 {
 
 	mtx_lock(&pr->pr_mtx);
+	KASSERT(pr->pr_ref > 0,
+	    ("Trying to hold dead prison (id=%d).", pr->pr_id));
 	pr->pr_ref++;
 	mtx_unlock(&pr->pr_mtx);
 }
@@ -331,12 +374,12 @@
 
 	if (!jailed(cred))
 		return (0);
-	if (flag) 
+	if (flag)
 		tmp = *ip;
 	else
 		tmp = ntohl(*ip);
 	if (tmp == INADDR_ANY) {
-		if (flag) 
+		if (flag)
 			*ip = cred->cr_prison->pr_ip;
 		else
 			*ip = htonl(cred->cr_prison->pr_ip);
@@ -523,6 +566,372 @@
 	}
 }
 
+/*
+ * Check whether a specific privilege is granted within jail.  We
+ * have a specific list of accepted privileges; the rest are denied.
+ */
+int
+prison_priv_check(struct ucred *cred, int priv)
+{
+
+	if (!jailed(cred))
+		return (0);
+
+	switch (priv) {
+
+		/*
+		 * Allow ktrace privileges for root in jail.
+		 */
+	case PRIV_KTRACE:
+
+#if 0
+		/*
+		 * Allow jailed processes to configure audit identity and
+		 * submit audit records (login, etc).  In the future we may
+		 * want to further refine the relationship between audit and
+		 * jail.
+		 */
+	case PRIV_AUDIT_GETAUDIT:
+	case PRIV_AUDIT_SETAUDIT:
+	case PRIV_AUDIT_SUBMIT:
+#endif
+
+		/*
+		 * Allow jailed processes to manipulate process UNIX
+		 * credentials in any way they see fit.
+		 */
+	case PRIV_CRED_SETUID:
+	case PRIV_CRED_SETEUID:
+	case PRIV_CRED_SETGID:
+	case PRIV_CRED_SETEGID:
+	case PRIV_CRED_SETGROUPS:
+	case PRIV_CRED_SETREUID:
+	case PRIV_CRED_SETREGID:
+	case PRIV_CRED_SETRESUID:
+	case PRIV_CRED_SETRESGID:
+
+		/*
+		 * Jail implements visibility constraints already, so allow
+		 * jailed root to override uid/gid-based constraints.
+		 */
+	case PRIV_SEEOTHERGIDS:
+	case PRIV_SEEOTHERUIDS:
+
+		/*
+		 * Jail implements inter-process debugging limits already, so
+		 * allow jailed root various debugging privileges.
+		 */
+	case PRIV_DEBUG_DIFFCRED:
+	case PRIV_DEBUG_SUGID:
+	case PRIV_DEBUG_UNPRIV:
+
+		/*
+		 * Allow jail to set various resource limits and login
+		 * properties, and for now, exceed process resource limits.
+		 */
+	case PRIV_PROC_LIMIT:
+	case PRIV_PROC_SETLOGIN:
+	case PRIV_PROC_SETRLIMIT:
+
+		/*
+		 * System V and POSIX IPC privileges are granted in jail.
+		 */
+	case PRIV_IPC_READ:
+	case PRIV_IPC_WRITE:
+	case PRIV_IPC_ADMIN:
+	case PRIV_IPC_MSGSIZE:
+	case PRIV_MQ_ADMIN:
+
+		/*
+		 * Jail implements its own inter-process limits, so allow
+		 * root processes in jail to change scheduling on other
+		 * processes in the same jail.  Likewise for signalling.
+		 */
+	case PRIV_SCHED_DIFFCRED:
+	case PRIV_SIGNAL_DIFFCRED:
+	case PRIV_SIGNAL_SUGID:
+
+		/*
+		 * Allow jailed processes to write to sysctls marked as jail
+		 * writable.
+		 */
+	case PRIV_SYSCTL_WRITEJAIL:
+
+		/*
+		 * Allow root in jail to manage a variety of quota
+		 * properties.  These should likely be conditional on a
+		 * configuration option.
+		 */
+	case PRIV_VFS_GETQUOTA:
+	case PRIV_VFS_SETQUOTA:
+
+		/*
+		 * Since Jail relies on chroot() to implement file system
+		 * protections, grant many VFS privileges to root in jail.
+		 * Be careful to exclude mount-related and NFS-related
+		 * privileges.
+		 */
+	case PRIV_VFS_READ:
+	case PRIV_VFS_WRITE:
+	case PRIV_VFS_ADMIN:
+	case PRIV_VFS_EXEC:
+	case PRIV_VFS_LOOKUP:
+	case PRIV_VFS_BLOCKRESERVE:	/* XXXRW: Slightly surprising. */
+	case PRIV_VFS_CHFLAGS_DEV:
+	case PRIV_VFS_CHOWN:
+	case PRIV_VFS_CHROOT:
+	case PRIV_VFS_RETAINSUGID:
+	case PRIV_VFS_FCHROOT:
+	case PRIV_VFS_LINK:
+	case PRIV_VFS_SETGID:
+	case PRIV_VFS_STICKYFILE:
+		return (0);
+
+		/*
+		 * Depending on the global setting, allow privilege of
+		 * setting system flags.
+		 */
+	case PRIV_VFS_SYSFLAGS:
+		if (jail_chflags_allowed)
+			return (0);
+		else
+			return (EPERM);
+
+		/*
+		 * Depending on the global setting, allow privilege of
+		 * mounting/unmounting file systems.
+		 */
+	case PRIV_VFS_MOUNT:
+	case PRIV_VFS_UNMOUNT:
+	case PRIV_VFS_MOUNT_NONUSER:
+	case PRIV_VFS_MOUNT_OWNER:
+		if (jail_mount_allowed)
+			return (0);
+		else
+			return (EPERM);
+
+		/*
+		 * Allow jailed root to bind reserved ports and reuse in-use
+		 * ports.
+		 */
+	case PRIV_NETINET_RESERVEDPORT:
+	case PRIV_NETINET_REUSEPORT:
+		return (0);
+
+		/*
+		 * Conditionally allow creating raw sockets in jail.
+		 */
+	case PRIV_NETINET_RAW:
+		if (jail_allow_raw_sockets)
+			return (0);
+		else
+			return (EPERM);
+
+		/*
+		 * Since jail implements its own visibility limits on netstat
+		 * sysctls, allow getcred.  This allows identd to work in
+		 * jail.
+		 */
+	case PRIV_NETINET_GETCRED:
+		return (0);
+
+	default:
+		/*
+		 * In all remaining cases, deny the privilege request.  This
+		 * includes almost all network privileges and many system
+		 * configuration privileges.
+		 */
+		return (EPERM);
+	}
+}
+
+/*
+ * Register a jail service.  Provides 'create' and 'destroy' methods.
+ * The 'create' method will be called for every existing jail and for all
+ * jails created in the future as they are being created.
+ * The 'destroy' method will be called for every jail going away and
+ * for all existing jails at the time of service deregistration.
+ */
+struct prison_service *
+prison_service_register(const char *name, prison_create_t create,
+    prison_destroy_t destroy)
+{
+	struct prison_service *psrv, *psrv2;
+	struct prison *pr;
+	int reallocate = 1, slotno = 0;
+	void **slots, **oldslots;
+
+	psrv = malloc(sizeof(*psrv) + strlen(name) + 1, M_PRISON,
+	    M_WAITOK | M_ZERO);
+	psrv->ps_create = create;
+	psrv->ps_destroy = destroy;
+	strcpy(psrv->ps_name, name);
+	/*
+	 * Grab the allprison_lock here, so we won't miss any jail
+	 * creation/destruction.
+	 */
+	sx_xlock(&allprison_lock);
+#ifdef INVARIANTS
+	/*
+	 * Verify that the service is not already registered.
+	 */
+	TAILQ_FOREACH(psrv2, &prison_services, ps_next) {
+		KASSERT(strcmp(psrv2->ps_name, name) != 0,
+		    ("jail service %s already registered", name));
+	}
+#endif
+	/*
+	 * Find free slot. When there is no existing free slot available,
+	 * allocate one at the end.
+	 */
+	TAILQ_FOREACH(psrv2, &prison_services, ps_next) {
+		if (psrv2->ps_slotno != slotno) {
+			KASSERT(slotno < psrv2->ps_slotno,
+			    ("Invalid slotno (slotno=%d >= ps_slotno=%d)",
+			    slotno, psrv2->ps_slotno));
+			/* We found free slot. */
+			reallocate = 0;
+			break;
+		}
+		slotno++;
+	}
+	psrv->ps_slotno = slotno;
+	/*
+	 * Keep the list sorted by slot number.
+	 */
+	if (psrv2 != NULL) {
+		KASSERT(reallocate == 0, ("psrv2 != NULL && reallocate != 0"));
+		TAILQ_INSERT_BEFORE(psrv2, psrv, ps_next);
+	} else {
+		KASSERT(reallocate == 1, ("psrv2 == NULL && reallocate == 0"));
+		TAILQ_INSERT_TAIL(&prison_services, psrv, ps_next);
+	}
+	prison_service_slots++;
+	sx_downgrade(&allprison_lock);
+	/*
+	 * Allocate memory for the new slot if we didn't find an empty one.
+	 * Do not use realloc(9), because pr_slots is protected with a mutex,
+	 * so we can't sleep.
+	 */
+	LIST_FOREACH(pr, &allprison, pr_list) {
+		if (reallocate) {
+			/* First allocate memory with M_WAITOK. */
+			slots = malloc(sizeof(*slots) * prison_service_slots,
+			    M_PRISON, M_WAITOK);
+			/* Now grab the mutex and replace pr_slots. */
+			mtx_lock(&pr->pr_mtx);
+			oldslots = pr->pr_slots;
+			if (psrv->ps_slotno > 0) {
+				bcopy(oldslots, slots,
+				    sizeof(*slots) * (prison_service_slots - 1));
+			}
+			slots[psrv->ps_slotno] = NULL;
+			pr->pr_slots = slots;
+			mtx_unlock(&pr->pr_mtx);
+			if (oldslots != NULL)
+				free(oldslots, M_PRISON);
+		}
+		/*
+		 * Call 'create' method for each existing jail.
+		 */
+		psrv->ps_create(psrv, pr);
+	}
+	sx_sunlock(&allprison_lock);
+
+	return (psrv);
+}
+
+void
+prison_service_deregister(struct prison_service *psrv)
+{
+	struct prison *pr;
+	void **slots, **oldslots;
+	int last = 0;
+
+	sx_xlock(&allprison_lock);
+	if (TAILQ_LAST(&prison_services, prison_services_head) == psrv)
+		last = 1;
+	TAILQ_REMOVE(&prison_services, psrv, ps_next);
+	prison_service_slots--;
+	sx_downgrade(&allprison_lock);
+	LIST_FOREACH(pr, &allprison, pr_list) {
+		/*
+		 * Call 'destroy' method for every currently existing jail.
+		 */
+		psrv->ps_destroy(psrv, pr);
+		/*
+		 * If this is the last slot, free the memory allocated for it.
+		 */
+		if (last) {
+			if (prison_service_slots == 0)
+				slots = NULL;
+			else {
+				slots = malloc(sizeof(*slots) * prison_service_slots,
+				    M_PRISON, M_WAITOK);
+			}
+			mtx_lock(&pr->pr_mtx);
+			oldslots = pr->pr_slots;
+			/*
+			 * We require the slot to be set to NULL after freeing it,
+			 * so that we can check for memory leaks here.
+			 */
+			KASSERT(oldslots[psrv->ps_slotno] == NULL,
+			    ("Slot %d (service %s, jailid=%d) still contains data?",
+			     psrv->ps_slotno, psrv->ps_name, pr->pr_id));
+			if (psrv->ps_slotno > 0) {
+				bcopy(oldslots, slots,
+				    sizeof(*slots) * prison_service_slots);
+			}
+			pr->pr_slots = slots;
+			mtx_unlock(&pr->pr_mtx);
+			KASSERT(oldslots != NULL, ("oldslots == NULL"));
+			free(oldslots, M_PRISON);
+		}
+	}
+	sx_sunlock(&allprison_lock);
+	free(psrv, M_PRISON);
+}
+
+/*
+ * Function sets data for the given jail in slot assigned for the given
+ * jail service.
+ */
+void
+prison_service_data_set(struct prison_service *psrv, struct prison *pr,
+    void *data)
+{
+
+	mtx_assert(&pr->pr_mtx, MA_OWNED);
+	pr->pr_slots[psrv->ps_slotno] = data;
+}
+
+/*
+ * Function clears slots assigned for the given jail service in the given
+ * prison structure and returns current slot data.
+ */
+void *
+prison_service_data_del(struct prison_service *psrv, struct prison *pr)
+{
+	void *data;
+
+	mtx_assert(&pr->pr_mtx, MA_OWNED);
+	data = pr->pr_slots[psrv->ps_slotno];
+	pr->pr_slots[psrv->ps_slotno] = NULL;
+	return (data);
+}
+
+/*
+ * Function returns current data from the slot assigned to the given jail
+ * service for the given jail.
+ */
+void *
+prison_service_data_get(struct prison_service *psrv, struct prison *pr)
+{
+
+	mtx_assert(&pr->pr_mtx, MA_OWNED);
+	return (pr->pr_slots[psrv->ps_slotno]);
+}
+
 static int
 sysctl_jail_list(SYSCTL_HANDLER_ARGS)
 {
@@ -532,39 +941,30 @@
 
 	if (jailed(req->td->td_ucred))
 		return (0);
-retry:
-	mtx_lock(&allprison_mtx);
-	count = prisoncount;
-	mtx_unlock(&allprison_mtx);
 
-	if (count == 0)
+	sx_slock(&allprison_lock);
+	if ((count = prisoncount) == 0) {
+		sx_sunlock(&allprison_lock);
 		return (0);
+	}
 
 	sxp = xp = malloc(sizeof(*xp) * count, M_TEMP, M_WAITOK | M_ZERO);
-	mtx_lock(&allprison_mtx);
-	if (count != prisoncount) {
-		mtx_unlock(&allprison_mtx);
-		free(sxp, M_TEMP);
-		goto retry;
-	}
-	
+
 	LIST_FOREACH(pr, &allprison, pr_list) {
-		mtx_lock(&pr->pr_mtx);
 		xp->pr_version = XPRISON_VERSION;
 		xp->pr_id = pr->pr_id;
+		xp->pr_ip = pr->pr_ip;
 		strlcpy(xp->pr_path, pr->pr_path, sizeof(xp->pr_path));
+		mtx_lock(&pr->pr_mtx);
 		strlcpy(xp->pr_host, pr->pr_host, sizeof(xp->pr_host));
-		xp->pr_ip = pr->pr_ip;
 		mtx_unlock(&pr->pr_mtx);
 		xp++;
 	}
-	mtx_unlock(&allprison_mtx);
+	sx_sunlock(&allprison_lock);
 
 	error = SYSCTL_OUT(req, sxp, sizeof(*sxp) * count);
 	free(sxp, M_TEMP);
-	if (error)
-		return (error);
-	return (0);
+	return (error);
 }
 
 SYSCTL_OID(_security_jail, OID_AUTO, list, CTLTYPE_STRUCT | CTLFLAG_RD,
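
The prison_service_*() functions added above form a small per-jail state API: a
subsystem registers 'create' and 'destroy' callbacks, receives a slot in every
prison's pr_slots array, and stores its data there under pr_mtx.  A minimal
consumer sketch follows; the "myfs" names and its state structure are
hypothetical, and the prison_create_t/prison_destroy_t callbacks are assumed to
take (struct prison_service *, struct prison *) and return int, matching how
ps_create/ps_destroy are invoked in this diff.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/jail.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>

struct myfs_jail_state {		/* hypothetical per-jail data */
	int	mj_flags;
};

static struct prison_service *myfs_psrv;

static int
myfs_prison_create(struct prison_service *psrv, struct prison *pr)
{
	struct myfs_jail_state *st;

	/* Allocate outside the mutex, then publish under pr_mtx as
	 * prison_service_data_set() requires. */
	st = malloc(sizeof(*st), M_TEMP, M_WAITOK | M_ZERO);
	mtx_lock(&pr->pr_mtx);
	prison_service_data_set(psrv, pr, st);
	mtx_unlock(&pr->pr_mtx);
	return (0);
}

static int
myfs_prison_destroy(struct prison_service *psrv, struct prison *pr)
{
	struct myfs_jail_state *st;

	mtx_lock(&pr->pr_mtx);
	st = prison_service_data_del(psrv, pr);
	mtx_unlock(&pr->pr_mtx);
	free(st, M_TEMP);
	return (0);
}

static void
myfs_jail_init(void)
{

	/* The create callback runs for every jail that already exists and
	 * for each one created later; prison_service_deregister(myfs_psrv)
	 * would run destroy for all remaining jails and release the slot. */
	myfs_psrv = prison_service_register("myfs", myfs_prison_create,
	    myfs_prison_destroy);
}
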
Index: tty_compat.c
===================================================================
RCS file: /home/cvs/src/sys/kern/tty_compat.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/tty_compat.c -L sys/kern/tty_compat.c -u -r1.1.1.1 -r1.2
--- sys/kern/tty_compat.c
+++ sys/kern/tty_compat.c
@@ -30,15 +30,13 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/tty_compat.c,v 1.37 2004/06/21 22:57:15 phk Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/tty_compat.c,v 1.39 2006/01/10 09:19:09 phk Exp $");
 
 #include "opt_compat.h"
 
-#ifndef BURN_BRIDGES
 /*
  * mapping routines for old line discipline (yuck)
  */
-#if defined(COMPAT_43)
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -93,7 +91,7 @@
 	return (1); /* 50, min and not hangup */
 }
 
-int
+static int
 ttsetcompat(struct tty *tp, u_long *com, caddr_t data, struct termios *term)
 {
 	switch (*com) {
@@ -471,6 +469,3 @@
 	t->c_lflag = lflag;
 	t->c_cflag = cflag;
 }
-#endif	/* COMPAT_43 */
-
-#endif /* BURN_BRIDGES */
--- /dev/null
+++ sys/kern/posix4_mib.c
@@ -0,0 +1,151 @@
+/*-
+ * Copyright (c) 1998
+ *	HD Associates, Inc.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by HD Associates, Inc
+ * 4. Neither the name of the author nor the names of any co-contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY HD ASSOCIATES AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL HD ASSOCIATES OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: src/sys/kern/posix4_mib.c,v 1.12 2006/11/12 03:34:03 trhodes Exp $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/queue.h>
+#include <sys/sysctl.h>
+#include <sys/vnode.h>
+#include <sys/proc.h>
+#include <sys/posix4.h>
+
+static int facility[CTL_P1003_1B_MAXID - 1];
+static int facility_initialized[CTL_P1003_1B_MAXID - 1];
+
+/* OID_AUTO isn't working with sysconf(3).  I guess I'd have to
+ * modify it to do a lookup by name from the index.
+ * For now I've left it a top-level sysctl.
+ */
+
+#if 1
+
+SYSCTL_DECL(_p1003_1b);
+
+#define P1B_SYSCTL(num, name)  \
+SYSCTL_INT(_p1003_1b, num, \
+	name, CTLFLAG_RD, facility + num - 1, 0, "");
+
+#else
+
+SYSCTL_DECL(_kern_p1003_1b);
+
+#define P1B_SYSCTL(num, name)  \
+SYSCTL_INT(_kern_p1003_1b, OID_AUTO, \
+	name, CTLFLAG_RD, facility + num - 1, 0, "");
+SYSCTL_NODE(_kern, OID_AUTO, p1003_1b, CTLFLAG_RW, 0, "P1003.1B");
+
+#endif
+
+SYSCTL_INT(_p1003_1b, CTL_P1003_1B_ASYNCHRONOUS_IO, \
+	asynchronous_io, CTLFLAG_RD, &async_io_version, 0, "");
+P1B_SYSCTL(CTL_P1003_1B_MAPPED_FILES, mapped_files);
+P1B_SYSCTL(CTL_P1003_1B_MEMLOCK, memlock);
+P1B_SYSCTL(CTL_P1003_1B_MEMLOCK_RANGE, memlock_range);
+P1B_SYSCTL(CTL_P1003_1B_MEMORY_PROTECTION, memory_protection);
+P1B_SYSCTL(CTL_P1003_1B_MESSAGE_PASSING, message_passing);
+P1B_SYSCTL(CTL_P1003_1B_PRIORITIZED_IO, prioritized_io);
+P1B_SYSCTL(CTL_P1003_1B_PRIORITY_SCHEDULING, priority_scheduling);
+P1B_SYSCTL(CTL_P1003_1B_REALTIME_SIGNALS, realtime_signals);
+P1B_SYSCTL(CTL_P1003_1B_SEMAPHORES, semaphores);
+P1B_SYSCTL(CTL_P1003_1B_FSYNC, fsync);
+P1B_SYSCTL(CTL_P1003_1B_SHARED_MEMORY_OBJECTS, shared_memory_objects);
+P1B_SYSCTL(CTL_P1003_1B_SYNCHRONIZED_IO, synchronized_io);
+P1B_SYSCTL(CTL_P1003_1B_TIMERS, timers);
+P1B_SYSCTL(CTL_P1003_1B_AIO_LISTIO_MAX, aio_listio_max);
+P1B_SYSCTL(CTL_P1003_1B_AIO_MAX, aio_max);
+P1B_SYSCTL(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, aio_prio_delta_max);
+P1B_SYSCTL(CTL_P1003_1B_DELAYTIMER_MAX, delaytimer_max);
+P1B_SYSCTL(CTL_P1003_1B_MQ_OPEN_MAX, mq_open_max);
+P1B_SYSCTL(CTL_P1003_1B_PAGESIZE, pagesize);
+P1B_SYSCTL(CTL_P1003_1B_RTSIG_MAX, rtsig_max);
+P1B_SYSCTL(CTL_P1003_1B_SEM_NSEMS_MAX, sem_nsems_max);
+P1B_SYSCTL(CTL_P1003_1B_SEM_VALUE_MAX, sem_value_max);
+P1B_SYSCTL(CTL_P1003_1B_SIGQUEUE_MAX, sigqueue_max);
+P1B_SYSCTL(CTL_P1003_1B_TIMER_MAX, timer_max);
+
+#define P31B_VALID(num)	((num) >= 1 && (num) < CTL_P1003_1B_MAXID)
+
+/* p31b_setcfg: Set the configuration
+ */
+void
+p31b_setcfg(int num, int value)
+{
+
+	if (P31B_VALID(num)) {
+		facility[num - 1] = value;
+		facility_initialized[num - 1] = 1;
+	}
+}
+
+int
+p31b_getcfg(int num)
+{
+
+	if (P31B_VALID(num))
+		return (facility[num - 1]);
+	return (0);
+}
+
+int
+p31b_iscfg(int num)
+{
+
+	if (P31B_VALID(num))
+		return (facility_initialized[num - 1]);
+	return (0);
+}
+
+/*
+ * Turn on indications for standard (non-configurable) kernel features.
+ */
+static void
+p31b_set_standard(void *dummy)
+{
+	/* ??? p31b_setcfg(CTL_P1003_1B_FSYNC, 1); */
+	p31b_setcfg(CTL_P1003_1B_MAPPED_FILES, 1);
+	p31b_setcfg(CTL_P1003_1B_SHARED_MEMORY_OBJECTS, 1);
+	p31b_setcfg(CTL_P1003_1B_PAGESIZE, PAGE_SIZE);
+	if (!p31b_iscfg(CTL_P1003_1B_AIO_LISTIO_MAX))
+		p31b_setcfg(CTL_P1003_1B_AIO_LISTIO_MAX, -1);
+	if (!p31b_iscfg(CTL_P1003_1B_AIO_MAX))
+		p31b_setcfg(CTL_P1003_1B_AIO_MAX, -1);
+	if (!p31b_iscfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX))
+		p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, -1);
+}
+
+SYSINIT(p31b_set_standard, SI_SUB_P1003_1B, SI_ORDER_ANY, p31b_set_standard, 
+	0);
+
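
The facility values filled in by p31b_setcfg() above are exported as read-only
sysctls under the top-level p1003_1b node, which is how sysconf(3) discovers
optional POSIX.1B features.  A userland sketch of reading one of them; the
sysctl name "p1003_1b.pagesize" is inferred from the _p1003_1b node and the
P1B_SYSCTL(CTL_P1003_1B_PAGESIZE, pagesize) entry, so treat it as an
assumption.

#include <sys/types.h>
#include <sys/sysctl.h>

#include <stdio.h>

int
main(void)
{
	int pagesize;
	size_t len = sizeof(pagesize);

	/* Reads the value set by p31b_setcfg(CTL_P1003_1B_PAGESIZE, PAGE_SIZE). */
	if (sysctlbyname("p1003_1b.pagesize", &pagesize, &len, NULL, 0) == -1) {
		perror("sysctlbyname");
		return (1);
	}
	printf("p1003_1b.pagesize = %d\n", pagesize);
	return (0);
}
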
--- /dev/null
+++ sys/kern/ksched.c
@@ -0,0 +1,277 @@
+/*-
+ * Copyright (c) 1996, 1997
+ *	HD Associates, Inc.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by HD Associates, Inc
+ * 4. Neither the name of the author nor the names of any co-contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY HD ASSOCIATES AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL HD ASSOCIATES OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/* ksched: Soft real time scheduling based on "rtprio".
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: src/sys/kern/ksched.c,v 1.36 2007/06/05 00:00:54 jeff Exp $");
+
+#include "opt_posix.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/posix4.h>
+#include <sys/resource.h>
+#include <sys/sched.h>
+
+/* ksched: Real-time extension to support POSIX priority scheduling.
+ */
+
+struct ksched {
+	struct timespec rr_interval;
+};
+
+int
+ksched_attach(struct ksched **p)
+{
+	struct ksched *ksched= p31b_malloc(sizeof(*ksched));
+
+	ksched->rr_interval.tv_sec = 0;
+	ksched->rr_interval.tv_nsec = 1000000000L / sched_rr_interval();
+
+	*p = ksched;
+	return 0;
+}
+
+int
+ksched_detach(struct ksched *ks)
+{
+	p31b_free(ks);
+
+	return 0;
+}
+
+/*
+ * XXX About priorities
+ *
+ *	POSIX 1003.1b requires that numerically higher priorities be of
+ *	higher priority.  It also permits sched_setparam to be
+ *	implementation defined for SCHED_OTHER.  I don't like
+ *	the notion of inverted priorities for normal processes when
+ *	you can use "setpriority" for that.
+ *
+ *	I'm rejecting sched_setparam for SCHED_OTHER with EINVAL.
+ */
+
+/* Macros to convert between the unix (lower numerically is higher priority)
+ * and POSIX 1003.1b (higher numerically is higher priority)
+ */
+
+#define p4prio_to_rtpprio(P) (RTP_PRIO_MAX - (P))
+#define rtpprio_to_p4prio(P) (RTP_PRIO_MAX - (P))
+
+/* These improve readability a bit for me:
+ */
+#define P1B_PRIO_MIN rtpprio_to_p4prio(RTP_PRIO_MAX)
+#define P1B_PRIO_MAX rtpprio_to_p4prio(RTP_PRIO_MIN)
+
+static __inline int
+getscheduler(struct ksched *ksched, struct thread *td, int *policy)
+{
+	struct rtprio rtp;
+	int e = 0;
+
+	pri_to_rtp(td, &rtp);
+	switch (rtp.type)
+	{
+		case RTP_PRIO_FIFO:
+		*policy = SCHED_FIFO;
+		break;
+
+		case RTP_PRIO_REALTIME:
+		*policy = SCHED_RR;
+		break;
+
+		default:
+		*policy = SCHED_OTHER;
+		break;
+	}
+
+	return e;
+}
+
+int
+ksched_setparam(struct ksched *ksched,
+    struct thread *td, const struct sched_param *param)
+{
+	int policy;
+	int e;
+
+	e = getscheduler(ksched, td, &policy);
+
+	if (e == 0)
+	{
+		if (policy == SCHED_OTHER)
+			e = EINVAL;
+		else
+			e = ksched_setscheduler(ksched, td, policy, param);
+	}
+
+	return e;
+}
+
+int
+ksched_getparam(struct ksched *ksched,
+    struct thread *td, struct sched_param *param)
+{
+	struct rtprio rtp;
+
+	pri_to_rtp(td, &rtp);
+	if (RTP_PRIO_IS_REALTIME(rtp.type))
+		param->sched_priority = rtpprio_to_p4prio(rtp.prio);
+
+	return 0;
+}
+
+/*
+ * XXX The priority and scheduler modifications should
+ *     be moved into published interfaces in kern/kern_sync.
+ *
+ * The permissions to modify process p were checked in "p31b_proc()".
+ *
+ */
+int
+ksched_setscheduler(struct ksched *ksched,
+    struct thread *td, int policy, const struct sched_param *param)
+{
+	int e = 0;
+	struct rtprio rtp;
+
+	switch(policy)
+	{
+		case SCHED_RR:
+		case SCHED_FIFO:
+
+		if (param->sched_priority >= P1B_PRIO_MIN &&
+		    param->sched_priority <= P1B_PRIO_MAX)
+		{
+			rtp.prio = p4prio_to_rtpprio(param->sched_priority);
+			rtp.type = (policy == SCHED_FIFO)
+				? RTP_PRIO_FIFO : RTP_PRIO_REALTIME;
+
+			rtp_to_pri(&rtp, td);
+		}
+		else
+			e = EPERM;
+
+
+		break;
+
+		case SCHED_OTHER:
+		{
+			rtp.type = RTP_PRIO_NORMAL;
+			rtp.prio = p4prio_to_rtpprio(param->sched_priority);
+			rtp_to_pri(&rtp, td);
+		}
+		break;
+		
+		default:
+			e = EINVAL;
+			break;
+	}
+
+	return e;
+}
+
+int
+ksched_getscheduler(struct ksched *ksched, struct thread *td, int *policy)
+{
+	return getscheduler(ksched, td, policy);
+}
+
+/* ksched_yield: Yield the CPU.
+ */
+int
+ksched_yield(struct ksched *ksched)
+{
+	sched_relinquish(curthread);
+	return 0;
+}
+
+int
+ksched_get_priority_max(struct ksched *ksched, int policy, int *prio)
+{
+	int e = 0;
+
+	switch (policy)
+	{
+		case SCHED_FIFO:
+		case SCHED_RR:
+		*prio = RTP_PRIO_MAX;
+		break;
+
+		case SCHED_OTHER:
+		*prio = PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE;
+		break;
+
+		default:
+		e = EINVAL;
+	}
+
+	return e;
+}
+
+int
+ksched_get_priority_min(struct ksched *ksched, int policy, int *prio)
+{
+	int e = 0;
+
+	switch (policy)
+	{
+		case SCHED_FIFO:
+		case SCHED_RR:
+		*prio = P1B_PRIO_MIN;
+		break;
+
+		case SCHED_OTHER:
+		*prio = 0;
+		break;
+
+		default:
+		e = EINVAL;
+	}
+
+	return e;
+}
+
+int
+ksched_rr_get_interval(struct ksched *ksched,
+   struct thread *td, struct timespec *timespec)
+{
+	*timespec = ksched->rr_interval;
+
+	return 0;
+}
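
The p4prio_to_rtpprio()/rtpprio_to_p4prio() macros above carry the whole
translation: POSIX 1003.1b priorities grow upward while rtprio values grow
downward, so a single subtraction maps each range onto the other.  A standalone
sketch of the mapping; the bounds (RTP_PRIO_MIN = 0, RTP_PRIO_MAX = 31) are
assumed from the usual sys/rtprio.h values and only the inversion itself
matters here.

#include <stdio.h>

#define RTP_PRIO_MIN		0	/* assumed: numerically lowest, highest priority */
#define RTP_PRIO_MAX		31	/* assumed: numerically highest, lowest priority */

#define p4prio_to_rtpprio(P)	(RTP_PRIO_MAX - (P))
#define rtpprio_to_p4prio(P)	(RTP_PRIO_MAX - (P))

int
main(void)
{
	int p4, rtp;

	/* POSIX: bigger number means higher priority; rtprio: smaller does. */
	for (p4 = 0; p4 <= RTP_PRIO_MAX; p4++) {
		rtp = p4prio_to_rtpprio(p4);
		printf("POSIX %2d -> rtprio %2d -> POSIX %2d\n",
		    p4, rtp, rtpprio_to_p4prio(rtp));
	}
	return (0);
}

Because the map is its own inverse, the round trip returns the original value,
which is why one macro body serves both directions.
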
Index: kern_prot.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_prot.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/kern_prot.c -L sys/kern/kern_prot.c -u -r1.1.1.1 -r1.2
--- sys/kern/kern_prot.c
+++ sys/kern/kern_prot.c
@@ -1,12 +1,14 @@
 /*-
  * Copyright (c) 1982, 1986, 1989, 1990, 1991, 1993
- *	The Regents of the University of California.  All rights reserved.
+ *	The Regents of the University of California.
  * (c) UNIX System Laboratories, Inc.
+ * Copyright (c) 2000-2001 Robert N. M. Watson.
+ * All rights reserved.
+ *
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
- * Copyright (c) 2000-2001 Robert N. M. Watson.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -40,7 +42,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_prot.c,v 1.200 2005/04/18 13:36:56 rwatson Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_prot.c,v 1.211 2007/06/12 00:11:59 rwatson Exp $");
 
 #include "opt_compat.h"
 #include "opt_mac.h"
@@ -51,10 +53,11 @@
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
-#include <sys/mac.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
+#include <sys/refcount.h>
 #include <sys/sx.h>
+#include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/sysproto.h>
 #include <sys/jail.h>
@@ -62,22 +65,21 @@
 #include <sys/resourcevar.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
+#include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 
+#include <security/audit/audit.h>
+#include <security/mac/mac_framework.h>
+
 static MALLOC_DEFINE(M_CRED, "cred", "credentials");
 
-SYSCTL_DECL(_security);
-SYSCTL_NODE(_security, OID_AUTO, bsd, CTLFLAG_RW, 0,
-    "BSD security policy");
+SYSCTL_NODE(_security, OID_AUTO, bsd, CTLFLAG_RW, 0, "BSD security policy");
 
 #ifndef _SYS_SYSPROTO_H_
 struct getpid_args {
 	int	dummy;
 };
 #endif
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 getpid(struct thread *td, struct getpid_args *uap)
@@ -98,9 +100,6 @@
         int     dummy;
 };
 #endif
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 getppid(struct thread *td, struct getppid_args *uap)
@@ -121,9 +120,6 @@
         int     dummy;
 };
 #endif
-/*
- * MPSAFE
- */
 int
 getpgrp(struct thread *td, struct getpgrp_args *uap)
 {
@@ -141,9 +137,6 @@
 	pid_t	pid;
 };
 #endif
-/*
- * MPSAFE
- */
 int
 getpgid(struct thread *td, struct getpgid_args *uap)
 {
@@ -176,9 +169,6 @@
 	pid_t	pid;
 };
 #endif
-/*
- * MPSAFE
- */
 int
 getsid(struct thread *td, struct getsid_args *uap)
 {
@@ -208,9 +198,6 @@
         int     dummy;
 };
 #endif
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 getuid(struct thread *td, struct getuid_args *uap)
@@ -228,9 +215,6 @@
         int     dummy;
 };
 #endif
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 geteuid(struct thread *td, struct geteuid_args *uap)
@@ -245,9 +229,6 @@
         int     dummy;
 };
 #endif
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 getgid(struct thread *td, struct getgid_args *uap)
@@ -270,9 +251,6 @@
         int     dummy;
 };
 #endif
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 getegid(struct thread *td, struct getegid_args *uap)
@@ -288,28 +266,39 @@
 	gid_t	*gidset;
 };
 #endif
-/*
- * MPSAFE
- */
 int
 getgroups(struct thread *td, register struct getgroups_args *uap)
 {
-	struct ucred *cred;
+	gid_t groups[NGROUPS];
 	u_int ngrp;
 	int error;
 
+	ngrp = MIN(uap->gidsetsize, NGROUPS);
+	error = kern_getgroups(td, &ngrp, groups);
+	if (error)
+		return (error);
+	if (uap->gidsetsize > 0)
+		error = copyout(groups, uap->gidset, ngrp * sizeof(gid_t));
+	if (error == 0)
+		td->td_retval[0] = ngrp;
+	return (error);
+}
+
+int
+kern_getgroups(struct thread *td, u_int *ngrp, gid_t *groups)
+{
+	struct ucred *cred;
+
 	cred = td->td_ucred;
-	if ((ngrp = uap->gidsetsize) == 0) {
-		td->td_retval[0] = cred->cr_ngroups;
+	if (*ngrp == 0) {
+		*ngrp = cred->cr_ngroups;
 		return (0);
 	}
-	if (ngrp < cred->cr_ngroups)
+	if (*ngrp < cred->cr_ngroups)
 		return (EINVAL);
-	ngrp = cred->cr_ngroups;
-	error = copyout(cred->cr_groups, uap->gidset, ngrp * sizeof(gid_t));
-	if (error == 0)
-		td->td_retval[0] = ngrp;
-	return (error);
+	*ngrp = cred->cr_ngroups;
+	bcopy(cred->cr_groups, groups, *ngrp * sizeof(gid_t));
+	return (0);
 }
 
 #ifndef _SYS_SYSPROTO_H_
@@ -317,9 +306,6 @@
         int     dummy;
 };
 #endif
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 setsid(register struct thread *td, struct setsid_args *uap)
@@ -378,9 +364,6 @@
 	int	pgid;		/* target pgrp id */
 };
 #endif
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 setpgid(struct thread *td, register struct setpgid_args *uap)
@@ -481,9 +464,6 @@
 	uid_t	uid;
 };
 #endif
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 setuid(struct thread *td, struct setuid_args *uap)
@@ -495,6 +475,7 @@
 	int error;
 
 	uid = uap->uid;
+	AUDIT_ARG(uid, uid);
 	newcred = crget();
 	uip = uifind(uid);
 	PROC_LOCK(p);
@@ -530,7 +511,7 @@
 #ifdef POSIX_APPENDIX_B_4_2_2	/* Use BSD-compat clause from B.4.2.2 */
 	    uid != oldcred->cr_uid &&		/* allow setuid(geteuid()) */
 #endif
-	    (error = suser_cred(oldcred, SUSER_ALLOWJAIL)) != 0)
+	    (error = priv_check_cred(oldcred, PRIV_CRED_SETUID, 0)) != 0)
 		goto fail;
 
 	/*
@@ -546,7 +527,8 @@
 #ifdef POSIX_APPENDIX_B_4_2_2	/* Use the clause from B.4.2.2 */
 	    uid == oldcred->cr_uid ||
 #endif
-	    suser_cred(oldcred, SUSER_ALLOWJAIL) == 0) /* we are using privs */
+	    /* We are using privs. */
+	    priv_check_cred(oldcred, PRIV_CRED_SETUID, 0) == 0)
 #endif
 	{
 		/*
@@ -594,9 +576,6 @@
 	uid_t	euid;
 };
 #endif
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 seteuid(struct thread *td, struct seteuid_args *uap)
@@ -608,6 +587,7 @@
 	int error;
 
 	euid = uap->euid;
+	AUDIT_ARG(euid, euid);
 	newcred = crget();
 	euip = uifind(euid);
 	PROC_LOCK(p);
@@ -621,7 +601,7 @@
 
 	if (euid != oldcred->cr_ruid &&		/* allow seteuid(getuid()) */
 	    euid != oldcred->cr_svuid &&	/* allow seteuid(saved uid) */
-	    (error = suser_cred(oldcred, SUSER_ALLOWJAIL)) != 0)
+	    (error = priv_check_cred(oldcred, PRIV_CRED_SETEUID, 0)) != 0)
 		goto fail;
 
 	/*
@@ -651,9 +631,6 @@
 	gid_t	gid;
 };
 #endif
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 setgid(struct thread *td, struct setgid_args *uap)
@@ -664,6 +641,7 @@
 	int error;
 
 	gid = uap->gid;
+	AUDIT_ARG(gid, gid);
 	newcred = crget();
 	PROC_LOCK(p);
 	oldcred = p->p_ucred;
@@ -692,7 +670,7 @@
 #ifdef POSIX_APPENDIX_B_4_2_2	/* Use BSD-compat clause from B.4.2.2 */
 	    gid != oldcred->cr_groups[0] && /* allow setgid(getegid()) */
 #endif
-	    (error = suser_cred(oldcred, SUSER_ALLOWJAIL)) != 0)
+	    (error = priv_check_cred(oldcred, PRIV_CRED_SETGID, 0)) != 0)
 		goto fail;
 
 	crcopy(newcred, oldcred);
@@ -705,7 +683,8 @@
 #ifdef POSIX_APPENDIX_B_4_2_2	/* use the clause from B.4.2.2 */
 	    gid == oldcred->cr_groups[0] ||
 #endif
-	    suser_cred(oldcred, SUSER_ALLOWJAIL) == 0) /* we are using privs */
+	    /* We are using privs. */
+	    priv_check_cred(oldcred, PRIV_CRED_SETGID, 0) == 0)
 #endif
 	{
 		/*
@@ -751,9 +730,6 @@
 	gid_t	egid;
 };
 #endif
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 setegid(struct thread *td, struct setegid_args *uap)
@@ -764,6 +740,7 @@
 	int error;
 
 	egid = uap->egid;
+	AUDIT_ARG(egid, egid);
 	newcred = crget();
 	PROC_LOCK(p);
 	oldcred = p->p_ucred;
@@ -776,7 +753,7 @@
 
 	if (egid != oldcred->cr_rgid &&		/* allow setegid(getgid()) */
 	    egid != oldcred->cr_svgid &&	/* allow setegid(saved gid) */
-	    (error = suser_cred(oldcred, SUSER_ALLOWJAIL)) != 0)
+	    (error = priv_check_cred(oldcred, PRIV_CRED_SETEGID, 0)) != 0)
 		goto fail;
 
 	crcopy(newcred, oldcred);
@@ -801,39 +778,42 @@
 	gid_t	*gidset;
 };
 #endif
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 setgroups(struct thread *td, struct setgroups_args *uap)
 {
+	gid_t groups[NGROUPS];
+	int error;
+
+	if (uap->gidsetsize > NGROUPS)
+		return (EINVAL);
+	error = copyin(uap->gidset, groups, uap->gidsetsize * sizeof(gid_t));
+	if (error)
+		return (error);
+	return (kern_setgroups(td, uap->gidsetsize, groups));
+}
+
+int
+kern_setgroups(struct thread *td, u_int ngrp, gid_t *groups)
+{
 	struct proc *p = td->td_proc;
-	struct ucred *newcred, *tempcred, *oldcred;
-	u_int ngrp;
+	struct ucred *newcred, *oldcred;
 	int error;
 
-	ngrp = uap->gidsetsize;
 	if (ngrp > NGROUPS)
 		return (EINVAL);
-	tempcred = crget();
-	error = copyin(uap->gidset, tempcred->cr_groups, ngrp * sizeof(gid_t));
-	if (error != 0) {
-		crfree(tempcred);
-		return (error);
-	}
+	AUDIT_ARG(groupset, groups, ngrp);
 	newcred = crget();
 	PROC_LOCK(p);
 	oldcred = p->p_ucred;
 
 #ifdef MAC
-	error = mac_check_proc_setgroups(p, oldcred, ngrp,
-	    tempcred->cr_groups);
+	error = mac_check_proc_setgroups(p, oldcred, ngrp, groups);
 	if (error)
 		goto fail;
 #endif
 
-	error = suser_cred(oldcred, SUSER_ALLOWJAIL);
+	error = priv_check_cred(oldcred, PRIV_CRED_SETGROUPS, 0);
 	if (error)
 		goto fail;
 
@@ -851,21 +831,18 @@
 		 */
 		newcred->cr_ngroups = 1;
 	} else {
-		bcopy(tempcred->cr_groups, newcred->cr_groups,
-		    ngrp * sizeof(gid_t));
+		bcopy(groups, newcred->cr_groups, ngrp * sizeof(gid_t));
 		newcred->cr_ngroups = ngrp;
 	}
 	setsugid(p);
 	p->p_ucred = newcred;
 	PROC_UNLOCK(p);
-	crfree(tempcred);
 	crfree(oldcred);
 	return (0);
 
 fail:
 	PROC_UNLOCK(p);
 	crfree(newcred);
-	crfree(tempcred);
 	return (error);
 }
 
@@ -875,9 +852,6 @@
 	uid_t	euid;
 };
 #endif
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 setreuid(register struct thread *td, struct setreuid_args *uap)
@@ -890,6 +864,8 @@
 
 	euid = uap->euid;
 	ruid = uap->ruid;
+	AUDIT_ARG(euid, euid);
+	AUDIT_ARG(ruid, ruid);
 	newcred = crget();
 	euip = uifind(euid);
 	ruip = uifind(ruid);
@@ -906,7 +882,7 @@
 	      ruid != oldcred->cr_svuid) ||
 	     (euid != (uid_t)-1 && euid != oldcred->cr_uid &&
 	      euid != oldcred->cr_ruid && euid != oldcred->cr_svuid)) &&
-	    (error = suser_cred(oldcred, SUSER_ALLOWJAIL)) != 0)
+	    (error = priv_check_cred(oldcred, PRIV_CRED_SETREUID, 0)) != 0)
 		goto fail;
 
 	crcopy(newcred, oldcred);
@@ -944,9 +920,6 @@
 	gid_t	egid;
 };
 #endif
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 setregid(register struct thread *td, struct setregid_args *uap)
@@ -958,6 +931,8 @@
 
 	egid = uap->egid;
 	rgid = uap->rgid;
+	AUDIT_ARG(egid, egid);
+	AUDIT_ARG(rgid, rgid);
 	newcred = crget();
 	PROC_LOCK(p);
 	oldcred = p->p_ucred;
@@ -972,7 +947,7 @@
 	    rgid != oldcred->cr_svgid) ||
 	     (egid != (gid_t)-1 && egid != oldcred->cr_groups[0] &&
 	     egid != oldcred->cr_rgid && egid != oldcred->cr_svgid)) &&
-	    (error = suser_cred(oldcred, SUSER_ALLOWJAIL)) != 0)
+	    (error = priv_check_cred(oldcred, PRIV_CRED_SETREGID, 0)) != 0)
 		goto fail;
 
 	crcopy(newcred, oldcred);
@@ -1001,10 +976,9 @@
 }
 
 /*
- * setresuid(ruid, euid, suid) is like setreuid except control over the
- * saved uid is explicit.
+ * setresuid(ruid, euid, suid) is like setreuid except control over the saved
+ * uid is explicit.
  */
-
 #ifndef _SYS_SYSPROTO_H_
 struct setresuid_args {
 	uid_t	ruid;
@@ -1012,9 +986,6 @@
 	uid_t	suid;
 };
 #endif
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 setresuid(register struct thread *td, struct setresuid_args *uap)
@@ -1028,6 +999,9 @@
 	euid = uap->euid;
 	ruid = uap->ruid;
 	suid = uap->suid;
+	AUDIT_ARG(euid, euid);
+	AUDIT_ARG(ruid, ruid);
+	AUDIT_ARG(suid, suid);
 	newcred = crget();
 	euip = uifind(euid);
 	ruip = uifind(ruid);
@@ -1049,7 +1023,7 @@
 	     (suid != (uid_t)-1 && suid != oldcred->cr_ruid &&
 	    suid != oldcred->cr_svuid &&
 	      suid != oldcred->cr_uid)) &&
-	    (error = suser_cred(oldcred, SUSER_ALLOWJAIL)) != 0)
+	    (error = priv_check_cred(oldcred, PRIV_CRED_SETRESUID, 0)) != 0)
 		goto fail;
 
 	crcopy(newcred, oldcred);
@@ -1082,10 +1056,9 @@
 }
 
 /*
- * setresgid(rgid, egid, sgid) is like setregid except control over the
- * saved gid is explicit.
+ * setresgid(rgid, egid, sgid) is like setregid except control over the saved
+ * gid is explicit.
  */
-
 #ifndef _SYS_SYSPROTO_H_
 struct setresgid_args {
 	gid_t	rgid;
@@ -1093,9 +1066,6 @@
 	gid_t	sgid;
 };
 #endif
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 setresgid(register struct thread *td, struct setresgid_args *uap)
@@ -1108,6 +1078,9 @@
 	egid = uap->egid;
 	rgid = uap->rgid;
 	sgid = uap->sgid;
+	AUDIT_ARG(egid, egid);
+	AUDIT_ARG(rgid, rgid);
+	AUDIT_ARG(sgid, sgid);
 	newcred = crget();
 	PROC_LOCK(p);
 	oldcred = p->p_ucred;
@@ -1127,7 +1100,7 @@
 	     (sgid != (gid_t)-1 && sgid != oldcred->cr_rgid &&
 	      sgid != oldcred->cr_svgid &&
 	      sgid != oldcred->cr_groups[0])) &&
-	    (error = suser_cred(oldcred, SUSER_ALLOWJAIL)) != 0)
+	    (error = priv_check_cred(oldcred, PRIV_CRED_SETRESGID, 0)) != 0)
 		goto fail;
 
 	crcopy(newcred, oldcred);
@@ -1161,9 +1134,6 @@
 	uid_t	*suid;
 };
 #endif
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 getresuid(register struct thread *td, struct getresuid_args *uap)
@@ -1191,9 +1161,6 @@
 	gid_t	*sgid;
 };
 #endif
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 getresgid(register struct thread *td, struct getresgid_args *uap)
@@ -1219,9 +1186,6 @@
 	int dummy;
 };
 #endif
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 issetugid(register struct thread *td, struct issetugid_args *uap)
@@ -1242,9 +1206,6 @@
 	return (0);
 }
 
-/*
- * MPSAFE
- */
 int
 __setugid(struct thread *td, struct __setugid_args *uap)
 {
@@ -1274,8 +1235,6 @@
 
 /*
  * Check if gid is a member of the group set.
- *
- * MPSAFE (cred must be held)
  */
 int
 groupmember(gid_t gid, struct ucred *cred)
@@ -1291,66 +1250,13 @@
 }
 
 /*
- * `suser_enabled' (which can be set by the security.suser_enabled
- * sysctl) determines whether the system 'super-user' policy is in effect.
- * If it is nonzero, an effective uid of 0 connotes special privilege,
- * overriding many mandatory and discretionary protections.  If it is zero,
- * uid 0 is offered no special privilege in the kernel security policy.
- * Setting it to zero may seriously impact the functionality of many
- * existing userland programs, and should not be done without careful
- * consideration of the consequences.
- */
-int	suser_enabled = 1;
-SYSCTL_INT(_security_bsd, OID_AUTO, suser_enabled, CTLFLAG_RW,
-    &suser_enabled, 0, "processes with uid 0 have privilege");
-TUNABLE_INT("security.bsd.suser_enabled", &suser_enabled);
-
-/*
- * Test whether the specified credentials imply "super-user" privilege.
- * Return 0 or EPERM.
- */
-int
-suser_cred(struct ucred *cred, int flag)
-{
-
-	if (!suser_enabled)
-		return (EPERM);
-	if (((flag & SUSER_RUID) ? cred->cr_ruid : cred->cr_uid) != 0)
-		return (EPERM);
-	if (jailed(cred) && !(flag & SUSER_ALLOWJAIL))
-		return (EPERM);
-	return (0);
-}
-
-/*
- * Shortcut to hide contents of struct td and struct proc from the
- * caller, promoting binary compatibility.
- */
-int
-suser(struct thread *td)
-{
-
-#ifdef INVARIANTS
-	if (td != curthread) {
-		printf("suser: thread %p (%d %s) != curthread %p (%d %s)\n",
-		    td, td->td_proc->p_pid, td->td_proc->p_comm,
-		    curthread, curthread->td_proc->p_pid,
-		    curthread->td_proc->p_comm);
-#ifdef KDB
-		kdb_backtrace();
-#endif
-	}
-#endif
-	return (suser_cred(td->td_ucred, 0));
-}
-
-/*
  * Test the active securelevel against a given level.  securelevel_gt()
  * implements (securelevel > level).  securelevel_ge() implements
  * (securelevel >= level).  Note that the logic is inverted -- these
  * functions return EPERM on "success" and 0 on "failure".
  *
- * MPSAFE
+ * XXXRW: Possibly since this has to do with privilege, it should move to
+ * kern_priv.c.
  */
 int
 securelevel_gt(struct ucred *cr, int level)
@@ -1402,7 +1308,7 @@
 {
 
 	if (!see_other_uids && u1->cr_ruid != u2->cr_ruid) {
-		if (suser_cred(u1, SUSER_ALLOWJAIL) != 0)
+		if (priv_check_cred(u1, PRIV_SEEOTHERUIDS, 0) != 0)
 			return (ESRCH);
 	}
 	return (0);
@@ -1441,7 +1347,7 @@
 				break;
 		}
 		if (!match) {
-			if (suser_cred(u1, SUSER_ALLOWJAIL) != 0)
+			if (priv_check_cred(u1, PRIV_SEEOTHERGIDS, 0) != 0)
 				return (ESRCH);
 		}
 	}
@@ -1558,7 +1464,7 @@
 			break;
 		default:
 			/* Not permitted without privilege. */
-			error = suser_cred(cred, SUSER_ALLOWJAIL);
+			error = priv_check_cred(cred, PRIV_SIGNAL_SUGID, 0);
 			if (error)
 				return (error);
 		}
@@ -1572,8 +1478,7 @@
 	    cred->cr_ruid != proc->p_ucred->cr_svuid &&
 	    cred->cr_uid != proc->p_ucred->cr_ruid &&
 	    cred->cr_uid != proc->p_ucred->cr_svuid) {
-		/* Not permitted without privilege. */
-		error = suser_cred(cred, SUSER_ALLOWJAIL);
+		error = priv_check_cred(cred, PRIV_SIGNAL_DIFFCRED, 0);
 		if (error)
 			return (error);
 	}
@@ -1581,7 +1486,6 @@
 	return (0);
 }
 
-
 /*-
  * Determine whether td may deliver the specified signal to p.
  * Returns: 0 for permitted, an errno value otherwise
@@ -1650,19 +1554,13 @@
 		return (error);
 	if ((error = cr_seeothergids(td->td_ucred, p->p_ucred)))
 		return (error);
-	if (td->td_ucred->cr_ruid == p->p_ucred->cr_ruid)
-		return (0);
-	if (td->td_ucred->cr_uid == p->p_ucred->cr_ruid)
-		return (0);
-	if (suser_cred(td->td_ucred, SUSER_ALLOWJAIL) == 0)
-		return (0);
-
-#ifdef CAPABILITIES
-	if (!cap_check(NULL, td, CAP_SYS_NICE, SUSER_ALLOWJAIL))
-		return (0);
-#endif
-
-	return (EPERM);
+	if (td->td_ucred->cr_ruid != p->p_ucred->cr_ruid &&
+	    td->td_ucred->cr_uid != p->p_ucred->cr_ruid) {
+		error = priv_check(td, PRIV_SCHED_DIFFCRED);
+		if (error)
+			return (error);
+	}
+	return (0);
 }
 
 /*
@@ -1697,7 +1595,7 @@
 	KASSERT(td == curthread, ("%s: td not curthread", __func__));
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	if (!unprivileged_proc_debug) {
-		error = suser_cred(td->td_ucred, SUSER_ALLOWJAIL);
+		error = priv_check(td, PRIV_DEBUG_UNPRIV);
 		if (error)
 			return (error);
 	}
@@ -1745,11 +1643,16 @@
 	/*
 	 * If p's gids aren't a subset, or the uids aren't a subset,
 	 * or the credential has changed, require appropriate privilege
-	 * for td to debug p.  For POSIX.1e capabilities, this will
-	 * require CAP_SYS_PTRACE.
+	 * for td to debug p.
 	 */
-	if (!grpsubset || !uidsubset || credentialchanged) {
-		error = suser_cred(td->td_ucred, SUSER_ALLOWJAIL);
+	if (!grpsubset || !uidsubset) {
+		error = priv_check(td, PRIV_DEBUG_DIFFCRED);
+		if (error)
+			return (error);
+	}
+
+	if (credentialchanged) {
+		error = priv_check(td, PRIV_DEBUG_SUGID);
 		if (error)
 			return (error);
 	}
@@ -1763,6 +1666,7 @@
 
 	/*
 	 * Can't trace a process that's currently exec'ing.
+	 *
 	 * XXX: Note, this is not a security policy decision, it's a
 	 * basic correctness/functionality decision.  Therefore, this check
 	 * should be moved to the caller's of p_candebug().
@@ -1833,7 +1737,6 @@
 
 /*
  * Allocate a zeroed cred structure.
- * MPSAFE
  */
 struct ucred *
 crget(void)
@@ -1841,8 +1744,10 @@
 	register struct ucred *cr;
 
 	MALLOC(cr, struct ucred *, sizeof(*cr), M_CRED, M_WAITOK | M_ZERO);
-	cr->cr_ref = 1;
-	cr->cr_mtxp = mtx_pool_find(mtxpool_sleep, cr);
+	refcount_init(&cr->cr_ref, 1);
+#ifdef AUDIT
+	audit_cred_init(cr);
+#endif
 #ifdef MAC
 	mac_init_cred(cr);
 #endif
@@ -1851,32 +1756,25 @@
 
 /*
  * Claim another reference to a ucred structure.
- * MPSAFE
  */
 struct ucred *
 crhold(struct ucred *cr)
 {
 
-	mtx_lock(cr->cr_mtxp);
-	cr->cr_ref++;
-	mtx_unlock(cr->cr_mtxp);
+	refcount_acquire(&cr->cr_ref);
 	return (cr);
 }
 
 /*
- * Free a cred structure.
- * Throws away space when ref count gets to 0.
- * MPSAFE
+ * Free a cred structure.  Throws away space when ref count gets to 0.
  */
 void
 crfree(struct ucred *cr)
 {
-	struct mtx *mtxp = cr->cr_mtxp;
 
-	mtx_lock(mtxp);
 	KASSERT(cr->cr_ref > 0, ("bad ucred refcount: %d", cr->cr_ref));
-	if (--cr->cr_ref == 0) {
-		mtx_unlock(mtxp);
+	KASSERT(cr->cr_ref != 0xdeadc0de, ("dangling reference to ucred"));
+	if (refcount_release(&cr->cr_ref)) {
 		/*
 		 * Some callers of crget(), such as nfs_statfs(),
 		 * allocate a temporary credential, but don't
@@ -1891,33 +1789,28 @@
 		 */
 		if (jailed(cr))
 			prison_free(cr->cr_prison);
+#ifdef AUDIT
+		audit_cred_destroy(cr);
+#endif
 #ifdef MAC
 		mac_destroy_cred(cr);
 #endif
 		FREE(cr, M_CRED);
-	} else {
-		mtx_unlock(mtxp);
 	}
 }
 
 /*
  * Check to see if this ucred is shared.
- * MPSAFE
  */
 int
 crshared(struct ucred *cr)
 {
-	int shared;
 
-	mtx_lock(cr->cr_mtxp);
-	shared = (cr->cr_ref > 1);
-	mtx_unlock(cr->cr_mtxp);
-	return (shared);
+	return (cr->cr_ref > 1);
 }
 
 /*
  * Copy a ucred's contents from a template.  Does not block.
- * MPSAFE
  */
 void
 crcopy(struct ucred *dest, struct ucred *src)
@@ -1931,6 +1824,9 @@
 	uihold(dest->cr_ruidinfo);
 	if (jailed(dest))
 		prison_hold(dest->cr_prison);
+#ifdef AUDIT
+	audit_cred_copy(src, dest);
+#endif
 #ifdef MAC
 	mac_copy_cred(src, dest);
 #endif
@@ -1938,7 +1834,6 @@
 
 /*
  * Dup cred struct to a new held one.
- * MPSAFE
  */
 struct ucred *
 crdup(struct ucred *cr)
@@ -1952,7 +1847,6 @@
 
 /*
  * Fill in a struct xucred based on a struct ucred.
- * MPSAFE
  */
 void
 cru2x(struct ucred *cr, struct xucred *xcr)
@@ -1966,9 +1860,8 @@
 }
 
 /*
- * small routine to swap a thread's current ucred for the correct one
- * taken from the process.
- * MPSAFE
+ * small routine to swap a thread's current ucred for the correct one taken
+ * from the process.
  */
 void
 cred_update_thread(struct thread *td)
@@ -1994,9 +1887,6 @@
 	u_int	namelen;
 };
 #endif
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 getlogin(struct thread *td, struct getlogin_args *uap)
@@ -2024,9 +1914,6 @@
 	char	*namebuf;
 };
 #endif
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 setlogin(struct thread *td, struct setlogin_args *uap)
@@ -2035,7 +1922,7 @@
 	int error;
 	char logintmp[MAXLOGNAME];
 
-	error = suser_cred(td->td_ucred, SUSER_ALLOWJAIL);
+	error = priv_check(td, PRIV_PROC_SETLOGIN);
 	if (error)
 		return (error);
 	error = copyinstr(uap->namebuf, logintmp, sizeof(logintmp), NULL);
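
Most of the kern_prot.c churn above is mechanical: each anonymous
suser()/suser_cred() test becomes a check for a named privilege, and the
decision about whether jailed root may hold that privilege moves into
prison_priv_check().  A before/after sketch of the pattern; xxx_do_thing() is a
made-up caller, while PRIV_CRED_SETUID and the priv_check_cred() signature are
the ones used throughout this diff.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/ucred.h>

static int
xxx_do_thing(struct thread *td)
{
	int error;

#if 0
	/* Old style: generic superuser test, jail permitted via a flag. */
	error = suser_cred(td->td_ucred, SUSER_ALLOWJAIL);
#else
	/* New style: named privilege; prison_priv_check() decides whether
	 * jailed root may hold PRIV_CRED_SETUID. */
	error = priv_check_cred(td->td_ucred, PRIV_CRED_SETUID, 0);
#endif
	if (error)
		return (error);
	/* ... privileged work ... */
	return (0);
}

Callers that already run as curthread use the priv_check(td, PRIV_...)
shorthand seen elsewhere in the diff.
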
Index: kern_context.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_context.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/kern_context.c -L sys/kern/kern_context.c -u -r1.1.1.1 -r1.2
--- sys/kern/kern_context.c
+++ sys/kern/kern_context.c
@@ -25,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_context.c,v 1.7 2003/11/09 20:31:03 marcel Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_context.c,v 1.9 2007/03/05 13:10:57 rwatson Exp $");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
@@ -39,9 +39,9 @@
 #include <sys/ucontext.h>
 
 /*
- * The first two fields of a ucontext_t are the signal mask and
- * the machine context.  The next field is uc_link; we want to
- * avoid destroying the link when copying out contexts.
+ * The first two fields of a ucontext_t are the signal mask and the machine
+ * context.  The next field is uc_link; we want to avoid destroying the link
+ * when copying out contexts.
  */
 #define	UC_COPY_SIZE	offsetof(ucontext_t, uc_link)
 
@@ -58,9 +58,6 @@
 }
 #endif
 
-/*
- * MPSAFE
- */
 int
 getcontext(struct thread *td, struct getcontext_args *uap)
 {
@@ -79,9 +76,6 @@
 	return (ret);
 }
 
-/*
- * MPSAFE
- */
 int
 setcontext(struct thread *td, struct setcontext_args *uap)
 {
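
The UC_COPY_SIZE constant in kern_context.c copies only the fields that precede
uc_link, so a context chain set up by the caller survives get/setcontext round
trips.  A userland sketch of the same offsetof() trick, assuming the FreeBSD
field order described in the comment (signal mask and machine context first,
uc_link right after):

#include <stddef.h>
#include <string.h>
#include <ucontext.h>

#define UC_COPY_SIZE	offsetof(ucontext_t, uc_link)

/* Copy the signal mask and machine context from src to dst; uc_link and
 * everything after it in dst are untouched because the copy stops there. */
static void
copy_context_head(ucontext_t *dst, const ucontext_t *src)
{

	memcpy(dst, src, UC_COPY_SIZE);
}

int
main(void)
{
	ucontext_t a, b;

	getcontext(&a);
	getcontext(&b);
	b.uc_link = &a;			/* caller's chain */
	copy_context_head(&b, &a);	/* refresh b without breaking the chain */
	return (b.uc_link == &a ? 0 : 1);
}
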
--- sys/kern/kern_mac.c
+++ /dev/null
@@ -1,1250 +0,0 @@
-/*-
- * Copyright (c) 1999-2002 Robert N. M. Watson
- * Copyright (c) 2001 Ilmar S. Habibulin
- * Copyright (c) 2001-2005 Networks Associates Technology, Inc.
- * All rights reserved.
- *
- * This software was developed by Robert Watson and Ilmar Habibulin for the
- * TrustedBSD Project.
- *
- * This software was developed for the FreeBSD Project in part by Network
- * Associates Laboratories, the Security Research Division of Network
- * Associates, Inc. under DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"),
- * as part of the DARPA CHATS research program.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-/*-
- * Framework for extensible kernel access control.  This file contains
- * Kernel and userland interface to the framework, policy registration
- * and composition.  Per-object interfaces, controls, and labeling may be
- * found in src/sys/security/mac/.  Sample policies may be found in
- * src/sys/security/mac_*.
- */
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_mac.c,v 1.117.2.2 2006/03/22 17:34:39 tegge Exp $");
-
-#include "opt_mac.h"
-#include "opt_devfs.h"
-
-#include <sys/param.h>
-#include <sys/condvar.h>
-#include <sys/extattr.h>
-#include <sys/imgact.h>
-#include <sys/kernel.h>
-#include <sys/lock.h>
-#include <sys/malloc.h>
-#include <sys/mutex.h>
-#include <sys/mac.h>
-#include <sys/module.h>
-#include <sys/proc.h>
-#include <sys/sbuf.h>
-#include <sys/systm.h>
-#include <sys/sysproto.h>
-#include <sys/sysent.h>
-#include <sys/vnode.h>
-#include <sys/mount.h>
-#include <sys/file.h>
-#include <sys/namei.h>
-#include <sys/socket.h>
-#include <sys/pipe.h>
-#include <sys/socketvar.h>
-#include <sys/sysctl.h>
-
-#include <vm/vm.h>
-#include <vm/pmap.h>
-#include <vm/vm_map.h>
-#include <vm/vm_object.h>
-
-#include <sys/mac_policy.h>
-
-#include <fs/devfs/devfs.h>
-
-#include <net/bpfdesc.h>
-#include <net/if.h>
-#include <net/if_var.h>
-
-#include <netinet/in.h>
-#include <netinet/ip_var.h>
-
-#include <security/mac/mac_internal.h>
-
-#ifdef MAC
-
-/*
- * Declare that the kernel provides MAC support, version 1.  This permits
- * modules to refuse to be loaded if the necessary support isn't present,
- * even if it's pre-boot.
- */
-MODULE_VERSION(kernel_mac_support, 2);
-
-SYSCTL_NODE(_security, OID_AUTO, mac, CTLFLAG_RW, 0,
-    "TrustedBSD MAC policy controls");
-
-#if MAC_MAX_SLOTS > 32
-#error "MAC_MAX_SLOTS too large"
-#endif
-
-static unsigned int mac_max_slots = MAC_MAX_SLOTS;
-static unsigned int mac_slot_offsets_free = (1 << MAC_MAX_SLOTS) - 1;
-SYSCTL_UINT(_security_mac, OID_AUTO, max_slots, CTLFLAG_RD,
-    &mac_max_slots, 0, "");
-
-/*
- * Has the kernel started generating labeled objects yet?  All read/write
- * access to this variable is serialized during the boot process.  Following
- * the end of serialization, we don't update this flag; no locking.
- */
-int	mac_late = 0;
-
-/*
- * Flag to indicate whether or not we should allocate label storage for
- * new mbufs.  Since most dynamic policies we currently work with don't
- * rely on mbuf labeling, try to avoid paying the cost of mtag allocation
- * unless specifically notified of interest.  One result of this is
- * that if a dynamically loaded policy requests mbuf labels, it must
- * be able to deal with a NULL label being returned on any mbufs that
- * were already in flight when the policy was loaded.  Since the policy
- * already has to deal with uninitialized labels, this probably won't
- * be a problem.  Note: currently no locking.  Will this be a problem?
- */
-#ifndef MAC_ALWAYS_LABEL_MBUF
-int	mac_labelmbufs = 0;
-#endif
-
-#ifdef MAC_DEBUG
-SYSCTL_NODE(_security_mac, OID_AUTO, debug, CTLFLAG_RW, 0,
-    "TrustedBSD MAC debug info");
-SYSCTL_NODE(_security_mac_debug, OID_AUTO, counters, CTLFLAG_RW, 0,
-    "TrustedBSD MAC object counters");
-
-static unsigned int nmactemp;
-SYSCTL_UINT(_security_mac_debug_counters, OID_AUTO, temp, CTLFLAG_RD,
-    &nmactemp, 0, "number of temporary labels in use");
-#endif
-
-static int	mac_policy_register(struct mac_policy_conf *mpc);
-static int	mac_policy_unregister(struct mac_policy_conf *mpc);
-
-MALLOC_DEFINE(M_MACTEMP, "mactemp", "MAC temporary label storage");
-
-/*
- * mac_static_policy_list holds a list of policy modules that are not
- * loaded while the system is "live", and cannot be unloaded.  These
- * policies can be invoked without holding the busy count.
- *
- * mac_policy_list stores the list of dynamic policies.  A busy count is
- * maintained for the list, stored in mac_policy_busy.  The busy count
- * is protected by mac_policy_mtx; the list may be modified only
- * while the busy count is 0, requiring that the lock be held to
- * prevent new references to the list from being acquired.  For almost
- * all operations, incrementing the busy count is sufficient to
- * guarantee consistency, as the list cannot be modified while the
- * busy count is elevated.  For a few special operations involving a
- * change to the list of active policies, the mtx itself must be held.
- * A condition variable, mac_policy_cv, is used to signal potential
- * exclusive consumers that they should try to acquire the lock if a
- * first attempt at exclusive access fails.
- */
-#ifndef MAC_STATIC
-static struct mtx mac_policy_mtx;
-static struct cv mac_policy_cv;
-static int mac_policy_count;
-#endif
-struct mac_policy_list_head mac_policy_list;
-struct mac_policy_list_head mac_static_policy_list;
-
-/*
- * We manually invoke WITNESS_WARN() to allow Witness to generate
- * warnings even if we don't end up ever triggering the wait at
- * run-time.  The consumer of the exclusive interface must not hold
- * any locks (other than potentially Giant) since we may sleep for
- * long (potentially indefinite) periods of time waiting for the
- * framework to become quiescent so that a policy list change may
- * be made.
- */
-void
-mac_policy_grab_exclusive(void)
-{
-
-#ifndef MAC_STATIC
-	if (!mac_late)
-		return;
-
-	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
- 	    "mac_policy_grab_exclusive() at %s:%d", __FILE__, __LINE__);
-	mtx_lock(&mac_policy_mtx);
-	while (mac_policy_count != 0)
-		cv_wait(&mac_policy_cv, &mac_policy_mtx);
-#endif
-}
-
-void
-mac_policy_assert_exclusive(void)
-{
-
-#ifndef MAC_STATIC
-	if (!mac_late)
-		return;
-
-	mtx_assert(&mac_policy_mtx, MA_OWNED);
-	KASSERT(mac_policy_count == 0,
-	    ("mac_policy_assert_exclusive(): not exclusive"));
-#endif
-}
-
-void
-mac_policy_release_exclusive(void)
-{
-
-#ifndef MAC_STATIC
-	if (!mac_late)
-		return;
-
-	KASSERT(mac_policy_count == 0,
-	    ("mac_policy_release_exclusive(): not exclusive"));
-	mtx_unlock(&mac_policy_mtx);
-	cv_signal(&mac_policy_cv);
-#endif
-}
-
-void
-mac_policy_list_busy(void)
-{
-
-#ifndef MAC_STATIC
-	if (!mac_late)
-		return;
-
-	mtx_lock(&mac_policy_mtx);
-	mac_policy_count++;
-	mtx_unlock(&mac_policy_mtx);
-#endif
-}
-
-int
-mac_policy_list_conditional_busy(void)
-{
-#ifndef MAC_STATIC
-	int ret;
-
-	if (!mac_late)
-		return (1);
-
-	mtx_lock(&mac_policy_mtx);
-	if (!LIST_EMPTY(&mac_policy_list)) {
-		mac_policy_count++;
-		ret = 1;
-	} else
-		ret = 0;
-	mtx_unlock(&mac_policy_mtx);
-	return (ret);
-#else
-	if (!mac_late)
-		return (1);
-
-	return (1);
-#endif
-}
-
-void
-mac_policy_list_unbusy(void)
-{
-
-#ifndef MAC_STATIC
-	if (!mac_late)
-		return;
-
-	mtx_lock(&mac_policy_mtx);
-	mac_policy_count--;
-	KASSERT(mac_policy_count >= 0, ("MAC_POLICY_LIST_LOCK"));
-	if (mac_policy_count == 0)
-		cv_signal(&mac_policy_cv);
-	mtx_unlock(&mac_policy_mtx);
-#endif
-}
-
-/*
- * Initialize the MAC subsystem, including appropriate SMP locks.
- */
-static void
-mac_init(void)
-{
-
-	LIST_INIT(&mac_static_policy_list);
-	LIST_INIT(&mac_policy_list);
-	mac_labelzone_init();
-
-#ifndef MAC_STATIC
-	mtx_init(&mac_policy_mtx, "mac_policy_mtx", NULL, MTX_DEF);
-	cv_init(&mac_policy_cv, "mac_policy_cv");
-#endif
-}
-
-/*
- * For the purposes of modules that want to know if they were loaded
- * "early", set the mac_late flag once we've processed modules either
- * linked into the kernel, or loaded before the kernel startup.
- */
-static void
-mac_late_init(void)
-{
-
-	mac_late = 1;
-}
-
-/*
- * After the policy list has changed, walk the list to update any global
- * flags.  Currently, we support only one flag, and it's conditionally
- * defined; as a result, the entire function is conditional.  Eventually,
- * the #else case might also iterate across the policies.
- */
-static void
-mac_policy_updateflags(void)
-{
-#ifndef MAC_ALWAYS_LABEL_MBUF
-	struct mac_policy_conf *tmpc;
-	int labelmbufs;
-
-	mac_policy_assert_exclusive();
-
-	labelmbufs = 0;
-	LIST_FOREACH(tmpc, &mac_static_policy_list, mpc_list) {
-		if (tmpc->mpc_loadtime_flags & MPC_LOADTIME_FLAG_LABELMBUFS)
-			labelmbufs++;
-	}
-	LIST_FOREACH(tmpc, &mac_policy_list, mpc_list) {
-		if (tmpc->mpc_loadtime_flags & MPC_LOADTIME_FLAG_LABELMBUFS)
-			labelmbufs++;
-	}
-	mac_labelmbufs = (labelmbufs != 0);
-#endif
-}
-
-/*
- * Allow MAC policy modules to register during boot, etc.
- */
-int
-mac_policy_modevent(module_t mod, int type, void *data)
-{
-	struct mac_policy_conf *mpc;
-	int error;
-
-	error = 0;
-	mpc = (struct mac_policy_conf *) data;
-
-#ifdef MAC_STATIC
-	if (mac_late) {
-		printf("mac_policy_modevent: MAC_STATIC and late\n");
-		return (EBUSY);
-	}
-#endif
-
-	switch (type) {
-	case MOD_LOAD:
-		if (mpc->mpc_loadtime_flags & MPC_LOADTIME_FLAG_NOTLATE &&
-		    mac_late) {
-			printf("mac_policy_modevent: can't load %s policy "
-			    "after booting\n", mpc->mpc_name);
-			error = EBUSY;
-			break;
-		}
-		error = mac_policy_register(mpc);
-		break;
-	case MOD_UNLOAD:
-		/* Don't unregister the module if it was never registered. */
-		if ((mpc->mpc_runtime_flags & MPC_RUNTIME_FLAG_REGISTERED)
-		    != 0)
-			error = mac_policy_unregister(mpc);
-		else
-			error = 0;
-		break;
-	default:
-		error = EOPNOTSUPP;
-		break;
-	}
-
-	return (error);
-}
-
-static int
-mac_policy_register(struct mac_policy_conf *mpc)
-{
-	struct mac_policy_conf *tmpc;
-	int error, slot, static_entry;
-
-	error = 0;
-
-	/*
-	 * We don't technically need exclusive access while !mac_late,
-	 * but hold it for assertion consistency.
-	 */
-	mac_policy_grab_exclusive();
-
-	/*
-	 * If the module can potentially be unloaded, or we're loading
-	 * late, we have to stick it in the non-static list and pay
-	 * an extra performance overhead.  Otherwise, we can pay a
-	 * light locking cost and stick it in the static list.
-	 */
-	static_entry = (!mac_late &&
-	    !(mpc->mpc_loadtime_flags & MPC_LOADTIME_FLAG_UNLOADOK));
-
-	if (static_entry) {
-		LIST_FOREACH(tmpc, &mac_static_policy_list, mpc_list) {
-			if (strcmp(tmpc->mpc_name, mpc->mpc_name) == 0) {
-				error = EEXIST;
-				goto out;
-			}
-		}
-	} else {
-		LIST_FOREACH(tmpc, &mac_policy_list, mpc_list) {
-			if (strcmp(tmpc->mpc_name, mpc->mpc_name) == 0) {
-				error = EEXIST;
-				goto out;
-			}
-		}
-	}
-	if (mpc->mpc_field_off != NULL) {
-		slot = ffs(mac_slot_offsets_free);
-		if (slot == 0) {
-			error = ENOMEM;
-			goto out;
-		}
-		slot--;
-		mac_slot_offsets_free &= ~(1 << slot);
-		*mpc->mpc_field_off = slot;
-	}
-	mpc->mpc_runtime_flags |= MPC_RUNTIME_FLAG_REGISTERED;
-
-	/*
-	 * If we're loading a MAC module after the framework has
-	 * initialized, it has to go into the dynamic list.  If
-	 * we're loading it before we've finished initializing,
-	 * it can go into the static list with weaker locker
-	 * requirements.
-	 */
-	if (static_entry)
-		LIST_INSERT_HEAD(&mac_static_policy_list, mpc, mpc_list);
-	else
-		LIST_INSERT_HEAD(&mac_policy_list, mpc, mpc_list);
-
-	/* Per-policy initialization. */
-	if (mpc->mpc_ops->mpo_init != NULL)
-		(*(mpc->mpc_ops->mpo_init))(mpc);
-	mac_policy_updateflags();
-
-	printf("Security policy loaded: %s (%s)\n", mpc->mpc_fullname,
-	    mpc->mpc_name);
-
-out:
-	mac_policy_release_exclusive();
-	return (error);
-}
-
-static int
-mac_policy_unregister(struct mac_policy_conf *mpc)
-{
-
-	/*
-	 * If we fail the load, we may get a request to unload.  Check
-	 * to see if we did the run-time registration, and if not,
-	 * silently succeed.
-	 */
-	mac_policy_grab_exclusive();
-	if ((mpc->mpc_runtime_flags & MPC_RUNTIME_FLAG_REGISTERED) == 0) {
-		mac_policy_release_exclusive();
-		return (0);
-	}
-#if 0
-	/*
-	 * Don't allow unloading modules with private data.
-	 */
-	if (mpc->mpc_field_off != NULL) {
-		MAC_POLICY_LIST_UNLOCK();
-		return (EBUSY);
-	}
-#endif
-	/*
-	 * Only allow the unload to proceed if the module is unloadable
-	 * by its own definition.
-	 */
-	if ((mpc->mpc_loadtime_flags & MPC_LOADTIME_FLAG_UNLOADOK) == 0) {
-		mac_policy_release_exclusive();
-		return (EBUSY);
-	}
-	if (mpc->mpc_ops->mpo_destroy != NULL)
-		(*(mpc->mpc_ops->mpo_destroy))(mpc);
-
-	LIST_REMOVE(mpc, mpc_list);
-	mpc->mpc_runtime_flags &= ~MPC_RUNTIME_FLAG_REGISTERED;
-	mac_policy_updateflags();
-
-	mac_policy_release_exclusive();
-
-	printf("Security policy unload: %s (%s)\n", mpc->mpc_fullname,
-	    mpc->mpc_name);
-
-	return (0);
-}
-
-/*
- * Define an error value precedence, and given two arguments, selects the
- * value with the higher precedence.
- */
-int
-mac_error_select(int error1, int error2)
-{
-
-	/* Certain decision-making errors take top priority. */
-	if (error1 == EDEADLK || error2 == EDEADLK)
-		return (EDEADLK);
-
-	/* Invalid arguments should be reported where possible. */
-	if (error1 == EINVAL || error2 == EINVAL)
-		return (EINVAL);
-
-	/* Precedence goes to "visibility", with both process and file. */
-	if (error1 == ESRCH || error2 == ESRCH)
-		return (ESRCH);
-
-	if (error1 == ENOENT || error2 == ENOENT)
-		return (ENOENT);
-
-	/* Precedence goes to DAC/MAC protections. */
-	if (error1 == EACCES || error2 == EACCES)
-		return (EACCES);
-
-	/* Precedence goes to privilege. */
-	if (error1 == EPERM || error2 == EPERM)
-		return (EPERM);
-
-	/* Precedence goes to error over success; otherwise, arbitrary. */
-	if (error1 != 0)
-		return (error1);
-	return (error2);
-}
-
-void
-mac_init_label(struct label *label)
-{
-
-	bzero(label, sizeof(*label));
-	label->l_flags = MAC_FLAG_INITIALIZED;
-}
-
-void
-mac_destroy_label(struct label *label)
-{
-
-	KASSERT(label->l_flags & MAC_FLAG_INITIALIZED,
-	    ("destroying uninitialized label"));
-
-	bzero(label, sizeof(*label));
-	/* implicit: label->l_flags &= ~MAC_FLAG_INITIALIZED; */
-}
-
-int
-mac_check_structmac_consistent(struct mac *mac)
-{
-
-	if (mac->m_buflen < 0 ||
-	    mac->m_buflen > MAC_MAX_LABEL_BUF_LEN)
-		return (EINVAL);
-
-	return (0);
-}
-
-/*
- * MPSAFE
- */
-int
-__mac_get_pid(struct thread *td, struct __mac_get_pid_args *uap)
-{
-	char *elements, *buffer;
-	struct mac mac;
-	struct proc *tproc;
-	struct ucred *tcred;
-	int error;
-
-	error = copyin(uap->mac_p, &mac, sizeof(mac));
-	if (error)
-		return (error);
-
-	error = mac_check_structmac_consistent(&mac);
-	if (error)
-		return (error);
-
-	tproc = pfind(uap->pid);
-	if (tproc == NULL)
-		return (ESRCH);
-
-	tcred = NULL;				/* Satisfy gcc. */
-	error = p_cansee(td, tproc);
-	if (error == 0)
-		tcred = crhold(tproc->p_ucred);
-	PROC_UNLOCK(tproc);
-	if (error)
-		return (error);
-
-	elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
-	error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL);
-	if (error) {
-		free(elements, M_MACTEMP);
-		crfree(tcred);
-		return (error);
-	}
-
-	buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO);
-	error = mac_externalize_cred_label(tcred->cr_label, elements,
-	    buffer, mac.m_buflen);
-	if (error == 0)
-		error = copyout(buffer, mac.m_string, strlen(buffer)+1);
-
-	free(buffer, M_MACTEMP);
-	free(elements, M_MACTEMP);
-	crfree(tcred);
-	return (error);
-}
-
-/*
- * MPSAFE
- */
-int
-__mac_get_proc(struct thread *td, struct __mac_get_proc_args *uap)
-{
-	char *elements, *buffer;
-	struct mac mac;
-	int error;
-
-	error = copyin(uap->mac_p, &mac, sizeof(mac));
-	if (error)
-		return (error);
-
-	error = mac_check_structmac_consistent(&mac);
-	if (error)
-		return (error);
-
-	elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
-	error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL);
-	if (error) {
-		free(elements, M_MACTEMP);
-		return (error);
-	}
-
-	buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO);
-	error = mac_externalize_cred_label(td->td_ucred->cr_label,
-	    elements, buffer, mac.m_buflen);
-	if (error == 0)
-		error = copyout(buffer, mac.m_string, strlen(buffer)+1);
-
-	free(buffer, M_MACTEMP);
-	free(elements, M_MACTEMP);
-	return (error);
-}
-
-/*
- * MPSAFE
- */
-int
-__mac_set_proc(struct thread *td, struct __mac_set_proc_args *uap)
-{
-	struct ucred *newcred, *oldcred;
-	struct label *intlabel;
-	struct proc *p;
-	struct mac mac;
-	char *buffer;
-	int error;
-
-	error = copyin(uap->mac_p, &mac, sizeof(mac));
-	if (error)
-		return (error);
-
-	error = mac_check_structmac_consistent(&mac);
-	if (error)
-		return (error);
-
-	buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
-	error = copyinstr(mac.m_string, buffer, mac.m_buflen, NULL);
-	if (error) {
-		free(buffer, M_MACTEMP);
-		return (error);
-	}
-
-	intlabel = mac_cred_label_alloc();
-	error = mac_internalize_cred_label(intlabel, buffer);
-	free(buffer, M_MACTEMP);
-	if (error)
-		goto out;
-
-	newcred = crget();
-
-	p = td->td_proc;
-	PROC_LOCK(p);
-	oldcred = p->p_ucred;
-
-	error = mac_check_cred_relabel(oldcred, intlabel);
-	if (error) {
-		PROC_UNLOCK(p);
-		crfree(newcred);
-		goto out;
-	}
-
-	setsugid(p);
-	crcopy(newcred, oldcred);
-	mac_relabel_cred(newcred, intlabel);
-	p->p_ucred = newcred;
-
-	/*
-	 * Grab additional reference for use while revoking mmaps, prior
-	 * to releasing the proc lock and sharing the cred.
-	 */
-	crhold(newcred);
-	PROC_UNLOCK(p);
-
-	if (mac_enforce_vm) {
-		mac_cred_mmapped_drop_perms(td, newcred);
-	}
-
-	crfree(newcred);	/* Free revocation reference. */
-	crfree(oldcred);
-
-out:
-	mac_cred_label_free(intlabel);
-	return (error);
-}
-
-/*
- * MPSAFE
- */
-int
-__mac_get_fd(struct thread *td, struct __mac_get_fd_args *uap)
-{
-	char *elements, *buffer;
-	struct label *intlabel;
-	struct file *fp;
-	struct mac mac;
-	struct vnode *vp;
-	struct pipe *pipe;
-	struct socket *so;
-	short label_type;
-	int vfslocked, error;
-
-	error = copyin(uap->mac_p, &mac, sizeof(mac));
-	if (error)
-		return (error);
-
-	error = mac_check_structmac_consistent(&mac);
-	if (error)
-		return (error);
-
-	elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
-	error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL);
-	if (error) {
-		free(elements, M_MACTEMP);
-		return (error);
-	}
-
-	buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO);
-	error = fget(td, uap->fd, &fp);
-	if (error)
-		goto out;
-
-	label_type = fp->f_type;
-	switch (fp->f_type) {
-	case DTYPE_FIFO:
-	case DTYPE_VNODE:
-		vp = fp->f_vnode;
-		intlabel = mac_vnode_label_alloc();
-		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
-		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
-		mac_copy_vnode_label(vp->v_label, intlabel);
-		VOP_UNLOCK(vp, 0, td);
-		VFS_UNLOCK_GIANT(vfslocked);
-		error = mac_externalize_vnode_label(intlabel, elements,
-		    buffer, mac.m_buflen);
-		mac_vnode_label_free(intlabel);
-		break;
-
-	case DTYPE_PIPE:
-		pipe = fp->f_data;
-		intlabel = mac_pipe_label_alloc();
-		PIPE_LOCK(pipe);
-		mac_copy_pipe_label(pipe->pipe_pair->pp_label, intlabel);
-		PIPE_UNLOCK(pipe);
-		error = mac_externalize_pipe_label(intlabel, elements,
-		    buffer, mac.m_buflen);
-		mac_pipe_label_free(intlabel);
-		break;
-
-	case DTYPE_SOCKET:
-		so = fp->f_data;
-		intlabel = mac_socket_label_alloc(M_WAITOK);
-		NET_LOCK_GIANT();
-		SOCK_LOCK(so);
-		mac_copy_socket_label(so->so_label, intlabel);
-		SOCK_UNLOCK(so);
-		NET_UNLOCK_GIANT();
-		error = mac_externalize_socket_label(intlabel, elements,
-		    buffer, mac.m_buflen);
-		mac_socket_label_free(intlabel);
-		break;
-
-	default:
-		error = EINVAL;
-	}
-	fdrop(fp, td);
-	if (error == 0)
-		error = copyout(buffer, mac.m_string, strlen(buffer)+1);
-
-out:
-	free(buffer, M_MACTEMP);
-	free(elements, M_MACTEMP);
-	return (error);
-}
-
-/*
- * MPSAFE
- */
-int
-__mac_get_file(struct thread *td, struct __mac_get_file_args *uap)
-{
-	char *elements, *buffer;
-	struct nameidata nd;
-	struct label *intlabel;
-	struct mac mac;
-	int vfslocked, error;
-
-	error = copyin(uap->mac_p, &mac, sizeof(mac));
-	if (error)
-		return (error);
-
-	error = mac_check_structmac_consistent(&mac);
-	if (error)
-		return (error);
-
-	elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
-	error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL);
-	if (error) {
-		free(elements, M_MACTEMP);
-		return (error);
-	}
-
-	buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO);
-	NDINIT(&nd, LOOKUP, MPSAFE | LOCKLEAF | FOLLOW, UIO_USERSPACE,
-	    uap->path_p, td);
-	error = namei(&nd);
-	if (error)
-		goto out;
-
-	intlabel = mac_vnode_label_alloc();
-	vfslocked = NDHASGIANT(&nd);
-	mac_copy_vnode_label(nd.ni_vp->v_label, intlabel);
-	error = mac_externalize_vnode_label(intlabel, elements, buffer,
-	    mac.m_buflen);
-
-	NDFREE(&nd, 0);
-	VFS_UNLOCK_GIANT(vfslocked);
-	mac_vnode_label_free(intlabel);
-	if (error == 0)
-		error = copyout(buffer, mac.m_string, strlen(buffer)+1);
-
-out:
-	free(buffer, M_MACTEMP);
-	free(elements, M_MACTEMP);
-
-	return (error);
-}
-
-/*
- * MPSAFE
- */
-int
-__mac_get_link(struct thread *td, struct __mac_get_link_args *uap)
-{
-	char *elements, *buffer;
-	struct nameidata nd;
-	struct label *intlabel;
-	struct mac mac;
-	int vfslocked, error;
-
-	error = copyin(uap->mac_p, &mac, sizeof(mac));
-	if (error)
-		return (error);
-
-	error = mac_check_structmac_consistent(&mac);
-	if (error)
-		return (error);
-
-	elements = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
-	error = copyinstr(mac.m_string, elements, mac.m_buflen, NULL);
-	if (error) {
-		free(elements, M_MACTEMP);
-		return (error);
-	}
-
-	buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK | M_ZERO);
-	NDINIT(&nd, LOOKUP, MPSAFE | LOCKLEAF | NOFOLLOW, UIO_USERSPACE,
-	    uap->path_p, td);
-	error = namei(&nd);
-	if (error)
-		goto out;
-
-	intlabel = mac_vnode_label_alloc();
-	vfslocked = NDHASGIANT(&nd);
-	mac_copy_vnode_label(nd.ni_vp->v_label, intlabel);
-	error = mac_externalize_vnode_label(intlabel, elements, buffer,
-	    mac.m_buflen);
-	NDFREE(&nd, 0);
-	VFS_UNLOCK_GIANT(vfslocked);
-	mac_vnode_label_free(intlabel);
-
-	if (error == 0)
-		error = copyout(buffer, mac.m_string, strlen(buffer)+1);
-
-out:
-	free(buffer, M_MACTEMP);
-	free(elements, M_MACTEMP);
-
-	return (error);
-}
-
-/*
- * MPSAFE
- */
-int
-__mac_set_fd(struct thread *td, struct __mac_set_fd_args *uap)
-{
-	struct label *intlabel;
-	struct pipe *pipe;
-	struct socket *so;
-	struct file *fp;
-	struct mount *mp;
-	struct vnode *vp;
-	struct mac mac;
-	char *buffer;
-	int error, vfslocked;
-
-	error = copyin(uap->mac_p, &mac, sizeof(mac));
-	if (error)
-		return (error);
-
-	error = mac_check_structmac_consistent(&mac);
-	if (error)
-		return (error);
-
-	buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
-	error = copyinstr(mac.m_string, buffer, mac.m_buflen, NULL);
-	if (error) {
-		free(buffer, M_MACTEMP);
-		return (error);
-	}
-
-	error = fget(td, uap->fd, &fp);
-	if (error)
-		goto out;
-
-	switch (fp->f_type) {
-	case DTYPE_FIFO:
-	case DTYPE_VNODE:
-		intlabel = mac_vnode_label_alloc();
-		error = mac_internalize_vnode_label(intlabel, buffer);
-		if (error) {
-			mac_vnode_label_free(intlabel);
-			break;
-		}
-		vp = fp->f_vnode;
-		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
-		error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
-		if (error != 0) {
-			VFS_UNLOCK_GIANT(vfslocked);
-			mac_vnode_label_free(intlabel);
-			break;
-		}
-		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
-		error = vn_setlabel(vp, intlabel, td->td_ucred);
-		VOP_UNLOCK(vp, 0, td);
-		vn_finished_write(mp);
-		VFS_UNLOCK_GIANT(vfslocked);
-		mac_vnode_label_free(intlabel);
-		break;
-
-	case DTYPE_PIPE:
-		intlabel = mac_pipe_label_alloc();
-		error = mac_internalize_pipe_label(intlabel, buffer);
-		if (error == 0) {
-			pipe = fp->f_data;
-			PIPE_LOCK(pipe);
-			error = mac_pipe_label_set(td->td_ucred,
-			    pipe->pipe_pair, intlabel);
-			PIPE_UNLOCK(pipe);
-		}
-		mac_pipe_label_free(intlabel);
-		break;
-
-	case DTYPE_SOCKET:
-		intlabel = mac_socket_label_alloc(M_WAITOK);
-		error = mac_internalize_socket_label(intlabel, buffer);
-		if (error == 0) {
-			so = fp->f_data;
-			NET_LOCK_GIANT();
-			error = mac_socket_label_set(td->td_ucred, so,
-			    intlabel);
-			NET_UNLOCK_GIANT();
-		}
-		mac_socket_label_free(intlabel);
-		break;
-
-	default:
-		error = EINVAL;
-	}
-	fdrop(fp, td);
-out:
-	free(buffer, M_MACTEMP);
-	return (error);
-}
-
-/*
- * MPSAFE
- */
-int
-__mac_set_file(struct thread *td, struct __mac_set_file_args *uap)
-{
-	struct label *intlabel;
-	struct nameidata nd;
-	struct mount *mp;
-	struct mac mac;
-	char *buffer;
-	int vfslocked, error;
-
-	error = copyin(uap->mac_p, &mac, sizeof(mac));
-	if (error)
-		return (error);
-
-	error = mac_check_structmac_consistent(&mac);
-	if (error)
-		return (error);
-
-	buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
-	error = copyinstr(mac.m_string, buffer, mac.m_buflen, NULL);
-	if (error) {
-		free(buffer, M_MACTEMP);
-		return (error);
-	}
-
-	intlabel = mac_vnode_label_alloc();
-	error = mac_internalize_vnode_label(intlabel, buffer);
-	free(buffer, M_MACTEMP);
-	if (error)
-		goto out;
-
-	NDINIT(&nd, LOOKUP, MPSAFE | LOCKLEAF | FOLLOW, UIO_USERSPACE,
-	    uap->path_p, td);
-	error = namei(&nd);
-	vfslocked = NDHASGIANT(&nd);
-	if (error == 0) {
-		error = vn_start_write(nd.ni_vp, &mp, V_WAIT | PCATCH);
-		if (error == 0) {
-			error = vn_setlabel(nd.ni_vp, intlabel,
-			    td->td_ucred);
-			vn_finished_write(mp);
-		}
-	}
-
-	NDFREE(&nd, 0);
-	VFS_UNLOCK_GIANT(vfslocked);
-out:
-	mac_vnode_label_free(intlabel);
-	return (error);
-}
-
-/*
- * MPSAFE
- */
-int
-__mac_set_link(struct thread *td, struct __mac_set_link_args *uap)
-{
-	struct label *intlabel;
-	struct nameidata nd;
-	struct mount *mp;
-	struct mac mac;
-	char *buffer;
-	int vfslocked, error;
-
-	error = copyin(uap->mac_p, &mac, sizeof(mac));
-	if (error)
-		return (error);
-
-	error = mac_check_structmac_consistent(&mac);
-	if (error)
-		return (error);
-
-	buffer = malloc(mac.m_buflen, M_MACTEMP, M_WAITOK);
-	error = copyinstr(mac.m_string, buffer, mac.m_buflen, NULL);
-	if (error) {
-		free(buffer, M_MACTEMP);
-		return (error);
-	}
-
-	intlabel = mac_vnode_label_alloc();
-	error = mac_internalize_vnode_label(intlabel, buffer);
-	free(buffer, M_MACTEMP);
-	if (error)
-		goto out;
-
-	NDINIT(&nd, LOOKUP, MPSAFE | LOCKLEAF | NOFOLLOW, UIO_USERSPACE,
-	    uap->path_p, td);
-	error = namei(&nd);
-	vfslocked = NDHASGIANT(&nd);
-	if (error == 0) {
-		error = vn_start_write(nd.ni_vp, &mp, V_WAIT | PCATCH);
-		if (error == 0) {
-			error = vn_setlabel(nd.ni_vp, intlabel,
-			    td->td_ucred);
-			vn_finished_write(mp);
-		}
-	}
-
-	NDFREE(&nd, 0);
-	VFS_UNLOCK_GIANT(vfslocked);
-out:
-	mac_vnode_label_free(intlabel);
-	return (error);
-}
-
-/*
- * MPSAFE
- */
-int
-mac_syscall(struct thread *td, struct mac_syscall_args *uap)
-{
-	struct mac_policy_conf *mpc;
-	char target[MAC_MAX_POLICY_NAME];
-	int entrycount, error;
-
-	error = copyinstr(uap->policy, target, sizeof(target), NULL);
-	if (error)
-		return (error);
-
-	error = ENOSYS;
-	LIST_FOREACH(mpc, &mac_static_policy_list, mpc_list) {
-		if (strcmp(mpc->mpc_name, target) == 0 &&
-		    mpc->mpc_ops->mpo_syscall != NULL) {
-			error = mpc->mpc_ops->mpo_syscall(td,
-			    uap->call, uap->arg);
-			goto out;
-		}
-	}
-
-	if ((entrycount = mac_policy_list_conditional_busy()) != 0) {
-		LIST_FOREACH(mpc, &mac_policy_list, mpc_list) {
-			if (strcmp(mpc->mpc_name, target) == 0 &&
-			    mpc->mpc_ops->mpo_syscall != NULL) {
-				error = mpc->mpc_ops->mpo_syscall(td,
-				    uap->call, uap->arg);
-				break;
-			}
-		}
-		mac_policy_list_unbusy();
-	}
-out:
-	return (error);
-}
-
-SYSINIT(mac, SI_SUB_MAC, SI_ORDER_FIRST, mac_init, NULL);
-SYSINIT(mac_late, SI_SUB_MAC_LATE, SI_ORDER_FIRST, mac_late_init, NULL);
-
-#else /* !MAC */
-
-int
-__mac_get_pid(struct thread *td, struct __mac_get_pid_args *uap)
-{
-
-	return (ENOSYS);
-}
-
-int
-__mac_get_proc(struct thread *td, struct __mac_get_proc_args *uap)
-{
-
-	return (ENOSYS);
-}
-
-int
-__mac_set_proc(struct thread *td, struct __mac_set_proc_args *uap)
-{
-
-	return (ENOSYS);
-}
-
-int
-__mac_get_fd(struct thread *td, struct __mac_get_fd_args *uap)
-{
-
-	return (ENOSYS);
-}
-
-int
-__mac_get_file(struct thread *td, struct __mac_get_file_args *uap)
-{
-
-	return (ENOSYS);
-}
-
-int
-__mac_get_link(struct thread *td, struct __mac_get_link_args *uap)
-{
-
-	return (ENOSYS);
-}
-
-int
-__mac_set_fd(struct thread *td, struct __mac_set_fd_args *uap)
-{
-
-	return (ENOSYS);
-}
-
-int
-__mac_set_file(struct thread *td, struct __mac_set_file_args *uap)
-{
-
-	return (ENOSYS);
-}
-
-int
-__mac_set_link(struct thread *td, struct __mac_set_link_args *uap)
-{
-
-	return (ENOSYS);
-}
-
-int
-mac_syscall(struct thread *td, struct mac_syscall_args *uap)
-{
-
-	return (ENOSYS);
-}
-
-#endif /* !MAC */
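
For context on the kern_mac.c code removed above: the error-precedence rule
implemented by mac_error_select() is easiest to see with a short sketch.  The
snippet below is illustrative only and is not part of this commit; error1 and
error2 are hypothetical stand-ins for the results of two policy checks.

	int error1, error2, error;

	error1 = EPERM;		/* one policy denied on privilege grounds */
	error2 = EACCES;	/* another denied on access-control grounds */

	/* EACCES outranks EPERM in the precedence order coded above. */
	error = mac_error_select(error1, error2);	/* -> EACCES */

	/* Zero (success) is returned only when neither argument is an error. */
	error = mac_error_select(0, 0);			/* -> 0 */
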
--- /dev/null
+++ sys/kern/uipc_sockbuf.c
@@ -0,0 +1,1041 @@
+/*-
+ * Copyright (c) 1982, 1986, 1988, 1990, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	@(#)uipc_socket2.c	8.1 (Berkeley) 6/10/93
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: src/sys/kern/uipc_sockbuf.c,v 1.171.2.1.2.1 2008/02/02 12:44:13 rwatson Exp $");
+
+#include "opt_param.h"
+
+#include <sys/param.h>
+#include <sys/aio.h> /* for aio_swake proto */
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/mbuf.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/protosw.h>
+#include <sys/resourcevar.h>
+#include <sys/signalvar.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sx.h>
+#include <sys/sysctl.h>
+
+/*
+ * Function pointer set by the AIO routines so that the socket buffer code
+ * can call back into the AIO module if it is loaded.
+ */
+void	(*aio_swake)(struct socket *, struct sockbuf *);
+
+/*
+ * Primitive routines for operating on socket buffers
+ */
+
+u_long	sb_max = SB_MAX;
+u_long sb_max_adj =
+       SB_MAX * MCLBYTES / (MSIZE + MCLBYTES); /* adjusted sb_max */
+
+static	u_long sb_efficiency = 8;	/* parameter for sbreserve() */
+
+static void	sbdrop_internal(struct sockbuf *sb, int len);
+static void	sbflush_internal(struct sockbuf *sb);
+static void	sbrelease_internal(struct sockbuf *sb, struct socket *so);
+
+/*
+ * Socantsendmore indicates that no more data will be sent on the socket; it
+ * is normally applied to a socket by the protocol code (in the
+ * PRU_SHUTDOWN case) when the user informs the system that no more data
+ * is to be sent.  Socantrcvmore indicates that no more data will be
+ * received, and will normally be applied to the socket by a protocol when it
+ * detects that the peer will send no more data.  Data queued for reading in
+ * the socket may yet be read.
+ */
+void
+socantsendmore_locked(struct socket *so)
+{
+
+	SOCKBUF_LOCK_ASSERT(&so->so_snd);
+
+	so->so_snd.sb_state |= SBS_CANTSENDMORE;
+	sowwakeup_locked(so);
+	mtx_assert(SOCKBUF_MTX(&so->so_snd), MA_NOTOWNED);
+}
+
+void
+socantsendmore(struct socket *so)
+{
+
+	SOCKBUF_LOCK(&so->so_snd);
+	socantsendmore_locked(so);
+	mtx_assert(SOCKBUF_MTX(&so->so_snd), MA_NOTOWNED);
+}
+
+void
+socantrcvmore_locked(struct socket *so)
+{
+
+	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
+
+	so->so_rcv.sb_state |= SBS_CANTRCVMORE;
+	sorwakeup_locked(so);
+	mtx_assert(SOCKBUF_MTX(&so->so_rcv), MA_NOTOWNED);
+}
+
+void
+socantrcvmore(struct socket *so)
+{
+
+	SOCKBUF_LOCK(&so->so_rcv);
+	socantrcvmore_locked(so);
+	mtx_assert(SOCKBUF_MTX(&so->so_rcv), MA_NOTOWNED);
+}
+
+/*
+ * Wait for data to arrive at/drain from a socket buffer.
+ */
+int
+sbwait(struct sockbuf *sb)
+{
+
+	SOCKBUF_LOCK_ASSERT(sb);
+
+	sb->sb_flags |= SB_WAIT;
+	return (msleep(&sb->sb_cc, &sb->sb_mtx,
+	    (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sbwait",
+	    sb->sb_timeo));
+}
+
+int
+sblock(struct sockbuf *sb, int flags)
+{
+
+	KASSERT((flags & SBL_VALID) == flags,
+	    ("sblock: flags invalid (0x%x)", flags));
+
+	if (flags & SBL_WAIT) {
+		if ((sb->sb_flags & SB_NOINTR) ||
+		    (flags & SBL_NOINTR)) {
+			sx_xlock(&sb->sb_sx);
+			return (0);
+		}
+		return (sx_xlock_sig(&sb->sb_sx));
+	} else {
+		if (sx_try_xlock(&sb->sb_sx) == 0)
+			return (EWOULDBLOCK);
+		return (0);
+	}
+}
+
+void
+sbunlock(struct sockbuf *sb)
+{
+
+	sx_xunlock(&sb->sb_sx);
+}
+
+/*
+ * Wakeup processes waiting on a socket buffer.  Do asynchronous notification
+ * via SIGIO if the socket has the SS_ASYNC flag set.
+ *
+ * Called with the socket buffer lock held; will release the lock by the end
+ * of the function.  This allows the caller to acquire the socket buffer lock
+ * while testing for the need for various sorts of wakeup and hold it through
+ * to the point where it's no longer required.  We currently hold the lock
+ * through calls out to other subsystems (with the exception of kqueue), and
+ * then release it to avoid lock order issues.  It's not clear that's
+ * correct.
+ */
+void
+sowakeup(struct socket *so, struct sockbuf *sb)
+{
+
+	SOCKBUF_LOCK_ASSERT(sb);
+
+	selwakeuppri(&sb->sb_sel, PSOCK);
+	sb->sb_flags &= ~SB_SEL;
+	if (sb->sb_flags & SB_WAIT) {
+		sb->sb_flags &= ~SB_WAIT;
+		wakeup(&sb->sb_cc);
+	}
+	KNOTE_LOCKED(&sb->sb_sel.si_note, 0);
+	SOCKBUF_UNLOCK(sb);
+	if ((so->so_state & SS_ASYNC) && so->so_sigio != NULL)
+		pgsigio(&so->so_sigio, SIGIO, 0);
+	if (sb->sb_flags & SB_UPCALL)
+		(*so->so_upcall)(so, so->so_upcallarg, M_DONTWAIT);
+	if (sb->sb_flags & SB_AIO)
+		aio_swake(so, sb);
+	mtx_assert(SOCKBUF_MTX(sb), MA_NOTOWNED);
+}
+
+/*
+ * Socket buffer (struct sockbuf) utility routines.
+ *
+ * Each socket contains two socket buffers: one for sending data and one for
+ * receiving data.  Each buffer contains a queue of mbufs, information about
+ * the number of mbufs and amount of data in the queue, and other fields
+ * allowing select() statements and notification on data availability to be
+ * implemented.
+ *
+ * Data stored in a socket buffer is maintained as a list of records.  Each
+ * record is a list of mbufs chained together with the m_next field.  Records
+ * are chained together with the m_nextpkt field. The upper level routine
+ * soreceive() expects the following conventions to be observed when placing
+ * information in the receive buffer:
+ *
+ * 1. If the protocol requires each message be preceded by the sender's name,
+ *    then a record containing that name must be present before any
+ *    associated data (mbufs must be of type MT_SONAME).
+ * 2. If the protocol supports the exchange of ``access rights'' (really just
+ *    additional data associated with the message), and there are ``rights''
+ *    to be received, then a record containing this data should be present
+ *    (mbufs must be of type MT_RIGHTS).
+ * 3. If a name or rights record exists, then it must be followed by a data
+ *    record, perhaps of zero length.
+ *
+ * Before using a new socket structure it is first necessary to reserve
+ * buffer space to the socket, by calling sbreserve().  This should commit
+ * some of the available buffer space in the system buffer pool for the
+ * socket (currently, it does nothing but enforce limits).  The space should
+ * be released by calling sbrelease() when the socket is destroyed.
+ */
+int
+soreserve(struct socket *so, u_long sndcc, u_long rcvcc)
+{
+	struct thread *td = curthread;
+
+	SOCKBUF_LOCK(&so->so_snd);
+	SOCKBUF_LOCK(&so->so_rcv);
+	if (sbreserve_locked(&so->so_snd, sndcc, so, td) == 0)
+		goto bad;
+	if (sbreserve_locked(&so->so_rcv, rcvcc, so, td) == 0)
+		goto bad2;
+	if (so->so_rcv.sb_lowat == 0)
+		so->so_rcv.sb_lowat = 1;
+	if (so->so_snd.sb_lowat == 0)
+		so->so_snd.sb_lowat = MCLBYTES;
+	if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
+		so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
+	SOCKBUF_UNLOCK(&so->so_rcv);
+	SOCKBUF_UNLOCK(&so->so_snd);
+	return (0);
+bad2:
+	sbrelease_locked(&so->so_snd, so);
+bad:
+	SOCKBUF_UNLOCK(&so->so_rcv);
+	SOCKBUF_UNLOCK(&so->so_snd);
+	return (ENOBUFS);
+}
+
+static int
+sysctl_handle_sb_max(SYSCTL_HANDLER_ARGS)
+{
+	int error = 0;
+	u_long tmp_sb_max = sb_max;
+
+	error = sysctl_handle_long(oidp, &tmp_sb_max, arg2, req);
+	if (error || !req->newptr)
+		return (error);
+	if (tmp_sb_max < MSIZE + MCLBYTES)
+		return (EINVAL);
+	sb_max = tmp_sb_max;
+	sb_max_adj = (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES);
+	return (0);
+}
+	
+/*
+ * Allot mbufs to a sockbuf.  Attempt to scale mbmax so that mbcnt doesn't
+ * become limiting if buffering efficiency is near the normal case.
+ */
+int
+sbreserve_locked(struct sockbuf *sb, u_long cc, struct socket *so,
+    struct thread *td)
+{
+	rlim_t sbsize_limit;
+
+	SOCKBUF_LOCK_ASSERT(sb);
+
+	/*
+	 * td will only be NULL when we're in an interrupt (e.g. in
+	 * tcp_input()).
+	 *
+	 * XXXRW: This comment needs updating, as might the code.
+	 */
+	if (cc > sb_max_adj)
+		return (0);
+	if (td != NULL) {
+		PROC_LOCK(td->td_proc);
+		sbsize_limit = lim_cur(td->td_proc, RLIMIT_SBSIZE);
+		PROC_UNLOCK(td->td_proc);
+	} else
+		sbsize_limit = RLIM_INFINITY;
+	if (!chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, cc,
+	    sbsize_limit))
+		return (0);
+	sb->sb_mbmax = min(cc * sb_efficiency, sb_max);
+	if (sb->sb_lowat > sb->sb_hiwat)
+		sb->sb_lowat = sb->sb_hiwat;
+	return (1);
+}
+
+int
+sbreserve(struct sockbuf *sb, u_long cc, struct socket *so, 
+    struct thread *td)
+{
+	int error;
+
+	SOCKBUF_LOCK(sb);
+	error = sbreserve_locked(sb, cc, so, td);
+	SOCKBUF_UNLOCK(sb);
+	return (error);
+}
+
+/*
+ * Free mbufs held by a socket, and reserved mbuf space.
+ */
+static void
+sbrelease_internal(struct sockbuf *sb, struct socket *so)
+{
+
+	sbflush_internal(sb);
+	(void)chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, 0,
+	    RLIM_INFINITY);
+	sb->sb_mbmax = 0;
+}
+
+void
+sbrelease_locked(struct sockbuf *sb, struct socket *so)
+{
+
+	SOCKBUF_LOCK_ASSERT(sb);
+
+	sbrelease_internal(sb, so);
+}
+
+void
+sbrelease(struct sockbuf *sb, struct socket *so)
+{
+
+	SOCKBUF_LOCK(sb);
+	sbrelease_locked(sb, so);
+	SOCKBUF_UNLOCK(sb);
+}
+
+void
+sbdestroy(struct sockbuf *sb, struct socket *so)
+{
+
+	sbrelease_internal(sb, so);
+}
+
+/*
+ * Routines to add and remove data from an mbuf queue.
+ *
+ * The routines sbappend() or sbappendrecord() are normally called to append
+ * new mbufs to a socket buffer, after checking that adequate space is
+ * available, comparing the function sbspace() with the amount of data to be
+ * added.  sbappendrecord() differs from sbappend() in that data supplied is
+ * treated as the beginning of a new record.  To place a sender's address,
+ * optional access rights, and data in a socket receive buffer,
+ * sbappendaddr() should be used.  To place access rights and data in a
+ * socket receive buffer, sbappendrights() should be used.  In either case,
+ * the new data begins a new record.  Note that unlike sbappend() and
+ * sbappendrecord(), these routines check for the caller that there will be
+ * enough space to store the data.  Each fails if there is not enough space,
+ * or if it cannot find mbufs to store additional information in.
+ *
+ * Reliable protocols may use the socket send buffer to hold data awaiting
+ * acknowledgement.  Data is normally copied from a socket send buffer in a
+ * protocol with m_copy for output to a peer, and then removed from the
+ * socket buffer with sbdrop() or sbdroprecord() when the data is
+ * acknowledged by the peer.
+ */
+#ifdef SOCKBUF_DEBUG
+void
+sblastrecordchk(struct sockbuf *sb, const char *file, int line)
+{
+	struct mbuf *m = sb->sb_mb;
+
+	SOCKBUF_LOCK_ASSERT(sb);
+
+	while (m && m->m_nextpkt)
+		m = m->m_nextpkt;
+
+	if (m != sb->sb_lastrecord) {
+		printf("%s: sb_mb %p sb_lastrecord %p last %p\n",
+			__func__, sb->sb_mb, sb->sb_lastrecord, m);
+		printf("packet chain:\n");
+		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt)
+			printf("\t%p\n", m);
+		panic("%s from %s:%u", __func__, file, line);
+	}
+}
+
+void
+sblastmbufchk(struct sockbuf *sb, const char *file, int line)
+{
+	struct mbuf *m = sb->sb_mb;
+	struct mbuf *n;
+
+	SOCKBUF_LOCK_ASSERT(sb);
+
+	while (m && m->m_nextpkt)
+		m = m->m_nextpkt;
+
+	while (m && m->m_next)
+		m = m->m_next;
+
+	if (m != sb->sb_mbtail) {
+		printf("%s: sb_mb %p sb_mbtail %p last %p\n",
+			__func__, sb->sb_mb, sb->sb_mbtail, m);
+		printf("packet tree:\n");
+		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) {
+			printf("\t");
+			for (n = m; n != NULL; n = n->m_next)
+				printf("%p ", n);
+			printf("\n");
+		}
+		panic("%s from %s:%u", __func__, file, line);
+	}
+}
+#endif /* SOCKBUF_DEBUG */
+
+#define SBLINKRECORD(sb, m0) do {					\
+	SOCKBUF_LOCK_ASSERT(sb);					\
+	if ((sb)->sb_lastrecord != NULL)				\
+		(sb)->sb_lastrecord->m_nextpkt = (m0);			\
+	else								\
+		(sb)->sb_mb = (m0);					\
+	(sb)->sb_lastrecord = (m0);					\
+} while (/*CONSTCOND*/0)
+
+/*
+ * Append mbuf chain m to the last record in the socket buffer sb.  The
+ * additional space associated the mbuf chain is recorded in sb.  Empty mbufs
+ * are discarded and mbufs are compacted where possible.
+ */
+void
+sbappend_locked(struct sockbuf *sb, struct mbuf *m)
+{
+	struct mbuf *n;
+
+	SOCKBUF_LOCK_ASSERT(sb);
+
+	if (m == 0)
+		return;
+
+	SBLASTRECORDCHK(sb);
+	n = sb->sb_mb;
+	if (n) {
+		while (n->m_nextpkt)
+			n = n->m_nextpkt;
+		do {
+			if (n->m_flags & M_EOR) {
+				sbappendrecord_locked(sb, m); /* XXXXXX!!!! */
+				return;
+			}
+		} while (n->m_next && (n = n->m_next));
+	} else {
+		/*
+		 * XXX Would like to simply use sb_mbtail here, but
+		 * XXX I need to verify that I won't miss an EOR that
+		 * XXX way.
+		 */
+		if ((n = sb->sb_lastrecord) != NULL) {
+			do {
+				if (n->m_flags & M_EOR) {
+					sbappendrecord_locked(sb, m); /* XXXXXX!!!! */
+					return;
+				}
+			} while (n->m_next && (n = n->m_next));
+		} else {
+			/*
+			 * If this is the first record in the socket buffer,
+			 * it's also the last record.
+			 */
+			sb->sb_lastrecord = m;
+		}
+	}
+	sbcompress(sb, m, n);
+	SBLASTRECORDCHK(sb);
+}
+
+/*
+ * Append mbuf chain m to the last record in the socket buffer sb.  The
+ * additional space associated with the mbuf chain is recorded in sb.  Empty
+ * mbufs are discarded and mbufs are compacted where possible.
+ */
+void
+sbappend(struct sockbuf *sb, struct mbuf *m)
+{
+
+	SOCKBUF_LOCK(sb);
+	sbappend_locked(sb, m);
+	SOCKBUF_UNLOCK(sb);
+}
+
+/*
+ * This version of sbappend() should only be used when the caller absolutely
+ * knows that there will never be more than one record in the socket buffer,
+ * that is, a stream protocol (such as TCP).
+ */
+void
+sbappendstream_locked(struct sockbuf *sb, struct mbuf *m)
+{
+	SOCKBUF_LOCK_ASSERT(sb);
+
+	KASSERT(m->m_nextpkt == NULL,("sbappendstream 0"));
+	KASSERT(sb->sb_mb == sb->sb_lastrecord,("sbappendstream 1"));
+
+	SBLASTMBUFCHK(sb);
+
+	sbcompress(sb, m, sb->sb_mbtail);
+
+	sb->sb_lastrecord = sb->sb_mb;
+	SBLASTRECORDCHK(sb);
+}
+
+/*
+ * This version of sbappend() should only be used when the caller absolutely
+ * knows that there will never be more than one record in the socket buffer,
+ * that is, a stream protocol (such as TCP).
+ */
+void
+sbappendstream(struct sockbuf *sb, struct mbuf *m)
+{
+
+	SOCKBUF_LOCK(sb);
+	sbappendstream_locked(sb, m);
+	SOCKBUF_UNLOCK(sb);
+}
+
+#ifdef SOCKBUF_DEBUG
+void
+sbcheck(struct sockbuf *sb)
+{
+	struct mbuf *m;
+	struct mbuf *n = 0;
+	u_long len = 0, mbcnt = 0;
+
+	SOCKBUF_LOCK_ASSERT(sb);
+
+	for (m = sb->sb_mb; m; m = n) {
+	    n = m->m_nextpkt;
+	    for (; m; m = m->m_next) {
+		len += m->m_len;
+		mbcnt += MSIZE;
+		if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */
+			mbcnt += m->m_ext.ext_size;
+	    }
+	}
+	if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
+		printf("cc %ld != %u || mbcnt %ld != %u\n", len, sb->sb_cc,
+		    mbcnt, sb->sb_mbcnt);
+		panic("sbcheck");
+	}
+}
+#endif
+
+/*
+ * As above, except the mbuf chain begins a new record.
+ */
+void
+sbappendrecord_locked(struct sockbuf *sb, struct mbuf *m0)
+{
+	struct mbuf *m;
+
+	SOCKBUF_LOCK_ASSERT(sb);
+
+	if (m0 == 0)
+		return;
+	m = sb->sb_mb;
+	if (m)
+		while (m->m_nextpkt)
+			m = m->m_nextpkt;
+	/*
+	 * Put the first mbuf on the queue.  Note this permits zero length
+	 * records.
+	 */
+	sballoc(sb, m0);
+	SBLASTRECORDCHK(sb);
+	SBLINKRECORD(sb, m0);
+	if (m)
+		m->m_nextpkt = m0;
+	else
+		sb->sb_mb = m0;
+	m = m0->m_next;
+	m0->m_next = 0;
+	if (m && (m0->m_flags & M_EOR)) {
+		m0->m_flags &= ~M_EOR;
+		m->m_flags |= M_EOR;
+	}
+	sbcompress(sb, m, m0);
+}
+
+/*
+ * As above, except the mbuf chain begins a new record.
+ */
+void
+sbappendrecord(struct sockbuf *sb, struct mbuf *m0)
+{
+
+	SOCKBUF_LOCK(sb);
+	sbappendrecord_locked(sb, m0);
+	SOCKBUF_UNLOCK(sb);
+}
+
+/*
+ * Append address and data, and optionally, control (ancillary) data to the
+ * receive queue of a socket.  If present, m0 must include a packet header
+ * with total length.  Returns 0 if no space in sockbuf or insufficient
+ * mbufs.
+ */
+int
+sbappendaddr_locked(struct sockbuf *sb, const struct sockaddr *asa,
+    struct mbuf *m0, struct mbuf *control)
+{
+	struct mbuf *m, *n, *nlast;
+	int space = asa->sa_len;
+
+	SOCKBUF_LOCK_ASSERT(sb);
+
+	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
+		panic("sbappendaddr_locked");
+	if (m0)
+		space += m0->m_pkthdr.len;
+	space += m_length(control, &n);
+
+	if (space > sbspace(sb))
+		return (0);
+#if MSIZE <= 256
+	if (asa->sa_len > MLEN)
+		return (0);
+#endif
+	MGET(m, M_DONTWAIT, MT_SONAME);
+	if (m == 0)
+		return (0);
+	m->m_len = asa->sa_len;
+	bcopy(asa, mtod(m, caddr_t), asa->sa_len);
+	if (n)
+		n->m_next = m0;		/* concatenate data to control */
+	else
+		control = m0;
+	m->m_next = control;
+	for (n = m; n->m_next != NULL; n = n->m_next)
+		sballoc(sb, n);
+	sballoc(sb, n);
+	nlast = n;
+	SBLINKRECORD(sb, m);
+
+	sb->sb_mbtail = nlast;
+	SBLASTMBUFCHK(sb);
+
+	SBLASTRECORDCHK(sb);
+	return (1);
+}
+
+/*
+ * Append address and data, and optionally, control (ancillary) data to the
+ * receive queue of a socket.  If present, m0 must include a packet header
+ * with total length.  Returns 0 if no space in sockbuf or insufficient
+ * mbufs.
+ */
+int
+sbappendaddr(struct sockbuf *sb, const struct sockaddr *asa,
+    struct mbuf *m0, struct mbuf *control)
+{
+	int retval;
+
+	SOCKBUF_LOCK(sb);
+	retval = sbappendaddr_locked(sb, asa, m0, control);
+	SOCKBUF_UNLOCK(sb);
+	return (retval);
+}
+
+int
+sbappendcontrol_locked(struct sockbuf *sb, struct mbuf *m0,
+    struct mbuf *control)
+{
+	struct mbuf *m, *n, *mlast;
+	int space;
+
+	SOCKBUF_LOCK_ASSERT(sb);
+
+	if (control == 0)
+		panic("sbappendcontrol_locked");
+	space = m_length(control, &n) + m_length(m0, NULL);
+
+	if (space > sbspace(sb))
+		return (0);
+	n->m_next = m0;			/* concatenate data to control */
+
+	SBLASTRECORDCHK(sb);
+
+	for (m = control; m->m_next; m = m->m_next)
+		sballoc(sb, m);
+	sballoc(sb, m);
+	mlast = m;
+	SBLINKRECORD(sb, control);
+
+	sb->sb_mbtail = mlast;
+	SBLASTMBUFCHK(sb);
+
+	SBLASTRECORDCHK(sb);
+	return (1);
+}
+
+int
+sbappendcontrol(struct sockbuf *sb, struct mbuf *m0, struct mbuf *control)
+{
+	int retval;
+
+	SOCKBUF_LOCK(sb);
+	retval = sbappendcontrol_locked(sb, m0, control);
+	SOCKBUF_UNLOCK(sb);
+	return (retval);
+}
+
+/*
+ * Append the data in mbuf chain (m) into the socket buffer sb following mbuf
+ * (n).  If (n) is NULL, the buffer is presumed empty.
+ *
+ * When the data is compressed, mbufs in the chain may be handled in one of
+ * three ways:
+ *
+ * (1) The mbuf may simply be dropped, if it contributes nothing (no data, no
+ *     record boundary, and no change in data type).
+ *
+ * (2) The mbuf may be coalesced -- i.e., data in the mbuf may be copied into
+ *     an mbuf already in the socket buffer.  This can occur if an
+ *     appropriate mbuf exists, there is room, and no merging of data types
+ *     will occur.
+ *
+ * (3) The mbuf may be appended to the end of the existing mbuf chain.
+ *
+ * If any of the new mbufs is marked as M_EOR, mark the last mbuf appended as
+ * end-of-record.
+ */
+void
+sbcompress(struct sockbuf *sb, struct mbuf *m, struct mbuf *n)
+{
+	int eor = 0;
+	struct mbuf *o;
+
+	SOCKBUF_LOCK_ASSERT(sb);
+
+	while (m) {
+		eor |= m->m_flags & M_EOR;
+		if (m->m_len == 0 &&
+		    (eor == 0 ||
+		     (((o = m->m_next) || (o = n)) &&
+		      o->m_type == m->m_type))) {
+			if (sb->sb_lastrecord == m)
+				sb->sb_lastrecord = m->m_next;
+			m = m_free(m);
+			continue;
+		}
+		if (n && (n->m_flags & M_EOR) == 0 &&
+		    M_WRITABLE(n) &&
+		    m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */
+		    m->m_len <= M_TRAILINGSPACE(n) &&
+		    n->m_type == m->m_type) {
+			bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len,
+			    (unsigned)m->m_len);
+			n->m_len += m->m_len;
+			sb->sb_cc += m->m_len;
+			if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA)
+				/* XXX: Probably don't need.*/
+				sb->sb_ctl += m->m_len;
+			m = m_free(m);
+			continue;
+		}
+		if (n)
+			n->m_next = m;
+		else
+			sb->sb_mb = m;
+		sb->sb_mbtail = m;
+		sballoc(sb, m);
+		n = m;
+		m->m_flags &= ~M_EOR;
+		m = m->m_next;
+		n->m_next = 0;
+	}
+	if (eor) {
+		KASSERT(n != NULL, ("sbcompress: eor && n == NULL"));
+		n->m_flags |= eor;
+	}
+	SBLASTMBUFCHK(sb);
+}
+
+/*
+ * Free all mbufs in a sockbuf.  Check that all resources are reclaimed.
+ */
+static void
+sbflush_internal(struct sockbuf *sb)
+{
+
+	while (sb->sb_mbcnt) {
+		/*
+		 * Don't call sbdrop(sb, 0) if the leading mbuf is non-empty:
+		 * we would loop forever. Panic instead.
+		 */
+		if (!sb->sb_cc && (sb->sb_mb == NULL || sb->sb_mb->m_len))
+			break;
+		sbdrop_internal(sb, (int)sb->sb_cc);
+	}
+	if (sb->sb_cc || sb->sb_mb || sb->sb_mbcnt)
+		panic("sbflush_internal: cc %u || mb %p || mbcnt %u",
+		    sb->sb_cc, (void *)sb->sb_mb, sb->sb_mbcnt);
+}
+
+void
+sbflush_locked(struct sockbuf *sb)
+{
+
+	SOCKBUF_LOCK_ASSERT(sb);
+	sbflush_internal(sb);
+}
+
+void
+sbflush(struct sockbuf *sb)
+{
+
+	SOCKBUF_LOCK(sb);
+	sbflush_locked(sb);
+	SOCKBUF_UNLOCK(sb);
+}
+
+/*
+ * Drop data from (the front of) a sockbuf.
+ */
+static void
+sbdrop_internal(struct sockbuf *sb, int len)
+{
+	struct mbuf *m;
+	struct mbuf *next;
+
+	next = (m = sb->sb_mb) ? m->m_nextpkt : 0;
+	while (len > 0) {
+		if (m == 0) {
+			if (next == 0)
+				panic("sbdrop");
+			m = next;
+			next = m->m_nextpkt;
+			continue;
+		}
+		if (m->m_len > len) {
+			m->m_len -= len;
+			m->m_data += len;
+			sb->sb_cc -= len;
+			if (sb->sb_sndptroff != 0)
+				sb->sb_sndptroff -= len;
+			if (m->m_type != MT_DATA && m->m_type != MT_OOBDATA)
+				sb->sb_ctl -= len;
+			break;
+		}
+		len -= m->m_len;
+		sbfree(sb, m);
+		m = m_free(m);
+	}
+	while (m && m->m_len == 0) {
+		sbfree(sb, m);
+		m = m_free(m);
+	}
+	if (m) {
+		sb->sb_mb = m;
+		m->m_nextpkt = next;
+	} else
+		sb->sb_mb = next;
+	/*
+	 * First part is an inline SB_EMPTY_FIXUP().  Second part makes sure
+	 * sb_lastrecord is up-to-date if we dropped part of the last record.
+	 */
+	m = sb->sb_mb;
+	if (m == NULL) {
+		sb->sb_mbtail = NULL;
+		sb->sb_lastrecord = NULL;
+	} else if (m->m_nextpkt == NULL) {
+		sb->sb_lastrecord = m;
+	}
+}
+
+/*
+ * Drop data from (the front of) a sockbuf.
+ */
+void
+sbdrop_locked(struct sockbuf *sb, int len)
+{
+
+	SOCKBUF_LOCK_ASSERT(sb);
+
+	sbdrop_internal(sb, len);
+}
+
+void
+sbdrop(struct sockbuf *sb, int len)
+{
+
+	SOCKBUF_LOCK(sb);
+	sbdrop_locked(sb, len);
+	SOCKBUF_UNLOCK(sb);
+}
+
+/*
+ * Maintain a pointer and offset pair into the socket buffer mbuf chain to
+ * avoid traversal of the entire socket buffer for larger offsets.
+ */
+struct mbuf *
+sbsndptr(struct sockbuf *sb, u_int off, u_int len, u_int *moff)
+{
+	struct mbuf *m, *ret;
+
+	KASSERT(sb->sb_mb != NULL, ("%s: sb_mb is NULL", __func__));
+	KASSERT(off + len <= sb->sb_cc, ("%s: beyond sb", __func__));
+	KASSERT(sb->sb_sndptroff <= sb->sb_cc, ("%s: sndptroff broken", __func__));
+
+	/*
+	 * Is off below stored offset? Happens on retransmits.
+	 * Just return, we can't help here.
+	 */
+	if (sb->sb_sndptroff > off) {
+		*moff = off;
+		return (sb->sb_mb);
+	}
+
+	/* Return closest mbuf in chain for current offset. */
+	*moff = off - sb->sb_sndptroff;
+	m = ret = sb->sb_sndptr ? sb->sb_sndptr : sb->sb_mb;
+
+	/* Advance by len to be as close as possible for the next transmit. */
+	for (off = off - sb->sb_sndptroff + len - 1;
+	     off > 0 && off >= m->m_len;
+	     m = m->m_next) {
+		sb->sb_sndptroff += m->m_len;
+		off -= m->m_len;
+	}
+	sb->sb_sndptr = m;
+
+	return (ret);
+}
+
+/*
+ * Drop a record off the front of a sockbuf and move the next record to the
+ * front.
+ */
+void
+sbdroprecord_locked(struct sockbuf *sb)
+{
+	struct mbuf *m;
+
+	SOCKBUF_LOCK_ASSERT(sb);
+
+	m = sb->sb_mb;
+	if (m) {
+		sb->sb_mb = m->m_nextpkt;
+		do {
+			sbfree(sb, m);
+			m = m_free(m);
+		} while (m);
+	}
+	SB_EMPTY_FIXUP(sb);
+}
+
+/*
+ * Drop a record off the front of a sockbuf and move the next record to the
+ * front.
+ */
+void
+sbdroprecord(struct sockbuf *sb)
+{
+
+	SOCKBUF_LOCK(sb);
+	sbdroprecord_locked(sb);
+	SOCKBUF_UNLOCK(sb);
+}
+
+/*
+ * Create a "control" mbuf containing the specified data with the specified
+ * type for presentation on a socket buffer.
+ */
+struct mbuf *
+sbcreatecontrol(caddr_t p, int size, int type, int level)
+{
+	struct cmsghdr *cp;
+	struct mbuf *m;
+
+	if (CMSG_SPACE((u_int)size) > MCLBYTES)
+		return ((struct mbuf *) NULL);
+	if (CMSG_SPACE((u_int)size) > MLEN)
+		m = m_getcl(M_DONTWAIT, MT_CONTROL, 0);
+	else
+		m = m_get(M_DONTWAIT, MT_CONTROL);
+	if (m == NULL)
+		return ((struct mbuf *) NULL);
+	cp = mtod(m, struct cmsghdr *);
+	m->m_len = 0;
+	KASSERT(CMSG_SPACE((u_int)size) <= M_TRAILINGSPACE(m),
+	    ("sbcreatecontrol: short mbuf"));
+	if (p != NULL)
+		(void)memcpy(CMSG_DATA(cp), p, size);
+	m->m_len = CMSG_SPACE(size);
+	cp->cmsg_len = CMSG_LEN(size);
+	cp->cmsg_level = level;
+	cp->cmsg_type = type;
+	return (m);
+}
+
+/*
+ * This does the same for socket buffers that sotoxsocket does for sockets:
+ * generate a user-format data structure describing the socket buffer.  Note
+ * that the xsockbuf structure, since it is always embedded in a socket, does
+ * not include a self pointer nor a length.  We make this entry point public
+ * in case some other mechanism needs it.
+ */
+void
+sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb)
+{
+
+	xsb->sb_cc = sb->sb_cc;
+	xsb->sb_hiwat = sb->sb_hiwat;
+	xsb->sb_mbcnt = sb->sb_mbcnt;
+	xsb->sb_mbmax = sb->sb_mbmax;
+	xsb->sb_lowat = sb->sb_lowat;
+	xsb->sb_flags = sb->sb_flags;
+	xsb->sb_timeo = sb->sb_timeo;
+}
+
+/* This takes the place of kern.maxsockbuf, which moved to kern.ipc. */
+static int dummy;
+SYSCTL_INT(_kern, KERN_DUMMY, dummy, CTLFLAG_RW, &dummy, 0, "");
+SYSCTL_OID(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf, CTLTYPE_ULONG|CTLFLAG_RW,
+    &sb_max, 0, sysctl_handle_sb_max, "LU", "Maximum socket buffer size");
+SYSCTL_ULONG(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW,
+    &sb_efficiency, 0, "");
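
The sbappendaddr_locked()/sowakeup() pairing above is normally driven from a
datagram protocol's input path.  A rough usage sketch follows; it is not code
from this commit, and so, fromsa, m and opts are hypothetical locals standing
for the receiving socket, the sender's address, the data mbuf chain and any
control mbufs.

	SOCKBUF_LOCK(&so->so_rcv);
	if (sbappendaddr_locked(&so->so_rcv, (struct sockaddr *)&fromsa,
	    m, opts) == 0) {
		/* No room: sbappendaddr_locked() reported failure (0). */
		SOCKBUF_UNLOCK(&so->so_rcv);
		m_freem(m);
		if (opts != NULL)
			m_freem(opts);
	} else {
		/* sorwakeup_locked() drops the sockbuf lock on our behalf. */
		sorwakeup_locked(so);
	}

On success the address, control and data mbufs are linked into a single new
record in so->so_rcv, so the caller must not free them afterwards.
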
Index: kern_tc.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_tc.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/kern_tc.c -L sys/kern/kern_tc.c -u -r1.1.1.1 -r1.2
--- sys/kern/kern_tc.c
+++ sys/kern/kern_tc.c
@@ -8,7 +8,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_tc.c,v 1.164 2005/03/26 20:04:28 phk Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_tc.c,v 1.178 2007/06/04 18:25:07 dwmalone Exp $");
 
 #include "opt_ntp.h"
 
@@ -61,7 +61,7 @@
 	struct timehands	*th_next;
 };
 
-extern struct timehands th0;
+static struct timehands th0;
 static struct timehands th9 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th0};
 static struct timehands th8 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th9};
 static struct timehands th7 = { NULL, 0, 0, 0, {0, 0}, {0, 0}, {0, 0}, 0, &th8};
@@ -88,7 +88,7 @@
 static struct timecounter *timecounters = &dummy_timecounter;
 
 time_t time_second = 1;
-time_t time_uptime = 0;
+time_t time_uptime = 1;
 
 static struct bintime boottimebin;
 struct timeval boottime;
@@ -97,6 +97,7 @@
     NULL, 0, sysctl_kern_boottime, "S,timeval", "System boottime");
 
 SYSCTL_NODE(_kern, OID_AUTO, timecounter, CTLFLAG_RW, 0, "");
+SYSCTL_NODE(_kern_timecounter, OID_AUTO, tc, CTLFLAG_RW, 0, "");
 
 static int timestepwarnings;
 SYSCTL_INT(_kern_timecounter, OID_AUTO, stepwarnings, CTLFLAG_RW,
@@ -116,6 +117,7 @@
 #undef TC_STATS
 
 static void tc_windup(void);
+static void cpu_tick_calibrate(int);
 
 static int
 sysctl_kern_boottime(SYSCTL_HANDLER_ARGS)
@@ -131,6 +133,27 @@
 #endif
 		return SYSCTL_OUT(req, &boottime, sizeof(boottime));
 }
+
+static int
+sysctl_kern_timecounter_get(SYSCTL_HANDLER_ARGS)
+{
+	u_int ncount;
+	struct timecounter *tc = arg1;
+
+	ncount = tc->tc_get_timecount(tc);
+	return sysctl_handle_int(oidp, &ncount, 0, req);
+}
+
+static int
+sysctl_kern_timecounter_freq(SYSCTL_HANDLER_ARGS)
+{
+	u_int64_t freq;
+	struct timecounter *tc = arg1;
+
+	freq = tc->tc_frequency;
+	return sysctl_handle_quad(oidp, &freq, 0, req);
+}
+
 /*
  * Return the difference between the timehands' counter value now and what
  * was when we copied it to the timehands' offset_count.
@@ -307,6 +330,7 @@
 tc_init(struct timecounter *tc)
 {
 	u_int u;
+	struct sysctl_oid *tc_root;
 
 	u = tc->tc_frequency / tc->tc_counter_mask;
 	/* XXX: We need some margin here, 10% is a guess */
@@ -328,6 +352,24 @@
 	tc->tc_next = timecounters;
 	timecounters = tc;
 	/*
+	 * Set up sysctl tree for this counter.
+	 */
+	tc_root = SYSCTL_ADD_NODE(NULL,
+	    SYSCTL_STATIC_CHILDREN(_kern_timecounter_tc), OID_AUTO, tc->tc_name,
+	    CTLFLAG_RW, 0, "timecounter description");
+	SYSCTL_ADD_UINT(NULL, SYSCTL_CHILDREN(tc_root), OID_AUTO,
+	    "mask", CTLFLAG_RD, &(tc->tc_counter_mask), 0,
+	    "mask for implemented bits");
+	SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(tc_root), OID_AUTO,
+	    "counter", CTLTYPE_UINT | CTLFLAG_RD, tc, sizeof(*tc),
+	    sysctl_kern_timecounter_get, "IU", "current timecounter value");
+	SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(tc_root), OID_AUTO,
+	    "frequency", CTLTYPE_QUAD | CTLFLAG_RD, tc, sizeof(*tc),
+	     sysctl_kern_timecounter_freq, "QU", "timecounter frequency");
+	SYSCTL_ADD_INT(NULL, SYSCTL_CHILDREN(tc_root), OID_AUTO,
+	    "quality", CTLFLAG_RD, &(tc->tc_quality), 0,
+	    "goodness of time counter");
+	/*
 	 * Never automatically use a timecounter with negative quality.
 	 * Even though we run on the dummy counter, switching here may be
 	 * worse since this timecounter may not be monotonic.
@@ -360,12 +402,14 @@
 void
 tc_setclock(struct timespec *ts)
 {
-	struct timespec ts2;
+	struct timespec tbef, taft;
 	struct bintime bt, bt2;
 
+	cpu_tick_calibrate(1);
 	nsetclock++;
-	binuptime(&bt2);
+	nanotime(&tbef);
 	timespec2bintime(ts, &bt);
+	binuptime(&bt2);
 	bintime_sub(&bt, &bt2);
 	bintime_add(&bt2, &boottimebin);
 	boottimebin = bt;
@@ -373,12 +417,15 @@
 
 	/* XXX fiddle all the little crinkly bits around the fiords... */
 	tc_windup();
+	nanotime(&taft);
 	if (timestepwarnings) {
-		bintime2timespec(&bt2, &ts2);
-		log(LOG_INFO, "Time stepped from %jd.%09ld to %jd.%09ld\n",
-		    (intmax_t)ts2.tv_sec, ts2.tv_nsec,
+		log(LOG_INFO,
+		    "Time stepped from %jd.%09ld to %jd.%09ld (%jd.%09ld)\n",
+		    (intmax_t)tbef.tv_sec, tbef.tv_nsec,
+		    (intmax_t)taft.tv_sec, taft.tv_nsec,
 		    (intmax_t)ts->tv_sec, ts->tv_nsec);
 	}
+	cpu_tick_calibrate(1);
 }
 
 /*
@@ -475,8 +522,8 @@
 	 *	 x = a * 2^32 / 10^9 = a * 4.294967296
 	 *
 	 * The range of th_adjustment is +/- 5000PPM so inside a 64bit int
-	 * we can only multiply by about 850 without overflowing, but that
-	 * leaves suitably precise fractions for multiply before divide.
+	 * we can only multiply by about 850 without overflowing, which
+	 * leaves no suitably precise fractions for multiply before divide.
 	 *
 	 * Divide before multiply with a fraction of 2199/512 results in a
 	 * systematic undercompensation of 10PPM of th_adjustment.  On a
@@ -749,11 +796,16 @@
 tc_ticktock(void)
 {
 	static int count;
+	static time_t last_calib;
 
 	if (++count < tc_tick)
 		return;
 	count = 0;
 	tc_windup();
+	if (time_uptime != last_calib && !(time_uptime & 0xf)) {
+		cpu_tick_calibrate(0);
+		last_calib = time_uptime;
+	}
 }
 
 static void
@@ -782,3 +834,147 @@
 }
 
 SYSINIT(timecounter, SI_SUB_CLOCKS, SI_ORDER_SECOND, inittimecounter, NULL)
+
+/* Cpu tick handling -------------------------------------------------*/
+
+static int cpu_tick_variable;
+static uint64_t	cpu_tick_frequency;
+
+static uint64_t
+tc_cpu_ticks(void)
+{
+	static uint64_t base;
+	static unsigned last;
+	unsigned u;
+	struct timecounter *tc;
+
+	tc = timehands->th_counter;
+	u = tc->tc_get_timecount(tc) & tc->tc_counter_mask;
+	if (u < last)
+		base += (uint64_t)tc->tc_counter_mask + 1;
+	last = u;
+	return (u + base);
+}
+
+/*
+ * This function gets called every 16 seconds on only one designated
+ * CPU in the system from hardclock() via tc_ticktock().
+ *
+ * Whenever the real time clock is stepped we get called with reset=1
+ * to make sure we handle suspend/resume and similar events correctly.
+ */
+
+static void
+cpu_tick_calibrate(int reset)
+{
+	static uint64_t c_last;
+	uint64_t c_this, c_delta;
+	static struct bintime  t_last;
+	struct bintime t_this, t_delta;
+	uint32_t divi;
+
+	if (reset) {
+		/* The clock was stepped, abort & reset */
+		t_last.sec = 0;
+		return;
+	}
+
+	/* we don't calibrate fixed rate cputicks */
+	if (!cpu_tick_variable)
+		return;
+
+	getbinuptime(&t_this);
+	c_this = cpu_ticks();
+	if (t_last.sec != 0) {
+		c_delta = c_this - c_last;
+		t_delta = t_this;
+		bintime_sub(&t_delta, &t_last);
+		/*
+		 * Validate that 16 +/- 1/256 seconds passed.
+		 * After division by 16 this gives us a precision of
+		 * roughly 250PPM, which is sufficient.
+		 */
+		if (t_delta.sec > 16 || (
+		    t_delta.sec == 16 && t_delta.frac >= (0x01LL << 56))) {
+			/* too long */
+			if (bootverbose)
+				printf("%ju.%016jx too long\n",
+				    (uintmax_t)t_delta.sec,
+				    (uintmax_t)t_delta.frac);
+		} else if (t_delta.sec < 15 ||
+		    (t_delta.sec == 15 && t_delta.frac <= (0xffLL << 56))) {
+			/* too short */
+			if (bootverbose)
+				printf("%ju.%016jx too short\n",
+				    (uintmax_t)t_delta.sec,
+				    (uintmax_t)t_delta.frac);
+		} else {
+			/* just right */
+			/*
+			 * Headroom:
+			 * 	2^(64-20) / 16[s] =
+			 * 	2^(44) / 16[s] =
+			 * 	17.592.186.044.416 / 16 =
+			 * 	1.099.511.627.776 [Hz]
+			 */
+			divi = t_delta.sec << 20;
+			divi |= t_delta.frac >> (64 - 20);
+			c_delta <<= 20;
+			c_delta /= divi;
+			if (c_delta  > cpu_tick_frequency) {
+				if (0 && bootverbose)
+					printf("cpu_tick increased to %ju Hz\n",
+					    c_delta);
+				cpu_tick_frequency = c_delta;
+			}
+		}
+	}
+	c_last = c_this;
+	t_last = t_this;
+}
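To make the fixed-point step in the "just right" branch concrete, here is a hedged worked example (the tick count is illustrative):

/*
 * Suppose a variable-rate ticker advanced c_delta = 38,400,000,000 ticks
 * while t_delta was exactly 16 seconds:
 *
 *	divi    = 16 << 20        = 16,777,216
 *	c_delta = 38.4e9 << 20    = 40,265,318,400,000,000  (fits in 64 bits)
 *	freq    = c_delta / divi  = 2,400,000,000 Hz
 *
 * so cpu_tick_frequency converges on this hypothetical 2.4 GHz rate, well
 * inside the ~1.1 THz headroom computed above.
 */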
+
+void
+set_cputicker(cpu_tick_f *func, uint64_t freq, unsigned var)
+{
+
+	if (func == NULL) {
+		cpu_ticks = tc_cpu_ticks;
+	} else {
+		cpu_tick_frequency = freq;
+		cpu_tick_variable = var;
+		cpu_ticks = func;
+	}
+}
+
+uint64_t
+cpu_tickrate(void)
+{
+
+	if (cpu_ticks == tc_cpu_ticks) 
+		return (tc_getfrequency());
+	return (cpu_tick_frequency);
+}
+
+/*
+ * We need to be slightly careful converting cputicks to microseconds.
+ * There is plenty of margin in 64 bits of microseconds (half a million
+ * years) and in 64 bits at 4 GHz (146 years), but if we do a multiply
+ * before divide conversion (to retain precision) we find that the
+ * margin shrinks to 1.5 hours (one millionth of 146y).
+ * With a three prong approach we never lose significant bits, no
+ * matter what the cputick rate and length of timeinterval is.
+ */
+
+uint64_t
+cputick2usec(uint64_t tick)
+{
+
+	if (tick > 18446744073709551LL)		/* floor(2^64 / 1000) */
+		return (tick / (cpu_tickrate() / 1000000LL));
+	else if (tick > 18446744073709LL)	/* floor(2^64 / 1000000) */
+		return ((tick * 1000LL) / (cpu_tickrate() / 1000LL));
+	else
+		return ((tick * 1000000LL) / cpu_tickrate());
+}
+
+cpu_tick_f	*cpu_ticks = tc_cpu_ticks;
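A hedged sketch of how a consumer might use the cpu tick interface added above to time an interval (the function and workload are hypothetical; cpu_ticks, cpu_tickrate() and cputick2usec() are the entry points introduced here):

static void
measure_work(void)
{
	uint64_t t0, t1;

	t0 = cpu_ticks();
	do_work();				/* hypothetical workload */
	t1 = cpu_ticks();

	printf("work took %ju us at %ju ticks/s\n",
	    (uintmax_t)cputick2usec(t1 - t0),
	    (uintmax_t)cpu_tickrate());
}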
Index: subr_clock.c
===================================================================
RCS file: /home/cvs/src/sys/kern/subr_clock.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/subr_clock.c -L sys/kern/subr_clock.c -u -r1.1.1.1 -r1.2
--- sys/kern/subr_clock.c
+++ sys/kern/subr_clock.c
@@ -38,23 +38,8 @@
  *	from: src/sys/i386/isa/clock.c,v 1.176 2001/09/04
  */
 
-/*
- * Helpers for time-of-day clocks. This is useful for architectures that need
- * support multiple models of such clocks, and generally serves to make the
- * code more machine-independent.
- * If the clock in question can also be used as a time counter, the driver
- * needs to initiate this.
- * This code is not yet used by all architectures.
- */
-
-/*
- * Generic routines to convert between a POSIX date
- * (seconds since 1/1/1970) and yr/mo/day/hr/min/sec
- * Derived from NetBSD arch/hp300/hp300/clock.c
- */
-
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/subr_clock.c,v 1.6 2005/01/06 23:35:39 imp Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/subr_clock.c,v 1.12 2007/07/23 09:42:31 dwmalone Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -64,43 +49,21 @@
 #include <sys/sysctl.h>
 #include <sys/timetc.h>
 
-/* XXX: for the  CPU_* sysctl OID constants. */
-#include <machine/cpu.h>
-
-#include "clock_if.h"
-
-static __inline int leapyear(int year);
-static int sysctl_machdep_adjkerntz(SYSCTL_HANDLER_ARGS);
-
-#define	FEBRUARY	2
-#define	days_in_year(y) 	(leapyear(y) ? 366 : 365)
-#define	days_in_month(y, m) \
-	(month_days[(m) - 1] + (m == FEBRUARY ? leapyear(y) : 0))
-/* Day of week. Days are counted from 1/1/1970, which was a Thursday */
-#define	day_of_week(days)	(((days) + 4) % 7)
-
-static const int month_days[12] = {
-	31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31
-};
-
-static device_t clock_dev = NULL;
-static long clock_res;
+static int adjkerntz;		/* local offset from GMT in seconds */
+static int wall_cmos_clock;	/* wall CMOS clock assumed if != 0 */
+int disable_rtc_set;		/* disable resettodr() if != 0 */
 
-int adjkerntz;		/* local offset from GMT in seconds */
-int disable_rtc_set;	/* disable resettodr() if != 0 */
-int wall_cmos_clock;	/* wall CMOS clock assumed if != 0 */
+int tz_minuteswest;
+int tz_dsttime;
 
 /*
  * These have traditionally been in machdep, but should probably be moved to
  * kern.
  */
-SYSCTL_PROC(_machdep, CPU_ADJKERNTZ, adjkerntz, CTLTYPE_INT|CTLFLAG_RW,
-	&adjkerntz, 0, sysctl_machdep_adjkerntz, "I", "");
-
-SYSCTL_INT(_machdep, CPU_DISRTCSET, disable_rtc_set,
+SYSCTL_INT(_machdep, OID_AUTO, disable_rtc_set,
 	CTLFLAG_RW, &disable_rtc_set, 0, "");
 
-SYSCTL_INT(_machdep, CPU_WALLCLOCK, wall_cmos_clock,
+SYSCTL_INT(_machdep, OID_AUTO, wall_cmos_clock,
 	CTLFLAG_RW, &wall_cmos_clock, 0, "");
 
 static int
@@ -114,6 +77,28 @@
 	return (error);
 }
 
+SYSCTL_PROC(_machdep, OID_AUTO, adjkerntz, CTLTYPE_INT|CTLFLAG_RW,
+	&adjkerntz, 0, sysctl_machdep_adjkerntz, "I", "");
+
+/*--------------------------------------------------------------------*
+ * Generic routines to convert between a POSIX date
+ * (seconds since 1/1/1970) and yr/mo/day/hr/min/sec
+ * Derived from NetBSD arch/hp300/hp300/clock.c
+ */
+
+
+#define	FEBRUARY	2
+#define	days_in_year(y) 	(leapyear(y) ? 366 : 365)
+#define	days_in_month(y, m) \
+	(month_days[(m) - 1] + (m == FEBRUARY ? leapyear(y) : 0))
+/* Day of week. Days are counted from 1/1/1970, which was a Thursday */
+#define	day_of_week(days)	(((days) + 4) % 7)
+
+static const int month_days[12] = {
+	31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31
+};
+
+
 /*
  * This inline avoids some unnecessary modulo operations
  * as compared with the usual macro:
@@ -166,7 +151,7 @@
 	  	days += days_in_month(year, i);
 	days += (ct->day - 1);
 
-	/* Another sanity check. */
+	/* XXX Dow sanity check. Dow is not used, so should we check it? */
 	if (ct->dow != -1 && ct->dow != day_of_week(days))
 		return (EINVAL);
 
@@ -213,105 +198,9 @@
 	ct->nsec = ts->tv_nsec;
 }
 
-void
-clock_register(device_t dev, long res)
-{
-
-	if (clock_dev != NULL) {
-		if (clock_res > res) {
-			if (bootverbose) {
-				device_printf(dev, "not installed as "
-				    "time-of-day clock: clock %s has higher "
-				    "resolution\n", device_get_name(clock_dev));
-			}
-			return;
-		} else {
-			if (bootverbose) {
-				device_printf(clock_dev, "removed as "
-				    "time-of-day clock: clock %s has higher "
-				    "resolution\n", device_get_name(dev));
-			}
-		}
-	}
-	clock_dev = dev;
-	clock_res = res;
-	if (bootverbose) {
-		device_printf(dev, "registered as a time-of-day clock "
-		    "(resolution %ldus)\n", res);
-	}
-}
-
-/*
- * inittodr and settodr derived from the i386 versions written
- * by Christoph Robitschko <chmr at edvz.tu-graz.ac.at>,  reintroduced and
- * updated by Chris Stenton <chris at gnome.co.uk> 8/10/94
- */
-
-/*
- * Initialize the time of day register, based on the time base which is, e.g.
- * from a filesystem.
- */
-void
-inittodr(time_t base)
-{
-	struct timespec diff, ref, ts;
-	int error;
-
-	if (base) {
-		ref.tv_sec = base;
-		ref.tv_nsec = 0;
-		tc_setclock(&ref);
-	}
-
-	if (clock_dev == NULL) {
-		printf("warning: no time-of-day clock registered, system time "
-		    "will not be set accurately\n");
-		return;
-	}
-	error = CLOCK_GETTIME(clock_dev, &ts);
-	if (error != 0 && error != EINVAL) {
-		printf("warning: clock_gettime failed (%d), the system time "
-		    "will not be set accurately\n", error);
-		return;
-	}
-	if (error == EINVAL || ts.tv_sec < 0) {
-		printf("Invalid time in real time clock.\n");
-		printf("Check and reset the date immediately!\n");
-	}
-
-	ts.tv_sec += tz_minuteswest * 60 +
-	    (wall_cmos_clock ? adjkerntz : 0);
-
-	if (timespeccmp(&ref, &ts, >)) {
-		diff = ref;
-		timespecsub(&ref, &ts);
-	} else {
-		diff = ts;
-		timespecsub(&diff, &ref);
-	}
-	if (ts.tv_sec >= 2) {
-		/* badly off, adjust it */
-		tc_setclock(&ts);
-	}
-}
-
-/*
- * Write system time back to RTC
- */
-void
-resettodr()
+int
+utc_offset(void)
 {
-	struct timespec ts;
-	int error;
 
-	if (disable_rtc_set || clock_dev == NULL)
-		return;
-
-	getnanotime(&ts);
-	ts.tv_sec -= tz_minuteswest * 60 + (wall_cmos_clock ? adjkerntz : 0);
-	if ((error = CLOCK_SETTIME(clock_dev, &ts)) != 0) {
-		printf("warning: clock_settime failed (%d), time-of-day clock "
-		    "not adjusted to system time\n", error);
-		return;
-	}
+	return (tz_minuteswest * 60 + (wall_cmos_clock ? adjkerntz : 0));
 }
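With inittodr()/resettodr() and the clock registration moved out of this file, utc_offset() is the surviving helper.  A hedged sketch of how machine-dependent RTC code might use it (the body is illustrative; it mirrors the arithmetic removed above):

void
inittodr(time_t base)
{
	struct timespec ts;

	/* Read the RTC into ts here (hardware-specific, omitted). */
	ts.tv_sec += utc_offset();	/* wall-clock RTC time -> UTC */
	ts.tv_nsec = 0;
	if (ts.tv_sec < base)		/* RTC is behind the filesystem date */
		ts.tv_sec = base;
	tc_setclock(&ts);
}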
Index: vfs_default.c
===================================================================
RCS file: /home/cvs/src/sys/kern/vfs_default.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/vfs_default.c -L sys/kern/vfs_default.c -u -r1.2 -r1.3
--- sys/kern/vfs_default.c
+++ sys/kern/vfs_default.c
@@ -33,7 +33,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/vfs_default.c,v 1.127.2.2 2006/03/13 03:06:17 jeff Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/vfs_default.c,v 1.138 2007/05/18 13:02:13 kib Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -86,7 +86,7 @@
 	.vop_kqfilter =		vop_stdkqfilter,
 	.vop_islocked =		vop_stdislocked,
 	.vop_lease =		VOP_NULL,
-	.vop_lock =		vop_stdlock,
+	.vop_lock1 =		vop_stdlock,
 	.vop_lookup =		vop_nolookup,
 	.vop_open =		VOP_NULL,
 	.vop_pathconf =		VOP_EINVAL,
@@ -96,6 +96,7 @@
 	.vop_revoke =		VOP_PANIC,
 	.vop_strategy =		vop_nostrategy,
 	.vop_unlock =		vop_stdunlock,
+	.vop_vptofh =		vop_stdvptofh,
 };
 
 /*
@@ -217,6 +218,12 @@
 {
 
 	switch (ap->a_name) {
+		case _PC_NAME_MAX:
+			*ap->a_retval = NAME_MAX;
+			return (0);
+		case _PC_PATH_MAX:
+			*ap->a_retval = PATH_MAX;
+			return (0);
 		case _PC_LINK_MAX:
 			*ap->a_retval = LINK_MAX;
 			return (0);
@@ -246,15 +253,17 @@
  */
 int
 vop_stdlock(ap)
-	struct vop_lock_args /* {
+	struct vop_lock1_args /* {
 		struct vnode *a_vp;
 		int a_flags;
 		struct thread *a_td;
+		char *file;
+		int line;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 
-	return (lockmgr(vp->v_vnlock, ap->a_flags, VI_MTX(vp), ap->a_td));
+	return (_lockmgr(vp->v_vnlock, ap->a_flags, VI_MTX(vp), ap->a_td, ap->a_file, ap->a_line));
 }
 
 /* See above. */
@@ -337,8 +346,24 @@
 		struct mount **a_mpp;
 	} */ *ap;
 {
+	struct mount *mp;
 
-	*(ap->a_mpp) = ap->a_vp->v_mount;
+	/*
+	 * XXX Since this is called unlocked we may be recycled while
+	 * attempting to ref the mount.  If this is the case our mountpoint
+	 * will be set to NULL.  We only have to prevent this call from
+	 * returning with a ref to an incorrect mountpoint.  It is not
+	 * harmful to return with a ref to our previous mountpoint.
+	 */
+	mp = ap->a_vp->v_mount;
+	if (mp != NULL) {
+		vfs_ref(mp);
+		if (mp != ap->a_vp->v_mount) {
+			vfs_rel(mp);
+			mp = NULL;
+		}
+	}
+	*(ap->a_mpp) = mp;
 	return (0);
 }
 
@@ -487,6 +512,12 @@
 	     ap->a_sync, ap->a_rtvals);
 }
 
+int
+vop_stdvptofh(struct vop_vptofh_args *ap)
+{
+	return (EOPNOTSUPP);
+}
+
 /*
  * vfs default ops
  * used to fill the vfs function table to get reasonable default return values.
@@ -513,20 +544,11 @@
 }
 
 int
-vfs_stdvptofh (vp, fhp)
-	struct vnode *vp;
-	struct fid *fhp;
-{
-
-	return (EOPNOTSUPP);
-}
-
-int
 vfs_stdquotactl (mp, cmds, uid, arg, td)
 	struct mount *mp;
 	int cmds;
 	uid_t uid;
-	caddr_t arg;
+	void *arg;
 	struct thread *td;
 {
 
@@ -571,6 +593,7 @@
 		if (error)
 			allerror = error;
 
+		/* Do not turn this into vput.  td is not always curthread. */
 		VOP_UNLOCK(vp, 0, td);
 		vrele(vp);
 		MNT_ILOCK(mp);
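vop_stdgetwritemount() now hands its caller a referenced struct mount.  A hedged sketch of the calling convention this implies (the wrapper below is illustrative):

static int
use_write_mount(struct vnode *vp)
{
	struct mount *mp;
	int error;

	error = VOP_GETWRITEMOUNT(vp, &mp);
	if (error != 0 || mp == NULL)
		return (error);
	/* ... operate on mp while the reference is held ... */
	vfs_rel(mp);		/* drop the reference taken on our behalf */
	return (0);
}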
Index: tty_cons.c
===================================================================
RCS file: /home/cvs/src/sys/kern/tty_cons.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/tty_cons.c -L sys/kern/tty_cons.c -u -r1.1.1.1 -r1.2
--- sys/kern/tty_cons.c
+++ sys/kern/tty_cons.c
@@ -35,12 +35,14 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/tty_cons.c,v 1.131 2005/02/27 21:52:41 phk Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/tty_cons.c,v 1.139 2007/05/31 11:51:51 kib Exp $");
 
 #include "opt_ddb.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
 #include <sys/conf.h>
 #include <sys/cons.h>
 #include <sys/fcntl.h>
@@ -49,6 +51,7 @@
 #include <sys/malloc.h>
 #include <sys/msgbuf.h>
 #include <sys/namei.h>
+#include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/queue.h>
 #include <sys/reboot.h>
@@ -99,7 +102,7 @@
 	    (cnd->cnd_vp->v_type == VBAD && !cn_devopen(cnd, td, 1)))
 
 static dev_t	cn_udev_t;
-SYSCTL_OPAQUE(_machdep, CPU_CONSDEV, consdev, CTLFLAG_RD,
+SYSCTL_OPAQUE(_machdep, OID_AUTO, consdev, CTLFLAG_RD,
 	&cn_udev_t, sizeof cn_udev_t, "T,struct cdev *", "");
 
 int	cons_avail_mask = 0;	/* Bit mask. Each registered low level console
@@ -117,10 +120,13 @@
 static char *console_pausestr=
 "<pause; press any key to proceed to next line or '.' to end pause mode>";
 struct tty *constty;			/* pointer to console "window" tty */
+static struct mtx cnputs_mtx;		/* Mutex for cnputs(). */
+static int use_cnputs_mtx = 0;		/* != 0 if cnputs_mtx locking reqd. */
 
 static void constty_timeout(void *arg);
 
-CONS_DRIVER(cons, NULL, NULL, NULL, NULL, NULL, NULL, NULL);
+static struct consdev cons_consdev;
+DATA_SET(cons_set, cons_consdev);
 SET_DECLARE(cons_set, struct consdev);
 
 void
@@ -157,15 +163,15 @@
 			/*
 			 * Initialize console, and attach to it.
 			 */
-			cnadd(cn);
 			cn->cn_init(cn);
+			cnadd(cn);
 		}
 	}
 	if (best_cn == NULL)
 		return;
 	if ((boothowto & RB_MULTIPLE) == 0) {
-		cnadd(best_cn);
 		best_cn->cn_init(best_cn);
+		cnadd(best_cn);
 	}
 	if (boothowto & RB_PAUSE)
 		console_pausing = 1;
@@ -401,7 +407,7 @@
 	}
 	snprintf(path, sizeof(path), "/dev/%s", cnd->cnd_cn->cn_name);
 	NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, path, td);
-	error = vn_open(&nd, &openflag, 0, -1);
+	error = vn_open(&nd, &openflag, 0, NULL);
 	if (error == 0) {
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		VOP_UNLOCK(nd.ni_vp, 0, td);
@@ -505,7 +511,7 @@
 	 * output from the "virtual" console.
 	 */
 	if (cmd == TIOCCONS && constty) {
-		error = suser(td);
+		error = priv_check(td, PRIV_TTY_CONSOLE);
 		if (error)
 			return (error);
 		constty = NULL;
@@ -597,7 +603,10 @@
 	STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) {
 		cn = cnd->cnd_cn;
 		if (!kdb_active || !(cn->cn_flags & CN_FLAG_NODEBUG)) {
-			c = cn->cn_checkc(cn);
+			if (cn->cn_checkc != NULL)
+				c = cn->cn_checkc(cn);
+			else
+				c = cn->cn_getc(cn);
 			if (c != -1) {
 				return (c);
 			}
@@ -636,22 +645,21 @@
 }
 
 void
-cndbctl(int on)
+cnputs(char *p)
 {
-	struct cn_device *cnd;
-	struct consdev *cn;
-	static int refcount;
+	int c;
+	int unlock_reqd = 0;
 
-	if (!on)
-		refcount--;
-	if (refcount == 0)
-		STAILQ_FOREACH(cnd, &cn_devlist, cnd_next) {
-			cn = cnd->cnd_cn;
-			if (cn->cn_dbctl != NULL)
-				cn->cn_dbctl(cn, on);
-		}
-	if (on)
-		refcount++;
+	if (use_cnputs_mtx) {
+		mtx_lock_spin(&cnputs_mtx);
+		unlock_reqd = 1;
+	}
+
+	while ((c = *p++) != '\0')
+		cnputc(c);
+
+	if (unlock_reqd)
+		mtx_unlock_spin(&cnputs_mtx);
 }
 
 static int consmsgbuf_size = 8192;
@@ -723,6 +731,9 @@
 {
 
 	make_dev(&cn_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600, "console");
+
+	mtx_init(&cnputs_mtx, "cnputs_mtx", NULL, MTX_SPIN | MTX_NOWITNESS);
+	use_cnputs_mtx = 1;
 }
 
 SYSINIT(cndev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE, cn_drvinit, NULL)
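The cnputs() routine added above emits a whole string under a spin mutex once cn_drvinit() has initialized it, so concurrent writers no longer interleave characters.  A hedged sketch of a caller (function and message are illustrative):

static void
announce_boot_stage(void)
{

	/* The whole string stays together even if another CPU prints too. */
	cnputs("example: entering single-user mode\n");
}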
Index: sysv_ipc.c
===================================================================
RCS file: /home/cvs/src/sys/kern/sysv_ipc.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/sysv_ipc.c -L sys/kern/sysv_ipc.c -u -r1.1.1.1 -r1.2
--- sys/kern/sysv_ipc.c
+++ sys/kern/sysv_ipc.c
@@ -1,8 +1,12 @@
 /*	$NetBSD: sysv_ipc.c,v 1.7 1994/06/29 06:33:11 cgd Exp $	*/
 /*-
  * Copyright (c) 1994 Herb Peyerl <hpeyerl at novatel.ca>
+ * Copyright (c) 2006 nCircle Network Security, Inc.
  * All rights reserved.
  *
+ * This software was developed by Robert N. M. Watson for the TrustedBSD
+ * Project under contract to nCircle Network Security, Inc.
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
@@ -30,7 +34,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/sysv_ipc.c,v 1.29 2005/01/06 23:35:39 imp Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/sysv_ipc.c,v 1.34 2007/06/12 00:11:59 rwatson Exp $");
 
 #include "opt_sysvipc.h"
 
@@ -39,6 +43,7 @@
 #include <sys/sem.h>
 #include <sys/shm.h>
 #include <sys/ipc.h>
+#include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/ucred.h>
 
@@ -72,50 +77,73 @@
  * Note: The MAC Framework does not require any modifications to the
  * ipcperm() function, as access control checks are performed throughout the
  * implementation of each primitive.  Those entry point calls complement the
- * ipcperm() discertionary checks.
+ * ipcperm() discretionary checks.  Unlike file system discretionary access
+ * control, the original creator of an object is given the same rights as the
+ * current owner.
  */
 int
-ipcperm(td, perm, mode)
-	struct thread *td;
-	struct ipc_perm *perm;
-	int mode;
+ipcperm(struct thread *td, struct ipc_perm *perm, int acc_mode)
 {
 	struct ucred *cred = td->td_ucred;
-	int error;
+	int error, obj_mode, dac_granted, priv_granted;
 
-	if (cred->cr_uid != perm->cuid && cred->cr_uid != perm->uid) {
-		/*
-		 * For a non-create/owner, we require privilege to
-		 * modify the object protections.  Note: some other
-		 * implementations permit IPC_M to be delegated to
-		 * unprivileged non-creator/owner uids/gids.
-		 */
-		if (mode & IPC_M) {
-			error = suser(td);
-			if (error)
-				return (error);
-		}
-		/*
-		 * Try to match against creator/owner group; if not, fall
-		 * back on other.
-		 */
-		mode >>= 3;
-		if (!groupmember(perm->gid, cred) &&
-		    !groupmember(perm->cgid, cred))
-			mode >>= 3;
+	dac_granted = 0;
+	if (cred->cr_uid == perm->cuid || cred->cr_uid == perm->uid) {
+		obj_mode = perm->mode;
+		dac_granted |= IPC_M;
+	} else if (groupmember(perm->gid, cred) ||
+	    groupmember(perm->cgid, cred)) {
+		obj_mode = perm->mode;
+		obj_mode <<= 3;
 	} else {
-		/*
-		 * Always permit the creator/owner to update the object
-		 * protections regardless of whether the object mode
-		 * permits it.
-		 */
-		if (mode & IPC_M)
-			return (0);
+		obj_mode = perm->mode;
+		obj_mode <<= 6;
+	}
+
+	/*
+	 * While the System V IPC permission model allows IPC_M to be
+	 * granted, as part of the mode, our implementation requires
+	 * privilege to administer the object if not the owner or creator.
+	 */
+#if 0
+	if (obj_mode & IPC_M)
+		dac_granted |= IPC_M;
+#endif
+	if (obj_mode & IPC_R)
+		dac_granted |= IPC_R;
+	if (obj_mode & IPC_W)
+		dac_granted |= IPC_W;
+
+	/*
+	 * Simple case: all required rights are granted by DAC.
+	 */
+	if ((dac_granted & acc_mode) == acc_mode)
+		return (0);
+
+	/*
+	 * Privilege is required to satisfy the request.
+	 */
+	priv_granted = 0;
+	if ((acc_mode & IPC_M) && !(dac_granted & IPC_M)) {
+		error = priv_check(td, PRIV_IPC_ADMIN);
+		if (error == 0)
+			priv_granted |= IPC_M;
 	}
 
-	if ((mode & perm->mode) != mode) {
-		if (suser(td) != 0)
-			return (EACCES);
+	if ((acc_mode & IPC_R) && !(dac_granted & IPC_R)) {
+		error = priv_check(td, PRIV_IPC_READ);
+		if (error == 0)
+			priv_granted |= IPC_R;
 	}
-	return (0);
+
+	if ((acc_mode & IPC_W) && !(dac_granted & IPC_W)) {
+		error = priv_check(td, PRIV_IPC_WRITE);
+		if (error == 0)
+			priv_granted |= IPC_W;
+	}
+
+	if (((dac_granted | priv_granted) & acc_mode) == acc_mode)
+		return (0);
+	else
+		return (EACCES);
 }
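A hedged sketch of how an IPC primitive consumes the reworked ipcperm() (the caller is illustrative; only the IPC_R/IPC_W/IPC_M access bits come from this code):

static int
example_msq_access_check(struct thread *td, struct ipc_perm *perm, int write)
{

	/*
	 * DAC rights are computed from the owner/group/other mode bits
	 * first; if they fall short, priv_check() may still grant
	 * PRIV_IPC_READ, PRIV_IPC_WRITE or PRIV_IPC_ADMIN.
	 */
	return (ipcperm(td, perm, write ? IPC_W : IPC_R));
}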
Index: Makefile
===================================================================
RCS file: /home/cvs/src/sys/kern/Makefile,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/Makefile -L sys/kern/Makefile -u -r1.1.1.1 -r1.2
--- sys/kern/Makefile
+++ sys/kern/Makefile
@@ -1,5 +1,5 @@
 #	@(#)Makefile	8.2 (Berkeley) 3/21/94
-# $FreeBSD: src/sys/kern/Makefile,v 1.11.12.1 2005/07/18 19:54:49 jhb Exp $
+# $FreeBSD: src/sys/kern/Makefile,v 1.14 2007/06/25 05:06:56 rafan Exp $
 
 # Makefile for kernel tags files, init_sysent, etc.
 
@@ -11,10 +11,11 @@
 sysent:  init_sysent.c syscalls.c ../sys/syscall.h ../sys/syscall.mk \
 ../sys/sysproto.h
 
-init_sysent.c syscalls.c ../sys/syscall.h \
+init_sysent.c syscalls.c systrace_args.c ../sys/syscall.h \
 ../sys/syscall.mk ../sys/sysproto.h: makesyscalls.sh syscalls.master
 	-mv -f init_sysent.c init_sysent.c.bak
 	-mv -f syscalls.c syscalls.c.bak
+	-mv -f systrace_args.c systrace_args.c.bak
 	-mv -f ../sys/syscall.h ../sys/syscall.h.bak
 	-mv -f ../sys/syscall.mk ../sys/syscall.mk.bak
 	-mv -f ../sys/sysproto.h ../sys/sysproto.h.bak
@@ -37,7 +38,7 @@
 	dev dev/scsi \
 	fs fs/deadfs fs/fdescfs fs/fifofs \
 	fs/lofs fs/nullfs fs/portalfs fs/procfs \
-	fs/specfs fs/umapfs fs/unionfs \
+	fs/specfs fs/unionfs \
 	hp hp/dev hp/hpux \
 	kern libkern \
 	net netinet nfs scripts sys \
--- /dev/null
+++ sys/kern/uipc_mqueue.c
@@ -0,0 +1,2481 @@
+/*-
+ * Copyright (c) 2005 David Xu <davidxu at freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+/*
+ * POSIX message queue implementation.
+ *
+ * 1) An mqueue filesystem can be mounted; each message queue then appears
+ *    in the mounted directory, where the user can change a queue's
+ *    permissions and ownership or remove a queue.  Manually creating a
+ *    file in the directory creates a message queue in the kernel with the
+ *    default attributes and the same name.  This method is discouraged,
+ *    since the mq_open syscall lets the user specify different attributes.
+ *    The file system can also be mounted multiple times at different
+ *    mount points, but every mount shows the same contents.
+ *
+ * 2) Standard POSIX message queue API.  The syscalls do not go through
+ *    the vfs layer but operate directly on the internal data structures,
+ *    so the IPC facility can be used without mounting the mqueue file
+ *    system.
+ */
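This file implements the kernel half of the standard API; a hedged userland sketch of the calls it backs (queue name, sizes and priority are illustrative):

#include <fcntl.h>
#include <mqueue.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	struct mq_attr attr = { .mq_maxmsg = 10, .mq_msgsize = 128 };
	char buf[128];
	mqd_t mq;

	mq = mq_open("/example", O_RDWR | O_CREAT, 0600, &attr);
	if (mq == (mqd_t)-1) {
		perror("mq_open");
		return (1);
	}
	mq_send(mq, "hello", strlen("hello"), 1);	/* priority 1 */
	mq_receive(mq, buf, sizeof(buf), NULL);
	mq_close(mq);
	mq_unlink("/example");
	return (0);
}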
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: src/sys/kern/uipc_mqueue.c,v 1.25 2007/06/12 00:11:59 rwatson Exp $");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/limits.h>
+#include <sys/buf.h>
+#include <sys/dirent.h>
+#include <sys/event.h>
+#include <sys/eventhandler.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/module.h>
+#include <sys/mount.h>
+#include <sys/mqueue.h>
+#include <sys/mutex.h>
+#include <sys/namei.h>
+#include <sys/posix4.h>
+#include <sys/poll.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/queue.h>
+#include <sys/sysproto.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysent.h>
+#include <sys/sx.h>
+#include <sys/sysctl.h>
+#include <sys/taskqueue.h>
+#include <sys/unistd.h>
+#include <sys/vnode.h>
+#include <machine/atomic.h>
+
+/*
+ * Limits and constants
+ */
+#define	MQFS_NAMELEN		NAME_MAX
+#define MQFS_DELEN		(8 + MQFS_NAMELEN)
+
+/* node types */
+typedef enum {
+	mqfstype_none = 0,
+	mqfstype_root,
+	mqfstype_dir,
+	mqfstype_this,
+	mqfstype_parent,
+	mqfstype_file,
+	mqfstype_symlink,
+} mqfs_type_t;
+
+struct mqfs_node;
+
+/*
+ * mqfs_info: describes a mqfs instance
+ */
+struct mqfs_info {
+	struct sx		mi_lock;
+	struct mqfs_node	*mi_root;
+	struct unrhdr		*mi_unrhdr;
+};
+
+struct mqfs_vdata {
+	LIST_ENTRY(mqfs_vdata)	mv_link;
+	struct mqfs_node	*mv_node;
+	struct vnode		*mv_vnode;
+	struct task		mv_task;
+};
+
+/*
+ * mqfs_node: describes a node (file or directory) within a mqfs
+ */
+struct mqfs_node {
+	char			mn_name[MQFS_NAMELEN+1];
+	struct mqfs_info	*mn_info;
+	struct mqfs_node	*mn_parent;
+	LIST_HEAD(,mqfs_node)	mn_children;
+	LIST_ENTRY(mqfs_node)	mn_sibling;
+	LIST_HEAD(,mqfs_vdata)	mn_vnodes;
+	int			mn_refcount;
+	mqfs_type_t		mn_type;
+	int			mn_deleted;
+	u_int32_t		mn_fileno;
+	void			*mn_data;
+	struct timespec		mn_birth;
+	struct timespec		mn_ctime;
+	struct timespec		mn_atime;
+	struct timespec		mn_mtime;
+	uid_t			mn_uid;
+	gid_t			mn_gid;
+	int			mn_mode;
+};
+
+#define	VTON(vp)	(((struct mqfs_vdata *)((vp)->v_data))->mv_node)
+#define VTOMQ(vp) 	((struct mqueue *)(VTON(vp)->mn_data))
+#define	VFSTOMQFS(m)	((struct mqfs_info *)((m)->mnt_data))
+#define	FPTOMQ(fp)	((struct mqueue *)(((struct mqfs_node *) \
+				(fp)->f_data)->mn_data))
+
+TAILQ_HEAD(msgq, mqueue_msg);
+
+struct mqueue;
+
+struct mqueue_notifier {
+	LIST_ENTRY(mqueue_notifier)	nt_link;
+	struct sigevent			nt_sigev;
+	ksiginfo_t			nt_ksi;
+	struct proc			*nt_proc;
+};
+
+struct mqueue {
+	struct mtx	mq_mutex;
+	int		mq_flags;
+	long		mq_maxmsg;
+	long		mq_msgsize;
+	long		mq_curmsgs;
+	long		mq_totalbytes;
+	struct msgq	mq_msgq;
+	int		mq_receivers;
+	int		mq_senders;
+	struct selinfo	mq_rsel;
+	struct selinfo	mq_wsel;
+	struct mqueue_notifier	*mq_notifier;
+};
+
+#define	MQ_RSEL		0x01
+#define	MQ_WSEL		0x02
+
+struct mqueue_msg {
+	TAILQ_ENTRY(mqueue_msg)	msg_link;
+	unsigned int	msg_prio;
+	unsigned int	msg_size;
+	/* following real data... */
+};
+
+SYSCTL_NODE(_kern, OID_AUTO, mqueue, CTLFLAG_RW, 0,
+	"POSIX real time message queue");
+
+static int	default_maxmsg  = 10;
+static int	default_msgsize = 1024;
+
+static int	maxmsg = 100;
+SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmsg, CTLFLAG_RW,
+    &maxmsg, 0, "Default maximum messages in queue");
+static int	maxmsgsize = 16384;
+SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmsgsize, CTLFLAG_RW,
+    &maxmsgsize, 0, "Default maximum message size");
+static int	maxmq = 100;
+SYSCTL_INT(_kern_mqueue, OID_AUTO, maxmq, CTLFLAG_RW,
+    &maxmq, 0, "maximum message queues");
+static int	curmq = 0;
+SYSCTL_INT(_kern_mqueue, OID_AUTO, curmq, CTLFLAG_RW,
+    &curmq, 0, "current message queue number");
+static int	unloadable = 0;
+static MALLOC_DEFINE(M_MQUEUEDATA, "mqdata", "mqueue data");
+
+static eventhandler_tag exit_tag;
+
+/* Only one instance per-system */
+static struct mqfs_info		mqfs_data;
+static uma_zone_t		mqnode_zone;
+static uma_zone_t		mqueue_zone;
+static uma_zone_t		mvdata_zone;
+static uma_zone_t		mqnoti_zone;
+static struct vop_vector	mqfs_vnodeops;
+static struct fileops		mqueueops;
+
+/*
+ * Directory structure construction and manipulation
+ */
+#ifdef notyet
+static struct mqfs_node	*mqfs_create_dir(struct mqfs_node *parent,
+	const char *name, int namelen, struct ucred *cred, int mode);
+static struct mqfs_node	*mqfs_create_link(struct mqfs_node *parent,
+	const char *name, int namelen, struct ucred *cred, int mode);
+#endif
+
+static struct mqfs_node	*mqfs_create_file(struct mqfs_node *parent,
+	const char *name, int namelen, struct ucred *cred, int mode);
+static int	mqfs_destroy(struct mqfs_node *mn);
+static void	mqfs_fileno_alloc(struct mqfs_info *mi, struct mqfs_node *mn);
+static void	mqfs_fileno_free(struct mqfs_info *mi, struct mqfs_node *mn);
+static int	mqfs_allocv(struct mount *mp, struct vnode **vpp, struct mqfs_node *pn);
+
+/*
+ * Message queue construction and manipulation
+ */
+static struct mqueue	*mqueue_alloc(const struct mq_attr *attr);
+static void	mqueue_free(struct mqueue *mq);
+static int	mqueue_send(struct mqueue *mq, const char *msg_ptr,
+			size_t msg_len, unsigned msg_prio, int waitok,
+			const struct timespec *abs_timeout);
+static int	mqueue_receive(struct mqueue *mq, char *msg_ptr,
+			size_t msg_len, unsigned *msg_prio, int waitok,
+			const struct timespec *abs_timeout);
+static int	_mqueue_send(struct mqueue *mq, struct mqueue_msg *msg,
+			int timo);
+static int	_mqueue_recv(struct mqueue *mq, struct mqueue_msg **msg,
+			int timo);
+static void	mqueue_send_notification(struct mqueue *mq);
+static void	mqueue_fdclose(struct thread *td, int fd, struct file *fp);
+static void	mq_proc_exit(void *arg, struct proc *p);
+
+/*
+ * kqueue filters
+ */
+static void	filt_mqdetach(struct knote *kn);
+static int	filt_mqread(struct knote *kn, long hint);
+static int	filt_mqwrite(struct knote *kn, long hint);
+
+struct filterops mq_rfiltops =
+	{ 1, NULL, filt_mqdetach, filt_mqread };
+struct filterops mq_wfiltops =
+	{ 1, NULL, filt_mqdetach, filt_mqwrite };
+
+/*
+ * Initialize fileno bitmap
+ */
+static void
+mqfs_fileno_init(struct mqfs_info *mi)
+{
+	struct unrhdr *up;
+
+	up = new_unrhdr(1, INT_MAX, NULL);
+	mi->mi_unrhdr = up;
+}
+
+/*
+ * Tear down fileno bitmap
+ */
+static void
+mqfs_fileno_uninit(struct mqfs_info *mi)
+{
+	struct unrhdr *up;
+
+	up = mi->mi_unrhdr;
+	mi->mi_unrhdr = NULL;
+	delete_unrhdr(up);
+}
+
+/*
+ * Allocate a file number
+ */
+static void
+mqfs_fileno_alloc(struct mqfs_info *mi, struct mqfs_node *mn)
+{
+	/* make sure our parent has a file number */
+	if (mn->mn_parent && !mn->mn_parent->mn_fileno)
+		mqfs_fileno_alloc(mi, mn->mn_parent);
+
+	switch (mn->mn_type) {
+	case mqfstype_root:
+	case mqfstype_dir:
+	case mqfstype_file:
+	case mqfstype_symlink:
+		mn->mn_fileno = alloc_unr(mi->mi_unrhdr);
+		break;
+	case mqfstype_this:
+		KASSERT(mn->mn_parent != NULL,
+		    ("mqfstype_this node has no parent"));
+		mn->mn_fileno = mn->mn_parent->mn_fileno;
+		break;
+	case mqfstype_parent:
+		KASSERT(mn->mn_parent != NULL,
+		    ("mqfstype_parent node has no parent"));
+		if (mn->mn_parent == mi->mi_root) {
+			mn->mn_fileno = mn->mn_parent->mn_fileno;
+			break;
+		}
+		KASSERT(mn->mn_parent->mn_parent != NULL,
+		    ("mqfstype_parent node has no grandparent"));
+		mn->mn_fileno = mn->mn_parent->mn_parent->mn_fileno;
+		break;
+	default:
+		KASSERT(0,
+		    ("mqfs_fileno_alloc() called for unknown type node: %d",
+			mn->mn_type));
+		break;
+	}
+}
+
+/*
+ * Release a file number
+ */
+static void
+mqfs_fileno_free(struct mqfs_info *mi, struct mqfs_node *mn)
+{
+	switch (mn->mn_type) {
+	case mqfstype_root:
+	case mqfstype_dir:
+	case mqfstype_file:
+	case mqfstype_symlink:
+		free_unr(mi->mi_unrhdr, mn->mn_fileno);
+		break;
+	case mqfstype_this:
+	case mqfstype_parent:
+		/* ignore these, as they don't "own" their file number */
+		break;
+	default:
+		KASSERT(0,
+		    ("mqfs_fileno_free() called for unknown type node: %d", 
+			mn->mn_type));
+		break;
+	}
+}
+
+static __inline struct mqfs_node *
+mqnode_alloc(void)
+{
+	return uma_zalloc(mqnode_zone, M_WAITOK | M_ZERO);
+}
+
+static __inline void
+mqnode_free(struct mqfs_node *node)
+{
+	uma_zfree(mqnode_zone, node);
+}
+
+static __inline void
+mqnode_addref(struct mqfs_node *node)
+{
+	atomic_fetchadd_int(&node->mn_refcount, 1);
+}
+
+static __inline void
+mqnode_release(struct mqfs_node *node)
+{
+	int old, exp;
+
+	old = atomic_fetchadd_int(&node->mn_refcount, -1);
+	if (node->mn_type == mqfstype_dir ||
+	    node->mn_type == mqfstype_root)
+		exp = 3; /* include . and .. */
+	else
+		exp = 1;
+	if (old == exp)
+		mqfs_destroy(node);
+}
+
+/*
+ * Add a node to a directory
+ */
+static int
+mqfs_add_node(struct mqfs_node *parent, struct mqfs_node *node)
+{
+	KASSERT(parent != NULL, ("%s(): parent is NULL", __func__));
+	KASSERT(parent->mn_info != NULL,
+	    ("%s(): parent has no mn_info", __func__));
+	KASSERT(parent->mn_type == mqfstype_dir ||
+	    parent->mn_type == mqfstype_root,
+	    ("%s(): parent is not a directory", __func__));
+
+	node->mn_info = parent->mn_info;
+	node->mn_parent = parent;
+	LIST_INIT(&node->mn_children);
+	LIST_INIT(&node->mn_vnodes);
+	LIST_INSERT_HEAD(&parent->mn_children, node, mn_sibling);
+	mqnode_addref(parent);
+	return (0);
+}
+
+static struct mqfs_node *
+mqfs_create_node(const char *name, int namelen, struct ucred *cred, int mode,
+	int nodetype)
+{
+	struct mqfs_node *node;
+
+	node = mqnode_alloc();
+	strncpy(node->mn_name, name, namelen);
+	node->mn_type = nodetype;
+	node->mn_refcount = 1;
+	getnanotime(&node->mn_birth);
+	node->mn_ctime = node->mn_atime = node->mn_mtime
+		= node->mn_birth;
+	node->mn_uid = cred->cr_uid;
+	node->mn_gid = cred->cr_gid;
+	node->mn_mode = mode;
+	return (node);
+}
+
+/*
+ * Create a file
+ */
+static struct mqfs_node *
+mqfs_create_file(struct mqfs_node *parent, const char *name, int namelen,
+	struct ucred *cred, int mode)
+{
+	struct mqfs_node *node;
+
+	node = mqfs_create_node(name, namelen, cred, mode, mqfstype_file);
+	if (mqfs_add_node(parent, node) != 0) {
+		mqnode_free(node);
+		return (NULL);
+	}
+	return (node);
+}
+
+/*
+ * Add . and .. to a directory
+ */
+static int
+mqfs_fixup_dir(struct mqfs_node *parent)
+{
+	struct mqfs_node *dir;
+
+	dir = mqnode_alloc();
+	dir->mn_name[0] = '.';
+	dir->mn_type = mqfstype_this;
+	dir->mn_refcount = 1;
+	if (mqfs_add_node(parent, dir) != 0) {
+		mqnode_free(dir);
+		return (-1);
+	}
+
+	dir = mqnode_alloc();
+	dir->mn_name[0] = dir->mn_name[1] = '.';
+	dir->mn_type = mqfstype_parent;
+	dir->mn_refcount = 1;
+
+	if (mqfs_add_node(parent, dir) != 0) {
+		mqnode_free(dir);
+		return (-1);
+	}
+
+	return (0);
+}
+
+#ifdef notyet
+
+/*
+ * Create a directory
+ */
+static struct mqfs_node *
+mqfs_create_dir(struct mqfs_node *parent, const char *name, int namelen,
+	struct ucred *cred, int mode)
+{
+	struct mqfs_node *node;
+
+	node = mqfs_create_node(name, namelen, cred, mode, mqfstype_dir);
+	if (mqfs_add_node(parent, node) != 0) {
+		mqnode_free(node);
+		return (NULL);
+	}
+
+	if (mqfs_fixup_dir(node) != 0) {
+		mqfs_destroy(node);
+		return (NULL);
+	}
+	return (node);
+}
+
+/*
+ * Create a symlink
+ */
+static struct mqfs_node *
+mqfs_create_link(struct mqfs_node *parent, const char *name, int namelen,
+	struct ucred *cred, int mode)
+{
+	struct mqfs_node *node;
+
+	node = mqfs_create_node(name, namelen, cred, mode, mqfstype_symlink);
+	if (mqfs_add_node(parent, node) != 0) {
+		mqnode_free(node);
+		return (NULL);
+	}
+	return (node);
+}
+
+#endif
+
+/*
+ * Destroy a node or a tree of nodes
+ */
+static int
+mqfs_destroy(struct mqfs_node *node)
+{
+	struct mqfs_node *parent;
+
+	KASSERT(node != NULL,
+	    ("%s(): node is NULL", __func__));
+	KASSERT(node->mn_info != NULL,
+	    ("%s(): node has no mn_info", __func__));
+
+	/* destroy children */
+	if (node->mn_type == mqfstype_dir || node->mn_type == mqfstype_root)
+		while (! LIST_EMPTY(&node->mn_children))
+			mqfs_destroy(LIST_FIRST(&node->mn_children));
+
+	/* unlink from parent */
+	if ((parent = node->mn_parent) != NULL) {
+		KASSERT(parent->mn_info == node->mn_info,
+		    ("%s(): parent has different mn_info", __func__));
+		LIST_REMOVE(node, mn_sibling);
+	}
+
+	if (node->mn_fileno != 0)
+		mqfs_fileno_free(node->mn_info, node);
+	if (node->mn_data != NULL)
+		mqueue_free(node->mn_data);
+	mqnode_free(node);
+	return (0);
+}
+
+/*
+ * Mount a mqfs instance
+ */
+static int
+mqfs_mount(struct mount *mp, struct thread *td)
+{
+	struct statfs *sbp;
+
+	if (mp->mnt_flag & MNT_UPDATE)
+		return (EOPNOTSUPP);
+
+	mp->mnt_data = &mqfs_data;
+	MNT_ILOCK(mp);
+	mp->mnt_flag |= MNT_LOCAL;
+	mp->mnt_kern_flag |= MNTK_MPSAFE;
+	MNT_IUNLOCK(mp);
+	vfs_getnewfsid(mp);
+
+	sbp = &mp->mnt_stat;
+	vfs_mountedfrom(mp, "mqueue");
+	sbp->f_bsize = PAGE_SIZE;
+	sbp->f_iosize = PAGE_SIZE;
+	sbp->f_blocks = 1;
+	sbp->f_bfree = 0;
+	sbp->f_bavail = 0;
+	sbp->f_files = 1;
+	sbp->f_ffree = 0;
+	return (0);
+}
+
+/*
+ * Unmount a mqfs instance
+ */
+static int
+mqfs_unmount(struct mount *mp, int mntflags, struct thread *td)
+{
+	int error;
+
+	error = vflush(mp, 0, (mntflags & MNT_FORCE) ?  FORCECLOSE : 0, td);
+	return (error);
+}
+
+/*
+ * Return a root vnode
+ */
+static int
+mqfs_root(struct mount *mp, int flags, struct vnode **vpp, struct thread *td)
+{
+	struct mqfs_info *mqfs;
+	int ret;
+
+	mqfs = VFSTOMQFS(mp);
+	sx_xlock(&mqfs->mi_lock);
+	ret = mqfs_allocv(mp, vpp, mqfs->mi_root);
+	sx_xunlock(&mqfs->mi_lock);
+	return (ret);
+}
+
+/*
+ * Return filesystem stats
+ */
+static int
+mqfs_statfs(struct mount *mp, struct statfs *sbp, struct thread *td)
+{
+	/* XXX update statistics */
+	return (0);
+}
+
+/*
+ * Initialize a mqfs instance
+ */
+static int
+mqfs_init(struct vfsconf *vfc)
+{
+	struct mqfs_node *root;
+	struct mqfs_info *mi;
+
+	mqnode_zone = uma_zcreate("mqnode", sizeof(struct mqfs_node),
+		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+	mqueue_zone = uma_zcreate("mqueue", sizeof(struct mqueue),
+		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+	mvdata_zone = uma_zcreate("mvdata",
+		sizeof(struct mqfs_vdata), NULL, NULL, NULL,
+		NULL, UMA_ALIGN_PTR, 0);
+	mqnoti_zone = uma_zcreate("mqnotifier", sizeof(struct mqueue_notifier),
+		NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
+	mi = &mqfs_data;
+	sx_init(&mi->mi_lock, "mqfs lock");
+	/* set up the root directory */
+	root = mqfs_create_node("/", 1, curthread->td_ucred, 01777,
+		mqfstype_root);
+	root->mn_info = mi;
+	LIST_INIT(&root->mn_children);
+	LIST_INIT(&root->mn_vnodes);
+	mi->mi_root = root;
+	mqfs_fileno_init(mi);
+	mqfs_fileno_alloc(mi, root);
+	mqfs_fixup_dir(root);
+	exit_tag = EVENTHANDLER_REGISTER(process_exit, mq_proc_exit, NULL,
+	    EVENTHANDLER_PRI_ANY);
+	mq_fdclose = mqueue_fdclose;
+	p31b_setcfg(CTL_P1003_1B_MESSAGE_PASSING, _POSIX_MESSAGE_PASSING);
+	return (0);
+}
+
+/*
+ * Destroy a mqfs instance
+ */
+static int
+mqfs_uninit(struct vfsconf *vfc)
+{
+	struct mqfs_info *mi;
+
+	if (!unloadable)
+		return (EOPNOTSUPP);
+	EVENTHANDLER_DEREGISTER(process_exit, exit_tag);
+	mi = &mqfs_data;
+	mqfs_destroy(mi->mi_root);
+	mi->mi_root = NULL;
+	mqfs_fileno_uninit(mi);
+	sx_destroy(&mi->mi_lock);
+	uma_zdestroy(mqnode_zone);
+	uma_zdestroy(mqueue_zone);
+	uma_zdestroy(mvdata_zone);
+	uma_zdestroy(mqnoti_zone);
+	return (0);
+}
+
+/*
+ * task routine
+ */
+static void
+do_recycle(void *context, int pending __unused)
+{
+	struct vnode *vp = (struct vnode *)context;
+
+	vrecycle(vp, curthread);
+	vdrop(vp);
+}
+
+/*
+ * Allocate a vnode
+ */
+static int
+mqfs_allocv(struct mount *mp, struct vnode **vpp, struct mqfs_node *pn)
+{
+	struct mqfs_vdata *vd;
+	int error;
+
+	LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) {
+		if (vd->mv_vnode->v_mount == mp)
+			break;
+	}
+
+	if (vd != NULL) {
+		if (vget(vd->mv_vnode, 0, curthread) == 0) {
+			*vpp = vd->mv_vnode;
+			vn_lock(*vpp, LK_RETRY | LK_EXCLUSIVE,
+			    curthread);
+			return (0);
+		}
+		/* XXX if this can happen, we're in trouble */
+	}
+
+	error = getnewvnode("mqueue", mp, &mqfs_vnodeops, vpp);
+	if (error)
+		return (error);
+	vn_lock(*vpp, LK_EXCLUSIVE | LK_RETRY, curthread);
+	error = insmntque(*vpp, mp);
+	if (error != 0) {
+		*vpp = NULLVP;
+		return (error);
+	}
+	vd = uma_zalloc(mvdata_zone, M_WAITOK);
+	(*vpp)->v_data = vd;
+	vd->mv_vnode = *vpp;
+	vd->mv_node = pn;
+	TASK_INIT(&vd->mv_task, 0, do_recycle, *vpp);
+	LIST_INSERT_HEAD(&pn->mn_vnodes, vd, mv_link);
+	mqnode_addref(pn);
+	switch (pn->mn_type) {
+	case mqfstype_root:
+		(*vpp)->v_vflag = VV_ROOT;
+		/* fall through */
+	case mqfstype_dir:
+	case mqfstype_this:
+	case mqfstype_parent:
+		(*vpp)->v_type = VDIR;
+		break;
+	case mqfstype_file:
+		(*vpp)->v_type = VREG;
+		break;
+	case mqfstype_symlink:
+		(*vpp)->v_type = VLNK;
+		break;
+	case mqfstype_none:
+		KASSERT(0, ("mqfs_allocf called for null node\n"));
+	default:
+		panic("%s has unexpected type: %d", pn->mn_name, pn->mn_type);
+	}
+	return (0);
+}
+
+/* 
+ * Search for a directory entry
+ */
+static struct mqfs_node *
+mqfs_search(struct mqfs_node *pd, const char *name, int len)
+{
+	struct mqfs_node *pn;
+
+	LIST_FOREACH(pn, &pd->mn_children, mn_sibling) {
+		if (strncmp(pn->mn_name, name, len) == 0)
+			return (pn);
+	}
+	return (NULL);
+}
+
+/*
+ * Look up a file or directory.
+ */
+static int
+mqfs_lookupx(struct vop_cachedlookup_args *ap)
+{
+	struct componentname *cnp;
+	struct vnode *dvp, **vpp;
+	struct mqfs_node *pd;
+	struct mqfs_node *pn;
+	int nameiop, flags, error, namelen;
+	char *pname;
+	struct thread *td;
+
+	cnp = ap->a_cnp;
+	vpp = ap->a_vpp;
+	dvp = ap->a_dvp;
+	pname = cnp->cn_nameptr;
+	namelen = cnp->cn_namelen;
+	td = cnp->cn_thread;
+	flags = cnp->cn_flags;
+	nameiop = cnp->cn_nameiop;
+	pd = VTON(dvp);
+	pn = NULL;
+	*vpp = NULLVP;
+
+	if (dvp->v_type != VDIR)
+		return (ENOTDIR);
+
+	error = VOP_ACCESS(dvp, VEXEC, cnp->cn_cred, cnp->cn_thread);
+	if (error)
+		return (error);
+
+	/* shortcut: check if the name is too long */
+	if (cnp->cn_namelen >= MQFS_NAMELEN)
+		return (ENOENT);
+
+	/* self */
+	if (namelen == 1 && pname[0] == '.') {
+		if ((flags & ISLASTCN) && nameiop != LOOKUP)
+			return (EINVAL);
+		pn = pd;
+		*vpp = dvp;
+		VREF(dvp);
+		return (0);
+	}
+
+	/* parent */
+	if (cnp->cn_flags & ISDOTDOT) {
+		if (dvp->v_vflag & VV_ROOT)
+			return (EIO);
+		if ((flags & ISLASTCN) && nameiop != LOOKUP)
+			return (EINVAL);
+		VOP_UNLOCK(dvp, 0, cnp->cn_thread);
+		KASSERT(pd->mn_parent, ("non-root directory has no parent"));
+		pn = pd->mn_parent;
+		error = mqfs_allocv(dvp->v_mount, vpp, pn);
+		vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, td);
+		return (error);
+	}
+
+	/* named node */
+	pn = mqfs_search(pd, pname, namelen);
+	
+	/* found */
+	if (pn != NULL) {
+		/* DELETE */
+		if (nameiop == DELETE && (flags & ISLASTCN)) {
+			error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td);
+			if (error)
+				return (error);
+			if (*vpp == dvp) {
+				VREF(dvp);
+				*vpp = dvp;
+				return (0);
+			}
+		}
+
+		/* allocate vnode */
+		error = mqfs_allocv(dvp->v_mount, vpp, pn);
+		if (error == 0 && cnp->cn_flags & MAKEENTRY)
+			cache_enter(dvp, *vpp, cnp);
+		return (error);
+	}
+	
+	/* not found */
+
+	/* will create a new entry in the directory ? */
+	if ((nameiop == CREATE || nameiop == RENAME) && (flags & LOCKPARENT)
+	    && (flags & ISLASTCN)) {
+		error = VOP_ACCESS(dvp, VWRITE, cnp->cn_cred, td);
+		if (error)
+			return (error);
+		cnp->cn_flags |= SAVENAME;
+		return (EJUSTRETURN);
+	}
+	return (ENOENT);
+}
+
+#if 0
+struct vop_lookup_args {
+	struct vop_generic_args a_gen;
+	struct vnode *a_dvp;
+	struct vnode **a_vpp;
+	struct componentname *a_cnp;
+};
+#endif
+
+/*
+ * vnode lookup operation
+ */
+static int
+mqfs_lookup(struct vop_cachedlookup_args *ap)
+{
+	struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount);
+	int rc;
+
+	sx_xlock(&mqfs->mi_lock);
+	rc = mqfs_lookupx(ap);
+	sx_xunlock(&mqfs->mi_lock);
+	return (rc);
+}
+
+#if 0
+struct vop_create_args {
+	struct vnode *a_dvp;
+	struct vnode **a_vpp;
+	struct componentname *a_cnp;
+	struct vattr *a_vap;
+};
+#endif
+
+/*
+ * vnode creation operation
+ */
+static int
+mqfs_create(struct vop_create_args *ap)
+{
+	struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount);
+	struct componentname *cnp = ap->a_cnp;
+	struct mqfs_node *pd;
+	struct mqfs_node *pn;
+	struct mqueue *mq;
+	int error;
+
+	pd = VTON(ap->a_dvp);
+	if (pd->mn_type != mqfstype_root && pd->mn_type != mqfstype_dir)
+		return (ENOTDIR);
+	mq = mqueue_alloc(NULL);
+	if (mq == NULL)
+		return (EAGAIN);
+	sx_xlock(&mqfs->mi_lock);
+#if 0
+	/* named node */
+	pn = mqfs_search(pd, cnp->cn_nameptr, cnp->cn_namelen);
+	if (pn != NULL) {
+		mqueue_free(mq);
+		sx_xunlock(&mqfs->mi_lock);
+		return (EEXIST);
+	}
+#else
+	if ((cnp->cn_flags & HASBUF) == 0)
+		panic("%s: no name", __func__);
+#endif
+	pn = mqfs_create_file(pd, cnp->cn_nameptr, cnp->cn_namelen,
+		cnp->cn_cred, ap->a_vap->va_mode);
+	if (pn == NULL)
+		error = ENOSPC;
+	else {
+		error = mqfs_allocv(ap->a_dvp->v_mount, ap->a_vpp, pn);
+		if (error)
+			mqfs_destroy(pn);
+		else
+			pn->mn_data = mq;
+	}
+	sx_xunlock(&mqfs->mi_lock);
+	if (error)
+		mqueue_free(mq);
+	return (error);
+}
+
+/*
+ * Remove an entry
+ */
+static
+int do_unlink(struct mqfs_node *pn, struct ucred *ucred)
+{
+	struct mqfs_node *parent;
+	struct mqfs_vdata *vd;
+	int error = 0;
+
+	sx_assert(&pn->mn_info->mi_lock, SX_LOCKED);
+
+	if (ucred->cr_uid != pn->mn_uid &&
+	    (error = priv_check_cred(ucred, PRIV_MQ_ADMIN, 0)) != 0)
+		error = EACCES;
+	else if (!pn->mn_deleted) {
+		parent = pn->mn_parent;
+		pn->mn_parent = NULL;
+		pn->mn_deleted = 1;
+		LIST_REMOVE(pn, mn_sibling);
+		LIST_FOREACH(vd, &pn->mn_vnodes, mv_link) {
+			cache_purge(vd->mv_vnode);
+			vhold(vd->mv_vnode);
+			taskqueue_enqueue(taskqueue_thread, &vd->mv_task);
+		}
+		mqnode_release(pn);
+		mqnode_release(parent);
+	} else
+		error = ENOENT;
+	return (error);
+}
+
+#if 0
+struct vop_remove_args {
+	struct vnode *a_dvp;
+	struct vnode *a_vp;
+	struct componentname *a_cnp;
+};
+#endif
+
+/*
+ * vnode removal operation
+ */
+static int
+mqfs_remove(struct vop_remove_args *ap)
+{
+	struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount);
+	struct mqfs_node *pn;
+	int error;
+
+	if (ap->a_vp->v_type == VDIR)
+                return (EPERM);
+	pn = VTON(ap->a_vp);
+	sx_xlock(&mqfs->mi_lock);
+	error = do_unlink(pn, ap->a_cnp->cn_cred);
+	sx_xunlock(&mqfs->mi_lock);
+	return (error);
+}
+
+#if 0
+struct vop_inactive_args {
+	struct vnode *a_vp;
+	struct thread *a_td;
+};
+#endif
+
+static int
+mqfs_inactive(struct vop_inactive_args *ap)
+{
+	struct mqfs_node *pn = VTON(ap->a_vp);
+
+	if (pn->mn_deleted)
+		vrecycle(ap->a_vp, ap->a_td);
+	return (0);
+}
+
+#if 0
+struct vop_reclaim_args {
+	struct vop_generic_args a_gen;
+	struct vnode *a_vp;
+	struct thread *a_td;
+};
+#endif
+
+static int
+mqfs_reclaim(struct vop_reclaim_args *ap)
+{
+	struct mqfs_info *mqfs = VFSTOMQFS(ap->a_vp->v_mount);
+	struct vnode *vp = ap->a_vp;
+	struct mqfs_node *pn;
+	struct mqfs_vdata *vd;
+
+	vd = vp->v_data;
+	pn = vd->mv_node;
+	sx_xlock(&mqfs->mi_lock);
+	vp->v_data = NULL;
+	LIST_REMOVE(vd, mv_link);
+	uma_zfree(mvdata_zone, vd);
+	mqnode_release(pn);
+	sx_xunlock(&mqfs->mi_lock);
+	return (0);
+}
+
+#if 0
+struct vop_open_args {
+	struct vop_generic_args a_gen;
+	struct vnode *a_vp;
+	int a_mode;
+	struct ucred *a_cred;
+	struct thread *a_td;
+	int a_fdidx;
+};
+#endif
+
+static int
+mqfs_open(struct vop_open_args *ap)
+{
+	return (0);
+}
+
+#if 0
+struct vop_close_args {
+	struct vop_generic_args a_gen;
+	struct vnode *a_vp;
+	int a_fflag;
+	struct ucred *a_cred;
+	struct thread *a_td;
+};
+#endif
+
+static int
+mqfs_close(struct vop_close_args *ap)
+{
+	return (0);
+}
+
+#if 0
+struct vop_access_args {
+	struct vop_generic_args a_gen;
+	struct vnode *a_vp;
+	int a_mode;
+	struct ucred *a_cred;
+	struct thread *a_td;
+};
+#endif
+
+/*
+ * Verify permissions
+ */
+static int
+mqfs_access(struct vop_access_args *ap)
+{
+	struct vnode *vp = ap->a_vp;
+	struct vattr vattr;
+	int error;
+
+	error = VOP_GETATTR(vp, &vattr, ap->a_cred, ap->a_td);
+	if (error)
+		return (error);
+	error = vaccess(vp->v_type, vattr.va_mode, vattr.va_uid,
+	    vattr.va_gid, ap->a_mode, ap->a_cred, NULL);
+	return (error);
+}
+
+#if 0
+struct vop_getattr_args {
+	struct vop_generic_args a_gen;
+	struct vnode *a_vp;
+	struct vattr *a_vap;
+	struct ucred *a_cred;
+	struct thread *a_td;
+};
+#endif
+
+/*
+ * Get file attributes
+ */
+static int
+mqfs_getattr(struct vop_getattr_args *ap)
+{
+	struct vnode *vp = ap->a_vp;
+	struct mqfs_node *pn = VTON(vp);
+	struct vattr *vap = ap->a_vap;
+	int error = 0;
+
+	VATTR_NULL(vap);
+	vap->va_type = vp->v_type;
+	vap->va_mode = pn->mn_mode;
+	vap->va_nlink = 1;
+	vap->va_uid = pn->mn_uid;
+	vap->va_gid = pn->mn_gid;
+	vap->va_fsid = vp->v_mount->mnt_stat.f_fsid.val[0];
+	vap->va_fileid = pn->mn_fileno;
+	vap->va_size = 0;
+	vap->va_blocksize = PAGE_SIZE;
+	vap->va_bytes = vap->va_size = 0;
+	vap->va_atime = pn->mn_atime;
+	vap->va_mtime = pn->mn_mtime;
+	vap->va_ctime = pn->mn_ctime;
+	vap->va_birthtime = pn->mn_birth;
+	vap->va_gen = 0;
+	vap->va_flags = 0;
+	vap->va_rdev = 0;
+	vap->va_bytes = 0;
+	vap->va_filerev = 0;
+	vap->va_vaflags = 0;
+	return (error);
+}
+
+#if 0
+struct vop_setattr_args {
+	struct vop_generic_args a_gen;
+	struct vnode *a_vp;
+	struct vattr *a_vap;
+	struct ucred *a_cred;
+	struct thread *a_td;
+};
+#endif
+/*
+ * Set attributes
+ */
+static int
+mqfs_setattr(struct vop_setattr_args *ap)
+{
+	struct mqfs_node *pn;
+	struct vattr *vap;
+	struct vnode *vp;
+	int c, error;
+	uid_t uid;
+	gid_t gid;
+
+	vap = ap->a_vap;
+	vp = ap->a_vp;
+	if ((vap->va_type != VNON) ||
+	    (vap->va_nlink != VNOVAL) ||
+	    (vap->va_fsid != VNOVAL) ||
+	    (vap->va_fileid != VNOVAL) ||
+	    (vap->va_blocksize != VNOVAL) ||
+	    (vap->va_flags != VNOVAL && vap->va_flags != 0) ||
+	    (vap->va_rdev != VNOVAL) ||
+	    ((int)vap->va_bytes != VNOVAL) ||
+	    (vap->va_gen != VNOVAL)) {
+		return (EINVAL);
+	}
+
+	pn = VTON(vp);
+
+	error = c = 0;
+	if (vap->va_uid == (uid_t)VNOVAL)
+		uid = pn->mn_uid;
+	else
+		uid = vap->va_uid;
+	if (vap->va_gid == (gid_t)VNOVAL)
+		gid = pn->mn_gid;
+	else
+		gid = vap->va_gid;
+
+	if (uid != pn->mn_uid || gid != pn->mn_gid) {
+		/*
+		 * To modify the ownership of a file, must possess VADMIN
+		 * for that file.
+		 */
+		if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, ap->a_td)))
+			return (error);
+
+		/*
+		 * XXXRW: Why is there a privilege check here: shouldn't the
+		 * check in VOP_ACCESS() be enough?  Also, are the group bits
+		 * below definitely right?
+		 */
+		if (((ap->a_cred->cr_uid != pn->mn_uid) || uid != pn->mn_uid ||
+		    (gid != pn->mn_gid && !groupmember(gid, ap->a_cred))) &&
+		    (error = priv_check(ap->a_td, PRIV_MQ_ADMIN)) != 0)
+			return (error);
+		pn->mn_uid = uid;
+		pn->mn_gid = gid;
+		c = 1;
+	}
+
+	if (vap->va_mode != (mode_t)VNOVAL) {
+		if ((ap->a_cred->cr_uid != pn->mn_uid) &&
+		    (error = priv_check(ap->a_td, PRIV_MQ_ADMIN)))
+			return (error);
+		pn->mn_mode = vap->va_mode;
+		c = 1;
+	}
+
+	if (vap->va_atime.tv_sec != VNOVAL || vap->va_mtime.tv_sec != VNOVAL) {
+		/* See the comment in ufs_vnops::ufs_setattr(). */
+		if ((error = VOP_ACCESS(vp, VADMIN, ap->a_cred, ap->a_td)) &&
+		    ((vap->va_vaflags & VA_UTIMES_NULL) == 0 ||
+		    (error = VOP_ACCESS(vp, VWRITE, ap->a_cred, ap->a_td))))
+			return (error);
+		if (vap->va_atime.tv_sec != VNOVAL) {
+			pn->mn_atime = vap->va_atime;
+		}
+		if (vap->va_mtime.tv_sec != VNOVAL) {
+			pn->mn_mtime = vap->va_mtime;
+		}
+		c = 1;
+	}
+	if (c) {
+		vfs_timestamp(&pn->mn_ctime);
+	}
+	return (0);
+}
+
+#if 0
+struct vop_read_args {
+	struct vop_generic_args a_gen;
+	struct vnode *a_vp;
+	struct uio *a_uio;
+	int a_ioflag;
+	struct ucred *a_cred;
+};
+#endif
+
+/*
+ * Read from a file
+ */
+static int
+mqfs_read(struct vop_read_args *ap)
+{
+	char buf[80];
+	struct vnode *vp = ap->a_vp;
+	struct uio *uio = ap->a_uio;
+	struct mqfs_node *pn;
+	struct mqueue *mq;
+	int len, error;
+
+	if (vp->v_type != VREG)
+		return (EINVAL);
+
+	pn = VTON(vp);
+	mq = VTOMQ(vp);
+	snprintf(buf, sizeof(buf),
+		"QSIZE:%-10ld MAXMSG:%-10ld CURMSG:%-10ld MSGSIZE:%-10ld\n",
+		mq->mq_totalbytes,
+		mq->mq_maxmsg,
+		mq->mq_curmsgs,
+		mq->mq_msgsize);
+	buf[sizeof(buf)-1] = '\0';
+	len = strlen(buf);
+	error = uiomove_frombuf(buf, len, uio);
+	return (error);
+}
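Reading a queue node through the mounted filesystem therefore returns a one-line status summary; a hedged illustration of the output produced by the snprintf() above (values made up, whitespace approximate):

/*
 * $ cat /mnt/mqueue/example
 * QSIZE:640        MAXMSG:10         CURMSG:5          MSGSIZE:128
 */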
+
+#if 0
+struct vop_readdir_args {
+	struct vop_generic_args a_gen;
+	struct vnode *a_vp;
+	struct uio *a_uio;
+	struct ucred *a_cred;
+	int *a_eofflag;
+	int *a_ncookies;
+	u_long **a_cookies;
+};
+#endif
+
+/*
+ * Return directory entries.
+ */
+static int
+mqfs_readdir(struct vop_readdir_args *ap)
+{
+	struct vnode *vp;
+	struct mqfs_info *mi;
+	struct mqfs_node *pd;
+	struct mqfs_node *pn;
+	struct dirent entry;
+	struct uio *uio;
+	int *tmp_ncookies = NULL;
+	off_t offset;
+	int error, i;
+
+	vp = ap->a_vp;
+	mi = VFSTOMQFS(vp->v_mount);
+	pd = VTON(vp);
+	uio = ap->a_uio;
+
+	if (vp->v_type != VDIR)
+		return (ENOTDIR);
+
+	if (uio->uio_offset < 0)
+		return (EINVAL);
+
+	if (ap->a_ncookies != NULL) {
+		tmp_ncookies = ap->a_ncookies;
+		*ap->a_ncookies = 0;
+		ap->a_ncookies = NULL;
+        }
+
+	error = 0;
+	offset = 0;
+
+	sx_xlock(&mi->mi_lock);
+
+	LIST_FOREACH(pn, &pd->mn_children, mn_sibling) {
+		entry.d_reclen = sizeof(entry);
+		if (!pn->mn_fileno)
+			mqfs_fileno_alloc(mi, pn);
+		entry.d_fileno = pn->mn_fileno;
+		for (i = 0; i < MQFS_NAMELEN - 1 && pn->mn_name[i] != '\0'; ++i)
+			entry.d_name[i] = pn->mn_name[i];
+		entry.d_name[i] = 0;
+		entry.d_namlen = i;
+		switch (pn->mn_type) {
+		case mqfstype_root:
+		case mqfstype_dir:
+		case mqfstype_this:
+		case mqfstype_parent:
+			entry.d_type = DT_DIR;
+			break;
+		case mqfstype_file:
+			entry.d_type = DT_REG;
+			break;
+		case mqfstype_symlink:
+			entry.d_type = DT_LNK;
+			break;
+		default:
+			panic("%s has unexpected node type: %d", pn->mn_name,
+				pn->mn_type);
+		}
+		if (entry.d_reclen > uio->uio_resid)
+                        break;
+		if (offset >= uio->uio_offset) {
+			error = vfs_read_dirent(ap, &entry, offset);
+                        if (error)
+                                break;
+                }
+                offset += entry.d_reclen;
+	}
+	sx_xunlock(&mi->mi_lock);
+
+	uio->uio_offset = offset;
+
+	if (tmp_ncookies != NULL)
+		ap->a_ncookies = tmp_ncookies;
+
+	return (error);
+}
+
+#ifdef notyet
+
+#if 0
+struct vop_mkdir_args {
+	struct vnode *a_dvp;
+	struct vnode **a_vpp;
+	struct componentname *a_cnp;
+	struct vattr *a_vap;
+};
+#endif
+
+/*
+ * Create a directory.
+ */
+static int
+mqfs_mkdir(struct vop_mkdir_args *ap)
+{
+	struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount);
+	struct componentname *cnp = ap->a_cnp;
+	struct mqfs_node *pd = VTON(ap->a_dvp);
+	struct mqfs_node *pn;
+	int error;
+
+	if (pd->mn_type != mqfstype_root && pd->mn_type != mqfstype_dir)
+		return (ENOTDIR);
+	sx_xlock(&mqfs->mi_lock);
+#if 0
+	/* named node */
+	pn = mqfs_search(pd, cnp->cn_nameptr, cnp->cn_namelen);
+	if (pn != NULL) {
+		sx_xunlock(&mqfs->mi_lock);
+		return (EEXIST);
+	}
+#else
+	if ((cnp->cn_flags & HASBUF) == 0)
+		panic("%s: no name", __func__);
+#endif
+	pn = mqfs_create_dir(pd, cnp->cn_nameptr, cnp->cn_namelen,
+		cnp->cn_cred, ap->a_vap->va_mode);
+	if (pn == NULL)
+		error = ENOSPC;
+	else
+		error = mqfs_allocv(ap->a_dvp->v_mount, ap->a_vpp, pn);
+	sx_xunlock(&mqfs->mi_lock);
+	return (error);
+}
+
+#if 0
+struct vop_rmdir_args {
+	struct vnode *a_dvp;
+	struct vnode *a_vp;
+	struct componentname *a_cnp;
+};
+#endif
+
+/*
+ * Remove a directory.
+ */
+static int
+mqfs_rmdir(struct vop_rmdir_args *ap)
+{
+	struct mqfs_info *mqfs = VFSTOMQFS(ap->a_dvp->v_mount);
+	struct mqfs_node *pn = VTON(ap->a_vp);
+	struct mqfs_node *pt;
+
+	if (pn->mn_type != mqfstype_dir)
+		return (ENOTDIR);
+
+	sx_xlock(&mqfs->mi_lock);
+	if (pn->mn_deleted) {
+		sx_xunlock(&mqfs->mi_lock);
+		return (ENOENT);
+	}
+
+	pt = LIST_FIRST(&pn->mn_children);
+	pt = LIST_NEXT(pt, mn_sibling);
+	pt = LIST_NEXT(pt, mn_sibling);
+	if (pt != NULL) {
+		sx_xunlock(&mqfs->mi_lock);
+		return (ENOTEMPTY);
+	}
+	pt = pn->mn_parent;
+	pn->mn_parent = NULL;
+	pn->mn_deleted = 1;
+	LIST_REMOVE(pn, mn_sibling);
+	mqnode_release(pn);
+	mqnode_release(pt);
+	sx_xunlock(&mqfs->mi_lock);
+	cache_purge(ap->a_vp);
+	return (0);
+}
+
+#endif /* notyet */
+
+/*
+ * Allocate a message queue
+ */
+static struct mqueue *
+mqueue_alloc(const struct mq_attr *attr)
+{
+	struct mqueue *mq;
+
+	if (curmq >= maxmq)
+		return (NULL);
+	mq = uma_zalloc(mqueue_zone, M_WAITOK | M_ZERO);
+	TAILQ_INIT(&mq->mq_msgq);
+	if (attr != NULL) {
+		mq->mq_maxmsg = attr->mq_maxmsg;
+		mq->mq_msgsize = attr->mq_msgsize;
+	} else {
+		mq->mq_maxmsg = default_maxmsg;
+		mq->mq_msgsize = default_msgsize;
+	}
+	mtx_init(&mq->mq_mutex, "mqueue", NULL, MTX_DEF);
+	knlist_init(&mq->mq_rsel.si_note, &mq->mq_mutex, NULL, NULL, NULL);
+	knlist_init(&mq->mq_wsel.si_note, &mq->mq_mutex, NULL, NULL, NULL);
+	atomic_add_int(&curmq, 1);
+	return (mq);
+}
+
+/*
+ * Destroy a message queue
+ */
+static void
+mqueue_free(struct mqueue *mq)
+{
+	struct mqueue_msg *msg;
+
+	while ((msg = TAILQ_FIRST(&mq->mq_msgq)) != NULL) {
+		TAILQ_REMOVE(&mq->mq_msgq, msg, msg_link);
+		FREE(msg, M_MQUEUEDATA);
+	}
+
+	mtx_destroy(&mq->mq_mutex);
+	knlist_destroy(&mq->mq_rsel.si_note);
+	knlist_destroy(&mq->mq_wsel.si_note);
+	uma_zfree(mqueue_zone, mq);
+	atomic_add_int(&curmq, -1);
+}
+
+/*
+ * Load a message from user space
+ */
+static struct mqueue_msg *
+mqueue_loadmsg(const char *msg_ptr, size_t msg_size, int msg_prio)
+{
+	struct mqueue_msg *msg;
+	size_t len;
+	int error;
+
+	len = sizeof(struct mqueue_msg) + msg_size;
+	MALLOC(msg, struct mqueue_msg *, len, M_MQUEUEDATA, M_WAITOK);
+	error = copyin(msg_ptr, ((char *)msg) + sizeof(struct mqueue_msg),
+	    msg_size);
+	if (error) {
+		FREE(msg, M_MQUEUEDATA);
+		msg = NULL;
+	} else {
+		msg->msg_size = msg_size;
+		msg->msg_prio = msg_prio;
+	}
+	return (msg);
+}
+
+/*
+ * Save a message to user space
+ */
+static int
+mqueue_savemsg(struct mqueue_msg *msg, char *msg_ptr, int *msg_prio)
+{
+	int error;
+
+	error = copyout(((char *)msg) + sizeof(*msg), msg_ptr,
+		msg->msg_size);
+	if (error == 0 && msg_prio != NULL)
+		error = copyout(&msg->msg_prio, msg_prio, sizeof(int));
+	return (error);
+}
+
+/*
+ * Free a message's memory
+ */
+static __inline void
+mqueue_freemsg(struct mqueue_msg *msg)
+{
+	FREE(msg, M_MQUEUEDATA);
+}
+
+/*
+ * Send a message.  If waitok is false, the thread will not be
+ * blocked when the queue is full; otherwise, the absolute
+ * timeout will be checked.
+ */
+int
+mqueue_send(struct mqueue *mq, const char *msg_ptr,
+	size_t msg_len, unsigned msg_prio, int waitok,
+	const struct timespec *abs_timeout)
+{
+	struct mqueue_msg *msg;
+	struct timespec ets, ts, ts2;
+	struct timeval tv;
+	int error;
+
+	if (msg_prio >= MQ_PRIO_MAX)
+		return (EINVAL);
+	if (msg_len > mq->mq_msgsize)
+		return (EMSGSIZE);
+	msg = mqueue_loadmsg(msg_ptr, msg_len, msg_prio);
+	if (msg == NULL)
+		return (EFAULT);
+
+	/* O_NONBLOCK case */
+	if (!waitok) {
+		error = _mqueue_send(mq, msg, -1);
+		if (error)
+			goto bad;
+		return (0);
+	}
+
+	/* we allow a null timeout (wait forever) */
+	if (abs_timeout == NULL) {
+		error = _mqueue_send(mq, msg, 0);
+		if (error)
+			goto bad;
+		return (0);
+	}
+
+	/* send it before checking time */
+	error = _mqueue_send(mq, msg, -1);
+	if (error == 0)
+		return (0);
+
+	if (error != EAGAIN)
+		goto bad;
+
+	error = copyin(abs_timeout, &ets, sizeof(ets));
+	if (error != 0)
+		goto bad;
+	if (ets.tv_nsec >= 1000000000 || ets.tv_nsec < 0) {
+		error = EINVAL;
+		goto bad;
+	}
+	for (;;) {
+		ts2 = ets;
+		getnanotime(&ts);
+		timespecsub(&ts2, &ts);
+		if (ts2.tv_sec < 0 || (ts2.tv_sec == 0 && ts2.tv_nsec <= 0)) {
+			error = ETIMEDOUT;
+			break;
+		}
+		TIMESPEC_TO_TIMEVAL(&tv, &ts2);
+		error = _mqueue_send(mq, msg, tvtohz(&tv));
+		if (error != ETIMEDOUT)
+			break;
+	}
+	if (error == 0)
+		return (0);
+bad:
+	mqueue_freemsg(msg);
+	return (error);
+}
+
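The timed path above re-derives a relative timeout from the caller's absolute
CLOCK_REALTIME deadline on every retry, so a send that keeps racing other
senders still honors the original deadline.  A minimal userland sketch of
building such a deadline for mq_timedsend(2) (illustrative only, not part of
this diff; queue handle and message are assumed):

#include <mqueue.h>
#include <stdio.h>
#include <string.h>
#include <time.h>

/* Illustrative only: send with a deadline 100ms in the future. */
int
send_with_deadline(mqd_t mqd, const char *msg)
{
	struct timespec abs;

	clock_gettime(CLOCK_REALTIME, &abs);
	abs.tv_nsec += 100 * 1000 * 1000;	/* 100ms from now */
	if (abs.tv_nsec >= 1000000000) {	/* keep tv_nsec in range */
		abs.tv_sec++;
		abs.tv_nsec -= 1000000000;
	}
	if (mq_timedsend(mqd, msg, strlen(msg), 0, &abs) == -1) {
		perror("mq_timedsend");		/* ETIMEDOUT after the deadline */
		return (-1);
	}
	return (0);
}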
+/*
+ * Common routine to send a message
+ */
+static int
+_mqueue_send(struct mqueue *mq, struct mqueue_msg *msg, int timo)
+{	
+	struct mqueue_msg *msg2;
+	int error = 0;
+
+	mtx_lock(&mq->mq_mutex);
+	while (mq->mq_curmsgs >= mq->mq_maxmsg && error == 0) {
+		if (timo < 0) {
+			mtx_unlock(&mq->mq_mutex);
+			return (EAGAIN);
+		}
+		mq->mq_senders++;
+		error = msleep(&mq->mq_senders, &mq->mq_mutex,
+			    PCATCH, "mqsend", timo);
+		mq->mq_senders--;
+		if (error == EAGAIN)
+			error = ETIMEDOUT;
+	}
+	if (mq->mq_curmsgs >= mq->mq_maxmsg) {
+		mtx_unlock(&mq->mq_mutex);
+		return (error);
+	}
+	error = 0;
+	if (TAILQ_EMPTY(&mq->mq_msgq)) {
+		TAILQ_INSERT_HEAD(&mq->mq_msgq, msg, msg_link);
+	} else {
+		if (msg->msg_prio <= TAILQ_LAST(&mq->mq_msgq, msgq)->msg_prio) {
+			TAILQ_INSERT_TAIL(&mq->mq_msgq, msg, msg_link);
+		} else {
+			TAILQ_FOREACH(msg2, &mq->mq_msgq, msg_link) {
+				if (msg2->msg_prio < msg->msg_prio)
+					break;
+			}
+			TAILQ_INSERT_BEFORE(msg2, msg, msg_link);
+		}
+	}
+	mq->mq_curmsgs++;
+	mq->mq_totalbytes += msg->msg_size;
+	if (mq->mq_receivers)
+		wakeup_one(&mq->mq_receivers);
+	else if (mq->mq_notifier != NULL)
+		mqueue_send_notification(mq);
+	if (mq->mq_flags & MQ_RSEL) {
+		mq->mq_flags &= ~MQ_RSEL;
+		selwakeup(&mq->mq_rsel);
+	}
+	KNOTE_LOCKED(&mq->mq_rsel.si_note, 0);
+	mtx_unlock(&mq->mq_mutex);
+	return (0);
+}
+
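The insertion above keeps mq_msgq sorted by descending priority with FIFO
order among equal priorities: a new message is appended when its priority does
not exceed the tail's, and otherwise placed just before the first entry of
lower priority.  A standalone sketch of the same policy using the BSD
<sys/queue.h> macros (names are illustrative, not the kernel's):

#include <sys/queue.h>

struct m {
	TAILQ_ENTRY(m)	link;
	int		prio;
};
TAILQ_HEAD(mhead, m);

/*
 * Insert nm so the list stays sorted high-to-low priority and remains
 * FIFO within a single priority.  The head must be TAILQ_INIT'ed first.
 */
static void
prio_insert(struct mhead *head, struct m *nm)
{
	struct m *it;

	if (TAILQ_EMPTY(head) ||
	    nm->prio <= TAILQ_LAST(head, mhead)->prio) {
		TAILQ_INSERT_TAIL(head, nm, link);
		return;
	}
	TAILQ_FOREACH(it, head, link) {
		if (it->prio < nm->prio)
			break;		/* first lower-priority entry */
	}
	TAILQ_INSERT_BEFORE(it, nm, link);
}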
+/*
+ * Send a realtime signal to the process that successfully
+ * registered itself via mq_notify.
+ */
+static void
+mqueue_send_notification(struct mqueue *mq)
+{
+	struct mqueue_notifier *nt;
+	struct proc *p;
+
+	mtx_assert(&mq->mq_mutex, MA_OWNED);
+	nt = mq->mq_notifier;
+	if (nt->nt_sigev.sigev_notify != SIGEV_NONE) {
+		p = nt->nt_proc;
+		PROC_LOCK(p);
+		if (!KSI_ONQ(&nt->nt_ksi))
+			psignal_event(p, &nt->nt_sigev, &nt->nt_ksi);
+		PROC_UNLOCK(p);
+	}
+	mq->mq_notifier = NULL;
+}
+
+/*
+ * Get a message.  If waitok is false, the thread will not be
+ * blocked when there is no data in the queue; otherwise, the
+ * absolute timeout will be checked.
+ */
+int
+mqueue_receive(struct mqueue *mq, char *msg_ptr,
+	size_t msg_len, unsigned *msg_prio, int waitok,
+	const struct timespec *abs_timeout)
+{
+	struct mqueue_msg *msg;
+	struct timespec ets, ts, ts2;
+	struct timeval tv;
+	int error;
+
+	if (msg_len < mq->mq_msgsize)
+		return (EMSGSIZE);
+
+	/* O_NONBLOCK case */
+	if (!waitok) {
+		error = _mqueue_recv(mq, &msg, -1);
+		if (error)
+			return (error);
+		goto received;
+	}
+
+	/* we allow a null timeout (wait forever). */
+	if (abs_timeout == NULL) {
+		error = _mqueue_recv(mq, &msg, 0);
+		if (error)
+			return (error);
+		goto received;
+	}
+
+	/* try to get a message before checking time */
+	error = _mqueue_recv(mq, &msg, -1);
+	if (error == 0)
+		goto received;
+
+	if (error != EAGAIN)
+		return (error);
+
+	error = copyin(abs_timeout, &ets, sizeof(ets));
+	if (error != 0)
+		return (error);
+	if (ets.tv_nsec >= 1000000000 || ets.tv_nsec < 0) {
+		error = EINVAL;
+		return (error);
+	}
+
+	for (;;) {
+		ts2 = ets;
+		getnanotime(&ts);
+		timespecsub(&ts2, &ts);
+		if (ts2.tv_sec < 0 || (ts2.tv_sec == 0 && ts2.tv_nsec <= 0)) {
+			error = ETIMEDOUT;
+			return (error);
+		}
+		TIMESPEC_TO_TIMEVAL(&tv, &ts2);
+		error = _mqueue_recv(mq, &msg, tvtohz(&tv));
+		if (error == 0)
+			break;
+		if (error != ETIMEDOUT)
+			return (error);
+	}
+
+received:
+	error = mqueue_savemsg(msg, msg_ptr, msg_prio);
+	if (error == 0) {
+		curthread->td_retval[0] = msg->msg_size;
+		curthread->td_retval[1] = 0;
+	}
+	mqueue_freemsg(msg);
+	return (error);
+}
+
+/*
+ * Common routine to receive a message
+ */
+static int
+_mqueue_recv(struct mqueue *mq, struct mqueue_msg **msg, int timo)
+{	
+	int error = 0;
+	
+	mtx_lock(&mq->mq_mutex);
+	while ((*msg = TAILQ_FIRST(&mq->mq_msgq)) == NULL && error == 0) {
+		if (timo < 0) {
+			mtx_unlock(&mq->mq_mutex);
+			return (EAGAIN);
+		}
+		mq->mq_receivers++;
+		error = msleep(&mq->mq_receivers, &mq->mq_mutex,
+			    PCATCH, "mqrecv", timo);
+		mq->mq_receivers--;
+		if (error == EAGAIN)
+			error = ETIMEDOUT;
+	}
+	if (*msg != NULL) {
+		error = 0;
+		TAILQ_REMOVE(&mq->mq_msgq, *msg, msg_link);
+		mq->mq_curmsgs--;
+		mq->mq_totalbytes -= (*msg)->msg_size;
+		if (mq->mq_senders)
+			wakeup_one(&mq->mq_senders);
+		if (mq->mq_flags & MQ_WSEL) {
+			mq->mq_flags &= ~MQ_WSEL;
+			selwakeup(&mq->mq_wsel);
+		}
+		KNOTE_LOCKED(&mq->mq_wsel.si_note, 0);
+	}
+	if (mq->mq_notifier != NULL && mq->mq_receivers == 0 &&
+	    !TAILQ_EMPTY(&mq->mq_msgq)) {
+		mqueue_send_notification(mq);
+	}
+	mtx_unlock(&mq->mq_mutex);
+	return (error);
+}
+
+static __inline struct mqueue_notifier *
+notifier_alloc(void)
+{
+	return (uma_zalloc(mqnoti_zone, M_WAITOK | M_ZERO));
+}
+
+static __inline void
+notifier_free(struct mqueue_notifier *p)
+{
+	uma_zfree(mqnoti_zone, p);
+}
+
+static struct mqueue_notifier *
+notifier_search(struct proc *p, int fd)
+{
+	struct mqueue_notifier *nt;
+
+	LIST_FOREACH(nt, &p->p_mqnotifier, nt_link) {
+		if (nt->nt_ksi.ksi_mqd == fd)
+			break;
+	}
+	return (nt);
+}
+
+static __inline void
+notifier_insert(struct proc *p, struct mqueue_notifier *nt)
+{
+	LIST_INSERT_HEAD(&p->p_mqnotifier, nt, nt_link);
+}
+
+static __inline void
+notifier_delete(struct proc *p, struct mqueue_notifier *nt)
+{
+	LIST_REMOVE(nt, nt_link);
+	notifier_free(nt);
+}
+
+static void
+notifier_remove(struct proc *p, struct mqueue *mq, int fd)
+{
+	struct mqueue_notifier *nt;
+
+	mtx_assert(&mq->mq_mutex, MA_OWNED);
+	PROC_LOCK(p);
+	nt = notifier_search(p, fd);
+	if (nt != NULL) {
+		if (mq->mq_notifier == nt)
+			mq->mq_notifier = NULL;
+		sigqueue_take(&nt->nt_ksi);
+		notifier_delete(p, nt);
+	}
+	PROC_UNLOCK(p);
+}
+
+/*
+ * Syscall to open a message queue.
+ */
+int
+kmq_open(struct thread *td, struct kmq_open_args *uap)
+{
+	char path[MQFS_NAMELEN + 1];
+	struct mq_attr attr, *pattr;
+	struct mqfs_node *pn;
+	struct filedesc *fdp;
+	struct file *fp;
+	struct mqueue *mq;
+	int fd, error, len, flags, cmode;
+
+	if ((uap->flags & O_ACCMODE) == O_ACCMODE)
+		return (EINVAL);
+
+	fdp = td->td_proc->p_fd;
+	flags = FFLAGS(uap->flags);
+	cmode = (((uap->mode & ~fdp->fd_cmask) & ALLPERMS) & ~S_ISTXT);
+	mq = NULL;
+	if ((flags & O_CREAT) && (uap->attr != NULL)) {
+		error = copyin(uap->attr, &attr, sizeof(attr));
+		if (error)
+			return (error);
+		if (attr.mq_maxmsg <= 0 || attr.mq_maxmsg > maxmsg)
+			return (EINVAL);
+		if (attr.mq_msgsize <= 0 || attr.mq_msgsize > maxmsgsize)
+			return (EINVAL);
+		pattr = &attr;
+	} else
+		pattr = NULL;
+
+	error = copyinstr(uap->path, path, MQFS_NAMELEN + 1, NULL);
+        if (error)
+		return (error);
+
+	/*
+	 * The first character of name must be a slash (/) character
+	 * and the remaining characters of name cannot include any
+	 * slash characters.
+	 */
+	len = strlen(path);
+	if (len < 2  || path[0] != '/' || index(path + 1, '/') != NULL)
+		return (EINVAL);
+
+	error = falloc(td, &fp, &fd);
+	if (error)
+		return (error);
+
+	sx_xlock(&mqfs_data.mi_lock);
+	pn = mqfs_search(mqfs_data.mi_root, path + 1, len - 1);
+	if (pn == NULL) {
+		if (!(flags & O_CREAT)) {
+			error = ENOENT;
+		} else {
+			mq = mqueue_alloc(pattr);
+			if (mq == NULL) {
+				error = ENFILE;
+			} else {
+				pn = mqfs_create_file(mqfs_data.mi_root,
+				         path + 1, len - 1, td->td_ucred,
+					 cmode);
+				if (pn == NULL) {
+					error = ENOSPC;
+					mqueue_free(mq);
+				}
+			}
+		}
+
+		if (error == 0) {
+			pn->mn_data = mq;
+		}
+	} else {
+		if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) {
+			error = EEXIST;
+		} else {
+			int acc_mode = 0;
+
+			if (flags & FREAD)
+				acc_mode |= VREAD;
+			if (flags & FWRITE)
+				acc_mode |= VWRITE;
+			error = vaccess(VREG, pn->mn_mode, pn->mn_uid,
+				    pn->mn_gid, acc_mode, td->td_ucred, NULL);
+		}
+	}
+
+	if (error) {
+		sx_xunlock(&mqfs_data.mi_lock);
+		fdclose(fdp, fp, fd, td);
+		fdrop(fp, td);
+		return (error);
+	}
+
+	mqnode_addref(pn);
+	sx_xunlock(&mqfs_data.mi_lock);
+
+	FILE_LOCK(fp);
+	fp->f_flag = (flags & (FREAD | FWRITE | O_NONBLOCK));
+	fp->f_type = DTYPE_MQUEUE;
+	fp->f_data = pn;
+	fp->f_ops = &mqueueops;
+	FILE_UNLOCK(fp);
+
+	FILEDESC_XLOCK(fdp);
+	if (fdp->fd_ofiles[fd] == fp)
+		fdp->fd_ofileflags[fd] |= UF_EXCLOSE;
+	FILEDESC_XUNLOCK(fdp);
+	td->td_retval[0] = fd;
+	fdrop(fp, td);
+	return (0);
+}
+
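Note the name and attribute checks above: the path must start with '/' and
contain no further slashes, and any caller-supplied mq_maxmsg/mq_msgsize must
be positive and within the kernel's limits.  A typical userland open that
satisfies those rules might look like this (hypothetical name and sizes,
illustrative only):

#include <mqueue.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <stdio.h>

mqd_t
open_example_queue(void)
{
	struct mq_attr attr;
	mqd_t mqd;

	attr.mq_flags = 0;
	attr.mq_maxmsg = 16;		/* > 0 and within the kernel limit */
	attr.mq_msgsize = 128;		/* > 0 and within the kernel limit */
	attr.mq_curmsgs = 0;

	/* Leading '/', no other slashes allowed in the name. */
	mqd = mq_open("/example_q", O_RDWR | O_CREAT, 0644, &attr);
	if (mqd == (mqd_t)-1)
		perror("mq_open");
	return (mqd);
}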
+/*
+ * Syscall to unlink a message queue.
+ */
+int
+kmq_unlink(struct thread *td, struct kmq_unlink_args *uap)
+{
+	char path[MQFS_NAMELEN+1];
+	struct mqfs_node *pn;
+	int error, len;
+
+	error = copyinstr(uap->path, path, MQFS_NAMELEN + 1, NULL);
+        if (error)
+		return (error);
+
+	len = strlen(path);
+	if (len < 2  || path[0] != '/' || index(path + 1, '/') != NULL)
+		return (EINVAL);
+
+	sx_xlock(&mqfs_data.mi_lock);
+	pn = mqfs_search(mqfs_data.mi_root, path + 1, len - 1);
+	if (pn != NULL)
+		error = do_unlink(pn, td->td_ucred);
+	else
+		error = ENOENT;
+	sx_xunlock(&mqfs_data.mi_lock);
+	return (error);
+}
+
+typedef int (*_fgetf)(struct thread *, int, struct file **);
+
+/*
+ * Get the message queue backing a given file descriptor.
+ */
+static int
+_getmq(struct thread *td, int fd, _fgetf func,
+       struct file **fpp, struct mqfs_node **ppn, struct mqueue **pmq)
+{
+	struct mqfs_node *pn;
+	int error;
+
+	error = func(td, fd, fpp);
+	if (error)
+		return (error);
+	if (&mqueueops != (*fpp)->f_ops) {
+		fdrop(*fpp, td);
+		return (EBADF);
+	}
+	pn = (*fpp)->f_data;
+	if (ppn)
+		*ppn = pn;
+	if (pmq)
+		*pmq = pn->mn_data;
+	return (0);
+}
+
+static __inline int
+getmq(struct thread *td, int fd, struct file **fpp, struct mqfs_node **ppn,
+	struct mqueue **pmq)
+{
+	return _getmq(td, fd, fget, fpp, ppn, pmq);
+}
+
+static __inline int
+getmq_read(struct thread *td, int fd, struct file **fpp,
+	 struct mqfs_node **ppn, struct mqueue **pmq)
+{
+	return _getmq(td, fd, fget_read, fpp, ppn, pmq);
+}
+
+static __inline int
+getmq_write(struct thread *td, int fd, struct file **fpp,
+	struct mqfs_node **ppn, struct mqueue **pmq)
+{
+	return _getmq(td, fd, fget_write, fpp, ppn, pmq);
+}
+
+int
+kmq_setattr(struct thread *td, struct kmq_setattr_args *uap)
+{
+	struct mqueue *mq;
+	struct file *fp;
+	struct mq_attr attr, oattr;
+	int error;
+
+	if (uap->attr) {
+		error = copyin(uap->attr, &attr, sizeof(attr));
+		if (error)
+			return (error);
+		if (attr.mq_flags & ~O_NONBLOCK)
+			return (EINVAL);
+	}
+	error = getmq(td, uap->mqd, &fp, NULL, &mq);
+	if (error)
+		return (error);
+	oattr.mq_maxmsg  = mq->mq_maxmsg;
+	oattr.mq_msgsize = mq->mq_msgsize;
+	oattr.mq_curmsgs = mq->mq_curmsgs;
+	FILE_LOCK(fp);
+	oattr.mq_flags = (O_NONBLOCK & fp->f_flag);
+	if (uap->attr) {
+		fp->f_flag &= ~O_NONBLOCK;
+		fp->f_flag |= (attr.mq_flags & O_NONBLOCK);
+	}
+	FILE_UNLOCK(fp);
+	fdrop(fp, td);
+	if (uap->oattr)
+		error = copyout(&oattr, uap->oattr, sizeof(oattr));
+	return (error);
+}
+
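Only the O_NONBLOCK bit is actually changed by the call above; mq_maxmsg and
mq_msgsize are fixed at creation time and are simply reported back in the old
attributes.  From userland this is how a descriptor is typically switched to
non-blocking mode (illustrative sketch):

#include <mqueue.h>
#include <fcntl.h>
#include <string.h>

/* Enable non-blocking sends/receives on mqd, saving the old attributes. */
int
set_nonblocking(mqd_t mqd, struct mq_attr *old)
{
	struct mq_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.mq_flags = O_NONBLOCK;
	return (mq_setattr(mqd, &attr, old));
}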
+int
+kmq_timedreceive(struct thread *td, struct kmq_timedreceive_args *uap)
+{
+	struct mqueue *mq;
+	struct file *fp;
+	int error;
+	int waitok;
+
+	error = getmq_read(td, uap->mqd, &fp, NULL, &mq);
+	if (error)
+		return (error);
+	waitok = !(fp->f_flag & O_NONBLOCK);
+	error = mqueue_receive(mq, uap->msg_ptr, uap->msg_len,
+		uap->msg_prio, waitok, uap->abs_timeout);
+	fdrop(fp, td);
+	return (error);
+}
+
+int
+kmq_timedsend(struct thread *td, struct kmq_timedsend_args *uap)
+{
+	struct mqueue *mq;
+	struct file *fp;
+	int error, waitok;
+
+	error = getmq_write(td, uap->mqd, &fp, NULL, &mq);
+	if (error)
+		return (error);
+	waitok = !(fp->f_flag & O_NONBLOCK);
+	error = mqueue_send(mq, uap->msg_ptr, uap->msg_len,
+		uap->msg_prio, waitok, uap->abs_timeout);
+	fdrop(fp, td);
+	return (error);
+}
+
+int
+kmq_notify(struct thread *td, struct kmq_notify_args *uap)
+{
+	struct sigevent ev;
+	struct filedesc *fdp;
+	struct proc *p;
+	struct mqueue *mq;
+	struct file *fp;
+	struct mqueue_notifier *nt, *newnt = NULL;
+	int error;
+
+	p = td->td_proc;
+	fdp = td->td_proc->p_fd;
+	if (uap->sigev) {
+		error = copyin(uap->sigev, &ev, sizeof(ev));
+		if (error)
+			return (error);
+		if (ev.sigev_notify != SIGEV_SIGNAL &&
+		    ev.sigev_notify != SIGEV_THREAD_ID &&
+		    ev.sigev_notify != SIGEV_NONE)
+			return (EINVAL);
+		if ((ev.sigev_notify == SIGEV_SIGNAL ||
+		     ev.sigev_notify == SIGEV_THREAD_ID) &&
+			!_SIG_VALID(ev.sigev_signo))
+			return (EINVAL);
+	}
+	error = getmq(td, uap->mqd, &fp, NULL, &mq);
+	if (error)
+		return (error);
+again:
+	FILEDESC_SLOCK(fdp);
+	if (fget_locked(fdp, uap->mqd) != fp) {
+		FILEDESC_SUNLOCK(fdp);
+		error = EBADF;
+		goto out;
+	}
+	mtx_lock(&mq->mq_mutex);
+	FILEDESC_SUNLOCK(fdp);
+	if (uap->sigev != NULL) {
+		if (mq->mq_notifier != NULL) {
+			error = EBUSY;
+		} else {
+			PROC_LOCK(p);
+			nt = notifier_search(p, uap->mqd);
+			if (nt == NULL) {
+				if (newnt == NULL) {
+					PROC_UNLOCK(p);
+					mtx_unlock(&mq->mq_mutex);
+					newnt = notifier_alloc();
+					goto again;
+				}
+			}
+
+			if (nt != NULL) {
+				sigqueue_take(&nt->nt_ksi);
+				if (newnt != NULL) {
+					notifier_free(newnt);
+					newnt = NULL;
+				}
+			} else {
+				nt = newnt;
+				newnt = NULL;
+				ksiginfo_init(&nt->nt_ksi);
+				nt->nt_ksi.ksi_flags |= KSI_INS | KSI_EXT;
+				nt->nt_ksi.ksi_code = SI_MESGQ;
+				nt->nt_proc = p;
+				nt->nt_ksi.ksi_mqd = uap->mqd;
+				notifier_insert(p, nt);
+			}
+			nt->nt_sigev = ev;
+			mq->mq_notifier = nt;
+			PROC_UNLOCK(p);
+			/*
+			 * If there are no receivers and the message queue
+			 * is not empty, we should send the notification
+			 * as soon as possible.
+			 */
+			if (mq->mq_receivers == 0 &&
+			    !TAILQ_EMPTY(&mq->mq_msgq))
+				mqueue_send_notification(mq);
+		}
+	} else {
+		notifier_remove(p, mq, uap->mqd);
+	}
+	mtx_unlock(&mq->mq_mutex);
+
+out:
+	fdrop(fp, td);
+	if (newnt != NULL)
+		notifier_free(newnt);
+	return (error);
+}
+
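The notifier bookkeeping above implements the registration side of mq_notify():
at most one registration per queue, cleared again once the notification is
delivered or the descriptor is closed.  A minimal userland registration for a
SIGUSR1 notification (illustrative only):

#include <mqueue.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>

int
register_notify(mqd_t mqd)
{
	struct sigevent ev;

	memset(&ev, 0, sizeof(ev));
	ev.sigev_notify = SIGEV_SIGNAL;
	ev.sigev_signo = SIGUSR1;
	if (mq_notify(mqd, &ev) == -1) {
		perror("mq_notify");	/* EBUSY if another process registered */
		return (-1);
	}
	return (0);
}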
+static void
+mqueue_fdclose(struct thread *td, int fd, struct file *fp)
+{
+	struct filedesc *fdp;
+	struct mqueue *mq;
+ 
+	fdp = td->td_proc->p_fd;
+	FILEDESC_LOCK_ASSERT(fdp);
+
+	if (fp->f_ops == &mqueueops) {
+		mq = FPTOMQ(fp);
+		mtx_lock(&mq->mq_mutex);
+		notifier_remove(td->td_proc, mq, fd);
+
+		/* have to wake up threads in the same process */
+		if (mq->mq_flags & MQ_RSEL) {
+			mq->mq_flags &= ~MQ_RSEL;
+			selwakeup(&mq->mq_rsel);
+		}
+		if (mq->mq_flags & MQ_WSEL) {
+			mq->mq_flags &= ~MQ_WSEL;
+			selwakeup(&mq->mq_wsel);
+		}
+		mtx_unlock(&mq->mq_mutex);
+	}
+}
+
+static void
+mq_proc_exit(void *arg __unused, struct proc *p)
+{
+	struct filedesc *fdp;
+	struct file *fp;
+	struct mqueue *mq;
+	int i;
+
+	fdp = p->p_fd;
+	FILEDESC_SLOCK(fdp);
+	for (i = 0; i < fdp->fd_nfiles; ++i) {
+		fp = fget_locked(fdp, i);
+		if (fp != NULL && fp->f_ops == &mqueueops) {
+			mq = FPTOMQ(fp);
+			mtx_lock(&mq->mq_mutex);
+			notifier_remove(p, FPTOMQ(fp), i);
+			mtx_unlock(&mq->mq_mutex);
+		}
+	}
+	FILEDESC_SUNLOCK(fdp);
+	KASSERT(LIST_EMPTY(&p->p_mqnotifier), ("mq notifiers left"));
+}
+
+static int
+mqf_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
+	int flags, struct thread *td)
+{
+	return (EOPNOTSUPP);
+}
+
+static int
+mqf_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
+	int flags, struct thread *td)
+{
+	return (EOPNOTSUPP);
+}
+
+static int
+mqf_ioctl(struct file *fp, u_long cmd, void *data,
+	struct ucred *active_cred, struct thread *td)
+{
+	return (ENOTTY);
+}
+
+static int
+mqf_poll(struct file *fp, int events, struct ucred *active_cred,
+	struct thread *td)
+{
+	struct mqueue *mq = FPTOMQ(fp);
+	int revents = 0;
+
+	mtx_lock(&mq->mq_mutex);
+	if (events & (POLLIN | POLLRDNORM)) {
+		if (mq->mq_curmsgs) {
+			revents |= events & (POLLIN | POLLRDNORM);
+		} else {
+			mq->mq_flags |= MQ_RSEL;
+			selrecord(td, &mq->mq_rsel);
+ 		}
+	}
+	if (events & POLLOUT) {
+		if (mq->mq_curmsgs < mq->mq_maxmsg)
+			revents |= POLLOUT;
+		else {
+			mq->mq_flags |= MQ_WSEL;
+			selrecord(td, &mq->mq_wsel);
+		}
+	}
+	mtx_unlock(&mq->mq_mutex);
+	return (revents);
+}
+
+static int
+mqf_close(struct file *fp, struct thread *td)
+{
+	struct mqfs_node *pn;
+
+	fp->f_ops = &badfileops;
+	pn = fp->f_data;
+	fp->f_data = NULL;
+	sx_xlock(&mqfs_data.mi_lock);
+	mqnode_release(pn);
+	sx_xunlock(&mqfs_data.mi_lock);
+	return (0);
+}
+
+static int
+mqf_stat(struct file *fp, struct stat *st, struct ucred *active_cred,
+	struct thread *td)
+{
+	struct mqfs_node *pn = fp->f_data;
+
+	bzero(st, sizeof *st);
+	st->st_atimespec = pn->mn_atime;
+	st->st_mtimespec = pn->mn_mtime;
+	st->st_ctimespec = pn->mn_ctime;
+	st->st_birthtimespec = pn->mn_birth;
+	st->st_uid = pn->mn_uid;
+	st->st_gid = pn->mn_gid;
+	st->st_mode = S_IFIFO | pn->mn_mode;
+	return (0);
+}
+
+static int
+mqf_kqfilter(struct file *fp, struct knote *kn)
+{
+	struct mqueue *mq = FPTOMQ(fp);
+	int error = 0;
+
+	if (kn->kn_filter == EVFILT_READ) {
+		kn->kn_fop = &mq_rfiltops;
+		knlist_add(&mq->mq_rsel.si_note, kn, 0);
+	} else if (kn->kn_filter == EVFILT_WRITE) {
+		kn->kn_fop = &mq_wfiltops;
+		knlist_add(&mq->mq_wsel.si_note, kn, 0);
+	} else
+		error = EINVAL;
+	return (error);
+}
+
+static void
+filt_mqdetach(struct knote *kn)
+{
+	struct mqueue *mq = FPTOMQ(kn->kn_fp);
+
+	if (kn->kn_filter == EVFILT_READ)
+		knlist_remove(&mq->mq_rsel.si_note, kn, 0);
+	else if (kn->kn_filter == EVFILT_WRITE)
+		knlist_remove(&mq->mq_wsel.si_note, kn, 0);
+	else
+		panic("filt_mqdetach");
+}
+
+static int
+filt_mqread(struct knote *kn, long hint)
+{
+	struct mqueue *mq = FPTOMQ(kn->kn_fp);
+
+	mtx_assert(&mq->mq_mutex, MA_OWNED);
+	return (mq->mq_curmsgs != 0);
+}
+
+static int
+filt_mqwrite(struct knote *kn, long hint)
+{
+	struct mqueue *mq = FPTOMQ(kn->kn_fp);
+
+	mtx_assert(&mq->mq_mutex, MA_OWNED);
+	return (mq->mq_curmsgs < mq->mq_maxmsg);
+}
+
+static struct fileops mqueueops = {
+	.fo_read		= mqf_read,
+	.fo_write		= mqf_write,
+	.fo_ioctl		= mqf_ioctl,
+	.fo_poll		= mqf_poll,
+	.fo_kqfilter		= mqf_kqfilter,
+	.fo_stat		= mqf_stat,
+	.fo_close		= mqf_close
+};
+
+static struct vop_vector mqfs_vnodeops = {
+	.vop_default 		= &default_vnodeops,
+	.vop_access		= mqfs_access,
+	.vop_cachedlookup	= mqfs_lookup,
+	.vop_lookup		= vfs_cache_lookup,
+	.vop_reclaim		= mqfs_reclaim,
+	.vop_create		= mqfs_create,
+	.vop_remove		= mqfs_remove,
+	.vop_inactive		= mqfs_inactive,
+	.vop_open		= mqfs_open,
+	.vop_close		= mqfs_close,
+	.vop_getattr		= mqfs_getattr,
+	.vop_setattr		= mqfs_setattr,
+	.vop_read		= mqfs_read,
+	.vop_write		= VOP_EOPNOTSUPP,
+	.vop_readdir		= mqfs_readdir,
+	.vop_mkdir		= VOP_EOPNOTSUPP,
+	.vop_rmdir		= VOP_EOPNOTSUPP
+};
+
+static struct vfsops mqfs_vfsops = {
+	.vfs_init 		= mqfs_init,
+	.vfs_uninit		= mqfs_uninit,
+	.vfs_mount		= mqfs_mount,
+	.vfs_unmount		= mqfs_unmount,
+	.vfs_root		= mqfs_root,
+	.vfs_statfs		= mqfs_statfs,
+};
+
+SYSCALL_MODULE_HELPER(kmq_open);
+SYSCALL_MODULE_HELPER(kmq_setattr);
+SYSCALL_MODULE_HELPER(kmq_timedsend);
+SYSCALL_MODULE_HELPER(kmq_timedreceive);
+SYSCALL_MODULE_HELPER(kmq_notify);
+SYSCALL_MODULE_HELPER(kmq_unlink);
+
+VFS_SET(mqfs_vfsops, mqueuefs, VFCF_SYNTHETIC);
+MODULE_VERSION(mqueuefs, 1);
--- /dev/null
+++ sys/kern/subr_lock.c
@@ -0,0 +1,356 @@
+/*-
+ * Copyright (c) 2006 John Baldwin <jhb at FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the author nor the names of any co-contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * This module holds the global variables and functions used to maintain
+ * lock_object structures.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: src/sys/kern/subr_lock.c,v 1.17 2007/09/14 01:12:39 attilio Exp $");
+
+#include "opt_ddb.h"
+#include "opt_mprof.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/ktr.h>
+#include <sys/linker_set.h>
+#include <sys/lock.h>
+#include <sys/sbuf.h>
+#include <sys/sysctl.h>
+#include <sys/lock_profile.h>
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
+CTASSERT(LOCK_CLASS_MAX == 15);
+
+struct lock_class *lock_classes[LOCK_CLASS_MAX + 1] = {
+	&lock_class_mtx_spin,
+	&lock_class_mtx_sleep,
+	&lock_class_sx,
+	&lock_class_rw,
+	&lock_class_lockmgr,
+};
+
+#ifdef LOCK_PROFILING
+#include <machine/cpufunc.h>
+
+SYSCTL_NODE(_debug, OID_AUTO, lock, CTLFLAG_RD, NULL, "lock debugging");
+SYSCTL_NODE(_debug_lock, OID_AUTO, prof, CTLFLAG_RD, NULL, "lock profiling");
+int lock_prof_enable = 0;
+SYSCTL_INT(_debug_lock_prof, OID_AUTO, enable, CTLFLAG_RW,
+    &lock_prof_enable, 0, "Enable lock profiling");
+
+/*
+ * lprof_buf is a static pool of profiling records to avoid possible
+ * reentrance of the memory allocation functions.
+ *
+ * Note: NUM_LPROF_BUFFERS must be smaller than LPROF_HASH_SIZE.
+ */
+struct lock_prof lprof_buf[LPROF_HASH_SIZE];
+static int allocated_lprof_buf;
+struct mtx lprof_locks[LPROF_LOCK_SIZE];
+
+
+/* SWAG: sbuf size = avg stat. line size * number of locks */
+#define LPROF_SBUF_SIZE		256 * 400
+
+static int lock_prof_acquisitions;
+SYSCTL_INT(_debug_lock_prof, OID_AUTO, acquisitions, CTLFLAG_RD,
+    &lock_prof_acquisitions, 0, "Number of lock acquisitions recorded");
+static int lock_prof_records;
+SYSCTL_INT(_debug_lock_prof, OID_AUTO, records, CTLFLAG_RD,
+    &lock_prof_records, 0, "Number of profiling records");
+static int lock_prof_maxrecords = LPROF_HASH_SIZE;
+SYSCTL_INT(_debug_lock_prof, OID_AUTO, maxrecords, CTLFLAG_RD,
+    &lock_prof_maxrecords, 0, "Maximum number of profiling records");
+static int lock_prof_rejected;
+SYSCTL_INT(_debug_lock_prof, OID_AUTO, rejected, CTLFLAG_RD,
+    &lock_prof_rejected, 0, "Number of rejected profiling records");
+static int lock_prof_hashsize = LPROF_HASH_SIZE;
+SYSCTL_INT(_debug_lock_prof, OID_AUTO, hashsize, CTLFLAG_RD,
+    &lock_prof_hashsize, 0, "Hash size");
+static int lock_prof_collisions = 0;
+SYSCTL_INT(_debug_lock_prof, OID_AUTO, collisions, CTLFLAG_RD,
+    &lock_prof_collisions, 0, "Number of hash collisions");
+
+#ifndef USE_CPU_NANOSECONDS
+u_int64_t
+nanoseconds(void)
+{
+	struct timespec tv;
+
+	nanotime(&tv);
+	return (tv.tv_sec * (u_int64_t)1000000000 + tv.tv_nsec);
+}
+#endif
+
+static int
+dump_lock_prof_stats(SYSCTL_HANDLER_ARGS)
+{
+        struct sbuf *sb;
+        int error, i;
+        static int multiplier = 1;
+        const char *p;
+
+        if (allocated_lprof_buf == 0)
+                return (SYSCTL_OUT(req, "No locking recorded",
+                    sizeof("No locking recorded")));
+
+retry_sbufops:
+        sb = sbuf_new(NULL, NULL, LPROF_SBUF_SIZE * multiplier, SBUF_FIXEDLEN);
+        sbuf_printf(sb, "\n%6s %12s %12s %11s %5s %5s %12s %12s %s\n",
+            "max", "total", "wait_total", "count", "avg", "wait_avg", "cnt_hold", "cnt_lock", "name");
+        for (i = 0; i < LPROF_HASH_SIZE; ++i) {
+                if (lprof_buf[i].name == NULL)
+                        continue;
+                for (p = lprof_buf[i].file;
+                        p != NULL && strncmp(p, "../", 3) == 0; p += 3)
+                                /* nothing */ ;
+                sbuf_printf(sb, "%6ju %12ju %12ju %11ju %5ju %5ju %12ju %12ju %s:%d (%s:%s)\n",
+                    lprof_buf[i].cnt_max / 1000,
+                    lprof_buf[i].cnt_tot / 1000,
+                    lprof_buf[i].cnt_wait / 1000,
+                    lprof_buf[i].cnt_cur,
+                    lprof_buf[i].cnt_cur == 0 ? (uintmax_t)0 :
+                        lprof_buf[i].cnt_tot / (lprof_buf[i].cnt_cur * 1000),
+                    lprof_buf[i].cnt_cur == 0 ? (uintmax_t)0 :
+                        lprof_buf[i].cnt_wait / (lprof_buf[i].cnt_cur * 1000),
+                    lprof_buf[i].cnt_contest_holding,
+                    lprof_buf[i].cnt_contest_locking,
+                    p, lprof_buf[i].line, 
+			    lprof_buf[i].type,
+			    lprof_buf[i].name);
+                if (sbuf_overflowed(sb)) {
+                        sbuf_delete(sb);
+                        multiplier++;
+                        goto retry_sbufops;
+                }
+        }
+
+        sbuf_finish(sb);
+        error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
+        sbuf_delete(sb);
+        return (error);
+}
+static int
+reset_lock_prof_stats(SYSCTL_HANDLER_ARGS)
+{
+        int error, v;
+
+        if (allocated_lprof_buf == 0)
+                return (0);
+
+        v = 0;
+        error = sysctl_handle_int(oidp, &v, 0, req);
+        if (error)
+                return (error);
+        if (req->newptr == NULL)
+                return (error);
+        if (v == 0)
+                return (0);
+
+        bzero(lprof_buf, LPROF_HASH_SIZE*sizeof(*lprof_buf));
+        allocated_lprof_buf = 0;
+        return (0);
+}
+
+SYSCTL_PROC(_debug_lock_prof, OID_AUTO, stats, CTLTYPE_STRING | CTLFLAG_RD,
+    NULL, 0, dump_lock_prof_stats, "A", "Lock profiling statistics");
+
+SYSCTL_PROC(_debug_lock_prof, OID_AUTO, reset, CTLTYPE_INT | CTLFLAG_RW,
+    NULL, 0, reset_lock_prof_stats, "I", "Reset lock profiling statistics");
+#endif
+
+void
+lock_init(struct lock_object *lock, struct lock_class *class, const char *name,
+    const char *type, int flags)
+{
+	int i;
+
+	/* Check for double-init and zero object. */
+	KASSERT(!lock_initalized(lock), ("lock \"%s\" %p already initialized",
+	    name, lock));
+
+	/* Look up lock class to find its index. */
+	for (i = 0; i < LOCK_CLASS_MAX; i++)
+		if (lock_classes[i] == class) {
+			lock->lo_flags = i << LO_CLASSSHIFT;
+			break;
+		}
+	KASSERT(i < LOCK_CLASS_MAX, ("unknown lock class %p", class));
+
+	/* Initialize the lock object. */
+	lock->lo_name = name;
+	lock->lo_type = type != NULL ? type : name;
+	lock->lo_flags |= flags | LO_INITIALIZED;
+	LOCK_LOG_INIT(lock, 0);
+	WITNESS_INIT(lock);
+	lock_profile_object_init(lock, class, name);
+}
+
+void
+lock_destroy(struct lock_object *lock)
+{
+
+	KASSERT(lock_initalized(lock), ("lock %p is not initialized", lock));
+	lock_profile_object_destroy(lock);
+	WITNESS_DESTROY(lock);
+	LOCK_LOG_DESTROY(lock, 0);
+	lock->lo_flags &= ~LO_INITIALIZED;
+}
+
+#ifdef DDB
+DB_SHOW_COMMAND(lock, db_show_lock)
+{
+	struct lock_object *lock;
+	struct lock_class *class;
+
+	if (!have_addr)
+		return;
+	lock = (struct lock_object *)addr;
+	if (LO_CLASSINDEX(lock) > LOCK_CLASS_MAX) {
+		db_printf("Unknown lock class: %d\n", LO_CLASSINDEX(lock));
+		return;
+	}
+	class = LOCK_CLASS(lock);
+	db_printf(" class: %s\n", class->lc_name);
+	db_printf(" name: %s\n", lock->lo_name);
+	if (lock->lo_type && lock->lo_type != lock->lo_name)
+		db_printf(" type: %s\n", lock->lo_type);
+	class->lc_ddb_show(lock);
+}
+#endif
+
+#ifdef LOCK_PROFILING
+void _lock_profile_obtain_lock_success(struct lock_object *lo, int contested, uint64_t waittime, const char *file, int line)
+{
+        struct lock_profile_object *l = &lo->lo_profile_obj;
+
+	lo->lo_profile_obj.lpo_contest_holding = 0;
+	
+	if (contested)
+		lo->lo_profile_obj.lpo_contest_locking++;		
+	
+	l->lpo_filename = file;
+	l->lpo_lineno = line;
+	l->lpo_acqtime = nanoseconds(); 
+	if (waittime && (l->lpo_acqtime > waittime))
+		l->lpo_waittime = l->lpo_acqtime - waittime;
+	else
+		l->lpo_waittime = 0;
+}
+
+void _lock_profile_release_lock(struct lock_object *lo)
+{
+        struct lock_profile_object *l = &lo->lo_profile_obj;
+
+        if (l->lpo_acqtime) {
+                const char *unknown = "(unknown)";
+                u_int64_t acqtime, now, waittime;
+                struct lock_prof *mpp;
+                u_int hash;
+                const char *p = l->lpo_filename;
+                int collision = 0;
+
+                now = nanoseconds();
+                acqtime = l->lpo_acqtime;
+                waittime = l->lpo_waittime;
+                if (now <= acqtime)
+                        return;
+                if (p == NULL || *p == '\0')
+                        p = unknown;
+                hash = (l->lpo_namehash * 31 * 31 + (uintptr_t)p * 31 + l->lpo_lineno) & LPROF_HASH_MASK;
+                mpp = &lprof_buf[hash];
+                while (mpp->name != NULL) {
+                        if (mpp->line == l->lpo_lineno &&
+                          mpp->file == p &&
+                          mpp->namehash == l->lpo_namehash)
+                                break;
+                        /* If the lprof_hash entry is allocated to someone 
+			 * else, try the next one 
+			 */
+                        collision = 1;
+                        hash = (hash + 1) & LPROF_HASH_MASK;
+                        mpp = &lprof_buf[hash];
+                }
+                if (mpp->name == NULL) {
+                        int buf;
+
+                        buf = atomic_fetchadd_int(&allocated_lprof_buf, 1);
+                        /* Just exit if we cannot get a trace buffer */
+                        if (buf >= LPROF_HASH_SIZE) {
+                                ++lock_prof_rejected;
+                                return;
+                        }
+			mpp->file = p;
+			mpp->line = l->lpo_lineno;
+			mpp->namehash = l->lpo_namehash;
+			mpp->type = l->lpo_type;
+			mpp->name = lo->lo_name;
+
+			if (collision)
+				++lock_prof_collisions;
+			
+                        /* 
+			 * We might have raced someone else but who cares, 
+			 * they'll try again next time 
+			 */
+                        ++lock_prof_records;
+                }
+                LPROF_LOCK(hash);
+                /*
+                 * Record if the lock has been held longer now than ever
+                 * before.
+                 */
+                if (now - acqtime > mpp->cnt_max)
+                        mpp->cnt_max = now - acqtime;
+                mpp->cnt_tot += now - acqtime;
+                mpp->cnt_wait += waittime;
+                mpp->cnt_cur++;
+                /*
+                 * There's a small race, really we should cmpxchg
+                 * 0 with the current value, but that would bill
+                 * the contention to the wrong lock instance if
+                 * it followed this also.
+                 */
+                mpp->cnt_contest_holding += l->lpo_contest_holding;
+                mpp->cnt_contest_locking += l->lpo_contest_locking;
+                LPROF_UNLOCK(hash);
+
+        }
+        l->lpo_acqtime = 0;
+        l->lpo_waittime = 0;
+        l->lpo_contest_locking = 0;
+        l->lpo_contest_holding = 0;
+}
+#endif
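The profiler above maps each (name hash, file, line) triple to a slot in the
fixed lprof_buf[] table by open addressing: on a collision it probes the next
slot until it finds a matching record or a free one, and gives up when the
table is exhausted.  A standalone sketch of that lookup strategy (types, sizes
and the hash are illustrative, not the kernel's):

#include <stdint.h>
#include <stddef.h>

#define TBL_SIZE	4096			/* power of two */
#define TBL_MASK	(TBL_SIZE - 1)

struct rec {
	const char	*file;			/* NULL marks a free slot */
	int		line;
	uint64_t	count;
};

static struct rec table[TBL_SIZE];

/* Find (or claim) the record for a file/line pair by linear probing. */
static struct rec *
rec_lookup(const char *file, int line)
{
	unsigned h, probes;

	h = ((uintptr_t)file * 31 + line) & TBL_MASK;
	for (probes = 0; probes < TBL_SIZE; probes++) {
		if (table[h].file == NULL) {
			table[h].file = file;	/* claim the free slot */
			table[h].line = line;
			return (&table[h]);
		}
		if (table[h].file == file && table[h].line == line)
			return (&table[h]);
		h = (h + 1) & TBL_MASK;		/* collision: probe the next slot */
	}
	return (NULL);				/* table full, reject the record */
}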
Index: kern_fork.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_fork.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/kern_fork.c -L sys/kern/kern_fork.c -u -r1.2 -r1.3
--- sys/kern/kern_fork.c
+++ sys/kern/kern_fork.c
@@ -35,7 +35,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_fork.c,v 1.252 2005/07/01 16:28:30 ssouhlal Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_fork.c,v 1.282.2.1.2.1 2008/01/19 18:15:05 kib Exp $");
 
 #include "opt_ktrace.h"
 #include "opt_mac.h"
@@ -51,6 +51,7 @@
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
+#include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/pioctl.h>
 #include <sys/resourcevar.h>
@@ -59,13 +60,15 @@
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
 #include <sys/acct.h>
-#include <sys/mac.h>
 #include <sys/ktr.h>
 #include <sys/ktrace.h>
 #include <sys/unistd.h>	
 #include <sys/sx.h>
 #include <sys/signalvar.h>
 
+#include <security/audit/audit.h>
+#include <security/mac/mac_framework.h>
+
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
@@ -79,11 +82,6 @@
 };
 #endif
 
-static int forksleep; /* Place for fork1() to sleep on. */
-
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 fork(td, uap)
@@ -101,9 +99,6 @@
 	return (error);
 }
 
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 vfork(td, uap)
@@ -121,9 +116,6 @@
 	return (error);
 }
 
-/*
- * MPSAFE
- */
 int
 rfork(td, uap)
 	struct thread *td;
@@ -136,6 +128,7 @@
 	if ((uap->flags & RFKERNELONLY) != 0)
 		return (EINVAL);
 
+	AUDIT_ARG(fflags, uap->flags);
 	error = fork1(td, uap->flags, 0, &p2);
 	if (error == 0) {
 		td->td_retval[0] = p2 ? p2->p_pid : 0;
@@ -201,8 +194,8 @@
 	struct filedesc *fd;
 	struct filedesc_to_leader *fdtol;
 	struct thread *td2;
-	struct ksegrp *kg2;
 	struct sigacts *newsigacts;
+	struct vmspace *vm2;
 	int error;
 
 	/* Can't copy and clear. */
@@ -217,8 +210,8 @@
 	 */
 	if ((flags & RFPROC) == 0) {
 		if ((p1->p_flag & P_HADTHREADS) &&
-  	            (flags & (RFCFDG | RFFDG))) {
-  	        	PROC_LOCK(p1);
+		    (flags & (RFCFDG | RFFDG))) {
+			PROC_LOCK(p1);
 			if (thread_single(SINGLE_BOUNDARY)) {
 				PROC_UNLOCK(p1);
 				return (ERESTART);
@@ -226,7 +219,10 @@
 			PROC_UNLOCK(p1);
 		}
 
-		vm_forkproc(td, NULL, NULL, flags);
+		error = vm_forkproc(td, NULL, NULL, NULL, flags);
+		if (error)
+			goto norfproc_fail;
+
 		/*
 		 * Close all file descriptors.
 		 */
@@ -243,50 +239,50 @@
 		if (flags & RFFDG) 
 			fdunshare(p1, td);
 
-		if((p1->p_flag & P_HADTHREADS) &&
-		   (flags & (RFCFDG|RFFDG))) {
+norfproc_fail:
+		if ((p1->p_flag & P_HADTHREADS) &&
+		    (flags & (RFCFDG | RFFDG))) {
 			PROC_LOCK(p1);
 			thread_single_end();
 			PROC_UNLOCK(p1);
 		}
 		*procp = NULL;
-		return (0);
-	}
-
-	/*
-	 * Note 1:1 allows for forking with one thread coming out on the
-	 * other side with the expectation that the process is about to
-	 * exec.
-	 */
-	if (p1->p_flag & P_HADTHREADS) {
-		/*
-		 * Idle the other threads for a second.
-		 * Since the user space is copied, it must remain stable.
-		 * In addition, all threads (from the user perspective)
-		 * need to either be suspended or in the kernel,
-		 * where they will try restart in the parent and will
-		 * be aborted in the child.
-		 */
-		PROC_LOCK(p1);
-		if (thread_single(SINGLE_NO_EXIT)) {
-			/* Abort. Someone else is single threading before us. */
-			PROC_UNLOCK(p1);
-			return (ERESTART);
-		}
-		PROC_UNLOCK(p1);
-		/*
-		 * All other activity in this process
-		 * is now suspended at the user boundary,
-		 * (or other safe places if we think of any).
-		 */
+		return (error);
 	}
 
 	/* Allocate new proc. */
 	newproc = uma_zalloc(proc_zone, M_WAITOK);
+	if (TAILQ_EMPTY(&newproc->p_threads)) {
+		td2 = thread_alloc();
+		if (td2 == NULL) {
+			error = ENOMEM;
+			goto fail1;
+		}
+		proc_linkup(newproc, td2);
+		sched_newproc(newproc, td2);
+	} else
+		td2 = FIRST_THREAD_IN_PROC(newproc);
+
+	/* Allocate and switch to an alternate kstack if specified. */
+	if (pages != 0) {
+		if (!vm_thread_new_altkstack(td2, pages)) {
+			error = ENOMEM;
+			goto fail1;
+		}
+	}
+	if ((flags & RFMEM) == 0) {
+		vm2 = vmspace_fork(p1->p_vmspace);
+		if (vm2 == NULL) {
+			error = ENOMEM;
+			goto fail1;
+		}
+	} else
+		vm2 = NULL;
 #ifdef MAC
 	mac_init_proc(newproc);
 #endif
 	knlist_init(&newproc->p_klist, &newproc->p_mtx, NULL, NULL, NULL);
+	STAILQ_INIT(&newproc->p_ktr);
 
 	/* We have to lock the process tree while we look for a pid. */
 	sx_slock(&proctree_lock);
@@ -299,9 +295,8 @@
 	 * processes, maxproc is the limit.
 	 */
 	sx_xlock(&allproc_lock);
-	if ((nprocs >= maxproc - 10 &&
-	    suser_cred(td->td_ucred, SUSER_RUID) != 0) ||
-	    nprocs >= maxproc) {
+	if ((nprocs >= maxproc - 10 && priv_check_cred(td->td_ucred,
+	    PRIV_MAXPROC, 0) != 0) || nprocs >= maxproc) {
 		error = EAGAIN;
 		goto fail;
 	}
@@ -309,14 +304,16 @@
 	/*
 	 * Increment the count of procs running with this uid. Don't allow
 	 * a nonprivileged user to exceed their current limit.
+	 *
+	 * XXXRW: Can we avoid privilege here if it's not needed?
 	 */
-	error = suser_cred(td->td_ucred, SUSER_RUID|SUSER_ALLOWJAIL);
-	if (error==0)
+	error = priv_check_cred(td->td_ucred, PRIV_PROC_LIMIT, 0);
+	if (error == 0)
 		ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1, 0);
 	else {
 		PROC_LOCK(p1);
 		ok = chgproccnt(td->td_ucred->cr_ruidinfo, 1,
-  	             lim_cur(p1, RLIMIT_NPROC));
+		    lim_cur(p1, RLIMIT_NPROC));
 		PROC_UNLOCK(p1);
 	}
 	if (!ok) {
@@ -369,16 +366,14 @@
 		p2 = LIST_FIRST(&allproc);
 again:
 		for (; p2 != NULL; p2 = LIST_NEXT(p2, p_list)) {
-
 			while (p2->p_pid == trypid ||
 			    (p2->p_pgrp != NULL &&
 			    (p2->p_pgrp->pg_id == trypid ||
 			    (p2->p_session != NULL &&
 			    p2->p_session->s_sid == trypid)))) {
 				trypid++;
-				if (trypid >= pidchecked) 
+				if (trypid >= pidchecked)
 					goto retry;
-				
 			}
 			if (p2->p_pid > trypid && pidchecked > p2->p_pid)
 				pidchecked = p2->p_pid;
@@ -411,10 +406,31 @@
 	p2 = newproc;
 	p2->p_state = PRS_NEW;		/* protect against others */
 	p2->p_pid = trypid;
+	/*
+	 * Allow the scheduler to initialize the child.
+	 */
+	thread_lock(td);
+	sched_fork(td, td2);
+	thread_unlock(td);
+	AUDIT_ARG(pid, p2->p_pid);
 	LIST_INSERT_HEAD(&allproc, p2, p_list);
 	LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash);
+
+	PROC_LOCK(p2);
+	PROC_LOCK(p1);
+
 	sx_xunlock(&allproc_lock);
 
+	bcopy(&p1->p_startcopy, &p2->p_startcopy,
+	    __rangeof(struct proc, p_startcopy, p_endcopy));
+	PROC_UNLOCK(p1);
+
+	bzero(&p2->p_startzero,
+	    __rangeof(struct proc, p_startzero, p_endzero));
+
+	p2->p_ucred = crhold(td->td_ucred);
+	PROC_UNLOCK(p2);
+
 	/*
 	 * Malloc things while we don't hold any locks.
 	 */
@@ -445,9 +461,9 @@
 			 * shared process leaders.
 			 */
 			fdtol = p1->p_fdtol;
-			FILEDESC_LOCK_FAST(p1->p_fd);
+			FILEDESC_XLOCK(p1->p_fd);
 			fdtol->fdl_refcount++;
-			FILEDESC_UNLOCK_FAST(p1->p_fd);
+			FILEDESC_XUNLOCK(p1->p_fd);
 		} else {
 			/* 
 			 * Shared file descriptor table, and
@@ -463,52 +479,29 @@
 	 * Start by zeroing the section of proc that is zero-initialized,
 	 * then copy the section that is copied directly from the parent.
 	 */
-	td2 = FIRST_THREAD_IN_PROC(p2);
-	kg2 = FIRST_KSEGRP_IN_PROC(p2);
-
-	/* Allocate and switch to an alternate kstack if specified. */
-	if (pages != 0)
-		vm_thread_new_altkstack(td2, pages);
 
 	PROC_LOCK(p2);
 	PROC_LOCK(p1);
 
-	bzero(&p2->p_startzero,
-	    __rangeof(struct proc, p_startzero, p_endzero));
 	bzero(&td2->td_startzero,
 	    __rangeof(struct thread, td_startzero, td_endzero));
-	bzero(&kg2->kg_startzero,
-	    __rangeof(struct ksegrp, kg_startzero, kg_endzero));
 
-	bcopy(&p1->p_startcopy, &p2->p_startcopy,
-	    __rangeof(struct proc, p_startcopy, p_endcopy));
 	bcopy(&td->td_startcopy, &td2->td_startcopy,
 	    __rangeof(struct thread, td_startcopy, td_endcopy));
-	bcopy(&td->td_ksegrp->kg_startcopy, &kg2->kg_startcopy,
-	    __rangeof(struct ksegrp, kg_startcopy, kg_endcopy));
 
 	td2->td_sigstk = td->td_sigstk;
 	td2->td_sigmask = td->td_sigmask;
+	td2->td_flags = TDF_INMEM;
 
 	/*
 	 * Duplicate sub-structures as needed.
 	 * Increase reference counts on shared objects.
 	 */
-	p2->p_flag = 0;
+	p2->p_flag = P_INMEM;
+	p2->p_swtick = ticks;
 	if (p1->p_flag & P_PROFIL)
 		startprofclock(p2);
-	mtx_lock_spin(&sched_lock);
-	p2->p_sflag = PS_INMEM;
-	/*
-	 * Allow the scheduler to adjust the priority of the child and
-	 * parent while we hold the sched_lock.
-	 */
-	sched_fork(td, td2);
-
-	mtx_unlock_spin(&sched_lock);
-	p2->p_ucred = crhold(td->td_ucred);
-	td2->td_ucred = crhold(p2->p_ucred);	/* XXXKSE */
-
+	td2->td_ucred = crhold(p2->p_ucred);
 	pargs_hold(p2->p_args);
 
 	if (flags & RFSIGSHARE) {
@@ -529,7 +522,7 @@
 	/*
 	 * p_limit is copy-on-write.  Bump its refcount.
 	 */
-	p2->p_limit = lim_hold(p1->p_limit);
+	lim_fork(p1, p2);
 
 	pstats_fork(p1->p_stats, p2->p_stats);
 
@@ -657,23 +650,23 @@
 	 * Finish creating the child process.  It will return via a different
 	 * execution path later.  (ie: directly into user mode)
 	 */
-	vm_forkproc(td, p2, td2, flags);
+	vm_forkproc(td, p2, td2, vm2, flags);
 
 	if (flags == (RFFDG | RFPROC)) {
-		atomic_add_int(&cnt.v_forks, 1);
-		atomic_add_int(&cnt.v_forkpages, p2->p_vmspace->vm_dsize +
+		PCPU_INC(cnt.v_forks);
+		PCPU_ADD(cnt.v_forkpages, p2->p_vmspace->vm_dsize +
 		    p2->p_vmspace->vm_ssize);
 	} else if (flags == (RFFDG | RFPROC | RFPPWAIT | RFMEM)) {
-		atomic_add_int(&cnt.v_vforks, 1);
-		atomic_add_int(&cnt.v_vforkpages, p2->p_vmspace->vm_dsize +
+		PCPU_INC(cnt.v_vforks);
+		PCPU_ADD(cnt.v_vforkpages, p2->p_vmspace->vm_dsize +
 		    p2->p_vmspace->vm_ssize);
 	} else if (p1 == &proc0) {
-		atomic_add_int(&cnt.v_kthreads, 1);
-		atomic_add_int(&cnt.v_kthreadpages, p2->p_vmspace->vm_dsize +
+		PCPU_INC(cnt.v_kthreads);
+		PCPU_ADD(cnt.v_kthreadpages, p2->p_vmspace->vm_dsize +
 		    p2->p_vmspace->vm_ssize);
 	} else {
-		atomic_add_int(&cnt.v_rforks, 1);
-		atomic_add_int(&cnt.v_rforkpages, p2->p_vmspace->vm_dsize +
+		PCPU_INC(cnt.v_rforks);
+		PCPU_ADD(cnt.v_rforkpages, p2->p_vmspace->vm_dsize +
 		    p2->p_vmspace->vm_ssize);
 	}
 
@@ -688,18 +681,20 @@
 	 * Set the child start time and mark the process as being complete.
 	 */
 	microuptime(&p2->p_stats->p_start);
-	mtx_lock_spin(&sched_lock);
+	PROC_SLOCK(p2);
 	p2->p_state = PRS_NORMAL;
+	PROC_SUNLOCK(p2);
 
 	/*
 	 * If RFSTOPPED not requested, make child runnable and add to
 	 * run queue.
 	 */
 	if ((flags & RFSTOPPED) == 0) {
+		thread_lock(td2);
 		TD_SET_CAN_RUN(td2);
-		setrunqueue(td2, SRQ_BORING);
+		sched_add(td2, SRQ_BORING);
+		thread_unlock(td2);
 	}
-	mtx_unlock_spin(&sched_lock);
 
 	/*
 	 * Now can be swapped.
@@ -725,15 +720,6 @@
 	PROC_UNLOCK(p2);
 
 	/*
-	 * If other threads are waiting, let them continue now.
-	 */
-	if (p1->p_flag & P_HADTHREADS) {
-		PROC_LOCK(p1);
-		thread_single_end();
-		PROC_UNLOCK(p1);
-	}
-
-	/*
 	 * Return child proc pointer to parent.
 	 */
 	*procp = p2;
@@ -742,18 +728,14 @@
 	sx_sunlock(&proctree_lock);
 	if (ppsratecheck(&lastfail, &curfail, 1))
 		printf("maxproc limit exceeded by uid %i, please see tuning(7) and login.conf(5).\n",
-			td->td_ucred->cr_ruid);
+		    td->td_ucred->cr_ruid);
 	sx_xunlock(&allproc_lock);
 #ifdef MAC
 	mac_destroy_proc(newproc);
 #endif
+fail1:
 	uma_zfree(proc_zone, newproc);
-	if (p1->p_flag & P_HADTHREADS) {
-		PROC_LOCK(p1);
-		thread_single_end();
-		PROC_UNLOCK(p1);
-	}
-	tsleep(&forksleep, PUSER, "fork", hz / 2);
+	pause("fork", hz / 2);
 	return (error);
 }
 
@@ -769,33 +751,26 @@
 {
 	struct proc *p;
 	struct thread *td;
+	struct thread *dtd;
 
-	/*
-	 * Finish setting up thread glue so that it begins execution in a
-	 * non-nested critical section with sched_lock held but not recursed.
-	 */
 	td = curthread;
 	p = td->td_proc;
-	td->td_oncpu = PCPU_GET(cpuid);
 	KASSERT(p->p_state == PRS_NORMAL, ("executing process is still new"));
 
-	sched_lock.mtx_lock = (uintptr_t)td;
-	mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED);
 	CTR4(KTR_PROC, "fork_exit: new thread %p (kse %p, pid %d, %s)",
 		td, td->td_sched, p->p_pid, p->p_comm);
 
+	sched_fork_exit(td);
 	/*
-	 * Processes normally resume in mi_switch() after being
-	 * cpu_switch()'ed to, but when children start up they arrive here
-	 * instead, so we must do much the same things as mi_switch() would.
-	 */
-
-	if ((td = PCPU_GET(deadthread))) {
+	 * Processes normally resume in mi_switch() after being
+	 * cpu_switch()'ed to, but when children start up they arrive here
+	 * instead, so we must do much the same things as mi_switch() would.
+	 */
+	if ((dtd = PCPU_GET(deadthread))) {
 		PCPU_SET(deadthread, NULL);
-		thread_stash(td);
+		thread_stash(dtd);
 	}
-	td = curthread;
-	mtx_unlock_spin(&sched_lock);
+	thread_unlock(td);
 
 	/*
 	 * cpu_set_fork_handler intercepts this function call to
@@ -809,15 +784,14 @@
 	 * Check if a kernel thread misbehaved and returned from its main
 	 * function.
 	 */
-	PROC_LOCK(p);
 	if (p->p_flag & P_KTHREAD) {
-		PROC_UNLOCK(p);
 		printf("Kernel thread \"%s\" (pid %d) exited prematurely.\n",
 		    p->p_comm, p->p_pid);
 		kthread_exit(0);
 	}
-	PROC_UNLOCK(p);
 	mtx_assert(&Giant, MA_NOTOWNED);
+
+	EVENTHANDLER_INVOKE(schedtail, p);
 }
 
 /*
@@ -832,7 +806,7 @@
 	struct trapframe *frame;
 {
 
-	userret(td, frame, 0);
+	userret(td, frame);
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_SYSRET))
 		ktrsysret(SYS_fork, 0, 0);
--- /dev/null
+++ sys/kern/kern_rwlock.c
@@ -0,0 +1,948 @@
+/*-
+ * Copyright (c) 2006 John Baldwin <jhb at FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the author nor the names of any co-contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Machine independent bits of reader/writer lock implementation.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: src/sys/kern/kern_rwlock.c,v 1.28.4.2 2007/12/01 11:28:37 attilio Exp $");
+
+#include "opt_ddb.h"
+#include "opt_no_adaptive_rwlocks.h"
+
+#include <sys/param.h>
+#include <sys/ktr.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/rwlock.h>
+#include <sys/systm.h>
+#include <sys/turnstile.h>
+
+#include <machine/cpu.h>
+
+CTASSERT((RW_RECURSE & LO_CLASSFLAGS) == RW_RECURSE);
+
+#if defined(SMP) && !defined(NO_ADAPTIVE_RWLOCKS)
+#define	ADAPTIVE_RWLOCKS
+#endif
+
+#ifdef DDB
+#include <ddb/ddb.h>
+
+static void	db_show_rwlock(struct lock_object *lock);
+#endif
+static void	lock_rw(struct lock_object *lock, int how);
+static int	unlock_rw(struct lock_object *lock);
+
+struct lock_class lock_class_rw = {
+	.lc_name = "rw",
+	.lc_flags = LC_SLEEPLOCK | LC_RECURSABLE | LC_UPGRADABLE,
+#ifdef DDB
+	.lc_ddb_show = db_show_rwlock,
+#endif
+	.lc_lock = lock_rw,
+	.lc_unlock = unlock_rw,
+};
+
+/*
+ * Return a pointer to the owning thread if the lock is write-locked or
+ * NULL if the lock is unlocked or read-locked.
+ */
+#define	rw_wowner(rw)							\
+	((rw)->rw_lock & RW_LOCK_READ ? NULL :				\
+	    (struct thread *)RW_OWNER((rw)->rw_lock))
+
+/*
+ * Returns whether the write owner is recursed.  Write ownership is not
+ * assured here and should be checked beforehand.
+ */
+#define	rw_recursed(rw)		((rw)->rw_recurse != 0)
+
+/*
+ * Return true if curthread holds the lock.
+ */
+#define	rw_wlocked(rw)		(rw_wowner((rw)) == curthread)
+
+/*
+ * Return a pointer to the owning thread for this lock who should receive
+ * any priority lent by threads that block on this lock.  Currently this
+ * is identical to rw_wowner().
+ */
+#define	rw_owner(rw)		rw_wowner(rw)
+
+#ifndef INVARIANTS
+#define	_rw_assert(rw, what, file, line)
+#endif
+
+void
+lock_rw(struct lock_object *lock, int how)
+{
+	struct rwlock *rw;
+
+	rw = (struct rwlock *)lock;
+	if (how)
+		rw_wlock(rw);
+	else
+		rw_rlock(rw);
+}
+
+int
+unlock_rw(struct lock_object *lock)
+{
+	struct rwlock *rw;
+
+	rw = (struct rwlock *)lock;
+	rw_assert(rw, RA_LOCKED | LA_NOTRECURSED);
+	if (rw->rw_lock & RW_LOCK_READ) {
+		rw_runlock(rw);
+		return (0);
+	} else {
+		rw_wunlock(rw);
+		return (1);
+	}
+}
+
+void
+rw_init_flags(struct rwlock *rw, const char *name, int opts)
+{
+	int flags;
+
+	MPASS((opts & ~(RW_DUPOK | RW_NOPROFILE | RW_NOWITNESS | RW_QUIET |
+	    RW_RECURSE)) == 0);
+
+	flags = LO_UPGRADABLE | LO_RECURSABLE;
+	if (opts & RW_DUPOK)
+		flags |= LO_DUPOK;
+	if (opts & RW_NOPROFILE)
+		flags |= LO_NOPROFILE;
+	if (!(opts & RW_NOWITNESS))
+		flags |= LO_WITNESS;
+	if (opts & RW_QUIET)
+		flags |= LO_QUIET;
+	flags |= opts & RW_RECURSE;
+
+	rw->rw_lock = RW_UNLOCKED;
+	rw->rw_recurse = 0;
+	lock_init(&rw->lock_object, &lock_class_rw, name, NULL, flags);
+}
+
+void
+rw_destroy(struct rwlock *rw)
+{
+
+	KASSERT(rw->rw_lock == RW_UNLOCKED, ("rw lock not unlocked"));
+	KASSERT(rw->rw_recurse == 0, ("rw lock still recursed"));
+	rw->rw_lock = RW_DESTROYED;
+	lock_destroy(&rw->lock_object);
+}
+
+void
+rw_sysinit(void *arg)
+{
+	struct rw_args *args = arg;
+
+	rw_init(args->ra_rw, args->ra_desc);
+}
+
+int
+rw_wowned(struct rwlock *rw)
+{
+
+	return (rw_wowner(rw) == curthread);
+}
+
+void
+_rw_wlock(struct rwlock *rw, const char *file, int line)
+{
+
+	MPASS(curthread != NULL);
+	KASSERT(rw->rw_lock != RW_DESTROYED,
+	    ("rw_wlock() of destroyed rwlock @ %s:%d", file, line));
+	WITNESS_CHECKORDER(&rw->lock_object, LOP_NEWORDER | LOP_EXCLUSIVE, file,
+	    line);
+	__rw_wlock(rw, curthread, file, line);
+	LOCK_LOG_LOCK("WLOCK", &rw->lock_object, 0, rw->rw_recurse, file, line);
+	WITNESS_LOCK(&rw->lock_object, LOP_EXCLUSIVE, file, line);
+	curthread->td_locks++;
+}
+
+void
+_rw_wunlock(struct rwlock *rw, const char *file, int line)
+{
+
+	MPASS(curthread != NULL);
+	KASSERT(rw->rw_lock != RW_DESTROYED,
+	    ("rw_wunlock() of destroyed rwlock @ %s:%d", file, line));
+	_rw_assert(rw, RA_WLOCKED, file, line);
+	curthread->td_locks--;
+	WITNESS_UNLOCK(&rw->lock_object, LOP_EXCLUSIVE, file, line);
+	LOCK_LOG_LOCK("WUNLOCK", &rw->lock_object, 0, rw->rw_recurse, file,
+	    line);
+	if (!rw_recursed(rw))
+		lock_profile_release_lock(&rw->lock_object);
+	__rw_wunlock(rw, curthread, file, line);
+}
+
+void
+_rw_rlock(struct rwlock *rw, const char *file, int line)
+{
+	struct turnstile *ts;
+#ifdef ADAPTIVE_RWLOCKS
+	volatile struct thread *owner;
+#endif
+#ifdef LOCK_PROFILING_SHARED
+	uint64_t waittime = 0;
+	int contested = 0;
+#endif
+	uintptr_t x;
+
+	KASSERT(rw->rw_lock != RW_DESTROYED,
+	    ("rw_rlock() of destroyed rwlock @ %s:%d", file, line));
+	KASSERT(rw_wowner(rw) != curthread,
+	    ("%s (%s): wlock already held @ %s:%d", __func__,
+	    rw->lock_object.lo_name, file, line));
+	WITNESS_CHECKORDER(&rw->lock_object, LOP_NEWORDER, file, line);
+
+	/*
+	 * Note that we don't make any attempt to try to block read
+	 * locks once a writer has blocked on the lock.  The reason is
+	 * that we currently allow for read locks to recurse and we
+	 * don't keep track of all the holders of read locks.  Thus, if
+	 * we were to block readers once a writer blocked and a reader
+	 * tried to recurse on their reader lock after a writer had
+	 * blocked we would end up in a deadlock since the reader would
+	 * be blocked on the writer, and the writer would be blocked
+	 * waiting for the reader to release its original read lock.
+	 */
+	for (;;) {
+		/*
+		 * Handle the easy case.  If no other thread has a write
+		 * lock, then try to bump up the count of read locks.  Note
+		 * that we have to preserve the current state of the
+		 * RW_LOCK_WRITE_WAITERS flag.  If we fail to acquire a
+		 * read lock, then rw_lock must have changed, so restart
+		 * the loop.  Note that this handles the case of a
+		 * completely unlocked rwlock since such a lock is encoded
+		 * as a read lock with no waiters.
+		 */
+		x = rw->rw_lock;
+		if (x & RW_LOCK_READ) {
+
+			/*
+			 * The RW_LOCK_READ_WAITERS flag should only be set
+			 * if another thread currently holds a write lock,
+			 * and in that case RW_LOCK_READ should be clear.
+			 */
+			MPASS((x & RW_LOCK_READ_WAITERS) == 0);
+			if (atomic_cmpset_acq_ptr(&rw->rw_lock, x,
+			    x + RW_ONE_READER)) {
+#ifdef LOCK_PROFILING_SHARED
+				if (RW_READERS(x) == 0)
+					lock_profile_obtain_lock_success(
+					    &rw->lock_object, contested,
+					    waittime, file, line);
+#endif
+				if (LOCK_LOG_TEST(&rw->lock_object, 0))
+					CTR4(KTR_LOCK,
+					    "%s: %p succeed %p -> %p", __func__,
+					    rw, (void *)x,
+					    (void *)(x + RW_ONE_READER));
+				break;
+			}
+			cpu_spinwait();
+			continue;
+		}
+
+#ifdef ADAPTIVE_RWLOCKS
+		/*
+		 * If the owner is running on another CPU, spin until
+		 * the owner stops running or the state of the lock
+		 * changes.
+		 */
+		owner = (struct thread *)RW_OWNER(x);
+		if (TD_IS_RUNNING(owner)) {
+			if (LOCK_LOG_TEST(&rw->lock_object, 0))
+				CTR3(KTR_LOCK, "%s: spinning on %p held by %p",
+				    __func__, rw, owner);
+#ifdef LOCK_PROFILING_SHARED
+			lock_profile_obtain_lock_failed(&rw->lock_object,
+			    &contested, &waittime);
+#endif
+			while ((struct thread*)RW_OWNER(rw->rw_lock) == owner &&
+			    TD_IS_RUNNING(owner))
+				cpu_spinwait();
+			continue;
+		}
+#endif
+
+		/*
+		 * Okay, now it's the hard case.  Some other thread already
+		 * has a write lock, so acquire the turnstile lock so we can
+		 * begin the process of blocking.
+		 */
+		ts = turnstile_trywait(&rw->lock_object);
+
+		/*
+		 * The lock might have been released while we spun, so
+		 * recheck its state and restart the loop if there is no
+		 * longer a write lock.
+		 */
+		x = rw->rw_lock;
+		if (x & RW_LOCK_READ) {
+			turnstile_cancel(ts);
+			cpu_spinwait();
+			continue;
+		}
+
+#ifdef ADAPTIVE_RWLOCKS
+		/*
+		 * If the current owner of the lock is executing on another
+		 * CPU, quit the hard path and try to spin.
+		 */
+		owner = (struct thread *)RW_OWNER(x);
+		if (TD_IS_RUNNING(owner)) {
+			turnstile_cancel(ts);
+			cpu_spinwait();
+			continue;
+		}
+#endif
+
+		/*
+		 * Ok, it's still a write lock.  If the RW_LOCK_READ_WAITERS
+		 * flag is already set, then we can go ahead and block.  If
+		 * it is not set then try to set it.  If we fail to set it
+		 * drop the turnstile lock and restart the loop.
+		 */
+		if (!(x & RW_LOCK_READ_WAITERS)) {
+			if (!atomic_cmpset_ptr(&rw->rw_lock, x,
+			    x | RW_LOCK_READ_WAITERS)) {
+				turnstile_cancel(ts);
+				cpu_spinwait();
+				continue;
+			}
+			if (LOCK_LOG_TEST(&rw->lock_object, 0))
+				CTR2(KTR_LOCK, "%s: %p set read waiters flag",
+				    __func__, rw);
+		}
+
+		/*
+		 * We were unable to acquire the lock and the read waiters
+		 * flag is set, so we must block on the turnstile.
+		 */
+		if (LOCK_LOG_TEST(&rw->lock_object, 0))
+			CTR2(KTR_LOCK, "%s: %p blocking on turnstile", __func__,
+			    rw);
+#ifdef LOCK_PROFILING_SHARED
+		lock_profile_obtain_lock_failed(&rw->lock_object, &contested,
+		    &waittime);
+#endif
+		turnstile_wait(ts, rw_owner(rw), TS_SHARED_QUEUE);
+		if (LOCK_LOG_TEST(&rw->lock_object, 0))
+			CTR2(KTR_LOCK, "%s: %p resuming from turnstile",
+			    __func__, rw);
+	}
+
+	/*
+	 * TODO: acquire "owner of record" here.  Here be turnstile dragons
+	 * however.  turnstiles don't like owners changing between calls to
+	 * turnstile_wait() currently.
+	 */
+
+	LOCK_LOG_LOCK("RLOCK", &rw->lock_object, 0, 0, file, line);
+	WITNESS_LOCK(&rw->lock_object, 0, file, line);
+	curthread->td_locks++;
+}
+
+void
+_rw_runlock(struct rwlock *rw, const char *file, int line)
+{
+	struct turnstile *ts;
+	uintptr_t x;
+
+	KASSERT(rw->rw_lock != RW_DESTROYED,
+	    ("rw_runlock() of destroyed rwlock @ %s:%d", file, line));
+	_rw_assert(rw, RA_RLOCKED, file, line);
+	curthread->td_locks--;
+	WITNESS_UNLOCK(&rw->lock_object, 0, file, line);
+	LOCK_LOG_LOCK("RUNLOCK", &rw->lock_object, 0, 0, file, line);
+
+	/* TODO: drop "owner of record" here. */
+
+	for (;;) {
+		/*
+		 * See if there is more than one read lock held.  If so,
+		 * just drop one and return.
+		 */
+		x = rw->rw_lock;
+		if (RW_READERS(x) > 1) {
+			if (atomic_cmpset_ptr(&rw->rw_lock, x,
+			    x - RW_ONE_READER)) {
+				if (LOCK_LOG_TEST(&rw->lock_object, 0))
+					CTR4(KTR_LOCK,
+					    "%s: %p succeeded %p -> %p",
+					    __func__, rw, (void *)x,
+					    (void *)(x - RW_ONE_READER));
+				break;
+			}
+			continue;
+		}
+
+
+		/*
+		 * We should never have read waiters while at least one
+		 * thread holds a read lock.  (See note above)
+		 */
+		KASSERT(!(x & RW_LOCK_READ_WAITERS),
+		    ("%s: waiting readers", __func__));
+#ifdef LOCK_PROFILING_SHARED
+		lock_profile_release_lock(&rw->lock_object);
+#endif
+
+		/*
+		 * If there aren't any waiters for a write lock, then try
+		 * to drop it quickly.
+		 */
+		if (!(x & RW_LOCK_WRITE_WAITERS)) {
+
+			/*
+			 * There shouldn't be any flags set and we should
+			 * be the only read lock.  If we fail to release
+			 * the single read lock, then another thread might
+			 * have just acquired a read lock, so go back up
+			 * to the multiple read locks case.
+			 */
+			MPASS(x == RW_READERS_LOCK(1));
+			if (atomic_cmpset_ptr(&rw->rw_lock, RW_READERS_LOCK(1),
+			    RW_UNLOCKED)) {
+				if (LOCK_LOG_TEST(&rw->lock_object, 0))
+					CTR2(KTR_LOCK, "%s: %p last succeeded",
+					    __func__, rw);
+				break;
+			}
+			continue;
+		}
+
+		/*
+		 * There should just be one reader with one or more
+		 * writers waiting.
+		 */
+		MPASS(x == (RW_READERS_LOCK(1) | RW_LOCK_WRITE_WAITERS));
+
+		/*
+		 * Ok, we know we have a waiting writer and we think we
+		 * are the last reader, so grab the turnstile lock.
+		 */
+		turnstile_chain_lock(&rw->lock_object);
+
+		/*
+		 * Try to drop our lock leaving the lock in an unlocked
+		 * state.
+		 *
+		 * If you wanted to do explicit lock handoff you'd have to
+		 * do it here.  You'd also want to use turnstile_signal()
+		 * and you'd have to handle the race where a higher
+		 * priority thread blocks on the write lock before the
+		 * thread you wake up actually runs, letting the new thread
+		 * "steal" the lock.  For now it's a lot simpler to just
+		 * wake up all of the waiters.
+		 *
+		 * As above, if we fail, then another thread might have
+		 * acquired a read lock, so drop the turnstile lock and
+		 * restart.
+		 */
+		if (!atomic_cmpset_ptr(&rw->rw_lock,
+		    RW_READERS_LOCK(1) | RW_LOCK_WRITE_WAITERS, RW_UNLOCKED)) {
+			turnstile_chain_unlock(&rw->lock_object);
+			continue;
+		}
+		if (LOCK_LOG_TEST(&rw->lock_object, 0))
+			CTR2(KTR_LOCK, "%s: %p last succeeded with waiters",
+			    __func__, rw);
+
+		/*
+		 * Ok.  The lock is released and all that's left is to
+		 * wake up the waiters.  Note that the lock might not be
+		 * free anymore, but in that case the writers will just
+		 * block again if they run before the new lock holder(s)
+		 * release the lock.
+		 */
+		ts = turnstile_lookup(&rw->lock_object);
+		MPASS(ts != NULL);
+		turnstile_broadcast(ts, TS_EXCLUSIVE_QUEUE);
+		turnstile_unpend(ts, TS_SHARED_LOCK);
+		turnstile_chain_unlock(&rw->lock_object);
+		break;
+	}
+}
+
+/*
+ * This function is called when we are unable to obtain a write lock on the
+ * first try.  This means that at least one other thread holds either a
+ * read or write lock.
+ */
+void
+_rw_wlock_hard(struct rwlock *rw, uintptr_t tid, const char *file, int line)
+{
+	struct turnstile *ts;
+#ifdef ADAPTIVE_RWLOCKS
+	volatile struct thread *owner;
+#endif
+	uint64_t waittime = 0;
+	uintptr_t v;
+	int contested = 0;
+
+	if (rw_wlocked(rw)) {
+		KASSERT(rw->lock_object.lo_flags & RW_RECURSE,
+		    ("%s: recursing but non-recursive rw %s @ %s:%d\n",
+		    __func__, rw->lock_object.lo_name, file, line));
+		rw->rw_recurse++;
+		atomic_set_ptr(&rw->rw_lock, RW_LOCK_RECURSED);
+		if (LOCK_LOG_TEST(&rw->lock_object, 0))
+			CTR2(KTR_LOCK, "%s: %p recursing", __func__, rw);
+		return;
+	}
+
+	if (LOCK_LOG_TEST(&rw->lock_object, 0))
+		CTR5(KTR_LOCK, "%s: %s contested (lock=%p) at %s:%d", __func__,
+		    rw->lock_object.lo_name, (void *)rw->rw_lock, file, line);
+
+	while (!_rw_write_lock(rw, tid)) {
+#ifdef ADAPTIVE_RWLOCKS
+		/*
+		 * If the lock is write locked and the owner is
+		 * running on another CPU, spin until the owner stops
+		 * running or the state of the lock changes.
+		 */
+		v = rw->rw_lock;
+		owner = (struct thread *)RW_OWNER(v);
+		if (!(v & RW_LOCK_READ) && TD_IS_RUNNING(owner)) {
+			if (LOCK_LOG_TEST(&rw->lock_object, 0))
+				CTR3(KTR_LOCK, "%s: spinning on %p held by %p",
+				    __func__, rw, owner);
+			lock_profile_obtain_lock_failed(&rw->lock_object,
+			    &contested, &waittime);
+			while ((struct thread*)RW_OWNER(rw->rw_lock) == owner &&
+			    TD_IS_RUNNING(owner))
+				cpu_spinwait();
+			continue;
+		}
+#endif
+
+		ts = turnstile_trywait(&rw->lock_object);
+		v = rw->rw_lock;
+
+		/*
+		 * If the lock was released while spinning on the
+		 * turnstile chain lock, try again.
+		 */
+		if (v == RW_UNLOCKED) {
+			turnstile_cancel(ts);
+			cpu_spinwait();
+			continue;
+		}
+
+#ifdef ADAPTIVE_RWLOCKS
+		/*
+		 * If the current owner of the lock is executing on another
+		 * CPU, quit the hard path and try to spin.
+		 */
+		if (!(v & RW_LOCK_READ)) {
+			owner = (struct thread *)RW_OWNER(v);
+			if (TD_IS_RUNNING(owner)) {
+				turnstile_cancel(ts);
+				cpu_spinwait();
+				continue;
+			}
+		}
+#endif
+
+		/*
+		 * If the lock was released by a writer with both readers
+		 * and writers waiting and a reader hasn't woken up and
+		 * acquired the lock yet, rw_lock will be set to the
+		 * value RW_UNLOCKED | RW_LOCK_WRITE_WAITERS.  If we see
+		 * that value, try to acquire it once.  Note that we have
+		 * to preserve the RW_LOCK_WRITE_WAITERS flag as there are
+		 * other writers waiting still.  If we fail, restart the
+		 * loop.
+		 */
+		if (v == (RW_UNLOCKED | RW_LOCK_WRITE_WAITERS)) {
+			if (atomic_cmpset_acq_ptr(&rw->rw_lock,
+			    RW_UNLOCKED | RW_LOCK_WRITE_WAITERS,
+			    tid | RW_LOCK_WRITE_WAITERS)) {
+				turnstile_claim(ts);
+				CTR2(KTR_LOCK, "%s: %p claimed by new writer",
+				    __func__, rw);
+				break;
+			}
+			turnstile_cancel(ts);
+			cpu_spinwait();
+			continue;
+		}
+
+		/*
+		 * If the RW_LOCK_WRITE_WAITERS flag isn't set, then try to
+		 * set it.  If we fail to set it, then loop back and try
+		 * again.
+		 */
+		if (!(v & RW_LOCK_WRITE_WAITERS)) {
+			if (!atomic_cmpset_ptr(&rw->rw_lock, v,
+			    v | RW_LOCK_WRITE_WAITERS)) {
+				turnstile_cancel(ts);
+				cpu_spinwait();
+				continue;
+			}
+			if (LOCK_LOG_TEST(&rw->lock_object, 0))
+				CTR2(KTR_LOCK, "%s: %p set write waiters flag",
+				    __func__, rw);
+		}
+
+		/*
+		 * We were unable to acquire the lock and the write waiters
+		 * flag is set, so we must block on the turnstile.
+		 */
+		if (LOCK_LOG_TEST(&rw->lock_object, 0))
+			CTR2(KTR_LOCK, "%s: %p blocking on turnstile", __func__,
+			    rw);
+		lock_profile_obtain_lock_failed(&rw->lock_object, &contested,
+		    &waittime);
+		turnstile_wait(ts, rw_owner(rw), TS_EXCLUSIVE_QUEUE);
+		if (LOCK_LOG_TEST(&rw->lock_object, 0))
+			CTR2(KTR_LOCK, "%s: %p resuming from turnstile",
+			    __func__, rw);
+	}
+	lock_profile_obtain_lock_success(&rw->lock_object, contested, waittime,
+	    file, line);
+}
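
The recursion branch at the top of _rw_wlock_hard() above only applies to
locks initialized with RW_RECURSE.  A hedged sketch of what that allows on
the consumer side, reusing the hypothetical style (and headers) of the
earlier example:

static struct rwlock bar_lock;

static void	bar_inner(void);

static void
bar_setup(void)
{

	rw_init_flags(&bar_lock, "bar lock", RW_RECURSE);
}

static void
bar_outer(void)
{

	rw_wlock(&bar_lock);
	bar_inner();				/* re-enters the same write lock */
	rw_wunlock(&bar_lock);
}

static void
bar_inner(void)
{

	rw_wlock(&bar_lock);			/* recurses; rw_recurse is bumped */
	/* ... */
	rw_wunlock(&bar_lock);			/* unrecurses; lock stays write-held */
}
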
+
+/*
+ * This function is called if the first try at releasing a write lock failed.
+ * This means that one of the 2 waiter bits must be set indicating that at
+ * least one thread is waiting on this lock.
+ */
+void
+_rw_wunlock_hard(struct rwlock *rw, uintptr_t tid, const char *file, int line)
+{
+	struct turnstile *ts;
+	uintptr_t v;
+	int queue;
+
+	if (rw_wlocked(rw) && rw_recursed(rw)) {
+		if ((--rw->rw_recurse) == 0)
+			atomic_clear_ptr(&rw->rw_lock, RW_LOCK_RECURSED);
+		if (LOCK_LOG_TEST(&rw->lock_object, 0))
+			CTR2(KTR_LOCK, "%s: %p unrecursing", __func__, rw);
+		return;
+	}
+
+	KASSERT(rw->rw_lock & (RW_LOCK_READ_WAITERS | RW_LOCK_WRITE_WAITERS),
+	    ("%s: neither of the waiter flags are set", __func__));
+
+	if (LOCK_LOG_TEST(&rw->lock_object, 0))
+		CTR2(KTR_LOCK, "%s: %p contested", __func__, rw);
+
+	turnstile_chain_lock(&rw->lock_object);
+	ts = turnstile_lookup(&rw->lock_object);
+
+	MPASS(ts != NULL);
+
+	/*
+	 * Use the same algo as sx locks for now.  Prefer waking up shared
+	 * waiters if we have any over writers.  This is probably not ideal.
+	 *
+	 * 'v' is the value we are going to write back to rw_lock.  If we
+	 * have waiters on both queues, we need to preserve the state of
+	 * the waiter flag for the queue we don't wake up.  For now this is
+	 * hardcoded for the algorithm mentioned above.
+	 *
+	 * In the case of both readers and writers waiting we wakeup the
+	 * readers but leave the RW_LOCK_WRITE_WAITERS flag set.  If a
+	 * new writer comes in before a reader it will claim the lock up
+	 * above.  There is probably a potential priority inversion in
+	 * there that could be worked around either by waking both queues
+	 * of waiters or doing some complicated lock handoff gymnastics.
+	 */
+	v = RW_UNLOCKED;
+	if (rw->rw_lock & RW_LOCK_READ_WAITERS) {
+		queue = TS_SHARED_QUEUE;
+		v |= (rw->rw_lock & RW_LOCK_WRITE_WAITERS);
+	} else
+		queue = TS_EXCLUSIVE_QUEUE;
+
+	/* Wake up all waiters for the specific queue. */
+	if (LOCK_LOG_TEST(&rw->lock_object, 0))
+		CTR3(KTR_LOCK, "%s: %p waking up %s waiters", __func__, rw,
+		    queue == TS_SHARED_QUEUE ? "read" : "write");
+	turnstile_broadcast(ts, queue);
+	atomic_store_rel_ptr(&rw->rw_lock, v);
+	turnstile_unpend(ts, TS_EXCLUSIVE_LOCK);
+	turnstile_chain_unlock(&rw->lock_object);
+}
+
+/*
+ * Attempt to do a non-blocking upgrade from a read lock to a write
+ * lock.  This will only succeed if this thread holds a single read
+ * lock.  Returns true if the upgrade succeeded and false otherwise.
+ */
+int
+_rw_try_upgrade(struct rwlock *rw, const char *file, int line)
+{
+	uintptr_t v, tid;
+	struct turnstile *ts;
+	int success;
+
+	KASSERT(rw->rw_lock != RW_DESTROYED,
+	    ("rw_try_upgrade() of destroyed rwlock @ %s:%d", file, line));
+	_rw_assert(rw, RA_RLOCKED, file, line);
+
+	/*
+	 * Attempt to switch from one reader to a writer.  If there
+	 * are any write waiters, then we will have to lock the
+	 * turnstile first to prevent races with another writer
+	 * calling turnstile_wait() before we have claimed this
+	 * turnstile.  So, do the simple case of no waiters first.
+	 */
+	tid = (uintptr_t)curthread;
+	if (!(rw->rw_lock & RW_LOCK_WRITE_WAITERS)) {
+		success = atomic_cmpset_ptr(&rw->rw_lock, RW_READERS_LOCK(1),
+		    tid);
+		goto out;
+	}
+
+	/*
+	 * Ok, we think we have write waiters, so lock the
+	 * turnstile.
+	 */
+	ts = turnstile_trywait(&rw->lock_object);
+
+	/*
+	 * Try to switch from one reader to a writer again.  This time
+	 * we honor the current state of the RW_LOCK_WRITE_WAITERS
+	 * flag.  If we obtain the lock with the flag set, then claim
+	 * ownership of the turnstile.
+	 */
+	v = rw->rw_lock & RW_LOCK_WRITE_WAITERS;
+	success = atomic_cmpset_ptr(&rw->rw_lock, RW_READERS_LOCK(1) | v,
+	    tid | v);
+	if (success && v)
+		turnstile_claim(ts);
+	else
+		turnstile_cancel(ts);
+out:
+	LOCK_LOG_TRY("WUPGRADE", &rw->lock_object, 0, success, file, line);
+	if (success)
+		WITNESS_UPGRADE(&rw->lock_object, LOP_EXCLUSIVE | LOP_TRYLOCK,
+		    file, line);
+	return (success);
+}
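
rw_try_upgrade() is typically used for a lookup-then-modify pattern: take the
lock shared on the common read path and escalate only when a change turns out
to be necessary.  An illustrative sketch with the hypothetical foo_lock and
foo_value from the earlier example; note that the state must be rechecked
when the upgrade fails, because the read lock is briefly dropped:

static void
foo_clamp(void)
{

	rw_rlock(&foo_lock);
	if (foo_value < 0) {
		if (!rw_try_upgrade(&foo_lock)) {
			/*
			 * The upgrade failed; drop the read lock and take
			 * the write lock outright, then revalidate.
			 */
			rw_runlock(&foo_lock);
			rw_wlock(&foo_lock);
		}
		if (foo_value < 0)
			foo_value = 0;
		rw_wunlock(&foo_lock);
		return;
	}
	rw_runlock(&foo_lock);
}
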
+
+/*
+ * Downgrade a write lock into a single read lock.
+ */
+void
+_rw_downgrade(struct rwlock *rw, const char *file, int line)
+{
+	struct turnstile *ts;
+	uintptr_t tid, v;
+
+	KASSERT(rw->rw_lock != RW_DESTROYED,
+	    ("rw_downgrade() of destroyed rwlock @ %s:%d", file, line));
+	_rw_assert(rw, RA_WLOCKED | RA_NOTRECURSED, file, line);
+#ifndef INVARIANTS
+	if (rw_recursed(rw))
+		panic("downgrade of a recursed lock");
+#endif
+
+	WITNESS_DOWNGRADE(&rw->lock_object, 0, file, line);
+
+	/*
+	 * Convert from a writer to a single reader.  First we handle
+	 * the easy case with no waiters.  If there are any waiters, we
+	 * lock the turnstile, "disown" the lock, and awaken any read
+	 * waiters.
+	 */
+	tid = (uintptr_t)curthread;
+	if (atomic_cmpset_rel_ptr(&rw->rw_lock, tid, RW_READERS_LOCK(1)))
+		goto out;
+
+	/*
+	 * Ok, we think we have waiters, so lock the turnstile so we can
+	 * read the waiter flags without any races.
+	 */
+	turnstile_chain_lock(&rw->lock_object);
+	v = rw->rw_lock;
+	MPASS(v & (RW_LOCK_READ_WAITERS | RW_LOCK_WRITE_WAITERS));
+
+	/*
+	 * Downgrade from a write lock while preserving
+	 * RW_LOCK_WRITE_WAITERS and give up ownership of the
+	 * turnstile.  If there are any read waiters, wake them up.
+	 */
+	ts = turnstile_lookup(&rw->lock_object);
+	MPASS(ts != NULL);
+	if (v & RW_LOCK_READ_WAITERS)
+		turnstile_broadcast(ts, TS_SHARED_QUEUE);
+	atomic_store_rel_ptr(&rw->rw_lock, RW_READERS_LOCK(1) |
+	    (v & RW_LOCK_WRITE_WAITERS));
+	if (v & RW_LOCK_READ_WAITERS)
+		turnstile_unpend(ts, TS_EXCLUSIVE_LOCK);
+	else if (ts)
+		turnstile_disown(ts);
+	turnstile_chain_unlock(&rw->lock_object);
+out:
+	LOCK_LOG_LOCK("WDOWNGRADE", &rw->lock_object, 0, 0, file, line);
+}
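
rw_downgrade() covers the opposite direction: a writer that has finished its
update but still needs to read the protected state can drop to shared mode
with no window in which the lock is free.  Another hedged sketch using the
same hypothetical names:

static void	foo_consume(int);		/* hypothetical read-side helper */

static void
foo_publish(int v)
{

	rw_wlock(&foo_lock);
	foo_value = v;				/* exclusive update */
	rw_downgrade(&foo_lock);		/* now read-locked; no unlocked gap */
	foo_consume(foo_value);			/* read-side work under the shared lock */
	rw_runlock(&foo_lock);
}
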
+
+#ifdef INVARIANT_SUPPORT
+#ifndef INVARIANTS
+#undef _rw_assert
+#endif
+
+/*
+ * In the non-WITNESS case, rw_assert() can only detect that at least
+ * *some* thread owns an rlock, but it cannot guarantee that *this*
+ * thread owns an rlock.
+ */
+void
+_rw_assert(struct rwlock *rw, int what, const char *file, int line)
+{
+
+	if (panicstr != NULL)
+		return;
+	switch (what) {
+	case RA_LOCKED:
+	case RA_LOCKED | RA_RECURSED:
+	case RA_LOCKED | RA_NOTRECURSED:
+	case RA_RLOCKED:
+#ifdef WITNESS
+		witness_assert(&rw->lock_object, what, file, line);
+#else
+		/*
+		 * If some other thread has a write lock or we have one
+		 * and are asserting a read lock, fail.  Also, if no one
+		 * has a lock at all, fail.
+		 */
+		if (rw->rw_lock == RW_UNLOCKED ||
+		    (!(rw->rw_lock & RW_LOCK_READ) && (what == RA_RLOCKED ||
+		    rw_wowner(rw) != curthread)))
+			panic("Lock %s not %slocked @ %s:%d\n",
+			    rw->lock_object.lo_name, (what == RA_RLOCKED) ?
+			    "read " : "", file, line);
+
+		if (!(rw->rw_lock & RW_LOCK_READ)) {
+			if (rw_recursed(rw)) {
+				if (what & RA_NOTRECURSED)
+					panic("Lock %s recursed @ %s:%d\n",
+					    rw->lock_object.lo_name, file,
+					    line);
+			} else if (what & RA_RECURSED)
+				panic("Lock %s not recursed @ %s:%d\n",
+				    rw->lock_object.lo_name, file, line);
+		}
+#endif
+		break;
+	case RA_WLOCKED:
+	case RA_WLOCKED | RA_RECURSED:
+	case RA_WLOCKED | RA_NOTRECURSED:
+		if (rw_wowner(rw) != curthread)
+			panic("Lock %s not exclusively locked @ %s:%d\n",
+			    rw->lock_object.lo_name, file, line);
+		if (rw_recursed(rw)) {
+			if (what & RA_NOTRECURSED)
+				panic("Lock %s recursed @ %s:%d\n",
+				    rw->lock_object.lo_name, file, line);
+		} else if (what & RA_RECURSED)
+			panic("Lock %s not recursed @ %s:%d\n",
+			    rw->lock_object.lo_name, file, line);
+		break;
+	case RA_UNLOCKED:
+#ifdef WITNESS
+		witness_assert(&rw->lock_object, what, file, line);
+#else
+		/*
+		 * If we hold a write lock, fail.  We can't reliably check
+		 * to see if we hold a read lock or not.
+		 */
+		if (rw_wowner(rw) == curthread)
+			panic("Lock %s exclusively locked @ %s:%d\n",
+			    rw->lock_object.lo_name, file, line);
+#endif
+		break;
+	default:
+		panic("Unknown rw lock assertion: %d @ %s:%d", what, file,
+		    line);
+	}
+}
+#endif /* INVARIANT_SUPPORT */
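
Consumers normally use the assertion entry point above to document their
locking contract at function boundaries; with INVARIANTS disabled the check
compiles away.  A short hypothetical example in the style of the sketches
above:

static int
foo_value_locked(void)
{

	rw_assert(&foo_lock, RA_LOCKED);	/* caller holds it, shared or exclusive */
	return (foo_value);
}
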
+
+#ifdef DDB
+void
+db_show_rwlock(struct lock_object *lock)
+{
+	struct rwlock *rw;
+	struct thread *td;
+
+	rw = (struct rwlock *)lock;
+
+	db_printf(" state: ");
+	if (rw->rw_lock == RW_UNLOCKED)
+		db_printf("UNLOCKED\n");
+	else if (rw->rw_lock == RW_DESTROYED) {
+		db_printf("DESTROYED\n");
+		return;
+	} else if (rw->rw_lock & RW_LOCK_READ)
+		db_printf("RLOCK: %ju locks\n",
+		    (uintmax_t)(RW_READERS(rw->rw_lock)));
+	else {
+		td = rw_wowner(rw);
+		db_printf("WLOCK: %p (tid %d, pid %d, \"%s\")\n", td,
+		    td->td_tid, td->td_proc->p_pid, td->td_proc->p_comm);
+		if (rw_recursed(rw))
+			db_printf(" recursed: %u\n", rw->rw_recurse);
+	}
+	db_printf(" waiters: ");
+	switch (rw->rw_lock & (RW_LOCK_READ_WAITERS | RW_LOCK_WRITE_WAITERS)) {
+	case RW_LOCK_READ_WAITERS:
+		db_printf("readers\n");
+		break;
+	case RW_LOCK_WRITE_WAITERS:
+		db_printf("writers\n");
+		break;
+	case RW_LOCK_READ_WAITERS | RW_LOCK_WRITE_WAITERS:
+		db_printf("readers and writers\n");
+		break;
+	default:
+		db_printf("none\n");
+		break;
+	}
+}
+
+#endif
Index: sys_socket.c
===================================================================
RCS file: /home/cvs/src/sys/kern/sys_socket.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/sys_socket.c -L sys/kern/sys_socket.c -u -r1.1.1.1 -r1.2
--- sys/kern/sys_socket.c
+++ sys/kern/sys_socket.c
@@ -30,7 +30,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/sys_socket.c,v 1.69 2005/04/16 18:46:28 rwatson Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/sys_socket.c,v 1.73 2007/08/06 14:26:00 rwatson Exp $");
 
 #include "opt_mac.h"
 
@@ -38,7 +38,6 @@
 #include <sys/systm.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
-#include <sys/mac.h>
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/sigio.h>
@@ -55,7 +54,9 @@
 #include <net/if.h>
 #include <net/route.h>
 
-struct	fileops socketops = {
+#include <security/mac/mac_framework.h>
+
+struct fileops	socketops = {
 	.fo_read = soo_read,
 	.fo_write = soo_write,
 	.fo_ioctl = soo_ioctl,
@@ -68,78 +69,54 @@
 
 /* ARGSUSED */
 int
-soo_read(fp, uio, active_cred, flags, td)
-	struct file *fp;
-	struct uio *uio;
-	struct ucred *active_cred;
-	struct thread *td;
-	int flags;
+soo_read(struct file *fp, struct uio *uio, struct ucred *active_cred,
+    int flags, struct thread *td)
 {
 	struct socket *so = fp->f_data;
+#ifdef MAC
 	int error;
 
-	NET_LOCK_GIANT();
-#ifdef MAC
 	SOCK_LOCK(so);
 	error = mac_check_socket_receive(active_cred, so);
 	SOCK_UNLOCK(so);
-	if (error) {
-		NET_UNLOCK_GIANT();
+	if (error)
 		return (error);
-	}
 #endif
-	error = so->so_proto->pr_usrreqs->pru_soreceive(so, 0, uio, 0, 0, 0);
-	NET_UNLOCK_GIANT();
-	return (error);
+	return (soreceive(so, 0, uio, 0, 0, 0));
 }
 
 /* ARGSUSED */
 int
-soo_write(fp, uio, active_cred, flags, td)
-	struct file *fp;
-	struct uio *uio;
-	struct ucred *active_cred;
-	struct thread *td;
-	int flags;
+soo_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
+    int flags, struct thread *td)
 {
 	struct socket *so = fp->f_data;
 	int error;
 
-	NET_LOCK_GIANT();
 #ifdef MAC
 	SOCK_LOCK(so);
 	error = mac_check_socket_send(active_cred, so);
 	SOCK_UNLOCK(so);
-	if (error) {
-		NET_UNLOCK_GIANT();
+	if (error)
 		return (error);
-	}
 #endif
-	error = so->so_proto->pr_usrreqs->pru_sosend(so, 0, uio, 0, 0, 0,
-						    uio->uio_td);
+	error = sosend(so, 0, uio, 0, 0, 0, uio->uio_td);
 	if (error == EPIPE && (so->so_options & SO_NOSIGPIPE) == 0) {
 		PROC_LOCK(uio->uio_td->td_proc);
 		psignal(uio->uio_td->td_proc, SIGPIPE);
 		PROC_UNLOCK(uio->uio_td->td_proc);
 	}
-	NET_UNLOCK_GIANT();
 	return (error);
 }
 
 int
-soo_ioctl(fp, cmd, data, active_cred, td)
-	struct file *fp;
-	u_long cmd;
-	void *data;
-	struct ucred *active_cred;
-	struct thread *td;
+soo_ioctl(struct file *fp, u_long cmd, void *data, struct ucred *active_cred,
+    struct thread *td)
 {
 	struct socket *so = fp->f_data;
 	int error = 0;
 
-	NET_LOCK_GIANT();
 	switch (cmd) {
-
 	case FIONBIO:
 		SOCK_LOCK(so);
 		if (*(int *)data)
@@ -151,10 +128,10 @@
 
 	case FIOASYNC:
 		/*
-		 * XXXRW: This code separately acquires SOCK_LOCK(so)
-		 * and SOCKBUF_LOCK(&so->so_rcv) even though they are
-		 * the same mutex to avoid introducing the assumption
-		 * that they are the same.
+		 * XXXRW: This code separately acquires SOCK_LOCK(so) and
+		 * SOCKBUF_LOCK(&so->so_rcv) even though they are the same
+		 * mutex to avoid introducing the assumption that they are
+		 * the same.
 		 */
 		if (*(int *)data) {
 			SOCK_LOCK(so);
@@ -206,9 +183,9 @@
 		break;
 	default:
 		/*
-		 * Interface/routing/protocol specific ioctls:
-		 * interface and routing ioctls should have a
-		 * different entry since a socket's unnecessary
+		 * Interface/routing/protocol specific ioctls: interface and
+		 * routing ioctls should have a different entry since a
+		 * socket is unnecessary.
 		 */
 		if (IOCGROUP(cmd) == 'i')
 			error = ifioctl(so, cmd, data, td);
@@ -219,65 +196,50 @@
 			    (so, cmd, data, 0, td));
 		break;
 	}
-	NET_UNLOCK_GIANT();
-	return(error);
+	return (error);
 }
 
 int
-soo_poll(fp, events, active_cred, td)
-	struct file *fp;
-	int events;
-	struct ucred *active_cred;
-	struct thread *td;
+soo_poll(struct file *fp, int events, struct ucred *active_cred,
+    struct thread *td)
 {
 	struct socket *so = fp->f_data;
+#ifdef MAC
 	int error;
 
-	NET_LOCK_GIANT();
-#ifdef MAC
 	SOCK_LOCK(so);
 	error = mac_check_socket_poll(active_cred, so);
 	SOCK_UNLOCK(so);
-	if (error) {
-		NET_UNLOCK_GIANT();
+	if (error)
 		return (error);
-	}
 #endif
-	error = (so->so_proto->pr_usrreqs->pru_sopoll)
-	    (so, events, fp->f_cred, td);
-	NET_UNLOCK_GIANT();
-
-	return (error);
+	return (sopoll(so, events, fp->f_cred, td));
 }
 
 int
-soo_stat(fp, ub, active_cred, td)
-	struct file *fp;
-	struct stat *ub;
-	struct ucred *active_cred;
-	struct thread *td;
+soo_stat(struct file *fp, struct stat *ub, struct ucred *active_cred,
+    struct thread *td)
 {
 	struct socket *so = fp->f_data;
+#ifdef MAC
 	int error;
+#endif
 
 	bzero((caddr_t)ub, sizeof (*ub));
 	ub->st_mode = S_IFSOCK;
-	NET_LOCK_GIANT();
 #ifdef MAC
 	SOCK_LOCK(so);
 	error = mac_check_socket_stat(active_cred, so);
 	SOCK_UNLOCK(so);
-	if (error) {
-		NET_UNLOCK_GIANT();
+	if (error)
 		return (error);
-	}
 #endif
 	/*
 	 * If SBS_CANTRCVMORE is set, but there's still data left in the
 	 * receive buffer, the socket is still readable.
 	 *
-	 * XXXRW: perhaps should lock socket buffer so st_size result
-	 * is consistent.
+	 * XXXRW: perhaps should lock socket buffer so st_size result is
+	 * consistent.
 	 */
 	/* Unlocked read. */
 	if ((so->so_rcv.sb_state & SBS_CANTRCVMORE) == 0 ||
@@ -288,33 +250,27 @@
 	ub->st_size = so->so_rcv.sb_cc - so->so_rcv.sb_ctl;
 	ub->st_uid = so->so_cred->cr_uid;
 	ub->st_gid = so->so_cred->cr_gid;
-	error = (*so->so_proto->pr_usrreqs->pru_sense)(so, ub);
-	NET_UNLOCK_GIANT();
-	return (error);
+	return (*so->so_proto->pr_usrreqs->pru_sense)(so, ub);
 }
 
 /*
- * API socket close on file pointer.  We call soclose() to close the 
- * socket (including initiating closing protocols).  soclose() will
- * sorele() the file reference but the actual socket will not go away
- * until the socket's ref count hits 0.
+ * API socket close on file pointer.  We call soclose() to close the socket
+ * (including initiating closing protocols).  soclose() will sorele() the
+ * file reference but the actual socket will not go away until the socket's
+ * ref count hits 0.
  */
 /* ARGSUSED */
 int
-soo_close(fp, td)
-	struct file *fp;
-	struct thread *td;
+soo_close(struct file *fp, struct thread *td)
 {
 	int error = 0;
 	struct socket *so;
 
-	NET_LOCK_GIANT();
 	so = fp->f_data;
 	fp->f_ops = &badfileops;
 	fp->f_data = NULL;
 
 	if (so)
 		error = soclose(so);
-	NET_UNLOCK_GIANT();
 	return (error);
 }
Index: uipc_cow.c
===================================================================
RCS file: /home/cvs/src/sys/kern/uipc_cow.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/uipc_cow.c -L sys/kern/uipc_cow.c -u -r1.1.1.1 -r1.2
--- sys/kern/uipc_cow.c
+++ sys/kern/uipc_cow.c
@@ -36,7 +36,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/uipc_cow.c,v 1.23.2.1 2005/10/26 20:21:23 delphij Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/uipc_cow.c,v 1.26 2005/10/23 07:41:56 alc Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
Index: vfs_cluster.c
===================================================================
RCS file: /home/cvs/src/sys/kern/vfs_cluster.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/vfs_cluster.c -L sys/kern/vfs_cluster.c -u -r1.2 -r1.3
--- sys/kern/vfs_cluster.c
+++ sys/kern/vfs_cluster.c
@@ -32,7 +32,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/vfs_cluster.c,v 1.166.2.3 2006/03/22 17:54:50 tegge Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/vfs_cluster.c,v 1.176 2007/06/01 01:12:44 jeff Exp $");
 
 #include "opt_debug_cluster.h"
 
@@ -58,7 +58,7 @@
     "Debug VFS clustering code");
 #endif
 
-static MALLOC_DEFINE(M_SEGMENT, "cluster_save buffer", "cluster_save buffer");
+static MALLOC_DEFINE(M_SEGMENT, "cl_savebuf", "cluster_save buffer");
 
 static struct cluster_save *
 	cluster_collectbufs(struct vnode *vp, struct buf *last_bp);
@@ -228,7 +228,7 @@
 			BUF_KERNPROC(bp);
 		bp->b_iooffset = dbtob(bp->b_blkno);
 		bstrategy(bp);
-		curproc->p_stats->p_ru.ru_inblock++;
+		curthread->td_ru.ru_inblock++;
 	}
 
 	/*
@@ -281,7 +281,7 @@
 			BUF_KERNPROC(rbp);
 		rbp->b_iooffset = dbtob(rbp->b_blkno);
 		bstrategy(rbp);
-		curproc->p_stats->p_ru.ru_inblock++;
+		curthread->td_ru.ru_inblock++;
 	}
 
 	if (reqbp)
@@ -595,7 +595,7 @@
 	int async;
 
 	if (vp->v_type == VREG) {
-		async = vp->v_mount->mnt_flag & MNT_ASYNC;
+		async = vp->v_mount->mnt_kern_flag & MNTK_ASYNC;
 		lblocksize = vp->v_mount->mnt_stat.f_iosize;
 	} else {
 		async = 0;
@@ -770,6 +770,12 @@
 			--len;
 			continue;
 		}
+		if (tbp->b_pin_count >  0) {
+			BUF_UNLOCK(tbp);
+			++start_lbn;
+			--len;
+			continue;
+		}
 		bremfree(tbp);
 		tbp->b_flags &= ~B_DONE;
 
@@ -873,6 +879,15 @@
 					BUF_UNLOCK(tbp);
 					break;
 				}
+
+				/*
+				 * Do not pull in pinned buffers.
+				 */
+				if (tbp->b_pin_count > 0) {
+					BUF_UNLOCK(tbp);
+					break;
+				}
+
 				/*
 				 * Ok, it's passed all the tests,
 				 * so remove it from the free list
@@ -896,7 +911,7 @@
 				if (i != 0) { /* if not first buffer */
 					for (j = 0; j < tbp->b_npages; j += 1) {
 						m = tbp->b_pages[j];
-						if (m->flags & PG_BUSY) {
+						if (m->oflags & VPO_BUSY) {
 							VM_OBJECT_UNLOCK(
 							    tbp->b_object);
 							bqrelse(tbp);
Index: kern_proc.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_proc.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/kern_proc.c -L sys/kern/kern_proc.c -u -r1.2 -r1.3
--- sys/kern/kern_proc.c
+++ sys/kern/kern_proc.c
@@ -27,11 +27,10 @@
  * SUCH DAMAGE.
  *
  *	@(#)kern_proc.c	8.7 (Berkeley) 2/14/95
- * $FreeBSD: src/sys/kern/kern_proc.c,v 1.230.2.3 2006/01/05 20:23:10 truckman Exp $
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_proc.c,v 1.230.2.3 2006/01/05 20:23:10 truckman Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_proc.c,v 1.252.2.2.2.1 2008/01/19 18:15:05 kib Exp $");
 
 #include "opt_ktrace.h"
 #include "opt_kstack_pages.h"
@@ -43,6 +42,7 @@
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
+#include <sys/refcount.h>
 #include <sys/sysent.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
@@ -54,6 +54,7 @@
 #include <sys/user.h>
 #include <sys/jail.h>
 #include <sys/vnode.h>
+#include <sys/eventhandler.h>
 #ifdef KTRACE
 #include <sys/uio.h>
 #include <sys/ktrace.h>
@@ -92,7 +93,6 @@
 struct proclist zombproc;
 struct sx allproc_lock;
 struct sx proctree_lock;
-struct mtx pargs_ref_lock;
 struct mtx ppeers_lock;
 uma_zone_t proc_zone;
 uma_zone_t ithread_zone;
@@ -111,7 +111,6 @@
 
 	sx_init(&allproc_lock, "allproc");
 	sx_init(&proctree_lock, "proctree");
-	mtx_init(&pargs_ref_lock, "struct pargs.ref", NULL, MTX_DEF);
 	mtx_init(&ppeers_lock, "p_peers", NULL, MTX_DEF);
 	LIST_INIT(&allproc);
 	LIST_INIT(&zombproc);
@@ -132,6 +131,7 @@
 	struct proc *p;
 
 	p = (struct proc *)mem;
+	EVENTHANDLER_INVOKE(process_ctor, p);
 	return (0);
 }
 
@@ -143,29 +143,28 @@
 {
 	struct proc *p;
 	struct thread *td;
-#ifdef INVARIANTS
-	struct ksegrp *kg;
-#endif
 
 	/* INVARIANTS checks go here */
 	p = (struct proc *)mem;
         td = FIRST_THREAD_IN_PROC(p);
+	if (td != NULL) {
 #ifdef INVARIANTS
-	KASSERT((p->p_numthreads == 1),
-	    ("bad number of threads in exiting process"));
-	KASSERT((p->p_numksegrps == 1), ("free proc with > 1 ksegrp"));
-	KASSERT((td != NULL), ("proc_dtor: bad thread pointer"));
-        kg = FIRST_KSEGRP_IN_PROC(p);
-	KASSERT((kg != NULL), ("proc_dtor: bad kg pointer"));
+		KASSERT((p->p_numthreads == 1),
+		    ("bad number of threads in exiting process"));
+		KASSERT(STAILQ_EMPTY(&p->p_ktr), ("proc_dtor: non-empty p_ktr"));
 #endif
 
-	/* Dispose of an alternate kstack, if it exists.
-	 * XXX What if there are more than one thread in the proc?
-	 *     The first thread in the proc is special and not
-	 *     freed, so you gotta do this here.
-	 */
-	if (((p->p_flag & P_KTHREAD) != 0) && (td->td_altkstack != 0))
-		vm_thread_dispose_altkstack(td);
+		/* Dispose of an alternate kstack, if it exists.
+		 * XXX What if there are more than one thread in the proc?
+		 *     The first thread in the proc is special and not
+		 *     freed, so you gotta do this here.
+		 */
+		if (((p->p_flag & P_KTHREAD) != 0) && (td->td_altkstack != 0))
+			vm_thread_dispose_altkstack(td);
+	}
+	EVENTHANDLER_INVOKE(process_dtor, p);
+	if (p->p_ksi != NULL)
+		KASSERT(! KSI_ONQ(p->p_ksi), ("SIGCHLD queue"));
 }
 
 /*
@@ -175,18 +174,15 @@
 proc_init(void *mem, int size, int flags)
 {
 	struct proc *p;
-	struct thread *td;
-	struct ksegrp *kg;
 
 	p = (struct proc *)mem;
 	p->p_sched = (struct p_sched *)&p[1];
-	td = thread_alloc();
-	kg = ksegrp_alloc();
 	bzero(&p->p_mtx, sizeof(struct mtx));
 	mtx_init(&p->p_mtx, "process lock", NULL, MTX_DEF | MTX_DUPOK);
+	mtx_init(&p->p_slock, "process slock", NULL, MTX_SPIN | MTX_RECURSE);
+	TAILQ_INIT(&p->p_threads);	     /* all threads in proc */
+	EVENTHANDLER_INVOKE(process_init, p);
 	p->p_stats = pstats_alloc();
-	proc_linkup(p, kg, td);
-	sched_newproc(p, kg, td);
 	return (0);
 }
 
@@ -197,8 +193,19 @@
 static void
 proc_fini(void *mem, int size)
 {
+#ifdef notnow
+	struct proc *p;
 
+	p = (struct proc *)mem;
+	EVENTHANDLER_INVOKE(process_fini, p);
+	pstats_free(p->p_stats);
+	thread_free(FIRST_THREAD_IN_PROC(p));
+	mtx_destroy(&p->p_mtx);
+	if (p->p_ksi != NULL)
+		ksiginfo_free(p->p_ksi);
+#else
 	panic("proc reclaimed");
+#endif
 }
 
 /*
@@ -297,7 +304,7 @@
 		 * new session
 		 */
 		mtx_init(&sess->s_mtx, "session", NULL, MTX_DEF);
-		mtx_lock(&Giant);	/* XXX TTY */
+		mtx_lock(&Giant);       /* XXX TTY */
 		PROC_LOCK(p);
 		p->p_flag &= ~P_CONTROLT;
 		PROC_UNLOCK(p);
@@ -313,7 +320,7 @@
 		KASSERT(p == curproc,
 		    ("enterpgrp: mksession and p != curproc"));
 	} else {
-		mtx_lock(&Giant);	/* XXX TTY */
+		mtx_lock(&Giant);       /* XXX TTY */
 		pgrp->pg_session = p->p_session;
 		SESS_LOCK(pgrp->pg_session);
 		pgrp->pg_session->s_count++;
@@ -331,7 +338,7 @@
 	pgrp->pg_jobc = 0;
 	SLIST_INIT(&pgrp->pg_sigiolst);
 	PGRP_UNLOCK(pgrp);
-	mtx_unlock(&Giant);	/* XXX TTY */
+	mtx_unlock(&Giant);       /* XXX TTY */
 
 	doenterpgrp(p, pgrp);
 
@@ -391,7 +398,7 @@
 	fixjobc(p, pgrp, 1);
 	fixjobc(p, p->p_pgrp, 0);
 
-	mtx_lock(&Giant);	/* XXX TTY */
+	mtx_lock(&Giant);       /* XXX TTY */
 	PGRP_LOCK(pgrp);
 	PGRP_LOCK(savepgrp);
 	PROC_LOCK(p);
@@ -401,7 +408,7 @@
 	LIST_INSERT_HEAD(&pgrp->pg_members, p, p_pglist);
 	PGRP_UNLOCK(savepgrp);
 	PGRP_UNLOCK(pgrp);
-	mtx_unlock(&Giant);	/* XXX TTY */
+	mtx_unlock(&Giant);     /* XXX TTY */
 	if (LIST_EMPTY(&savepgrp->pg_members))
 		pgdelete(savepgrp);
 }
@@ -449,7 +456,7 @@
 	 */
 	funsetownlst(&pgrp->pg_sigiolst);
 
-	mtx_lock(&Giant);	/* XXX TTY */
+	mtx_lock(&Giant);       /* XXX TTY */
 	PGRP_LOCK(pgrp);
 	if (pgrp->pg_session->s_ttyp != NULL &&
 	    pgrp->pg_session->s_ttyp->t_pgrp == pgrp)
@@ -460,7 +467,7 @@
 	PGRP_UNLOCK(pgrp);
 	mtx_destroy(&pgrp->pg_mtx);
 	FREE(pgrp, M_PGRP);
-	mtx_unlock(&Giant);	 /* XXX TTY */
+	mtx_unlock(&Giant);     /* XXX TTY */
 }
 
 static void
@@ -620,7 +627,6 @@
 	struct thread *td0;
 	struct tty *tp;
 	struct session *sp;
-	struct timeval tv;
 	struct ucred *cred;
 	struct sigacts *ps;
 
@@ -667,7 +673,7 @@
 		kp->ki_sigcatch = ps->ps_sigcatch;
 		mtx_unlock(&ps->ps_mtx);
 	}
-	mtx_lock_spin(&sched_lock);
+	PROC_SLOCK(p);
 	if (p->p_state != PRS_NEW &&
 	    p->p_state != PRS_ZOMBIE &&
 	    p->p_vmspace != NULL) {
@@ -687,18 +693,23 @@
 		kp->ki_ssize = vm->vm_ssize;
 	} else if (p->p_state == PRS_ZOMBIE)
 		kp->ki_stat = SZOMB;
-	kp->ki_sflag = p->p_sflag;
-	kp->ki_swtime = p->p_swtime;
+	if (kp->ki_flag & P_INMEM)
+		kp->ki_sflag = PS_INMEM;
+	else
+		kp->ki_sflag = 0;
+	/* Calculate legacy swtime as seconds since 'swtick'. */
+	kp->ki_swtime = (ticks - p->p_swtick) / hz;
 	kp->ki_pid = p->p_pid;
 	kp->ki_nice = p->p_nice;
-	bintime2timeval(&p->p_rux.rux_runtime, &tv);
-	kp->ki_runtime = tv.tv_sec * (u_int64_t)1000000 + tv.tv_usec;
-	mtx_unlock_spin(&sched_lock);
-	if ((p->p_sflag & PS_INMEM) && p->p_stats != NULL) {
+	rufetch(p, &kp->ki_rusage);
+	kp->ki_runtime = cputick2usec(p->p_rux.rux_runtime);
+	PROC_SUNLOCK(p);
+	if ((p->p_flag & P_INMEM) && p->p_stats != NULL) {
 		kp->ki_start = p->p_stats->p_start;
 		timevaladd(&kp->ki_start, &boottime);
-		kp->ki_rusage = p->p_stats->p_ru;
+		PROC_SLOCK(p);
 		calcru(p, &kp->ki_rusage.ru_utime, &kp->ki_rusage.ru_stime);
+		PROC_SUNLOCK(p);
 		calccru(p, &kp->ki_childutime, &kp->ki_childstime);
 
 		/* Some callers want child-times in a single value */
@@ -731,10 +742,8 @@
 			kp->ki_tsid = tp->t_session->s_sid;
 	} else
 		kp->ki_tdev = NODEV;
-	if (p->p_comm[0] != '\0') {
+	if (p->p_comm[0] != '\0')
 		strlcpy(kp->ki_comm, p->p_comm, sizeof(kp->ki_comm));
-		strlcpy(kp->ki_ocomm, p->p_comm, sizeof(kp->ki_ocomm));
-	}
 	if (p->p_sysent && p->p_sysent->sv_name != NULL &&
 	    p->p_sysent->sv_name[0] != '\0')
 		strlcpy(kp->ki_emul, p->p_sysent->sv_name, sizeof(kp->ki_emul));
@@ -748,20 +757,23 @@
 
 /*
  * Fill in information that is thread specific.
- * Must be called with sched_lock locked.
+ * Must be called with p_slock locked.
  */
 static void
 fill_kinfo_thread(struct thread *td, struct kinfo_proc *kp)
 {
-	struct ksegrp *kg;
 	struct proc *p;
 
 	p = td->td_proc;
+	PROC_SLOCK_ASSERT(p, MA_OWNED);
 
+	thread_lock(td);
 	if (td->td_wmesg != NULL)
 		strlcpy(kp->ki_wmesg, td->td_wmesg, sizeof(kp->ki_wmesg));
 	else
 		bzero(kp->ki_wmesg, sizeof(kp->ki_wmesg));
+	if (td->td_name[0] != '\0')
+		strlcpy(kp->ki_ocomm, td->td_name, sizeof(kp->ki_ocomm));
 	if (TD_ON_LOCK(td)) {
 		kp->ki_kiflag |= KI_LOCKBLOCK;
 		strlcpy(kp->ki_lockname, td->td_lockname,
@@ -791,14 +803,6 @@
 		kp->ki_stat = SIDL;
 	}
 
-	kg = td->td_ksegrp;
-
-	/* things in the KSE GROUP */
-	kp->ki_estcpu = kg->kg_estcpu;
-	kp->ki_slptime = kg->kg_slptime;
-	kp->ki_pri.pri_user = kg->kg_user_pri;
-	kp->ki_pri.pri_class = kg->kg_pri_class;
-
 	/* Things in the thread */
 	kp->ki_wchan = td->td_wchan;
 	kp->ki_pri.pri_level = td->td_priority;
@@ -811,12 +815,17 @@
 	kp->ki_pcb = td->td_pcb;
 	kp->ki_kstack = (void *)td->td_kstack;
 	kp->ki_pctcpu = sched_pctcpu(td);
+	kp->ki_estcpu = td->td_estcpu;
+	kp->ki_slptime = (ticks - td->td_slptick) / hz;
+	kp->ki_pri.pri_class = td->td_pri_class;
+	kp->ki_pri.pri_user = td->td_user_pri;
 
 	/* We can't get this anymore but ps etc never used it anyway. */
 	kp->ki_rqindex = 0;
 
 	SIGSETOR(kp->ki_siglist, td->td_siglist);
 	kp->ki_sigmask = td->td_sigmask;
+	thread_unlock(td);
 }
 
 /*
@@ -828,10 +837,10 @@
 {
 
 	fill_kinfo_proc_only(p, kp);
-	mtx_lock_spin(&sched_lock);
+	PROC_SLOCK(p);
 	if (FIRST_THREAD_IN_PROC(p) != NULL)
 		fill_kinfo_thread(FIRST_THREAD_IN_PROC(p), kp);
-	mtx_unlock_spin(&sched_lock);
+	PROC_SUNLOCK(p);
 }
 
 struct pstats *
@@ -898,26 +907,26 @@
 
 	fill_kinfo_proc_only(p, &kinfo_proc);
 	if (flags & KERN_PROC_NOTHREADS) {
-		mtx_lock_spin(&sched_lock);
+		PROC_SLOCK(p);
 		if (FIRST_THREAD_IN_PROC(p) != NULL)
 			fill_kinfo_thread(FIRST_THREAD_IN_PROC(p), &kinfo_proc);
-		mtx_unlock_spin(&sched_lock);
+		PROC_SUNLOCK(p);
 		error = SYSCTL_OUT(req, (caddr_t)&kinfo_proc,
 				   sizeof(kinfo_proc));
 	} else {
-		mtx_lock_spin(&sched_lock);
+		PROC_SLOCK(p);
 		if (FIRST_THREAD_IN_PROC(p) != NULL)
 			FOREACH_THREAD_IN_PROC(p, td) {
 				fill_kinfo_thread(td, &kinfo_proc);
 				error = SYSCTL_OUT(req, (caddr_t)&kinfo_proc,
-					   	sizeof(kinfo_proc));
+						   sizeof(kinfo_proc));
 				if (error)
 					break;
 			}
 		else
 			error = SYSCTL_OUT(req, (caddr_t)&kinfo_proc,
-				   	sizeof(kinfo_proc));
-		mtx_unlock_spin(&sched_lock);
+					   sizeof(kinfo_proc));
+		PROC_SUNLOCK(p);
 	}
 	PROC_UNLOCK(p);
 	if (error)
@@ -1007,13 +1016,15 @@
 			/*
 			 * Skip embryonic processes.
 			 */
-			mtx_lock_spin(&sched_lock);
+			PROC_SLOCK(p);
 			if (p->p_state == PRS_NEW) {
-				mtx_unlock_spin(&sched_lock);
+				PROC_SUNLOCK(p);
 				continue;
 			}
-			mtx_unlock_spin(&sched_lock);
+			PROC_SUNLOCK(p);
 			PROC_LOCK(p);
+			KASSERT(p->p_ucred != NULL,
+			    ("process credential is NULL for non-NEW proc"));
 			/*
 			 * Show a user only appropriate processes.
 			 */
@@ -1028,8 +1039,7 @@
 			switch (oid_number) {
 
 			case KERN_PROC_GID:
-				if (p->p_ucred == NULL ||
-				    p->p_ucred->cr_gid != (gid_t)name[0]) {
+				if (p->p_ucred->cr_gid != (gid_t)name[0]) {
 					PROC_UNLOCK(p);
 					continue;
 				}
@@ -1037,7 +1047,7 @@
 
 			case KERN_PROC_PGRP:
 				/* could do this by traversing pgrp */
-				if (p->p_pgrp == NULL || 
+				if (p->p_pgrp == NULL ||
 				    p->p_pgrp->pg_id != (pid_t)name[0]) {
 					PROC_UNLOCK(p);
 					continue;
@@ -1045,8 +1055,7 @@
 				break;
 
 			case KERN_PROC_RGID:
-				if (p->p_ucred == NULL ||
-				    p->p_ucred->cr_rgid != (gid_t)name[0]) {
+				if (p->p_ucred->cr_rgid != (gid_t)name[0]) {
 					PROC_UNLOCK(p);
 					continue;
 				}
@@ -1078,16 +1087,14 @@
 				break;
 
 			case KERN_PROC_UID:
-				if (p->p_ucred == NULL || 
-				    p->p_ucred->cr_uid != (uid_t)name[0]) {
+				if (p->p_ucred->cr_uid != (uid_t)name[0]) {
 					PROC_UNLOCK(p);
 					continue;
 				}
 				break;
 
 			case KERN_PROC_RUID:
-				if (p->p_ucred == NULL || 
-				    p->p_ucred->cr_ruid != (uid_t)name[0]) {
+				if (p->p_ucred->cr_ruid != (uid_t)name[0]) {
 					PROC_UNLOCK(p);
 					continue;
 				}
@@ -1119,7 +1126,7 @@
 
 	MALLOC(pa, struct pargs *, sizeof(struct pargs) + len, M_PARGS,
 		M_WAITOK);
-	pa->ar_ref = 1;
+	refcount_init(&pa->ar_ref, 1);
 	pa->ar_length = len;
 	return (pa);
 }
@@ -1137,9 +1144,7 @@
 
 	if (pa == NULL)
 		return;
-	PARGS_LOCK(pa);
-	pa->ar_ref++;
-	PARGS_UNLOCK(pa);
+	refcount_acquire(&pa->ar_ref);
 }
 
 void
@@ -1148,12 +1153,8 @@
 
 	if (pa == NULL)
 		return;
-	PARGS_LOCK(pa);
-	if (--pa->ar_ref == 0) {
-		PARGS_UNLOCK(pa);
+	if (refcount_release(&pa->ar_ref))
 		pargs_free(pa);
-	} else
-		PARGS_UNLOCK(pa);
 }
 
 /*
@@ -1242,6 +1243,11 @@
 	}
 
 	vp = p->p_textvp;
+	if (vp == NULL) {
+		if (*pidp != -1)
+			PROC_UNLOCK(p);
+		return (0);
+	}
 	vref(vp);
 	if (*pidp != -1)
 		PROC_UNLOCK(p);
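
The pargs_hold()/pargs_drop() hunks above replace a mutex-protected reference
count with the lock-free refcount(9) helpers.  In isolation the pattern looks
like the sketch below; the "blob" structure and malloc type are hypothetical
and only illustrate refcount_init()/refcount_acquire()/refcount_release():

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/refcount.h>

struct blob {
	u_int	b_ref;
	/* ...payload... */
};

static MALLOC_DEFINE(M_BLOB, "blob", "refcount(9) illustration");

static struct blob *
blob_alloc(void)
{
	struct blob *b;

	b = malloc(sizeof(*b), M_BLOB, M_WAITOK | M_ZERO);
	refcount_init(&b->b_ref, 1);		/* caller owns the first reference */
	return (b);
}

static void
blob_hold(struct blob *b)
{

	refcount_acquire(&b->b_ref);		/* atomic; no lock required */
}

static void
blob_drop(struct blob *b)
{

	if (refcount_release(&b->b_ref))
		free(b, M_BLOB);		/* last reference went away */
}
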
Index: link_elf_obj.c
===================================================================
RCS file: /home/cvs/src/sys/kern/link_elf_obj.c,v
retrieving revision 1.1.1.2
retrieving revision 1.2
diff -L sys/kern/link_elf_obj.c -L sys/kern/link_elf_obj.c -u -r1.1.1.2 -r1.2
--- sys/kern/link_elf_obj.c
+++ sys/kern/link_elf_obj.c
@@ -26,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/link_elf_obj.c,v 1.87.2.3 2005/12/30 22:13:58 marcel Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/link_elf_obj.c,v 1.95 2007/05/31 11:51:51 kib Exp $");
 
 #include "opt_ddb.h"
 #include "opt_mac.h"
@@ -35,9 +35,9 @@
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
-#include <sys/mac.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
+#include <sys/mount.h>
 #include <sys/proc.h>
 #include <sys/namei.h>
 #include <sys/fcntl.h>
@@ -46,6 +46,8 @@
 
 #include <machine/elf.h>
 
+#include <security/mac/mac_framework.h>
+
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_object.h>
@@ -393,19 +395,19 @@
 	int nsym;
 	int pb, rl, ra;
 	int alignmask;
-
-	GIANT_REQUIRED;
+	int vfslocked;
 
 	shdr = NULL;
 	lf = NULL;
 	mapsize = 0;
 	hdr = NULL;
 
-	NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, filename, td);
+	NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE, filename, td);
 	flags = FREAD;
-	error = vn_open(&nd, &flags, 0, -1);
+	error = vn_open(&nd, &flags, 0, NULL);
 	if (error)
 		return error;
+	vfslocked = NDHASGIANT(&nd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 #ifdef MAC
 	error = mac_check_kld_load(td->td_ucred, nd.ni_vp);
@@ -788,6 +790,7 @@
 		free(hdr, M_LINKER);
 	VOP_UNLOCK(nd.ni_vp, 0, td);
 	vn_close(nd.ni_vp, FREAD, td->td_ucred, td);
+	VFS_UNLOCK_GIANT(vfslocked);
 
 	return error;
 }
@@ -1112,6 +1115,51 @@
 }
 
 static void
+link_elf_fix_link_set(elf_file_t ef)
+{
+	static const char startn[] = "__start_";
+	static const char stopn[] = "__stop_";
+	Elf_Sym *sym;
+	const char *sym_name, *linkset_name;
+	Elf_Addr startp, stopp;
+	Elf_Size symidx;
+	int start, i;
+
+	startp = stopp = 0;
+	for (symidx = 1 /* zero entry is special */;
+		symidx < ef->ddbsymcnt; symidx++) {
+		sym = ef->ddbsymtab + symidx;
+		if (sym->st_shndx != SHN_UNDEF)
+			continue;
+
+		sym_name = ef->ddbstrtab + sym->st_name;
+		if (strncmp(sym_name, startn, sizeof(startn) - 1) == 0) {
+			start = 1;
+			linkset_name = sym_name + sizeof(startn) - 1;
+		}
+		else if (strncmp(sym_name, stopn, sizeof(stopn) - 1) == 0) {
+			start = 0;
+			linkset_name = sym_name + sizeof(stopn) - 1;
+		}
+		else
+			continue;
+
+		for (i = 0; i < ef->nprogtab; i++) {
+			if (strcmp(ef->progtab[i].name, linkset_name) == 0) {
+				startp = (Elf_Addr)ef->progtab[i].addr;
+				stopp = (Elf_Addr)(startp + ef->progtab[i].size);
+				break;
+			}
+		}
+		if (i == ef->nprogtab)
+			continue;
+
+		sym->st_value = start ? startp : stopp;
+		sym->st_shndx = i;
+	}
+}
+
+static void
 link_elf_reloc_local(linker_file_t lf)
 {
 	elf_file_t ef = (elf_file_t)lf;
@@ -1124,6 +1172,8 @@
 	int i;
 	Elf_Size symidx;
 
+	link_elf_fix_link_set(ef);
+
 	/* Perform relocations without addend if there are any: */
 	for (i = 0; i < ef->nrel; i++) {
 		rel = ef->reltab[i].rel;
Index: subr_power.c
===================================================================
RCS file: /home/cvs/src/sys/kern/subr_power.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/subr_power.c -L sys/kern/subr_power.c -u -r1.1.1.1 -r1.2
--- sys/kern/subr_power.c
+++ sys/kern/subr_power.c
@@ -25,17 +25,27 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/subr_power.c,v 1.5 2004/01/02 18:24:13 njl Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/subr_power.c,v 1.8 2005/11/09 16:22:56 imp Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/proc.h>
 
 #include <sys/power.h>
+#include <sys/taskqueue.h>
 
 static u_int		 power_pm_type	= POWER_PM_TYPE_NONE;
 static power_pm_fn_t	 power_pm_fn	= NULL;
 static void		*power_pm_arg	= NULL;
+static struct task	 power_pm_task;
+
+static void
+power_pm_deferred_fn(void *arg, int pending)
+{
+	int state = (intptr_t)arg;
+
+	power_pm_fn(POWER_CMD_SUSPEND, power_pm_arg, state);
+}
 
 int
 power_pm_register(u_int pm_type, power_pm_fn_t pm_fn, void *pm_arg)
@@ -48,6 +58,7 @@
 		power_pm_fn	= pm_fn;
 		power_pm_arg	= pm_arg;
 		error = 0;
+		TASK_INIT(&power_pm_task, 0, power_pm_deferred_fn, NULL);
 	} else {
 		error = ENXIO;
 	}
@@ -72,8 +83,8 @@
 	    state != POWER_SLEEP_STATE_SUSPEND &&
 	    state != POWER_SLEEP_STATE_HIBERNATE)
 		return;
-
-	power_pm_fn(POWER_CMD_SUSPEND, power_pm_arg, state);
+	power_pm_task.ta_context = (void *)(intptr_t)state;
+	taskqueue_enqueue(taskqueue_thread, &power_pm_task);
 }
 
 /*
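
For context on the subr_power.c change above, which defers the suspend call
to the system taskqueue thread instead of running it synchronously, the bare
taskqueue(9) pattern is sketched below with hypothetical names; the patch
itself passes its state through ta_context rather than a private argument.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>

static struct task demo_task;

static void
demo_task_fn(void *arg, int pending)
{

	/* Runs later, from the taskqueue_thread kernel thread. */
	printf("deferred work: arg=%p pending=%d\n", arg, pending);
}

static void
demo_setup(void)
{

	TASK_INIT(&demo_task, 0, demo_task_fn, NULL);
}

static void
demo_kick(void)
{

	/* Cheap and safe from contexts that must not sleep or stall. */
	taskqueue_enqueue(taskqueue_thread, &demo_task);
}
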
Index: vfs_subr.c
===================================================================
RCS file: /home/cvs/src/sys/kern/vfs_subr.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/vfs_subr.c -L sys/kern/vfs_subr.c -u -r1.2 -r1.3
--- sys/kern/vfs_subr.c
+++ sys/kern/vfs_subr.c
@@ -39,7 +39,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/vfs_subr.c,v 1.635.2.16.2.1 2006/05/04 07:42:10 scottl Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/vfs_subr.c,v 1.707.2.1 2007/12/13 11:58:00 kib Exp $");
 
 #include "opt_ddb.h"
 #include "opt_mac.h"
@@ -55,13 +55,14 @@
 #include <sys/extattr.h>
 #include <sys/file.h>
 #include <sys/fcntl.h>
+#include <sys/jail.h>
 #include <sys/kdb.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
-#include <sys/mac.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
+#include <sys/priv.h>
 #include <sys/reboot.h>
 #include <sys/sleepqueue.h>
 #include <sys/stat.h>
@@ -72,6 +73,8 @@
 
 #include <machine/stdarg.h>
 
+#include <security/mac/mac_framework.h>
+
 #include <vm/vm.h>
 #include <vm/vm_object.h>
 #include <vm/vm_extern.h>
@@ -81,16 +84,18 @@
 #include <vm/vm_kern.h>
 #include <vm/uma.h>
 
-static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
+static MALLOC_DEFINE(M_NETADDR, "subr_export_host", "Export host address structure");
 
 static void	delmntque(struct vnode *vp);
-static void	insmntque(struct vnode *vp, struct mount *mp);
 static int	flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo,
 		    int slpflag, int slptimeo);
 static void	syncer_shutdown(void *arg, int howto);
 static int	vtryrecycle(struct vnode *vp);
 static void	vbusy(struct vnode *vp);
-static void	vdropl(struct vnode *vp);
 static void	vinactive(struct vnode *, struct thread *);
 static void	v_incr_usecount(struct vnode *);
 static void	v_decr_usecount(struct vnode *);
@@ -109,19 +114,15 @@
  * Enable Giant pushdown based on whether or not the vm is mpsafe in this
  * build.  Without mpsafevm the buffer cache can not run Giant free.
  */
-#if defined(__alpha__) || defined(__amd64__) || defined(__i386__) || \
-	defined(__sparc64__)
 int mpsafe_vfs = 1;
-#else
-int mpsafe_vfs;
-#endif
 TUNABLE_INT("debug.mpsafevfs", &mpsafe_vfs);
 SYSCTL_INT(_debug, OID_AUTO, mpsafevfs, CTLFLAG_RD, &mpsafe_vfs, 0,
     "MPSAFE VFS");
 
 /*
  * Number of vnodes in existence.  Increased whenever getnewvnode()
- * allocates a new vnode, never decreased.
+ * allocates a new vnode, decreased on vdestroy() called on VI_DOOMed
+ * vnode.
  */
 static unsigned long	numvnodes;
 
@@ -304,14 +305,14 @@
 			    desiredvnodes, MAXVNODES_MAX);
 		desiredvnodes = MAXVNODES_MAX;
 	}
-	wantfreevnodes = desiredvnodes / 4; 
+	wantfreevnodes = desiredvnodes / 4;
 	mtx_init(&mntid_mtx, "mntid", NULL, MTX_DEF);
 	TAILQ_INIT(&vnode_free_list);
 	mtx_init(&vnode_free_list_mtx, "vnode_free_list", NULL, MTX_DEF);
 	vnode_zone = uma_zcreate("VNODE", sizeof (struct vnode), NULL, NULL,
 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	vnodepoll_zone = uma_zcreate("VNODEPOLL", sizeof (struct vpollinfo),
-	      NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	/*
 	 * Initialize the filesystem syncer.
 	 */
@@ -328,11 +329,8 @@
  * unmounting. Interlock is not released on failure.
  */
 int
-vfs_busy(mp, flags, interlkp, td)
-	struct mount *mp;
-	int flags;
-	struct mtx *interlkp;
-	struct thread *td;
+vfs_busy(struct mount *mp, int flags, struct mtx *interlkp,
+    struct thread *td)
 {
 	int lkflags;
 
@@ -365,7 +363,6 @@
 	lkflags = LK_SHARED | LK_INTERLOCK;
 	if (lockmgr(&mp->mnt_lock, lkflags, MNT_MTX(mp), td))
 		panic("vfs_busy: unexpected lock failure");
-	vfs_rel(mp);
 	return (0);
 }
 
@@ -373,20 +370,18 @@
  * Free a busy filesystem.
  */
 void
-vfs_unbusy(mp, td)
-	struct mount *mp;
-	struct thread *td;
+vfs_unbusy(struct mount *mp, struct thread *td)
 {
 
 	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, td);
+	vfs_rel(mp);
 }
 
 /*
  * Lookup a mount point by filesystem identifier.
  */
 struct mount *
-vfs_getvfs(fsid)
-	fsid_t *fsid;
+vfs_getvfs(fsid_t *fsid)
 {
 	struct mount *mp;
 
@@ -394,6 +389,7 @@
 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
 		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
+			vfs_ref(mp);
 			mtx_unlock(&mountlist_mtx);
 			return (mp);
 		}
@@ -403,16 +399,39 @@
 }
 
 /*
- * Check if a user can access priveledged mount options.
+ * Check if a user can access privileged mount options.
  */
 int
 vfs_suser(struct mount *mp, struct thread *td)
 {
 	int error;
 
+	/*
+	 * If the thread is jailed, but this is not a jail-friendly file
+	 * system, deny immediately.
+	 */
+	if (jailed(td->td_ucred) && !(mp->mnt_vfc->vfc_flags & VFCF_JAIL))
+		return (EPERM);
+
+	/*
+	 * If the file system was mounted outside a jail and a jailed thread
+	 * tries to access it, deny immediately.
+	 */
+	if (!jailed(mp->mnt_cred) && jailed(td->td_ucred))
+		return (EPERM);
+
+	/*
+	 * If the file system was mounted inside a different jail than the jail
+	 * of the calling thread, deny immediately.
+	 */
+	if (jailed(mp->mnt_cred) && jailed(td->td_ucred) &&
+	    mp->mnt_cred->cr_prison != td->td_ucred->cr_prison) {
+		return (EPERM);
+	}
+
 	if ((mp->mnt_flag & MNT_USER) == 0 ||
 	    mp->mnt_cred->cr_uid != td->td_ucred->cr_uid) {
-		if ((error = suser(td)) != 0)
+		if ((error = priv_check(td, PRIV_VFS_MOUNT_OWNER)) != 0)
 			return (error);
 	}
 	return (0);
@@ -431,10 +450,10 @@
  * different mounts.
  */
 void
-vfs_getnewfsid(mp)
-	struct mount *mp;
+vfs_getnewfsid(struct mount *mp)
 {
 	static u_int16_t mntid_base;
+	struct mount *nmp;
 	fsid_t tfsid;
 	int mtype;
 
@@ -446,8 +465,9 @@
 		tfsid.val[0] = makedev(255,
 		    mtype | ((mntid_base & 0xFF00) << 8) | (mntid_base & 0xFF));
 		mntid_base++;
-		if (vfs_getvfs(&tfsid) == NULL)
+		if ((nmp = vfs_getvfs(&tfsid)) == NULL)
 			break;
+		vfs_rel(nmp);
 	}
 	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
 	mp->mnt_stat.f_fsid.val[1] = tfsid.val[1];
@@ -472,8 +492,7 @@
  * Get a current timestamp.
  */
 void
-vfs_timestamp(tsp)
-	struct timespec *tsp;
+vfs_timestamp(struct timespec *tsp)
 {
 	struct timeval tv;
 
@@ -500,8 +519,7 @@
  * Set vnode attributes to VNOVAL
  */
 void
-vattr_null(vap)
-	struct vattr *vap;
+vattr_null(struct vattr *vap)
 {
 
 	vap->va_type = VNON;
@@ -610,7 +628,7 @@
 		 * vnode lock before our VOP_LOCK() call fails.
 		 */
 		if (vp->v_usecount || !LIST_EMPTY(&(vp)->v_cache_src) ||
-		    (vp->v_object != NULL && 
+		    (vp->v_object != NULL &&
 		    vp->v_object->resident_page_count > trigger)) {
 			VOP_UNLOCK(vp, LK_INTERLOCK, td);
 			goto next_iter_mntunlocked;
@@ -700,13 +718,13 @@
 	struct mount *mp, *nmp;
 	int done;
 	struct proc *p = vnlruproc;
-	struct thread *td = FIRST_THREAD_IN_PROC(p);
-
-	mtx_lock(&Giant);
+	struct thread *td = curthread;
 
 	EVENTHANDLER_REGISTER(shutdown_pre_sync, kproc_shutdown, p,
 	    SHUTDOWN_PRI_FIRST);
 
+	mtx_lock(&Giant);
+
 	for (;;) {
 		kthread_suspend_check(p);
 		mtx_lock(&vnode_free_list_mtx);
@@ -742,6 +760,7 @@
 		}
 		mtx_unlock(&mountlist_mtx);
 		if (done == 0) {
+			EVENTHANDLER_INVOKE(vfs_lowvnodes, desiredvnodes / 10);
 #if 0
 			/* These messages are temporary debugging aids */
 			if (vnlru_nowhere < 5)
@@ -751,7 +770,7 @@
 #endif
 			vnlru_nowhere++;
 			tsleep(vnlruproc, PPAUSE, "vlrup", hz * 3);
-		} else 
+		} else
 			uio_yield();
 	}
 }
@@ -790,6 +809,7 @@
 	VNASSERT(bo->bo_dirty.bv_root == NULL, vp, ("dirtyblkroot not NULL"));
 	VNASSERT(TAILQ_EMPTY(&vp->v_cache_dst), vp, ("vp has namecache dst"));
 	VNASSERT(LIST_EMPTY(&vp->v_cache_src), vp, ("vp has namecache src"));
+	VI_UNLOCK(vp);
 #ifdef MAC
 	mac_destroy_vnode(vp);
 #endif
@@ -859,11 +879,8 @@
  * Return the next vnode from the free list.
  */
 int
-getnewvnode(tag, mp, vops, vpp)
-	const char *tag;
-	struct mount *mp;
-	struct vop_vector *vops;
-	struct vnode **vpp;
+getnewvnode(const char *tag, struct mount *mp, struct vop_vector *vops,
+    struct vnode **vpp)
 {
 	struct vnode *vp = NULL;
 	struct bufobj *bo;
@@ -878,8 +895,17 @@
 	 * Wait for available vnodes.
 	 */
 	if (numvnodes > desiredvnodes) {
+		if (mp != NULL && (mp->mnt_kern_flag & MNTK_SUSPEND)) {
+			/*
+			 * File system is being suspended; we cannot risk a
+			 * deadlock here, so allocate a new vnode anyway.
+			 */
+			if (freevnodes > wantfreevnodes)
+				vnlru_free(freevnodes - wantfreevnodes);
+			goto alloc;
+		}
 		if (vnlruproc_sig == 0) {
-			vnlruproc_sig = 1;      /* avoid unnecessary wakeups */
+			vnlruproc_sig = 1;	/* avoid unnecessary wakeups */
 			wakeup(vnlruproc);
 		}
 		msleep(&vnlruproc_sig, &vnode_free_list_mtx, PVFS,
@@ -891,6 +917,7 @@
 		}
 #endif
 	}
+alloc:
 	numvnodes++;
 	mtx_unlock(&vnode_free_list_mtx);
 	vp = (struct vnode *) uma_zalloc(vnode_zone, M_WAITOK|M_ZERO);
@@ -935,7 +962,6 @@
 		printf("NULL mp in getnewvnode()\n");
 #endif
 	if (mp != NULL) {
-		insmntque(vp, mp);
 		bo->bo_bsize = mp->mnt_stat.f_iosize;
 		if ((mp->mnt_kern_flag & MNTK_NOKNOTE) != 0)
 			vp->v_vflag |= VV_NOKNOTE;
@@ -967,22 +993,56 @@
 	MNT_IUNLOCK(mp);
 }
 
+static void
+insmntque_stddtr(struct vnode *vp, void *dtr_arg)
+{
+	struct thread *td;
+
+	td = curthread; /* XXX ? */
+	vp->v_data = NULL;
+	vp->v_op = &dead_vnodeops;
+	/* XXX non mp-safe fs may still call insmntque with vnode
+	   unlocked */
+	if (!VOP_ISLOCKED(vp, td))
+		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	vgone(vp);
+	vput(vp);
+}
+
 /*
  * Insert into list of vnodes for the new mount point, if available.
  */
-static void
-insmntque(struct vnode *vp, struct mount *mp)
+int
+insmntque1(struct vnode *vp, struct mount *mp,
+	void (*dtr)(struct vnode *, void *), void *dtr_arg)
 {
 
-	vp->v_mount = mp;
+	KASSERT(vp->v_mount == NULL,
+		("insmntque: vnode already on per mount vnode list"));
 	VNASSERT(mp != NULL, vp, ("Don't call insmntque(foo, NULL)"));
 	MNT_ILOCK(mp);
+	if ((mp->mnt_kern_flag & MNTK_NOINSMNTQ) != 0 &&
+	    mp->mnt_nvnodelistsize == 0) {
+		MNT_IUNLOCK(mp);
+		if (dtr != NULL)
+			dtr(vp, dtr_arg);
+		return (EBUSY);
+	}
+	vp->v_mount = mp;
 	MNT_REF(mp);
 	TAILQ_INSERT_TAIL(&mp->mnt_nvnodelist, vp, v_nmntvnodes);
 	VNASSERT(mp->mnt_nvnodelistsize >= 0, vp,
 		("neg mount point vnode list size"));
 	mp->mnt_nvnodelistsize++;
 	MNT_IUNLOCK(mp);
+	return (0);
+}
+
+int
+insmntque(struct vnode *vp, struct mount *mp)
+{
+
+	return (insmntque1(vp, mp, insmntque_stddtr, NULL));
 }
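
The insmntque1() interface above lets the caller supply a destructor that is run only
when the mount is no longer accepting new vnodes (MNTK_NOINSMNTQ), so a half-constructed
vnode is cleaned up exactly once.  A minimal user-space sketch of that contract, using
hypothetical toy types rather than the kernel structures:

#include <stdio.h>
#include <stdlib.h>

#define TOY_EBUSY 16

struct toy_vnode { int id; };
struct toy_mount { int accepting; int count; };

typedef void (*toy_dtr_t)(struct toy_vnode *, void *);

static void
toy_dtr(struct toy_vnode *vp, void *arg)
{
	(void)arg;
	printf("destroying vnode %d\n", vp->id);
	free(vp);
}

static int
toy_insmntque1(struct toy_vnode *vp, struct toy_mount *mp, toy_dtr_t dtr, void *arg)
{
	if (!mp->accepting) {
		if (dtr != NULL)
			dtr(vp, arg);	/* caller must not touch vp afterwards */
		return (TOY_EBUSY);
	}
	mp->count++;			/* "insert on the per-mount list" */
	return (0);
}

int
main(void)
{
	struct toy_mount m = { .accepting = 0, .count = 0 };
	struct toy_vnode *vp = malloc(sizeof(*vp));

	vp->id = 1;
	if (toy_insmntque1(vp, &m, toy_dtr, NULL) == TOY_EBUSY)
		printf("mount refused the vnode, it was destroyed\n");
	return (0);
}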
 
 /*
@@ -990,7 +1050,8 @@
  * Called with the underlying object locked.
  */
 int
-bufobj_invalbuf(struct bufobj *bo, int flags, struct thread *td, int slpflag, int slptimeo)
+bufobj_invalbuf(struct bufobj *bo, int flags, struct thread *td, int slpflag,
+    int slptimeo)
 {
 	int error;
 
@@ -1073,7 +1134,8 @@
  * Called with the underlying object locked.
  */
 int
-vinvalbuf(struct vnode *vp, int flags, struct thread *td, int slpflag, int slptimeo)
+vinvalbuf(struct vnode *vp, int flags, struct thread *td, int slpflag,
+    int slptimeo)
 {
 
 	CTR2(KTR_VFS, "vinvalbuf vp %p flags %d", vp, flags);
@@ -1086,11 +1148,8 @@
  *
  */
 static int
-flushbuflist(bufv, flags, bo, slpflag, slptimeo)
-	struct bufv *bufv;
-	int flags;
-	struct bufobj *bo;
-	int slpflag, slptimeo;
+flushbuflist(struct bufv *bufv, int flags, struct bufobj *bo, int slpflag,
+    int slptimeo)
 {
 	struct buf *bp, *nbp;
 	int retval, error;
@@ -1121,7 +1180,7 @@
 			return (error != ENOLCK ? error : EAGAIN);
 		}
 		KASSERT(bp->b_bufobj == bo,
-	            ("bp %p wrong b_bufobj %p should be %p",
+		    ("bp %p wrong b_bufobj %p should be %p",
 		    bp, bp->b_bufobj, bo));
 		if (bp->b_bufobj != bo) {	/* XXX: necessary ? */
 			BUF_UNLOCK(bp);
@@ -1143,12 +1202,12 @@
 			return (EAGAIN);	/* XXX: why not loop ? */
 		}
 		bremfree(bp);
-		bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF);
+		bp->b_flags |= (B_INVAL | B_RELBUF);
 		bp->b_flags &= ~B_ASYNC;
 		brelse(bp);
 		BO_LOCK(bo);
 		if (nbp != NULL &&
-		    (nbp->b_bufobj != bo || 
+		    (nbp->b_bufobj != bo ||
 		     nbp->b_lblkno != lblkno ||
 		     (nbp->b_xflags &
 		      (BX_BKGRDMARKER | BX_VNDIRTY | BX_VNCLEAN)) != xflags))
@@ -1163,7 +1222,8 @@
  * sync activity.
  */
 int
-vtruncbuf(struct vnode *vp, struct ucred *cred, struct thread *td, off_t length, int blksize)
+vtruncbuf(struct vnode *vp, struct ucred *cred, struct thread *td,
+    off_t length, int blksize)
 {
 	struct buf *bp, *nbp;
 	int anyfreed;
@@ -1339,7 +1399,7 @@
 	KASSERT((bp->b_xflags & (BX_VNDIRTY|BX_VNCLEAN)) !=
 	    (BX_VNDIRTY|BX_VNCLEAN),
 	    ("buf_vlist_remove: Buf %p is on two lists", bp));
-	if (bp->b_xflags & BX_VNDIRTY) 
+	if (bp->b_xflags & BX_VNDIRTY)
 		bv = &bp->b_bufobj->bo_dirty;
 	else
 		bv = &bp->b_bufobj->bo_clean;
@@ -1454,6 +1514,9 @@
 
 	ASSERT_VI_LOCKED(vp, "bgetvp");
 	vholdl(vp);
+	if (VFS_NEEDSGIANT(vp->v_mount) ||
+	    vp->v_bufobj.bo_flag & BO_NEEDSGIANT)
+		bp->b_flags |= B_NEEDSGIANT;
 	bp->b_vp = vp;
 	bp->b_bufobj = &vp->v_bufobj;
 	/*
@@ -1488,9 +1551,10 @@
 		bo->bo_flag &= ~BO_ONWORKLST;
 		mtx_lock(&sync_mtx);
 		LIST_REMOVE(bo, bo_synclist);
- 		syncer_worklist_len--;
+		syncer_worklist_len--;
 		mtx_unlock(&sync_mtx);
 	}
+	bp->b_flags &= ~B_NEEDSGIANT;
 	bp->b_vp = NULL;
 	bp->b_bufobj = NULL;
 	vdropl(vp);
@@ -1511,7 +1575,7 @@
 		LIST_REMOVE(bo, bo_synclist);
 	else {
 		bo->bo_flag |= BO_ONWORKLST;
- 		syncer_worklist_len++;
+		syncer_worklist_len++;
 	}
 
 	if (delay > syncer_maxdelay - 2)
@@ -1547,16 +1611,42 @@
 SYSINIT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
 
 static int
-sync_vnode(struct bufobj *bo, struct thread *td)
+sync_vnode(struct synclist *slp, struct bufobj **bo, struct thread *td)
 {
 	struct vnode *vp;
 	struct mount *mp;
+	int vfslocked;
 
-	vp = bo->__bo_vnode; 	/* XXX */
-	if (VOP_ISLOCKED(vp, NULL) != 0)
+	vfslocked = 0;
+restart:
+	*bo = LIST_FIRST(slp);
+	if (*bo == NULL) {
+		VFS_UNLOCK_GIANT(vfslocked);
+		return (0);
+	}
+	vp = (*bo)->__bo_vnode;	/* XXX */
+	if (VFS_NEEDSGIANT(vp->v_mount)) {
+		if (!vfslocked) {
+			vfslocked = 1;
+			if (mtx_trylock(&Giant) == 0) {
+				mtx_unlock(&sync_mtx);
+				mtx_lock(&Giant);
+				mtx_lock(&sync_mtx);
+				goto restart;
+			}
+		}
+	} else {
+		VFS_UNLOCK_GIANT(vfslocked);
+		vfslocked = 0;
+	}
+	if (VOP_ISLOCKED(vp, NULL) != 0) {
+		VFS_UNLOCK_GIANT(vfslocked);
 		return (1);
-	if (VI_TRYLOCK(vp) == 0)
+	}
+	if (VI_TRYLOCK(vp) == 0) {
+		VFS_UNLOCK_GIANT(vfslocked);
 		return (1);
+	}
 	/*
 	 * We use vhold in case the vnode does not
 	 * successfully sync.  vhold prevents the vnode from
@@ -1568,6 +1658,7 @@
 	VI_UNLOCK(vp);
 	if (vn_start_write(vp, &mp, V_NOWAIT) != 0) {
 		vdrop(vp);
+		VFS_UNLOCK_GIANT(vfslocked);
 		mtx_lock(&sync_mtx);
 		return (1);
 	}
@@ -1576,16 +1667,17 @@
 	VOP_UNLOCK(vp, 0, td);
 	vn_finished_write(mp);
 	VI_LOCK(vp);
-	if ((bo->bo_flag & BO_ONWORKLST) != 0) {
+	if (((*bo)->bo_flag & BO_ONWORKLST) != 0) {
 		/*
 		 * Put us back on the worklist.  The worklist
 		 * routine will remove us from our current
 		 * position and then add us back in at a later
 		 * position.
 		 */
-		vn_syncer_add_to_worklist(bo, syncdelay);
+		vn_syncer_add_to_worklist(*bo, syncdelay);
 	}
 	vdropl(vp);
+	VFS_UNLOCK_GIANT(vfslocked);
 	mtx_lock(&sync_mtx);
 	return (0);
 }
@@ -1600,7 +1692,7 @@
 	struct synclist *slp;
 	struct bufobj *bo;
 	long starttime;
-	struct thread *td = FIRST_THREAD_IN_PROC(updateproc);
+	struct thread *td = curthread;
 	static int dummychan;
 	int last_work_seen;
 	int net_worklist_len;
@@ -1608,19 +1700,18 @@
 	int first_printf;
 	int error;
 
-	mtx_lock(&Giant);
 	last_work_seen = 0;
 	syncer_final_iter = 0;
 	first_printf = 1;
 	syncer_state = SYNCER_RUNNING;
-	starttime = time_second;
+	starttime = time_uptime;
 	td->td_pflags |= TDP_NORUNNINGBUF;
 
 	EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc,
 	    SHUTDOWN_PRI_LAST);
 
+	mtx_lock(&sync_mtx);
 	for (;;) {
-		mtx_lock(&sync_mtx);
 		if (syncer_state == SYNCER_FINAL_DELAY &&
 		    syncer_final_iter == 0) {
 			mtx_unlock(&sync_mtx);
@@ -1629,14 +1720,14 @@
 		}
 		net_worklist_len = syncer_worklist_len - sync_vnode_count;
 		if (syncer_state != SYNCER_RUNNING &&
-		    starttime != time_second) {
+		    starttime != time_uptime) {
 			if (first_printf) {
 				printf("\nSyncing disks, vnodes remaining...");
 				first_printf = 0;
 			}
 			printf("%d ", net_worklist_len);
 		}
-		starttime = time_second;
+		starttime = time_uptime;
 
 		/*
 		 * Push files whose dirty time has expired.  Be careful
@@ -1652,7 +1743,7 @@
 			next = &syncer_workitem_pending[syncer_delayno];
 			/*
 			 * If the worklist has wrapped since the time
-			 * it was emptied of all but syncer vnodes, 
+			 * it was emptied of all but syncer vnodes,
 			 * switch to the FINAL_DELAY state and run
 			 * for one more second.
 			 */
@@ -1675,8 +1766,8 @@
 			last_work_seen = syncer_delayno;
 		if (net_worklist_len > 0 && syncer_state == SYNCER_FINAL_DELAY)
 			syncer_state = SYNCER_SHUTTING_DOWN;
-		while ((bo = LIST_FIRST(slp)) != NULL) {
-			error = sync_vnode(bo, td);
+		while (!LIST_EMPTY(slp)) {
+			error = sync_vnode(slp, &bo, td);
 			if (error == 1) {
 				LIST_REMOVE(bo, bo_synclist);
 				LIST_INSERT_HEAD(next, bo, bo_synclist);
@@ -1685,7 +1776,6 @@
 		}
 		if (syncer_state == SYNCER_FINAL_DELAY && syncer_final_iter > 0)
 			syncer_final_iter--;
-		mtx_unlock(&sync_mtx);
 		/*
 		 * The variable rushjob allows the kernel to speed up the
 		 * processing of the filesystem syncer process. A rushjob
@@ -1696,15 +1786,12 @@
 		 * ahead of the disk that the kernel memory pool is being
 		 * threatened with exhaustion.
 		 */
-		mtx_lock(&sync_mtx);
 		if (rushjob > 0) {
 			rushjob -= 1;
-			mtx_unlock(&sync_mtx);
 			continue;
 		}
-		mtx_unlock(&sync_mtx);
 		/*
-		 * Just sleep for a short period if time between
+		 * Just sleep for a short period of time between
 		 * iterations when shutting down to allow some I/O
 		 * to happen.
 		 *
@@ -1716,10 +1803,10 @@
 		 * filesystem activity.
 		 */
 		if (syncer_state != SYNCER_RUNNING)
-			tsleep(&dummychan, PPAUSE, "syncfnl",
+			msleep(&dummychan, &sync_mtx, PPAUSE, "syncfnl",
 			    hz / SYNCER_SHUTDOWN_SPEEDUP);
-		else if (time_second == starttime)
-			tsleep(&lbolt, PPAUSE, "syncer", 0);
+		else if (time_uptime == starttime)
+			msleep(&lbolt, &sync_mtx, PPAUSE, "syncer", 0);
 	}
 }
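
sched_sync() drains one slot of syncer_workitem_pending[] per second, and
vn_syncer_add_to_worklist() files each buffer object `delay' slots ahead of the current
position, clamped to the ring size.  A rough user-space sketch of that delay-wheel idea
(hypothetical toy code, not the kernel implementation):

#include <stdio.h>

#define TOY_MAXDELAY 8

static int wheel[TOY_MAXDELAY];	/* bucket contents: just counters here */
static int slot;		/* current position, advances once per tick */

static void
toy_add_to_worklist(int delay)
{
	if (delay > TOY_MAXDELAY - 2)
		delay = TOY_MAXDELAY - 2;
	wheel[(slot + delay) % TOY_MAXDELAY]++;
}

static void
toy_tick(void)
{
	slot = (slot + 1) % TOY_MAXDELAY;
	printf("tick -> slot %d, flushing %d item(s)\n", slot, wheel[slot]);
	wheel[slot] = 0;
}

int
main(void)
{
	int i;

	toy_add_to_worklist(3);
	toy_add_to_worklist(3);
	toy_add_to_worklist(6);
	for (i = 0; i < TOY_MAXDELAY; i++)
		toy_tick();
	return (0);
}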
 
@@ -1729,13 +1816,12 @@
  * normal turn time, otherwise it could take over the cpu.
  */
 int
-speedup_syncer()
+speedup_syncer(void)
 {
 	struct thread *td;
 	int ret = 0;
 
 	td = FIRST_THREAD_IN_PROC(updateproc);
-	sleepq_remove(td, &lbolt);
 	mtx_lock(&sync_mtx);
 	if (rushjob < syncdelay / 2) {
 		rushjob += 1;
@@ -1743,6 +1829,7 @@
 		ret = 1;
 	}
 	mtx_unlock(&sync_mtx);
+	sleepq_remove(td, &lbolt);
 	return (ret);
 }
 
@@ -1758,11 +1845,11 @@
 	if (howto & RB_NOSYNC)
 		return;
 	td = FIRST_THREAD_IN_PROC(updateproc);
-	sleepq_remove(td, &lbolt);
 	mtx_lock(&sync_mtx);
 	syncer_state = SYNCER_SHUTTING_DOWN;
 	rushjob = 0;
 	mtx_unlock(&sync_mtx);
+	sleepq_remove(td, &lbolt);
 	kproc_shutdown(arg, howto);
 }
 
@@ -1827,7 +1914,7 @@
 		if ((bo->bo_flag & BO_ONWORKLST) && bo->bo_dirty.bv_cnt == 0) {
 			mtx_lock(&sync_mtx);
 			LIST_REMOVE(bo, bo_synclist);
- 			syncer_worklist_len--;
+			syncer_worklist_len--;
 			mtx_unlock(&sync_mtx);
 			bo->bo_flag &= ~BO_ONWORKLST;
 		}
@@ -2031,8 +2118,7 @@
  * If count drops to zero, call inactive routine and return to freelist.
  */
 void
-vrele(vp)
-	struct vnode *vp;
+vrele(struct vnode *vp)
 {
 	struct thread *td = curthread;	/* XXX */
 
@@ -2086,11 +2172,10 @@
 /*
  * Release an already locked vnode.  This give the same effects as
  * unlock+vrele(), but takes less time and avoids releasing and
- * re-aquiring the lock (as vrele() aquires the lock internally.)
+ * re-acquiring the lock (as vrele() acquires the lock internally.)
  */
 void
-vput(vp)
-	struct vnode *vp;
+vput(struct vnode *vp)
 {
 	struct thread *td = curthread;	/* XXX */
 	int error;
@@ -2180,10 +2265,11 @@
  * the vnode we will free it if it has been vgone'd otherwise it is
  * placed on the free list.
  */
-static void
+void
 vdropl(struct vnode *vp)
 {
 
+	ASSERT_VI_LOCKED(vp, "vdropl");
 	if (vp->v_holdcnt <= 0)
 		panic("vdrop: holdcnt %d", vp->v_holdcnt);
 	vp->v_holdcnt--;
@@ -2247,11 +2333,7 @@
 #endif
 
 int
-vflush(mp, rootrefs, flags, td)
-	struct mount *mp;
-	int rootrefs;
-	int flags;
-	struct thread *td;
+vflush(struct mount *mp, int rootrefs, int flags, struct thread *td)
 {
 	struct vnode *vp, *mvp, *rootvp = NULL;
 	struct vattr vattr;
@@ -2405,11 +2487,8 @@
 	CTR1(KTR_VFS, "vgonel: vp %p", vp);
 	ASSERT_VOP_LOCKED(vp, "vgonel");
 	ASSERT_VI_LOCKED(vp, "vgonel");
-#if 0
-	/* XXX Need to fix ttyvp before I enable this. */
 	VNASSERT(vp->v_holdcnt, vp,
 	    ("vgonel: vp %p has no reference.", vp));
-#endif
 	td = curthread;
 
 	/*
@@ -2476,8 +2555,7 @@
  * Calculate the total number of references to a special device.
  */
 int
-vcount(vp)
-	struct vnode *vp;
+vcount(struct vnode *vp)
 {
 	int count;
 
@@ -2491,8 +2569,7 @@
  * Same as above, but using the struct cdev *as argument
  */
 int
-count_dev(dev)
-	struct cdev *dev;
+count_dev(struct cdev *dev)
 {
 	int count;
 
@@ -2513,7 +2590,8 @@
 vn_printf(struct vnode *vp, const char *fmt, ...)
 {
 	va_list ap;
-	char buf[96];
+	char buf[256], buf2[16];
+	u_long flags;
 
 	va_start(ap, fmt);
 	vprintf(fmt, ap);
@@ -2525,15 +2603,54 @@
 	buf[0] = '\0';
 	buf[1] = '\0';
 	if (vp->v_vflag & VV_ROOT)
-		strcat(buf, "|VV_ROOT");
+		strlcat(buf, "|VV_ROOT", sizeof(buf));
+	if (vp->v_vflag & VV_ISTTY)
+		strlcat(buf, "|VV_ISTTY", sizeof(buf));
+	if (vp->v_vflag & VV_NOSYNC)
+		strlcat(buf, "|VV_NOSYNC", sizeof(buf));
+	if (vp->v_vflag & VV_CACHEDLABEL)
+		strlcat(buf, "|VV_CACHEDLABEL", sizeof(buf));
 	if (vp->v_vflag & VV_TEXT)
-		strcat(buf, "|VV_TEXT");
+		strlcat(buf, "|VV_TEXT", sizeof(buf));
+	if (vp->v_vflag & VV_COPYONWRITE)
+		strlcat(buf, "|VV_COPYONWRITE", sizeof(buf));
 	if (vp->v_vflag & VV_SYSTEM)
-		strcat(buf, "|VV_SYSTEM");
+		strlcat(buf, "|VV_SYSTEM", sizeof(buf));
+	if (vp->v_vflag & VV_PROCDEP)
+		strlcat(buf, "|VV_PROCDEP", sizeof(buf));
+	if (vp->v_vflag & VV_NOKNOTE)
+		strlcat(buf, "|VV_NOKNOTE", sizeof(buf));
+	if (vp->v_vflag & VV_DELETED)
+		strlcat(buf, "|VV_DELETED", sizeof(buf));
+	if (vp->v_vflag & VV_MD)
+		strlcat(buf, "|VV_MD", sizeof(buf));
+	flags = vp->v_vflag & ~(VV_ROOT | VV_ISTTY | VV_NOSYNC |
+	    VV_CACHEDLABEL | VV_TEXT | VV_COPYONWRITE | VV_SYSTEM | VV_PROCDEP |
+	    VV_NOKNOTE | VV_DELETED | VV_MD);
+	if (flags != 0) {
+		snprintf(buf2, sizeof(buf2), "|VV(0x%lx)", flags);
+		strlcat(buf, buf2, sizeof(buf));
+	}
+	if (vp->v_iflag & VI_MOUNT)
+		strlcat(buf, "|VI_MOUNT", sizeof(buf));
+	if (vp->v_iflag & VI_AGE)
+		strlcat(buf, "|VI_AGE", sizeof(buf));
 	if (vp->v_iflag & VI_DOOMED)
-		strcat(buf, "|VI_DOOMED");
+		strlcat(buf, "|VI_DOOMED", sizeof(buf));
 	if (vp->v_iflag & VI_FREE)
-		strcat(buf, "|VI_FREE");
+		strlcat(buf, "|VI_FREE", sizeof(buf));
+	if (vp->v_iflag & VI_OBJDIRTY)
+		strlcat(buf, "|VI_OBJDIRTY", sizeof(buf));
+	if (vp->v_iflag & VI_DOINGINACT)
+		strlcat(buf, "|VI_DOINGINACT", sizeof(buf));
+	if (vp->v_iflag & VI_OWEINACT)
+		strlcat(buf, "|VI_OWEINACT", sizeof(buf));
+	flags = vp->v_iflag & ~(VI_MOUNT | VI_AGE | VI_DOOMED | VI_FREE |
+	    VI_OBJDIRTY | VI_DOINGINACT | VI_OWEINACT);
+	if (flags != 0) {
+		snprintf(buf2, sizeof(buf2), "|VI(0x%lx)", flags);
+		strlcat(buf, buf2, sizeof(buf));
+	}
 	printf("    flags (%s)\n", buf + 1);
 	if (mtx_owned(VI_MTX(vp)))
 		printf(" VI_LOCKed");
@@ -2549,7 +2666,6 @@
 }
 
 #ifdef DDB
-#include <ddb/ddb.h>
 /*
  * List all of the locked vnodes in the system.
  * Called when debugging the kernel.
@@ -2575,7 +2691,20 @@
 		nmp = TAILQ_NEXT(mp, mnt_list);
 	}
 }
-#endif
+
+/*
+ * Show details about the given vnode.
+ */
+DB_SHOW_COMMAND(vnode, db_show_vnode)
+{
+	struct vnode *vp;
+
+	if (!have_addr)
+		return;
+	vp = (struct vnode *)addr;
+	vn_printf(vp, "vnode ");
+}
+#endif	/* DDB */
 
 /*
  * Fill in a struct xvfsconf based on a struct vfsconf.
@@ -2791,7 +2920,7 @@
  * of mounting to avoid dependencies.
  */
 void
-vfs_unmountall()
+vfs_unmountall(void)
 {
 	struct mount *mp;
 	struct thread *td;
@@ -2839,7 +2968,6 @@
 	struct vnode *vp, *mvp;
 	struct vm_object *obj;
 
-	(void) vn_start_write(NULL, &mp, V_WAIT);
 	MNT_ILOCK(mp);
 	MNT_VNODE_FOREACH(vp, mp, mvp) {
 		VI_LOCK(vp);
@@ -2870,7 +2998,6 @@
 			VI_UNLOCK(vp);
 	}
 	MNT_IUNLOCK(mp);
-	vn_finished_write(mp);
 }
 
 /*
@@ -2945,10 +3072,7 @@
  * to avoid race conditions.)
  */
 int
-vn_pollrecord(vp, td, events)
-	struct vnode *vp;
-	struct thread *td;
-	short events;
+vn_pollrecord(struct vnode *vp, struct thread *td, int events)
 {
 
 	if (vp->v_pollinfo == NULL)
@@ -2988,7 +3112,7 @@
 	.vop_fsync =	sync_fsync,		/* fsync */
 	.vop_inactive =	sync_inactive,	/* inactive */
 	.vop_reclaim =	sync_reclaim,	/* reclaim */
-	.vop_lock =	vop_stdlock,	/* lock */
+	.vop_lock1 =	vop_stdlock,	/* lock */
 	.vop_unlock =	vop_stdunlock,	/* unlock */
 	.vop_islocked =	vop_stdislocked,	/* islocked */
 };
@@ -2997,8 +3121,7 @@
  * Create a new filesystem syncer vnode for the specified mount point.
  */
 int
-vfs_allocate_syncvnode(mp)
-	struct mount *mp;
+vfs_allocate_syncvnode(struct mount *mp)
 {
 	struct vnode *vp;
 	static long start, incr, next;
@@ -3010,6 +3133,9 @@
 		return (error);
 	}
 	vp->v_type = VNON;
+	error = insmntque(vp, mp);
+	if (error != 0)
+		panic("vfs_allocate_syncvnode: insmntque failed");
 	/*
 	 * Place the vnode onto the syncer worklist. We attempt to
 	 * scatter them about on the list so that they will go off
@@ -3042,18 +3168,12 @@
  * Do a lazy sync of the filesystem.
  */
 static int
-sync_fsync(ap)
-	struct vop_fsync_args /* {
-		struct vnode *a_vp;
-		struct ucred *a_cred;
-		int a_waitfor;
-		struct thread *a_td;
-	} */ *ap;
+sync_fsync(struct vop_fsync_args *ap)
 {
 	struct vnode *syncvp = ap->a_vp;
 	struct mount *mp = syncvp->v_mount;
 	struct thread *td = ap->a_td;
-	int error, asyncflag;
+	int error;
 	struct bufobj *bo;
 
 	/*
@@ -3083,12 +3203,17 @@
 		vfs_unbusy(mp, td);
 		return (0);
 	}
-	asyncflag = mp->mnt_flag & MNT_ASYNC;
-	mp->mnt_flag &= ~MNT_ASYNC;
+	MNT_ILOCK(mp);
+	mp->mnt_noasync++;
+	mp->mnt_kern_flag &= ~MNTK_ASYNC;
+	MNT_IUNLOCK(mp);
 	vfs_msync(mp, MNT_NOWAIT);
 	error = VFS_SYNC(mp, MNT_LAZY, td);
-	if (asyncflag)
-		mp->mnt_flag |= MNT_ASYNC;
+	MNT_ILOCK(mp);
+	mp->mnt_noasync--;
+	if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0)
+		mp->mnt_kern_flag |= MNTK_ASYNC;
+	MNT_IUNLOCK(mp);
 	vn_finished_write(mp);
 	vfs_unbusy(mp, td);
 	return (error);
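
The sync_fsync() change above replaces saving and restoring MNT_ASYNC with an
mnt_noasync counter, so overlapping lazy syncs cannot re-enable async writes while
another caller still needs them off.  A tiny sketch of why the counter matters
(hypothetical user-space types, not the kernel code):

#include <stdio.h>

struct toy_mount {
	int async_wanted;	/* admin asked for async (MNT_ASYNC) */
	int noasync;		/* how many callers need it off right now */
	int async_active;	/* effective state (MNTK_ASYNC) */
};

static void
toy_noasync_enter(struct toy_mount *mp)
{
	mp->noasync++;
	mp->async_active = 0;
}

static void
toy_noasync_exit(struct toy_mount *mp)
{
	mp->noasync--;
	if (mp->async_wanted && mp->noasync == 0)
		mp->async_active = 1;
}

int
main(void)
{
	struct toy_mount m = { .async_wanted = 1, .noasync = 0, .async_active = 1 };

	toy_noasync_enter(&m);		/* lazy sync begins */
	toy_noasync_enter(&m);		/* another sync overlaps */
	toy_noasync_exit(&m);
	printf("after first exit: async_active=%d (still off)\n", m.async_active);
	toy_noasync_exit(&m);
	printf("after second exit: async_active=%d (restored)\n", m.async_active);
	return (0);
}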
@@ -3098,11 +3223,7 @@
  * The syncer vnode is not referenced.
  */
 static int
-sync_inactive(ap)
-	struct vop_inactive_args /* {
-		struct vnode *a_vp;
-		struct thread *a_td;
-	} */ *ap;
+sync_inactive(struct vop_inactive_args *ap)
 {
 
 	vgone(ap->a_vp);
@@ -3115,10 +3236,7 @@
  * Modifications to the worklist must be protected by sync_mtx.
  */
 static int
-sync_reclaim(ap)
-	struct vop_reclaim_args /* {
-		struct vnode *a_vp;
-	} */ *ap;
+sync_reclaim(struct vop_reclaim_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct bufobj *bo;
@@ -3129,7 +3247,7 @@
 	if (bo->bo_flag & BO_ONWORKLST) {
 		mtx_lock(&sync_mtx);
 		LIST_REMOVE(bo, bo_synclist);
- 		syncer_worklist_len--;
+		syncer_worklist_len--;
 		sync_vnode_count--;
 		mtx_unlock(&sync_mtx);
 		bo->bo_flag &= ~BO_ONWORKLST;
@@ -3143,9 +3261,7 @@
  * Check if vnode represents a disk device
  */
 int
-vn_isdisk(vp, errp)
-	struct vnode *vp;
-	int *errp;
+vn_isdisk(struct vnode *vp, int *errp)
 {
 	int error;
 
@@ -3171,21 +3287,16 @@
  * and optional call-by-reference privused argument allowing vaccess()
  * to indicate to the caller whether privilege was used to satisfy the
  * request (obsoleted).  Returns 0 on success, or an errno on failure.
+ *
+ * The ifdef'd CAPABILITIES version is here for reference, but is not
+ * actually used.
  */
 int
-vaccess(type, file_mode, file_uid, file_gid, acc_mode, cred, privused)
-	enum vtype type;
-	mode_t file_mode;
-	uid_t file_uid;
-	gid_t file_gid;
-	mode_t acc_mode;
-	struct ucred *cred;
-	int *privused;
+vaccess(enum vtype type, mode_t file_mode, uid_t file_uid, gid_t file_gid,
+    mode_t acc_mode, struct ucred *cred, int *privused)
 {
 	mode_t dac_granted;
-#ifdef CAPABILITIES
-	mode_t cap_granted;
-#endif
+	mode_t priv_granted;
 
 	/*
 	 * Look for a normal, non-privileged way to access the file/directory
@@ -3239,56 +3350,46 @@
 		return (0);
 
 privcheck:
-	if (!suser_cred(cred, SUSER_ALLOWJAIL)) {
-		/* XXX audit: privilege used */
-		if (privused != NULL)
-			*privused = 1;
-		return (0);
-	}
-
-#ifdef CAPABILITIES
 	/*
-	 * Build a capability mask to determine if the set of capabilities
+	 * Build a privilege mask to determine if the set of privileges
 	 * satisfies the requirements when combined with the granted mask
-	 * from above.
-	 * For each capability, if the capability is required, bitwise
-	 * or the request type onto the cap_granted mask.
+	 * from above.  For each privilege, if the privilege is required,
+	 * bitwise or the request type onto the priv_granted mask.
 	 */
-	cap_granted = 0;
+	priv_granted = 0;
 
 	if (type == VDIR) {
 		/*
-		 * For directories, use CAP_DAC_READ_SEARCH to satisfy
-		 * VEXEC requests, instead of CAP_DAC_EXECUTE.
+		 * For directories, use PRIV_VFS_LOOKUP to satisfy VEXEC
+		 * requests, instead of PRIV_VFS_EXEC.
 		 */
 		if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
-		    !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, SUSER_ALLOWJAIL))
-			cap_granted |= VEXEC;
+		    !priv_check_cred(cred, PRIV_VFS_LOOKUP, 0))
+			priv_granted |= VEXEC;
 	} else {
 		if ((acc_mode & VEXEC) && ((dac_granted & VEXEC) == 0) &&
-		    !cap_check(cred, NULL, CAP_DAC_EXECUTE, SUSER_ALLOWJAIL))
-			cap_granted |= VEXEC;
+		    !priv_check_cred(cred, PRIV_VFS_EXEC, 0))
+			priv_granted |= VEXEC;
 	}
 
 	if ((acc_mode & VREAD) && ((dac_granted & VREAD) == 0) &&
-	    !cap_check(cred, NULL, CAP_DAC_READ_SEARCH, SUSER_ALLOWJAIL))
-		cap_granted |= VREAD;
+	    !priv_check_cred(cred, PRIV_VFS_READ, 0))
+		priv_granted |= VREAD;
 
 	if ((acc_mode & VWRITE) && ((dac_granted & VWRITE) == 0) &&
-	    !cap_check(cred, NULL, CAP_DAC_WRITE, SUSER_ALLOWJAIL))
-		cap_granted |= (VWRITE | VAPPEND);
+	    !priv_check_cred(cred, PRIV_VFS_WRITE, 0))
+		priv_granted |= (VWRITE | VAPPEND);
 
 	if ((acc_mode & VADMIN) && ((dac_granted & VADMIN) == 0) &&
-	    !cap_check(cred, NULL, CAP_FOWNER, SUSER_ALLOWJAIL))
-		cap_granted |= VADMIN;
+	    !priv_check_cred(cred, PRIV_VFS_ADMIN, 0))
+		priv_granted |= VADMIN;
 
-	if ((acc_mode & (cap_granted | dac_granted)) == acc_mode) {
+	if ((acc_mode & (priv_granted | dac_granted)) == acc_mode) {
 		/* XXX audit: privilege used */
 		if (privused != NULL)
 			*privused = 1;
 		return (0);
 	}
-#endif
 
 	return ((acc_mode & VADMIN) ? EPERM : EACCES);
 }
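
The reworked vaccess() above replaces the single suser_cred() short-circuit with a
per-bit privilege mask that is OR'ed with the discretionary mask; access is granted only
if the union covers every requested bit.  A stand-alone sketch of that mask arithmetic
(hypothetical helper names, and a simplified root check standing in for priv_check_cred()):

#include <stdio.h>

#define TOY_VEXEC	0x1
#define TOY_VREAD	0x2
#define TOY_VWRITE	0x4
#define TOY_VADMIN	0x8

static int
toy_priv_granted(int wanted, int dac, int is_root)
{
	int priv = 0;

	/* Only consult "privilege" for bits DAC did not already grant. */
	if (is_root) {
		if ((wanted & TOY_VREAD) && !(dac & TOY_VREAD))
			priv |= TOY_VREAD;
		if ((wanted & TOY_VWRITE) && !(dac & TOY_VWRITE))
			priv |= TOY_VWRITE;
		if ((wanted & TOY_VEXEC) && !(dac & TOY_VEXEC))
			priv |= TOY_VEXEC;
		if ((wanted & TOY_VADMIN) && !(dac & TOY_VADMIN))
			priv |= TOY_VADMIN;
	}
	return (priv);
}

int
main(void)
{
	int wanted = TOY_VREAD | TOY_VWRITE;
	int dac = TOY_VREAD;		/* mode bits only allow read */
	int priv = toy_priv_granted(wanted, dac, 1);

	if ((wanted & (dac | priv)) == wanted)
		printf("access granted (dac=0x%x priv=0x%x)\n", dac, priv);
	else
		printf("access denied\n");
	return (0);
}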
@@ -3298,8 +3399,8 @@
  * permissions.
  */
 int
-extattr_check_cred(struct vnode *vp, int attrnamespace,
-    struct ucred *cred, struct thread *td, int access)
+extattr_check_cred(struct vnode *vp, int attrnamespace, struct ucred *cred,
+    struct thread *td, int access)
 {
 
 	/*
@@ -3309,16 +3410,13 @@
 		return (0);
 
 	/*
-	 * Do not allow privileged processes in jail to directly
-	 * manipulate system attributes.
-	 *
-	 * XXX What capability should apply here?
-	 * Probably CAP_SYS_SETFFLAG.
+	 * Do not allow privileged processes in jail to directly manipulate
+	 * system attributes.
 	 */
 	switch (attrnamespace) {
 	case EXTATTR_NAMESPACE_SYSTEM:
 		/* Potentially should be: return (EPERM); */
-		return (suser_cred(cred, 0));
+		return (priv_check_cred(cred, PRIV_VFS_EXTATTR_SYSTEM, 0));
 	case EXTATTR_NAMESPACE_USER:
 		return (VOP_ACCESS(vp, access, cred, td));
 	default:
@@ -3438,10 +3536,10 @@
 	ASSERT_VI_UNLOCKED(a->a_fdvp, "VOP_RENAME");
 
 	/* Check the source (from). */
-	if (a->a_tdvp != a->a_fdvp)
+	if (a->a_tdvp != a->a_fdvp && a->a_tvp != a->a_fdvp)
 		ASSERT_VOP_UNLOCKED(a->a_fdvp, "vop_rename: fdvp locked");
 	if (a->a_tvp != a->a_fvp)
-		ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: tvp locked");
+		ASSERT_VOP_UNLOCKED(a->a_fvp, "vop_rename: fvp locked");
 
 	/* Check the target. */
 	if (a->a_tvp)
@@ -3521,7 +3619,7 @@
 vop_lock_pre(void *ap)
 {
 #ifdef DEBUG_VFS_LOCKS
-	struct vop_lock_args *a = ap;
+	struct vop_lock1_args *a = ap;
 
 	if ((a->a_flags & LK_INTERLOCK) == 0)
 		ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
@@ -3534,7 +3632,7 @@
 vop_lock_post(void *ap, int rc)
 {
 #ifdef DEBUG_VFS_LOCKS
-	struct vop_lock_args *a = ap;
+	struct vop_lock1_args *a = ap;
 
 	ASSERT_VI_UNLOCKED(a->a_vp, "VOP_LOCK");
 	if (rc == 0)
@@ -3571,16 +3669,16 @@
 	struct vop_create_args *a = ap;
 
 	if (!rc)
-		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE); 
+		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
 }
 
 void
 vop_link_post(void *ap, int rc)
 {
 	struct vop_link_args *a = ap;
-	
+
 	if (!rc) {
-		VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK); 
+		VFS_KNOTE_LOCKED(a->a_vp, NOTE_LINK);
 		VFS_KNOTE_LOCKED(a->a_tdvp, NOTE_WRITE);
 	}
 }
@@ -3659,7 +3757,7 @@
 vop_symlink_post(void *ap, int rc)
 {
 	struct vop_symlink_args *a = ap;
-	
+
 	if (!rc)
 		VFS_KNOTE_LOCKED(a->a_dvp, NOTE_WRITE);
 }
@@ -3730,14 +3828,17 @@
 	/* ensure that a specific sysctl goes to the right filesystem. */
 	if (strcmp(vc.vc_fstypename, "*") != 0 &&
 	    strcmp(vc.vc_fstypename, mp->mnt_vfc->vfc_name) != 0) {
+		vfs_rel(mp);
 		return (EINVAL);
 	}
 	VCTLTOREQ(&vc, req);
-	return (VFS_SYSCTL(mp, vc.vc_op, req));
+	error = VFS_SYSCTL(mp, vc.vc_op, req);
+	vfs_rel(mp);
+	return (error);
 }
 
-SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLFLAG_WR,
-        NULL, 0, sysctl_vfs_ctl, "", "Sysctl by fsid");
+SYSCTL_PROC(_vfs, OID_AUTO, ctl, CTLFLAG_WR, NULL, 0, sysctl_vfs_ctl, "",
+    "Sysctl by fsid");
 
 /*
  * Function to initialize a va_filerev field sensibly.
@@ -3792,7 +3893,7 @@
 {
 	struct vnode *vp = ap->a_vp;
 	struct knote *kn = ap->a_kn;
-	struct knlist *knl; 
+	struct knlist *knl;
 
 	switch (kn->kn_filter) {
 	case EVFILT_READ:
@@ -3848,7 +3949,7 @@
 		return (1);
 	}
 
-	if (VOP_GETATTR(vp, &va, curthread->td_ucred, curthread)) 
+	if (VOP_GETATTR(vp, &va, curthread->td_ucred, curthread))
 		return (0);
 
 	kn->kn_data = va.va_size - kn->kn_fp->f_offset;
--- sys/kern/kern_acl.c
+++ /dev/null
@@ -1,1036 +0,0 @@
-/*-
- * Copyright (c) 1999-2003 Robert N. M. Watson
- * All rights reserved.
- *
- * This software was developed by Robert Watson for the TrustedBSD Project.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- *
- * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-/*
- * Developed by the TrustedBSD Project.
- * Support for POSIX.1e access control lists.
- */
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_acl.c,v 1.45.8.2 2005/11/13 03:14:00 csjp Exp $");
-
-#include "opt_mac.h"
-
-#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/sysproto.h>
-#include <sys/kernel.h>
-#include <sys/mac.h>
-#include <sys/malloc.h>
-#include <sys/mount.h>
-#include <sys/vnode.h>
-#include <sys/lock.h>
-#include <sys/mutex.h>
-#include <sys/namei.h>
-#include <sys/file.h>
-#include <sys/filedesc.h>
-#include <sys/proc.h>
-#include <sys/sysent.h>
-#include <sys/errno.h>
-#include <sys/stat.h>
-#include <sys/acl.h>
-
-#include <vm/uma.h>
-
-uma_zone_t	acl_zone;
-static int	vacl_set_acl(struct thread *td, struct vnode *vp,
-		    acl_type_t type, struct acl *aclp);
-static int	vacl_get_acl(struct thread *td, struct vnode *vp,
-		    acl_type_t type, struct acl *aclp);
-static int	vacl_aclcheck(struct thread *td, struct vnode *vp,
-		    acl_type_t type, struct acl *aclp);
-
-/*
- * Implement a version of vaccess() that understands POSIX.1e ACL semantics.
- * Return 0 on success, else an errno value.  Should be merged into
- * vaccess() eventually.
- */
-int
-vaccess_acl_posix1e(enum vtype type, uid_t file_uid, gid_t file_gid,
-    struct acl *acl, mode_t acc_mode, struct ucred *cred, int *privused)
-{
-	struct acl_entry *acl_other, *acl_mask;
-	mode_t dac_granted;
-	mode_t cap_granted;
-	mode_t acl_mask_granted;
-	int group_matched, i;
-
-	/*
-	 * Look for a normal, non-privileged way to access the file/directory
-	 * as requested.  If it exists, go with that.  Otherwise, attempt
-	 * to use privileges granted via cap_granted.  In some cases,
-	 * which privileges to use may be ambiguous due to "best match",
-	 * in which case fall back on first match for the time being.
-	 */
-	if (privused != NULL)
-		*privused = 0;
-
-	/*
-	 * Determine privileges now, but don't apply until we've found
-	 * a DAC entry that matches but has failed to allow access.
-	 */
-#ifndef CAPABILITIES
-	if (suser_cred(cred, SUSER_ALLOWJAIL) == 0)
-		cap_granted = VALLPERM;
-	else
-		cap_granted = 0;
-#else
-	cap_granted = 0;
-
-	if (type == VDIR) {
-		if ((acc_mode & VEXEC) && !cap_check(cred, NULL,
-		     CAP_DAC_READ_SEARCH, SUSER_ALLOWJAIL))
-			cap_granted |= VEXEC;
-	} else {
-		if ((acc_mode & VEXEC) && !cap_check(cred, NULL,
-		    CAP_DAC_EXECUTE, SUSER_ALLOWJAIL))
-			cap_granted |= VEXEC;
-	}
-
-	if ((acc_mode & VREAD) && !cap_check(cred, NULL, CAP_DAC_READ_SEARCH,
-	    SUSER_ALLOWJAIL))
-		cap_granted |= VREAD;
-
-	if (((acc_mode & VWRITE) || (acc_mode & VAPPEND)) &&
-	    !cap_check(cred, NULL, CAP_DAC_WRITE, SUSER_ALLOWJAIL))
-		cap_granted |= (VWRITE | VAPPEND);
-
-	if ((acc_mode & VADMIN) && !cap_check(cred, NULL, CAP_FOWNER,
-	    SUSER_ALLOWJAIL))
-		cap_granted |= VADMIN;
-#endif /* CAPABILITIES */
-
-	/*
-	 * The owner matches if the effective uid associated with the
-	 * credential matches that of the ACL_USER_OBJ entry.  While we're
-	 * doing the first scan, also cache the location of the ACL_MASK
-	 * and ACL_OTHER entries, preventing some future iterations.
-	 */
-	acl_mask = acl_other = NULL;
-	for (i = 0; i < acl->acl_cnt; i++) {
-		switch (acl->acl_entry[i].ae_tag) {
-		case ACL_USER_OBJ:
-			if (file_uid != cred->cr_uid)
-				break;
-			dac_granted = 0;
-			dac_granted |= VADMIN;
-			if (acl->acl_entry[i].ae_perm & ACL_EXECUTE)
-				dac_granted |= VEXEC;
-			if (acl->acl_entry[i].ae_perm & ACL_READ)
-				dac_granted |= VREAD;
-			if (acl->acl_entry[i].ae_perm & ACL_WRITE)
-				dac_granted |= (VWRITE | VAPPEND);
-			if ((acc_mode & dac_granted) == acc_mode)
-				return (0);
-			if ((acc_mode & (dac_granted | cap_granted)) ==
-			    acc_mode) {
-				if (privused != NULL)
-					*privused = 1;
-				return (0);
-			}
-			goto error;
-
-		case ACL_MASK:
-			acl_mask = &acl->acl_entry[i];
-			break;
-
-		case ACL_OTHER:
-			acl_other = &acl->acl_entry[i];
-			break;
-
-		default:
-			break;
-		}
-	}
-
-	/*
-	 * An ACL_OTHER entry should always exist in a valid access
-	 * ACL.  If it doesn't, then generate a serious failure.  For now,
-	 * this means a debugging message and EPERM, but in the future
-	 * should probably be a panic.
-	 */
-	if (acl_other == NULL) {
-		/*
-		 * XXX This should never happen
-		 */
-		printf("vaccess_acl_posix1e: ACL_OTHER missing\n");
-		return (EPERM);
-	}
-
-	/*
-	 * Checks against ACL_USER, ACL_GROUP_OBJ, and ACL_GROUP fields
-	 * are masked by an ACL_MASK entry, if any.  As such, first identify
-	 * the ACL_MASK field, then iterate through identifying potential
-	 * user matches, then group matches.  If there is no ACL_MASK,
-	 * assume that the mask allows all requests to succeed.
-	 */
-	if (acl_mask != NULL) {
-		acl_mask_granted = 0;
-		if (acl_mask->ae_perm & ACL_EXECUTE)
-			acl_mask_granted |= VEXEC;
-		if (acl_mask->ae_perm & ACL_READ)
-			acl_mask_granted |= VREAD;
-		if (acl_mask->ae_perm & ACL_WRITE)
-			acl_mask_granted |= (VWRITE | VAPPEND);
-	} else
-		acl_mask_granted = VEXEC | VREAD | VWRITE | VAPPEND;
-
-	/*
-	 * Iterate through user ACL entries.  Do checks twice, first
-	 * without privilege, and then if a match is found but failed,
-	 * a second time with privilege.
-	 */
-
-	/*
-	 * Check ACL_USER ACL entries.
-	 */
-	for (i = 0; i < acl->acl_cnt; i++) {
-		switch (acl->acl_entry[i].ae_tag) {
-		case ACL_USER:
-			if (acl->acl_entry[i].ae_id != cred->cr_uid)
-				break;
-			dac_granted = 0;
-			if (acl->acl_entry[i].ae_perm & ACL_EXECUTE)
-				dac_granted |= VEXEC;
-			if (acl->acl_entry[i].ae_perm & ACL_READ)
-				dac_granted |= VREAD;
-			if (acl->acl_entry[i].ae_perm & ACL_WRITE)
-				dac_granted |= (VWRITE | VAPPEND);
-			dac_granted &= acl_mask_granted;
-			if ((acc_mode & dac_granted) == acc_mode)
-				return (0);
-			if ((acc_mode & (dac_granted | cap_granted)) !=
-			    acc_mode)
-				goto error;
-
-			if (privused != NULL)
-				*privused = 1;
-			return (0);
-		}
-	}
-
-	/*
-	 * Group match is best-match, not first-match, so find a 
-	 * "best" match.  Iterate across, testing each potential group
-	 * match.  Make sure we keep track of whether we found a match
-	 * or not, so that we know if we should try again with any
-	 * available privilege, or if we should move on to ACL_OTHER.
-	 */
-	group_matched = 0;
-	for (i = 0; i < acl->acl_cnt; i++) {
-		switch (acl->acl_entry[i].ae_tag) {
-		case ACL_GROUP_OBJ:
-			if (!groupmember(file_gid, cred))
-				break;
-			dac_granted = 0;
-			if (acl->acl_entry[i].ae_perm & ACL_EXECUTE)
-				dac_granted |= VEXEC;
-			if (acl->acl_entry[i].ae_perm & ACL_READ)
-				dac_granted |= VREAD;
-			if (acl->acl_entry[i].ae_perm & ACL_WRITE)
-				dac_granted |= (VWRITE | VAPPEND);
-			dac_granted  &= acl_mask_granted;
-
-			if ((acc_mode & dac_granted) == acc_mode)
-				return (0);
-
-			group_matched = 1;
-			break;
-
-		case ACL_GROUP:
-			if (!groupmember(acl->acl_entry[i].ae_id, cred))
-				break;
-			dac_granted = 0;
-			if (acl->acl_entry[i].ae_perm & ACL_EXECUTE)
-				dac_granted |= VEXEC;
-			if (acl->acl_entry[i].ae_perm & ACL_READ)
-				dac_granted |= VREAD;
-			if (acl->acl_entry[i].ae_perm & ACL_WRITE)
-				dac_granted |= (VWRITE | VAPPEND);
-			dac_granted  &= acl_mask_granted;
-
-			if ((acc_mode & dac_granted) == acc_mode)
-				return (0);
-
-			group_matched = 1;
-			break;
-
-		default:
-			break;
-		}
-	}
-
-	if (group_matched == 1) {
-		/*
-		 * There was a match, but it did not grant rights via
-		 * pure DAC.  Try again, this time with privilege.
-		 */
-		for (i = 0; i < acl->acl_cnt; i++) {
-			switch (acl->acl_entry[i].ae_tag) {
-			case ACL_GROUP_OBJ:
-				if (!groupmember(file_gid, cred))
-					break;
-				dac_granted = 0;
-				if (acl->acl_entry[i].ae_perm & ACL_EXECUTE)
-					dac_granted |= VEXEC;
-				if (acl->acl_entry[i].ae_perm & ACL_READ)
-					dac_granted |= VREAD;
-				if (acl->acl_entry[i].ae_perm & ACL_WRITE)
-					dac_granted |= (VWRITE | VAPPEND);
-				dac_granted &= acl_mask_granted;
-
-				if ((acc_mode & (dac_granted | cap_granted)) !=
-				    acc_mode)
-					break;
-
-				if (privused != NULL)
-					*privused = 1;
-				return (0);
-
-			case ACL_GROUP:
-				if (!groupmember(acl->acl_entry[i].ae_id,
-				    cred))
-					break;
-				dac_granted = 0;
-				if (acl->acl_entry[i].ae_perm & ACL_EXECUTE)
-				dac_granted |= VEXEC;
-				if (acl->acl_entry[i].ae_perm & ACL_READ)
-					dac_granted |= VREAD;
-				if (acl->acl_entry[i].ae_perm & ACL_WRITE)
-					dac_granted |= (VWRITE | VAPPEND);
-				dac_granted &= acl_mask_granted;
-
-				if ((acc_mode & (dac_granted | cap_granted)) !=
-				    acc_mode)
-					break;
-
-				if (privused != NULL)
-					*privused = 1;
-				return (0);
-
-			default:
-				break;
-			}
-		}
-		/*
-		 * Even with privilege, group membership was not sufficient.
-		 * Return failure.
-		 */
-		goto error;
-	}
-		
-	/*
-	 * Fall back on ACL_OTHER.  ACL_MASK is not applied to ACL_OTHER.
-	 */
-	dac_granted = 0;
-	if (acl_other->ae_perm & ACL_EXECUTE)
-		dac_granted |= VEXEC;
-	if (acl_other->ae_perm & ACL_READ)
-		dac_granted |= VREAD;
-	if (acl_other->ae_perm & ACL_WRITE)
-		dac_granted |= (VWRITE | VAPPEND);
-
-	if ((acc_mode & dac_granted) == acc_mode)
-		return (0);
-	if ((acc_mode & (dac_granted | cap_granted)) == acc_mode) {
-		if (privused != NULL)
-			*privused = 1;
-		return (0);
-	}
-
-error:
-	return ((acc_mode & VADMIN) ? EPERM : EACCES);
-}
-
-/*
- * For the purposes of filesystems maintaining the _OBJ entries in an
- * inode with a mode_t field, this routine converts a mode_t entry
- * to an acl_perm_t.
- */
-acl_perm_t
-acl_posix1e_mode_to_perm(acl_tag_t tag, mode_t mode)
-{
-	acl_perm_t	perm = 0;
-
-	switch(tag) {
-	case ACL_USER_OBJ:
-		if (mode & S_IXUSR)
-			perm |= ACL_EXECUTE;
-		if (mode & S_IRUSR)
-			perm |= ACL_READ;
-		if (mode & S_IWUSR)
-			perm |= ACL_WRITE;
-		return (perm);
-
-	case ACL_GROUP_OBJ:
-		if (mode & S_IXGRP)
-			perm |= ACL_EXECUTE;
-		if (mode & S_IRGRP)
-			perm |= ACL_READ;
-		if (mode & S_IWGRP)
-			perm |= ACL_WRITE;
-		return (perm);
-
-	case ACL_OTHER:
-		if (mode & S_IXOTH)
-			perm |= ACL_EXECUTE;
-		if (mode & S_IROTH)
-			perm |= ACL_READ;
-		if (mode & S_IWOTH)
-			perm |= ACL_WRITE;
-		return (perm);
-
-	default:
-		printf("acl_posix1e_mode_to_perm: invalid tag (%d)\n", tag);
-		return (0);
-	}
-}
-
-/*
- * Given inode information (uid, gid, mode), return an acl entry of the
- * appropriate type.
- */
-struct acl_entry
-acl_posix1e_mode_to_entry(acl_tag_t tag, uid_t uid, gid_t gid, mode_t mode)
-{
-	struct acl_entry	acl_entry;
-
-	acl_entry.ae_tag = tag;
-	acl_entry.ae_perm = acl_posix1e_mode_to_perm(tag, mode);
-	switch(tag) {
-	case ACL_USER_OBJ:
-		acl_entry.ae_id = uid;
-		break;
-
-	case ACL_GROUP_OBJ:
-		acl_entry.ae_id = gid;
-		break;
-
-	case ACL_OTHER:
-		acl_entry.ae_id = ACL_UNDEFINED_ID;
-		break;
-
-	default:
-		acl_entry.ae_id = ACL_UNDEFINED_ID;
-		printf("acl_posix1e_mode_to_entry: invalid tag (%d)\n", tag);
-	}
-
-	return (acl_entry);
-}
-
-/*
- * Utility function to generate a file mode given appropriate ACL entries.
- */
-mode_t
-acl_posix1e_perms_to_mode(struct acl_entry *acl_user_obj_entry,
-    struct acl_entry *acl_group_obj_entry, struct acl_entry *acl_other_entry)
-{
-	mode_t	mode;
-
-	mode = 0;
-	if (acl_user_obj_entry->ae_perm & ACL_EXECUTE)
-		mode |= S_IXUSR;
-	if (acl_user_obj_entry->ae_perm & ACL_READ)
-		mode |= S_IRUSR;
-	if (acl_user_obj_entry->ae_perm & ACL_WRITE)
-		mode |= S_IWUSR;
-	if (acl_group_obj_entry->ae_perm & ACL_EXECUTE)
-		mode |= S_IXGRP;
-	if (acl_group_obj_entry->ae_perm & ACL_READ)
-		mode |= S_IRGRP;
-	if (acl_group_obj_entry->ae_perm & ACL_WRITE)
-		mode |= S_IWGRP;
-	if (acl_other_entry->ae_perm & ACL_EXECUTE)
-		mode |= S_IXOTH;
-	if (acl_other_entry->ae_perm & ACL_READ)
-		mode |= S_IROTH;
-	if (acl_other_entry->ae_perm & ACL_WRITE)
-		mode |= S_IWOTH;
-
-	return (mode);
-}
-
-/*
- * Utility function to generate a file mode given a complete POSIX.1e
- * access ACL.  Note that if the ACL is improperly formed, this may
- * result in a panic.
- */
-mode_t
-acl_posix1e_acl_to_mode(struct acl *acl)
-{
-	struct acl_entry *acl_mask, *acl_user_obj, *acl_group_obj, *acl_other;
-	int i;
-
-	/*
-	 * Find the ACL entries relevant to a POSIX permission mode.
-	 */
-	acl_user_obj = acl_group_obj = acl_other = acl_mask = NULL;
-	for (i = 0; i < acl->acl_cnt; i++) {
-		switch (acl->acl_entry[i].ae_tag) {
-		case ACL_USER_OBJ:
-			acl_user_obj = &acl->acl_entry[i];
-			break;
-
-		case ACL_GROUP_OBJ:
-			acl_group_obj = &acl->acl_entry[i];
-			break;
-
-		case ACL_OTHER:
-			acl_other = &acl->acl_entry[i];
-			break;
-
-		case ACL_MASK:
-			acl_mask = &acl->acl_entry[i];
-			break;
-
-		case ACL_USER:
-		case ACL_GROUP:
-			break;
-
-		default:
-			panic("acl_posix1e_acl_to_mode: bad ae_tag");
-		}
-	}
-
-	if (acl_user_obj == NULL || acl_group_obj == NULL || acl_other == NULL)
-		panic("acl_posix1e_acl_to_mode: missing base ae_tags");
-
-	/*
-	 * POSIX.1e specifies that if there is an ACL_MASK entry, we replace
-	 * the mode "group" bits with its permissions.  If there isn't, we
-	 * use the ACL_GROUP_OBJ permissions.
-	 */
-	if (acl_mask != NULL)
-		return (acl_posix1e_perms_to_mode(acl_user_obj, acl_mask,
-		    acl_other));
-	else
-		return (acl_posix1e_perms_to_mode(acl_user_obj, acl_group_obj,
-		    acl_other));
-}
-
-/*
- * Perform a syntactic check of the ACL, sufficient to allow an
- * implementing filesystem to determine if it should accept this and
- * rely on the POSIX.1e ACL properties.
- */
-int
-acl_posix1e_check(struct acl *acl)
-{
-	int num_acl_user_obj, num_acl_user, num_acl_group_obj, num_acl_group;
-	int num_acl_mask, num_acl_other, i;
-
-	/*
-	 * Verify that the number of entries does not exceed the maximum
-	 * defined for acl_t.
-	 * Verify that the correct number of various sorts of ae_tags are
-	 * present:
-	 *   Exactly one ACL_USER_OBJ
-	 *   Exactly one ACL_GROUP_OBJ
-	 *   Exactly one ACL_OTHER
-	 *   If any ACL_USER or ACL_GROUP entries appear, then exactly one
-	 *   ACL_MASK entry must also appear.
-	 * Verify that all ae_perm entries are in ACL_PERM_BITS.
-	 * Verify all ae_tag entries are understood by this implementation.
-	 * Note: Does not check for uniqueness of qualifier (ae_id) field.
-	 */
-	num_acl_user_obj = num_acl_user = num_acl_group_obj = num_acl_group =
-	    num_acl_mask = num_acl_other = 0;
-	if (acl->acl_cnt > ACL_MAX_ENTRIES || acl->acl_cnt < 0)
-		return (EINVAL);
-	for (i = 0; i < acl->acl_cnt; i++) {
-		/*
-		 * Check for a valid tag.
-		 */
-		switch(acl->acl_entry[i].ae_tag) {
-		case ACL_USER_OBJ:
-			acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */
-			if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID)
-				return (EINVAL);
-			num_acl_user_obj++;
-			break;
-		case ACL_GROUP_OBJ:
-			acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */
-			if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID)
-				return (EINVAL);
-			num_acl_group_obj++;
-			break;
-		case ACL_USER:
-			if (acl->acl_entry[i].ae_id == ACL_UNDEFINED_ID)
-				return (EINVAL);
-			num_acl_user++;
-			break;
-		case ACL_GROUP:
-			if (acl->acl_entry[i].ae_id == ACL_UNDEFINED_ID)
-				return (EINVAL);
-			num_acl_group++;
-			break;
-		case ACL_OTHER:
-			acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */
-			if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID)
-				return (EINVAL);
-			num_acl_other++;
-			break;
-		case ACL_MASK:
-			acl->acl_entry[i].ae_id = ACL_UNDEFINED_ID; /* XXX */
-			if (acl->acl_entry[i].ae_id != ACL_UNDEFINED_ID)
-				return (EINVAL);
-			num_acl_mask++;
-			break;
-		default:
-			return (EINVAL);
-		}
-		/*
-		 * Check for valid perm entries.
-		 */
-		if ((acl->acl_entry[i].ae_perm | ACL_PERM_BITS) !=
-		    ACL_PERM_BITS)
-			return (EINVAL);
-	}
-	if ((num_acl_user_obj != 1) || (num_acl_group_obj != 1) ||
-	    (num_acl_other != 1) || (num_acl_mask != 0 && num_acl_mask != 1))
-		return (EINVAL);
-	if (((num_acl_group != 0) || (num_acl_user != 0)) &&
-	    (num_acl_mask != 1))
-		return (EINVAL);
-	return (0);
-}
-
-/*
- * Given a requested mode for a new object, and a default ACL, combine
- * the two to produce a new mode.  Be careful not to clear any bits that
- * aren't intended to be affected by the POSIX.1e ACL.  Eventually,
- * this might also take the cmask as an argument, if we push that down
- * into per-filesystem-code.
- */
-mode_t
-acl_posix1e_newfilemode(mode_t cmode, struct acl *dacl)
-{
-	mode_t mode;
-
-	mode = cmode;
-	/*
-	 * The current composition policy is that a permission bit must
-	 * be set in *both* the ACL and the requested creation mode for
-	 * it to appear in the resulting mode/ACL.  First clear any
-	 * possibly effected bits, then reconstruct.
-	 */
-	mode &= ACL_PRESERVE_MASK;
-	mode |= (ACL_OVERRIDE_MASK & cmode & acl_posix1e_acl_to_mode(dacl));
-
-	return (mode);
-}
-
-/*
- * These calls wrap the real vnode operations, and are called by the 
- * syscall code once the syscall has converted the path or file
- * descriptor to a vnode (unlocked).  The aclp pointer is assumed
- * still to point to userland, so this should not be consumed within
- * the kernel except by syscall code.  Other code should directly
- * invoke VOP_{SET,GET}ACL.
- */
-
-/*
- * Given a vnode, set its ACL.
- */
-static int
-vacl_set_acl(struct thread *td, struct vnode *vp, acl_type_t type,
-    struct acl *aclp)
-{
-	struct acl inkernacl;
-	struct mount *mp;
-	int error;
-
-	error = copyin(aclp, &inkernacl, sizeof(struct acl));
-	if (error)
-		return(error);
-	error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
-	if (error != 0)
-		return (error);
-	VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
-#ifdef MAC
-	error = mac_check_vnode_setacl(td->td_ucred, vp, type, &inkernacl);
-	if (error != 0)
-		goto out;
-#endif
-	error = VOP_SETACL(vp, type, &inkernacl, td->td_ucred, td);
-#ifdef MAC
-out:
-#endif
-	VOP_UNLOCK(vp, 0, td);
-	vn_finished_write(mp);
-	return(error);
-}
-
-/*
- * Given a vnode, get its ACL.
- */
-static int
-vacl_get_acl(struct thread *td, struct vnode *vp, acl_type_t type,
-    struct acl *aclp)
-{
-	struct acl inkernelacl;
-	int error;
-
-	VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
-#ifdef MAC
-	error = mac_check_vnode_getacl(td->td_ucred, vp, type);
-	if (error != 0)
-		goto out;
-#endif
-	error = VOP_GETACL(vp, type, &inkernelacl, td->td_ucred, td);
-#ifdef MAC
-out:
-#endif
-	VOP_UNLOCK(vp, 0, td);
-	if (error == 0)
-		error = copyout(&inkernelacl, aclp, sizeof(struct acl));
-	return (error);
-}
-
-/*
- * Given a vnode, delete its ACL.
- */
-static int
-vacl_delete(struct thread *td, struct vnode *vp, acl_type_t type)
-{
-	struct mount *mp;
-	int error;
-
-	error = vn_start_write(vp, &mp, V_WAIT | PCATCH);
-	if (error)
-		return (error);
-	VOP_LEASE(vp, td, td->td_ucred, LEASE_WRITE);
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
-#ifdef MAC
-	error = mac_check_vnode_deleteacl(td->td_ucred, vp, type);
-	if (error)
-		goto out;
-#endif
-	error = VOP_SETACL(vp, type, 0, td->td_ucred, td);
-#ifdef MAC
-out:
-#endif
-	VOP_UNLOCK(vp, 0, td);
-	vn_finished_write(mp);
-	return (error);
-}
-
-/*
- * Given a vnode, check whether an ACL is appropriate for it
- */
-static int
-vacl_aclcheck(struct thread *td, struct vnode *vp, acl_type_t type,
-    struct acl *aclp)
-{
-	struct acl inkernelacl;
-	int error;
-
-	error = copyin(aclp, &inkernelacl, sizeof(struct acl));
-	if (error)
-		return(error);
-	error = VOP_ACLCHECK(vp, type, &inkernelacl, td->td_ucred, td);
-	return (error);
-}
-
-/*
- * syscalls -- convert the path/fd to a vnode, and call vacl_whatever.
- * Don't need to lock, as the vacl_ code will get/release any locks
- * required.
- */
-
-/*
- * Given a file path, get an ACL for it
- *
- * MPSAFE
- */
-int
-__acl_get_file(struct thread *td, struct __acl_get_file_args *uap)
-{
-	struct nameidata nd;
-	int vfslocked, error;
-
-	NDINIT(&nd, LOOKUP, MPSAFE|FOLLOW, UIO_USERSPACE, uap->path, td);
-	error = namei(&nd);
-	vfslocked = NDHASGIANT(&nd);
-	if (error == 0) {
-		error = vacl_get_acl(td, nd.ni_vp, uap->type, uap->aclp);
-		NDFREE(&nd, 0);
-	}
-	VFS_UNLOCK_GIANT(vfslocked);
-	return (error);
-}
-
-/*
- * Given a file path, get an ACL for it; don't follow links.
- *
- * MPSAFE
- */
-int
-__acl_get_link(struct thread *td, struct __acl_get_link_args *uap)
-{
-	struct nameidata nd;
-	int vfslocked, error;
-
-	NDINIT(&nd, LOOKUP, MPSAFE|NOFOLLOW, UIO_USERSPACE, uap->path, td);
-	error = namei(&nd);
-	vfslocked = NDHASGIANT(&nd);
-	if (error == 0) {
-		error = vacl_get_acl(td, nd.ni_vp, uap->type, uap->aclp);
-		NDFREE(&nd, 0);
-	}
-	VFS_UNLOCK_GIANT(vfslocked);
-	return (error);
-}
-
-/*
- * Given a file path, set an ACL for it
- *
- * MPSAFE
- */
-int
-__acl_set_file(struct thread *td, struct __acl_set_file_args *uap)
-{
-	struct nameidata nd;
-	int vfslocked, error;
-
-	NDINIT(&nd, LOOKUP, MPSAFE|FOLLOW, UIO_USERSPACE, uap->path, td);
-	error = namei(&nd);
-	vfslocked = NDHASGIANT(&nd);
-	if (error == 0) {
-		error = vacl_set_acl(td, nd.ni_vp, uap->type, uap->aclp);
-		NDFREE(&nd, 0);
-	}
-	VFS_UNLOCK_GIANT(vfslocked);
-	return (error);
-}
-
-/*
- * Given a file path, set an ACL for it; don't follow links.
- *
- * MPSAFE
- */
-int
-__acl_set_link(struct thread *td, struct __acl_set_link_args *uap)
-{
-	struct nameidata nd;
-	int vfslocked, error;
-
-	NDINIT(&nd, LOOKUP, MPSAFE|NOFOLLOW, UIO_USERSPACE, uap->path, td);
-	error = namei(&nd);
-	vfslocked = NDHASGIANT(&nd);
-	if (error == 0) {
-		error = vacl_set_acl(td, nd.ni_vp, uap->type, uap->aclp);
-		NDFREE(&nd, 0);
-	}
-	VFS_UNLOCK_GIANT(vfslocked);
-	return (error);
-}
-
-/*
- * Given a file descriptor, get an ACL for it
- *
- * MPSAFE
- */
-int
-__acl_get_fd(struct thread *td, struct __acl_get_fd_args *uap)
-{
-	struct file *fp;
-	int vfslocked, error;
-
-	error = getvnode(td->td_proc->p_fd, uap->filedes, &fp);
-	if (error == 0) {
-		vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
-		error = vacl_get_acl(td, fp->f_vnode, uap->type, uap->aclp);
-		fdrop(fp, td);
-		VFS_UNLOCK_GIANT(vfslocked);
-	}
-	return (error);
-}
-
-/*
- * Given a file descriptor, set an ACL for it
- *
- * MPSAFE
- */
-int
-__acl_set_fd(struct thread *td, struct __acl_set_fd_args *uap)
-{
-	struct file *fp;
-	int vfslocked, error;
-
-	error = getvnode(td->td_proc->p_fd, uap->filedes, &fp);
-	if (error == 0) {
-		vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
-		error = vacl_set_acl(td, fp->f_vnode, uap->type, uap->aclp);
-		fdrop(fp, td);
-		VFS_UNLOCK_GIANT(vfslocked);
-	}
-	return (error);
-}
-
-/*
- * Given a file path, delete an ACL from it.
- *
- * MPSAFE
- */
-int
-__acl_delete_file(struct thread *td, struct __acl_delete_file_args *uap)
-{
-	struct nameidata nd;
-	int vfslocked, error;
-
-	NDINIT(&nd, LOOKUP, MPSAFE|FOLLOW, UIO_USERSPACE, uap->path, td);
-	error = namei(&nd);
-	vfslocked = NDHASGIANT(&nd);
-	if (error == 0) {
-		error = vacl_delete(td, nd.ni_vp, uap->type);
-		NDFREE(&nd, 0);
-	}
-	VFS_UNLOCK_GIANT(vfslocked);
-	return (error);
-}
-
-/*
- * Given a file path, delete an ACL from it; don't follow links.
- *
- * MPSAFE
- */
-int
-__acl_delete_link(struct thread *td, struct __acl_delete_link_args *uap)
-{
-	struct nameidata nd;
-	int vfslocked, error;
-
-	NDINIT(&nd, LOOKUP, MPSAFE|NOFOLLOW, UIO_USERSPACE, uap->path, td);
-	error = namei(&nd);
-	vfslocked = NDHASGIANT(&nd);
-	if (error == 0) {
-		error = vacl_delete(td, nd.ni_vp, uap->type);
-		NDFREE(&nd, 0);
-	}
-	VFS_UNLOCK_GIANT(vfslocked);
-	return (error);
-}
-
-/*
- * Given a file path, delete an ACL from it.
- *
- * MPSAFE
- */
-int
-__acl_delete_fd(struct thread *td, struct __acl_delete_fd_args *uap)
-{
-	struct file *fp;
-	int vfslocked, error;
-
-	error = getvnode(td->td_proc->p_fd, uap->filedes, &fp);
-	if (error == 0) {
-		vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
-		error = vacl_delete(td, fp->f_vnode, uap->type);
-		fdrop(fp, td);
-		VFS_UNLOCK_GIANT(vfslocked);
-	}
-	return (error);
-}
-
-/*
- * Given a file path, check an ACL for it
- *
- * MPSAFE
- */
-int
-__acl_aclcheck_file(struct thread *td, struct __acl_aclcheck_file_args *uap)
-{
-	struct nameidata	nd;
-	int vfslocked, error;
-
-	NDINIT(&nd, LOOKUP, MPSAFE|FOLLOW, UIO_USERSPACE, uap->path, td);
-	error = namei(&nd);
-	vfslocked = NDHASGIANT(&nd);
-	if (error == 0) {
-		error = vacl_aclcheck(td, nd.ni_vp, uap->type, uap->aclp);
-		NDFREE(&nd, 0);
-	}
-	VFS_UNLOCK_GIANT(vfslocked);
-	return (error);
-}
-
-/*
- * Given a file path, check an ACL for it; don't follow links.
- *
- * MPSAFE
- */
-int
-__acl_aclcheck_link(struct thread *td, struct __acl_aclcheck_link_args *uap)
-{
-	struct nameidata	nd;
-	int vfslocked, error;
-
-	NDINIT(&nd, LOOKUP, MPSAFE|NOFOLLOW, UIO_USERSPACE, uap->path, td);
-	error = namei(&nd);
-	vfslocked = NDHASGIANT(&nd);
-	if (error == 0) {
-		error = vacl_aclcheck(td, nd.ni_vp, uap->type, uap->aclp);
-		NDFREE(&nd, 0);
-	}
-	VFS_UNLOCK_GIANT(vfslocked);
-	return (error);
-}
-
-/*
- * Given a file descriptor, check an ACL for it
- *
- * MPSAFE
- */
-int
-__acl_aclcheck_fd(struct thread *td, struct __acl_aclcheck_fd_args *uap)
-{
-	struct file *fp;
-	int vfslocked, error;
-
-	error = getvnode(td->td_proc->p_fd, uap->filedes, &fp);
-	if (error == 0) {
-		vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
-		error = vacl_aclcheck(td, fp->f_vnode, uap->type, uap->aclp);
-		fdrop(fp, td);
-		VFS_UNLOCK_GIANT(vfslocked);
-	}
-	return (error);
-}
-
-/* ARGUSED */
-
-static void
-aclinit(void *dummy __unused)
-{
-
-	acl_zone = uma_zcreate("ACL UMA zone", sizeof(struct acl),
-	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
-}
-SYSINIT(acls, SI_SUB_ACL, SI_ORDER_FIRST, aclinit, NULL)
Index: syscalls.master
===================================================================
RCS file: /home/cvs/src/sys/kern/syscalls.master,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/syscalls.master -L sys/kern/syscalls.master -u -r1.2 -r1.3
--- sys/kern/syscalls.master
+++ sys/kern/syscalls.master
@@ -1,19 +1,18 @@
- $FreeBSD: src/sys/kern/syscalls.master,v 1.198.2.2 2006/03/17 01:47:06 rwatson Exp $
+ $FreeBSD: src/sys/kern/syscalls.master,v 1.233 2007/08/16 05:26:41 davidxu Exp $
 ;	from: @(#)syscalls.master	8.2 (Berkeley) 1/13/94
 ;
 ; System call name/number master file.
 ; Processed to create init_sysent.c, syscalls.c and syscall.h.
 
-; Columns: number [M]type nargs name alt{name,tag,rtyp}/comments
+; Columns: number audit type name alt{name,tag,rtyp}/comments
 ;	number	system call number, must be in order
 ;	audit	the audit event associated with the system call
 ;		A value of AUE_NULL means no auditing, but it also means that
 ;		there is no audit event for the call at this time. For the
 ;		case where the event exists, but we don't want auditing, the
 ;		event should be #defined to AUE_NULL in audit_kevents.h.
-;	type	one of [M]STD, [M]OBSOL, [M]UNIMPL, [M]COMPAT, [M]CPT_NOA,
-;		[M]LIBCOMPAT, [M]NODEF, [M]NOARGS, [M]NOPROTO, [M]NOIMPL,
-;		[M]NOSTD, [M]COMPAT4
+;	type	one of STD, OBSOL, UNIMPL, COMPAT, CPT_NOA, LIBCOMPAT,
+;		NODEF, NOARGS, NOPROTO, NOIMPL, NOSTD, COMPAT4
 ;	name	pseudo-prototype of syscall routine
 ;		If one of the following alts is different, then all appear:
 ;	altname	name of system call if different
@@ -22,9 +21,6 @@
 ;		for UNIMPL/OBSOL, name continues with comments
 
 ; types:
-;	[M]	e.g. like MSTD -- means the system call is MP-safe.  If no
-;		M prefix is used, the syscall wrapper will obtain the Giant
-;		lock for the syscall.
 ;	STD	always included
 ;	COMPAT	included on COMPAT #ifdef
 ;	COMPAT4	included on COMPAT4 #ifdef (FreeBSD 4 compat)
@@ -32,8 +28,8 @@
 ;	OBSOL	obsolete, not included in system, only specifies name
 ;	UNIMPL	not implemented, placeholder only
 ;	NOSTD	implemented but as a lkm that can be statically
-;			compiled in sysent entry will be filled with lkmsys
-;			so the SYSCALL_MODULE macro works
+;		compiled in; sysent entry will be filled with lkmsys
+;		so the SYSCALL_MODULE macro works
 ;
 ; Please copy any additions and changes to the following compatibility tables:
 ; sys/compat/freebsd32/syscalls.master
@@ -50,247 +46,247 @@
 ; redistributions should be placed in the reserved range at the end
 ; of the current calls.
 
-0	AUE_NULL	MSTD	{ int nosys(void); } syscall nosys_args int
-1	AUE_NULL	MSTD	{ void sys_exit(int rval); } exit \
+0	AUE_NULL	STD	{ int nosys(void); } syscall nosys_args int
+1	AUE_EXIT	STD	{ void sys_exit(int rval); } exit \
 				    sys_exit_args void
-2	AUE_NULL	MSTD	{ int fork(void); }
-3	AUE_NULL	MSTD	{ ssize_t read(int fd, void *buf, \
+2	AUE_FORK	STD	{ int fork(void); }
+3	AUE_NULL	STD	{ ssize_t read(int fd, void *buf, \
 				    size_t nbyte); }
-4	AUE_NULL	MSTD	{ ssize_t write(int fd, const void *buf, \
+4	AUE_NULL	STD	{ ssize_t write(int fd, const void *buf, \
 				    size_t nbyte); }
-5	AUE_NULL	MSTD	{ int open(char *path, int flags, int mode); }
+5	AUE_OPEN_RWTC	STD	{ int open(char *path, int flags, int mode); }
 ; XXX should be		{ int open(const char *path, int flags, ...); }
 ; but we're not ready for `const' or varargs.
 ; XXX man page says `mode_t mode'.
-6	AUE_NULL	MSTD	{ int close(int fd); }
-7	AUE_NULL	MSTD	{ int wait4(int pid, int *status, \
+6	AUE_CLOSE	STD	{ int close(int fd); }
+7	AUE_WAIT4	STD	{ int wait4(int pid, int *status, \
 				    int options, struct rusage *rusage); } \
 				    wait4 wait_args int
-8	AUE_NULL	MCOMPAT	{ int creat(char *path, int mode); }
-9	AUE_NULL	MSTD	{ int link(char *path, char *link); }
-10	AUE_NULL	MSTD	{ int unlink(char *path); }
+8	AUE_CREAT	COMPAT	{ int creat(char *path, int mode); }
+9	AUE_LINK	STD	{ int link(char *path, char *link); }
+10	AUE_UNLINK	STD	{ int unlink(char *path); }
 11	AUE_NULL	OBSOL	execv
-12	AUE_NULL	MSTD	{ int chdir(char *path); }
-13	AUE_NULL	MSTD	{ int fchdir(int fd); }
-14	AUE_NULL	MSTD	{ int mknod(char *path, int mode, int dev); }
-15	AUE_NULL	MSTD	{ int chmod(char *path, int mode); }
-16	AUE_NULL	MSTD	{ int chown(char *path, int uid, int gid); }
-17	AUE_NULL	MSTD	{ int obreak(char *nsize); } break \
+12	AUE_CHDIR	STD	{ int chdir(char *path); }
+13	AUE_FCHDIR	STD	{ int fchdir(int fd); }
+14	AUE_MKNOD	STD	{ int mknod(char *path, int mode, int dev); }
+15	AUE_CHMOD	STD	{ int chmod(char *path, int mode); }
+16	AUE_CHOWN	STD	{ int chown(char *path, int uid, int gid); }
+17	AUE_NULL	STD	{ int obreak(char *nsize); } break \
 				    obreak_args int
-18	AUE_NULL	MCOMPAT4	{ int getfsstat(struct ostatfs *buf, \
+18	AUE_GETFSSTAT	COMPAT4	{ int getfsstat(struct ostatfs *buf, \
 				    long bufsize, int flags); }
-19	AUE_NULL	MCOMPAT	{ long lseek(int fd, long offset, \
+19	AUE_LSEEK	COMPAT	{ long lseek(int fd, long offset, \
 				    int whence); }
-20	AUE_NULL	MSTD	{ pid_t getpid(void); }
-21	AUE_NULL	STD	{ int mount(char *type, char *path, \
+20	AUE_GETPID	STD	{ pid_t getpid(void); }
+21	AUE_MOUNT	STD	{ int mount(char *type, char *path, \
 				    int flags, caddr_t data); }
 ; XXX `path' should have type `const char *' but we're not ready for that.
-22	AUE_NULL	STD	{ int unmount(char *path, int flags); }
-23	AUE_NULL	MSTD	{ int setuid(uid_t uid); }
-24	AUE_NULL	MSTD	{ uid_t getuid(void); }
-25	AUE_NULL	MSTD	{ uid_t geteuid(void); }
-26	AUE_NULL	MSTD	{ int ptrace(int req, pid_t pid, \
+22	AUE_UMOUNT	STD	{ int unmount(char *path, int flags); }
+23	AUE_SETUID	STD	{ int setuid(uid_t uid); }
+24	AUE_GETUID	STD	{ uid_t getuid(void); }
+25	AUE_GETEUID	STD	{ uid_t geteuid(void); }
+26	AUE_PTRACE	STD	{ int ptrace(int req, pid_t pid, \
 				    caddr_t addr, int data); }
-27	AUE_NULL	MSTD	{ int recvmsg(int s, struct msghdr *msg, \
+27	AUE_RECVMSG	STD	{ int recvmsg(int s, struct msghdr *msg, \
 				    int flags); }
-28	AUE_NULL	MSTD	{ int sendmsg(int s, struct msghdr *msg, \
+28	AUE_SENDMSG	STD	{ int sendmsg(int s, struct msghdr *msg, \
 				    int flags); }
-29	AUE_NULL	MSTD	{ int recvfrom(int s, caddr_t buf, \
+29	AUE_RECVFROM	STD	{ int recvfrom(int s, caddr_t buf, \
 				    size_t len, int flags, \
 				    struct sockaddr * __restrict from, \
 				    __socklen_t * __restrict fromlenaddr); }
-30	AUE_NULL	MSTD	{ int accept(int s, \
+30	AUE_ACCEPT	STD	{ int accept(int s, \
 				    struct sockaddr * __restrict name, \
 				    __socklen_t * __restrict anamelen); }
-31	AUE_NULL	MSTD	{ int getpeername(int fdes, \
+31	AUE_GETPEERNAME	STD	{ int getpeername(int fdes, \
 				    struct sockaddr * __restrict asa, \
 				    __socklen_t * __restrict alen); }
-32	AUE_NULL	MSTD	{ int getsockname(int fdes, \
+32	AUE_GETSOCKNAME	STD	{ int getsockname(int fdes, \
 				    struct sockaddr * __restrict asa, \
 				    __socklen_t * __restrict alen); }
-33	AUE_NULL	MSTD	{ int access(char *path, int flags); }
-34	AUE_NULL	MSTD	{ int chflags(char *path, int flags); }
-35	AUE_NULL	MSTD	{ int fchflags(int fd, int flags); }
-36	AUE_NULL	MSTD	{ int sync(void); }
-37	AUE_NULL	MSTD	{ int kill(int pid, int signum); }
-38	AUE_NULL	MCOMPAT	{ int stat(char *path, struct ostat *ub); }
-39	AUE_NULL	MSTD	{ pid_t getppid(void); }
-40	AUE_NULL	MCOMPAT	{ int lstat(char *path, struct ostat *ub); }
-41	AUE_NULL	MSTD	{ int dup(u_int fd); }
-42	AUE_NULL	MSTD	{ int pipe(void); }
-43	AUE_NULL	MSTD	{ gid_t getegid(void); }
-44	AUE_NULL	MSTD	{ int profil(caddr_t samples, size_t size, \
+33	AUE_ACCESS	STD	{ int access(char *path, int flags); }
+34	AUE_CHFLAGS	STD	{ int chflags(char *path, int flags); }
+35	AUE_FCHFLAGS	STD	{ int fchflags(int fd, int flags); }
+36	AUE_SYNC	STD	{ int sync(void); }
+37	AUE_KILL	STD	{ int kill(int pid, int signum); }
+38	AUE_STAT	COMPAT	{ int stat(char *path, struct ostat *ub); }
+39	AUE_GETPPID	STD	{ pid_t getppid(void); }
+40	AUE_LSTAT	COMPAT	{ int lstat(char *path, struct ostat *ub); }
+41	AUE_DUP		STD	{ int dup(u_int fd); }
+42	AUE_PIPE	STD	{ int pipe(void); }
+43	AUE_GETEGID	STD	{ gid_t getegid(void); }
+44	AUE_PROFILE	STD	{ int profil(caddr_t samples, size_t size, \
 				    size_t offset, u_int scale); }
-45	AUE_NULL	MSTD	{ int ktrace(const char *fname, int ops, \
+45	AUE_KTRACE	STD	{ int ktrace(const char *fname, int ops, \
 				    int facs, int pid); }
-46	AUE_NULL	MCOMPAT	{ int sigaction(int signum, \
+46	AUE_SIGACTION	COMPAT	{ int sigaction(int signum, \
 				    struct osigaction *nsa, \
 				    struct osigaction *osa); }
-47	AUE_NULL	MSTD	{ gid_t getgid(void); }
-48	AUE_NULL	MCOMPAT	{ int sigprocmask(int how, osigset_t mask); }
+47	AUE_GETGID	STD	{ gid_t getgid(void); }
+48	AUE_SIGPROCMASK	COMPAT	{ int sigprocmask(int how, osigset_t mask); }
 ; XXX note nonstandard (bogus) calling convention - the libc stub passes
 ; us the mask, not a pointer to it, and we return the old mask as the
 ; (int) return value.
-49	AUE_NULL	MSTD	{ int getlogin(char *namebuf, u_int \
+49	AUE_GETLOGIN	STD	{ int getlogin(char *namebuf, u_int \
 				    namelen); }
-50	AUE_NULL	MSTD	{ int setlogin(char *namebuf); }
-51	AUE_NULL	MSTD	{ int acct(char *path); }
-52	AUE_NULL	MCOMPAT	{ int sigpending(void); }
-53	AUE_NULL	MSTD	{ int sigaltstack(stack_t *ss, \
+50	AUE_SETLOGIN	STD	{ int setlogin(char *namebuf); }
+51	AUE_ACCT	STD	{ int acct(char *path); }
+52	AUE_SIGPENDING	COMPAT	{ int sigpending(void); }
+53	AUE_SIGALTSTACK	STD	{ int sigaltstack(stack_t *ss, \
 				    stack_t *oss); }
-54	AUE_NULL	MSTD	{ int ioctl(int fd, u_long com, \
+54	AUE_IOCTL	STD	{ int ioctl(int fd, u_long com, \
 				    caddr_t data); }
-55	AUE_NULL	MSTD	{ int reboot(int opt); }
-56	AUE_NULL	MSTD	{ int revoke(char *path); }
-57	AUE_NULL	MSTD	{ int symlink(char *path, char *link); }
-58	AUE_NULL	MSTD	{ int readlink(char *path, char *buf, \
+55	AUE_REBOOT	STD	{ int reboot(int opt); }
+56	AUE_REVOKE	STD	{ int revoke(char *path); }
+57	AUE_SYMLINK	STD	{ int symlink(char *path, char *link); }
+58	AUE_READLINK	STD	{ int readlink(char *path, char *buf, \
 				    int count); }
-59	AUE_NULL	MSTD	{ int execve(char *fname, char **argv, \
+59	AUE_EXECVE	STD	{ int execve(char *fname, char **argv, \
 				    char **envv); }
-60	AUE_NULL	MSTD	{ int umask(int newmask); } umask umask_args \
+60	AUE_UMASK	STD	{ int umask(int newmask); } umask umask_args \
 				    int
-61	AUE_NULL	MSTD	{ int chroot(char *path); }
-62	AUE_NULL	MCOMPAT	{ int fstat(int fd, struct ostat *sb); }
-63	AUE_NULL	MCOMPAT	{ int getkerninfo(int op, char *where, \
+61	AUE_CHROOT	STD	{ int chroot(char *path); }
+62	AUE_FSTAT	COMPAT	{ int fstat(int fd, struct ostat *sb); }
+63	AUE_NULL	COMPAT	{ int getkerninfo(int op, char *where, \
 				    size_t *size, int arg); } getkerninfo \
 				    getkerninfo_args int
-64	AUE_NULL	MCOMPAT	{ int getpagesize(void); } getpagesize \
+64	AUE_NULL	COMPAT	{ int getpagesize(void); } getpagesize \
 				    getpagesize_args int
-65	AUE_NULL	MSTD	{ int msync(void *addr, size_t len, \
+65	AUE_MSYNC	STD	{ int msync(void *addr, size_t len, \
 				    int flags); }
-66	AUE_NULL	MSTD	{ int vfork(void); }
+66	AUE_VFORK	STD	{ int vfork(void); }
 67	AUE_NULL	OBSOL	vread
 68	AUE_NULL	OBSOL	vwrite
-69	AUE_NULL	MSTD	{ int sbrk(int incr); }
-70	AUE_NULL	MSTD	{ int sstk(int incr); }
-71	AUE_NULL	MCOMPAT	{ int mmap(void *addr, int len, int prot, \
+69	AUE_SBRK	STD	{ int sbrk(int incr); }
+70	AUE_SSTK	STD	{ int sstk(int incr); }
+71	AUE_MMAP	COMPAT	{ int mmap(void *addr, int len, int prot, \
 				    int flags, int fd, long pos); }
-72	AUE_NULL	MSTD	{ int ovadvise(int anom); } vadvise \
+72	AUE_O_VADVISE	STD	{ int ovadvise(int anom); } vadvise \
 				    ovadvise_args int
-73	AUE_NULL	MSTD	{ int munmap(void *addr, size_t len); }
-74	AUE_NULL	MSTD	{ int mprotect(const void *addr, size_t len, \
+73	AUE_MUNMAP	STD	{ int munmap(void *addr, size_t len); }
+74	AUE_MPROTECT	STD	{ int mprotect(const void *addr, size_t len, \
 				    int prot); }
-75	AUE_NULL	MSTD	{ int madvise(void *addr, size_t len, \
+75	AUE_MADVISE	STD	{ int madvise(void *addr, size_t len, \
 				    int behav); }
 76	AUE_NULL	OBSOL	vhangup
 77	AUE_NULL	OBSOL	vlimit
-78	AUE_NULL	MSTD	{ int mincore(const void *addr, size_t len, \
+78	AUE_MINCORE	STD	{ int mincore(const void *addr, size_t len, \
 				    char *vec); }
-79	AUE_NULL	MSTD	{ int getgroups(u_int gidsetsize, \
+79	AUE_GETGROUPS	STD	{ int getgroups(u_int gidsetsize, \
 				    gid_t *gidset); }
-80	AUE_NULL	MSTD	{ int setgroups(u_int gidsetsize, \
+80	AUE_SETGROUPS	STD	{ int setgroups(u_int gidsetsize, \
 				    gid_t *gidset); }
-81	AUE_NULL	MSTD	{ int getpgrp(void); }
-82	AUE_NULL	MSTD	{ int setpgid(int pid, int pgid); }
-83	AUE_NULL	MSTD	{ int setitimer(u_int which, struct \
+81	AUE_GETPGRP	STD	{ int getpgrp(void); }
+82	AUE_SETPGRP	STD	{ int setpgid(int pid, int pgid); }
+83	AUE_SETITIMER	STD	{ int setitimer(u_int which, struct \
 				    itimerval *itv, struct itimerval *oitv); }
-84	AUE_NULL	MCOMPAT	{ int wait(void); }
-85	AUE_NULL	MSTD	{ int swapon(char *name); }
-86	AUE_NULL	MSTD	{ int getitimer(u_int which, \
+84	AUE_WAIT4	COMPAT	{ int wait(void); }
+85	AUE_SWAPON	STD	{ int swapon(char *name); }
+86	AUE_GETITIMER	STD	{ int getitimer(u_int which, \
 				    struct itimerval *itv); }
-87	AUE_NULL	MCOMPAT	{ int gethostname(char *hostname, \
+87	AUE_SYSCTL	COMPAT	{ int gethostname(char *hostname, \
 				    u_int len); } gethostname \
 				    gethostname_args int
-88	AUE_NULL	MCOMPAT	{ int sethostname(char *hostname, \
+88	AUE_SYSCTL	COMPAT	{ int sethostname(char *hostname, \
 				    u_int len); } sethostname \
 				    sethostname_args int
-89	AUE_NULL	MSTD	{ int getdtablesize(void); }
-90	AUE_NULL	MSTD	{ int dup2(u_int from, u_int to); }
+89	AUE_GETDTABLESIZE	STD	{ int getdtablesize(void); }
+90	AUE_DUP2	STD	{ int dup2(u_int from, u_int to); }
 91	AUE_NULL	UNIMPL	getdopt
-92	AUE_NULL	MSTD	{ int fcntl(int fd, int cmd, long arg); }
+92	AUE_FCNTL	STD	{ int fcntl(int fd, int cmd, long arg); }
 ; XXX should be	{ int fcntl(int fd, int cmd, ...); }
 ; but we're not ready for varargs.
-93	AUE_NULL	MSTD	{ int select(int nd, fd_set *in, fd_set *ou, \
+93	AUE_SELECT	STD	{ int select(int nd, fd_set *in, fd_set *ou, \
 				    fd_set *ex, struct timeval *tv); }
 94	AUE_NULL	UNIMPL	setdopt
-95	AUE_NULL	MSTD	{ int fsync(int fd); }
-96	AUE_NULL	MSTD	{ int setpriority(int which, int who, \
+95	AUE_FSYNC	STD	{ int fsync(int fd); }
+96	AUE_SETPRIORITY	STD	{ int setpriority(int which, int who, \
 				    int prio); }
-97	AUE_NULL	MSTD	{ int socket(int domain, int type, \
+97	AUE_SOCKET	STD	{ int socket(int domain, int type, \
 				    int protocol); }
-98	AUE_NULL	MSTD	{ int connect(int s, caddr_t name, \
+98	AUE_CONNECT	STD	{ int connect(int s, caddr_t name, \
 				    int namelen); }
-99	AUE_NULL	MCPT_NOA { int accept(int s, caddr_t name, \
+99	AUE_ACCEPT	CPT_NOA	{ int accept(int s, caddr_t name, \
 				    int *anamelen); } accept accept_args int
-100	AUE_NULL	MSTD	{ int getpriority(int which, int who); }
-101	AUE_NULL	MCOMPAT	{ int send(int s, caddr_t buf, int len, \
+100	AUE_GETPRIORITY	STD	{ int getpriority(int which, int who); }
+101	AUE_SEND	COMPAT	{ int send(int s, caddr_t buf, int len, \
 				    int flags); }
-102	AUE_NULL	MCOMPAT	{ int recv(int s, caddr_t buf, int len, \
+102	AUE_RECV	COMPAT	{ int recv(int s, caddr_t buf, int len, \
 				    int flags); }
-103	AUE_NULL	MCOMPAT	{ int sigreturn( \
+103	AUE_SIGRETURN	COMPAT	{ int sigreturn( \
 				    struct osigcontext *sigcntxp); }
-104	AUE_NULL	MSTD	{ int bind(int s, caddr_t name, \
+104	AUE_BIND	STD	{ int bind(int s, caddr_t name, \
 				    int namelen); }
-105	AUE_NULL	MSTD	{ int setsockopt(int s, int level, int name, \
+105	AUE_SETSOCKOPT	STD	{ int setsockopt(int s, int level, int name, \
 				    caddr_t val, int valsize); }
-106	AUE_NULL	MSTD	{ int listen(int s, int backlog); }
+106	AUE_LISTEN	STD	{ int listen(int s, int backlog); }
 107	AUE_NULL	OBSOL	vtimes
-108	AUE_NULL	MCOMPAT	{ int sigvec(int signum, struct sigvec *nsv, \
+108	AUE_NULL	COMPAT	{ int sigvec(int signum, struct sigvec *nsv, \
 				    struct sigvec *osv); }
-109	AUE_NULL	MCOMPAT	{ int sigblock(int mask); }
-110	AUE_NULL	MCOMPAT	{ int sigsetmask(int mask); }
-111	AUE_NULL	MCOMPAT	{ int sigsuspend(osigset_t mask); }
+109	AUE_NULL	COMPAT	{ int sigblock(int mask); }
+110	AUE_NULL	COMPAT	{ int sigsetmask(int mask); }
+111	AUE_NULL	COMPAT	{ int sigsuspend(osigset_t mask); }
 ; XXX note nonstandard (bogus) calling convention - the libc stub passes
 ; us the mask, not a pointer to it.
-112	AUE_NULL	MCOMPAT	{ int sigstack(struct sigstack *nss, \
+112	AUE_NULL	COMPAT	{ int sigstack(struct sigstack *nss, \
 				    struct sigstack *oss); }
-113	AUE_NULL	MCOMPAT	{ int recvmsg(int s, struct omsghdr *msg, \
+113	AUE_RECVMSG	COMPAT	{ int recvmsg(int s, struct omsghdr *msg, \
 				    int flags); }
-114	AUE_NULL	MCOMPAT	{ int sendmsg(int s, caddr_t msg, \
+114	AUE_SENDMSG	COMPAT	{ int sendmsg(int s, caddr_t msg, \
 				    int flags); }
 115	AUE_NULL	OBSOL	vtrace
-116	AUE_NULL	MSTD	{ int gettimeofday(struct timeval *tp, \
+116	AUE_GETTIMEOFDAY	STD	{ int gettimeofday(struct timeval *tp, \
 				    struct timezone *tzp); }
-117	AUE_NULL	MSTD	{ int getrusage(int who, \
+117	AUE_GETRUSAGE	STD	{ int getrusage(int who, \
 				    struct rusage *rusage); }
-118	AUE_NULL	MSTD	{ int getsockopt(int s, int level, int name, \
+118	AUE_GETSOCKOPT	STD	{ int getsockopt(int s, int level, int name, \
 				    caddr_t val, int *avalsize); }
 119	AUE_NULL	UNIMPL	resuba (BSD/OS 2.x)
-120	AUE_NULL	MSTD	{ int readv(int fd, struct iovec *iovp, \
+120	AUE_READV	STD	{ int readv(int fd, struct iovec *iovp, \
 				    u_int iovcnt); }
-121	AUE_NULL	MSTD	{ int writev(int fd, struct iovec *iovp, \
+121	AUE_WRITEV	STD	{ int writev(int fd, struct iovec *iovp, \
 				    u_int iovcnt); }
-122	AUE_NULL	MSTD	{ int settimeofday(struct timeval *tv, \
+122	AUE_SETTIMEOFDAY	STD	{ int settimeofday(struct timeval *tv, \
 				    struct timezone *tzp); }
-123	AUE_NULL	MSTD	{ int fchown(int fd, int uid, int gid); }
-124	AUE_NULL	MSTD	{ int fchmod(int fd, int mode); }
-125	AUE_NULL	MCPT_NOA { int recvfrom(int s, caddr_t buf, \
+123	AUE_FCHOWN	STD	{ int fchown(int fd, int uid, int gid); }
+124	AUE_FCHMOD	STD	{ int fchmod(int fd, int mode); }
+125	AUE_RECVFROM	CPT_NOA	{ int recvfrom(int s, caddr_t buf, \
 				    size_t len, int flags, caddr_t from, int \
 				    *fromlenaddr); } recvfrom recvfrom_args \
 				    int
-126	AUE_NULL	MSTD	{ int setreuid(int ruid, int euid); }
-127	AUE_NULL	MSTD	{ int setregid(int rgid, int egid); }
-128	AUE_NULL	MSTD	{ int rename(char *from, char *to); }
-129	AUE_NULL	MCOMPAT	{ int truncate(char *path, long length); }
-130	AUE_NULL	MCOMPAT	{ int ftruncate(int fd, long length); }
-131	AUE_NULL	MSTD	{ int flock(int fd, int how); }
-132	AUE_NULL	MSTD	{ int mkfifo(char *path, int mode); }
-133	AUE_NULL	MSTD	{ int sendto(int s, caddr_t buf, size_t len, \
+126	AUE_SETREUID	STD	{ int setreuid(int ruid, int euid); }
+127	AUE_SETREGID	STD	{ int setregid(int rgid, int egid); }
+128	AUE_RENAME	STD	{ int rename(char *from, char *to); }
+129	AUE_TRUNCATE	COMPAT	{ int truncate(char *path, long length); }
+130	AUE_FTRUNCATE	COMPAT	{ int ftruncate(int fd, long length); }
+131	AUE_FLOCK	STD	{ int flock(int fd, int how); }
+132	AUE_MKFIFO	STD	{ int mkfifo(char *path, int mode); }
+133	AUE_SENDTO	STD	{ int sendto(int s, caddr_t buf, size_t len, \
 				    int flags, caddr_t to, int tolen); }
-134	AUE_NULL	MSTD	{ int shutdown(int s, int how); }
-135	AUE_NULL	MSTD	{ int socketpair(int domain, int type, \
+134	AUE_SHUTDOWN	STD	{ int shutdown(int s, int how); }
+135	AUE_SOCKETPAIR	STD	{ int socketpair(int domain, int type, \
 				    int protocol, int *rsv); }
-136	AUE_NULL	MSTD	{ int mkdir(char *path, int mode); }
-137	AUE_NULL	MSTD	{ int rmdir(char *path); }
-138	AUE_NULL	MSTD	{ int utimes(char *path, \
+136	AUE_MKDIR	STD	{ int mkdir(char *path, int mode); }
+137	AUE_RMDIR	STD	{ int rmdir(char *path); }
+138	AUE_UTIMES	STD	{ int utimes(char *path, \
 				    struct timeval *tptr); }
 139	AUE_NULL	OBSOL	4.2 sigreturn
-140	AUE_NULL	MSTD	{ int adjtime(struct timeval *delta, \
+140	AUE_ADJTIME	STD	{ int adjtime(struct timeval *delta, \
 				    struct timeval *olddelta); }
-141	AUE_NULL	MCOMPAT	{ int getpeername(int fdes, caddr_t asa, \
+141	AUE_GETPEERNAME	COMPAT	{ int getpeername(int fdes, caddr_t asa, \
 				    int *alen); }
-142	AUE_NULL	MCOMPAT	{ long gethostid(void); }
-143	AUE_NULL	MCOMPAT	{ int sethostid(long hostid); }
-144	AUE_NULL	MCOMPAT	{ int getrlimit(u_int which, struct \
+142	AUE_SYSCTL	COMPAT	{ long gethostid(void); }
+143	AUE_SYSCTL	COMPAT	{ int sethostid(long hostid); }
+144	AUE_GETRLIMIT	COMPAT	{ int getrlimit(u_int which, struct \
 				    orlimit *rlp); }
-145	AUE_NULL	MCOMPAT	{ int setrlimit(u_int which, \
+145	AUE_SETRLIMIT	COMPAT	{ int setrlimit(u_int which, \
 				    struct orlimit *rlp); }
-146	AUE_NULL	MCOMPAT	{ int killpg(int pgid, int signum); }
-147	AUE_NULL	MSTD	{ int setsid(void); }
-148	AUE_NULL	MSTD	{ int quotactl(char *path, int cmd, int uid, \
+146	AUE_KILLPG	COMPAT	{ int killpg(int pgid, int signum); }
+147	AUE_SETSID	STD	{ int setsid(void); }
+148	AUE_QUOTACTL	STD	{ int quotactl(char *path, int cmd, int uid, \
 				    caddr_t arg); }
-149	AUE_NULL	MCOMPAT	{ int quota(void); }
-150	AUE_NULL	MCPT_NOA	{ int getsockname(int fdec, \
+149	AUE_O_QUOTA	COMPAT	{ int quota(void); }
+150	AUE_GETSOCKNAME	CPT_NOA	{ int getsockname(int fdec, \
 				    caddr_t asa, int *alen); } getsockname \
 				    getsockname_args int
 
@@ -303,95 +299,96 @@
 153	AUE_NULL	UNIMPL	asyncdaemon (BSD/OS 2.x)
 154	AUE_NULL	UNIMPL	nosys
 ; 155 is initialized by the NFS code, if present.
-155	AUE_NULL	MNOIMPL	{ int nfssvc(int flag, caddr_t argp); }
-156	AUE_NULL	COMPAT	{ int getdirentries(int fd, char *buf, \
+155	AUE_NFS_SVC	NOSTD	{ int nfssvc(int flag, caddr_t argp); }
+156	AUE_GETDIRENTRIES	COMPAT	{ int getdirentries(int fd, char *buf, \
 				    u_int count, long *basep); }
-157	AUE_NULL	MCOMPAT4	{ int statfs(char *path, \
+157	AUE_STATFS	COMPAT4	{ int statfs(char *path, \
 				    struct ostatfs *buf); }
-158	AUE_NULL	MCOMPAT4	{ int fstatfs(int fd, \
+158	AUE_FSTATFS	COMPAT4	{ int fstatfs(int fd, \
 				    struct ostatfs *buf); }
 159	AUE_NULL	UNIMPL	nosys
-160	AUE_NULL	MSTD	{ int lgetfh(char *fname, \
+160	AUE_LGETFH	STD	{ int lgetfh(char *fname, \
 				    struct fhandle *fhp); }
-161	AUE_NULL	MSTD	{ int getfh(char *fname, \
+161	AUE_NFS_GETFH	STD	{ int getfh(char *fname, \
 				    struct fhandle *fhp); }
-162	AUE_NULL	MSTD	{ int getdomainname(char *domainname, \
+162	AUE_SYSCTL	STD	{ int getdomainname(char *domainname, \
 				    int len); }
-163	AUE_NULL	MSTD	{ int setdomainname(char *domainname, \
+163	AUE_SYSCTL	STD	{ int setdomainname(char *domainname, \
 				    int len); }
-164	AUE_NULL	MSTD	{ int uname(struct utsname *name); }
-165	AUE_NULL	MSTD	{ int sysarch(int op, char *parms); }
-166	AUE_NULL	MSTD	{ int rtprio(int function, pid_t pid, \
+164	AUE_NULL	STD	{ int uname(struct utsname *name); }
+165	AUE_SYSARCH	STD	{ int sysarch(int op, char *parms); }
+166	AUE_RTPRIO	STD	{ int rtprio(int function, pid_t pid, \
 				    struct rtprio *rtp); }
 167	AUE_NULL	UNIMPL	nosys
 168	AUE_NULL	UNIMPL	nosys
 ; 169 is initialized by the SYSVSEM code if present or loaded
-169	AUE_NULL	MNOSTD	{ int semsys(int which, int a2, int a3, \
+169	AUE_SEMSYS	NOSTD	{ int semsys(int which, int a2, int a3, \
 				    int a4, int a5); }
 ; 170 is initialized by the SYSVMSG code if present or loaded
 ; XXX should be	{ int semsys(int which, ...); }
-170	AUE_NULL	MNOSTD	{ int msgsys(int which, int a2, int a3, \
+170	AUE_MSGSYS	NOSTD	{ int msgsys(int which, int a2, int a3, \
 				    int a4, int a5, int a6); }
 ; 171 is initialized by the SYSVSHM code if present or loaded
 ; XXX should be	{ int msgsys(int which, ...); }
-171	AUE_NULL	MNOSTD	{ int shmsys(int which, int a2, int a3, \
+171	AUE_SHMSYS	NOSTD	{ int shmsys(int which, int a2, int a3, \
 				    int a4); }
 ; XXX should be	{ int shmsys(int which, ...); }
 172	AUE_NULL	UNIMPL	nosys
-173	AUE_NULL	MSTD	{ ssize_t pread(int fd, void *buf, \
+173	AUE_PREAD	STD	{ ssize_t freebsd6_pread(int fd, void *buf, \
 				    size_t nbyte, int pad, off_t offset); }
-174	AUE_NULL	MSTD	{ ssize_t pwrite(int fd, const void *buf, \
+174	AUE_PWRITE	STD	{ ssize_t freebsd6_pwrite(int fd, \
+				    const void *buf, \
 				    size_t nbyte, int pad, off_t offset); }
 175	AUE_NULL	UNIMPL	nosys
-176	AUE_NULL	MSTD	{ int ntp_adjtime(struct timex *tp); }
+176	AUE_NTP_ADJTIME	STD	{ int ntp_adjtime(struct timex *tp); }
 177	AUE_NULL	UNIMPL	sfork (BSD/OS 2.x)
 178	AUE_NULL	UNIMPL	getdescriptor (BSD/OS 2.x)
 179	AUE_NULL	UNIMPL	setdescriptor (BSD/OS 2.x)
 180	AUE_NULL	UNIMPL	nosys
 
 ; Syscalls 181-199 are used by/reserved for BSD
-181	AUE_NULL	MSTD	{ int setgid(gid_t gid); }
-182	AUE_NULL	MSTD	{ int setegid(gid_t egid); }
-183	AUE_NULL	MSTD	{ int seteuid(uid_t euid); }
+181	AUE_SETGID	STD	{ int setgid(gid_t gid); }
+182	AUE_SETEGID	STD	{ int setegid(gid_t egid); }
+183	AUE_SETEUID	STD	{ int seteuid(uid_t euid); }
 184	AUE_NULL	UNIMPL	lfs_bmapv
 185	AUE_NULL	UNIMPL	lfs_markv
 186	AUE_NULL	UNIMPL	lfs_segclean
 187	AUE_NULL	UNIMPL	lfs_segwait
-188	AUE_NULL	MSTD	{ int stat(char *path, struct stat *ub); }
-189	AUE_NULL	MSTD	{ int fstat(int fd, struct stat *sb); }
-190	AUE_NULL	MSTD	{ int lstat(char *path, struct stat *ub); }
-191	AUE_NULL	MSTD	{ int pathconf(char *path, int name); }
-192	AUE_NULL	MSTD	{ int fpathconf(int fd, int name); }
+188	AUE_STAT	STD	{ int stat(char *path, struct stat *ub); }
+189	AUE_FSTAT	STD	{ int fstat(int fd, struct stat *sb); }
+190	AUE_LSTAT	STD	{ int lstat(char *path, struct stat *ub); }
+191	AUE_PATHCONF	STD	{ int pathconf(char *path, int name); }
+192	AUE_FPATHCONF	STD	{ int fpathconf(int fd, int name); }
 193	AUE_NULL	UNIMPL	nosys
-194	AUE_NULL	MSTD	{ int getrlimit(u_int which, \
+194	AUE_GETRLIMIT	STD	{ int getrlimit(u_int which, \
 				    struct rlimit *rlp); } getrlimit \
 				    __getrlimit_args int
-195	AUE_NULL	MSTD	{ int setrlimit(u_int which, \
+195	AUE_SETRLIMIT	STD	{ int setrlimit(u_int which, \
 				    struct rlimit *rlp); } setrlimit \
 				    __setrlimit_args int
-196	AUE_NULL	MSTD	{ int getdirentries(int fd, char *buf, \
+196	AUE_GETDIRENTRIES	STD	{ int getdirentries(int fd, char *buf, \
 				    u_int count, long *basep); }
-197	AUE_NULL	MSTD	{ caddr_t mmap(caddr_t addr, size_t len, \
-				    int prot, int flags, int fd, int pad, \
-				    off_t pos); }
-198	AUE_NULL	MSTD	{ int nosys(void); } __syscall \
+197	AUE_MMAP	STD	{ caddr_t freebsd6_mmap(caddr_t addr, \
+				    size_t len, int prot, int flags, int fd, \
+				    int pad, off_t pos); }
+198	AUE_NULL	STD	{ int nosys(void); } __syscall \
 				    __syscall_args int
-199	AUE_NULL	MSTD	{ off_t lseek(int fd, int pad, off_t offset, \
-				    int whence); }
-200	AUE_NULL	MSTD	{ int truncate(char *path, int pad, \
+199	AUE_LSEEK	STD	{ off_t freebsd6_lseek(int fd, int pad, \
+				    off_t offset, int whence); }
+200	AUE_TRUNCATE	STD	{ int freebsd6_truncate(char *path, int pad, \
 				    off_t length); }
-201	AUE_NULL	MSTD	{ int ftruncate(int fd, int pad, \
+201	AUE_FTRUNCATE	STD	{ int freebsd6_ftruncate(int fd, int pad, \
 				    off_t length); }
-202	AUE_NULL	MSTD	{ int __sysctl(int *name, u_int namelen, \
+202	AUE_SYSCTL	STD	{ int __sysctl(int *name, u_int namelen, \
 				    void *old, size_t *oldlenp, void *new, \
 				    size_t newlen); } __sysctl sysctl_args int
-203	AUE_NULL	MSTD	{ int mlock(const void *addr, size_t len); }
-204	AUE_NULL	MSTD	{ int munlock(const void *addr, size_t len); }
-205	AUE_NULL	MSTD	{ int undelete(char *path); }
-206	AUE_NULL	MSTD	{ int futimes(int fd, struct timeval *tptr); }
-207	AUE_NULL	MSTD	{ int getpgid(pid_t pid); }
+203	AUE_MLOCK	STD	{ int mlock(const void *addr, size_t len); }
+204	AUE_MUNLOCK	STD	{ int munlock(const void *addr, size_t len); }
+205	AUE_UNDELETE	STD	{ int undelete(char *path); }
+206	AUE_FUTIMES	STD	{ int futimes(int fd, struct timeval *tptr); }
+207	AUE_GETPGID	STD	{ int getpgid(pid_t pid); }
 208	AUE_NULL	UNIMPL	newreboot (NetBSD)
-209	AUE_NULL	MSTD	{ int poll(struct pollfd *fds, u_int nfds, \
+209	AUE_POLL	STD	{ int poll(struct pollfd *fds, u_int nfds, \
 				    int timeout); }
 
 ;
@@ -410,41 +407,45 @@
 
 ;
 ; The following were introduced with NetBSD/4.4Lite-2
-; They are initialized by their respective modules/sysinits
-220	AUE_NULL	MNOSTD	{ int __semctl(int semid, int semnum, \
+220	AUE_SEMCTL	NOSTD	{ int __semctl(int semid, int semnum, \
 				    int cmd, union semun *arg); }
-221	AUE_NULL	MNOSTD	{ int semget(key_t key, int nsems, \
+221	AUE_SEMGET	NOSTD	{ int semget(key_t key, int nsems, \
 				    int semflg); }
-222	AUE_NULL	MNOSTD	{ int semop(int semid, struct sembuf *sops, \
+222	AUE_SEMOP	NOSTD	{ int semop(int semid, struct sembuf *sops, \
 				    size_t nsops); }
 223	AUE_NULL	UNIMPL	semconfig
-224	AUE_NULL	MNOSTD	{ int msgctl(int msqid, int cmd, \
+224	AUE_MSGCTL	NOSTD	{ int msgctl(int msqid, int cmd, \
 				    struct msqid_ds *buf); }
-225	AUE_NULL	MNOSTD	{ int msgget(key_t key, int msgflg); }
-226	AUE_NULL	MNOSTD	{ int msgsnd(int msqid, const void *msgp, \
+225	AUE_MSGGET	NOSTD	{ int msgget(key_t key, int msgflg); }
+226	AUE_MSGSND	NOSTD	{ int msgsnd(int msqid, const void *msgp, \
 				    size_t msgsz, int msgflg); }
-227	AUE_NULL	MNOSTD	{ int msgrcv(int msqid, void *msgp, \
+227	AUE_MSGRCV	NOSTD	{ int msgrcv(int msqid, void *msgp, \
 				    size_t msgsz, long msgtyp, int msgflg); }
-228	AUE_NULL	MNOSTD	{ int shmat(int shmid, const void *shmaddr, \
+228	AUE_SHMAT	NOSTD	{ int shmat(int shmid, const void *shmaddr, \
 				    int shmflg); }
-229	AUE_NULL	MNOSTD	{ int shmctl(int shmid, int cmd, \
+229	AUE_SHMCTL	NOSTD	{ int shmctl(int shmid, int cmd, \
 				    struct shmid_ds *buf); }
-230	AUE_NULL	MNOSTD	{ int shmdt(const void *shmaddr); }
-231	AUE_NULL	MNOSTD	{ int shmget(key_t key, size_t size, \
+230	AUE_SHMDT	NOSTD	{ int shmdt(const void *shmaddr); }
+231	AUE_SHMGET	NOSTD	{ int shmget(key_t key, size_t size, \
 				    int shmflg); }
 ;
-232	AUE_NULL	MSTD	{ int clock_gettime(clockid_t clock_id, \
+232	AUE_NULL	STD	{ int clock_gettime(clockid_t clock_id, \
 				    struct timespec *tp); }
-233	AUE_NULL	MSTD	{ int clock_settime(clockid_t clock_id, \
+233	AUE_CLOCK_SETTIME	STD	{ int clock_settime( \
+				    clockid_t clock_id, \
 				    const struct timespec *tp); }
-234	AUE_NULL	MSTD	{ int clock_getres(clockid_t clock_id, \
+234	AUE_NULL	STD	{ int clock_getres(clockid_t clock_id, \
 				    struct timespec *tp); }
-235	AUE_NULL	UNIMPL	timer_create
-236	AUE_NULL	UNIMPL	timer_delete
-237	AUE_NULL	UNIMPL	timer_settime
-238	AUE_NULL	UNIMPL	timer_gettime
-239	AUE_NULL	UNIMPL	timer_getoverrun
-240	AUE_NULL	MSTD	{ int nanosleep(const struct timespec *rqtp, \
+235	AUE_NULL	STD	{ int ktimer_create(clockid_t clock_id, \
+				    struct sigevent *evp, int *timerid); }
+236	AUE_NULL	STD	{ int ktimer_delete(int timerid); }
+237	AUE_NULL	STD	{ int ktimer_settime(int timerid, int flags, \
+				    const struct itimerspec *value, \
+				    struct itimerspec *ovalue); }
+238	AUE_NULL	STD	{ int ktimer_gettime(int timerid, struct \
+				    itimerspec *value); }
+239	AUE_NULL	STD	{ int ktimer_getoverrun(int timerid); }
+240	AUE_NULL	STD	{ int nanosleep(const struct timespec *rqtp, \
 				    struct timespec *rmtp); }
 241	AUE_NULL	UNIMPL	nosys
 242	AUE_NULL	UNIMPL	nosys
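Slots 235-239 above stop being UNIMPL placeholders and become the ktimer_* family, the kernel side of POSIX per-process timers. Below is a small userland sketch of the interface those syscalls ultimately back; it uses only the standard timer_create()/timer_settime() wrappers, so nothing in it is specific to this patch beyond the fact that the kernel can now service it. Depending on the release the POSIX-named wrappers may live in librt, so -lrt may be needed at link time.

/*
 * Userland sketch of the POSIX timer API now backed by the ktimer_*
 * syscalls (slots 235-239): arm a one-shot two-second timer that
 * delivers SIGALRM, then wait for it.
 */
#include <signal.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static void
on_alarm(int sig)
{
	(void)sig;			/* nothing to do; pause() will return */
}

int
main(void)
{
	struct sigevent sev = { 0 };
	struct itimerspec its = { 0 };
	timer_t tid;

	signal(SIGALRM, on_alarm);
	sev.sigev_notify = SIGEV_SIGNAL;
	sev.sigev_signo = SIGALRM;
	if (timer_create(CLOCK_REALTIME, &sev, &tid) != 0) {	/* -> ktimer_create */
		perror("timer_create");
		return (1);
	}
	its.it_value.tv_sec = 2;		/* one shot, fires in two seconds */
	if (timer_settime(tid, 0, &its, NULL) != 0) {		/* -> ktimer_settime */
		perror("timer_settime");
		return (1);
	}
	pause();
	timer_delete(tid);			/* -> ktimer_delete */
	return (0);
}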
@@ -453,19 +454,21 @@
 245	AUE_NULL	UNIMPL	nosys
 246	AUE_NULL	UNIMPL	nosys
 247	AUE_NULL	UNIMPL	nosys
-248	AUE_NULL	MSTD	{ int ntp_gettime(struct ntptimeval *ntvp); }
+248	AUE_NULL	STD	{ int ntp_gettime(struct ntptimeval *ntvp); }
 249	AUE_NULL	UNIMPL	nosys
 ; syscall numbers initially used in OpenBSD
-250	AUE_NULL	MSTD	{ int minherit(void *addr, size_t len, \
+250	AUE_MINHERIT	STD	{ int minherit(void *addr, size_t len, \
 				    int inherit); }
-251	AUE_NULL	MSTD	{ int rfork(int flags); }
-252	AUE_NULL	MSTD	{ int openbsd_poll(struct pollfd *fds, \
+251	AUE_RFORK	STD	{ int rfork(int flags); }
+252	AUE_POLL	STD	{ int openbsd_poll(struct pollfd *fds, \
 				    u_int nfds, int timeout); }
-253	AUE_NULL	MSTD	{ int issetugid(void); }
-254	AUE_NULL	MSTD	{ int lchown(char *path, int uid, int gid); }
-255	AUE_NULL	UNIMPL	nosys
-256	AUE_NULL	UNIMPL	nosys
-257	AUE_NULL	UNIMPL	nosys
+253	AUE_ISSETUGID	STD	{ int issetugid(void); }
+254	AUE_LCHOWN	STD	{ int lchown(char *path, int uid, int gid); }
+255	AUE_NULL	NOSTD	{ int aio_read(struct aiocb *aiocbp); }
+256	AUE_NULL	NOSTD	{ int aio_write(struct aiocb *aiocbp); }
+257	AUE_NULL	NOSTD	{ int lio_listio(int mode, \
+				    struct aiocb * const *acb_list, \
+				    int nent, struct sigevent *sig); }
 258	AUE_NULL	UNIMPL	nosys
 259	AUE_NULL	UNIMPL	nosys
 260	AUE_NULL	UNIMPL	nosys
@@ -480,20 +483,20 @@
 269	AUE_NULL	UNIMPL	nosys
 270	AUE_NULL	UNIMPL	nosys
 271	AUE_NULL	UNIMPL	nosys
-272	AUE_NULL	MSTD	{ int getdents(int fd, char *buf, \
+272	AUE_O_GETDENTS	STD	{ int getdents(int fd, char *buf, \
 				    size_t count); }
 273	AUE_NULL	UNIMPL	nosys
-274	AUE_NULL	MSTD	{ int lchmod(char *path, mode_t mode); }
-275	AUE_NULL	MNOPROTO { int lchown(char *path, uid_t uid, \
+274	AUE_LCHMOD	STD	{ int lchmod(char *path, mode_t mode); }
+275	AUE_LCHOWN	NOPROTO	{ int lchown(char *path, uid_t uid, \
 				    gid_t gid); } netbsd_lchown lchown_args \
 				    int
-276	AUE_NULL	MSTD	{ int lutimes(char *path, \
+276	AUE_LUTIMES	STD	{ int lutimes(char *path, \
 				    struct timeval *tptr); }
-277	AUE_NULL	MNOPROTO	{ int msync(void *addr, size_t len, \
+277	AUE_MSYNC	NOPROTO	{ int msync(void *addr, size_t len, \
 				    int flags); } netbsd_msync msync_args int
-278	AUE_NULL	MSTD	{ int nstat(char *path, struct nstat *ub); }
-279	AUE_NULL	MSTD	{ int nfstat(int fd, struct nstat *sb); }
-280	AUE_NULL	MSTD	{ int nlstat(char *path, struct nstat *ub); }
+278	AUE_STAT	STD	{ int nstat(char *path, struct nstat *ub); }
+279	AUE_FSTAT	STD	{ int nfstat(int fd, struct nstat *sb); }
+280	AUE_LSTAT	STD	{ int nlstat(char *path, struct nstat *ub); }
 281	AUE_NULL	UNIMPL	nosys
 282	AUE_NULL	UNIMPL	nosys
 283	AUE_NULL	UNIMPL	nosys
@@ -503,9 +506,9 @@
 287	AUE_NULL	UNIMPL	nosys
 288	AUE_NULL	UNIMPL	nosys
 ; 289 and 290 from NetBSD (OpenBSD: 267 and 268)
-289	AUE_NULL	MSTD	{ ssize_t preadv(int fd, struct iovec *iovp, \
+289	AUE_PREADV	STD	{ ssize_t preadv(int fd, struct iovec *iovp, \
 					u_int iovcnt, off_t offset); }
-290	AUE_NULL	MSTD	{ ssize_t pwritev(int fd, struct iovec *iovp, \
+290	AUE_PWRITEV	STD	{ ssize_t pwritev(int fd, struct iovec *iovp, \
 					u_int iovcnt, off_t offset); }
 291	AUE_NULL	UNIMPL	nosys
 292	AUE_NULL	UNIMPL	nosys
@@ -514,30 +517,30 @@
 295	AUE_NULL	UNIMPL	nosys
 296	AUE_NULL	UNIMPL	nosys
 ; XXX 297 is 300 in NetBSD 
-297	AUE_NULL	MCOMPAT4	{ int fhstatfs( \
+297	AUE_FHSTATFS	COMPAT4	{ int fhstatfs( \
 				    const struct fhandle *u_fhp, \
 				    struct ostatfs *buf); }
-298	AUE_NULL	MSTD	{ int fhopen(const struct fhandle *u_fhp, \
+298	AUE_FHOPEN	STD	{ int fhopen(const struct fhandle *u_fhp, \
 				    int flags); }
-299	AUE_NULL	MSTD	{ int fhstat(const struct fhandle *u_fhp, \
+299	AUE_FHSTAT	STD	{ int fhstat(const struct fhandle *u_fhp, \
 				    struct stat *sb); }
 ; syscall numbers for FreeBSD
-300	AUE_NULL	MSTD	{ int modnext(int modid); }
-301	AUE_NULL	MSTD	{ int modstat(int modid, \
+300	AUE_NULL	STD	{ int modnext(int modid); }
+301	AUE_NULL	STD	{ int modstat(int modid, \
 				    struct module_stat *stat); }
-302	AUE_NULL	MSTD	{ int modfnext(int modid); }
-303	AUE_NULL	MSTD	{ int modfind(const char *name); }
-304	AUE_NULL	MSTD	{ int kldload(const char *file); }
-305	AUE_NULL	MSTD	{ int kldunload(int fileid); }
-306	AUE_NULL	MSTD	{ int kldfind(const char *file); }
-307	AUE_NULL	MSTD	{ int kldnext(int fileid); }
-308	AUE_NULL	MSTD	{ int kldstat(int fileid, struct \
+302	AUE_NULL	STD	{ int modfnext(int modid); }
+303	AUE_NULL	STD	{ int modfind(const char *name); }
+304	AUE_MODLOAD	STD	{ int kldload(const char *file); }
+305	AUE_MODUNLOAD	STD	{ int kldunload(int fileid); }
+306	AUE_NULL	STD	{ int kldfind(const char *file); }
+307	AUE_NULL	STD	{ int kldnext(int fileid); }
+308	AUE_NULL	STD	{ int kldstat(int fileid, struct \
 				    kld_file_stat* stat); }
-309	AUE_NULL	MSTD	{ int kldfirstmod(int fileid); }
-310	AUE_NULL	MSTD	{ int getsid(pid_t pid); }
-311	AUE_NULL	MSTD	{ int setresuid(uid_t ruid, uid_t euid, \
+309	AUE_NULL	STD	{ int kldfirstmod(int fileid); }
+310	AUE_GETSID	STD	{ int getsid(pid_t pid); }
+311	AUE_SETRESUID	STD	{ int setresuid(uid_t ruid, uid_t euid, \
 				    uid_t suid); }
-312	AUE_NULL	MSTD	{ int setresgid(gid_t rgid, gid_t egid, \
+312	AUE_SETRESGID	STD	{ int setresgid(gid_t rgid, gid_t egid, \
 				    gid_t sgid); }
 313	AUE_NULL	OBSOL	signanosleep
 314	AUE_NULL	NOSTD	{ int aio_return(struct aiocb *aiocbp); }
@@ -547,93 +550,95 @@
 316	AUE_NULL	NOSTD	{ int aio_cancel(int fd, \
 				    struct aiocb *aiocbp); }
 317	AUE_NULL	NOSTD	{ int aio_error(struct aiocb *aiocbp); }
-318	AUE_NULL	NOSTD	{ int aio_read(struct aiocb *aiocbp); }
-319	AUE_NULL	NOSTD	{ int aio_write(struct aiocb *aiocbp); }
-320	AUE_NULL	NOSTD	{ int lio_listio(int mode, \
-				    struct aiocb * const *acb_list, \
-				    int nent, struct sigevent *sig); }
-321	AUE_NULL	MSTD	{ int yield(void); }
+318	AUE_NULL	NOSTD	{ int oaio_read(struct oaiocb *aiocbp); }
+319	AUE_NULL	NOSTD	{ int oaio_write(struct oaiocb *aiocbp); }
+320	AUE_NULL	NOSTD	{ int olio_listio(int mode, \
+				    struct oaiocb * const *acb_list, \
+				    int nent, struct osigevent *sig); }
+321	AUE_NULL	STD	{ int yield(void); }
 322	AUE_NULL	OBSOL	thr_sleep
 323	AUE_NULL	OBSOL	thr_wakeup
-324	AUE_NULL	MSTD	{ int mlockall(int how); }
-325	AUE_NULL	MSTD	{ int munlockall(void); }
-326	AUE_NULL	MSTD	{ int __getcwd(u_char *buf, u_int buflen); }
+324	AUE_MLOCKALL	STD	{ int mlockall(int how); }
+325	AUE_MUNLOCKALL	STD	{ int munlockall(void); }
+326	AUE_GETCWD	STD	{ int __getcwd(u_char *buf, u_int buflen); }
 
-327	AUE_NULL	MSTD	{ int sched_setparam (pid_t pid, \
+327	AUE_NULL	STD	{ int sched_setparam (pid_t pid, \
 				    const struct sched_param *param); }
-328	AUE_NULL	MSTD	{ int sched_getparam (pid_t pid, struct \
+328	AUE_NULL	STD	{ int sched_getparam (pid_t pid, struct \
 				    sched_param *param); }
 
-329	AUE_NULL	MSTD	{ int sched_setscheduler (pid_t pid, int \
+329	AUE_NULL	STD	{ int sched_setscheduler (pid_t pid, int \
 				    policy, const struct sched_param \
 				    *param); }
-330	AUE_NULL	MSTD	{ int sched_getscheduler (pid_t pid); }
+330	AUE_NULL	STD	{ int sched_getscheduler (pid_t pid); }
 
-331	AUE_NULL	MSTD	{ int sched_yield (void); }
-332	AUE_NULL	MSTD	{ int sched_get_priority_max (int policy); }
-333	AUE_NULL	MSTD	{ int sched_get_priority_min (int policy); }
-334	AUE_NULL	MSTD	{ int sched_rr_get_interval (pid_t pid, \
+331	AUE_NULL	STD	{ int sched_yield (void); }
+332	AUE_NULL	STD	{ int sched_get_priority_max (int policy); }
+333	AUE_NULL	STD	{ int sched_get_priority_min (int policy); }
+334	AUE_NULL	STD	{ int sched_rr_get_interval (pid_t pid, \
 				    struct timespec *interval); }
-335	AUE_NULL	MSTD	{ int utrace(const void *addr, size_t len); }
-336	AUE_NULL	MCOMPAT4	{ int sendfile(int fd, int s, \
+335	AUE_NULL	STD	{ int utrace(const void *addr, size_t len); }
+336	AUE_SENDFILE	COMPAT4	{ int sendfile(int fd, int s, \
 				    off_t offset, size_t nbytes, \
 				    struct sf_hdtr *hdtr, off_t *sbytes, \
 				    int flags); }
-337	AUE_NULL	MSTD	{ int kldsym(int fileid, int cmd, \
+337	AUE_NULL	STD	{ int kldsym(int fileid, int cmd, \
 				    void *data); }
-338	AUE_NULL	MSTD	{ int jail(struct jail *jail); }
+338	AUE_JAIL	STD	{ int jail(struct jail *jail); }
 339	AUE_NULL	UNIMPL	pioctl
-340	AUE_NULL	MSTD	{ int sigprocmask(int how, \
+340	AUE_SIGPROCMASK	STD	{ int sigprocmask(int how, \
 				    const sigset_t *set, sigset_t *oset); }
-341	AUE_NULL	MSTD	{ int sigsuspend(const sigset_t *sigmask); }
-342	AUE_NULL	MCOMPAT4	{ int sigaction(int sig, const \
+341	AUE_SIGSUSPEND	STD	{ int sigsuspend(const sigset_t *sigmask); }
+342	AUE_SIGACTION	COMPAT4	{ int sigaction(int sig, const \
 				    struct sigaction *act, \
 				    struct sigaction *oact); }
-343	AUE_NULL	MSTD	{ int sigpending(sigset_t *set); }
-344	AUE_NULL	MCOMPAT4	{ int sigreturn( \
+343	AUE_SIGPENDING	STD	{ int sigpending(sigset_t *set); }
+344	AUE_SIGRETURN	COMPAT4	{ int sigreturn( \
 				    const struct ucontext4 *sigcntxp); }
-345	AUE_NULL	MSTD	{ int sigtimedwait(const sigset_t *set, \
+345	AUE_SIGWAIT	STD	{ int sigtimedwait(const sigset_t *set, \
 				    siginfo_t *info, \
 				    const struct timespec *timeout); }
-346	AUE_NULL	MSTD	{ int sigwaitinfo(const sigset_t *set, \
+346	AUE_NULL	STD	{ int sigwaitinfo(const sigset_t *set, \
 				    siginfo_t *info); }
-347	AUE_NULL	MSTD	{ int __acl_get_file(const char *path, \
+347	AUE_NULL	STD	{ int __acl_get_file(const char *path, \
 				    acl_type_t type, struct acl *aclp); }
-348	AUE_NULL	MSTD	{ int __acl_set_file(const char *path, \
+348	AUE_NULL	STD	{ int __acl_set_file(const char *path, \
 				    acl_type_t type, struct acl *aclp); }
-349	AUE_NULL	MSTD	{ int __acl_get_fd(int filedes, \
+349	AUE_NULL	STD	{ int __acl_get_fd(int filedes, \
 				    acl_type_t type, struct acl *aclp); }
-350	AUE_NULL	MSTD	{ int __acl_set_fd(int filedes, \
+350	AUE_NULL	STD	{ int __acl_set_fd(int filedes, \
 				    acl_type_t type, struct acl *aclp); }
-351	AUE_NULL	MSTD	{ int __acl_delete_file(const char *path, \
+351	AUE_NULL	STD	{ int __acl_delete_file(const char *path, \
 				    acl_type_t type); }
-352	AUE_NULL	MSTD	{ int __acl_delete_fd(int filedes, \
+352	AUE_NULL	STD	{ int __acl_delete_fd(int filedes, \
 				    acl_type_t type); }
-353	AUE_NULL	MSTD	{ int __acl_aclcheck_file(const char *path, \
+353	AUE_NULL	STD	{ int __acl_aclcheck_file(const char *path, \
 				    acl_type_t type, struct acl *aclp); }
-354	AUE_NULL	MSTD	{ int __acl_aclcheck_fd(int filedes, \
+354	AUE_NULL	STD	{ int __acl_aclcheck_fd(int filedes, \
 				    acl_type_t type, struct acl *aclp); }
-355	AUE_NULL	MSTD	{ int extattrctl(const char *path, int cmd, \
+355	AUE_EXTATTRCTL	STD	{ int extattrctl(const char *path, int cmd, \
 				    const char *filename, int attrnamespace, \
 				    const char *attrname); }
-356	AUE_NULL	MSTD	{ int extattr_set_file(const char *path, \
-				    int attrnamespace, const char *attrname, \
-				    void *data, size_t nbytes); }
-357	AUE_NULL	MSTD	{ ssize_t extattr_get_file(const char *path, \
-				    int attrnamespace, const char *attrname, \
-				    void *data, size_t nbytes); }
-358	AUE_NULL	MSTD	{ int extattr_delete_file(const char *path, \
+356	AUE_EXTATTR_SET_FILE	STD	{ int extattr_set_file( \
+				    const char *path, int attrnamespace, \
+				    const char *attrname, void *data, \
+				    size_t nbytes); }
+357	AUE_EXTATTR_GET_FILE	STD	{ ssize_t extattr_get_file( \
+				    const char *path, int attrnamespace, \
+				    const char *attrname, void *data, \
+				    size_t nbytes); }
+358	AUE_EXTATTR_DELETE_FILE	STD	{ int extattr_delete_file(const char *path, \
 				    int attrnamespace, \
 				    const char *attrname); }
 359	AUE_NULL	NOSTD	{ int aio_waitcomplete( \
 				    struct aiocb **aiocbp, \
 				    struct timespec *timeout); }
-360	AUE_NULL	MSTD	{ int getresuid(uid_t *ruid, uid_t *euid, \
+360	AUE_GETRESUID	STD	{ int getresuid(uid_t *ruid, uid_t *euid, \
 				    uid_t *suid); }
-361	AUE_NULL	MSTD	{ int getresgid(gid_t *rgid, gid_t *egid, \
+361	AUE_GETRESGID	STD	{ int getresgid(gid_t *rgid, gid_t *egid, \
 				    gid_t *sgid); }
-362	AUE_NULL	MSTD	{ int kqueue(void); }
-363	AUE_NULL	MSTD	{ int kevent(int fd, \
+362	AUE_KQUEUE	STD	{ int kqueue(void); }
+363	AUE_NULL	STD	{ int kevent(int fd, \
 				    struct kevent *changelist, int nchanges, \
 				    struct kevent *eventlist, int nevents, \
 				    const struct timespec *timeout); }
@@ -644,155 +649,203 @@
 368	AUE_NULL	UNIMPL	__cap_set_fd
 369	AUE_NULL	UNIMPL	__cap_set_file
 370	AUE_NULL	NODEF	lkmressys lkmressys nosys_args int
-371	AUE_NULL	MSTD	{ int extattr_set_fd(int fd, \
+371	AUE_EXTATTR_SET_FD	STD	{ int extattr_set_fd(int fd, \
 				    int attrnamespace, const char *attrname, \
 				    void *data, size_t nbytes); }
-372	AUE_NULL	MSTD	{ ssize_t extattr_get_fd(int fd, \
+372	AUE_EXTATTR_GET_FD	STD	{ ssize_t extattr_get_fd(int fd, \
 				    int attrnamespace, const char *attrname, \
 				    void *data, size_t nbytes); }
-373	AUE_NULL	MSTD	{ int extattr_delete_fd(int fd, \
+373	AUE_EXTATTR_DELETE_FD	STD	{ int extattr_delete_fd(int fd, \
 				    int attrnamespace, \
 				    const char *attrname); }
-374	AUE_NULL	MSTD	{ int __setugid(int flag); }
+374	AUE_NULL	STD	{ int __setugid(int flag); }
 375	AUE_NULL	NOIMPL	{ int nfsclnt(int flag, caddr_t argp); }
-376	AUE_NULL	MSTD	{ int eaccess(char *path, int flags); }
+376	AUE_EACCESS	STD	{ int eaccess(char *path, int flags); }
 377	AUE_NULL	UNIMPL	afs_syscall
-378	AUE_NULL	STD	{ int nmount(struct iovec *iovp, \
+378	AUE_NMOUNT	STD	{ int nmount(struct iovec *iovp, \
 				    unsigned int iovcnt, int flags); }
-379	AUE_NULL	MSTD	{ int kse_exit(void); }
-380	AUE_NULL	MSTD	{ int kse_wakeup(struct kse_mailbox *mbx); }
-381	AUE_NULL	MSTD	{ int kse_create(struct kse_mailbox *mbx, \
+379	AUE_NULL	STD	{ int kse_exit(void); }
+380	AUE_NULL	STD	{ int kse_wakeup(struct kse_mailbox *mbx); }
+381	AUE_NULL	STD	{ int kse_create(struct kse_mailbox *mbx, \
 				    int newgroup); }
-382	AUE_NULL	MSTD	{ int kse_thr_interrupt( \
+382	AUE_NULL	STD	{ int kse_thr_interrupt( \
 				    struct kse_thr_mailbox *tmbx, int cmd, \
 				    long data); }
-383	AUE_NULL	MSTD	{ int kse_release(struct timespec *timeout); }
-384	AUE_NULL	MSTD	{ int __mac_get_proc(struct mac *mac_p); }
-385	AUE_NULL	MSTD	{ int __mac_set_proc(struct mac *mac_p); }
-386	AUE_NULL	MSTD	{ int __mac_get_fd(int fd, \
+383	AUE_NULL	STD	{ int kse_release(struct timespec *timeout); }
+384	AUE_NULL	STD	{ int __mac_get_proc(struct mac *mac_p); }
+385	AUE_NULL	STD	{ int __mac_set_proc(struct mac *mac_p); }
+386	AUE_NULL	STD	{ int __mac_get_fd(int fd, \
 				    struct mac *mac_p); }
-387	AUE_NULL	MSTD	{ int __mac_get_file(const char *path_p, \
+387	AUE_NULL	STD	{ int __mac_get_file(const char *path_p, \
 				    struct mac *mac_p); }
-388	AUE_NULL	MSTD	{ int __mac_set_fd(int fd, \
+388	AUE_NULL	STD	{ int __mac_set_fd(int fd, \
 				    struct mac *mac_p); }
-389	AUE_NULL	MSTD	{ int __mac_set_file(const char *path_p, \
+389	AUE_NULL	STD	{ int __mac_set_file(const char *path_p, \
 				    struct mac *mac_p); }
-390	AUE_NULL	MSTD	{ int kenv(int what, const char *name, \
+390	AUE_NULL	STD	{ int kenv(int what, const char *name, \
 				    char *value, int len); }
-391	AUE_NULL	MSTD	{ int lchflags(const char *path, int flags); }
-392	AUE_NULL	MSTD	{ int uuidgen(struct uuid *store, \
+391	AUE_LCHFLAGS	STD	{ int lchflags(const char *path, int flags); }
+392	AUE_NULL	STD	{ int uuidgen(struct uuid *store, \
 				    int count); }
-393	AUE_NULL	MSTD	{ int sendfile(int fd, int s, off_t offset, \
+393	AUE_SENDFILE	STD	{ int sendfile(int fd, int s, off_t offset, \
 				    size_t nbytes, struct sf_hdtr *hdtr, \
 				    off_t *sbytes, int flags); }
-394	AUE_NULL	MSTD	{ int mac_syscall(const char *policy, \
+394	AUE_NULL	STD	{ int mac_syscall(const char *policy, \
 				    int call, void *arg); }
-395	AUE_NULL	MSTD	{ int getfsstat(struct statfs *buf, \
+395	AUE_GETFSSTAT	STD	{ int getfsstat(struct statfs *buf, \
 				    long bufsize, int flags); }
-396	AUE_NULL	MSTD	{ int statfs(char *path, \
+396	AUE_STATFS	STD	{ int statfs(char *path, \
 				    struct statfs *buf); }
-397	AUE_NULL	MSTD	{ int fstatfs(int fd, struct statfs *buf); }
-398	AUE_NULL	MSTD	{ int fhstatfs(const struct fhandle *u_fhp, \
+397	AUE_FSTATFS	STD	{ int fstatfs(int fd, struct statfs *buf); }
+398	AUE_FHSTATFS	STD	{ int fhstatfs(const struct fhandle *u_fhp, \
 				    struct statfs *buf); }
 399	AUE_NULL	UNIMPL	nosys
-400	AUE_NULL	MNOSTD	{ int ksem_close(semid_t id); }
-401	AUE_NULL	MNOSTD	{ int ksem_post(semid_t id); }
-402	AUE_NULL	MNOSTD	{ int ksem_wait(semid_t id); }
-403	AUE_NULL	MNOSTD	{ int ksem_trywait(semid_t id); }
-404	AUE_NULL	MNOSTD	{ int ksem_init(semid_t *idp, \
+400	AUE_NULL	NOSTD	{ int ksem_close(semid_t id); }
+401	AUE_NULL	NOSTD	{ int ksem_post(semid_t id); }
+402	AUE_NULL	NOSTD	{ int ksem_wait(semid_t id); }
+403	AUE_NULL	NOSTD	{ int ksem_trywait(semid_t id); }
+404	AUE_NULL	NOSTD	{ int ksem_init(semid_t *idp, \
 				    unsigned int value); }
-405	AUE_NULL	MNOSTD	{ int ksem_open(semid_t *idp, \
+405	AUE_NULL	NOSTD	{ int ksem_open(semid_t *idp, \
 				    const char *name, int oflag, \
 				    mode_t mode, unsigned int value); }
-406	AUE_NULL	MNOSTD	{ int ksem_unlink(const char *name); }
-407	AUE_NULL	MNOSTD	{ int ksem_getvalue(semid_t id, int *val); }
-408	AUE_NULL	MNOSTD	{ int ksem_destroy(semid_t id); }
-409	AUE_NULL	MSTD	{ int __mac_get_pid(pid_t pid, \
+406	AUE_NULL	NOSTD	{ int ksem_unlink(const char *name); }
+407	AUE_NULL	NOSTD	{ int ksem_getvalue(semid_t id, int *val); }
+408	AUE_NULL	NOSTD	{ int ksem_destroy(semid_t id); }
+409	AUE_NULL	STD	{ int __mac_get_pid(pid_t pid, \
 				    struct mac *mac_p); }
-410	AUE_NULL	MSTD	{ int __mac_get_link(const char *path_p, \
+410	AUE_NULL	STD	{ int __mac_get_link(const char *path_p, \
 				    struct mac *mac_p); }
-411	AUE_NULL	MSTD	{ int __mac_set_link(const char *path_p, \
+411	AUE_NULL	STD	{ int __mac_set_link(const char *path_p, \
 				    struct mac *mac_p); }
-412	AUE_NULL	MSTD	{ int extattr_set_link(const char *path, \
-				    int attrnamespace, const char *attrname, \
-				    void *data, size_t nbytes); }
-413	AUE_NULL	MSTD	{ ssize_t extattr_get_link(const char *path, \
-				    int attrnamespace, const char *attrname, \
-				    void *data, size_t nbytes); }
-414	AUE_NULL	MSTD	{ int extattr_delete_link(const char *path, \
-				    int attrnamespace, \
+412	AUE_EXTATTR_SET_LINK	STD	{ int extattr_set_link( \
+				    const char *path, int attrnamespace, \
+				    const char *attrname, void *data, \
+				    size_t nbytes); }
+413	AUE_EXTATTR_GET_LINK	STD	{ ssize_t extattr_get_link( \
+				    const char *path, int attrnamespace, \
+				    const char *attrname, void *data, \
+				    size_t nbytes); }
+414	AUE_EXTATTR_DELETE_LINK	STD	{ int extattr_delete_link( \
+				    const char *path, int attrnamespace, \
 				    const char *attrname); }
-415	AUE_NULL	MSTD	{ int __mac_execve(char *fname, char **argv, \
+415	AUE_NULL	STD	{ int __mac_execve(char *fname, char **argv, \
 				    char **envv, struct mac *mac_p); }
-416	AUE_NULL	MSTD	{ int sigaction(int sig, \
+416	AUE_SIGACTION	STD	{ int sigaction(int sig, \
 				    const struct sigaction *act, \
 				    struct sigaction *oact); }
-417	AUE_NULL	MSTD	{ int sigreturn( \
+417	AUE_SIGRETURN	STD	{ int sigreturn( \
 				    const struct __ucontext *sigcntxp); }
 418	AUE_NULL	UNIMPL	__xstat
 419	AUE_NULL	UNIMPL	__xfstat
 420	AUE_NULL	UNIMPL	__xlstat
-421	AUE_NULL	MSTD	{ int getcontext(struct __ucontext *ucp); }
-422	AUE_NULL	MSTD	{ int setcontext( \
+421	AUE_NULL	STD	{ int getcontext(struct __ucontext *ucp); }
+422	AUE_NULL	STD	{ int setcontext( \
 				    const struct __ucontext *ucp); }
-423	AUE_NULL	MSTD	{ int swapcontext(struct __ucontext *oucp, \
+423	AUE_NULL	STD	{ int swapcontext(struct __ucontext *oucp, \
 				    const struct __ucontext *ucp); }
-424	AUE_NULL	MSTD	{ int swapoff(const char *name); }
-425	AUE_NULL	MSTD	{ int __acl_get_link(const char *path, \
+424	AUE_SWAPOFF	STD	{ int swapoff(const char *name); }
+425	AUE_NULL	STD	{ int __acl_get_link(const char *path, \
 				    acl_type_t type, struct acl *aclp); }
-426	AUE_NULL	MSTD	{ int __acl_set_link(const char *path, \
+426	AUE_NULL	STD	{ int __acl_set_link(const char *path, \
 				    acl_type_t type, struct acl *aclp); }
-427	AUE_NULL	MSTD	{ int __acl_delete_link(const char *path, \
+427	AUE_NULL	STD	{ int __acl_delete_link(const char *path, \
 				    acl_type_t type); }
-428	AUE_NULL	MSTD	{ int __acl_aclcheck_link(const char *path, \
+428	AUE_NULL	STD	{ int __acl_aclcheck_link(const char *path, \
 				    acl_type_t type, struct acl *aclp); }
-429	AUE_NULL	MSTD	{ int sigwait(const sigset_t *set, \
+429	AUE_SIGWAIT	STD	{ int sigwait(const sigset_t *set, \
 				    int *sig); }
-430	AUE_NULL	MSTD	{ int thr_create(ucontext_t *ctx, long *id, \
+430	AUE_NULL	STD	{ int thr_create(ucontext_t *ctx, long *id, \
 				    int flags); }
-431	AUE_NULL	MSTD	{ void thr_exit(long *state); }
-432	AUE_NULL	MSTD	{ int thr_self(long *id); }
-433	AUE_NULL	MSTD	{ int thr_kill(long id, int sig); }
-434	AUE_NULL	MSTD	{ int _umtx_lock(struct umtx *umtx); }
-435	AUE_NULL	MSTD	{ int _umtx_unlock(struct umtx *umtx); }
-436	AUE_NULL	MSTD	{ int jail_attach(int jid); }
-437	AUE_NULL	MSTD	{ ssize_t extattr_list_fd(int fd, \
+431	AUE_NULL	STD	{ void thr_exit(long *state); }
+432	AUE_NULL	STD	{ int thr_self(long *id); }
+433	AUE_NULL	STD	{ int thr_kill(long id, int sig); }
+434	AUE_NULL	STD	{ int _umtx_lock(struct umtx *umtx); }
+435	AUE_NULL	STD	{ int _umtx_unlock(struct umtx *umtx); }
+436	AUE_NULL	STD	{ int jail_attach(int jid); }
+437	AUE_EXTATTR_LIST_FD	STD	{ ssize_t extattr_list_fd(int fd, \
 				    int attrnamespace, void *data, \
 				    size_t nbytes); }
-438	AUE_NULL	MSTD	{ ssize_t extattr_list_file( \
+438	AUE_EXTATTR_LIST_FILE	STD	{ ssize_t extattr_list_file( \
 				    const char *path, int attrnamespace, \
 				    void *data, size_t nbytes); }
-439	AUE_NULL	MSTD	{ ssize_t extattr_list_link( \
+439	AUE_EXTATTR_LIST_LINK	STD	{ ssize_t extattr_list_link( \
 				    const char *path, int attrnamespace, \
 				    void *data, size_t nbytes); }
-440	AUE_NULL	MSTD	{ int kse_switchin( \
+440	AUE_NULL	STD	{ int kse_switchin( \
 				    struct kse_thr_mailbox *tmbx, \
 				    int flags); }
-441	AUE_NULL	MNOSTD	{ int ksem_timedwait(semid_t id, \
-				    struct timespec *abstime); }
-442	AUE_NULL	MSTD	{ int thr_suspend( \
+441	AUE_NULL	NOSTD	{ int ksem_timedwait(semid_t id, \
+				    const struct timespec *abstime); }
+442	AUE_NULL	STD	{ int thr_suspend( \
 				    const struct timespec *timeout); }
-443	AUE_NULL	MSTD	{ int thr_wake(long id); }
-444	AUE_NULL	MSTD	{ int kldunloadf(int fileid, int flags); }
-445	AUE_NULL	MSTD	{ int audit(const void *record, \
+443	AUE_NULL	STD	{ int thr_wake(long id); }
+444	AUE_MODUNLOAD	STD	{ int kldunloadf(int fileid, int flags); }
+445	AUE_AUDIT	STD	{ int audit(const void *record, \
 				    u_int length); }
-446	AUE_NULL	MSTD	{ int auditon(int cmd, void *data, \
+446	AUE_AUDITON	STD	{ int auditon(int cmd, void *data, \
 				    u_int length); }
-447	AUE_NULL	MSTD	{ int getauid(uid_t *auid); }
-448	AUE_NULL	MSTD	{ int setauid(uid_t *auid); }
-449	AUE_NULL	MSTD	{ int getaudit(struct auditinfo *auditinfo); }
-450	AUE_NULL	MSTD	{ int setaudit(struct auditinfo *auditinfo); }
-451	AUE_NULL	MSTD	{ int getaudit_addr( \
+447	AUE_GETAUID	STD	{ int getauid(uid_t *auid); }
+448	AUE_SETAUID	STD	{ int setauid(uid_t *auid); }
+449	AUE_GETAUDIT	STD	{ int getaudit(struct auditinfo *auditinfo); }
+450	AUE_SETAUDIT	STD	{ int setaudit(struct auditinfo *auditinfo); }
+451	AUE_GETAUDIT_ADDR	STD	{ int getaudit_addr( \
 				    struct auditinfo_addr *auditinfo_addr, \
 				    u_int length); }
-452	AUE_NULL	MSTD	{ int setaudit_addr( \
+452	AUE_SETAUDIT_ADDR	STD	{ int setaudit_addr( \
 				    struct auditinfo_addr *auditinfo_addr, \
 				    u_int length); }
-453	AUE_NULL	MSTD	{ int auditctl(int cmd, char *path); }
-454	AUE_NULL	MSTD	{ int _umtx_op(struct umtx *umtx, int op, \
-				    long id, void *uaddr, void *uaddr2); }
-455	AUE_NULL	MSTD	{ int thr_new(struct thr_param *param, \
+453	AUE_AUDITCTL	STD	{ int auditctl(char *path); }
+454	AUE_NULL	STD	{ int _umtx_op(void *obj, int op, \
+				    u_long val, void *uaddr1, void *uaddr2); }
+455	AUE_NULL	STD	{ int thr_new(struct thr_param *param, \
 				    int param_size); }
-
+456	AUE_NULL	STD	{ int sigqueue(pid_t pid, int signum, void *value); }
+457	AUE_NULL	NOSTD	{ int kmq_open(const char *path, int flags, \
+				    mode_t mode, const struct mq_attr *attr); }
+458	AUE_NULL	NOSTD	{ int kmq_setattr(int mqd,		\
+				    const struct mq_attr *attr,		\
+				    struct mq_attr *oattr); }
+459	AUE_NULL	NOSTD	{ int kmq_timedreceive(int mqd,	\
+				    char *msg_ptr, size_t msg_len,	\
+				    unsigned *msg_prio,			\
+				    const struct timespec *abs_timeout); }
+460	AUE_NULL	NOSTD	{ int kmq_timedsend(int mqd,		\
+				    const char *msg_ptr, size_t msg_len,\
+				    unsigned msg_prio,			\
+				    const struct timespec *abs_timeout);}
+461	AUE_NULL	NOSTD	{ int kmq_notify(int mqd,		\
+				    const struct sigevent *sigev); }
+462	AUE_NULL	NOSTD	{ int kmq_unlink(const char *path); }
+463	AUE_NULL	STD	{ int abort2(const char *why, int nargs, void **args); }
+464	AUE_NULL	STD	{ int thr_set_name(long id, const char *name); }
+465	AUE_NULL	NOSTD	{ int aio_fsync(int op, struct aiocb *aiocbp); }
+466	AUE_RTPRIO	STD	{ int rtprio_thread(int function, \
+				    lwpid_t lwpid, struct rtprio *rtp); }
+467	AUE_NULL	UNIMPL	nosys
+468	AUE_NULL	UNIMPL	nosys
+469	AUE_NULL	UNIMPL	__getpath_fromfd
+470	AUE_NULL	UNIMPL	__getpath_fromaddr
+471	AUE_NULL	STD	{ int sctp_peeloff(int sd, uint32_t name); }
+472     AUE_NULL        STD    { int sctp_generic_sendmsg(int sd, caddr_t msg, int mlen, \
+	                            caddr_t to, __socklen_t tolen, \
+				    struct sctp_sndrcvinfo *sinfo, int flags); }
+473     AUE_NULL        STD    { int sctp_generic_sendmsg_iov(int sd, struct iovec *iov, int iovlen, \
+	                            caddr_t to, __socklen_t tolen, \
+				    struct sctp_sndrcvinfo *sinfo, int flags); }
+474     AUE_NULL        STD    { int sctp_generic_recvmsg(int sd, struct iovec *iov, int iovlen, \
+				    struct sockaddr * from, __socklen_t *fromlenaddr, \
+				    struct sctp_sndrcvinfo *sinfo, int *msg_flags); }
+475	AUE_PREAD	STD	{ ssize_t pread(int fd, void *buf, \
+				    size_t nbyte, off_t offset); }
+476	AUE_PWRITE	STD	{ ssize_t pwrite(int fd, const void *buf, \
+				    size_t nbyte, off_t offset); }
+477	AUE_MMAP	STD	{ caddr_t mmap(caddr_t addr, size_t len, \
+				    int prot, int flags, int fd, off_t pos); }
+478	AUE_LSEEK	STD	{ off_t lseek(int fd, off_t offset, \
+				    int whence); }
+479	AUE_TRUNCATE	STD	{ int truncate(char *path, off_t length); }
+480	AUE_FTRUNCATE	STD	{ int ftruncate(int fd, off_t length); }
+481	AUE_KILL	STD	{ int thr_kill2(pid_t pid, long id, int sig); }
 ; Please copy any additions and changes to the following compatibility tables:
 ; sys/compat/freebsd32/syscalls.master
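The other visible theme in this file is ABI cleanup: the old padded entries at 173, 174, 197, 199, 200 and 201 are renamed to freebsd6_pread, freebsd6_pwrite, freebsd6_mmap, freebsd6_lseek, freebsd6_truncate and freebsd6_ftruncate, while pad-free replacements appear at 475-481 alongside the new SCTP, kmq_*, sigqueue, abort2 and thr_* entries. The sketch below contrasts the two calling conventions from userland via syscall(2); the slot numbers are taken from the table above, but real code just calls pread(2) and lets libc pick the number, and passing raw 64-bit arguments through syscall(2) is awkward on 32-bit platforms, so treat this strictly as an illustration.

/*
 * Illustrative only: slot 173 (now freebsd6_pread) kept an explicit pad
 * word before the 64-bit offset, while the new slot 475 takes the
 * offset directly.
 */
#include <sys/types.h>
#include <sys/syscall.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	char buf[64];
	ssize_t n;
	int fd;

	fd = open("/COPYRIGHT", O_RDONLY);	/* any readable file will do */
	if (fd == -1) {
		perror("open");
		return (1);
	}
	/* New pad-free entry added by this change (slot 475 in the table). */
	n = syscall(475, fd, buf, sizeof(buf), (off_t)16);
	if (n == -1)
		perror("pread via slot 475");
	else
		printf("read %zd bytes at offset 16\n", n);
	/*
	 * The old convention, kept alive as freebsd6_pread at slot 173,
	 * inserted a pad word:
	 *     syscall(173, fd, buf, sizeof(buf), 0, (off_t)16);
	 */
	close(fd);
	return (0);
}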
Index: subr_sleepqueue.c
===================================================================
RCS file: /home/cvs/src/sys/kern/subr_sleepqueue.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/subr_sleepqueue.c -L sys/kern/subr_sleepqueue.c -u -r1.2 -r1.3
--- sys/kern/subr_sleepqueue.c
+++ sys/kern/subr_sleepqueue.c
@@ -59,17 +59,18 @@
  * variables.
  */
 
-#include "opt_sleepqueue_profiling.h"
-
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: /repoman/r/ncvs/src/sys/kern/subr_sleepqueue.c,v 1.18.2.2 2006/02/27 00:19:40 davidxu Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/subr_sleepqueue.c,v 1.39.4.1 2008/01/29 16:37:04 jhb Exp $");
+
+#include "opt_sleepqueue_profiling.h"
+#include "opt_ddb.h"
+#include "opt_sched.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/lock.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
-#include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/sched.h>
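The header churn here (opt_ddb.h, opt_sched.h and vm/uma.h added, sys/malloc.h dropped) goes with the allocation change further down in this diff: sleep queues now come from a dedicated uma(9) zone with an init hook and, under INVARIANTS, a dtor for sanity checks, instead of malloc(9)/free(9). A minimal sketch of that init/dtor pattern follows; the "widget" names are hypothetical stand-ins for sleepq_zone, sleepq_init() and sleepq_dtor() shown later in the patch.

/*
 * Minimal sketch of the uma(9) init/dtor pattern this diff adopts for
 * sleep queues.  "widget" is a hypothetical type; the argument order to
 * uma_zcreate() mirrors the sleepq_zone creation added below.
 */
#include <sys/param.h>
#include <sys/queue.h>
#include <sys/systm.h>
#include <vm/uma.h>

struct widget {
	TAILQ_HEAD(, thread) w_waiters;
};

static uma_zone_t widget_zone;

/* Runs when UMA first constructs a backing item, not on every allocation. */
static int
widget_init(void *mem, int size, int flags)
{
	struct widget *w = mem;

	TAILQ_INIT(&w->w_waiters);
	return (0);
}

/* Runs on free; a natural place for INVARIANTS-style sanity checks. */
static void
widget_dtor(void *mem, int size, void *arg)
{
	struct widget *w = mem;

	KASSERT(TAILQ_EMPTY(&w->w_waiters), ("widget freed while busy"));
}

static void
widget_zone_setup(void)
{

	/* ctor = NULL, dtor, init, fini = NULL, cache-line alignment. */
	widget_zone = uma_zcreate("widget", sizeof(struct widget),
	    NULL, widget_dtor, widget_init, NULL, UMA_ALIGN_CACHE, 0);
}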
@@ -77,6 +78,12 @@
 #include <sys/sleepqueue.h>
 #include <sys/sysctl.h>
 
+#include <vm/uma.h>
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
 /*
  * Constants for the hash table of sleep queue chains.  These constants are
  * the same ones that 4BSD (and possibly earlier versions of BSD) used.
@@ -89,7 +96,7 @@
 #define	SC_SHIFT	8
 #define	SC_HASH(wc)	(((uintptr_t)(wc) >> SC_SHIFT) & SC_MASK)
 #define	SC_LOOKUP(wc)	&sleepq_chains[SC_HASH(wc)]
-
+#define NR_SLEEPQS      2
 /*
  * There are two different lists of sleep queues.  Both lists are connected
  * via the sq_hash entries.  The first list is the sleep queue chain list
@@ -109,13 +116,13 @@
  *  c - sleep queue chain lock
  */
 struct sleepqueue {
-	TAILQ_HEAD(, thread) sq_blocked;	/* (c) Blocked threads. */
+	TAILQ_HEAD(, thread) sq_blocked[NR_SLEEPQS];	/* (c) Blocked threads. */
 	LIST_ENTRY(sleepqueue) sq_hash;		/* (c) Chain and free list. */
 	LIST_HEAD(, sleepqueue) sq_free;	/* (c) Free queues. */
 	void	*sq_wchan;			/* (c) Wait channel. */
 #ifdef INVARIANTS
 	int	sq_type;			/* (c) Queue type. */
-	struct mtx *sq_lock;			/* (c) Associated lock. */
+	struct lock_object *sq_lock;		/* (c) Associated lock. */
 #endif
 };
 
@@ -137,16 +144,22 @@
     0, "maximum depth achieved of a single chain");
 #endif
 static struct sleepqueue_chain sleepq_chains[SC_TABLESIZE];
-
-static MALLOC_DEFINE(M_SLEEPQUEUE, "sleep queues", "sleep queues");
+static uma_zone_t sleepq_zone;
 
 /*
  * Prototypes for non-exported routines.
  */
+static int	sleepq_catch_signals(void *wchan);
+static int	sleepq_check_signals(void);
 static int	sleepq_check_timeout(void);
+#ifdef INVARIANTS
+static void	sleepq_dtor(void *mem, int size, void *arg);
+#endif
+static int	sleepq_init(void *mem, int size, int flags);
+static void	sleepq_resume_thread(struct sleepqueue *sq, struct thread *td,
+		    int pri);
 static void	sleepq_switch(void *wchan);
 static void	sleepq_timeout(void *arg);
-static void	sleepq_resume_thread(struct sleepqueue *sq, struct thread *td, int pri);
 
 /*
  * Early initialization of sleep queues that is called from the sleepinit()
@@ -177,21 +190,24 @@
 		    NULL);
 #endif
 	}
+	sleepq_zone = uma_zcreate("SLEEPQUEUE", sizeof(struct sleepqueue),
+#ifdef INVARIANTS
+	    NULL, sleepq_dtor, sleepq_init, NULL, UMA_ALIGN_CACHE, 0);
+#else
+	    NULL, NULL, sleepq_init, NULL, UMA_ALIGN_CACHE, 0);
+#endif
+	
 	thread0.td_sleepqueue = sleepq_alloc();
 }
 
 /*
- * Malloc and initialize a new sleep queue for a new thread.
+ * Get a sleep queue for a new thread.
  */
 struct sleepqueue *
 sleepq_alloc(void)
 {
-	struct sleepqueue *sq;
 
-	sq = malloc(sizeof(struct sleepqueue), M_SLEEPQUEUE, M_WAITOK | M_ZERO);
-	TAILQ_INIT(&sq->sq_blocked);
-	LIST_INIT(&sq->sq_free);
-	return (sq);
+	return (uma_zalloc(sleepq_zone, M_WAITOK));
 }
 
 /*
@@ -201,9 +217,7 @@
 sleepq_free(struct sleepqueue *sq)
 {
 
-	MPASS(sq != NULL);
-	MPASS(TAILQ_EMPTY(&sq->sq_blocked));
-	free(sq, M_SLEEPQUEUE);
+	uma_zfree(sleepq_zone, sq);
 }
 
 /*
@@ -257,7 +271,8 @@
  * woken up.
  */
 void
-sleepq_add(void *wchan, struct mtx *lock, const char *wmesg, int flags)
+sleepq_add(void *wchan, struct lock_object *lock, const char *wmesg, int flags,
+    int queue)
 {
 	struct sleepqueue_chain *sc;
 	struct sleepqueue *sq;
@@ -268,10 +283,11 @@
 	mtx_assert(&sc->sc_lock, MA_OWNED);
 	MPASS(td->td_sleepqueue != NULL);
 	MPASS(wchan != NULL);
+	MPASS((queue >= 0) && (queue < NR_SLEEPQS));
 
 	/* If this thread is not allowed to sleep, die a horrible death. */
 	KASSERT(!(td->td_pflags & TDP_NOSLEEPING),
-	    ("trying to sleep while sleeping is prohibited"));
+	    ("Trying sleep, but thread marked as sleeping prohibited"));
 
 	/* Look up the sleep queue associated with the wait channel 'wchan'. */
 	sq = sleepq_lookup(wchan);
@@ -282,6 +298,19 @@
 	 * into the sleep queue already in use by this wait channel.
 	 */
 	if (sq == NULL) {
+#ifdef INVARIANTS
+		int i;
+
+		sq = td->td_sleepqueue;
+		for (i = 0; i < NR_SLEEPQS; i++)
+			KASSERT(TAILQ_EMPTY(&sq->sq_blocked[i]),
+				("thread's sleep queue %d is not empty", i));
+		KASSERT(LIST_EMPTY(&sq->sq_free),
+		    ("thread's sleep queue has a non-empty free list"));
+		KASSERT(sq->sq_wchan == NULL, ("stale sq_wchan pointer"));
+		sq->sq_lock = lock;
+		sq->sq_type = flags & SLEEPQ_TYPE;
+#endif
 #ifdef SLEEPQUEUE_PROFILING
 		sc->sc_depth++;
 		if (sc->sc_depth > sc->sc_max_depth) {
@@ -292,32 +321,24 @@
 #endif
 		sq = td->td_sleepqueue;
 		LIST_INSERT_HEAD(&sc->sc_queues, sq, sq_hash);
-		KASSERT(TAILQ_EMPTY(&sq->sq_blocked),
-		    ("thread's sleep queue has a non-empty queue"));
-		KASSERT(LIST_EMPTY(&sq->sq_free),
-		    ("thread's sleep queue has a non-empty free list"));
-		KASSERT(sq->sq_wchan == NULL, ("stale sq_wchan pointer"));
 		sq->sq_wchan = wchan;
-#ifdef INVARIANTS
-		sq->sq_lock = lock;
-		sq->sq_type = flags & SLEEPQ_TYPE;
-#endif
 	} else {
 		MPASS(wchan == sq->sq_wchan);
 		MPASS(lock == sq->sq_lock);
 		MPASS((flags & SLEEPQ_TYPE) == sq->sq_type);
 		LIST_INSERT_HEAD(&sq->sq_free, td->td_sleepqueue, sq_hash);
 	}
-	TAILQ_INSERT_TAIL(&sq->sq_blocked, td, td_slpq);
+	thread_lock(td);
+	TAILQ_INSERT_TAIL(&sq->sq_blocked[queue], td, td_slpq);
 	td->td_sleepqueue = NULL;
-	mtx_lock_spin(&sched_lock);
+	td->td_sqqueue = queue;
 	td->td_wchan = wchan;
 	td->td_wmesg = wmesg;
 	if (flags & SLEEPQ_INTERRUPTIBLE) {
 		td->td_flags |= TDF_SINTR;
 		td->td_flags &= ~TDF_SLEEPABORT;
 	}
-	mtx_unlock_spin(&sched_lock);
+	thread_unlock(td);
 }
 
 /*
@@ -342,7 +363,8 @@
 /*
  * Marks the pending sleep of the current thread as interruptible and
  * makes an initial check for pending signals before putting a thread
- * to sleep. Return with sleep queue and scheduler lock held.
+ * to sleep. Enters and exits with the thread lock held.  Thread lock
+ * may have transitioned from the sleepq lock to a run lock.
  */
 static int
 sleepq_catch_signals(void *wchan)
@@ -362,7 +384,6 @@
 	CTR3(KTR_PROC, "sleepq catching signals: thread %p (pid %ld, %s)",
 		(void *)td, (long)p->p_pid, p->p_comm);
 
-	MPASS(td->td_flags & TDF_SINTR);
 	mtx_unlock_spin(&sc->sc_lock);
 
 	/* See if there are any pending signals for this thread. */
@@ -381,68 +402,79 @@
 			ret = ERESTART;
 		mtx_unlock(&ps->ps_mtx);
 	}
-
+	/*
+	 * Lock sleepq chain before unlocking proc
+	 * without this, we could lose a race.
+	 */
+	mtx_lock_spin(&sc->sc_lock);
+	PROC_UNLOCK(p);
+	thread_lock(td);
 	if (ret == 0) {
-		mtx_lock_spin(&sc->sc_lock);
-		/*
-		 * Lock sched_lock before unlocking proc lock,
-		 * without this, we could lose a race.
-		 */
-		mtx_lock_spin(&sched_lock);
-		PROC_UNLOCK(p);
-		if (!(td->td_flags & TDF_INTERRUPT))
+		if (!(td->td_flags & TDF_INTERRUPT)) {
+			sleepq_switch(wchan);
 			return (0);
+		}
 		/* KSE threads tried unblocking us. */
 		ret = td->td_intrval;
-		mtx_unlock_spin(&sched_lock);
-		MPASS(ret == EINTR || ret == ERESTART);
-	} else {
-		PROC_UNLOCK(p);
-		mtx_lock_spin(&sc->sc_lock);
+		MPASS(ret == EINTR || ret == ERESTART || ret == EWOULDBLOCK);
 	}
 	/*
 	 * There were pending signals and this thread is still
 	 * on the sleep queue, remove it from the sleep queue.
 	 */
-	sq = sleepq_lookup(wchan);
-	mtx_lock_spin(&sched_lock);
-	if (TD_ON_SLEEPQ(td))
+	if (TD_ON_SLEEPQ(td)) {
+		sq = sleepq_lookup(wchan);
 		sleepq_resume_thread(sq, td, -1);
-	td->td_flags &= ~TDF_SINTR;
+	}
+	mtx_unlock_spin(&sc->sc_lock);
+	MPASS(td->td_lock != &sc->sc_lock);
 	return (ret);
 }
 
 /*
- * Switches to another thread if we are still asleep on a sleep queue and
- * drop the lock on the sleep queue chain.  Returns with sched_lock held.
+ * Switches to another thread if we are still asleep on a sleep queue.
+ * Returns with thread lock.
  */
 static void
 sleepq_switch(void *wchan)
 {
 	struct sleepqueue_chain *sc;
+	struct sleepqueue *sq;
 	struct thread *td;
 
 	td = curthread;
 	sc = SC_LOOKUP(wchan);
 	mtx_assert(&sc->sc_lock, MA_OWNED);
-	mtx_assert(&sched_lock, MA_OWNED);
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
 
 	/* 
 	 * If we have a sleep queue, then we've already been woken up, so
 	 * just return.
 	 */
 	if (td->td_sleepqueue != NULL) {
-		MPASS(!TD_ON_SLEEPQ(td));
 		mtx_unlock_spin(&sc->sc_lock);
 		return;
 	}
 
 	/*
-	 * Otherwise, actually go to sleep.
+	 * If TDF_TIMEOUT is set, then our sleep has been timed out
+	 * already but we are still on the sleep queue, so dequeue the
+	 * thread and return.
 	 */
-	mtx_unlock_spin(&sc->sc_lock);
+	if (td->td_flags & TDF_TIMEOUT) {
+		MPASS(TD_ON_SLEEPQ(td));
+		sq = sleepq_lookup(wchan);
+		sleepq_resume_thread(sq, td, -1);
+		mtx_unlock_spin(&sc->sc_lock);
+		return;		
+	}
+
+	thread_lock_set(td, &sc->sc_lock);
+
+	MPASS(td->td_sleepqueue == NULL);
 	sched_sleep(td);
 	TD_SET_SLEEPING(td);
+	SCHED_STAT_INC(switch_sleepq);
 	mi_switch(SW_VOL, NULL);
 	KASSERT(TD_IS_RUNNING(td), ("running but not TDS_RUNNING"));
 	CTR3(KTR_PROC, "sleepq resume: thread %p (pid %ld, %s)",
@@ -457,8 +489,8 @@
 {
 	struct thread *td;
 
-	mtx_assert(&sched_lock, MA_OWNED);
 	td = curthread;
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
 
 	/*
 	 * If TDF_TIMEOUT is set, we timed out.
@@ -483,6 +515,7 @@
 	else if (callout_stop(&td->td_slpcallout) == 0) {
 		td->td_flags |= TDF_TIMEOUT;
 		TD_SET_SLEEPING(td);
+		SCHED_STAT_INC(switch_sleepqtimo);
 		mi_switch(SW_INVOL, NULL);
 	}
 	return (0);
@@ -496,8 +529,8 @@
 {
 	struct thread *td;
 
-	mtx_assert(&sched_lock, MA_OWNED);
 	td = curthread;
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
 
 	/* We are no longer in an interruptible sleep. */
 	if (td->td_flags & TDF_SINTR)
@@ -520,11 +553,13 @@
 void
 sleepq_wait(void *wchan)
 {
+	struct thread *td;
 
-	MPASS(!(curthread->td_flags & TDF_SINTR));
-	mtx_lock_spin(&sched_lock);
+	td = curthread;
+	MPASS(!(td->td_flags & TDF_SINTR));
+	thread_lock(td);
 	sleepq_switch(wchan);
-	mtx_unlock_spin(&sched_lock);
+	thread_unlock(td);
 }
 
 /*
@@ -538,12 +573,8 @@
 	int rval;
 
 	rcatch = sleepq_catch_signals(wchan);
-	if (rcatch == 0)
-		sleepq_switch(wchan);
-	else
-		sleepq_release(wchan);
 	rval = sleepq_check_signals();
-	mtx_unlock_spin(&sched_lock); 
+	thread_unlock(curthread);
 	if (rcatch)
 		return (rcatch);
 	return (rval);
@@ -556,13 +587,16 @@
 int
 sleepq_timedwait(void *wchan)
 {
+	struct thread *td;
 	int rval;
 
-	MPASS(!(curthread->td_flags & TDF_SINTR));
-	mtx_lock_spin(&sched_lock);
+	td = curthread;
+	MPASS(!(td->td_flags & TDF_SINTR));
+	thread_lock(td);
 	sleepq_switch(wchan);
 	rval = sleepq_check_timeout();
-	mtx_unlock_spin(&sched_lock);
+	thread_unlock(td);
+
 	return (rval);
 }
 
@@ -576,13 +610,9 @@
 	int rcatch, rvalt, rvals;
 
 	rcatch = sleepq_catch_signals(wchan);
-	if (rcatch == 0)
-		sleepq_switch(wchan);
-	else
-		sleepq_release(wchan);
 	rvalt = sleepq_check_timeout();
 	rvals = sleepq_check_signals();
-	mtx_unlock_spin(&sched_lock);
+	thread_unlock(curthread);
 	if (rcatch)
 		return (rcatch);
 	if (rvals)
@@ -602,12 +632,13 @@
 	MPASS(td != NULL);
 	MPASS(sq->sq_wchan != NULL);
 	MPASS(td->td_wchan == sq->sq_wchan);
+	MPASS(td->td_sqqueue < NR_SLEEPQS && td->td_sqqueue >= 0);
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	sc = SC_LOOKUP(sq->sq_wchan);
 	mtx_assert(&sc->sc_lock, MA_OWNED);
-	mtx_assert(&sched_lock, MA_OWNED);
 
 	/* Remove the thread from the queue. */
-	TAILQ_REMOVE(&sq->sq_blocked, td, td_slpq);
+	TAILQ_REMOVE(&sq->sq_blocked[td->td_sqqueue], td, td_slpq);
 
 	/*
 	 * Get a sleep queue for this thread.  If this is the last waiter,
@@ -628,6 +659,7 @@
 
 	td->td_wmesg = NULL;
 	td->td_wchan = NULL;
+	td->td_flags &= ~TDF_SINTR;
 
 	/*
 	 * Note that thread td might not be sleeping if it is running
@@ -647,22 +679,54 @@
 	setrunnable(td);
 }
 
+#ifdef INVARIANTS
+/*
+ * UMA zone item deallocator.
+ */
+static void
+sleepq_dtor(void *mem, int size, void *arg)
+{
+	struct sleepqueue *sq;
+	int i;
+
+	sq = mem;
+	for (i = 0; i < NR_SLEEPQS; i++)
+		MPASS(TAILQ_EMPTY(&sq->sq_blocked[i]));
+}
+#endif
+
+/*
+ * UMA zone item initializer.
+ */
+static int
+sleepq_init(void *mem, int size, int flags)
+{
+	struct sleepqueue *sq;
+	int i;
+
+	bzero(mem, size);
+	sq = mem;
+	for (i = 0; i < NR_SLEEPQS; i++)
+		TAILQ_INIT(&sq->sq_blocked[i]);
+	LIST_INIT(&sq->sq_free);
+	return (0);
+}
+
 /*
  * Find the highest priority thread sleeping on a wait channel and resume it.
  */
 void
-sleepq_signal(void *wchan, int flags, int pri)
+sleepq_signal(void *wchan, int flags, int pri, int queue)
 {
 	struct sleepqueue *sq;
 	struct thread *td, *besttd;
 
 	CTR2(KTR_PROC, "sleepq_signal(%p, %d)", wchan, flags);
 	KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__));
+	MPASS((queue >= 0) && (queue < NR_SLEEPQS));
 	sq = sleepq_lookup(wchan);
-	if (sq == NULL) {
-		sleepq_release(wchan);
+	if (sq == NULL)
 		return;
-	}
 	KASSERT(sq->sq_type == (flags & SLEEPQ_TYPE),
 	    ("%s: mismatch between sleep/wakeup and cv_*", __func__));
 
@@ -673,27 +737,28 @@
 	 * the tail of sleep queues.
 	 */
 	besttd = NULL;
-	TAILQ_FOREACH(td, &sq->sq_blocked, td_slpq) {
+	TAILQ_FOREACH(td, &sq->sq_blocked[queue], td_slpq) {
 		if (besttd == NULL || td->td_priority < besttd->td_priority)
 			besttd = td;
 	}
 	MPASS(besttd != NULL);
-	mtx_lock_spin(&sched_lock);
+	thread_lock(besttd);
 	sleepq_resume_thread(sq, besttd, pri);
-	mtx_unlock_spin(&sched_lock);
-	sleepq_release(wchan);
+	thread_unlock(besttd);
 }
 
 /*
  * Resume all threads sleeping on a specified wait channel.
  */
 void
-sleepq_broadcast(void *wchan, int flags, int pri)
+sleepq_broadcast(void *wchan, int flags, int pri, int queue)
 {
 	struct sleepqueue *sq;
+	struct thread *td;
 
 	CTR2(KTR_PROC, "sleepq_broadcast(%p, %d)", wchan, flags);
 	KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__));
+	MPASS((queue >= 0) && (queue < NR_SLEEPQS));
 	sq = sleepq_lookup(wchan);
 	if (sq == NULL) {
 		sleepq_release(wchan);
@@ -703,10 +768,12 @@
 	    ("%s: mismatch between sleep/wakeup and cv_*", __func__));
 
 	/* Resume all blocked threads on the sleep queue. */
-	mtx_lock_spin(&sched_lock);
-	while (!TAILQ_EMPTY(&sq->sq_blocked))
-		sleepq_resume_thread(sq, TAILQ_FIRST(&sq->sq_blocked), pri);
-	mtx_unlock_spin(&sched_lock);
+	while (!TAILQ_EMPTY(&sq->sq_blocked[queue])) {
+		td = TAILQ_FIRST(&sq->sq_blocked[queue]);
+		thread_lock(td);
+		sleepq_resume_thread(sq, td, pri);
+		thread_unlock(td);
+	}
 	sleepq_release(wchan);
 }
 
@@ -717,6 +784,7 @@
 static void
 sleepq_timeout(void *arg)
 {
+	struct sleepqueue_chain *sc;
 	struct sleepqueue *sq;
 	struct thread *td;
 	void *wchan;
@@ -729,38 +797,30 @@
 	 * First, see if the thread is asleep and get the wait channel if
 	 * it is.
 	 */
-	mtx_lock_spin(&sched_lock);
-	if (TD_ON_SLEEPQ(td)) {
+	thread_lock(td);
+	if (TD_IS_SLEEPING(td) && TD_ON_SLEEPQ(td)) {
 		wchan = td->td_wchan;
-		mtx_unlock_spin(&sched_lock);
-		sleepq_lock(wchan);
+		sc = SC_LOOKUP(wchan);
+		MPASS(td->td_lock == &sc->sc_lock);
 		sq = sleepq_lookup(wchan);
-		mtx_lock_spin(&sched_lock);
-	} else {
-		wchan = NULL;
-		sq = NULL;
+		MPASS(sq != NULL);
+		td->td_flags |= TDF_TIMEOUT;
+		sleepq_resume_thread(sq, td, -1);
+		thread_unlock(td);
+		return;
 	}
 
 	/*
-	 * At this point, if the thread is still on the sleep queue,
-	 * we have that sleep queue locked as it cannot migrate sleep
-	 * queues while we dropped sched_lock.  If it had resumed and
-	 * was on another CPU while the lock was dropped, it would have
-	 * seen that TDF_TIMEOUT and TDF_TIMOFAIL are clear and the
-	 * call to callout_stop() to stop this routine would have failed
-	 * meaning that it would have already set TDF_TIMEOUT to
-	 * synchronize with this function.
+	 * If the thread is on the SLEEPQ but isn't sleeping yet, it
+	 * can either be on another CPU in between sleepq_add() and
+	 * one of the sleepq_*wait*() routines or it can be in
+	 * sleepq_catch_signals().
 	 */
 	if (TD_ON_SLEEPQ(td)) {
-		MPASS(td->td_wchan == wchan);
-		MPASS(sq != NULL);
 		td->td_flags |= TDF_TIMEOUT;
-		sleepq_resume_thread(sq, td, -1);
-		mtx_unlock_spin(&sched_lock);
-		sleepq_release(wchan);
+		thread_unlock(td);
 		return;
-	} else if (wchan != NULL)
-		sleepq_release(wchan);
+	}
 
 	/*
 	 * Now check for the edge cases.  First, if TDF_TIMEOUT is set,
@@ -778,7 +838,7 @@
 		setrunnable(td);
 	} else
 		td->td_flags |= TDF_TIMOFAIL;
-	mtx_unlock_spin(&sched_lock);
+	thread_unlock(td);
 }
 
 /*
@@ -798,33 +858,36 @@
 	MPASS(wchan != NULL);
 	sleepq_lock(wchan);
 	sq = sleepq_lookup(wchan);
-	mtx_lock_spin(&sched_lock);
+	/*
+	 * We can not lock the thread here as it may be sleeping on a
+	 * different sleepq.  However, holding the sleepq lock for this
+	 * wchan can guarantee that we do not miss a wakeup for this
+	 * channel.  The asserts below will catch any false positives.
+	 */
 	if (!TD_ON_SLEEPQ(td) || td->td_wchan != wchan) {
-		mtx_unlock_spin(&sched_lock);
 		sleepq_release(wchan);
 		return;
 	}
-	MPASS(sq != NULL);
-
 	/* Thread is asleep on sleep queue sq, so wake it up. */
+	thread_lock(td);
+	MPASS(sq != NULL);
+	MPASS(td->td_wchan == wchan);
 	sleepq_resume_thread(sq, td, -1);
+	thread_unlock(td);
 	sleepq_release(wchan);
-	mtx_unlock_spin(&sched_lock);
 }
 
 /*
  * Abort a thread as if an interrupt had occurred.  Only abort
  * interruptible waits (unfortunately it isn't safe to abort others).
- *
- * XXX: What in the world does the comment below mean?
- * Also, whatever the signal code does...
  */
 void
 sleepq_abort(struct thread *td, int intrval)
 {
+	struct sleepqueue *sq;
 	void *wchan;
 
-	mtx_assert(&sched_lock, MA_OWNED);
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	MPASS(TD_ON_SLEEPQ(td));
 	MPASS(td->td_flags & TDF_SINTR);
 	MPASS(intrval == EINTR || intrval == ERESTART);
@@ -838,12 +901,87 @@
 
 	CTR3(KTR_PROC, "sleepq_abort: thread %p (pid %ld, %s)",
 	    (void *)td, (long)td->td_proc->p_pid, (void *)td->td_proc->p_comm);
+	td->td_intrval = intrval;
+	td->td_flags |= TDF_SLEEPABORT;
+	/*
+	 * If the thread has not slept yet it will find the signal in
+	 * sleepq_catch_signals() and call sleepq_resume_thread.  Otherwise
+	 * we have to do it here.
+	 */
+	if (!TD_IS_SLEEPING(td))
+		return;
 	wchan = td->td_wchan;
-	if (wchan != NULL) {
-		td->td_intrval = intrval;
-		td->td_flags |= TDF_SLEEPABORT;
-	}
-	mtx_unlock_spin(&sched_lock);
-	sleepq_remove(td, wchan);
-	mtx_lock_spin(&sched_lock);
+	MPASS(wchan != NULL);
+	sq = sleepq_lookup(wchan);
+	MPASS(sq != NULL);
+
+	/* Thread is asleep on sleep queue sq, so wake it up. */
+	sleepq_resume_thread(sq, td, -1);
 }
+
+#ifdef DDB
+DB_SHOW_COMMAND(sleepq, db_show_sleepqueue)
+{
+	struct sleepqueue_chain *sc;
+	struct sleepqueue *sq;
+#ifdef INVARIANTS
+	struct lock_object *lock;
+#endif
+	struct thread *td;
+	void *wchan;
+	int i;
+
+	if (!have_addr)
+		return;
+
+	/*
+	 * First, see if there is an active sleep queue for the wait channel
+	 * indicated by the address.
+	 */
+	wchan = (void *)addr;
+	sc = SC_LOOKUP(wchan);
+	LIST_FOREACH(sq, &sc->sc_queues, sq_hash)
+		if (sq->sq_wchan == wchan)
+			goto found;
+
+	/*
+	 * Second, see if there is an active sleep queue at the address
+	 * indicated.
+	 */
+	for (i = 0; i < SC_TABLESIZE; i++)
+		LIST_FOREACH(sq, &sleepq_chains[i].sc_queues, sq_hash) {
+			if (sq == (struct sleepqueue *)addr)
+				goto found;
+		}
+
+	db_printf("Unable to locate a sleep queue via %p\n", (void *)addr);
+	return;
+found:
+	db_printf("Wait channel: %p\n", sq->sq_wchan);
+#ifdef INVARIANTS
+	db_printf("Queue type: %d\n", sq->sq_type);
+	if (sq->sq_lock) {
+		lock = sq->sq_lock;
+		db_printf("Associated Interlock: %p - (%s) %s\n", lock,
+		    LOCK_CLASS(lock)->lc_name, lock->lo_name);
+	}
+#endif
+	db_printf("Blocked threads:\n");
+	for (i = 0; i < NR_SLEEPQS; i++) {
+		db_printf("\nQueue[%d]:\n", i);
+		if (TAILQ_EMPTY(&sq->sq_blocked[i]))
+			db_printf("\tempty\n");
+		else
+			TAILQ_FOREACH(td, &sq->sq_blocked[i],
+				      td_slpq) {
+				db_printf("\t%p (tid %d, pid %d, \"%s\")\n", td,
+					  td->td_tid, td->td_proc->p_pid,
+					  td->td_name[0] != '\0' ? td->td_name :
+					  td->td_proc->p_comm);
+			}
+	}
+}
+
+/* Alias 'show sleepqueue' to 'show sleepq'. */
+DB_SET(sleepqueue, db_show_sleepqueue, db_show_cmd_set, 0, NULL);
+#endif
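
With this revision sleepq_add() takes a struct lock_object interlock plus a queue index, each wait channel carries NR_SLEEPQS sub-queues, and sleepq_signal()/sleepq_broadcast() name the sub-queue to wake. A rough, hypothetical in-kernel sketch of the resulting usage pattern; example_mtx, example_ready and the "examwt" wmesg are made-up names, and the SLEEPQ_SLEEP wait type and the mtx lock_object member are assumptions taken from FreeBSD headers, not from this diff:

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/sleepqueue.h>

static struct mtx example_mtx;		/* assumed initialized with mtx_init() elsewhere */
static int example_ready;

static void
example_wait(void)
{
	void *wchan = &example_ready;

	mtx_lock(&example_mtx);
	while (!example_ready) {
		sleepq_lock(wchan);
		/* Queue index 0 of the NR_SLEEPQS sub-queues is used here. */
		sleepq_add(wchan, &example_mtx.lock_object, "examwt",
		    SLEEPQ_SLEEP, 0);
		mtx_unlock(&example_mtx);
		sleepq_wait(wchan);
		mtx_lock(&example_mtx);
	}
	mtx_unlock(&example_mtx);
}

static void
example_wakeup(void)
{
	void *wchan = &example_ready;

	mtx_lock(&example_mtx);
	example_ready = 1;
	sleepq_lock(wchan);
	/* Wake the best-priority waiter on sub-queue 0; caller releases the chain. */
	sleepq_signal(wchan, SLEEPQ_SLEEP, -1, 0);
	sleepq_release(wchan);
	mtx_unlock(&example_mtx);
}
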
Index: kern_sysctl.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_sysctl.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/kern_sysctl.c -L sys/kern/kern_sysctl.c -u -r1.2 -r1.3
--- sys/kern/kern_sysctl.c
+++ sys/kern/kern_sysctl.c
@@ -36,7 +36,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_sysctl.c,v 1.165.2.3 2006/03/01 21:08:53 andre Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_sysctl.c,v 1.177 2007/09/02 09:59:33 rwatson Exp $");
 
 #include "opt_compat.h"
 #include "opt_mac.h"
@@ -45,13 +45,16 @@
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
-#include <sys/mac.h>
 #include <sys/malloc.h>
+#include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/sx.h>
 #include <sys/sysproto.h>
+
+#include <security/mac/mac_framework.h>
+
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 
@@ -510,7 +513,7 @@
 {
 	int error;
 
-	error = suser(req->td);
+	error = priv_check(req->td, PRIV_SYSCTL_DEBUG);
 	if (error)
 		return (error);
 	sysctl_sysctl_debug_dump_node(&sysctl__children, 0);
@@ -889,6 +892,31 @@
 }
 
 /*
+ * Handle a 64 bit int, signed or unsigned.  arg1 points to it.
+ */
+
+int
+sysctl_handle_quad(SYSCTL_HANDLER_ARGS)
+{
+	int error = 0;
+	uint64_t tmpout;
+
+	/*
+	 * Attempt to get a coherent snapshot by making a copy of the data.
+	 */
+	if (!arg1)
+		return (EINVAL);
+	tmpout = *(uint64_t *)arg1;
+	error = SYSCTL_OUT(req, &tmpout, sizeof(uint64_t));
+
+	if (error || !req->newptr)
+		return (error);
+
+	error = SYSCTL_IN(req, arg1, sizeof(uint64_t));
+	return (error);
+}
+
+/*
  * Handle our generic '\0' terminated 'C' string.
  * Two cases:
  * 	a variable string:  point arg1 at it, arg2 is max length.
@@ -1135,10 +1163,6 @@
 /*
  * Wire the user space destination buffer.  If set to a value greater than
  * zero, the len parameter limits the maximum amount of wired memory.
- *
- * XXX - The len parameter is currently ignored due to the lack of
- * a place to save it in the sysctl_req structure so that the matching
- * amount of memory can be unwired in the sysctl exit code.
  */
 int
 sysctl_wire_old_buffer(struct sysctl_req *req, size_t len)
@@ -1255,13 +1279,10 @@
 
 	/* Is this sysctl writable by only privileged users? */
 	if (req->newptr && !(oid->oid_kind & CTLFLAG_ANYBODY)) {
-		int flags;
-
 		if (oid->oid_kind & CTLFLAG_PRISON)
-			flags = SUSER_ALLOWJAIL;
+			error = priv_check(req->td, PRIV_SYSCTL_WRITEJAIL);
 		else
-			flags = 0;
-		error = suser_cred(req->td->td_ucred, flags);
+			error = priv_check(req->td, PRIV_SYSCTL_WRITE);
 		if (error)
 			return (error);
 	}
@@ -1297,10 +1318,6 @@
 	size_t	newlen;
 };
 #endif
-
-/*
- * MPSAFE
- */
 int
 __sysctl(struct thread *td, struct sysctl_args *uap)
 {
@@ -1366,7 +1383,7 @@
 	}
 
 	if (new != NULL) {
-		if (!useracc(new, req.newlen, VM_PROT_READ))
+		if (!useracc(new, newlen, VM_PROT_READ))
 			return (EFAULT);
 		req.newlen = newlen;
 		req.newptr = new;
@@ -1452,6 +1469,7 @@
 	/* the actual string data is appended here */
 
 } bsdi_si;
+
 /*
  * this data is appended to the end of the bsdi_si structure during copyout.
  * The "char *" offsets are relative to the base of the bsdi_si struct.
@@ -1468,10 +1486,6 @@
 	int	arg;
 };
 #endif
-
-/*
- * MPSAFE
- */
 int
 ogetkerninfo(struct thread *td, struct getkerninfo_args *uap)
 {
Index: sys_generic.c
===================================================================
RCS file: /home/cvs/src/sys/kern/sys_generic.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/sys_generic.c -L sys/kern/sys_generic.c -u -r1.2 -r1.3
--- sys/kern/sys_generic.c
+++ sys/kern/sys_generic.c
@@ -35,7 +35,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/sys_generic.c,v 1.146 2005/07/07 18:17:55 jhb Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/sys_generic.c,v 1.158 2007/07/04 22:57:21 peter Exp $");
 
 #include "opt_compat.h"
 #include "opt_ktrace.h"
@@ -68,8 +68,6 @@
 #ifdef KTRACE
 #include <sys/ktrace.h>
 #endif
-#include <vm/vm.h>
-#include <vm/vm_page.h>
 
 static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer");
 static MALLOC_DEFINE(M_SELECT, "select", "select() buffer");
@@ -83,9 +81,6 @@
 		    off_t, int);
 static void	doselwakeup(struct selinfo *, int);
 
-/*
- * Read system call.
- */
 #ifndef _SYS_SYSPROTO_H_
 struct read_args {
 	int	fd;
@@ -93,9 +88,6 @@
 	size_t	nbyte;
 };
 #endif
-/*
- * MPSAFE
- */
 int
 read(td, uap)
 	struct thread *td;
@@ -129,9 +121,6 @@
 	off_t	offset;
 };
 #endif
-/*
- * MPSAFE
- */
 int
 pread(td, uap)
 	struct thread *td;
@@ -153,6 +142,20 @@
 	return(error);
 }
 
+int
+freebsd6_pread(td, uap)
+	struct thread *td;
+	struct freebsd6_pread_args *uap;
+{
+	struct pread_args oargs;
+
+	oargs.fd = uap->fd;
+	oargs.buf = uap->buf;
+	oargs.nbyte = uap->nbyte;
+	oargs.offset = uap->offset;
+	return (pread(td, &oargs));
+}
+
 /*
  * Scatter read system call.
  */
@@ -163,9 +166,6 @@
 	u_int	iovcnt;
 };
 #endif
-/*
- * MPSAFE
- */
 int
 readv(struct thread *td, struct readv_args *uap)
 {
@@ -205,9 +205,6 @@
 	off_t	offset;
 };
 #endif
-/*
- * MPSAFE
- */
 int
 preadv(struct thread *td, struct preadv_args *uap)
 {
@@ -293,9 +290,6 @@
 	return (error);
 }
 
-/*
- * Write system call
- */
 #ifndef _SYS_SYSPROTO_H_
 struct write_args {
 	int	fd;
@@ -303,9 +297,6 @@
 	size_t	nbyte;
 };
 #endif
-/*
- * MPSAFE
- */
 int
 write(td, uap)
 	struct thread *td;
@@ -328,7 +319,7 @@
 }
 
 /*
- * Positioned write system call
+ * Positioned write system call.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct pwrite_args {
@@ -339,9 +330,6 @@
 	off_t	offset;
 };
 #endif
-/*
- * MPSAFE
- */
 int
 pwrite(td, uap)
 	struct thread *td;
@@ -363,8 +351,22 @@
 	return(error);
 }
 
+int
+freebsd6_pwrite(td, uap)
+	struct thread *td;
+	struct freebsd6_pwrite_args *uap;
+{
+	struct pwrite_args oargs;
+
+	oargs.fd = uap->fd;
+	oargs.buf = uap->buf;
+	oargs.nbyte = uap->nbyte;
+	oargs.offset = uap->offset;
+	return (pwrite(td, &oargs));
+}
+
 /*
- * Gather write system call
+ * Gather write system call.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct writev_args {
@@ -373,9 +375,6 @@
 	u_int	iovcnt;
 };
 #endif
-/*
- * MPSAFE
- */
 int
 writev(struct thread *td, struct writev_args *uap)
 {
@@ -398,14 +397,14 @@
 
 	error = fget_write(td, fd, &fp);
 	if (error)
-		return (EBADF);	/* XXX this can't be right */
+		return (error);
 	error = dofilewrite(td, fd, fp, auio, (off_t)-1, 0);
 	fdrop(fp, td);
 	return (error);
 }
 
 /*
- * Gather positioned write system call
+ * Gather positioned write system call.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct pwritev_args {
@@ -415,9 +414,6 @@
 	off_t	offset;
 };
 #endif
-/*
- * MPSAFE
- */
 int
 pwritev(struct thread *td, struct pwritev_args *uap)
 {
@@ -444,7 +440,7 @@
 
 	error = fget_write(td, fd, &fp);
 	if (error)
-		return (EBADF);	/* XXX this can't be right */
+		return (error);
 	if (!(fp->f_ops->fo_flags & DFLAG_SEEKABLE))
 		error = ESPIPE;
 	else if (offset < 0 && fp->f_vnode->v_type != VCHR)
@@ -506,9 +502,6 @@
 	return (error);
 }
 
-/*
- * Ioctl system call
- */
 #ifndef _SYS_SYSPROTO_H_
 struct ioctl_args {
 	int	fd;
@@ -516,20 +509,14 @@
 	caddr_t	data;
 };
 #endif
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 ioctl(struct thread *td, struct ioctl_args *uap)
 {
-	struct file *fp;
-	struct filedesc *fdp;
 	u_long com;
-	int error = 0;
+	int arg, error;
 	u_int size;
-	caddr_t data, memp;
-	int tmp;
+	caddr_t data;
 
 	if (uap->com > 0xffffffff) {
 		printf(
@@ -537,27 +524,7 @@
 		    td->td_proc->p_pid, td->td_proc->p_comm, uap->com);
 		uap->com &= 0xffffffff;
 	}
-	if ((error = fget(td, uap->fd, &fp)) != 0)
-		return (error);
-	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
-		fdrop(fp, td);
-		return (EBADF);
-	}
-	fdp = td->td_proc->p_fd;
-	switch (com = uap->com) {
-	case FIONCLEX:
-		FILEDESC_LOCK_FAST(fdp);
-		fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
-		FILEDESC_UNLOCK_FAST(fdp);
-		fdrop(fp, td);
-		return (0);
-	case FIOCLEX:
-		FILEDESC_LOCK_FAST(fdp);
-		fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
-		FILEDESC_UNLOCK_FAST(fdp);
-		fdrop(fp, td);
-		return (0);
-	}
+	com = uap->com;
 
 	/*
 	 * Interpret high order word to find amount of data to be
@@ -571,23 +538,25 @@
 #else
 	    ((com & (IOC_IN | IOC_OUT)) && size == 0) ||
 #endif
-	    ((com & IOC_VOID) && size > 0)) {
-		fdrop(fp, td);
+	    ((com & IOC_VOID) && size > 0 && size != sizeof(int)))
 		return (ENOTTY);
-	}
 
 	if (size > 0) {
-		memp = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
-		data = memp;
-	} else {
-		memp = NULL;
+		if (!(com & IOC_VOID))
+			data = malloc((u_long)size, M_IOCTLOPS, M_WAITOK);
+		else {
+			/* Integer argument. */
+			arg = (intptr_t)uap->data;
+			data = (void *)&arg;
+			size = 0;
+		}
+	} else
 		data = (void *)&uap->data;
-	}
 	if (com & IOC_IN) {
 		error = copyin(uap->data, data, (u_int)size);
 		if (error) {
-			free(memp, M_IOCTLOPS);
-			fdrop(fp, td);
+			if (size > 0)
+				free(data, M_IOCTLOPS);
 			return (error);
 		}
 	} else if (com & IOC_OUT) {
@@ -598,7 +567,43 @@
 		bzero(data, size);
 	}
 
-	if (com == FIONBIO) {
+	error = kern_ioctl(td, uap->fd, com, data);
+
+	if (error == 0 && (com & IOC_OUT))
+		error = copyout(data, uap->data, (u_int)size);
+
+	if (size > 0)
+		free(data, M_IOCTLOPS);
+	return (error);
+}
+
+int
+kern_ioctl(struct thread *td, int fd, u_long com, caddr_t data)
+{
+	struct file *fp;
+	struct filedesc *fdp;
+	int error;
+	int tmp;
+
+	if ((error = fget(td, fd, &fp)) != 0)
+		return (error);
+	if ((fp->f_flag & (FREAD | FWRITE)) == 0) {
+		fdrop(fp, td);
+		return (EBADF);
+	}
+	fdp = td->td_proc->p_fd;
+	switch (com) {
+	case FIONCLEX:
+		FILEDESC_XLOCK(fdp);
+		fdp->fd_ofileflags[fd] &= ~UF_EXCLOSE;
+		FILEDESC_XUNLOCK(fdp);
+		goto out;
+	case FIOCLEX:
+		FILEDESC_XLOCK(fdp);
+		fdp->fd_ofileflags[fd] |= UF_EXCLOSE;
+		FILEDESC_XUNLOCK(fdp);
+		goto out;
+	case FIONBIO:
 		FILE_LOCK(fp);
 		if ((tmp = *(int *)data))
 			fp->f_flag |= FNONBLOCK;
@@ -606,7 +611,8 @@
 			fp->f_flag &= ~FNONBLOCK;
 		FILE_UNLOCK(fp);
 		data = (void *)&tmp;
-	} else if (com == FIOASYNC) {
+		break;
+	case FIOASYNC:
 		FILE_LOCK(fp);
 		if ((tmp = *(int *)data))
 			fp->f_flag |= FASYNC;
@@ -614,15 +620,11 @@
 			fp->f_flag &= ~FASYNC;
 		FILE_UNLOCK(fp);
 		data = (void *)&tmp;
+		break;
 	}
 
 	error = fo_ioctl(fp, com, data, td->td_ucred, td);
-
-	if (error == 0 && (com & IOC_OUT))
-		error = copyout(data, uap->data, (u_int)size);
-
-	if (memp != NULL)
-		free(memp, M_IOCTLOPS);
+out:
 	fdrop(fp, td);
 	return (error);
 }
@@ -635,9 +637,6 @@
 u_int		nselcoll;	/* Select collisions since boot */
 SYSCTL_UINT(_kern, OID_AUTO, nselcoll, CTLFLAG_RD, &nselcoll, 0, "");
 
-/*
- * Select system call.
- */
 #ifndef _SYS_SYSPROTO_H_
 struct select_args {
 	int	nd;
@@ -645,9 +644,6 @@
 	struct	timeval *tv;
 };
 #endif
-/*
- * MPSAFE
- */
 int
 select(td, uap)
 	register struct thread *td;
@@ -688,11 +684,10 @@
 		return (EINVAL);
 	fdp = td->td_proc->p_fd;
 	
-	FILEDESC_LOCK_FAST(fdp);
-
+	FILEDESC_SLOCK(fdp);
 	if (nd > td->td_proc->p_fd->fd_nfiles)
 		nd = td->td_proc->p_fd->fd_nfiles;   /* forgiving; slightly wrong */
-	FILEDESC_UNLOCK_FAST(fdp);
+	FILEDESC_SUNLOCK(fdp);
 
 	/*
 	 * Allocate just enough bits for the non-null fd_sets.  Use the
@@ -755,9 +750,9 @@
 	mtx_lock(&sellock);
 retry:
 	ncoll = nselcoll;
-	mtx_lock_spin(&sched_lock);
+	thread_lock(td);
 	td->td_flags |= TDF_SELECT;
-	mtx_unlock_spin(&sched_lock);
+	thread_unlock(td);
 	mtx_unlock(&sellock);
 
 	error = selscan(td, ibits, obits, nd);
@@ -780,12 +775,12 @@
 	 * collisions and rescan the file descriptors if
 	 * necessary.
 	 */
-	mtx_lock_spin(&sched_lock);
+	thread_lock(td);
 	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
-		mtx_unlock_spin(&sched_lock);
+		thread_unlock(td);
 		goto retry;
 	}
-	mtx_unlock_spin(&sched_lock);
+	thread_unlock(td);
 
 	if (timo > 0)
 		error = cv_timedwait_sig(&selwait, &sellock, timo);
@@ -797,9 +792,9 @@
 
 done:
 	clear_selinfo_list(td);
-	mtx_lock_spin(&sched_lock);
+	thread_lock(td);
 	td->td_flags &= ~TDF_SELECT;
-	mtx_unlock_spin(&sched_lock);
+	thread_unlock(td);
 	mtx_unlock(&sellock);
 
 done_nosellock:
@@ -839,7 +834,7 @@
 	static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND };
 	struct filedesc *fdp = td->td_proc->p_fd;
 
-	FILEDESC_LOCK(fdp);
+	FILEDESC_SLOCK(fdp);
 	for (msk = 0; msk < 3; msk++) {
 		if (ibits[msk] == NULL)
 			continue;
@@ -850,7 +845,7 @@
 				if (!(bits & 1))
 					continue;
 				if ((fp = fget_locked(fdp, fd)) == NULL) {
-					FILEDESC_UNLOCK(fdp);
+					FILEDESC_SUNLOCK(fdp);
 					return (EBADF);
 				}
 				if (fo_poll(fp, flag[msk], td->td_ucred,
@@ -862,14 +857,11 @@
 			}
 		}
 	}
-	FILEDESC_UNLOCK(fdp);
+	FILEDESC_SUNLOCK(fdp);
 	td->td_retval[0] = n;
 	return (0);
 }
 
-/*
- * Poll system call.
- */
 #ifndef _SYS_SYSPROTO_H_
 struct poll_args {
 	struct pollfd *fds;
@@ -877,9 +869,6 @@
 	int	timeout;
 };
 #endif
-/*
- * MPSAFE
- */
 int
 poll(td, uap)
 	struct thread *td;
@@ -935,9 +924,9 @@
 	mtx_lock(&sellock);
 retry:
 	ncoll = nselcoll;
-	mtx_lock_spin(&sched_lock);
+	thread_lock(td);
 	td->td_flags |= TDF_SELECT;
-	mtx_unlock_spin(&sched_lock);
+	thread_unlock(td);
 	mtx_unlock(&sellock);
 
 	error = pollscan(td, bits, nfds);
@@ -958,12 +947,12 @@
 	 * sellock, so check TDF_SELECT and the number of collisions
 	 * and rescan the file descriptors if necessary.
 	 */
-	mtx_lock_spin(&sched_lock);
+	thread_lock(td);
 	if ((td->td_flags & TDF_SELECT) == 0 || nselcoll != ncoll) {
-		mtx_unlock_spin(&sched_lock);
+		thread_unlock(td);
 		goto retry;
 	}
-	mtx_unlock_spin(&sched_lock);
+	thread_unlock(td);
 
 	if (timo > 0)
 		error = cv_timedwait_sig(&selwait, &sellock, timo);
@@ -975,9 +964,9 @@
 
 done:
 	clear_selinfo_list(td);
-	mtx_lock_spin(&sched_lock);
+	thread_lock(td);
 	td->td_flags &= ~TDF_SELECT;
-	mtx_unlock_spin(&sched_lock);
+	thread_unlock(td);
 	mtx_unlock(&sellock);
 
 done_nosellock:
@@ -1009,7 +998,7 @@
 	struct file *fp;
 	int n = 0;
 
-	FILEDESC_LOCK(fdp);
+	FILEDESC_SLOCK(fdp);
 	for (i = 0; i < nfd; i++, fds++) {
 		if (fds->fd >= fdp->fd_nfiles) {
 			fds->revents = POLLNVAL;
@@ -1033,13 +1022,14 @@
 			}
 		}
 	}
-	FILEDESC_UNLOCK(fdp);
+	FILEDESC_SUNLOCK(fdp);
 	td->td_retval[0] = n;
 	return (0);
 }
 
 /*
  * OpenBSD poll system call.
+ *
  * XXX this isn't quite a true representation..  OpenBSD uses select ops.
  */
 #ifndef _SYS_SYSPROTO_H_
@@ -1049,9 +1039,6 @@
 	int	timeout;
 };
 #endif
-/*
- * MPSAFE
- */
 int
 openbsd_poll(td, uap)
 	register struct thread *td;
@@ -1061,12 +1048,12 @@
 }
 
 /*
- * Remove the references to the thread from all of the objects
- * we were polling.
+ * Remove the references to the thread from all of the objects we were
+ * polling.
  *
- * This code assumes that the underlying owner of the selinfo
- * structure will hold sellock before it changes it, and that
- * it will unlink itself from our list if it goes away.
+ * This code assumes that the underlying owner of the selinfo structure will
+ * hold sellock before it changes it, and that it will unlink itself from our
+ * list if it goes away.
  */
 void
 clear_selinfo_list(td)
@@ -1150,9 +1137,9 @@
 	}
 	TAILQ_REMOVE(&td->td_selq, sip, si_thrlist);
 	sip->si_thread = NULL;
-	mtx_lock_spin(&sched_lock);
+	thread_lock(td);
 	td->td_flags &= ~TDF_SELECT;
-	mtx_unlock_spin(&sched_lock);
+	thread_unlock(td);
 	sleepq_remove(td, &selwait);
 	mtx_unlock(&sellock);
 }
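
ioctl() is now split so that the syscall layer only does the copyin/copyout while the new kern_ioctl() performs the descriptor lookup and the FIONCLEX/FIOCLEX/FIONBIO/FIOASYNC handling, which lets in-kernel consumers pass a kernel buffer directly. A hypothetical sketch; the helper name is made up, and kern_ioctl() is assumed to be declared in <sys/syscallsubr.h> as in FreeBSD:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/filio.h>
#include <sys/proc.h>
#include <sys/syscallsubr.h>

/* Put a descriptor owned by 'td' into non-blocking mode from kernel code. */
static int
example_set_nonblock(struct thread *td, int fd)
{
	int on = 1;

	/* FIONBIO takes an int; no user copyin/copyout is needed here. */
	return (kern_ioctl(td, fd, FIONBIO, (caddr_t)&on));
}
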
--- /dev/null
+++ sys/kern/p1003_1b.c
@@ -0,0 +1,321 @@
+/*-
+ * Copyright (c) 1996, 1997, 1998
+ *	HD Associates, Inc.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ *    must display the following acknowledgement:
+ *	This product includes software developed by HD Associates, Inc
+ * 4. Neither the name of the author nor the names of any co-contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY HD ASSOCIATES AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL HD ASSOCIATES OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/* p1003_1b: Real Time common code.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: src/sys/kern/p1003_1b.c,v 1.36 2007/10/08 23:45:23 jeff Exp $");
+
+#include "opt_posix.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/module.h>
+#include <sys/mutex.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/posix4.h>
+#include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
+#include <sys/sysent.h>
+#include <sys/syslog.h>
+#include <sys/sysproto.h>
+
+MALLOC_DEFINE(M_P31B, "p1003.1b", "Posix 1003.1B");
+
+/* The system calls return ENOSYS if an entry is called that is not run-time
+ * supported.  I am also logging since some programs start to use this when
+ * they shouldn't.  That will be removed if annoying.
+ */
+int
+syscall_not_present(struct thread *td, const char *s, struct nosys_args *uap)
+{
+	log(LOG_ERR, "cmd %s pid %d tried to use non-present %s\n",
+			td->td_proc->p_comm, td->td_proc->p_pid, s);
+
+	/* a " return nosys(p, uap); " here causes a core dump.
+	 */
+
+	return ENOSYS;
+}
+
+#if !defined(_KPOSIX_PRIORITY_SCHEDULING)
+
+/* Not configured but loadable via a module:
+ */
+
+static int
+sched_attach(void)
+{
+	return 0;
+}
+
+SYSCALL_NOT_PRESENT_GEN(sched_setparam)
+SYSCALL_NOT_PRESENT_GEN(sched_getparam)
+SYSCALL_NOT_PRESENT_GEN(sched_setscheduler)
+SYSCALL_NOT_PRESENT_GEN(sched_getscheduler)
+SYSCALL_NOT_PRESENT_GEN(sched_yield)
+SYSCALL_NOT_PRESENT_GEN(sched_get_priority_max)
+SYSCALL_NOT_PRESENT_GEN(sched_get_priority_min)
+SYSCALL_NOT_PRESENT_GEN(sched_rr_get_interval)
+#else
+
+/* Configured in kernel version:
+ */
+static struct ksched *ksched;
+
+static int
+sched_attach(void)
+{
+	int ret = ksched_attach(&ksched);
+
+	if (ret == 0)
+		p31b_setcfg(CTL_P1003_1B_PRIORITY_SCHEDULING, 1);
+
+	return ret;
+}
+
+int
+sched_setparam(struct thread *td, struct sched_setparam_args *uap)
+{
+	struct thread *targettd;
+	struct proc *targetp;
+	int e;
+	struct sched_param sched_param;
+
+	e = copyin(uap->param, &sched_param, sizeof(sched_param));
+	if (e)
+		return (e);
+
+	if (uap->pid == 0) {
+		targetp = td->td_proc;
+		targettd = td;
+		PROC_LOCK(targetp);
+	} else {
+		targetp = pfind(uap->pid);
+		if (targetp == NULL)
+			return (ESRCH);
+		targettd = FIRST_THREAD_IN_PROC(targetp);
+	}
+
+	e = p_cansched(td, targetp);
+	if (e == 0) {
+		e = ksched_setparam(ksched, targettd,
+			(const struct sched_param *)&sched_param);
+	}
+	PROC_UNLOCK(targetp);
+	return (e);
+}
+
+int
+sched_getparam(struct thread *td, struct sched_getparam_args *uap)
+{
+	int e;
+	struct sched_param sched_param;
+	struct thread *targettd;
+	struct proc *targetp;
+
+	if (uap->pid == 0) {
+		targetp = td->td_proc;
+		targettd = td;
+		PROC_LOCK(targetp);
+	} else {
+		targetp = pfind(uap->pid);
+		if (targetp == NULL) {
+			return (ESRCH);
+		}
+		targettd = FIRST_THREAD_IN_PROC(targetp); /* XXXKSE */
+	}
+
+	e = p_cansee(td, targetp);
+	if (e == 0) {
+		e = ksched_getparam(ksched, targettd, &sched_param);
+	}
+	PROC_UNLOCK(targetp);
+	if (e == 0)
+		e = copyout(&sched_param, uap->param, sizeof(sched_param));
+	return (e);
+}
+
+int
+sched_setscheduler(struct thread *td, struct sched_setscheduler_args *uap)
+{
+	int e;
+	struct sched_param sched_param;
+	struct thread *targettd;
+	struct proc *targetp;
+
+	/* Don't allow non root user to set a scheduler policy. */
+	e = priv_check(td, PRIV_SCHED_SET);
+	if (e)
+		return (e);
+
+	e = copyin(uap->param, &sched_param, sizeof(sched_param));
+	if (e)
+		return (e);
+
+	if (uap->pid == 0) {
+		targetp = td->td_proc;
+		targettd = td;
+		PROC_LOCK(targetp);
+	} else {
+		targetp = pfind(uap->pid);
+		if (targetp == NULL)
+			return (ESRCH);
+		targettd = FIRST_THREAD_IN_PROC(targetp);
+	}
+
+	e = p_cansched(td, targetp);
+	if (e == 0) {
+		e = ksched_setscheduler(ksched, targettd,
+			uap->policy, (const struct sched_param *)&sched_param);
+	}
+	PROC_UNLOCK(targetp);
+	return (e);
+}
+
+int
+sched_getscheduler(struct thread *td, struct sched_getscheduler_args *uap)
+{
+	int e, policy;
+	struct thread *targettd;
+	struct proc *targetp;
+
+	if (uap->pid == 0) {
+		targetp = td->td_proc;
+		targettd = td;
+		PROC_LOCK(targetp);
+	} else {
+		targetp = pfind(uap->pid);
+		if (targetp == NULL) {
+			e = ESRCH;
+			goto done2;
+		}
+		targettd = FIRST_THREAD_IN_PROC(targetp); /* XXXKSE */
+	}
+
+	e = p_cansee(td, targetp);
+	if (e == 0) {
+		e = ksched_getscheduler(ksched, targettd, &policy);
+		td->td_retval[0] = policy;
+	}
+	PROC_UNLOCK(targetp);
+
+done2:
+	return (e);
+}
+
+int
+sched_yield(struct thread *td, struct sched_yield_args *uap)
+{
+
+	sched_relinquish(curthread);
+	return 0;
+}
+
+int
+sched_get_priority_max(struct thread *td,
+    struct sched_get_priority_max_args *uap)
+{
+	int error, prio;
+
+	error = ksched_get_priority_max(ksched, uap->policy, &prio);
+	td->td_retval[0] = prio;
+	return (error);
+}
+
+int
+sched_get_priority_min(struct thread *td,
+    struct sched_get_priority_min_args *uap)
+{
+	int error, prio;
+
+	error = ksched_get_priority_min(ksched, uap->policy, &prio);
+	td->td_retval[0] = prio;
+	return (error);
+}
+
+int
+sched_rr_get_interval(struct thread *td,
+    struct sched_rr_get_interval_args *uap)
+{
+	struct timespec timespec;
+	int error;
+
+	error = kern_sched_rr_get_interval(td, uap->pid, &timespec);
+	if (error == 0)
+		error = copyout(&timespec, uap->interval, sizeof(timespec));
+	return (error);
+}
+
+int
+kern_sched_rr_get_interval(struct thread *td, pid_t pid,
+    struct timespec *ts)
+{
+	int e;
+	struct thread *targettd;
+	struct proc *targetp;
+
+	if (pid == 0) {
+		targettd = td;
+		targetp = td->td_proc;
+		PROC_LOCK(targetp);
+	} else {
+		targetp = td->td_proc;
+		PROC_LOCK(targetp);
+		targettd = thread_find(targetp, pid);
+		if (targettd == NULL) {
+			PROC_UNLOCK(targetp);
+			return (ESRCH);
+		}
+	}
+
+	e = p_cansee(td, targetp);
+	if (e == 0)
+		e = ksched_rr_get_interval(ksched, targettd, ts);
+	PROC_UNLOCK(targetp);
+	return (e);
+}
+
+#endif
+
+static void
+p31binit(void *notused)
+{
+	(void) sched_attach();
+	p31b_setcfg(CTL_P1003_1B_PAGESIZE, PAGE_SIZE);
+}
+
+SYSINIT(p31b, SI_SUB_P1003_1B, SI_ORDER_FIRST, p31binit, NULL);
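
p1003_1b.c now routes the POSIX scheduling syscalls through priv_check()/p_cansched()/p_cansee() before handing them to the ksched backend. A small userspace sketch exercising two of them; the policy and priority are arbitrary, and sched_setscheduler() fails without the privilege checked via PRIV_SCHED_SET:

#include <sched.h>
#include <stdio.h>
#include <time.h>

int
main(void)
{
	struct sched_param sp = { .sched_priority = 10 };
	struct timespec ts;

	if (sched_setscheduler(0, SCHED_RR, &sp) == -1)
		perror("sched_setscheduler");
	if (sched_rr_get_interval(0, &ts) == 0)
		printf("RR quantum: %ld.%09ld s\n",
		    (long)ts.tv_sec, (long)ts.tv_nsec);
	return (0);
}
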
Index: tty_tty.c
===================================================================
RCS file: /home/cvs/src/sys/kern/tty_tty.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/tty_tty.c -L sys/kern/tty_tty.c -u -r1.1.1.1 -r1.2
--- sys/kern/tty_tty.c
+++ sys/kern/tty_tty.c
@@ -24,15 +24,19 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/tty_tty.c,v 1.56.2.1 2005/08/13 21:24:16 rwatson Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/tty_tty.c,v 1.60 2007/07/03 17:46:37 kib Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/conf.h>
 #include <sys/kernel.h>
 #include <sys/proc.h>
+#include <sys/sx.h>
 #include <sys/vnode.h>
 
+#include <fs/devfs/devfs.h>
+#include <fs/devfs/devfs_int.h>
+
 static	d_open_t	cttyopen;
 
 static struct cdevsw ctty_cdevsw = {
@@ -60,13 +64,25 @@
 		return;
 	if (strcmp(name, "tty"))
 		return;
+	sx_sunlock(&clone_drain_lock);
+	mtx_lock(&Giant);
+	sx_slock(&proctree_lock);
+	sx_slock(&clone_drain_lock);
+	dev_lock();
 	if (!(curthread->td_proc->p_flag & P_CONTROLT))
 		*dev = ctty;
 	else if (curthread->td_proc->p_session->s_ttyvp == NULL)
 		*dev = ctty;
-	else
+	else if (curthread->td_proc->p_session->s_ttyvp->v_type == VBAD ||
+	    curthread->td_proc->p_session->s_ttyvp->v_rdev == NULL) {
+		/* e.g. s_ttyvp was revoked */
+		*dev = ctty;
+	} else
 		*dev = curthread->td_proc->p_session->s_ttyvp->v_rdev;
-	dev_ref(*dev);
+	dev_refl(*dev);
+	dev_unlock();
+	sx_sunlock(&proctree_lock);
+	mtx_unlock(&Giant);
 }
 
 static void
Index: inflate.c
===================================================================
RCS file: /home/cvs/src/sys/kern/inflate.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/inflate.c -L sys/kern/inflate.c -u -r1.1.1.1 -r1.2
--- sys/kern/inflate.c
+++ sys/kern/inflate.c
@@ -9,7 +9,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/inflate.c,v 1.19 2003/06/11 00:56:54 obrien Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/inflate.c,v 1.20 2005/10/31 15:41:25 rwatson Exp $");
 
 #include <sys/param.h>
 #include <sys/inflate.h>
@@ -20,7 +20,7 @@
 #include <sys/malloc.h>
 
 #ifdef _KERNEL
-static MALLOC_DEFINE(M_GZIP, "Gzip trees", "Gzip trees");
+static MALLOC_DEFINE(M_GZIP, "gzip_trees", "Gzip trees");
 #endif
 
 /* needed to make inflate() work */
Index: vfs_vnops.c
===================================================================
RCS file: /home/cvs/src/sys/kern/vfs_vnops.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/vfs_vnops.c -L sys/kern/vfs_vnops.c -u -r1.2 -r1.3
--- sys/kern/vfs_vnops.c
+++ sys/kern/vfs_vnops.c
@@ -35,7 +35,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/vfs_vnops.c,v 1.233.2.1 2006/03/13 03:06:44 jeff Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/vfs_vnops.c,v 1.252 2007/07/26 16:58:09 pjd Exp $");
 
 #include "opt_mac.h"
 
@@ -45,10 +45,10 @@
 #include <sys/file.h>
 #include <sys/kdb.h>
 #include <sys/stat.h>
+#include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
-#include <sys/mac.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
@@ -62,6 +62,8 @@
 #include <sys/syslog.h>
 #include <sys/unistd.h>
 
+#include <security/mac/mac_framework.h>
+
 static fo_rdwr_t	vn_read;
 static fo_rdwr_t	vn_write;
 static fo_ioctl_t	vn_ioctl;
@@ -82,13 +84,14 @@
 };
 
 int
-vn_open(ndp, flagp, cmode, fdidx)
+vn_open(ndp, flagp, cmode, fp)
 	struct nameidata *ndp;
-	int *flagp, cmode, fdidx;
+	int *flagp, cmode;
+	struct file *fp;
 {
 	struct thread *td = ndp->ni_cnd.cn_thread;
 
-	return (vn_open_cred(ndp, flagp, cmode, td->td_ucred, fdidx));
+	return (vn_open_cred(ndp, flagp, cmode, td->td_ucred, fp));
 }
 
 /*
@@ -99,11 +102,11 @@
  * due to the NDINIT being done elsewhere.
  */
 int
-vn_open_cred(ndp, flagp, cmode, cred, fdidx)
+vn_open_cred(ndp, flagp, cmode, cred, fp)
 	struct nameidata *ndp;
 	int *flagp, cmode;
 	struct ucred *cred;
-	int fdidx;
+	struct file *fp;
 {
 	struct vnode *vp;
 	struct mount *mp;
@@ -111,21 +114,24 @@
 	struct vattr vat;
 	struct vattr *vap = &vat;
 	int mode, fmode, error;
-	int vfslocked;
+	int vfslocked, mpsafe;
 
+	mpsafe = ndp->ni_cnd.cn_flags & MPSAFE;
 restart:
 	vfslocked = 0;
 	fmode = *flagp;
 	if (fmode & O_CREAT) {
 		ndp->ni_cnd.cn_nameiop = CREATE;
-		ndp->ni_cnd.cn_flags = ISOPEN | LOCKPARENT | LOCKLEAF | MPSAFE;
+		ndp->ni_cnd.cn_flags = ISOPEN | LOCKPARENT | LOCKLEAF |
+		    MPSAFE | AUDITVNODE1;
 		if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0)
 			ndp->ni_cnd.cn_flags |= FOLLOW;
 		bwillwrite();
 		if ((error = namei(ndp)) != 0)
 			return (error);
-		vfslocked = (ndp->ni_cnd.cn_flags & GIANTHELD) != 0;
-		ndp->ni_cnd.cn_flags &= ~MPSAFE;
+		vfslocked = NDHASGIANT(ndp);
+		if (!mpsafe)
+			ndp->ni_cnd.cn_flags &= ~MPSAFE;
 		if (ndp->ni_vp == NULL) {
 			VATTR_NULL(vap);
 			vap->va_type = VREG;
@@ -178,11 +184,12 @@
 		ndp->ni_cnd.cn_nameiop = LOOKUP;
 		ndp->ni_cnd.cn_flags = ISOPEN |
 		    ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) |
-		    LOCKSHARED | LOCKLEAF | MPSAFE;
+		    LOCKLEAF | MPSAFE | AUDITVNODE1;
 		if ((error = namei(ndp)) != 0)
 			return (error);
-		ndp->ni_cnd.cn_flags &= ~MPSAFE;
-		vfslocked = (ndp->ni_cnd.cn_flags & GIANTHELD) != 0;
+		if (!mpsafe)
+			ndp->ni_cnd.cn_flags &= ~MPSAFE;
+		vfslocked = NDHASGIANT(ndp);
 		vp = ndp->ni_vp;
 	}
 	if (vp->v_type == VLNK) {
@@ -222,14 +229,14 @@
 				goto bad;
 		}
 	}
-	if ((error = VOP_OPEN(vp, fmode, cred, td, fdidx)) != 0)
+	if ((error = VOP_OPEN(vp, fmode, cred, td, fp)) != 0)
 		goto bad;
 
 	if (fmode & FWRITE)
 		vp->v_writecount++;
 	*flagp = fmode;
-	ASSERT_VOP_LOCKED(vp, "vn_open_cred");
-	if (fdidx == -1)
+	ASSERT_VOP_ELOCKED(vp, "vn_open_cred");
+	if (!mpsafe)
 		VFS_UNLOCK_GIANT(vfslocked);
 	return (0);
 bad:
@@ -279,8 +286,11 @@
 
 	vn_start_write(vp, &mp, V_WAIT);
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
-	if (flags & FWRITE)
+	if (flags & FWRITE) {
+		VNASSERT(vp->v_writecount > 0, vp, 
+		    ("vn_close: negative writecount"));
 		vp->v_writecount--;
+	}
 	error = VOP_CLOSE(vp, flags, file_cred, td);
 	vput(vp);
 	vn_finished_write(mp);
@@ -327,7 +337,7 @@
     aresid, td)
 	enum uio_rw rw;
 	struct vnode *vp;
-	caddr_t base;
+	void *base;
 	int len;
 	off_t offset;
 	enum uio_seg segflg;
@@ -400,7 +410,7 @@
 		if (auio.uio_resid && error == 0)
 			error = EIO;
 	if ((ioflg & IO_NODELOCKED) == 0) {
-		if (rw == UIO_WRITE)
+		if (rw == UIO_WRITE && vp->v_type != VCHR)
 			vn_finished_write(mp);
 		VOP_UNLOCK(vp, 0, td);
 	}
@@ -420,7 +430,7 @@
     file_cred, aresid, td)
 	enum uio_rw rw;
 	struct vnode *vp;
-	caddr_t base;
+	void *base;
 	size_t len;
 	off_t offset;
 	enum uio_seg segflg;
@@ -457,7 +467,7 @@
 		if (error)
 			break;
 		offset += chunk;
-		base += chunk;
+		base = (char *)base + chunk;
 		uio_yield();
 	} while (len);
 	if (aresid)
@@ -491,11 +501,18 @@
 	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	VOP_LEASE(vp, td, fp->f_cred, LEASE_READ);
 	/*
-	 * According to McKusick the vn lock is protecting f_offset here.
-	 * Once this field has it's own lock we can acquire this shared.
+	 * According to McKusick the vn lock was protecting f_offset here.
+	 * It is now protected by the FOFFSET_LOCKED flag.
 	 */
 	if ((flags & FOF_OFFSET) == 0) {
-		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+		FILE_LOCK(fp);
+		while(fp->f_vnread_flags & FOFFSET_LOCKED) {
+			fp->f_vnread_flags |= FOFFSET_LOCK_WAITING;
+			msleep(&fp->f_vnread_flags,fp->f_mtxp,PUSER -1,"vnread offlock",0);
+		}
+		fp->f_vnread_flags |= FOFFSET_LOCKED;
+		FILE_UNLOCK(fp);
+		vn_lock(vp, LK_SHARED | LK_RETRY, td);
 		uio->uio_offset = fp->f_offset;
 	} else
 		vn_lock(vp, LK_SHARED | LK_RETRY, td);
@@ -507,8 +524,14 @@
 	if (error == 0)
 #endif
 		error = VOP_READ(vp, uio, ioflag, fp->f_cred);
-	if ((flags & FOF_OFFSET) == 0)
+	if ((flags & FOF_OFFSET) == 0) {
 		fp->f_offset = uio->uio_offset;
+		FILE_LOCK(fp);
+		if (fp->f_vnread_flags & FOFFSET_LOCK_WAITING)
+			wakeup(&fp->f_vnread_flags);
+		fp->f_vnread_flags = 0;
+		FILE_UNLOCK(fp);
+	}
 	fp->f_nextoff = uio->uio_offset;
 	VOP_UNLOCK(vp, 0, td);
 	VFS_UNLOCK_GIANT(vfslocked);
@@ -565,7 +588,8 @@
 		fp->f_offset = uio->uio_offset;
 	fp->f_nextoff = uio->uio_offset;
 	VOP_UNLOCK(vp, 0, td);
-	vn_finished_write(mp);
+	if (vp->v_type != VCHR)
+		vn_finished_write(mp);
 unlock:
 	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
@@ -690,17 +714,12 @@
 	sb->st_blksize = PAGE_SIZE;
 	
 	sb->st_flags = vap->va_flags;
-	if (suser(td))
+	if (priv_check(td, PRIV_VFS_GENERATION))
 		sb->st_gen = 0;
 	else
 		sb->st_gen = vap->va_gen;
 
-#if (S_BLKSIZE == 512)
-	/* Optimize this case */
-	sb->st_blocks = vap->va_bytes >> 9;
-#else
 	sb->st_blocks = vap->va_bytes / S_BLKSIZE;
-#endif
 	return (0);
 }
 
@@ -757,11 +776,11 @@
 	struct thread *td;
 {
 	struct vnode *vp;
+	int vfslocked;
 	int error;
 
-	mtx_lock(&Giant);
-
 	vp = fp->f_vnode;
+	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 #ifdef MAC
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 	error = mac_check_vnode_poll(active_cred, fp->f_cred, vp);
@@ -770,7 +789,7 @@
 #endif
 
 	error = VOP_POLL(vp, events, fp->f_cred, td);
-	mtx_unlock(&Giant);
+	VFS_UNLOCK_GIANT(vfslocked);
 	return (error);
 }
 
@@ -779,10 +798,7 @@
  * acquire requested lock.
  */
 int
-vn_lock(vp, flags, td)
-	struct vnode *vp;
-	int flags;
-	struct thread *td;
+_vn_lock(struct vnode *vp, int flags, struct thread *td, char *file, int line)
 {
 	int error;
 
@@ -805,7 +821,7 @@
 		 * lockmgr drops interlock before it will return for
 		 * any reason.  So force the code above to relock it.
 		 */
-		error = VOP_LOCK(vp, flags | LK_INTERLOCK, td);
+		error = VOP_LOCK1(vp, flags | LK_INTERLOCK, td, file, line);
 		flags &= ~LK_INTERLOCK;
 		KASSERT((flags & LK_RETRY) == 0 || error == 0,
 		    ("LK_RETRY set with incompatible flags %d\n", flags));
@@ -844,7 +860,7 @@
 		lf.l_start = 0;
 		lf.l_len = 0;
 		lf.l_type = F_UNLCK;
-		(void) VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK);
+		(void) VOP_ADVLOCK(vp, fp, F_UNLCK, &lf, F_FLOCK);
 	}
 
 	fp->f_ops = &badfileops;
@@ -885,6 +901,8 @@
 	if ((mp = *mpp) == NULL)
 		return (0);
 	MNT_ILOCK(mp);
+	if (vp == NULL)
+		MNT_REF(mp);
 	/*
 	 * Check on status of suspension.
 	 */
@@ -902,6 +920,7 @@
 		goto unlock;
 	mp->mnt_writeopcount++;
 unlock:
+	MNT_REL(mp);
 	MNT_IUNLOCK(mp);
 	return (error);
 }
@@ -935,19 +954,25 @@
 	if (mp == NULL)
 		return (0);
 	MNT_ILOCK(mp);
+	if (vp == NULL)
+		MNT_REF(mp);
 	if ((mp->mnt_kern_flag & MNTK_SUSPENDED) == 0) {
+		MNT_REL(mp);
 		MNT_IUNLOCK(mp);
 		return (0);
 	}
 	if (flags & V_NOWAIT) {
+		MNT_REL(mp);
 		MNT_IUNLOCK(mp);
 		return (EWOULDBLOCK);
 	}
 	/*
 	 * Wait for the suspension to finish.
 	 */
-	return (msleep(&mp->mnt_flag, MNT_MTX(mp),
-	    (PUSER - 1) | (flags & PCATCH) | PDROP, "suspfs", 0));
+	error = msleep(&mp->mnt_flag, MNT_MTX(mp),
+	    (PUSER - 1) | (flags & PCATCH) | PDROP, "suspfs", 0);
+	vfs_rel(mp);
+	return (error);
 }
 
 /*
@@ -982,13 +1007,17 @@
 	if ((mp = *mpp) == NULL)
 		return (0);
 	MNT_ILOCK(mp);
+	if (vp == NULL)
+		MNT_REF(mp);
 	if ((mp->mnt_kern_flag & (MNTK_SUSPENDED | MNTK_SUSPEND2)) == 0) {
 		mp->mnt_secondary_writes++;
 		mp->mnt_secondary_accwrites++;
+		MNT_REL(mp);
 		MNT_IUNLOCK(mp);
 		return (0);
 	}
 	if (flags & V_NOWAIT) {
+		MNT_REL(mp);
 		MNT_IUNLOCK(mp);
 		return (EWOULDBLOCK);
 	}
@@ -997,6 +1026,7 @@
 	 */
 	error = msleep(&mp->mnt_flag, MNT_MTX(mp),
 		       (PUSER - 1) | (flags & PCATCH) | PDROP, "suspfs", 0);
+	vfs_rel(mp);
 	if (error == 0)
 		goto retry;
 	return (error);
@@ -1057,23 +1087,19 @@
 	struct thread *td = curthread;
 	int error;
 
-	error = 0;
 	MNT_ILOCK(mp);
-	if (mp->mnt_kern_flag & MNTK_SUSPEND)
-		goto unlock;
+	if (mp->mnt_kern_flag & MNTK_SUSPEND) {
+		MNT_IUNLOCK(mp);
+		return (0);
+	}
 	mp->mnt_kern_flag |= MNTK_SUSPEND;
 	if (mp->mnt_writeopcount > 0)
 		(void) msleep(&mp->mnt_writeopcount, 
 		    MNT_MTX(mp), (PUSER - 1)|PDROP, "suspwt", 0);
 	else
 		MNT_IUNLOCK(mp);
-	if ((error = VFS_SYNC(mp, MNT_SUSPEND, td)) != 0) {
+	if ((error = VFS_SYNC(mp, MNT_SUSPEND, td)) != 0)
 		vfs_write_resume(mp);
-		return (error);
-	}
-	MNT_ILOCK(mp);
-unlock:
-	MNT_IUNLOCK(mp);
 	return (error);
 }
 
@@ -1101,11 +1127,12 @@
 static int
 vn_kqfilter(struct file *fp, struct knote *kn)
 {
+	int vfslocked;
 	int error;
 
-	mtx_lock(&Giant);
+	vfslocked = VFS_LOCK_GIANT(fp->f_vnode->v_mount);
 	error = VOP_KQFILTER(fp->f_vnode, kn);
-	mtx_unlock(&Giant);
+	VFS_UNLOCK_GIANT(vfslocked);
 
 	return error;
 }
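
vn_poll() and vn_kqfilter() above stop taking Giant unconditionally and instead bracket the vnode operation with VFS_LOCK_GIANT()/VFS_UNLOCK_GIANT(), which acquires Giant only when the vnode's filesystem is not MPSAFE. A hypothetical sketch of that pattern for some other vnode operation; the function name is made up:

#include <sys/param.h>
#include <sys/mount.h>
#include <sys/vnode.h>

/* Run an operation on 'vp', taking Giant only if its filesystem requires it. */
static int
example_vnode_op(struct vnode *vp)
{
	int error, vfslocked;

	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
	error = 0;			/* ... perform the VOP_*() calls here ... */
	VFS_UNLOCK_GIANT(vfslocked);
	return (error);
}
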
--- sys/kern/uipc_socket2.c
+++ /dev/null
@@ -1,1458 +0,0 @@
-/*-
- * Copyright (c) 1982, 1986, 1988, 1990, 1993
- *	The Regents of the University of California.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 4. Neither the name of the University nor the names of its contributors
- *    may be used to endorse or promote products derived from this software
- *    without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- *	@(#)uipc_socket2.c	8.1 (Berkeley) 6/10/93
- */
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/uipc_socket2.c,v 1.147.2.2 2005/11/26 19:30:40 jdp Exp $");
-
-#include "opt_mac.h"
-#include "opt_param.h"
-
-#include <sys/param.h>
-#include <sys/aio.h> /* for aio_swake proto */
-#include <sys/domain.h>
-#include <sys/event.h>
-#include <sys/file.h>	/* for maxfiles */
-#include <sys/kernel.h>
-#include <sys/lock.h>
-#include <sys/mac.h>
-#include <sys/malloc.h>
-#include <sys/mbuf.h>
-#include <sys/mutex.h>
-#include <sys/proc.h>
-#include <sys/protosw.h>
-#include <sys/resourcevar.h>
-#include <sys/signalvar.h>
-#include <sys/socket.h>
-#include <sys/socketvar.h>
-#include <sys/stat.h>
-#include <sys/sysctl.h>
-#include <sys/systm.h>
-
-int	maxsockets;
-
-void (*aio_swake)(struct socket *, struct sockbuf *);
-
-/*
- * Primitive routines for operating on sockets and socket buffers
- */
-
-u_long	sb_max = SB_MAX;
-static	u_long sb_max_adj =
-    SB_MAX * MCLBYTES / (MSIZE + MCLBYTES); /* adjusted sb_max */
-
-static	u_long sb_efficiency = 8;	/* parameter for sbreserve() */
-
-/*
- * Procedures to manipulate state flags of socket
- * and do appropriate wakeups.  Normal sequence from the
- * active (originating) side is that soisconnecting() is
- * called during processing of connect() call,
- * resulting in an eventual call to soisconnected() if/when the
- * connection is established.  When the connection is torn down
- * soisdisconnecting() is called during processing of disconnect() call,
- * and soisdisconnected() is called when the connection to the peer
- * is totally severed.  The semantics of these routines are such that
- * connectionless protocols can call soisconnected() and soisdisconnected()
- * only, bypassing the in-progress calls when setting up a ``connection''
- * takes no time.
- *
- * From the passive side, a socket is created with
- * two queues of sockets: so_incomp for connections in progress
- * and so_comp for connections already made and awaiting user acceptance.
- * As a protocol is preparing incoming connections, it creates a socket
- * structure queued on so_incomp by calling sonewconn().  When the connection
- * is established, soisconnected() is called, and transfers the
- * socket structure to so_comp, making it available to accept().
- *
- * If a socket is closed with sockets on either
- * so_incomp or so_comp, these sockets are dropped.
- *
- * If higher level protocols are implemented in
- * the kernel, the wakeups done here will sometimes
- * cause software-interrupt process scheduling.
- */
-
-void
-soisconnecting(so)
-	register struct socket *so;
-{
-
-	SOCK_LOCK(so);
-	so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
-	so->so_state |= SS_ISCONNECTING;
-	SOCK_UNLOCK(so);
-}
-
-void
-soisconnected(so)
-	struct socket *so;
-{
-	struct socket *head;
-
-	ACCEPT_LOCK();
-	SOCK_LOCK(so);
-	so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
-	so->so_state |= SS_ISCONNECTED;
-	head = so->so_head;
-	if (head != NULL && (so->so_qstate & SQ_INCOMP)) {
-		if ((so->so_options & SO_ACCEPTFILTER) == 0) {
-			SOCK_UNLOCK(so);
-			TAILQ_REMOVE(&head->so_incomp, so, so_list);
-			head->so_incqlen--;
-			so->so_qstate &= ~SQ_INCOMP;
-			TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
-			head->so_qlen++;
-			so->so_qstate |= SQ_COMP;
-			ACCEPT_UNLOCK();
-			sorwakeup(head);
-			wakeup_one(&head->so_timeo);
-		} else {
-			ACCEPT_UNLOCK();
-			so->so_upcall =
-			    head->so_accf->so_accept_filter->accf_callback;
-			so->so_upcallarg = head->so_accf->so_accept_filter_arg;
-			so->so_rcv.sb_flags |= SB_UPCALL;
-			so->so_options &= ~SO_ACCEPTFILTER;
-			SOCK_UNLOCK(so);
-			so->so_upcall(so, so->so_upcallarg, M_DONTWAIT);
-		}
-		return;
-	}
-	SOCK_UNLOCK(so);
-	ACCEPT_UNLOCK();
-	wakeup(&so->so_timeo);
-	sorwakeup(so);
-	sowwakeup(so);
-}
-
-void
-soisdisconnecting(so)
-	register struct socket *so;
-{
-
-	/*
-	 * XXXRW: This code assumes that SOCK_LOCK(so) and
-	 * SOCKBUF_LOCK(&so->so_rcv) are the same.
-	 */
-	SOCKBUF_LOCK(&so->so_rcv);
-	so->so_state &= ~SS_ISCONNECTING;
-	so->so_state |= SS_ISDISCONNECTING;
-	so->so_rcv.sb_state |= SBS_CANTRCVMORE;
-	sorwakeup_locked(so);
-	SOCKBUF_LOCK(&so->so_snd);
-	so->so_snd.sb_state |= SBS_CANTSENDMORE;
-	sowwakeup_locked(so);
-	wakeup(&so->so_timeo);
-}
-
-void
-soisdisconnected(so)
-	register struct socket *so;
-{
-
-	/*
-	 * XXXRW: This code assumes that SOCK_LOCK(so) and
-	 * SOCKBUF_LOCK(&so->so_rcv) are the same.
-	 */
-	SOCKBUF_LOCK(&so->so_rcv);
-	so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
-	so->so_state |= SS_ISDISCONNECTED;
-	so->so_rcv.sb_state |= SBS_CANTRCVMORE;
-	sorwakeup_locked(so);
-	SOCKBUF_LOCK(&so->so_snd);
-	so->so_snd.sb_state |= SBS_CANTSENDMORE;
-	sbdrop_locked(&so->so_snd, so->so_snd.sb_cc);
-	sowwakeup_locked(so);
-	wakeup(&so->so_timeo);
-}
-
-/*
- * When an attempt at a new connection is noted on a socket
- * which accepts connections, sonewconn is called.  If the
- * connection is possible (subject to space constraints, etc.)
- * then we allocate a new structure, properly linked into the
- * data structure of the original socket, and return this.
- * Connstatus may be 0, or SO_ISCONFIRMING, or SO_ISCONNECTED.
- *
- * note: the ref count on the socket is 0 on return
- */
-struct socket *
-sonewconn(head, connstatus)
-	register struct socket *head;
-	int connstatus;
-{
-	register struct socket *so;
-	int over;
-
-	ACCEPT_LOCK();
-	over = (head->so_qlen > 3 * head->so_qlimit / 2);
-	ACCEPT_UNLOCK();
-	if (over)
-		return (NULL);
-	so = soalloc(M_NOWAIT);
-	if (so == NULL)
-		return (NULL);
-	if ((head->so_options & SO_ACCEPTFILTER) != 0)
-		connstatus = 0;
-	so->so_head = head;
-	so->so_type = head->so_type;
-	so->so_options = head->so_options &~ SO_ACCEPTCONN;
-	so->so_linger = head->so_linger;
-	so->so_state = head->so_state | SS_NOFDREF;
-	so->so_proto = head->so_proto;
-	so->so_cred = crhold(head->so_cred);
-
-#ifdef MAC
-	SOCK_LOCK(head);
-	mac_create_socket_from_socket(head, so);
-	SOCK_UNLOCK(head);
-#endif
-	knlist_init(&so->so_rcv.sb_sel.si_note, SOCKBUF_MTX(&so->so_rcv),
-	    NULL, NULL, NULL);
-	knlist_init(&so->so_snd.sb_sel.si_note, SOCKBUF_MTX(&so->so_snd),
-	    NULL, NULL, NULL);
-	if (soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat) ||
-	    (*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) {
-		sodealloc(so);
-		return (NULL);
-	}
-	so->so_rcv.sb_lowat = head->so_rcv.sb_lowat;
-	so->so_snd.sb_lowat = head->so_snd.sb_lowat;
-	so->so_rcv.sb_timeo = head->so_rcv.sb_timeo;
- 	so->so_snd.sb_timeo = head->so_snd.sb_timeo;
-	so->so_rcv.sb_flags |= head->so_rcv.sb_flags & SB_AUTOSIZE;
-	so->so_snd.sb_flags |= head->so_snd.sb_flags & SB_AUTOSIZE;
-	so->so_state |= connstatus;
-	ACCEPT_LOCK();
-	if (connstatus) {
-		TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
-		so->so_qstate |= SQ_COMP;
-		head->so_qlen++;
-	} else {
-		/*
-		 * Keep removing sockets from the head until there's room for
-		 * us to insert on the tail.  In pre-locking revisions, this
-		 * was a simple if(), but as we could be racing with other
-		 * threads and soabort() requires dropping locks, we must
-		 * loop waiting for the condition to be true.
-		 */
-		while (head->so_incqlen > head->so_qlimit) {
-			struct socket *sp;
-			sp = TAILQ_FIRST(&head->so_incomp);
-			TAILQ_REMOVE(&head->so_incomp, sp, so_list);
-			head->so_incqlen--;
-			sp->so_qstate &= ~SQ_INCOMP;
-			sp->so_head = NULL;
-			ACCEPT_UNLOCK();
-			(void) soabort(sp);
-			ACCEPT_LOCK();
-		}
-		TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
-		so->so_qstate |= SQ_INCOMP;
-		head->so_incqlen++;
-	}
-	ACCEPT_UNLOCK();
-	if (connstatus) {
-		sorwakeup(head);
-		wakeup_one(&head->so_timeo);
-	}
-	return (so);
-}
-
-/*
- * Socantsendmore indicates that no more data will be sent on the
- * socket; it would normally be applied to a socket when the user
- * informs the system that no more data is to be sent, by the protocol
- * code (in case PRU_SHUTDOWN).  Socantrcvmore indicates that no more data
- * will be received, and will normally be applied to the socket by a
- * protocol when it detects that the peer will send no more data.
- * Data queued for reading in the socket may yet be read.
- */
-void
-socantsendmore_locked(so)
-	struct socket *so;
-{
-
-	SOCKBUF_LOCK_ASSERT(&so->so_snd);
-
-	so->so_snd.sb_state |= SBS_CANTSENDMORE;
-	sowwakeup_locked(so);
-	mtx_assert(SOCKBUF_MTX(&so->so_snd), MA_NOTOWNED);
-}
-
-void
-socantsendmore(so)
-	struct socket *so;
-{
-
-	SOCKBUF_LOCK(&so->so_snd);
-	socantsendmore_locked(so);
-	mtx_assert(SOCKBUF_MTX(&so->so_snd), MA_NOTOWNED);
-}
-
-void
-socantrcvmore_locked(so)
-	struct socket *so;
-{
-
-	SOCKBUF_LOCK_ASSERT(&so->so_rcv);
-
-	so->so_rcv.sb_state |= SBS_CANTRCVMORE;
-	sorwakeup_locked(so);
-	mtx_assert(SOCKBUF_MTX(&so->so_rcv), MA_NOTOWNED);
-}
-
-void
-socantrcvmore(so)
-	struct socket *so;
-{
-
-	SOCKBUF_LOCK(&so->so_rcv);
-	socantrcvmore_locked(so);
-	mtx_assert(SOCKBUF_MTX(&so->so_rcv), MA_NOTOWNED);
-}
-
-/*
- * Wait for data to arrive at/drain from a socket buffer.
- */
-int
-sbwait(sb)
-	struct sockbuf *sb;
-{
-
-	SOCKBUF_LOCK_ASSERT(sb);
-
-	sb->sb_flags |= SB_WAIT;
-	return (msleep(&sb->sb_cc, &sb->sb_mtx,
-	    (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sbwait",
-	    sb->sb_timeo));
-}
-
-/*
- * Lock a sockbuf already known to be locked;
- * return any error returned from sleep (EINTR).
- */
-int
-sb_lock(sb)
-	register struct sockbuf *sb;
-{
-	int error;
-
-	SOCKBUF_LOCK_ASSERT(sb);
-
-	while (sb->sb_flags & SB_LOCK) {
-		sb->sb_flags |= SB_WANT;
-		error = msleep(&sb->sb_flags, &sb->sb_mtx,
-		    (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK|PCATCH,
-		    "sblock", 0);
-		if (error)
-			return (error);
-	}
-	sb->sb_flags |= SB_LOCK;
-	return (0);
-}
-
-/*
- * Wakeup processes waiting on a socket buffer.  Do asynchronous
- * notification via SIGIO if the socket has the SS_ASYNC flag set.
- *
- * Called with the socket buffer lock held; will release the lock by the end
- * of the function.  This allows the caller to acquire the socket buffer lock
- * while testing for the need for various sorts of wakeup and hold it through
- * to the point where it's no longer required.  We currently hold the lock
- * through calls out to other subsystems (with the exception of kqueue), and
- * then release it to avoid lock order issues.  It's not clear that's
- * correct.
- */
-void
-sowakeup(so, sb)
-	register struct socket *so;
-	register struct sockbuf *sb;
-{
-
-	SOCKBUF_LOCK_ASSERT(sb);
-
-	selwakeuppri(&sb->sb_sel, PSOCK);
-	sb->sb_flags &= ~SB_SEL;
-	if (sb->sb_flags & SB_WAIT) {
-		sb->sb_flags &= ~SB_WAIT;
-		wakeup(&sb->sb_cc);
-	}
-	KNOTE_LOCKED(&sb->sb_sel.si_note, 0);
-	SOCKBUF_UNLOCK(sb);
-	if ((so->so_state & SS_ASYNC) && so->so_sigio != NULL)
-		pgsigio(&so->so_sigio, SIGIO, 0);
-	if (sb->sb_flags & SB_UPCALL)
-		(*so->so_upcall)(so, so->so_upcallarg, M_DONTWAIT);
-	if (sb->sb_flags & SB_AIO)
-		aio_swake(so, sb);
-	mtx_assert(SOCKBUF_MTX(sb), MA_NOTOWNED);
-}
-
-/*
- * Socket buffer (struct sockbuf) utility routines.
- *
- * Each socket contains two socket buffers: one for sending data and
- * one for receiving data.  Each buffer contains a queue of mbufs,
- * information about the number of mbufs and amount of data in the
- * queue, and other fields allowing select() statements and notification
- * on data availability to be implemented.
- *
- * Data stored in a socket buffer is maintained as a list of records.
- * Each record is a list of mbufs chained together with the m_next
- * field.  Records are chained together with the m_nextpkt field. The upper
- * level routine soreceive() expects the following conventions to be
- * observed when placing information in the receive buffer:
- *
- * 1. If the protocol requires each message be preceded by the sender's
- *    name, then a record containing that name must be present before
- *    any associated data (mbuf's must be of type MT_SONAME).
- * 2. If the protocol supports the exchange of ``access rights'' (really
- *    just additional data associated with the message), and there are
- *    ``rights'' to be received, then a record containing this data
- *    should be present (mbuf's must be of type MT_RIGHTS).
- * 3. If a name or rights record exists, then it must be followed by
- *    a data record, perhaps of zero length.
- *
- * Before using a new socket structure it is first necessary to reserve
- * buffer space to the socket, by calling sbreserve().  This should commit
- * some of the available buffer space in the system buffer pool for the
- * socket (currently, it does nothing but enforce limits).  The space
- * should be released by calling sbrelease() when the socket is destroyed.
- */
-
-int
-soreserve(so, sndcc, rcvcc)
-	register struct socket *so;
-	u_long sndcc, rcvcc;
-{
-	struct thread *td = curthread;
-
-	SOCKBUF_LOCK(&so->so_snd);
-	SOCKBUF_LOCK(&so->so_rcv);
-	if (sbreserve_locked(&so->so_snd, sndcc, so, td) == 0)
-		goto bad;
-	if (sbreserve_locked(&so->so_rcv, rcvcc, so, td) == 0)
-		goto bad2;
-	if (so->so_rcv.sb_lowat == 0)
-		so->so_rcv.sb_lowat = 1;
-	if (so->so_snd.sb_lowat == 0)
-		so->so_snd.sb_lowat = MCLBYTES;
-	if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
-		so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
-	SOCKBUF_UNLOCK(&so->so_rcv);
-	SOCKBUF_UNLOCK(&so->so_snd);
-	return (0);
-bad2:
-	sbrelease_locked(&so->so_snd, so);
-bad:
-	SOCKBUF_UNLOCK(&so->so_rcv);
-	SOCKBUF_UNLOCK(&so->so_snd);
-	return (ENOBUFS);
-}
-
-static int
-sysctl_handle_sb_max(SYSCTL_HANDLER_ARGS)
-{
-	int error = 0;
-	u_long old_sb_max = sb_max;
-
-	error = SYSCTL_OUT(req, arg1, sizeof(u_long));
-	if (error || !req->newptr)
-		return (error);
-	error = SYSCTL_IN(req, arg1, sizeof(u_long));
-	if (error)
-		return (error);
-	if (sb_max < MSIZE + MCLBYTES) {
-		sb_max = old_sb_max;
-		return (EINVAL);
-	}
-	sb_max_adj = (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES);
-	return (0);
-}
-	
-/*
- * Allot mbufs to a sockbuf.
- * Attempt to scale mbmax so that mbcnt doesn't become limiting
- * if buffering efficiency is near the normal case.
- */
-int
-sbreserve_locked(sb, cc, so, td)
-	struct sockbuf *sb;
-	u_long cc;
-	struct socket *so;
-	struct thread *td;
-{
-	rlim_t sbsize_limit;
-
-	SOCKBUF_LOCK_ASSERT(sb);
-
-	/*
-	 * td will only be NULL when we're in an interrupt
-	 * (e.g. in tcp_input())
-	 */
-	if (cc > sb_max_adj)
-		return (0);
-	if (td != NULL) {
-		PROC_LOCK(td->td_proc);
-		sbsize_limit = lim_cur(td->td_proc, RLIMIT_SBSIZE);
-		PROC_UNLOCK(td->td_proc);
-	} else
-		sbsize_limit = RLIM_INFINITY;
-	if (!chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, cc,
-	    sbsize_limit))
-		return (0);
-	sb->sb_mbmax = min(cc * sb_efficiency, sb_max);
-	if (sb->sb_lowat > sb->sb_hiwat)
-		sb->sb_lowat = sb->sb_hiwat;
-	return (1);
-}
-
-int
-sbreserve(sb, cc, so, td)
-	struct sockbuf *sb;
-	u_long cc;
-	struct socket *so;
-	struct thread *td;
-{
-	int error;
-
-	SOCKBUF_LOCK(sb);
-	error = sbreserve_locked(sb, cc, so, td);
-	SOCKBUF_UNLOCK(sb);
-	return (error);
-}
-
-/*
- * Free mbufs held by a socket, and reserved mbuf space.
- */
-void
-sbrelease_locked(sb, so)
-	struct sockbuf *sb;
-	struct socket *so;
-{
-
-	SOCKBUF_LOCK_ASSERT(sb);
-
-	sbflush_locked(sb);
-	(void)chgsbsize(so->so_cred->cr_uidinfo, &sb->sb_hiwat, 0,
-	    RLIM_INFINITY);
-	sb->sb_mbmax = 0;
-}
-
-void
-sbrelease(sb, so)
-	struct sockbuf *sb;
-	struct socket *so;
-{
-
-	SOCKBUF_LOCK(sb);
-	sbrelease_locked(sb, so);
-	SOCKBUF_UNLOCK(sb);
-}
-/*
- * Routines to add and remove
- * data from an mbuf queue.
- *
- * The routines sbappend() or sbappendrecord() are normally called to
- * append new mbufs to a socket buffer, after checking that adequate
- * space is available, comparing the function sbspace() with the amount
- * of data to be added.  sbappendrecord() differs from sbappend() in
- * that data supplied is treated as the beginning of a new record.
- * To place a sender's address, optional access rights, and data in a
- * socket receive buffer, sbappendaddr() should be used.  To place
- * access rights and data in a socket receive buffer, sbappendrights()
- * should be used.  In either case, the new data begins a new record.
- * Note that unlike sbappend() and sbappendrecord(), these routines check
- * for the caller that there will be enough space to store the data.
- * Each fails if there is not enough space, or if it cannot find mbufs
- * to store additional information in.
- *
- * Reliable protocols may use the socket send buffer to hold data
- * awaiting acknowledgement.  Data is normally copied from a socket
- * send buffer in a protocol with m_copy for output to a peer,
- * and then removing the data from the socket buffer with sbdrop()
- * or sbdroprecord() when the data is acknowledged by the peer.
- */
-
-#ifdef SOCKBUF_DEBUG
-void
-sblastrecordchk(struct sockbuf *sb, const char *file, int line)
-{
-	struct mbuf *m = sb->sb_mb;
-
-	SOCKBUF_LOCK_ASSERT(sb);
-
-	while (m && m->m_nextpkt)
-		m = m->m_nextpkt;
-
-	if (m != sb->sb_lastrecord) {
-		printf("%s: sb_mb %p sb_lastrecord %p last %p\n",
-			__func__, sb->sb_mb, sb->sb_lastrecord, m);
-		printf("packet chain:\n");
-		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt)
-			printf("\t%p\n", m);
-		panic("%s from %s:%u", __func__, file, line);
-	}
-}
-
-void
-sblastmbufchk(struct sockbuf *sb, const char *file, int line)
-{
-	struct mbuf *m = sb->sb_mb;
-	struct mbuf *n;
-
-	SOCKBUF_LOCK_ASSERT(sb);
-
-	while (m && m->m_nextpkt)
-		m = m->m_nextpkt;
-
-	while (m && m->m_next)
-		m = m->m_next;
-
-	if (m != sb->sb_mbtail) {
-		printf("%s: sb_mb %p sb_mbtail %p last %p\n",
-			__func__, sb->sb_mb, sb->sb_mbtail, m);
-		printf("packet tree:\n");
-		for (m = sb->sb_mb; m != NULL; m = m->m_nextpkt) {
-			printf("\t");
-			for (n = m; n != NULL; n = n->m_next)
-				printf("%p ", n);
-			printf("\n");
-		}
-		panic("%s from %s:%u", __func__, file, line);
-	}
-}
-#endif /* SOCKBUF_DEBUG */
-
-#define SBLINKRECORD(sb, m0) do {					\
-	SOCKBUF_LOCK_ASSERT(sb);					\
-	if ((sb)->sb_lastrecord != NULL)				\
-		(sb)->sb_lastrecord->m_nextpkt = (m0);			\
-	else								\
-		(sb)->sb_mb = (m0);					\
-	(sb)->sb_lastrecord = (m0);					\
-} while (/*CONSTCOND*/0)
-
-/*
- * Append mbuf chain m to the last record in the
- * socket buffer sb.  The additional space associated with
- * the mbuf chain is recorded in sb.  Empty mbufs are
- * discarded and mbufs are compacted where possible.
- */
-void
-sbappend_locked(sb, m)
-	struct sockbuf *sb;
-	struct mbuf *m;
-{
-	register struct mbuf *n;
-
-	SOCKBUF_LOCK_ASSERT(sb);
-
-	if (m == 0)
-		return;
-
-	SBLASTRECORDCHK(sb);
-	n = sb->sb_mb;
-	if (n) {
-		while (n->m_nextpkt)
-			n = n->m_nextpkt;
-		do {
-			if (n->m_flags & M_EOR) {
-				sbappendrecord_locked(sb, m); /* XXXXXX!!!! */
-				return;
-			}
-		} while (n->m_next && (n = n->m_next));
-	} else {
-		/*
-		 * XXX Would like to simply use sb_mbtail here, but
-		 * XXX I need to verify that I won't miss an EOR that
-		 * XXX way.
-		 */
-		if ((n = sb->sb_lastrecord) != NULL) {
-			do {
-				if (n->m_flags & M_EOR) {
-					sbappendrecord_locked(sb, m); /* XXXXXX!!!! */
-					return;
-				}
-			} while (n->m_next && (n = n->m_next));
-		} else {
-			/*
-			 * If this is the first record in the socket buffer,
-			 * it's also the last record.
-			 */
-			sb->sb_lastrecord = m;
-		}
-	}
-	sbcompress(sb, m, n);
-	SBLASTRECORDCHK(sb);
-}
-
-/*
- * Append mbuf chain m to the last record in the
- * socket buffer sb.  The additional space associated with
- * the mbuf chain is recorded in sb.  Empty mbufs are
- * discarded and mbufs are compacted where possible.
- */
-void
-sbappend(sb, m)
-	struct sockbuf *sb;
-	struct mbuf *m;
-{
-
-	SOCKBUF_LOCK(sb);
-	sbappend_locked(sb, m);
-	SOCKBUF_UNLOCK(sb);
-}
-
-/*
- * This version of sbappend() should only be used when the caller
- * absolutely knows that there will never be more than one record
- * in the socket buffer, that is, a stream protocol (such as TCP).
- */
-void
-sbappendstream_locked(struct sockbuf *sb, struct mbuf *m)
-{
-	SOCKBUF_LOCK_ASSERT(sb);
-
-	KASSERT(m->m_nextpkt == NULL,("sbappendstream 0"));
-	KASSERT(sb->sb_mb == sb->sb_lastrecord,("sbappendstream 1"));
-
-	SBLASTMBUFCHK(sb);
-
-	sbcompress(sb, m, sb->sb_mbtail);
-
-	sb->sb_lastrecord = sb->sb_mb;
-	SBLASTRECORDCHK(sb);
-}
-
-/*
- * This version of sbappend() should only be used when the caller
- * absolutely knows that there will never be more than one record
- * in the socket buffer, that is, a stream protocol (such as TCP).
- */
-void
-sbappendstream(struct sockbuf *sb, struct mbuf *m)
-{
-
-	SOCKBUF_LOCK(sb);
-	sbappendstream_locked(sb, m);
-	SOCKBUF_UNLOCK(sb);
-}
-
-#ifdef SOCKBUF_DEBUG
-void
-sbcheck(sb)
-	struct sockbuf *sb;
-{
-	struct mbuf *m;
-	struct mbuf *n = 0;
-	u_long len = 0, mbcnt = 0;
-
-	SOCKBUF_LOCK_ASSERT(sb);
-
-	for (m = sb->sb_mb; m; m = n) {
-	    n = m->m_nextpkt;
-	    for (; m; m = m->m_next) {
-		len += m->m_len;
-		mbcnt += MSIZE;
-		if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */
-			mbcnt += m->m_ext.ext_size;
-	    }
-	}
-	if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
-		printf("cc %ld != %u || mbcnt %ld != %u\n", len, sb->sb_cc,
-		    mbcnt, sb->sb_mbcnt);
-		panic("sbcheck");
-	}
-}
-#endif
-
-/*
- * As above, except the mbuf chain
- * begins a new record.
- */
-void
-sbappendrecord_locked(sb, m0)
-	register struct sockbuf *sb;
-	register struct mbuf *m0;
-{
-	register struct mbuf *m;
-
-	SOCKBUF_LOCK_ASSERT(sb);
-
-	if (m0 == 0)
-		return;
-	m = sb->sb_mb;
-	if (m)
-		while (m->m_nextpkt)
-			m = m->m_nextpkt;
-	/*
-	 * Put the first mbuf on the queue.
-	 * Note this permits zero length records.
-	 */
-	sballoc(sb, m0);
-	SBLASTRECORDCHK(sb);
-	SBLINKRECORD(sb, m0);
-	if (m)
-		m->m_nextpkt = m0;
-	else
-		sb->sb_mb = m0;
-	m = m0->m_next;
-	m0->m_next = 0;
-	if (m && (m0->m_flags & M_EOR)) {
-		m0->m_flags &= ~M_EOR;
-		m->m_flags |= M_EOR;
-	}
-	sbcompress(sb, m, m0);
-}
-
-/*
- * As above, except the mbuf chain
- * begins a new record.
- */
-void
-sbappendrecord(sb, m0)
-	register struct sockbuf *sb;
-	register struct mbuf *m0;
-{
-
-	SOCKBUF_LOCK(sb);
-	sbappendrecord_locked(sb, m0);
-	SOCKBUF_UNLOCK(sb);
-}
-
-/*
- * Append address and data, and optionally, control (ancillary) data
- * to the receive queue of a socket.  If present,
- * m0 must include a packet header with total length.
- * Returns 0 if no space in sockbuf or insufficient mbufs.
- */
-int
-sbappendaddr_locked(sb, asa, m0, control)
-	struct sockbuf *sb;
-	const struct sockaddr *asa;
-	struct mbuf *m0, *control;
-{
-	struct mbuf *m, *n, *nlast;
-	int space = asa->sa_len;
-
-	SOCKBUF_LOCK_ASSERT(sb);
-
-	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
-		panic("sbappendaddr_locked");
-	if (m0)
-		space += m0->m_pkthdr.len;
-	space += m_length(control, &n);
-
-	if (space > sbspace(sb))
-		return (0);
-#if MSIZE <= 256
-	if (asa->sa_len > MLEN)
-		return (0);
-#endif
-	MGET(m, M_DONTWAIT, MT_SONAME);
-	if (m == 0)
-		return (0);
-	m->m_len = asa->sa_len;
-	bcopy(asa, mtod(m, caddr_t), asa->sa_len);
-	if (n)
-		n->m_next = m0;		/* concatenate data to control */
-	else
-		control = m0;
-	m->m_next = control;
-	for (n = m; n->m_next != NULL; n = n->m_next)
-		sballoc(sb, n);
-	sballoc(sb, n);
-	nlast = n;
-	SBLINKRECORD(sb, m);
-
-	sb->sb_mbtail = nlast;
-	SBLASTMBUFCHK(sb);
-
-	SBLASTRECORDCHK(sb);
-	return (1);
-}
-
-/*
- * Append address and data, and optionally, control (ancillary) data
- * to the receive queue of a socket.  If present,
- * m0 must include a packet header with total length.
- * Returns 0 if no space in sockbuf or insufficient mbufs.
- */
-int
-sbappendaddr(sb, asa, m0, control)
-	struct sockbuf *sb;
-	const struct sockaddr *asa;
-	struct mbuf *m0, *control;
-{
-	int retval;
-
-	SOCKBUF_LOCK(sb);
-	retval = sbappendaddr_locked(sb, asa, m0, control);
-	SOCKBUF_UNLOCK(sb);
-	return (retval);
-}
-
-int
-sbappendcontrol_locked(sb, m0, control)
-	struct sockbuf *sb;
-	struct mbuf *control, *m0;
-{
-	struct mbuf *m, *n, *mlast;
-	int space;
-
-	SOCKBUF_LOCK_ASSERT(sb);
-
-	if (control == 0)
-		panic("sbappendcontrol_locked");
-	space = m_length(control, &n) + m_length(m0, NULL);
-
-	if (space > sbspace(sb))
-		return (0);
-	n->m_next = m0;			/* concatenate data to control */
-
-	SBLASTRECORDCHK(sb);
-
-	for (m = control; m->m_next; m = m->m_next)
-		sballoc(sb, m);
-	sballoc(sb, m);
-	mlast = m;
-	SBLINKRECORD(sb, control);
-
-	sb->sb_mbtail = mlast;
-	SBLASTMBUFCHK(sb);
-
-	SBLASTRECORDCHK(sb);
-	return (1);
-}
-
-int
-sbappendcontrol(sb, m0, control)
-	struct sockbuf *sb;
-	struct mbuf *control, *m0;
-{
-	int retval;
-
-	SOCKBUF_LOCK(sb);
-	retval = sbappendcontrol_locked(sb, m0, control);
-	SOCKBUF_UNLOCK(sb);
-	return (retval);
-}
-
-/*
- * Append the data in mbuf chain (m) into the socket buffer sb following mbuf
- * (n).  If (n) is NULL, the buffer is presumed empty.
- *
- * When the data is compressed, mbufs in the chain may be handled in one of
- * three ways:
- *
- * (1) The mbuf may simply be dropped, if it contributes nothing (no data, no
- *     record boundary, and no change in data type).
- *
- * (2) The mbuf may be coalesced -- i.e., data in the mbuf may be copied into
- *     an mbuf already in the socket buffer.  This can occur if an
- *     appropriate mbuf exists, there is room, and no merging of data types
- *     will occur.
- *
- * (3) The mbuf may be appended to the end of the existing mbuf chain.
- *
- * If any of the new mbufs is marked as M_EOR, mark the last mbuf appended as
- * end-of-record.
- */
-void
-sbcompress(sb, m, n)
-	register struct sockbuf *sb;
-	register struct mbuf *m, *n;
-{
-	register int eor = 0;
-	register struct mbuf *o;
-
-	SOCKBUF_LOCK_ASSERT(sb);
-
-	while (m) {
-		eor |= m->m_flags & M_EOR;
-		if (m->m_len == 0 &&
-		    (eor == 0 ||
-		     (((o = m->m_next) || (o = n)) &&
-		      o->m_type == m->m_type))) {
-			if (sb->sb_lastrecord == m)
-				sb->sb_lastrecord = m->m_next;
-			m = m_free(m);
-			continue;
-		}
-		if (n && (n->m_flags & M_EOR) == 0 &&
-		    M_WRITABLE(n) &&
-		    m->m_len <= MCLBYTES / 4 && /* XXX: Don't copy too much */
-		    m->m_len <= M_TRAILINGSPACE(n) &&
-		    n->m_type == m->m_type) {
-			bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len,
-			    (unsigned)m->m_len);
-			n->m_len += m->m_len;
-			sb->sb_cc += m->m_len;
-			if (m->m_type != MT_DATA && m->m_type != MT_HEADER &&
-			    m->m_type != MT_OOBDATA)
-				/* XXX: Probably don't need.*/
-				sb->sb_ctl += m->m_len;
-			m = m_free(m);
-			continue;
-		}
-		if (n)
-			n->m_next = m;
-		else
-			sb->sb_mb = m;
-		sb->sb_mbtail = m;
-		sballoc(sb, m);
-		n = m;
-		m->m_flags &= ~M_EOR;
-		m = m->m_next;
-		n->m_next = 0;
-	}
-	if (eor) {
-		KASSERT(n != NULL, ("sbcompress: eor && n == NULL"));
-		n->m_flags |= eor;
-	}
-	SBLASTMBUFCHK(sb);
-}
-
-/*
- * Free all mbufs in a sockbuf.
- * Check that all resources are reclaimed.
- */
-void
-sbflush_locked(sb)
-	register struct sockbuf *sb;
-{
-
-	SOCKBUF_LOCK_ASSERT(sb);
-
-	if (sb->sb_flags & SB_LOCK)
-		panic("sbflush_locked: locked");
-	while (sb->sb_mbcnt) {
-		/*
-		 * Don't call sbdrop(sb, 0) if the leading mbuf is non-empty:
-		 * we would loop forever. Panic instead.
-		 */
-		if (!sb->sb_cc && (sb->sb_mb == NULL || sb->sb_mb->m_len))
-			break;
-		sbdrop_locked(sb, (int)sb->sb_cc);
-	}
-	if (sb->sb_cc || sb->sb_mb || sb->sb_mbcnt)
-		panic("sbflush_locked: cc %u || mb %p || mbcnt %u", sb->sb_cc, (void *)sb->sb_mb, sb->sb_mbcnt);
-}
-
-void
-sbflush(sb)
-	register struct sockbuf *sb;
-{
-
-	SOCKBUF_LOCK(sb);
-	sbflush_locked(sb);
-	SOCKBUF_UNLOCK(sb);
-}
-
-/*
- * Drop data from (the front of) a sockbuf.
- */
-void
-sbdrop_locked(sb, len)
-	register struct sockbuf *sb;
-	register int len;
-{
-	register struct mbuf *m;
-	struct mbuf *next;
-
-	SOCKBUF_LOCK_ASSERT(sb);
-
-	next = (m = sb->sb_mb) ? m->m_nextpkt : 0;
-	while (len > 0) {
-		if (m == 0) {
-			if (next == 0)
-				panic("sbdrop");
-			m = next;
-			next = m->m_nextpkt;
-			continue;
-		}
-		if (m->m_len > len) {
-			m->m_len -= len;
-			m->m_data += len;
-			sb->sb_cc -= len;
-			if (m->m_type != MT_DATA && m->m_type != MT_HEADER &&
-			    m->m_type != MT_OOBDATA)
-				sb->sb_ctl -= len;
-			break;
-		}
-		len -= m->m_len;
-		sbfree(sb, m);
-		m = m_free(m);
-	}
-	while (m && m->m_len == 0) {
-		sbfree(sb, m);
-		m = m_free(m);
-	}
-	if (m) {
-		sb->sb_mb = m;
-		m->m_nextpkt = next;
-	} else
-		sb->sb_mb = next;
-	/*
-	 * First part is an inline SB_EMPTY_FIXUP().  Second part
-	 * makes sure sb_lastrecord is up-to-date if we dropped
-	 * part of the last record.
-	 */
-	m = sb->sb_mb;
-	if (m == NULL) {
-		sb->sb_mbtail = NULL;
-		sb->sb_lastrecord = NULL;
-	} else if (m->m_nextpkt == NULL) {
-		sb->sb_lastrecord = m;
-	}
-}
-
-/*
- * Drop data from (the front of) a sockbuf.
- */
-void
-sbdrop(sb, len)
-	register struct sockbuf *sb;
-	register int len;
-{
-
-	SOCKBUF_LOCK(sb);
-	sbdrop_locked(sb, len);
-	SOCKBUF_UNLOCK(sb);
-}
-
-/*
- * Drop a record off the front of a sockbuf
- * and move the next record to the front.
- */
-void
-sbdroprecord_locked(sb)
-	register struct sockbuf *sb;
-{
-	register struct mbuf *m;
-
-	SOCKBUF_LOCK_ASSERT(sb);
-
-	m = sb->sb_mb;
-	if (m) {
-		sb->sb_mb = m->m_nextpkt;
-		do {
-			sbfree(sb, m);
-			m = m_free(m);
-		} while (m);
-	}
-	SB_EMPTY_FIXUP(sb);
-}
-
-/*
- * Drop a record off the front of a sockbuf
- * and move the next record to the front.
- */
-void
-sbdroprecord(sb)
-	register struct sockbuf *sb;
-{
-
-	SOCKBUF_LOCK(sb);
-	sbdroprecord_locked(sb);
-	SOCKBUF_UNLOCK(sb);
-}
-
-/*
- * Create a "control" mbuf containing the specified data
- * with the specified type for presentation on a socket buffer.
- */
-struct mbuf *
-sbcreatecontrol(p, size, type, level)
-	caddr_t p;
-	register int size;
-	int type, level;
-{
-	register struct cmsghdr *cp;
-	struct mbuf *m;
-
-	if (CMSG_SPACE((u_int)size) > MCLBYTES)
-		return ((struct mbuf *) NULL);
-	if (CMSG_SPACE((u_int)size) > MLEN)
-		m = m_getcl(M_DONTWAIT, MT_CONTROL, 0);
-	else
-		m = m_get(M_DONTWAIT, MT_CONTROL);
-	if (m == NULL)
-		return ((struct mbuf *) NULL);
-	cp = mtod(m, struct cmsghdr *);
-	m->m_len = 0;
-	KASSERT(CMSG_SPACE((u_int)size) <= M_TRAILINGSPACE(m),
-	    ("sbcreatecontrol: short mbuf"));
-	if (p != NULL)
-		(void)memcpy(CMSG_DATA(cp), p, size);
-	m->m_len = CMSG_SPACE(size);
-	cp->cmsg_len = CMSG_LEN(size);
-	cp->cmsg_level = level;
-	cp->cmsg_type = type;
-	return (m);
-}
-
-/*
- * Some routines that return EOPNOTSUPP for entry points that are not
- * supported by a protocol.  Fill in as needed.
- */
-int
-pru_abort_notsupp(struct socket *so)
-{
-	return EOPNOTSUPP;
-}
-
-int
-pru_accept_notsupp(struct socket *so, struct sockaddr **nam)
-{
-	return EOPNOTSUPP;
-}
-
-int
-pru_attach_notsupp(struct socket *so, int proto, struct thread *td)
-{
-	return EOPNOTSUPP;
-}
-
-int
-pru_bind_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
-{
-	return EOPNOTSUPP;
-}
-
-int
-pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct thread *td)
-{
-	return EOPNOTSUPP;
-}
-
-int
-pru_connect2_notsupp(struct socket *so1, struct socket *so2)
-{
-	return EOPNOTSUPP;
-}
-
-int
-pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data,
-	struct ifnet *ifp, struct thread *td)
-{
-	return EOPNOTSUPP;
-}
-
-int
-pru_detach_notsupp(struct socket *so)
-{
-	return EOPNOTSUPP;
-}
-
-int
-pru_disconnect_notsupp(struct socket *so)
-{
-	return EOPNOTSUPP;
-}
-
-int
-pru_listen_notsupp(struct socket *so, struct thread *td)
-{
-	return EOPNOTSUPP;
-}
-
-int
-pru_peeraddr_notsupp(struct socket *so, struct sockaddr **nam)
-{
-	return EOPNOTSUPP;
-}
-
-int
-pru_rcvd_notsupp(struct socket *so, int flags)
-{
-	return EOPNOTSUPP;
-}
-
-int
-pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
-{
-	return EOPNOTSUPP;
-}
-
-int
-pru_send_notsupp(struct socket *so, int flags, struct mbuf *m,
-	struct sockaddr *addr, struct mbuf *control, struct thread *td)
-{
-	return EOPNOTSUPP;
-}
-
-/*
- * This isn't really a ``null'' operation, but it's the default one
- * and doesn't do anything destructive.
- */
-int
-pru_sense_null(struct socket *so, struct stat *sb)
-{
-	sb->st_blksize = so->so_snd.sb_hiwat;
-	return 0;
-}
-
-int
-pru_shutdown_notsupp(struct socket *so)
-{
-	return EOPNOTSUPP;
-}
-
-int
-pru_sockaddr_notsupp(struct socket *so, struct sockaddr **nam)
-{
-	return EOPNOTSUPP;
-}
-
-int
-pru_sosend_notsupp(struct socket *so, struct sockaddr *addr, struct uio *uio,
-	struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
-{
-	return EOPNOTSUPP;
-}
-
-int
-pru_soreceive_notsupp(struct socket *so, struct sockaddr **paddr,
-	struct uio *uio, struct mbuf **mp0, struct mbuf **controlp,
-	int *flagsp)
-{
-	return EOPNOTSUPP;
-}
-
-int
-pru_sopoll_notsupp(struct socket *so, int events, struct ucred *cred,
-	struct thread *td)
-{
-	return EOPNOTSUPP;
-}
-
-/*
- * For protocol types that don't keep cached copies of labels in their
- * pcbs, provide a null sosetlabel that does a NOOP.
- */
-void
-pru_sosetlabel_null(struct socket *so)
-{
-
-}
-
-/*
- * Make a copy of a sockaddr in a malloced buffer of type M_SONAME.
- */
-struct sockaddr *
-sodupsockaddr(const struct sockaddr *sa, int mflags)
-{
-	struct sockaddr *sa2;
-
-	sa2 = malloc(sa->sa_len, M_SONAME, mflags);
-	if (sa2)
-		bcopy(sa, sa2, sa->sa_len);
-	return sa2;
-}
-
-/*
- * Create an external-format (``xsocket'') structure using the information
- * in the kernel-format socket structure pointed to by so.  This is done
- * to reduce the spew of irrelevant information over this interface,
- * to isolate user code from changes in the kernel structure, and
- * potentially to provide information-hiding if we decide that
- * some of this information should be hidden from users.
- */
-void
-sotoxsocket(struct socket *so, struct xsocket *xso)
-{
-	xso->xso_len = sizeof *xso;
-	xso->xso_so = so;
-	xso->so_type = so->so_type;
-	xso->so_options = so->so_options;
-	xso->so_linger = so->so_linger;
-	xso->so_state = so->so_state;
-	xso->so_pcb = so->so_pcb;
-	xso->xso_protocol = so->so_proto->pr_protocol;
-	xso->xso_family = so->so_proto->pr_domain->dom_family;
-	xso->so_qlen = so->so_qlen;
-	xso->so_incqlen = so->so_incqlen;
-	xso->so_qlimit = so->so_qlimit;
-	xso->so_timeo = so->so_timeo;
-	xso->so_error = so->so_error;
-	xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0;
-	xso->so_oobmark = so->so_oobmark;
-	sbtoxsockbuf(&so->so_snd, &xso->so_snd);
-	sbtoxsockbuf(&so->so_rcv, &xso->so_rcv);
-	xso->so_uid = so->so_cred->cr_uid;
-}
-
-/*
- * This does the same for sockbufs.  Note that the xsockbuf structure,
- * since it is always embedded in a socket, does not include a self
- * pointer nor a length.  We make this entry point public in case
- * some other mechanism needs it.
- */
-void
-sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb)
-{
-	xsb->sb_cc = sb->sb_cc;
-	xsb->sb_hiwat = sb->sb_hiwat;
-	xsb->sb_mbcnt = sb->sb_mbcnt;
-	xsb->sb_mbmax = sb->sb_mbmax;
-	xsb->sb_lowat = sb->sb_lowat;
-	xsb->sb_flags = sb->sb_flags;
-	xsb->sb_timeo = sb->sb_timeo;
-}
-
-/*
- * Here is the definition of some of the basic objects in the kern.ipc
- * branch of the MIB.
- */
-SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");
-
-/* This takes the place of kern.maxsockbuf, which moved to kern.ipc. */
-static int dummy;
-SYSCTL_INT(_kern, KERN_DUMMY, dummy, CTLFLAG_RW, &dummy, 0, "");
-SYSCTL_OID(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf, CTLTYPE_ULONG|CTLFLAG_RW, 
-    &sb_max, 0, sysctl_handle_sb_max, "LU", "Maximum socket buffer size");
-SYSCTL_INT(_kern_ipc, OID_AUTO, maxsockets, CTLFLAG_RDTUN, 
-    &maxsockets, 0, "Maximum number of sockets available");
-SYSCTL_ULONG(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW,
-    &sb_efficiency, 0, "");
-
-/*
- * Initialise maxsockets 
- */
-static void init_maxsockets(void *ignored)
-{
-	TUNABLE_INT_FETCH("kern.ipc.maxsockets", &maxsockets);
-	maxsockets = imax(maxsockets, imax(maxfiles, nmbclusters));
-}
-SYSINIT(param, SI_SUB_TUNABLES, SI_ORDER_ANY, init_maxsockets, NULL);
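
uipc_socket2.c is removed here; its socket-buffer and protocol-stub routines were not dropped upstream but reorganized into other files (largely uipc_sockbuf.c in FreeBSD). The sbreserve()/sb_max logic quoted above is what ultimately bounds userland SO_SNDBUF/SO_RCVBUF requests. A small userspace illustration of that limit (FreeBSD sysctl name assumed, error handling trimmed):

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	unsigned long maxbuf = 0;
	size_t len = sizeof(maxbuf);
	int s, rcvbuf = 4 * 1024 * 1024;
	socklen_t optlen = sizeof(rcvbuf);

	/* kern.ipc.maxsockbuf is the sysctl backed by sb_max above. */
	sysctlbyname("kern.ipc.maxsockbuf", &maxbuf, &len, NULL, 0);
	s = socket(AF_INET, SOCK_DGRAM, 0);
	/*
	 * sbreserve_locked() refuses requests above the adjusted sb_max,
	 * so an oversized SO_RCVBUF request fails with ENOBUFS instead of
	 * being silently granted.
	 */
	if (setsockopt(s, SOL_SOCKET, SO_RCVBUF, &rcvbuf, optlen) == -1)
		perror("setsockopt");
	getsockopt(s, SOL_SOCKET, SO_RCVBUF, &rcvbuf, &optlen);
	printf("kern.ipc.maxsockbuf=%lu, effective SO_RCVBUF=%d\n",
	    maxbuf, rcvbuf);
	return (0);
}
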
Index: kern_mtxpool.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_mtxpool.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/kern_mtxpool.c -L sys/kern/kern_mtxpool.c -u -r1.1.1.1 -r1.2
--- sys/kern/kern_mtxpool.c
+++ sys/kern/kern_mtxpool.c
@@ -24,7 +24,7 @@
  */
 
 /* Mutex pool routines.  These routines are designed to be used as short
- * term leaf mutexes (e.g. the last mutex you might aquire other then
+ * term leaf mutexes (e.g. the last mutex you might acquire other than
  * calling msleep()).  They operate using a shared pool.  A mutex is chosen
  * from the pool based on the supplied pointer (which may or may not be
  * valid).
@@ -44,7 +44,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_mtxpool.c,v 1.11 2005/02/10 12:02:37 phk Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_mtxpool.c,v 1.12 2007/05/27 20:50:23 rwatson Exp $");
 
 #include <sys/param.h>
 #include <sys/proc.h>
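
The kern_mtxpool.c change is only a comment spelling fix and an $FreeBSD$ bump. As a reminder of the idiom that comment describes, a short sketch of mtx_pool(9) use, assuming the usual mtx_pool_lock()/mtx_pool_unlock() helpers and the shared mtxpool_sleep pool (obj and its OBJ_DIRTY flag are hypothetical):

	/*
	 * The object's address selects a mutex from the shared pool, so
	 * no per-object mutex ever has to be initialized or destroyed.
	 * Suitable only for short, leaf-level critical sections.
	 */
	mtx_pool_lock(mtxpool_sleep, obj);
	obj->flags |= OBJ_DIRTY;
	mtx_pool_unlock(mtxpool_sleep, obj);
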
Index: kern_synch.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_synch.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/kern_synch.c -L sys/kern/kern_synch.c -u -r1.2 -r1.3
--- sys/kern/kern_synch.c
+++ sys/kern/kern_synch.c
@@ -35,7 +35,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: /repoman/r/ncvs/src/sys/kern/kern_synch.c,v 1.270.2.6 2006/07/06 08:32:50 glebius Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_synch.c,v 1.302 2007/10/08 23:40:40 jeff Exp $");
 
 #include "opt_ktrace.h"
 
@@ -69,6 +69,7 @@
 
 int	hogticks;
 int	lbolt;
+static int pause_wchan;
 
 static struct callout loadav_callout;
 static struct callout lbolt_callout;
@@ -101,8 +102,8 @@
 }
 
 /*
- * General sleep call.  Suspends the current process until a wakeup is
- * performed on the specified identifier.  The process will then be made
+ * General sleep call.  Suspends the current thread until a wakeup is
+ * performed on the specified identifier.  The thread will then be made
  * runnable with the specified priority.  Sleeps at most timo/hz seconds
  * (0 means no timeout).  If pri includes PCATCH flag, signals are checked
  * before and after sleeping, else signals are not checked.  Returns 0 if
@@ -111,21 +112,22 @@
  * call should be restarted if possible, and EINTR is returned if the system
  * call should be interrupted by the signal (return EINTR).
  *
- * The mutex argument is exited before the caller is suspended, and
- * entered before msleep returns.  If priority includes the PDROP
- * flag the mutex is not entered before returning.
+ * The lock argument is unlocked before the caller is suspended, and
+ * re-locked before _sleep() returns.  If priority includes the PDROP
+ * flag the lock is not re-locked before returning.
  */
 int
-msleep(ident, mtx, priority, wmesg, timo)
+_sleep(ident, lock, priority, wmesg, timo)
 	void *ident;
-	struct mtx *mtx;
+	struct lock_object *lock;
 	int priority, timo;
 	const char *wmesg;
 {
 	struct thread *td;
 	struct proc *p;
-	int catch, rval, flags;
-	WITNESS_SAVE_DECL(mtx);
+	struct lock_class *class;
+	int catch, flags, lock_state, pri, rval;
+	WITNESS_SAVE_DECL(lock_witness);
 
 	td = curthread;
 	p = td->td_proc;
@@ -133,12 +135,16 @@
 	if (KTRPOINT(td, KTR_CSW))
 		ktrcsw(1, 0);
 #endif
-	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, mtx == NULL ? NULL :
-	    &mtx->mtx_object, "Sleeping on \"%s\"", wmesg);
-	KASSERT(timo != 0 || mtx_owned(&Giant) || mtx != NULL,
-	    ("sleeping without a mutex"));
+	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock,
+	    "Sleeping on \"%s\"", wmesg);
+	KASSERT(timo != 0 || mtx_owned(&Giant) || lock != NULL ||
+	    ident == &lbolt, ("sleeping without a lock"));
 	KASSERT(p != NULL, ("msleep1"));
 	KASSERT(ident != NULL && TD_IS_RUNNING(td), ("msleep"));
+	if (lock != NULL)
+		class = LOCK_CLASS(lock);
+	else
+		class = NULL;
 
 	if (cold) {
 		/*
@@ -149,8 +155,8 @@
 		 * splx(s);" to give interrupts a chance, but there is
 		 * no way to give interrupts a chance now.
 		 */
-		if (mtx != NULL && priority & PDROP)
-			mtx_unlock(mtx);
+		if (lock != NULL && priority & PDROP)
+			class->lc_unlock(lock);
 		return (0);
 	}
 	catch = priority & PCATCH;
@@ -164,20 +170,24 @@
 	if (TD_ON_SLEEPQ(td))
 		sleepq_remove(td, td->td_wchan);
 
-	flags = SLEEPQ_MSLEEP;
+	if (ident == &pause_wchan)
+		flags = SLEEPQ_PAUSE;
+	else
+		flags = SLEEPQ_SLEEP;
 	if (catch)
 		flags |= SLEEPQ_INTERRUPTIBLE;
 
 	sleepq_lock(ident);
-	CTR5(KTR_PROC, "msleep: thread %p (pid %ld, %s) on %s (%p)",
-	    (void *)td, (long)p->p_pid, p->p_comm, wmesg, ident);
+	CTR5(KTR_PROC, "sleep: thread %ld (pid %ld, %s) on %s (%p)",
+	    td->td_tid, p->p_pid, p->p_comm, wmesg, ident);
 
 	DROP_GIANT();
-	if (mtx != NULL) {
-		mtx_assert(mtx, MA_OWNED | MA_NOTRECURSED);
-		WITNESS_SAVE(&mtx->mtx_object, mtx);
-		mtx_unlock(mtx);
-	}
+	if (lock != NULL && !(class->lc_flags & LC_SLEEPABLE)) {
+		WITNESS_SAVE(lock, lock_witness);
+		lock_state = class->lc_unlock(lock);
+	} else
+		/* GCC needs to follow the Yellow Brick Road */
+		lock_state = -1;
 
 	/*
 	 * We put ourselves on the sleep queue and start our timeout
@@ -188,17 +198,24 @@
 	 * stopped, then td will no longer be on a sleep queue upon
 	 * return from cursig().
 	 */
-	sleepq_add(ident, mtx, wmesg, flags);
+	sleepq_add(ident, ident == &lbolt ? NULL : lock, wmesg, flags, 0);
 	if (timo)
 		sleepq_set_timeout(ident, timo);
+	if (lock != NULL && class->lc_flags & LC_SLEEPABLE) {
+		sleepq_release(ident);
+		WITNESS_SAVE(lock, lock_witness);
+		lock_state = class->lc_unlock(lock);
+		sleepq_lock(ident);
+	}
 
 	/*
-	 * Adjust this thread's priority.
+	 * Adjust this thread's priority, if necessary.
 	 */
-	if ((priority & PRIMASK) != 0) {
-		mtx_lock_spin(&sched_lock);
-		sched_prio(td, priority & PRIMASK);
-		mtx_unlock_spin(&sched_lock);
+	pri = priority & PRIMASK;
+	if (pri != 0 && pri != td->td_priority) {
+		thread_lock(td);
+		sched_prio(td, pri);
+		thread_unlock(td);
 	}
 
 	if (timo && catch)
@@ -216,9 +233,9 @@
 		ktrcsw(0, 0);
 #endif
 	PICKUP_GIANT();
-	if (mtx != NULL && !(priority & PDROP)) {
-		mtx_lock(mtx);
-		WITNESS_RESTORE(&mtx->mtx_object, mtx);
+	if (lock != NULL && !(priority & PDROP)) {
+		class->lc_lock(lock, lock_state);
+		WITNESS_RESTORE(lock, lock_witness);
 	}
 	return (rval);
 }
@@ -254,18 +271,18 @@
 	}
 
 	sleepq_lock(ident);
-	CTR5(KTR_PROC, "msleep_spin: thread %p (pid %ld, %s) on %s (%p)",
-	    (void *)td, (long)p->p_pid, p->p_comm, wmesg, ident);
+	CTR5(KTR_PROC, "msleep_spin: thread %ld (pid %ld, %s) on %s (%p)",
+	    td->td_tid, p->p_pid, p->p_comm, wmesg, ident);
 
 	DROP_GIANT();
 	mtx_assert(mtx, MA_OWNED | MA_NOTRECURSED);
-	WITNESS_SAVE(&mtx->mtx_object, mtx);
+	WITNESS_SAVE(&mtx->lock_object, mtx);
 	mtx_unlock_spin(mtx);
 
 	/*
 	 * We put ourselves on the sleep queue and start our timeout.
 	 */
-	sleepq_add(ident, mtx, wmesg, SLEEPQ_MSLEEP);
+	sleepq_add(ident, &mtx->lock_object, wmesg, SLEEPQ_SLEEP, 0);
 	if (timo)
 		sleepq_set_timeout(ident, timo);
 
@@ -301,11 +318,27 @@
 #endif
 	PICKUP_GIANT();
 	mtx_lock_spin(mtx);
-	WITNESS_RESTORE(&mtx->mtx_object, mtx);
+	WITNESS_RESTORE(&mtx->lock_object, mtx);
 	return (rval);
 }
 
 /*
+ * pause() is like tsleep() except that the intention is to not be
+ * explicitly woken up by another thread.  Instead, the current thread
+ * simply wishes to sleep until the timeout expires.  It is
+ * implemented using a dummy wait channel.
+ */
+int
+pause(wmesg, timo)
+	const char *wmesg;
+	int timo;
+{
+
+	KASSERT(timo != 0, ("pause: timeout required"));
+	return (tsleep(&pause_wchan, 0, wmesg, timo));
+}
+
+/*
  * Make all threads sleeping on the specified identifier runnable.
  */
 void
@@ -314,7 +347,7 @@
 {
 
 	sleepq_lock(ident);
-	sleepq_broadcast(ident, SLEEPQ_MSLEEP, -1);
+	sleepq_broadcast(ident, SLEEPQ_SLEEP, -1, 0);
 }
 
 /*
@@ -328,7 +361,8 @@
 {
 
 	sleepq_lock(ident);
-	sleepq_signal(ident, SLEEPQ_MSLEEP, -1);
+	sleepq_signal(ident, SLEEPQ_SLEEP, -1, 0);
+	sleepq_release(ident);
 }
 
 /*
@@ -337,12 +371,12 @@
 void
 mi_switch(int flags, struct thread *newtd)
 {
-	struct bintime new_switchtime;
+	uint64_t new_switchtime;
 	struct thread *td;
 	struct proc *p;
 
-	mtx_assert(&sched_lock, MA_OWNED | MA_NOTRECURSED);
 	td = curthread;			/* XXX */
+	THREAD_LOCK_ASSERT(td, MA_OWNED | MA_NOTRECURSED);
 	p = td->td_proc;		/* XXX */
 	KASSERT(!TD_ON_RUNQ(td), ("mi_switch: called by old code"));
 #ifdef INVARIANTS
@@ -357,53 +391,33 @@
 	    ("mi_switch: switch must be voluntary or involuntary"));
 	KASSERT(newtd != curthread, ("mi_switch: preempting back to ourself"));
 
-	if (flags & SW_VOL)
-		p->p_stats->p_ru.ru_nvcsw++;
-	else
-		p->p_stats->p_ru.ru_nivcsw++;
-
-	/*
-	 * Compute the amount of time during which the current
-	 * process was running, and add that to its total so far.
-	 */
-	binuptime(&new_switchtime);
-	bintime_add(&p->p_rux.rux_runtime, &new_switchtime);
-	bintime_sub(&p->p_rux.rux_runtime, PCPU_PTR(switchtime));
-
-	td->td_generation++;	/* bump preempt-detect counter */
-
 	/*
 	 * Don't perform context switches from the debugger.
 	 */
 	if (kdb_active) {
-		mtx_unlock_spin(&sched_lock);
+		thread_unlock(td);
 		kdb_backtrace();
 		kdb_reenter();
 		panic("%s: did not reenter debugger", __func__);
 	}
-
-	/*
-	 * Check if the process exceeds its cpu resource allocation.  If
-	 * it reaches the max, arrange to kill the process in ast().
-	 */
-	if (p->p_cpulimit != RLIM_INFINITY &&
-	    p->p_rux.rux_runtime.sec >= p->p_cpulimit) {
-		p->p_sflag |= PS_XCPU;
-		td->td_flags |= TDF_ASTPENDING;
-	}
-
+	if (flags & SW_VOL)
+		td->td_ru.ru_nvcsw++;
+	else
+		td->td_ru.ru_nivcsw++;
 	/*
-	 * Finish up stats for outgoing thread.
+	 * Compute the amount of time during which the current
+	 * thread was running, and add that to its total so far.
 	 */
-	cnt.v_swtch++;
+	new_switchtime = cpu_ticks();
+	td->td_runtime += new_switchtime - PCPU_GET(switchtime);
 	PCPU_SET(switchtime, new_switchtime);
+	td->td_generation++;	/* bump preempt-detect counter */
+	PCPU_INC(cnt.v_swtch);
 	PCPU_SET(switchticks, ticks);
-	CTR4(KTR_PROC, "mi_switch: old thread %p (kse %p, pid %ld, %s)",
-	    (void *)td, td->td_sched, (long)p->p_pid, p->p_comm);
-	if ((flags & SW_VOL) && (td->td_proc->p_flag & P_SA))
-		newtd = thread_switchout(td, flags, newtd);
+	CTR4(KTR_PROC, "mi_switch: old thread %ld (kse %p, pid %ld, %s)",
+	    td->td_tid, td->td_sched, p->p_pid, p->p_comm);
 #if (KTR_COMPILE & KTR_SCHED) != 0
-	if (td == PCPU_GET(idlethread))
+	if (TD_IS_IDLETHREAD(td))
 		CTR3(KTR_SCHED, "mi_switch: %p(%s) prio %d idle",
 		    td, td->td_proc->p_comm, td->td_priority);
 	else if (newtd != NULL)
@@ -417,12 +431,20 @@
 		    td, td->td_proc->p_comm, td->td_priority,
 		    td->td_inhibitors, td->td_wmesg, td->td_lockname);
 #endif
+	/*
+	 * We call thread_switchout after the KTR_SCHED prints above so kse
+	 * selecting a new thread to run does not show up as a preemption.
+	 */
+#ifdef KSE
+	if ((flags & SW_VOL) && (td->td_proc->p_flag & P_SA))
+		newtd = thread_switchout(td, flags, newtd);
+#endif
 	sched_switch(td, newtd, flags);
 	CTR3(KTR_SCHED, "mi_switch: running %p(%s) prio %d",
 	    td, td->td_proc->p_comm, td->td_priority);
 
-	CTR4(KTR_PROC, "mi_switch: new thread %p (kse %p, pid %ld, %s)",
-	    (void *)td, td->td_sched, (long)p->p_pid, p->p_comm);
+	CTR4(KTR_PROC, "mi_switch: new thread %ld (kse %p, pid %ld, %s)",
+	    td->td_tid, td->td_sched, p->p_pid, p->p_comm);
 
 	/* 
 	 * If the last thread was exiting, finish cleaning it up.
@@ -441,16 +463,10 @@
 void
 setrunnable(struct thread *td)
 {
-	struct proc *p;
 
-	p = td->td_proc;
-	mtx_assert(&sched_lock, MA_OWNED);
-	switch (p->p_state) {
-	case PRS_ZOMBIE:
-		panic("setrunnable(1)");
-	default:
-		break;
-	}
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
+	KASSERT(td->td_proc->p_state != PRS_ZOMBIE,
+	    ("setrunnable: pid %d is a zombie", td->td_proc->p_pid));
 	switch (td->td_state) {
 	case TDS_RUNNING:
 	case TDS_RUNQ:
@@ -469,11 +485,11 @@
 		printf("state is 0x%x", td->td_state);
 		panic("setrunnable(2)");
 	}
-	if ((p->p_sflag & PS_INMEM) == 0) {
-		if ((p->p_sflag & PS_SWAPPINGIN) == 0) {
-			p->p_sflag |= PS_SWAPINREQ;
+	if ((td->td_flags & TDF_INMEM) == 0) {
+		if ((td->td_flags & TDF_SWAPINREQ) == 0) {
+			td->td_flags |= TDF_SWAPINREQ;
 			/*
-			 * due to a LOR between sched_lock and
+			 * due to a LOR between the thread lock and
 			 * the sleepqueue chain locks, use
 			 * lower level scheduling functions.
 			 */
@@ -532,19 +548,16 @@
 }
 
 /*
- * General purpose yield system call
+ * General purpose yield system call.
  */
 int
 yield(struct thread *td, struct yield_args *uap)
 {
-	struct ksegrp *kg;
 
-	kg = td->td_ksegrp;
-	mtx_assert(&Giant, MA_NOTOWNED);
-	mtx_lock_spin(&sched_lock);
+	thread_lock(td);
 	sched_prio(td, PRI_MAX_TIMESHARE);
 	mi_switch(SW_VOL, NULL);
-	mtx_unlock_spin(&sched_lock);
+	thread_unlock(td);
 	td->td_retval[0] = 0;
 	return (0);
 }
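
The kern_synch.c rewrite generalizes msleep() into _sleep() on a struct lock_object (so lock classes other than plain mutexes can be slept on), moves mi_switch()'s run-time accounting onto cpu_ticks(), and introduces pause(9) on a private wait channel. A consumer-side sketch of the two sleep interfaces, using a hypothetical driver softc sc; not a standalone program:

	/* msleep(9) atomically drops sc_mtx while asleep and retakes it. */
	mtx_lock(&sc->sc_mtx);
	while (sc->sc_busy) {
		error = msleep(&sc->sc_busy, &sc->sc_mtx, PCATCH,
		    "busywt", hz);
		if (error != 0 && error != EWOULDBLOCK)
			break;
	}
	mtx_unlock(&sc->sc_mtx);

	/* pause(9): sleep roughly 100 ms; no wakeup() from anyone needed. */
	pause("backoff", hz / 10);
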
Index: sysv_msg.c
===================================================================
RCS file: /home/cvs/src/sys/kern/sysv_msg.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/sysv_msg.c -L sys/kern/sysv_msg.c -u -r1.1.1.1 -r1.2
--- sys/kern/sysv_msg.c
+++ sys/kern/sysv_msg.c
@@ -48,7 +48,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/sysv_msg.c,v 1.60 2005/02/12 01:22:39 csjp Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/sysv_msg.c,v 1.70 2007/06/12 00:11:59 rwatson Exp $");
 
 #include "opt_sysvipc.h"
 #include "opt_mac.h"
@@ -57,9 +57,9 @@
 #include <sys/systm.h>
 #include <sys/sysproto.h>
 #include <sys/kernel.h>
+#include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/lock.h>
-#include <sys/mac.h>
 #include <sys/mutex.h>
 #include <sys/module.h>
 #include <sys/msg.h>
@@ -70,6 +70,8 @@
 #include <sys/malloc.h>
 #include <sys/jail.h>
 
+#include <security/mac/mac_framework.h>
+
 static MALLOC_DEFINE(M_MSG, "msg", "SVID compatible message queues");
 
 static void msginit(void);
@@ -81,11 +83,6 @@
 #else
 #define DPRINTF(a)
 #endif
-#ifdef MAC_DEBUG
-#define MPRINTF(a)	printf a
-#else
-#define MPRINTF(a)
-#endif
 
 static void msg_freehdr(struct msg *msghdr);
 
@@ -323,9 +320,7 @@
 MODULE_VERSION(sysvmsg, 1);
 
 /*
- * Entry point for all MSG calls
- *
- * MPSAFE
+ * Entry point for all MSG calls.
  */
 int
 msgsys(td, uap)
@@ -385,10 +380,6 @@
 	struct	msqid_ds *buf;
 };
 #endif
-
-/*
- * MPSAFE
- */
 int
 msgctl(td, uap)
 	struct thread *td;
@@ -399,7 +390,7 @@
 	struct msqid_ds msqbuf;
 	int error;
 
-	DPRINTF(("call to msgctl(%d, %d, 0x%x)\n", msqid, cmd, uap->buf));
+	DPRINTF(("call to msgctl(%d, %d, %p)\n", msqid, cmd, uap->buf));
 	if (cmd == IPC_SET &&
 	    (error = copyin(uap->buf, &msqbuf, sizeof(msqbuf))) != 0)
 		return (error);
@@ -445,10 +436,8 @@
 	}
 #ifdef MAC
 	error = mac_check_sysv_msqctl(td->td_ucred, msqkptr, cmd);
-	if (error != 0) {
-		MPRINTF(("mac_check_sysv_msqctl returned %d\n", error));
+	if (error != 0)
 		goto done2;
-	}
 #endif
 
 	error = 0;
@@ -475,11 +464,8 @@
 		for (msghdr = msqkptr->u.msg_first; msghdr != NULL;
 		    msghdr = msghdr->msg_next) {
 			error = mac_check_sysv_msgrmid(td->td_ucred, msghdr);
-			if (error != 0) {
-				MPRINTF(("mac_check_sysv_msgrmid returned %d\n",
-				    error));
+			if (error != 0)
 				goto done2;
-			}
 		}
 #endif
 
@@ -516,7 +502,7 @@
 		if ((error = ipcperm(td, &msqkptr->u.msg_perm, IPC_M)))
 			goto done2;
 		if (msqbuf->msg_qbytes > msqkptr->u.msg_qbytes) {
-			error = suser(td);
+			error = priv_check(td, PRIV_IPC_MSGSIZE);
 			if (error)
 				goto done2;
 		}
@@ -565,10 +551,6 @@
 	int	msgflg;
 };
 #endif
-
-/*
- * MPSAFE
- */
 int
 msgget(td, uap)
 	struct thread *td;
@@ -608,11 +590,8 @@
 			}
 #ifdef MAC
 			error = mac_check_sysv_msqget(cred, msqkptr);
-			if (error != 0) {
-				MPRINTF(("mac_check_sysv_msqget returned %d\n",
-				    error));
+			if (error != 0)
 				goto done2;
-			}
 #endif
 			goto found;
 		}
@@ -681,46 +660,40 @@
 	int	msgflg;
 };
 #endif
-
-/*
- * MPSAFE
- */
 int
-msgsnd(td, uap)
+kern_msgsnd(td, msqid, msgp, msgsz, msgflg, mtype)
 	struct thread *td;
-	register struct msgsnd_args *uap;
+	int msqid;
+	const void *msgp;	/* XXX msgp is actually mtext. */
+	size_t msgsz;
+	int msgflg;
+	long mtype;
 {
-	int msqid = uap->msqid;
-	const void *user_msgp = uap->msgp;
-	size_t msgsz = uap->msgsz;
-	int msgflg = uap->msgflg;
-	int segs_needed, error = 0;
+	int msqix, segs_needed, error = 0;
 	register struct msqid_kernel *msqkptr;
 	register struct msg *msghdr;
 	short next;
 
-	DPRINTF(("call to msgsnd(%d, 0x%x, %d, %d)\n", msqid, user_msgp, msgsz,
-	    msgflg));
 	if (!jail_sysvipc_allowed && jailed(td->td_ucred))
 		return (ENOSYS);
 
 	mtx_lock(&msq_mtx);
-	msqid = IPCID_TO_IX(msqid);
+	msqix = IPCID_TO_IX(msqid);
 
-	if (msqid < 0 || msqid >= msginfo.msgmni) {
-		DPRINTF(("msqid (%d) out of range (0<=msqid<%d)\n", msqid,
+	if (msqix < 0 || msqix >= msginfo.msgmni) {
+		DPRINTF(("msqid (%d) out of range (0<=msqid<%d)\n", msqix,
 		    msginfo.msgmni));
 		error = EINVAL;
 		goto done2;
 	}
 
-	msqkptr = &msqids[msqid];
+	msqkptr = &msqids[msqix];
 	if (msqkptr->u.msg_qbytes == 0) {
 		DPRINTF(("no such message queue id\n"));
 		error = EINVAL;
 		goto done2;
 	}
-	if (msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(uap->msqid)) {
+	if (msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) {
 		DPRINTF(("wrong sequence number\n"));
 		error = EINVAL;
 		goto done2;
@@ -733,15 +706,13 @@
 
 #ifdef MAC
 	error = mac_check_sysv_msqsnd(td->td_ucred, msqkptr);
-	if (error != 0) {
-		MPRINTF(("mac_check_sysv_msqsnd returned %d\n", error));
+	if (error != 0)
 		goto done2;
-	}
 #endif
 
 	segs_needed = (msgsz + msginfo.msgssz - 1) / msginfo.msgssz;
-	DPRINTF(("msgsz=%d, msgssz=%d, segs_needed=%d\n", msgsz, msginfo.msgssz,
-	    segs_needed));
+	DPRINTF(("msgsz=%zu, msgssz=%d, segs_needed=%d\n", msgsz,
+	    msginfo.msgssz, segs_needed));
 	for (;;) {
 		int need_more_resources = 0;
 
@@ -793,12 +764,16 @@
 				msqkptr->u.msg_perm.mode |= MSG_LOCKED;
 				we_own_it = 1;
 			}
-			DPRINTF(("goodnight\n"));
+			DPRINTF(("msgsnd:  goodnight\n"));
 			error = msleep(msqkptr, &msq_mtx, (PZERO - 4) | PCATCH,
-			    "msgwait", 0);
-			DPRINTF(("good morning, error=%d\n", error));
+			    "msgsnd", hz);
+			DPRINTF(("msgsnd:  good morning, error=%d\n", error));
 			if (we_own_it)
 				msqkptr->u.msg_perm.mode &= ~MSG_LOCKED;
+			if (error == EWOULDBLOCK) {
+				DPRINTF(("msgsnd:  timed out\n"));
+				continue;
+			}
 			if (error != 0) {
 				DPRINTF(("msgsnd:  interrupted system call\n"));
 				error = EINTR;
@@ -852,6 +827,7 @@
 	free_msghdrs = msghdr->msg_next;
 	msghdr->msg_spot = -1;
 	msghdr->msg_ts = msgsz;
+	msghdr->msg_type = mtype;
 #ifdef MAC
 	/*
 	 * XXXMAC: Should the mac_check_sysv_msgmsq check follow here
@@ -884,23 +860,6 @@
 	}
 
 	/*
-	 * Copy in the message type
-	 */
-
-	mtx_unlock(&msq_mtx);
-	if ((error = copyin(user_msgp, &msghdr->msg_type,
-	    sizeof(msghdr->msg_type))) != 0) {
-		mtx_lock(&msq_mtx);
-		DPRINTF(("error %d copying the message type\n", error));
-		msg_freehdr(msghdr);
-		msqkptr->u.msg_perm.mode &= ~MSG_LOCKED;
-		wakeup(msqkptr);
-		goto done2;
-	}
-	mtx_lock(&msq_mtx);
-	user_msgp = (const char *)user_msgp + sizeof(msghdr->msg_type);
-
-	/*
 	 * Validate the message type
 	 */
 
@@ -908,7 +867,7 @@
 		msg_freehdr(msghdr);
 		msqkptr->u.msg_perm.mode &= ~MSG_LOCKED;
 		wakeup(msqkptr);
-		DPRINTF(("mtype (%d) < 1\n", msghdr->msg_type));
+		DPRINTF(("mtype (%ld) < 1\n", msghdr->msg_type));
 		error = EINVAL;
 		goto done2;
 	}
@@ -929,7 +888,7 @@
 		if (next >= msginfo.msgseg)
 			panic("next out of range #2");
 		mtx_unlock(&msq_mtx);
-		if ((error = copyin(user_msgp, &msgpool[next * msginfo.msgssz],
+		if ((error = copyin(msgp, &msgpool[next * msginfo.msgssz],
 		    tlen)) != 0) {
 			mtx_lock(&msq_mtx);
 			DPRINTF(("error %d copying in message segment\n",
@@ -941,7 +900,7 @@
 		}
 		mtx_lock(&msq_mtx);
 		msgsz -= tlen;
-		user_msgp = (const char *)user_msgp + tlen;
+		msgp = (const char *)msgp + tlen;
 		next = msgmaps[next].next;
 	}
 	if (next != -1)
@@ -978,7 +937,6 @@
 	 */
 	error = mac_check_sysv_msgmsq(td->td_ucred, msghdr, msqkptr);
 	if (error != 0) {
-		MPRINTF(("mac_check_sysv_msqmsq returned %d\n", error));
 		msg_freehdr(msghdr);
 		wakeup(msqkptr);
 		goto done2;
@@ -1009,6 +967,26 @@
 	return (error);
 }
 
+int
+msgsnd(td, uap)
+	struct thread *td;
+	register struct msgsnd_args *uap;
+{
+	int error;
+	long mtype;
+
+	DPRINTF(("call to msgsnd(%d, %p, %zu, %d)\n", uap->msqid, uap->msgp,
+	    uap->msgsz, uap->msgflg));
+
+	if ((error = copyin(uap->msgp, &mtype, sizeof(mtype))) != 0) {
+		DPRINTF(("error %d copying the message type\n", error));
+		return (error);
+	}
+	return (kern_msgsnd(td, uap->msqid,
+	    (const char *)uap->msgp + sizeof(mtype),
+	    uap->msgsz, uap->msgflg, mtype));
+}
+
 #ifndef _SYS_SYSPROTO_H_
 struct msgrcv_args {
 	int	msqid;
@@ -1018,48 +996,41 @@
 	int	msgflg;
 };
 #endif
-
-/*
- * MPSAFE
- */
 int
-msgrcv(td, uap)
+kern_msgrcv(td, msqid, msgp, msgsz, msgtyp, msgflg, mtype)
 	struct thread *td;
-	register struct msgrcv_args *uap;
+	int msqid;
+	void *msgp;	/* XXX msgp is actually mtext. */
+	size_t msgsz;
+	long msgtyp;
+	int msgflg;
+	long *mtype;
 {
-	int msqid = uap->msqid;
-	void *user_msgp = uap->msgp;
-	size_t msgsz = uap->msgsz;
-	long msgtyp = uap->msgtyp;
-	int msgflg = uap->msgflg;
 	size_t len;
 	register struct msqid_kernel *msqkptr;
 	register struct msg *msghdr;
-	int error = 0;
+	int msqix, error = 0;
 	short next;
 
-	DPRINTF(("call to msgrcv(%d, 0x%x, %d, %ld, %d)\n", msqid, user_msgp,
-	    msgsz, msgtyp, msgflg));
-
 	if (!jail_sysvipc_allowed && jailed(td->td_ucred))
 		return (ENOSYS);
 
-	msqid = IPCID_TO_IX(msqid);
+	msqix = IPCID_TO_IX(msqid);
 
-	if (msqid < 0 || msqid >= msginfo.msgmni) {
-		DPRINTF(("msqid (%d) out of range (0<=msqid<%d)\n", msqid,
+	if (msqix < 0 || msqix >= msginfo.msgmni) {
+		DPRINTF(("msqid (%d) out of range (0<=msqid<%d)\n", msqix,
 		    msginfo.msgmni));
 		return (EINVAL);
 	}
 
-	msqkptr = &msqids[msqid];
+	msqkptr = &msqids[msqix];
 	mtx_lock(&msq_mtx);
 	if (msqkptr->u.msg_qbytes == 0) {
 		DPRINTF(("no such message queue id\n"));
 		error = EINVAL;
 		goto done2;
 	}
-	if (msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(uap->msqid)) {
+	if (msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) {
 		DPRINTF(("wrong sequence number\n"));
 		error = EINVAL;
 		goto done2;
@@ -1072,10 +1043,8 @@
 
 #ifdef MAC
 	error = mac_check_sysv_msqrcv(td->td_ucred, msqkptr);
-	if (error != 0) {
-		MPRINTF(("mac_check_sysv_msqrcv returned %d\n", error));
+	if (error != 0)
 		goto done2;
-	}
 #endif
 
 	msghdr = NULL;
@@ -1086,7 +1055,7 @@
 				if (msgsz < msghdr->msg_ts &&
 				    (msgflg & MSG_NOERROR) == 0) {
 					DPRINTF(("first message on the queue "
-					    "is too big (want %d, got %d)\n",
+					    "is too big (want %zu, got %d)\n",
 					    msgsz, msghdr->msg_ts));
 					error = E2BIG;
 					goto done2;
@@ -1094,11 +1063,8 @@
 #ifdef MAC
 				error = mac_check_sysv_msgrcv(td->td_ucred,
 				    msghdr);
-				if (error != 0) {
-					MPRINTF(("mac_check_sysv_msgrcv "
-					    "returned %d\n", error));
+				if (error != 0)
 					goto done2;
-				}
 #endif
 				if (msqkptr->u.msg_first == msqkptr->u.msg_last) {
 					msqkptr->u.msg_first = NULL;
@@ -1127,14 +1093,14 @@
 
 				if (msgtyp == msghdr->msg_type ||
 				    msghdr->msg_type <= -msgtyp) {
-					DPRINTF(("found message type %d, "
-					    "requested %d\n",
+					DPRINTF(("found message type %ld, "
+					    "requested %ld\n",
 					    msghdr->msg_type, msgtyp));
 					if (msgsz < msghdr->msg_ts &&
 					    (msgflg & MSG_NOERROR) == 0) {
 						DPRINTF(("requested message "
 						    "on the queue is too big "
-						    "(want %d, got %d)\n",
+						    "(want %zu, got %hu)\n",
 						    msgsz, msghdr->msg_ts));
 						error = E2BIG;
 						goto done2;
@@ -1142,12 +1108,8 @@
 #ifdef MAC
 					error = mac_check_sysv_msgrcv(
 					    td->td_ucred, msghdr);
-					if (error != 0) {
-						MPRINTF(("mac_check_sysv_"
-						    "msgrcv returned %d\n",
-						    error));
+					if (error != 0)
 						goto done2;
-					}
 #endif
 					*prev = msghdr->msg_next;
 					if (msghdr == msqkptr->u.msg_last) {
@@ -1188,7 +1150,7 @@
 		 */
 
 		if ((msgflg & IPC_NOWAIT) != 0) {
-			DPRINTF(("no appropriate message found (msgtyp=%d)\n",
+			DPRINTF(("no appropriate message found (msgtyp=%ld)\n",
 			    msgtyp));
 			/* The SVID says to return ENOMSG. */
 			error = ENOMSG;
@@ -1201,11 +1163,11 @@
 
 		DPRINTF(("msgrcv:  goodnight\n"));
 		error = msleep(msqkptr, &msq_mtx, (PZERO - 4) | PCATCH,
-		    "msgwait", 0);
+		    "msgrcv", 0);
 		DPRINTF(("msgrcv:  good morning (error=%d)\n", error));
 
 		if (error != 0) {
-			DPRINTF(("msgsnd:  interrupted system call\n"));
+			DPRINTF(("msgrcv:  interrupted system call\n"));
 			error = EINTR;
 			goto done2;
 		}
@@ -1215,7 +1177,7 @@
 		 */
 
 		if (msqkptr->u.msg_qbytes == 0 ||
-		    msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(uap->msqid)) {
+		    msqkptr->u.msg_perm.seq != IPCID_TO_SEQ(msqid)) {
 			DPRINTF(("msqid deleted\n"));
 			error = EIDRM;
 			goto done2;
@@ -1239,26 +1201,11 @@
 	 * (since msgsz is never increased).
 	 */
 
-	DPRINTF(("found a message, msgsz=%d, msg_ts=%d\n", msgsz,
+	DPRINTF(("found a message, msgsz=%zu, msg_ts=%hu\n", msgsz,
 	    msghdr->msg_ts));
 	if (msgsz > msghdr->msg_ts)
 		msgsz = msghdr->msg_ts;
-
-	/*
-	 * Return the type to the user.
-	 */
-
-	mtx_unlock(&msq_mtx);
-	error = copyout(&(msghdr->msg_type), user_msgp,
-	    sizeof(msghdr->msg_type));
-	mtx_lock(&msq_mtx);
-	if (error != 0) {
-		DPRINTF(("error (%d) copying out message type\n", error));
-		msg_freehdr(msghdr);
-		wakeup(msqkptr);
-		goto done2;
-	}
-	user_msgp = (char *)user_msgp + sizeof(msghdr->msg_type);
+	*mtype = msghdr->msg_type;
 
 	/*
 	 * Return the segments to the user
@@ -1277,8 +1224,7 @@
 		if (next >= msginfo.msgseg)
 			panic("next out of range #3");
 		mtx_unlock(&msq_mtx);
-		error = copyout(&msgpool[next * msginfo.msgssz],
-		    user_msgp, tlen);
+		error = copyout(&msgpool[next * msginfo.msgssz], msgp, tlen);
 		mtx_lock(&msq_mtx);
 		if (error != 0) {
 			DPRINTF(("error (%d) copying out message segment\n",
@@ -1287,7 +1233,7 @@
 			wakeup(msqkptr);
 			goto done2;
 		}
-		user_msgp = (char *)user_msgp + tlen;
+		msgp = (char *)msgp + tlen;
 		next = msgmaps[next].next;
 	}
 
@@ -1303,6 +1249,26 @@
 	return (error);
 }
 
+int
+msgrcv(td, uap)
+	struct thread *td;
+	register struct msgrcv_args *uap;
+{
+	int error;
+	long mtype;
+
+	DPRINTF(("call to msgrcv(%d, %p, %zu, %ld, %d)\n", uap->msqid,
+	    uap->msgp, uap->msgsz, uap->msgtyp, uap->msgflg));
+
+	if ((error = kern_msgrcv(td, uap->msqid,
+	    (char *)uap->msgp + sizeof(mtype), uap->msgsz,
+	    uap->msgtyp, uap->msgflg, &mtype)) != 0)
+		return (error);
+	if ((error = copyout(&mtype, uap->msgp, sizeof(mtype))) != 0)
+		DPRINTF(("error %d copying the message type\n", error));
+	return (error);
+}
+
 static int
 sysctl_msqids(SYSCTL_HANDLER_ARGS)
 {
@@ -1311,7 +1277,6 @@
 	    sizeof(struct msqid_kernel) * msginfo.msgmni));
 }
 
-SYSCTL_DECL(_kern_ipc);
 SYSCTL_INT(_kern_ipc, OID_AUTO, msgmax, CTLFLAG_RD, &msginfo.msgmax, 0,
     "Maximum message size");
 SYSCTL_INT(_kern_ipc, OID_AUTO, msgmni, CTLFLAG_RDTUN, &msginfo.msgmni, 0,
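For reference, the userland contract that the kern_msgsnd()/kern_msgrcv() split above preserves: the first long of the caller's buffer is the message type, and the thin msgsnd()/msgrcv() syscall wrappers now copy that long in and out themselves before handing only the message text to the kern_* routines. A minimal userland sketch of that layout (queue key, buffer size and text are arbitrary example values, not taken from the diff):

#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/msg.h>
#include <stdio.h>
#include <string.h>

struct mymsg {
	long	mtype;		/* copied separately by the syscall wrappers */
	char	mtext[64];	/* copied segment by segment under msq_mtx */
};

int
main(void)
{
	struct mymsg m;
	int id;

	if ((id = msgget(IPC_PRIVATE, IPC_CREAT | 0600)) == -1)
		return (1);
	m.mtype = 1;
	strlcpy(m.mtext, "hello", sizeof(m.mtext));
	if (msgsnd(id, &m, sizeof(m.mtext), 0) == -1 ||
	    msgrcv(id, &m, sizeof(m.mtext), 1, 0) == -1)
		return (1);
	printf("type %ld: %s\n", m.mtype, m.mtext);
	return (msgctl(id, IPC_RMID, NULL) == -1);
}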
Index: subr_kobj.c
===================================================================
RCS file: /home/cvs/src/sys/kern/subr_kobj.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/subr_kobj.c -L sys/kern/subr_kobj.c -u -r1.1.1.1 -r1.2
--- sys/kern/subr_kobj.c
+++ sys/kern/subr_kobj.c
@@ -25,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/subr_kobj.c,v 1.8 2003/10/16 09:16:28 dfr Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/subr_kobj.c,v 1.10 2005/12/29 18:00:42 jhb Exp $");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
@@ -57,6 +57,7 @@
 #endif
 
 static struct mtx kobj_mtx;
+static int kobj_mutex_inited;
 static int kobj_next_id = 1;
 
 SYSCTL_UINT(_kern, OID_AUTO, kobj_methodcount, CTLFLAG_RD,
@@ -65,12 +66,20 @@
 static void
 kobj_init_mutex(void *arg)
 {
-
-	mtx_init(&kobj_mtx, "kobj", NULL, MTX_DEF);
+	if (!kobj_mutex_inited) {
+		mtx_init(&kobj_mtx, "kobj", NULL, MTX_DEF);
+		kobj_mutex_inited = 1;
+	}
 }
 
 SYSINIT(kobj, SI_SUB_LOCK, SI_ORDER_ANY, kobj_init_mutex, NULL);
 
+void
+kobj_machdep_init(void)
+{
+	kobj_init_mutex(NULL);
+}
+
 /*
  * This method structure is used to initialise new caches. Since the
  * desc pointer is NULL, it is guaranteed never to match any read
@@ -228,7 +237,7 @@
 	 * a 'miss'.
 	 */
 	kobj_lookup_hits--;
-	kobj_lookup_misses--;
+	kobj_lookup_misses++;
 #endif
 
 	ce = kobj_lookup_method_mi(cls, desc);
Index: kern_alq.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_alq.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/kern_alq.c -L sys/kern/kern_alq.c -u -r1.1.1.1 -r1.2
--- sys/kern/kern_alq.c
+++ sys/kern/kern_alq.c
@@ -25,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_alq.c,v 1.12 2005/04/16 12:12:27 rwatson Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_alq.c,v 1.19 2007/06/01 14:33:11 kib Exp $");
 
 #include "opt_mac.h"
 
@@ -34,7 +34,7 @@
 #include <sys/kernel.h>
 #include <sys/kthread.h>
 #include <sys/lock.h>
-#include <sys/mac.h>
+#include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/proc.h>
@@ -45,6 +45,8 @@
 #include <sys/fcntl.h>
 #include <sys/eventhandler.h>
 
+#include <security/mac/mac_framework.h>
+
 /* Async. Logging Queue */
 struct alq {
 	int	aq_entmax;		/* Max entries */
@@ -172,8 +174,6 @@
 	int needwakeup;
 	struct alq *alq;
 
-	mtx_lock(&Giant);
-
 	ald_thread = FIRST_THREAD_IN_PROC(ald_proc);
 
 	EVENTHANDLER_REGISTER(shutdown_pre_sync, ald_shutdown, NULL,
@@ -250,6 +250,7 @@
 	struct ale *alstart;
 	int totlen;
 	int iov;
+	int vfslocked;
 
 	vp = alq->aq_vp;
 	td = curthread;
@@ -291,6 +292,7 @@
 	/*
 	 * Do all of the junk required to write now.
 	 */
+	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	vn_start_write(vp, &mp, V_WAIT);
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 	VOP_LEASE(vp, td, alq->aq_cred, LEASE_WRITE);
@@ -303,6 +305,7 @@
 		VOP_WRITE(vp, &auio, IO_UNIT | IO_APPEND, alq->aq_cred);
 	VOP_UNLOCK(vp, 0, td);
 	vn_finished_write(mp);
+	VFS_UNLOCK_GIANT(vfslocked);
 
 	ALQ_LOCK(alq);
 	alq->aq_flags &= ~AQ_FLUSHING;
@@ -345,21 +348,23 @@
 	char *bufp;
 	int flags;
 	int error;
-	int i;
+	int i, vfslocked;
 
 	*alqp = NULL;
 	td = curthread;
 
-	NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, file, td);
+	NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_SYSSPACE, file, td);
 	flags = FWRITE | O_NOFOLLOW | O_CREAT;
 
-	error = vn_open_cred(&nd, &flags, cmode, cred, -1);
+	error = vn_open_cred(&nd, &flags, cmode, cred, NULL);
 	if (error)
 		return (error);
-	
+
+	vfslocked = NDHASGIANT(&nd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	/* We just unlock so we hold a reference */
 	VOP_UNLOCK(nd.ni_vp, 0, td);
+	VFS_UNLOCK_GIANT(vfslocked);
 
 	alq = malloc(sizeof(*alq), M_ALD, M_WAITOK|M_ZERO);
 	alq->aq_entbuf = malloc(count * size, M_ALD, M_WAITOK|M_ZERO);
Index: kern_syscalls.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_syscalls.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/kern_syscalls.c -L sys/kern/kern_syscalls.c -u -r1.1.1.1 -r1.2
--- sys/kern/kern_syscalls.c
+++ sys/kern/kern_syscalls.c
@@ -25,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_syscalls.c,v 1.11 2004/07/15 08:26:05 phk Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_syscalls.c,v 1.12 2006/08/01 16:32:20 jhb Exp $");
 
 #include <sys/param.h>
 #include <sys/sysproto.h>
@@ -97,8 +97,11 @@
        case MOD_LOAD :
                error = syscall_register(data->offset, data->new_sysent,
                                         &data->old_sysent);
-               if (error)
+               if (error) {
+                       /* Leave a mark so we know to safely unload below. */
+                       data->offset = NULL;
                        return error;
+               }
 	       ms.intval = *data->offset;
 	       MOD_XLOCK;
 	       module_setspecific(mod, &ms);
@@ -108,6 +111,13 @@
                return error;
 
        case MOD_UNLOAD :
+               /*
+                * MOD_LOAD failed, so just return without calling the
+                * chained handler since we didn't pass along the MOD_LOAD
+                * event.
+                */
+               if (data->offset == NULL)
+                       return (0);
                if (data->chainevh) {
                        error = data->chainevh(mod, what, data->chainarg);
                        if (error)
Index: subr_unit.c
===================================================================
RCS file: /home/cvs/src/sys/kern/subr_unit.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/subr_unit.c -L sys/kern/subr_unit.c -u -r1.1.1.1 -r1.2
--- sys/kern/subr_unit.c
+++ sys/kern/subr_unit.c
@@ -23,7 +23,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: src/sys/kern/subr_unit.c,v 1.7 2005/03/14 06:51:29 phk Exp $
+ * $FreeBSD: src/sys/kern/subr_unit.c,v 1.9 2007/07/04 06:56:57 kib Exp $
  *
  *
  * Unit number allocation functions.
@@ -197,6 +197,8 @@
 	u_int			first;	/* items in allocated from start */
 	u_int			last;	/* items free at end */
 	struct mtx		*mtx;
+	TAILQ_HEAD(unrfr,unr)	ppfree;	/* Items to be freed after mtx
+					   lock dropped */
 };
 
 
@@ -281,9 +283,35 @@
 static __inline void
 delete_unr(struct unrhdr *uh, void *ptr)
 {
+	struct unr *up;
 
 	uh->alloc--;
-	Free(ptr);
+	up = ptr;
+	TAILQ_INSERT_TAIL(&uh->ppfree, up, list);
+}
+
+void
+clean_unrhdrl(struct unrhdr *uh)
+{
+	struct unr *up;
+
+	mtx_assert(uh->mtx, MA_OWNED);
+	while ((up = TAILQ_FIRST(&uh->ppfree)) != NULL) {
+		TAILQ_REMOVE(&uh->ppfree, up, list);
+		mtx_unlock(uh->mtx);
+		Free(up);
+		mtx_lock(uh->mtx);
+	}
+
+}
+
+void
+clean_unrhdr(struct unrhdr *uh)
+{
+
+	mtx_lock(uh->mtx);
+	clean_unrhdrl(uh);
+	mtx_unlock(uh->mtx);
 }
 
 /*
@@ -305,6 +333,7 @@
 	else
 		uh->mtx = &unitmtx;
 	TAILQ_INIT(&uh->head);
+	TAILQ_INIT(&uh->ppfree);
 	uh->low = low;
 	uh->high = high;
 	uh->first = 0;
@@ -320,6 +349,8 @@
 	check_unrhdr(uh, __LINE__);
 	KASSERT(uh->busy == 0, ("unrhdr has %u allocations", uh->busy));
 	KASSERT(uh->alloc == 0, ("UNR memory leak in delete_unrhdr"));
+	KASSERT(TAILQ_FIRST(&uh->ppfree) == NULL,
+	    ("unrhdr has postponed item for free"));
 	Free(uh);
 }
 
@@ -591,6 +622,7 @@
 
 	mtx_lock(uh->mtx);
 	i = alloc_unrl(uh);
+	clean_unrhdrl(uh);
 	mtx_unlock(uh->mtx);
 	return (i);
 }
@@ -714,10 +746,12 @@
 {
 	void *p1, *p2;
 
+	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, "free_unr");
 	p1 = Malloc(sizeof(struct unr));
 	p2 = Malloc(sizeof(struct unr));
 	mtx_lock(uh->mtx);
 	free_unrl(uh, item, &p1, &p2);
+	clean_unrhdrl(uh);
 	mtx_unlock(uh->mtx);
 	if (p1 != NULL)
 		Free(p1);
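The new ppfree list and clean_unrhdrl()/clean_unrhdr() above exist so that delete_unr() never calls the allocator while uh->mtx is held: retired nodes are queued and only freed once the mutex can be dropped. A generic in-kernel sketch of that postponed-free idiom, with illustrative names rather than the unr code itself:

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/malloc.h>
#include <sys/queue.h>

struct retired {
	TAILQ_ENTRY(retired)	link;
};
static TAILQ_HEAD(, retired)	pending = TAILQ_HEAD_INITIALIZER(pending);

static void
drain_pending(struct mtx *m)
{
	struct retired *r;

	mtx_assert(m, MA_OWNED);
	while ((r = TAILQ_FIRST(&pending)) != NULL) {
		TAILQ_REMOVE(&pending, r, link);
		mtx_unlock(m);		/* never free while holding the lock */
		free(r, M_TEMP);
		mtx_lock(m);
	}
}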
Index: subr_hints.c
===================================================================
RCS file: /home/cvs/src/sys/kern/subr_hints.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/subr_hints.c -L sys/kern/subr_hints.c -u -r1.1.1.1 -r1.2
--- sys/kern/subr_hints.c
+++ sys/kern/subr_hints.c
@@ -25,11 +25,11 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/subr_hints.c,v 1.11.2.1 2005/10/06 18:29:30 delphij Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/subr_hints.c,v 1.13 2006/07/09 21:42:58 scottl Exp $");
 
 #include <sys/param.h>
 #include <sys/lock.h>
-#include <sys/sx.h>
+#include <sys/mutex.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 
@@ -72,7 +72,7 @@
 			break;
 		case 2:		/* fallback mode */
 			if (dynamic_kenv) {
-				sx_slock(&kenv_lock);
+				mtx_lock(&kenv_lock);
 				cp = kenvp[0];
 				for (i = 0; cp != NULL; cp = kenvp[++i]) {
 					if (!strncmp(cp, "hint.", 5)) {
@@ -81,7 +81,7 @@
 						break;
 					}
 				}
-				sx_sunlock(&kenv_lock);
+				mtx_unlock(&kenv_lock);
 			} else {
 				cp = kern_envp;
 				while (cp) {
@@ -114,11 +114,11 @@
 	}
 
 	if (use_kenv) {
-		sx_slock(&kenv_lock);
+		mtx_lock(&kenv_lock);
 		i = 0;
 		cp = kenvp[0];
 		if (cp == NULL) {
-			sx_sunlock(&kenv_lock);
+			mtx_unlock(&kenv_lock);
 			return (ENOENT);
 		}
 	} else
@@ -165,7 +165,7 @@
 		}
 	}
 	if (use_kenv)
-		sx_sunlock(&kenv_lock);
+		mtx_unlock(&kenv_lock);
 	if (cp == NULL)
 		return ENOENT;
 
Index: link_elf.c
===================================================================
RCS file: /home/cvs/src/sys/kern/link_elf.c,v
retrieving revision 1.1.1.2
retrieving revision 1.2
diff -L sys/kern/link_elf.c -L sys/kern/link_elf.c -u -r1.1.1.2 -r1.2
--- sys/kern/link_elf.c
+++ sys/kern/link_elf.c
@@ -25,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/link_elf.c,v 1.81.8.5 2005/12/30 22:13:58 marcel Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/link_elf.c,v 1.93 2007/05/31 11:51:51 kib Exp $");
 
 #include "opt_gdb.h"
 #include "opt_mac.h"
@@ -37,9 +37,9 @@
 #endif
 #include <sys/kernel.h>
 #include <sys/lock.h>
-#include <sys/mac.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
+#include <sys/mount.h>
 #include <sys/proc.h>
 #include <sys/namei.h>
 #include <sys/fcntl.h>
@@ -48,6 +48,8 @@
 
 #include <machine/elf.h>
 
+#include <security/mac/mac_framework.h>
+
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #ifdef SPARSE_MAPPING
@@ -62,6 +64,8 @@
 
 #include "linker_if.h"
 
+#define MAXSEGS 4
+
 typedef struct elf_file {
     struct linker_file	lf;		/* Common fields */
     int			preloaded;	/* Was file pre-loaded */
@@ -302,9 +306,10 @@
 #endif
 
     (void)link_elf_link_common_finish(linker_kernel_file);
+    linker_kernel_file->flags |= LINKER_FILE_LINKED;
 }
 
-SYSINIT(link_elf, SI_SUB_KLD, SI_ORDER_SECOND, link_elf_init, 0);
+SYSINIT(link_elf, SI_SUB_KLD, SI_ORDER_THIRD, link_elf_init, 0);
 
 static int
 link_elf_preload_parse_symbols(elf_file_t ef)
@@ -536,7 +541,7 @@
     int nbytes, i;
     Elf_Phdr *phdr;
     Elf_Phdr *phlimit;
-    Elf_Phdr *segs[2];
+    Elf_Phdr *segs[MAXSEGS];
     int nsegs;
     Elf_Phdr *phdyn;
     Elf_Phdr *phphdr;
@@ -554,17 +559,17 @@
     int symstrindex;
     int symcnt;
     int strcnt;
-
-    GIANT_REQUIRED;
+    int vfslocked;
 
     shdr = NULL;
     lf = NULL;
 
-    NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, filename, td);
+    NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE, filename, td);
     flags = FREAD;
-    error = vn_open(&nd, &flags, 0, -1);
+    error = vn_open(&nd, &flags, 0, NULL);
     if (error)
 	return error;
+    vfslocked = NDHASGIANT(&nd);
     NDFREE(&nd, NDF_ONLY_PNBUF);
 #ifdef MAC
     error = mac_check_kld_load(curthread->td_ucred, nd.ni_vp);
@@ -643,7 +648,7 @@
 	switch (phdr->p_type) {
 
 	case PT_LOAD:
-	    if (nsegs == 2) {
+	    if (nsegs == MAXSEGS) {
 		link_elf_error("Too many sections");
 		error = ENOEXEC;
 		goto out;
@@ -676,8 +681,8 @@
 	error = ENOEXEC;
 	goto out;
     }
-    if (nsegs != 2) {
-	link_elf_error("Too few sections");
+    if (nsegs == 0) {
+	link_elf_error("No sections");
 	error = ENOEXEC;
 	goto out;
     }
@@ -688,7 +693,8 @@
      */
     base_offset = trunc_page(segs[0]->p_offset);
     base_vaddr = trunc_page(segs[0]->p_vaddr);
-    base_vlimit = round_page(segs[1]->p_vaddr + segs[1]->p_memsz);
+    base_vlimit = round_page(segs[nsegs - 1]->p_vaddr + 
+	segs[nsegs - 1]->p_memsz);
     mapsize = base_vlimit - base_vaddr;
 
     lf = linker_make_file(filename, &link_elf_class);
@@ -726,7 +732,7 @@
     /*
      * Read the text and data sections and zero the bss.
      */
-    for (i = 0; i < 2; i++) {
+    for (i = 0; i < nsegs; i++) {
 	caddr_t segbase = mapbase + segs[i]->p_vaddr - base_vaddr;
 	error = vn_rdwr(UIO_READ, nd.ni_vp,
 			segbase, segs[i]->p_filesz, segs[i]->p_offset,
@@ -755,8 +761,10 @@
 
 #ifdef GPROF
     /* Update profiling information with the new text segment. */
+    mtx_lock(&Giant);
     kmupetext((uintfptr_t)(mapbase + segs[0]->p_vaddr - base_vaddr +
 	segs[0]->p_memsz));
+    mtx_unlock(&Giant);
 #endif
 
     ef->dynamic = (Elf_Dyn *) (mapbase + phdyn->p_vaddr - base_vaddr);
@@ -856,6 +864,7 @@
 	free(firstpage, M_LINKER);
     VOP_UNLOCK(nd.ni_vp, 0, td);
     vn_close(nd.ni_vp, FREAD, td->td_ucred, td);
+    VFS_UNLOCK_GIANT(vfslocked);
 
     return error;
 }
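Both kern_alq.c and link_elf.c above switch to the MPSAFE name-lookup pattern: pass MPSAFE to NDINIT, remember via NDHASGIANT() whether the lookup picked up Giant, and pair that with VFS_UNLOCK_GIANT() once the vnode work is done. A condensed sketch of the sequence, assuming the usual vnode headers; the function name is illustrative and error paths are trimmed:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/fcntl.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/vnode.h>
#include <sys/mount.h>

static int
open_vnode_mpsafe(const char *path, struct thread *td, struct ucred *cred)
{
	struct nameidata nd;
	int flags, error, vfslocked;

	NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE, path, td);
	flags = FREAD;
	error = vn_open_cred(&nd, &flags, 0, cred, NULL);
	if (error)
		return (error);
	vfslocked = NDHASGIANT(&nd);	/* did the lookup take Giant? */
	NDFREE(&nd, NDF_ONLY_PNBUF);
	VOP_UNLOCK(nd.ni_vp, 0, td);
	/* ... operate on nd.ni_vp ... */
	vn_close(nd.ni_vp, FREAD, cred, td);
	VFS_UNLOCK_GIANT(vfslocked);	/* drop Giant only if it was taken */
	return (0);
}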
--- sys/kern/uipc_proto.c
+++ /dev/null
@@ -1,84 +0,0 @@
-/*-
- * Copyright (c) 1982, 1986, 1993
- *	The Regents of the University of California.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 4. Neither the name of the University nor the names of its contributors
- *    may be used to endorse or promote products derived from this software
- *    without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- *	@(#)uipc_proto.c	8.1 (Berkeley) 6/10/93
- */
-
-#include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/uipc_proto.c,v 1.24.8.1 2005/11/16 10:31:21 ru Exp $");
-
-#include <sys/param.h>
-#include <sys/kernel.h>
-#include <sys/domain.h>
-#include <sys/protosw.h>
-#include <sys/socket.h>
-#include <sys/queue.h>
-#include <sys/sysctl.h>
-#include <sys/un.h>
-
-#include <net/raw_cb.h>
-
-/*
- * Definitions of protocols supported in the LOCAL domain.
- */
-
-static struct protosw localsw[] = {
-{
-	.pr_type =		SOCK_STREAM,
-	.pr_domain =		&localdomain,
-	.pr_flags =		PR_CONNREQUIRED|PR_WANTRCVD|PR_RIGHTS,
-	.pr_ctloutput =		&uipc_ctloutput,
-	.pr_usrreqs =		&uipc_usrreqs
-},
-{
-	.pr_type =		SOCK_DGRAM,
-	.pr_domain =		&localdomain,
-	.pr_flags =		PR_ATOMIC|PR_ADDR|PR_RIGHTS,
-	.pr_usrreqs =		&uipc_usrreqs
-},
-{
-	.pr_ctlinput =		raw_ctlinput,
-	.pr_init =		raw_init,
-	.pr_usrreqs =		&raw_usrreqs
-}
-};
-
-struct domain localdomain = {
-	.dom_family =		AF_LOCAL,
-	.dom_name =		"local",
-	.dom_init =		unp_init,
-	.dom_externalize =	unp_externalize,
-	.dom_dispose =		unp_dispose,
-	.dom_protosw =		localsw,
-	.dom_protoswNPROTOSW =	&localsw[sizeof(localsw)/sizeof(localsw[0])]
-};
-DOMAIN_SET(local);
-
-SYSCTL_NODE(_net, PF_LOCAL, local, CTLFLAG_RW, 0, "Local domain");
-SYSCTL_NODE(_net_local, SOCK_STREAM, stream, CTLFLAG_RW, 0, "SOCK_STREAM");
-SYSCTL_NODE(_net_local, SOCK_DGRAM, dgram, CTLFLAG_RW, 0, "SOCK_DGRAM");
Index: md5c.c
===================================================================
RCS file: /home/cvs/src/sys/kern/md5c.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/md5c.c -L sys/kern/md5c.c -u -r1.1.1.1 -r1.2
--- sys/kern/md5c.c
+++ sys/kern/md5c.c
@@ -30,7 +30,7 @@
  * This file should be kept in sync with src/lib/libmd/md5c.c
  */
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/md5c.c,v 1.25 2005/02/10 12:20:42 phk Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/md5c.c,v 1.27 2006/03/30 18:45:50 pjd Exp $");
 
 #include <sys/types.h>
 
@@ -60,10 +60,15 @@
 Encode (unsigned char *output, u_int32_t *input, unsigned int len)
 {
 	unsigned int i;
-	u_int32_t *op = (u_int32_t *)output;
+	uint32_t ip;
 
-	for (i = 0; i < len / 4; i++)
-		op[i] = htole32(input[i]);
+	for (i = 0; i < len / 4; i++) {
+		ip = input[i];
+		*output++ = ip;
+		*output++ = ip >> 8;
+		*output++ = ip >> 16;
+		*output++ = ip >> 24;
+	}
 }
 
 /*
@@ -75,10 +80,11 @@
 Decode (u_int32_t *output, const unsigned char *input, unsigned int len)
 {
 	unsigned int i;
-	const u_int32_t *ip = (const u_int32_t *)input;
 
-	for (i = 0; i < len / 4; i++)
-		output[i] = le32dec(&ip[i]);
+	for (i = 0; i < len; i += 4) { 
+		*output++ = input[i] | (input[i+1] << 8) | (input[i+2] << 16) |
+		    (input[i+3] << 24);
+	}
 }
 #endif
 
@@ -145,12 +151,13 @@
  */
 
 void
-MD5Update (context, input, inputLen)
+MD5Update (context, in, inputLen)
 	MD5_CTX *context;
-	const unsigned char *input;
+	const void *in;
 	unsigned int inputLen;
 {
 	unsigned int i, index, partLen;
+	const unsigned char *input = in;
 
 	/* Compute number of bytes mod 64 */
 	index = (unsigned int)((context->count[0] >> 3) & 0x3F);
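The rewritten Encode()/Decode() above serialize each 32-bit word one byte at a time instead of casting the caller's buffer to u_int32_t *, so the result no longer depends on the buffer's alignment or the host's byte order. The same idiom in isolation, as a standalone sketch rather than the md5c.c code itself:

#include <stdint.h>

/* Store a 32-bit value to memory in little-endian order, byte by byte. */
static void
le32store(unsigned char *p, uint32_t v)
{
	p[0] = v & 0xff;
	p[1] = (v >> 8) & 0xff;
	p[2] = (v >> 16) & 0xff;
	p[3] = (v >> 24) & 0xff;
}

/* Load a little-endian 32-bit value regardless of alignment or host order. */
static uint32_t
le32load(const unsigned char *p)
{
	return (p[0] | (p[1] << 8) | (p[2] << 16) | ((uint32_t)p[3] << 24));
}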
Index: kern_ntptime.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_ntptime.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/kern_ntptime.c -L sys/kern/kern_ntptime.c -u -r1.1.1.1 -r1.2
--- sys/kern/kern_ntptime.c
+++ sys/kern/kern_ntptime.c
@@ -31,7 +31,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_ntptime.c,v 1.59 2005/05/28 14:34:41 rwatson Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_ntptime.c,v 1.64 2007/06/14 18:37:58 rwatson Exp $");
 
 #include "opt_ntp.h"
 
@@ -39,6 +39,7 @@
 #include <sys/systm.h>
 #include <sys/sysproto.h>
 #include <sys/kernel.h>
+#include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
@@ -248,9 +249,8 @@
 /*
  * ntp_gettime() - NTP user application interface
  *
- * See the timex.h header file for synopsis and API description. Note
- * that the TAI offset is returned in the ntvtimeval.tai structure
- * member.
+ * See the timex.h header file for synopsis and API description.  Note that
+ * the TAI offset is returned in the ntvtimeval.tai structure member.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct ntp_gettime_args {
@@ -267,6 +267,7 @@
 	ntp_gettime1(&ntv);
 	mtx_unlock(&Giant);
 
+	td->td_retval[0] = ntv.time_state;
 	return (copyout(&ntv, uap->ntvp, sizeof(ntv)));
 }
 
@@ -292,12 +293,13 @@
 SYSCTL_OPAQUE(_kern_ntp_pll, OID_AUTO, pps_freq, CTLFLAG_RD, &pps_freq, sizeof(pps_freq), "I", "");
 SYSCTL_OPAQUE(_kern_ntp_pll, OID_AUTO, time_freq, CTLFLAG_RD, &time_freq, sizeof(time_freq), "I", "");
 #endif
+
 /*
  * ntp_adjtime() - NTP daemon application interface
  *
- * See the timex.h header file for synopsis and API description. Note
- * that the timex.constant structure member has a dual purpose to set
- * the time constant and to set the TAI offset.
+ * See the timex.h header file for synopsis and API description.  Note that
+ * the timex.constant structure member has a dual purpose to set the time
+ * constant and to set the TAI offset.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct ntp_adjtime_args {
@@ -305,9 +307,6 @@
 };
 #endif
 
-/*
- * MPSAFE
- */
 int
 ntp_adjtime(struct thread *td, struct ntp_adjtime_args *uap)
 {
@@ -333,7 +332,7 @@
 	mtx_lock(&Giant);
 	modes = ntv.modes;
 	if (modes)
-		error = suser(td);
+		error = priv_check(td, PRIV_NTP_ADJTIME);
 	if (error)
 		goto done2;
 	s = splclock();
@@ -925,9 +924,6 @@
 	struct timeval *olddelta;
 };
 #endif
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 adjtime(struct thread *td, struct adjtime_args *uap)
@@ -954,9 +950,6 @@
 	struct timeval atv;
 	int error;
 
-	if ((error = suser(td)))
-		return (error);
-
 	mtx_lock(&Giant);
 	if (olddelta) {
 		atv.tv_sec = time_adjtime / 1000000;
@@ -967,10 +960,15 @@
 		}
 		*olddelta = atv;
 	}
-	if (delta)
+	if (delta) {
+		if ((error = priv_check(td, PRIV_ADJTIME))) {
+			mtx_unlock(&Giant);
+			return (error);
+		}
 		time_adjtime = (int64_t)delta->tv_sec * 1000000 +
 		    delta->tv_usec;
+	}
 	mtx_unlock(&Giant);
-	return (error);
+	return (0);
 }
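With the change above, adjtime(2) demands privilege only when a new delta is actually supplied; merely reading the outstanding correction no longer requires it. A minimal userland sketch of that distinction (the one-second slew is an arbitrary example value):

#include <sys/time.h>
#include <stdio.h>

int
main(void)
{
	struct timeval delta = { 1, 0 }, old;

	/* Querying the pending correction needs no special privilege. */
	if (adjtime(NULL, &old) == 0)
		printf("pending: %ld.%06ld\n", (long)old.tv_sec,
		    (long)old.tv_usec);

	/* Setting a new delta is checked against PRIV_ADJTIME. */
	if (adjtime(&delta, NULL) == -1)
		perror("adjtime");
	return (0);
}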
 
Index: imgact_aout.c
===================================================================
RCS file: /home/cvs/src/sys/kern/imgact_aout.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/imgact_aout.c -L sys/kern/imgact_aout.c -u -r1.2 -r1.3
--- sys/kern/imgact_aout.c
+++ sys/kern/imgact_aout.c
@@ -25,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/imgact_aout.c,v 1.99.2.1 2006/03/16 00:25:31 alc Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/imgact_aout.c,v 1.101.4.1 2008/01/19 18:15:05 kib Exp $");
 
 #include <sys/param.h>
 #include <sys/exec.h>
@@ -198,9 +198,11 @@
 	/*
 	 * Destroy old process VM and create a new one (with a new stack)
 	 */
-	exec_new_vmspace(imgp, &aout_sysvec);
+	error = exec_new_vmspace(imgp, &aout_sysvec);
 
 	vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td);
+	if (error)
+		return (error);
 
 	/*
 	 * The vm space can be changed by exec_new_vmspace
@@ -220,6 +222,7 @@
 		MAP_COPY_ON_WRITE | MAP_PREFAULT);
 	if (error) {
 		vm_map_unlock(map);
+		vm_object_deallocate(object);
 		return (error);
 	}
 	data_end = text_end + a_out->a_data;
@@ -232,6 +235,7 @@
 			MAP_COPY_ON_WRITE | MAP_PREFAULT);
 		if (error) {
 			vm_map_unlock(map);
+			vm_object_deallocate(object);
 			return (error);
 		}
 	}
Index: subr_prof.c
===================================================================
RCS file: /home/cvs/src/sys/kern/subr_prof.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/subr_prof.c -L sys/kern/subr_prof.c -u -r1.1.1.1 -r1.2
--- sys/kern/subr_prof.c
+++ sys/kern/subr_prof.c
@@ -30,7 +30,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/subr_prof.c,v 1.75 2005/03/02 21:33:27 joerg Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/subr_prof.c,v 1.79 2007/06/05 00:00:54 jeff Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -402,9 +402,6 @@
 	u_int	scale;
 };
 #endif
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 profil(td, uap)
@@ -426,12 +423,12 @@
 	}
 	PROC_LOCK(p);
 	upp = &td->td_proc->p_stats->p_prof;
-	mtx_lock_spin(&sched_lock);
+	PROC_SLOCK(p);
 	upp->pr_off = uap->offset;
 	upp->pr_scale = uap->scale;
 	upp->pr_base = uap->samples;
 	upp->pr_size = uap->size;
-	mtx_unlock_spin(&sched_lock);
+	PROC_SUNLOCK(p);
 	startprofclock(p);
 	PROC_UNLOCK(p);
 
@@ -461,7 +458,7 @@
  * inaccurate.
  */
 void
-addupc_intr(struct thread *td, uintptr_t pc, u_int ticks)
+addupc_intr(struct thread *td, uintfptr_t pc, u_int ticks)
 {
 	struct uprof *prof;
 	caddr_t addr;
@@ -471,22 +468,22 @@
 	if (ticks == 0)
 		return;
 	prof = &td->td_proc->p_stats->p_prof;
-	mtx_lock_spin(&sched_lock);
+	PROC_SLOCK(td->td_proc);
 	if (pc < prof->pr_off ||
 	    (i = PC_TO_INDEX(pc, prof)) >= prof->pr_size) {
-		mtx_unlock_spin(&sched_lock);		
+		PROC_SUNLOCK(td->td_proc);
 		return;			/* out of range; ignore */
 	}
 
 	addr = prof->pr_base + i;
-	mtx_unlock_spin(&sched_lock);
+	PROC_SUNLOCK(td->td_proc);
 	if ((v = fuswintr(addr)) == -1 || suswintr(addr, v + ticks) == -1) {
 		td->td_profil_addr = pc;
 		td->td_profil_ticks = ticks;
 		td->td_pflags |= TDP_OWEUPC;
-		mtx_lock_spin(&sched_lock);
+		thread_lock(td);
 		td->td_flags |= TDF_ASTPENDING;
-		mtx_unlock_spin(&sched_lock);
+		thread_unlock(td);
 	}
 }
 
@@ -495,7 +492,7 @@
  * update fails, we simply turn off profiling.
  */
 void
-addupc_task(struct thread *td, uintptr_t pc, u_int ticks)
+addupc_task(struct thread *td, uintfptr_t pc, u_int ticks)
 {
 	struct proc *p = td->td_proc; 
 	struct uprof *prof;
@@ -514,12 +511,15 @@
 	}
 	p->p_profthreads++;
 	prof = &p->p_stats->p_prof;
+	PROC_SLOCK(p);
 	if (pc < prof->pr_off ||
 	    (i = PC_TO_INDEX(pc, prof)) >= prof->pr_size) {
+		PROC_SUNLOCK(p);
 		goto out;
 	}
 
 	addr = prof->pr_base + i;
+	PROC_SUNLOCK(p);
 	PROC_UNLOCK(p);
 	if (copyin(addr, &v, sizeof(v)) == 0) {
 		v += ticks;
Index: kern_thread.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_thread.c,v
retrieving revision 1.4
retrieving revision 1.5
diff -L sys/kern/kern_thread.c -L sys/kern/kern_thread.c -u -r1.4 -r1.5
--- sys/kern/kern_thread.c
+++ sys/kern/kern_thread.c
@@ -27,7 +27,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_thread.c,v 1.216.2.2 2006/02/27 00:19:40 davidxu Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_thread.c,v 1.255.2.1.2.1 2008/01/19 18:15:05 kib Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -44,41 +44,39 @@
 #include <sys/ktr.h>
 #include <sys/umtx.h>
 
+#include <security/audit/audit.h>
+
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/uma.h>
+#include <sys/eventhandler.h>
 
 /*
- * KSEGRP related storage.
+ * thread related storage.
  */
-static uma_zone_t ksegrp_zone;
 static uma_zone_t thread_zone;
 
-/* DEBUG ONLY */
 SYSCTL_NODE(_kern, OID_AUTO, threads, CTLFLAG_RW, 0, "thread allocation");
-static int thread_debug = 0;
-SYSCTL_INT(_kern_threads, OID_AUTO, debug, CTLFLAG_RW,
-	&thread_debug, 0, "thread debug");
 
 int max_threads_per_proc = 1500;
 SYSCTL_INT(_kern_threads, OID_AUTO, max_threads_per_proc, CTLFLAG_RW,
 	&max_threads_per_proc, 0, "Limit on threads per proc");
 
-int max_groups_per_proc = 1500;
-SYSCTL_INT(_kern_threads, OID_AUTO, max_groups_per_proc, CTLFLAG_RW,
-	&max_groups_per_proc, 0, "Limit on thread groups per proc");
-
 int max_threads_hits;
 SYSCTL_INT(_kern_threads, OID_AUTO, max_threads_hits, CTLFLAG_RD,
 	&max_threads_hits, 0, "");
 
+#ifdef KSE
 int virtual_cpu;
 
+#endif
 TAILQ_HEAD(, thread) zombie_threads = TAILQ_HEAD_INITIALIZER(zombie_threads);
-TAILQ_HEAD(, ksegrp) zombie_ksegrps = TAILQ_HEAD_INITIALIZER(zombie_ksegrps);
-struct mtx kse_zombie_lock;
-MTX_SYSINIT(kse_zombie_lock, &kse_zombie_lock, "kse zombie lock", MTX_SPIN);
+static struct mtx zombie_lock;
+MTX_SYSINIT(zombie_lock, &zombie_lock, "zombie lock", MTX_SPIN);
+
+static void thread_zombie(struct thread *);
 
+#ifdef KSE
 static int
 sysctl_kse_virtual_cpu(SYSCTL_HANDLER_ARGS)
 {
@@ -103,6 +101,7 @@
 SYSCTL_PROC(_kern_threads, OID_AUTO, virtual_cpu, CTLTYPE_INT|CTLFLAG_RW,
 	0, sizeof(virtual_cpu), sysctl_kse_virtual_cpu, "I",
 	"debug virtual cpus");
+#endif
 
 struct mtx tid_lock;
 static struct unrhdr *tid_unrhdr;
@@ -120,20 +119,19 @@
 	td->td_oncpu = NOCPU;
 
 	td->td_tid = alloc_unr(tid_unrhdr);
+	td->td_syscalls = 0;
 
 	/*
 	 * Note that td_critnest begins life as 1 because the thread is not
 	 * running and is thereby implicitly waiting to be on the receiving
-	 * end of a context switch.  A context switch must occur inside a
-	 * critical section, and in fact, includes hand-off of the sched_lock.
-	 * After a context switch to a newly created thread, it will release
-	 * sched_lock for the first time, and its td_critnest will hit 0 for
-	 * the first time.  This happens on the far end of a context switch,
-	 * and when it context switches away from itself, it will in fact go
-	 * back into a critical section, and hand off the sched lock to the
-	 * next thread.
+	 * end of a context switch.
 	 */
 	td->td_critnest = 1;
+	EVENTHANDLER_INVOKE(thread_ctor, td);
+#ifdef AUDIT
+	audit_thread_alloc(td);
+#endif
+	umtx_thread_alloc(td);
 	return (0);
 }
 
@@ -167,7 +165,10 @@
 		/* NOTREACHED */
 	}
 #endif
-
+#ifdef AUDIT
+	audit_thread_free(td);
+#endif
+	EVENTHANDLER_INVOKE(thread_dtor, td);
 	free_unr(tid_unrhdr, td->td_tid);
 	sched_newthread(td);
 }
@@ -182,13 +183,13 @@
 
 	td = (struct thread *)mem;
 
-	vm_thread_new(td, 0);
-	cpu_thread_setup(td);
 	td->td_sleepqueue = sleepq_alloc();
 	td->td_turnstile = turnstile_alloc();
-	td->td_umtxq = umtxq_alloc();
+	EVENTHANDLER_INVOKE(thread_init, td);
 	td->td_sched = (struct td_sched *)&td[1];
 	sched_newthread(td);
+	umtx_thread_init(td);
+	td->td_kstack = 0;
 	return (0);
 }
 
@@ -201,66 +202,10 @@
 	struct thread *td;
 
 	td = (struct thread *)mem;
+	EVENTHANDLER_INVOKE(thread_fini, td);
 	turnstile_free(td->td_turnstile);
 	sleepq_free(td->td_sleepqueue);
-	umtxq_free(td->td_umtxq);
-	vm_thread_dispose(td);
-}
-
-/*
- * Initialize type-stable parts of a ksegrp (when newly created).
- */
-static int
-ksegrp_ctor(void *mem, int size, void *arg, int flags)
-{
-	struct ksegrp	*kg;
-
-	kg = (struct ksegrp *)mem;
-	bzero(mem, size);
-	kg->kg_sched = (struct kg_sched *)&kg[1];
-	return (0);
-}
-
-void
-ksegrp_link(struct ksegrp *kg, struct proc *p)
-{
-
-	TAILQ_INIT(&kg->kg_threads);
-	TAILQ_INIT(&kg->kg_runq);	/* links with td_runq */
-	TAILQ_INIT(&kg->kg_upcalls);	/* all upcall structure in ksegrp */
-	kg->kg_proc = p;
-	/*
-	 * the following counters are in the -zero- section
-	 * and may not need clearing
-	 */
-	kg->kg_numthreads = 0;
-	kg->kg_numupcalls = 0;
-	/* link it in now that it's consistent */
-	p->p_numksegrps++;
-	TAILQ_INSERT_HEAD(&p->p_ksegrps, kg, kg_ksegrp);
-}
-
-/*
- * Called from:
- *   thread-exit()
- */
-void
-ksegrp_unlink(struct ksegrp *kg)
-{
-	struct proc *p;
-
-	mtx_assert(&sched_lock, MA_OWNED);
-	KASSERT((kg->kg_numthreads == 0), ("ksegrp_unlink: residual threads"));
-	KASSERT((kg->kg_numupcalls == 0), ("ksegrp_unlink: residual upcalls"));
-
-	p = kg->kg_proc;
-	TAILQ_REMOVE(&p->p_ksegrps, kg, kg_ksegrp);
-	p->p_numksegrps--;
-	/*
-	 * Aggregate stats from the KSE
-	 */
-	if (p->p_procscopegrp == kg)
-		p->p_procscopegrp = NULL;
+	umtx_thread_fini(td);
 }
 
 /*
@@ -272,17 +217,28 @@
  * proc_init()
  */
 void
-proc_linkup(struct proc *p, struct ksegrp *kg, struct thread *td)
+proc_linkup0(struct proc *p, struct thread *td)
 {
-
-	TAILQ_INIT(&p->p_ksegrps);	     /* all ksegrps in proc */
 	TAILQ_INIT(&p->p_threads);	     /* all threads in proc */
-	TAILQ_INIT(&p->p_suspended);	     /* Threads suspended */
-	p->p_numksegrps = 0;
-	p->p_numthreads = 0;
+	proc_linkup(p, td);
+}
 
-	ksegrp_link(kg, p);
-	thread_link(td, kg);
+void
+proc_linkup(struct proc *p, struct thread *td)
+{
+
+#ifdef KSE
+	TAILQ_INIT(&p->p_upcalls);	     /* upcall list */
+#endif
+	sigqueue_init(&p->p_sigqueue, p);
+	p->p_ksi = ksiginfo_alloc(1);
+	if (p->p_ksi != NULL) {
+		/* XXX p_ksi may be null if ksiginfo zone is not ready */
+		p->p_ksi->ksi_flags = KSI_EXT | KSI_INS;
+	}
+	LIST_INIT(&p->p_mqnotifier);
+	p->p_numthreads = 0;
+	thread_link(td, p);
 }
 
 /*
@@ -297,33 +253,32 @@
 
 	thread_zone = uma_zcreate("THREAD", sched_sizeof_thread(),
 	    thread_ctor, thread_dtor, thread_init, thread_fini,
-	    UMA_ALIGN_CACHE, 0);
-	ksegrp_zone = uma_zcreate("KSEGRP", sched_sizeof_ksegrp(),
-	    ksegrp_ctor, NULL, NULL, NULL,
-	    UMA_ALIGN_CACHE, 0);
+	    16 - 1, 0);
+#ifdef KSE
 	kseinit();	/* set up kse specific stuff  e.g. upcall zone*/
+#endif
 }
 
 /*
- * Stash an embarasingly extra thread into the zombie thread queue.
+ * Place an unused thread on the zombie list.
+ * Use the slpq as that must be unused by now.
  */
 void
-thread_stash(struct thread *td)
+thread_zombie(struct thread *td)
 {
-	mtx_lock_spin(&kse_zombie_lock);
-	TAILQ_INSERT_HEAD(&zombie_threads, td, td_runq);
-	mtx_unlock_spin(&kse_zombie_lock);
+	mtx_lock_spin(&zombie_lock);
+	TAILQ_INSERT_HEAD(&zombie_threads, td, td_slpq);
+	mtx_unlock_spin(&zombie_lock);
 }
 
 /*
- * Stash an embarasingly extra ksegrp into the zombie ksegrp queue.
+ * Release a thread that has exited after cpu_throw().
  */
 void
-ksegrp_stash(struct ksegrp *kg)
+thread_stash(struct thread *td)
 {
-	mtx_lock_spin(&kse_zombie_lock);
-	TAILQ_INSERT_HEAD(&zombie_ksegrps, kg, kg_ksegrp);
-	mtx_unlock_spin(&kse_zombie_lock);
+	atomic_subtract_rel_int(&td->td_proc->p_exitthreads, 1);
+	thread_zombie(td);
 }
 
 /*
@@ -333,49 +288,28 @@
 thread_reap(void)
 {
 	struct thread *td_first, *td_next;
-	struct ksegrp *kg_first, * kg_next;
 
 	/*
 	 * Don't even bother to lock if none at this instant,
 	 * we really don't care about the next instant..
 	 */
-	if ((!TAILQ_EMPTY(&zombie_threads))
-	    || (!TAILQ_EMPTY(&zombie_ksegrps))) {
-		mtx_lock_spin(&kse_zombie_lock);
+	if (!TAILQ_EMPTY(&zombie_threads)) {
+		mtx_lock_spin(&zombie_lock);
 		td_first = TAILQ_FIRST(&zombie_threads);
-		kg_first = TAILQ_FIRST(&zombie_ksegrps);
 		if (td_first)
 			TAILQ_INIT(&zombie_threads);
-		if (kg_first)
-			TAILQ_INIT(&zombie_ksegrps);
-		mtx_unlock_spin(&kse_zombie_lock);
+		mtx_unlock_spin(&zombie_lock);
 		while (td_first) {
-			td_next = TAILQ_NEXT(td_first, td_runq);
+			td_next = TAILQ_NEXT(td_first, td_slpq);
 			if (td_first->td_ucred)
 				crfree(td_first->td_ucred);
 			thread_free(td_first);
 			td_first = td_next;
 		}
-		while (kg_first) {
-			kg_next = TAILQ_NEXT(kg_first, kg_ksegrp);
-			ksegrp_free(kg_first);
-			kg_first = kg_next;
-		}
-		/*
-		 * there will always be a thread on the list if one of these
-		 * is there.
-		 */
-		kse_GC();
 	}
-}
-
-/*
- * Allocate a ksegrp.
- */
-struct ksegrp *
-ksegrp_alloc(void)
-{
-	return (uma_zalloc(ksegrp_zone, M_WAITOK));
+#ifdef KSE
+	upcall_reap();
+#endif
 }
 
 /*
@@ -384,19 +318,21 @@
 struct thread *
 thread_alloc(void)
 {
+	struct thread *td;
+
 	thread_reap(); /* check if any zombies to get */
-	return (uma_zalloc(thread_zone, M_WAITOK));
-}
 
-/*
- * Deallocate a ksegrp.
- */
-void
-ksegrp_free(struct ksegrp *td)
-{
-	uma_zfree(ksegrp_zone, td);
+	td = (struct thread *)uma_zalloc(thread_zone, M_WAITOK);
+	KASSERT(td->td_kstack == 0, ("thread_alloc got thread with kstack"));
+	if (!vm_thread_new(td, 0)) {
+		uma_zfree(thread_zone, td);
+		return (NULL);
+	}
+	cpu_thread_setup(td);
+	return (td);
 }
 
+
 /*
  * Deallocate a thread.
  */
@@ -405,6 +341,10 @@
 {
 
 	cpu_thread_clean(td);
+	if (td->td_altkstack != 0)
+		vm_thread_dispose_altkstack(td);
+	if (td->td_kstack != 0)
+		vm_thread_dispose(td);
 	uma_zfree(thread_zone, td);
 }
 
@@ -433,38 +373,48 @@
  * exit1()
  * kse_exit()
  * thr_exit()
+ * ifdef KSE
  * thread_user_enter()
  * thread_userret()
+ * endif
  * thread_suspend_check()
  */
 void
 thread_exit(void)
 {
-	struct bintime new_switchtime;
+	uint64_t new_switchtime;
 	struct thread *td;
+	struct thread *td2;
 	struct proc *p;
-	struct ksegrp	*kg;
 
 	td = curthread;
-	kg = td->td_ksegrp;
 	p = td->td_proc;
 
-	mtx_assert(&sched_lock, MA_OWNED);
+	PROC_SLOCK_ASSERT(p, MA_OWNED);
 	mtx_assert(&Giant, MA_NOTOWNED);
+
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	KASSERT(p != NULL, ("thread exiting without a process"));
-	KASSERT(kg != NULL, ("thread exiting without a kse group"));
 	CTR3(KTR_PROC, "thread_exit: thread %p (pid %ld, %s)", td,
 	    (long)p->p_pid, p->p_comm);
+	KASSERT(TAILQ_EMPTY(&td->td_sigqueue.sq_list), ("signal pending"));
 
+#ifdef AUDIT
+	AUDIT_SYSCALL_EXIT(0, td);
+#endif
+
+#ifdef KSE
 	if (td->td_standin != NULL) {
 		/*
 		 * Note that we don't need to free the cred here as it
 		 * is done in thread_reap().
 		 */
-		thread_stash(td->td_standin);
+		thread_zombie(td->td_standin);
 		td->td_standin = NULL;
 	}
+#endif
+
+	umtx_thread_exit(td);
 
 	/*
 	 * drop FPU & debug register state storage, or any other
@@ -473,24 +423,15 @@
 	 */
 	cpu_thread_exit(td);	/* XXXSMP */
 
-	/*
-	 * The thread is exiting. scheduler can release its stuff
-	 * and collect stats etc.
-	 */
-	sched_thread_exit(td);
-	
 	/* Do the same timestamp bookkeeping that mi_switch() would do. */
-	binuptime(&new_switchtime);
-	bintime_add(&p->p_rux.rux_runtime, &new_switchtime);
-	bintime_sub(&p->p_rux.rux_runtime, PCPU_PTR(switchtime));
+	new_switchtime = cpu_ticks();
+	p->p_rux.rux_runtime += (new_switchtime - PCPU_GET(switchtime));
 	PCPU_SET(switchtime, new_switchtime);
 	PCPU_SET(switchticks, ticks);
-	cnt.v_swtch++;
-
-	/* Add our usage into the usage of all our children. */
-	if (p->p_numthreads == 1)
-		ruadd(p->p_ru, &p->p_rux, &p->p_stats->p_cru, &p->p_crux);
-
+	PCPU_INC(cnt.v_swtch);
+	/* Save our resource usage in our process. */
+	td->td_ru.ru_nvcsw++;
+	rucollect(&p->p_ru, &td->td_ru);
 	/*
 	 * The last thread is left attached to the process
 	 * So that the whole bundle gets recycled. Skip
@@ -501,10 +442,15 @@
 	 */
 	if (p->p_flag & P_HADTHREADS) {
 		if (p->p_numthreads > 1) {
+			thread_lock(td);
+#ifdef KSE
+			kse_unlink(td);
+#else
 			thread_unlink(td);
-
-			/* XXX first arg not used in 4BSD or ULE */
-			sched_exit_thread(FIRST_THREAD_IN_PROC(p), td);
+#endif
+			thread_unlock(td);
+			td2 = FIRST_THREAD_IN_PROC(p);
+			sched_exit_thread(td2, td);
 
 			/*
 			 * The test below is NOT true if we are the
@@ -513,50 +459,13 @@
 			 */
 			if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
 				if (p->p_numthreads == p->p_suspcount) {
+					thread_lock(p->p_singlethread);
 					thread_unsuspend_one(p->p_singlethread);
+					thread_unlock(p->p_singlethread);
 				}
 			}
 
-			/*
-			 * Because each upcall structure has an owner thread,
-			 * owner thread exits only when process is in exiting
-			 * state, so upcall to userland is no longer needed,
-			 * deleting upcall structure is safe here.
-			 * So when all threads in a group is exited, all upcalls
-			 * in the group should be automatically freed.
-			 *  XXXKSE This is a KSE thing and should be exported
-			 * there somehow.
-			 */
-			upcall_remove(td);
-
-			/*
-			 * If the thread we unlinked above was the last one,
-			 * then this ksegrp should go away too.
-			 */
-			if (kg->kg_numthreads == 0) {
-				/*
-				 * let the scheduler know about this in case
-				 * it needs to recover stats or resources.
-				 * Theoretically we could let
-				 * sched_exit_ksegrp()  do the equivalent of
-				 * setting the concurrency to 0
-				 * but don't do it yet to avoid changing
-				 * the existing scheduler code until we
-				 * are ready.
-				 * We supply a random other ksegrp
-				 * as the recipient of any built up
-				 * cpu usage etc. (If the scheduler wants it).
-				 * XXXKSE
-				 * This is probably not fair so think of
- 				 * a better answer.
-				 */
-				sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), td);
-				sched_set_concurrency(kg, 0); /* XXX TEMP */
-				ksegrp_unlink(kg);
-				ksegrp_stash(kg);
-			}
-			PROC_UNLOCK(p);
-			td->td_ksegrp	= NULL;
+			atomic_add_int(&td->td_proc->p_exitthreads, 1);
 			PCPU_SET(deadthread, td);
 		} else {
 			/*
@@ -566,23 +475,23 @@
  			 * exit1() - clears threading flags before coming here
  			 * kse_exit() - treats last thread specially
  			 * thr_exit() - treats last thread specially
+			 * ifdef KSE
  			 * thread_user_enter() - only if more exist
  			 * thread_userret() - only if more exist
+			 * endif
  			 * thread_suspend_check() - only if more exist
 			 */
 			panic ("thread_exit: Last thread exiting on its own");
 		}
-	} else {
-		/*
-		 * non threaded process comes here.
-		 * This includes an EX threaded process that is coming
-		 * here via exit1(). (exit1 dethreads the proc first).
-		 */
-		PROC_UNLOCK(p);
-	}
+	} 
+	PROC_UNLOCK(p);
+	thread_lock(td);
+	/* Save our tick information with both the thread and proc locked */
+	ruxagg(&p->p_rux, td);
+	PROC_SUNLOCK(p);
 	td->td_state = TDS_INACTIVE;
 	CTR1(KTR_PROC, "thread_exit: cpu_throw() thread %p", td);
-	cpu_throw(td, choosethread());
+	sched_throw(td);
 	panic("I'm a teapot!");
 	/* NOTREACHED */
 }
@@ -598,19 +507,25 @@
 
 	mtx_assert(&Giant, MA_NOTOWNED);
 	KASSERT((p->p_numthreads == 1), ("Multiple threads in wait1()"));
-	KASSERT((p->p_numksegrps == 1), ("Multiple ksegrps in wait1()"));
-	FOREACH_THREAD_IN_PROC(p, td) {
-		if (td->td_standin != NULL) {
-			if (td->td_standin->td_ucred != NULL) {
-				crfree(td->td_standin->td_ucred);
-				td->td_standin->td_ucred = NULL;
-			}
-			thread_free(td->td_standin);
-			td->td_standin = NULL;
+	td = FIRST_THREAD_IN_PROC(p);
+#ifdef KSE
+	if (td->td_standin != NULL) {
+		if (td->td_standin->td_ucred != NULL) {
+			crfree(td->td_standin->td_ucred);
+			td->td_standin->td_ucred = NULL;
 		}
-		cpu_thread_clean(td);
-		crfree(td->td_ucred);
+		thread_free(td->td_standin);
+		td->td_standin = NULL;
 	}
+#endif
+	/* Lock the last thread so we spin until it exits cpu_throw(). */
+	thread_lock(td);
+	thread_unlock(td);
+	/* Wait for any remaining threads to exit cpu_throw(). */
+	while (p->p_exitthreads)
+		sched_relinquish(curthread);
+	cpu_thread_clean(td);
+	crfree(td->td_ucred);
 	thread_reap();	/* check for zombie threads etc. */
 }
 
@@ -627,23 +542,23 @@
  *  thr_create()
  */
 void
-thread_link(struct thread *td, struct ksegrp *kg)
+thread_link(struct thread *td, struct proc *p)
 {
-	struct proc *p;
 
-	p = kg->kg_proc;
+	/*
+	 * XXX This can't be enabled because it's called for proc0 before
+	 * its spinlock has been created.

+	 * PROC_SLOCK_ASSERT(p, MA_OWNED);
+	 */
 	td->td_state    = TDS_INACTIVE;
 	td->td_proc     = p;
-	td->td_ksegrp   = kg;
-	td->td_flags    = 0;
-	td->td_kflags	= 0;
+	td->td_flags    = TDF_INMEM;
 
 	LIST_INIT(&td->td_contested);
+	sigqueue_init(&td->td_sigqueue, p);
 	callout_init(&td->td_slpcallout, CALLOUT_MPSAFE);
 	TAILQ_INSERT_HEAD(&p->p_threads, td, td_plist);
-	TAILQ_INSERT_HEAD(&kg->kg_threads, td, td_kglist);
 	p->p_numthreads++;
-	kg->kg_numthreads++;
 }
 
 /*
@@ -658,15 +573,20 @@
 	struct proc *p = td->td_proc;
 
 	KASSERT((p->p_numthreads == 1), ("Unthreading with >1 threads"));
+#ifdef KSE
+	thread_lock(td);
 	upcall_remove(td);
+	thread_unlock(td);
 	p->p_flag &= ~(P_SA|P_HADTHREADS);
 	td->td_mailbox = NULL;
 	td->td_pflags &= ~(TDP_SA | TDP_CAN_UNBIND);
 	if (td->td_standin != NULL) {
-		thread_stash(td->td_standin);
+		thread_zombie(td->td_standin);
 		td->td_standin = NULL;
 	}
-	sched_set_concurrency(td->td_ksegrp, 1);
+#else
+	p->p_flag &= ~P_HADTHREADS;
+#endif
 }
 
 /*
@@ -677,15 +597,12 @@
 thread_unlink(struct thread *td)
 {
 	struct proc *p = td->td_proc;
-	struct ksegrp *kg = td->td_ksegrp;
 
-	mtx_assert(&sched_lock, MA_OWNED);
+	PROC_SLOCK_ASSERT(p, MA_OWNED);
 	TAILQ_REMOVE(&p->p_threads, td, td_plist);
 	p->p_numthreads--;
-	TAILQ_REMOVE(&kg->kg_threads, td, td_kglist);
-	kg->kg_numthreads--;
 	/* could clear a few other things here */
-	/* Must  NOT clear links to proc and ksegrp! */
+	/* Must  NOT clear links to proc! */
 }
 
 /*
@@ -698,7 +615,7 @@
  * There are no threads in user mode. Threads in the kernel must be
  * allowed to continue until they get to the user boundary. They may even
  * copy out their return values and data before suspending. They may however be
- * accellerated in reaching the user boundary as we will wake up
+ * accelerated in reaching the user boundary as we will wake up
  * any sleeping threads that are interruptable. (PCATCH).
  */
 int
@@ -733,7 +650,7 @@
 			p->p_flag &= ~P_SINGLE_BOUNDARY;
 	}
 	p->p_flag |= P_STOPPED_SINGLE;
-	mtx_lock_spin(&sched_lock);
+	PROC_SLOCK(p);
 	p->p_singlethread = td;
 	if (mode == SINGLE_EXIT)
 		remaining = p->p_numthreads;
@@ -747,6 +664,7 @@
 		FOREACH_THREAD_IN_PROC(p, td2) {
 			if (td2 == td)
 				continue;
+			thread_lock(td2);
 			td2->td_flags |= TDF_ASTPENDING;
 			if (TD_IS_INHIBITED(td2)) {
 				switch (mode) {
@@ -768,10 +686,12 @@
 						sleepq_abort(td2, ERESTART);
 					break;
 				default:	
-					if (TD_IS_SUSPENDED(td2))
+					if (TD_IS_SUSPENDED(td2)) {
+						thread_unlock(td2);
 						continue;
+					}
 					/*
-					 * maybe other inhibitted states too?
+					 * maybe other inhibited states too?
 					 */
 					if ((td2->td_flags & TDF_SINTR) &&
 					    (td2->td_inhibitors &
@@ -785,6 +705,7 @@
 				forward_signal(td2);
 			}
 #endif
+			thread_unlock(td2);
 		}
 		if (mode == SINGLE_EXIT)
 			remaining = p->p_numthreads;
@@ -804,13 +725,7 @@
 		 * Wake us up when everyone else has suspended.
 		 * In the mean time we suspend as well.
 		 */
-		thread_stopped(p);
-		thread_suspend_one(td);
-		PROC_UNLOCK(p);
-		mi_switch(SW_VOL, NULL);
-		mtx_unlock_spin(&sched_lock);
-		PROC_LOCK(p);
-		mtx_lock_spin(&sched_lock);
+		thread_suspend_switch(td);
 		if (mode == SINGLE_EXIT)
 			remaining = p->p_numthreads;
 		else if (mode == SINGLE_BOUNDARY)
@@ -829,7 +744,7 @@
 		p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT);
 		thread_unthread(td);
 	}
-	mtx_unlock_spin(&sched_lock);
+	PROC_SUNLOCK(p);
 	return (0);
 }
 
@@ -898,7 +813,11 @@
 		    (p->p_flag & P_SINGLE_BOUNDARY) && return_instead)
 			return (ERESTART);
 
-		mtx_lock_spin(&sched_lock);
+		/* If thread will exit, flush its pending signals */
+		if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td))
+			sigqueue_flush(&td->td_sigqueue);
+
+		PROC_SLOCK(p);
 		thread_stopped(p);
 		/*
 		 * If the process is waiting for us to exit,
@@ -907,44 +826,75 @@
 		 */
 		if ((p->p_flag & P_SINGLE_EXIT) && (p->p_singlethread != td))
 			thread_exit();
-
+		if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
+			if (p->p_numthreads == p->p_suspcount + 1) {
+				thread_lock(p->p_singlethread);
+				thread_unsuspend_one(p->p_singlethread);
+				thread_unlock(p->p_singlethread);
+			}
+		}
+		PROC_UNLOCK(p);
+		thread_lock(td);
 		/*
 		 * When a thread suspends, it just
-		 * moves to the processes's suspend queue
-		 * and stays there.
+		 * gets taken off all queues.
 		 */
 		thread_suspend_one(td);
 		if (return_instead == 0) {
 			p->p_boundary_count++;
 			td->td_flags |= TDF_BOUNDARY;
 		}
-		if (P_SHOULDSTOP(p) == P_STOPPED_SINGLE) {
-			if (p->p_numthreads == p->p_suspcount) 
-				thread_unsuspend_one(p->p_singlethread);
-		}
-		PROC_UNLOCK(p);
+		PROC_SUNLOCK(p);
 		mi_switch(SW_INVOL, NULL);
-		if (return_instead == 0) {
-			p->p_boundary_count--;
+		if (return_instead == 0)
 			td->td_flags &= ~TDF_BOUNDARY;
-		}
-		mtx_unlock_spin(&sched_lock);
+		thread_unlock(td);
 		PROC_LOCK(p);
+		if (return_instead == 0)
+			p->p_boundary_count--;
 	}
 	return (0);
 }
 
 void
+thread_suspend_switch(struct thread *td)
+{
+	struct proc *p;
+
+	p = td->td_proc;
+	KASSERT(!TD_IS_SUSPENDED(td), ("already suspended"));
+	PROC_LOCK_ASSERT(p, MA_OWNED);
+	PROC_SLOCK_ASSERT(p, MA_OWNED);
+	/*
+	 * We implement thread_suspend_one in stages here to avoid
+	 * dropping the proc lock while the thread lock is owned.
+	 */
+	thread_stopped(p);
+	p->p_suspcount++;
+	PROC_UNLOCK(p);
+	thread_lock(td);
+	sched_sleep(td);
+	TD_SET_SUSPENDED(td);
+	PROC_SUNLOCK(p);
+	DROP_GIANT();
+	mi_switch(SW_VOL, NULL);
+	thread_unlock(td);
+	PICKUP_GIANT();
+	PROC_LOCK(p);
+	PROC_SLOCK(p);
+}
+
+void
 thread_suspend_one(struct thread *td)
 {
 	struct proc *p = td->td_proc;
 
-	mtx_assert(&sched_lock, MA_OWNED);
-	PROC_LOCK_ASSERT(p, MA_OWNED);
+	PROC_SLOCK_ASSERT(p, MA_OWNED);
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	KASSERT(!TD_IS_SUSPENDED(td), ("already suspended"));
 	p->p_suspcount++;
+	sched_sleep(td);
 	TD_SET_SUSPENDED(td);
-	TAILQ_INSERT_TAIL(&p->p_suspended, td, td_runq);
 }
 
 void
@@ -952,9 +902,9 @@
 {
 	struct proc *p = td->td_proc;
 
-	mtx_assert(&sched_lock, MA_OWNED);
-	PROC_LOCK_ASSERT(p, MA_OWNED);
-	TAILQ_REMOVE(&p->p_suspended, td, td_runq);
+	PROC_SLOCK_ASSERT(p, MA_OWNED);
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
+	KASSERT(TD_IS_SUSPENDED(td), ("Thread not suspended"));
 	TD_CLR_SUSPENDED(td);
 	p->p_suspcount--;
 	setrunnable(td);
@@ -968,11 +918,15 @@
 {
 	struct thread *td;
 
-	mtx_assert(&sched_lock, MA_OWNED);
 	PROC_LOCK_ASSERT(p, MA_OWNED);
+	PROC_SLOCK_ASSERT(p, MA_OWNED);
 	if (!P_SHOULDSTOP(p)) {
-		while ((td = TAILQ_FIRST(&p->p_suspended))) {
-			thread_unsuspend_one(td);
+                FOREACH_THREAD_IN_PROC(p, td) {
+			thread_lock(td);
+			if (TD_IS_SUSPENDED(td)) {
+				thread_unsuspend_one(td);
+			}
+			thread_unlock(td);
 		}
 	} else if ((P_SHOULDSTOP(p) == P_STOPPED_SINGLE) &&
 	    (p->p_numthreads == p->p_suspcount)) {
@@ -981,7 +935,9 @@
 		 * threading request. Now we've downgraded to single-threaded,
 		 * let it continue.
 		 */
+		thread_lock(p->p_singlethread);
 		thread_unsuspend_one(p->p_singlethread);
+		thread_unlock(p->p_singlethread);
 	}
 }
 
@@ -998,9 +954,8 @@
 	p = td->td_proc;
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	p->p_flag &= ~(P_STOPPED_SINGLE | P_SINGLE_EXIT | P_SINGLE_BOUNDARY);
-	mtx_lock_spin(&sched_lock);
+	PROC_SLOCK(p);
 	p->p_singlethread = NULL;
-	p->p_procscopegrp = NULL;
 	/*
 	 * If there are other threads they mey now run,
 	 * unless of course there is a blanket 'stop order'
@@ -1008,33 +963,28 @@
 	 * to continue however as this is a bad place to stop.
 	 */
 	if ((p->p_numthreads != 1) && (!P_SHOULDSTOP(p))) {
-		while ((td = TAILQ_FIRST(&p->p_suspended))) {
-			thread_unsuspend_one(td);
+                FOREACH_THREAD_IN_PROC(p, td) {
+			thread_lock(td);
+			if (TD_IS_SUSPENDED(td)) {
+				thread_unsuspend_one(td);
+			}
+			thread_unlock(td);
 		}
 	}
-	mtx_unlock_spin(&sched_lock);
+	PROC_SUNLOCK(p);
 }
 
-/*
- * Called before going into an interruptible sleep to see if we have been
- * interrupted or requested to exit.
- */
-int
-thread_sleep_check(struct thread *td)
+struct thread *
+thread_find(struct proc *p, lwpid_t tid)
 {
-	struct proc *p;
+	struct thread *td;
 
-	p = td->td_proc;
-	mtx_assert(&sched_lock, MA_OWNED);
-	if (p->p_flag & P_HADTHREADS) {
-		if (p->p_singlethread != td) {
-			if (p->p_flag & P_SINGLE_EXIT)
-				return (EINTR);
-			if (p->p_flag & P_SINGLE_BOUNDARY)
-				return (ERESTART);
-		}
-		if (td->td_flags & TDF_INTERRUPT)
-			return (td->td_intrval);
+	PROC_LOCK_ASSERT(p, MA_OWNED);
+	PROC_SLOCK(p);
+	FOREACH_THREAD_IN_PROC(p, td) {
+		if (td->td_tid == tid)
+			break;
 	}
-	return (0);
+	PROC_SUNLOCK(p);
+	return (td);
 }
Index: kern_module.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_module.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/kern_module.c -L sys/kern/kern_module.c -u -r1.1.1.1 -r1.2
--- sys/kern/kern_module.c
+++ sys/kern/kern_module.c
@@ -27,7 +27,7 @@
 #include "opt_compat.h"
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_module.c,v 1.48 2005/02/18 22:14:40 ps Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_module.c,v 1.52.2.1 2007/12/19 20:37:53 jhb Exp $");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
@@ -99,10 +99,12 @@
 
 	if (arg2 & RB_NOSYNC)
 		return;
+	mtx_lock(&Giant);
 	MOD_SLOCK;
 	TAILQ_FOREACH(mod, &modules, link)
 		MOD_EVENT(mod, MOD_SHUTDOWN);
 	MOD_SUNLOCK;
+	mtx_unlock(&Giant);
 }
 
 void
@@ -112,6 +114,7 @@
 	int error;
 	module_t mod;
 
+	mtx_lock(&Giant);
 	MOD_SLOCK;
 	mod = module_lookupbyname(data->name);
 	if (mod == NULL)
@@ -128,6 +131,7 @@
 		    " %d\n", data->name, (void *)data->evhand, data->priv,
 		    error); 
 	}
+	mtx_unlock(&Giant);
 }
 
 int
@@ -136,20 +140,20 @@
 	size_t namelen;
 	module_t newmod;
 
-	MOD_SLOCK;
+	MOD_XLOCK;
 	newmod = module_lookupbyname(data->name);
 	if (newmod != NULL) {
-		MOD_SUNLOCK;
+		MOD_XUNLOCK;
 		printf("module_register: module %s already exists!\n",
 		    data->name);
 		return (EEXIST);
 	}
-	MOD_SUNLOCK;
 	namelen = strlen(data->name) + 1;
 	newmod = malloc(sizeof(struct module) + namelen, M_MODULE, M_WAITOK);
-	if (newmod == NULL)
+	if (newmod == NULL) {
+		MOD_XUNLOCK;
 		return (ENOMEM);
-	MOD_XLOCK;
+	}
 	newmod->refs = 1;
 	newmod->id = nextid++;
 	newmod->name = (char *)(newmod + 1);
@@ -232,12 +236,14 @@
 {
 	int error;
 
+	mtx_lock(&Giant);
 	error = MOD_EVENT(mod, MOD_QUIESCE);
 	if (error == EOPNOTSUPP || error == EINVAL)
 		error = 0;
-	if (flags == LINKER_UNLOAD_NORMAL && error != 0)
-		return (error);
-        return (MOD_EVENT(mod, MOD_UNLOAD));
+	if (error == 0 || flags == LINKER_UNLOAD_FORCE)
+		error = MOD_EVENT(mod, MOD_UNLOAD);
+	mtx_unlock(&Giant);
+	return (error);
 }
 
 int
@@ -264,12 +270,16 @@
 	mod->data = *datap;
 }
 
+linker_file_t
+module_file(module_t mod)
+{
+
+	return (mod->file);
+}
+
 /*
  * Syscalls.
  */
-/*
- * MPSAFE
- */
 int
 modnext(struct thread *td, struct modnext_args *uap)
 {
@@ -301,9 +311,6 @@
 	return (error);
 }
 
-/*
- * MPSAFE
- */
 int
 modfnext(struct thread *td, struct modfnext_args *uap)
 {
@@ -334,9 +341,6 @@
 	int	id;
 };
 
-/*
- * MPSAFE
- */
 int
 modstat(struct thread *td, struct modstat_args *uap)
 {
@@ -390,9 +394,6 @@
 	return (error);
 }
 
-/*
- * MPSAFE
- */
 int
 modfind(struct thread *td, struct modfind_args *uap)
 {
@@ -415,6 +416,7 @@
 
 #ifdef COMPAT_IA32
 #include <sys/mount.h>
+#include <sys/socket.h>
 #include <compat/freebsd32/freebsd32_util.h>
 #include <compat/freebsd32/freebsd32.h>
 #include <compat/freebsd32/freebsd32_proto.h>
@@ -434,9 +436,6 @@
 	modspecific32_t	data;
 };
 
-/*
- * MPSAFE
- */
 int
 freebsd32_modstat(struct thread *td, struct freebsd32_modstat_args *uap)
 {
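
The kern_module.c hunks above wrap the module event handlers in Giant, hold MOD_XLOCK across the whole of module_register() to close the lookup/insert race, and rework module_unload() so MOD_UNLOAD is only sent when MOD_QUIESCE succeeds or the unload is forced. The list these routines maintain is what userland walks via modnext(2)/modstat(2); a minimal, illustrative walker (standard FreeBSD module syscalls, not part of this diff):

#include <sys/param.h>
#include <sys/module.h>
#include <stdio.h>

int
main(void)
{
	struct module_stat ms;
	int modid;

	ms.version = sizeof(ms);	/* modstat() rejects unknown versions */
	for (modid = modnext(0); modid > 0; modid = modnext(modid)) {
		if (modstat(modid, &ms) != 0)
			continue;
		printf("%3d %s (refs %d)\n", ms.id, ms.name, ms.refs);
	}
	return (0);
}
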
Index: subr_kdb.c
===================================================================
RCS file: /home/cvs/src/sys/kern/subr_kdb.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/subr_kdb.c -L sys/kern/subr_kdb.c -u -r1.1.1.1 -r1.2
--- sys/kern/subr_kdb.c
+++ sys/kern/subr_kdb.c
@@ -25,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/subr_kdb.c,v 1.12.2.1 2005/10/02 10:06:15 rwatson Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/subr_kdb.c,v 1.24 2007/09/17 05:27:20 jeff Exp $");
 
 #include "opt_kdb.h"
 
@@ -42,18 +42,10 @@
 #include <machine/kdb.h>
 #include <machine/pcb.h>
 
-#ifdef KDB_STOP_NMI
+#ifdef SMP
 #include <machine/smp.h>
 #endif
 
-/* 
- * KDB_STOP_NMI requires SMP to pick up the right dependencies
- * (And isn't useful on UP anyway) 
- */
-#if defined(KDB_STOP_NMI) && !defined(SMP)
-#error "options KDB_STOP_NMI" requires "options SMP"
-#endif
-
 int kdb_active = 0;
 void *kdb_jmpbufp = NULL;
 struct kdb_dbbe *kdb_dbbe = NULL;
@@ -68,6 +60,9 @@
 static int kdb_sysctl_available(SYSCTL_HANDLER_ARGS);
 static int kdb_sysctl_current(SYSCTL_HANDLER_ARGS);
 static int kdb_sysctl_enter(SYSCTL_HANDLER_ARGS);
+static int kdb_sysctl_panic(SYSCTL_HANDLER_ARGS);
+static int kdb_sysctl_trap(SYSCTL_HANDLER_ARGS);
+static int kdb_sysctl_trap_code(SYSCTL_HANDLER_ARGS);
 
 SYSCTL_NODE(_debug, OID_AUTO, kdb, CTLFLAG_RW, NULL, "KDB nodes");
 
@@ -80,6 +75,15 @@
 SYSCTL_PROC(_debug_kdb, OID_AUTO, enter, CTLTYPE_INT | CTLFLAG_RW, 0, 0,
     kdb_sysctl_enter, "I", "set to enter the debugger");
 
+SYSCTL_PROC(_debug_kdb, OID_AUTO, panic, CTLTYPE_INT | CTLFLAG_RW, 0, 0,
+    kdb_sysctl_panic, "I", "set to panic the kernel");
+
+SYSCTL_PROC(_debug_kdb, OID_AUTO, trap, CTLTYPE_INT | CTLFLAG_RW, 0, 0,
+    kdb_sysctl_trap, "I", "set to cause a page fault via data access");
+
+SYSCTL_PROC(_debug_kdb, OID_AUTO, trap_code, CTLTYPE_INT | CTLFLAG_RW, 0, 0,
+    kdb_sysctl_trap_code, "I", "set to cause a page fault via code access");
+
 /*
  * Flag indicating whether or not to IPI the other CPUs to stop them on
  * entering the debugger.  Sometimes, this will result in a deadlock as
@@ -89,21 +93,8 @@
 #ifdef SMP
 static int kdb_stop_cpus = 1;
 SYSCTL_INT(_debug_kdb, OID_AUTO, stop_cpus, CTLTYPE_INT | CTLFLAG_RW,
-    &kdb_stop_cpus, 0, "");
+    &kdb_stop_cpus, 0, "stop other CPUs when entering the debugger");
 TUNABLE_INT("debug.kdb.stop_cpus", &kdb_stop_cpus);
-
-#ifdef KDB_STOP_NMI
-/* 
- * Provide an alternate method of stopping other CPUs. If another CPU has
- * disabled interrupts the conventional STOP IPI will be blocked. This 
- * NMI-based stop should get through in that case.
- */
-static int kdb_stop_cpus_with_nmi = 1;
-SYSCTL_INT(_debug_kdb, OID_AUTO, stop_cpus_with_nmi, CTLTYPE_INT | CTLFLAG_RW,
-    &kdb_stop_cpus_with_nmi, 0, "");
-TUNABLE_INT("debug.kdb.stop_cpus_with_nmi", &kdb_stop_cpus_with_nmi);
-#endif /* KDB_STOP_NMI */
-
 #endif
 
 static int
@@ -176,6 +167,55 @@
 	return (0);
 }
 
+static int
+kdb_sysctl_panic(SYSCTL_HANDLER_ARGS)
+{
+	int error, i;
+
+	error = sysctl_wire_old_buffer(req, sizeof(int));
+	if (error == 0) {
+		i = 0;
+		error = sysctl_handle_int(oidp, &i, 0, req);
+	}
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+	panic("kdb_sysctl_panic");
+	return (0);
+}
+
+static int
+kdb_sysctl_trap(SYSCTL_HANDLER_ARGS)
+{
+	int error, i;
+	int *addr = (int *)0x10;
+
+	error = sysctl_wire_old_buffer(req, sizeof(int));
+	if (error == 0) {
+		i = 0;
+		error = sysctl_handle_int(oidp, &i, 0, req);
+	}
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+	return (*addr);
+}
+
+static int
+kdb_sysctl_trap_code(SYSCTL_HANDLER_ARGS)
+{
+	int error, i;
+	void (*fp)(u_int, u_int, u_int) = (void *)0xdeadc0de;
+
+	error = sysctl_wire_old_buffer(req, sizeof(int));
+	if (error == 0) {
+		i = 0;
+		error = sysctl_handle_int(oidp, &i, 0, req);
+	}
+	if (error != 0 || req->newptr == NULL)
+		return (error);
+	(*fp)(0x11111111, 0x22222222, 0x33333333);
+	return (0);
+}
+
 /*
  * Solaris implements a new BREAK which is initiated by a character sequence
  * CR ~ ^b which is similar to a familiar pattern used on Sun servers by the
@@ -335,27 +375,22 @@
 
 struct pcb *
 kdb_thr_ctx(struct thread *thr)
-#ifdef KDB_STOP_NMI
 {  
-  u_int		cpuid;
-  struct pcpu *pc;
-  
-  if (thr == curthread) 
-    return &kdb_pcb;
-
-  SLIST_FOREACH(pc, &cpuhead, pc_allcpu)  {
-    cpuid = pc->pc_cpuid;
-    if (pc->pc_curthread == thr && (atomic_load_acq_int(&stopped_cpus) & (1 << cpuid)))
-      return &stoppcbs[cpuid];
-  }
-
-  return  thr->td_pcb;
-}
-#else
-{
-	return ((thr == curthread) ? &kdb_pcb : thr->td_pcb);
+#if defined(SMP) && defined(KDB_STOPPEDPCB)
+	struct pcpu *pc;
+#endif
+ 
+	if (thr == curthread) 
+		return (&kdb_pcb);
+
+#if defined(SMP) && defined(KDB_STOPPEDPCB)
+	SLIST_FOREACH(pc, &cpuhead, pc_allcpu)  {
+		if (pc->pc_curthread == thr && (stopped_cpus & pc->pc_cpumask))
+			return (KDB_STOPPEDPCB(pc));
+	}
+#endif
+	return (thr->td_pcb);
 }
-#endif /* KDB_STOP_NMI */
 
 struct thread *
 kdb_thr_first(void)
@@ -365,7 +400,7 @@
 
 	p = LIST_FIRST(&allproc);
 	while (p != NULL) {
-		if (p->p_sflag & PS_INMEM) {
+		if (p->p_flag & P_INMEM) {
 			thr = FIRST_THREAD_IN_PROC(p);
 			if (thr != NULL)
 				return (thr);
@@ -382,7 +417,7 @@
 
 	p = LIST_FIRST(&allproc);
 	while (p != NULL) {
-		if (p->p_sflag & PS_INMEM && p->p_pid == pid)
+		if (p->p_flag & P_INMEM && p->p_pid == pid)
 			return (FIRST_THREAD_IN_PROC(p));
 		p = LIST_NEXT(p, p_list);
 	}
@@ -411,7 +446,7 @@
 		if (thr != NULL)
 			return (thr);
 		p = LIST_NEXT(p, p_list);
-		if (p != NULL && (p->p_sflag & PS_INMEM))
+		if (p != NULL && (p->p_flag & P_INMEM))
 			thr = FIRST_THREAD_IN_PROC(p);
 	} while (p != NULL);
 	return (NULL);
@@ -434,6 +469,7 @@
 int
 kdb_trap(int type, int code, struct trapframe *tf)
 {
+	register_t intr;
 #ifdef SMP
 	int did_stop_cpus;
 #endif
@@ -446,22 +482,15 @@
 	if (kdb_active)
 		return (0);
 
-	critical_enter();
-
-	kdb_active++;
+	intr = intr_disable();
 
 #ifdef SMP
 	if ((did_stop_cpus = kdb_stop_cpus) != 0)
-	  {
-#ifdef KDB_STOP_NMI
-	    if(kdb_stop_cpus_with_nmi)
-	      stop_cpus_nmi(PCPU_GET(other_cpus));
-	    else
-#endif /* KDB_STOP_NMI */
 		stop_cpus(PCPU_GET(other_cpus));
-	  }
 #endif
 
+	kdb_active++;
+
 	kdb_frame = tf;
 
 	/* Let MD code do its thing first... */
@@ -472,14 +501,14 @@
 
 	handled = kdb_dbbe->dbbe_trap(type, code);
 
+	kdb_active--;
+
 #ifdef SMP
 	if (did_stop_cpus)
 		restart_cpus(stopped_cpus);
 #endif
 
-	kdb_active--;
-
-	critical_exit();
+	intr_restore(intr);
 
 	return (handled);
 }
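
Besides retiring KDB_STOP_NMI and switching kdb_trap() from a critical section to hard interrupt disabling, the subr_kdb.c update adds three fault-injection sysctls: debug.kdb.panic, debug.kdb.trap and debug.kdb.trap_code. Writing any value triggers the corresponding panic, data-access fault or code-access fault, which is handy for exercising crash and debugger-entry paths on a scratch machine. The shell equivalent is simply "sysctl debug.kdb.panic=1"; a C sketch of the same write (this will, of course, take the machine down, so test systems only):

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	int one = 1;

	/* Same effect as "sysctl debug.kdb.panic=1"; requires root. */
	if (sysctlbyname("debug.kdb.panic", NULL, NULL, &one, sizeof(one)) != 0) {
		perror("sysctlbyname");
		return (1);
	}
	return (0);		/* not reached if the write went through */
}
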
Index: kern_mib.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_mib.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/kern_mib.c -L sys/kern/kern_mib.c -u -r1.1.1.1 -r1.2
--- sys/kern/kern_mib.c
+++ sys/kern/kern_mib.c
@@ -36,12 +36,14 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_mib.c,v 1.74.2.2 2005/10/08 07:06:49 pjd Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_mib.c,v 1.84.2.1 2007/12/06 14:19:42 kib Exp $");
 
 #include "opt_posix.h"
+#include "opt_config.h"
 
 #include <sys/param.h>
 #include <sys/kernel.h>
+#include <sys/sbuf.h>
 #include <sys/systm.h>
 #include <sys/sysctl.h>
 #include <sys/proc.h>
@@ -102,7 +104,6 @@
  * NOTICE: The *userland* release date is available in
  * /usr/include/osreldate.h
  */
-extern int osreldate;
 SYSCTL_INT(_kern, KERN_OSRELDATE, osreldate, CTLFLAG_RD,
     &osreldate, 0, "Kernel release date");
 
@@ -150,6 +151,18 @@
     0, PAGE_SIZE, "System memory page size");
 
 static int
+sysctl_kern_arnd(SYSCTL_HANDLER_ARGS)
+{
+	u_long val;
+
+	arc4rand(&val, sizeof(val), 0);
+	return (sysctl_handle_long(oidp, &val, 0, req));
+}
+
+SYSCTL_PROC(_kern, KERN_ARND, arandom, CTLFLAG_RD,
+	0, 0, sysctl_kern_arnd, "L", "arc4rand");
+
+static int
 sysctl_hw_physmem(SYSCTL_HANDLER_ARGS)
 {
 	u_long val;
@@ -295,12 +308,30 @@
     CTLTYPE_INT|CTLFLAG_RW|CTLFLAG_PRISON, 0, 0, sysctl_kern_securelvl,
     "I", "Current secure level");
 
+#ifdef INCLUDE_CONFIG_FILE
+/* Actual kernel configuration options. */
+extern char kernconfstring[];
+
+static int
+sysctl_kern_config(SYSCTL_HANDLER_ARGS)
+{
+	return (sysctl_handle_string(oidp, kernconfstring,
+	    strlen(kernconfstring), req));
+}
+
+SYSCTL_PROC(_kern, OID_AUTO, conftxt, CTLTYPE_STRING|CTLFLAG_RW, 
+    0, 0, sysctl_kern_config, "", "Kernel configuration file");
+#endif
+
 char domainname[MAXHOSTNAMELEN];
 SYSCTL_STRING(_kern, KERN_NISDOMAINNAME, domainname, CTLFLAG_RW,
     &domainname, sizeof(domainname), "Name of the current YP/NIS domain");
 
 u_long hostid;
 SYSCTL_ULONG(_kern, KERN_HOSTID, hostid, CTLFLAG_RW, &hostid, 0, "Host ID");
+char hostuuid[64] = "00000000-0000-0000-0000-000000000000";
+SYSCTL_STRING(_kern, KERN_HOSTUUID, hostuuid, CTLFLAG_RW, hostuuid,
+    sizeof(hostuuid), "Host UUID");
 
 /*
  * This is really cheating.  These actually live in the libc, something
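
Three userland-visible additions in kern_mib.c: a kern.arandom sysctl backed by arc4rand() (the handler above returns a single long of entropy per read), a kern.hostuuid string alongside the existing kern.hostid, and an optional kern.conftxt node that exports the kernel configuration when the kernel is built with INCLUDE_CONFIG_FILE. A small sketch reading the first two from userland:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
	unsigned long rnd;
	char uuid[64];
	size_t len;

	len = sizeof(rnd);	/* sysctl_kern_arnd() hands back one long per request */
	if (sysctlbyname("kern.arandom", &rnd, &len, NULL, 0) == 0)
		printf("kern.arandom: %#lx\n", rnd);

	len = sizeof(uuid);
	if (sysctlbyname("kern.hostuuid", uuid, &len, NULL, 0) == 0)
		printf("kern.hostuuid: %s\n", uuid);
	return (0);
}
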
Index: subr_prf.c
===================================================================
RCS file: /home/cvs/src/sys/kern/subr_prf.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/subr_prf.c -L sys/kern/subr_prf.c -u -r1.2 -r1.3
--- sys/kern/subr_prf.c
+++ sys/kern/subr_prf.c
@@ -35,9 +35,10 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/subr_prf.c,v 1.116.2.3 2005/10/07 12:40:51 phk Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/subr_prf.c,v 1.130 2007/03/08 06:44:34 julian Exp $");
 
 #include "opt_ddb.h"
+#include "opt_printf.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -48,6 +49,7 @@
 #include <sys/kernel.h>
 #include <sys/msgbuf.h>
 #include <sys/malloc.h>
+#include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/stddef.h>
 #include <sys/sysctl.h>
@@ -55,6 +57,7 @@
 #include <sys/syslog.h>
 #include <sys/cons.h>
 #include <sys/uio.h>
+#include <sys/ctype.h>
 
 #ifdef DDB
 #include <ddb/ddb.h>
@@ -77,6 +80,10 @@
 	int	flags;
 	int	pri;
 	struct	tty *tty;
+	char	*p_bufr;
+	size_t	n_bufr;
+	char	*p_next;
+	size_t	remain;
 };
 
 struct snprintf_arg {
@@ -88,10 +95,9 @@
 
 static void  msglogchar(int c, int pri);
 static void  putchar(int ch, void *arg);
-static char *ksprintn(char *nbuf, uintmax_t num, int base, int *len);
+static char *ksprintn(char *nbuf, uintmax_t num, int base, int *len, int upper);
 static void  snprintf_func(int ch, void *arg);
 
-static int consintr = 1;		/* Ok to handle console interrupts? */
 static int msgbufmapped;		/* Set when safe to use msgbuf */
 int msgbuftrigger;
 
@@ -127,7 +133,7 @@
 	struct putchar_arg pca;
 	int retval;
 
-	if (td == NULL || td == PCPU_GET(idlethread))
+	if (td == NULL || TD_IS_IDLETHREAD(td))
 		return (0);
 
 	mtx_lock(&Giant);
@@ -233,6 +239,7 @@
 	pca.tty = NULL;
 	pca.pri = level;
 	pca.flags = log_open ? TOLOG : TOCONS;
+	pca.p_bufr = NULL;
 
 	va_start(ap, fmt);
 	kvprintf(fmt, putchar, &pca, 10, ap);
@@ -283,43 +290,108 @@
 printf(const char *fmt, ...)
 {
 	va_list ap;
-	int savintr;
 	struct putchar_arg pca;
 	int retval;
+#ifdef PRINTF_BUFR_SIZE
+	char bufr[PRINTF_BUFR_SIZE];
+#endif
 
-	savintr = consintr;		/* disable interrupts */
-	consintr = 0;
 	va_start(ap, fmt);
 	pca.tty = NULL;
 	pca.flags = TOCONS | TOLOG;
 	pca.pri = -1;
+#ifdef PRINTF_BUFR_SIZE
+	pca.p_bufr = bufr;
+	pca.p_next = pca.p_bufr;
+	pca.n_bufr = sizeof(bufr);
+	pca.remain = sizeof(bufr);
+	*pca.p_next = '\0';
+#else
+	/* Don't buffer console output. */
+	pca.p_bufr = NULL;
+#endif
+
 	retval = kvprintf(fmt, putchar, &pca, 10, ap);
 	va_end(ap);
+
+#ifdef PRINTF_BUFR_SIZE
+	/* Write any buffered console output: */
+	if (*pca.p_bufr != '\0')
+		cnputs(pca.p_bufr);
+#endif
+
 	if (!panicstr)
 		msgbuftrigger = 1;
-	consintr = savintr;		/* reenable interrupts */
+
 	return (retval);
 }
 
 int
 vprintf(const char *fmt, va_list ap)
 {
-	int savintr;
 	struct putchar_arg pca;
 	int retval;
+#ifdef PRINTF_BUFR_SIZE
+	char bufr[PRINTF_BUFR_SIZE];
+#endif
 
-	savintr = consintr;		/* disable interrupts */
-	consintr = 0;
 	pca.tty = NULL;
 	pca.flags = TOCONS | TOLOG;
 	pca.pri = -1;
+#ifdef PRINTF_BUFR_SIZE
+	pca.p_bufr = bufr;
+	pca.p_next = pca.p_bufr;
+	pca.n_bufr = sizeof(bufr);
+	pca.remain = sizeof(bufr);
+	*pca.p_next = '\0';
+#else
+	/* Don't buffer console output. */
+	pca.p_bufr = NULL;
+#endif
+
 	retval = kvprintf(fmt, putchar, &pca, 10, ap);
+
+#ifdef PRINTF_BUFR_SIZE
+	/* Write any buffered console output: */
+	if (*pca.p_bufr != '\0')
+		cnputs(pca.p_bufr);
+#endif
+
 	if (!panicstr)
 		msgbuftrigger = 1;
-	consintr = savintr;		/* reenable interrupts */
+
 	return (retval);
 }
 
+static void
+putcons(int c, struct putchar_arg *ap)
+{
+	/* Check if no console output buffer was provided. */
+	if (ap->p_bufr == NULL)
+		/* Output direct to the console. */
+		cnputc(c);
+	else {
+		/* Buffer the character: */
+		if (c == '\n') {
+			*ap->p_next++ = '\r';
+			ap->remain--;
+		}
+		*ap->p_next++ = c;
+		ap->remain--;
+
+		/* Always leave the buffer zero terminated. */
+		*ap->p_next = '\0';
+
+		/* Check if the buffer needs to be flushed. */
+		if (ap->remain < 3 || c == '\n') {
+			cnputs(ap->p_bufr);
+			ap->p_next = ap->p_bufr;
+			ap->remain = ap->n_bufr;
+			*ap->p_next = '\0';
+		}
+	}
+}
+
 /*
  * Print a character on console or users terminal.  If destination is
  * the console then the last bunch of characters are saved in msgbuf for
@@ -330,17 +402,15 @@
 {
 	struct putchar_arg *ap = (struct putchar_arg*) arg;
 	struct tty *tp = ap->tty;
-	int consdirect, flags = ap->flags;
+	int flags = ap->flags;
 
-	consdirect = ((flags & TOCONS) && constty == NULL);
 	/* Don't use the tty code after a panic or while in ddb. */
-	if (panicstr)
-		consdirect = 1;
-	if (kdb_active)
-		consdirect = 1;
-	if (consdirect) {
+	if (kdb_active) {
 		if (c != '\0')
 			cnputc(c);
+	} else if (panicstr || ((flags & TOCONS) && constty == NULL)) {
+		if (c != '\0')
+			putcons(c, ap);
 	} else {
 		if ((flags & TOTTY) && tp != NULL)
 			tputchar(c, tp);
@@ -348,7 +418,7 @@
 			if (constty != NULL)
 				msgbuf_addchar(&consmsgbuf, c);
 			if (always_console_output && c != '\0')
-				cnputc(c);
+				putcons(c, ap);
 		}
 	}
 	if ((flags & TOLOG))
@@ -451,14 +521,15 @@
  * The buffer pointed to by `nbuf' must have length >= MAXNBUF.
  */
 static char *
-ksprintn(char *nbuf, uintmax_t num, int base, int *lenp)
+ksprintn(char *nbuf, uintmax_t num, int base, int *lenp, int upper)
 {
-	char *p;
+	char *p, c;
 
 	p = nbuf;
 	*p = '\0';
 	do {
-		*++p = hex2ascii(num % base);
+		c = hex2ascii(num % base);
+		*++p = upper ? toupper(c) : c;
 	} while (num /= base);
 	if (lenp)
 		*lenp = p - nbuf;
@@ -503,7 +574,7 @@
 	uintmax_t num;
 	int base, lflag, qflag, tmp, width, ladjust, sharpflag, neg, sign, dot;
 	int cflag, hflag, jflag, tflag, zflag;
-	int dwidth;
+	int dwidth, upper;
 	char padc;
 	int stop = 0, retval = 0;
 
@@ -529,7 +600,7 @@
 		}
 		percent = fmt - 1;
 		qflag = 0; lflag = 0; ladjust = 0; sharpflag = 0; neg = 0;
-		sign = 0; dot = 0; dwidth = 0;
+		sign = 0; dot = 0; dwidth = 0; upper = 0;
 		cflag = 0; hflag = 0; jflag = 0; tflag = 0; zflag = 0;
 reswitch:	switch (ch = (u_char)*fmt++) {
 		case '.':
@@ -579,7 +650,7 @@
 		case 'b':
 			num = (u_int)va_arg(ap, int);
 			p = va_arg(ap, char *);
-			for (q = ksprintn(nbuf, num, *p++, NULL); *q;)
+			for (q = ksprintn(nbuf, num, *p++, NULL, 0); *q;)
 				PCHAR(*q--);
 
 			if (num == 0)
@@ -698,8 +769,9 @@
 		case 'u':
 			base = 10;
 			goto handle_nosign;
-		case 'x':
 		case 'X':
+			upper = 1;
+		case 'x':
 			base = 16;
 			goto handle_nosign;
 		case 'y':
@@ -750,7 +822,7 @@
 				neg = 1;
 				num = -(intmax_t)num;
 			}
-			p = ksprintn(nbuf, num, base, &tmp);
+			p = ksprintn(nbuf, num, base, &tmp, upper);
 			if (sharpflag && num != 0) {
 				if (base == 8)
 					tmp++;
@@ -823,7 +895,7 @@
 			dangling = 0;
 		}
 		msgbuf_addchar(msgbufp, '<');
-		for (p = ksprintn(nbuf, (uintmax_t)pri, 10, NULL); *p;)
+		for (p = ksprintn(nbuf, (uintmax_t)pri, 10, NULL, 0); *p;)
 			msgbuf_addchar(msgbufp, *p--);
 		msgbuf_addchar(msgbufp, '>');
 		lastpri = pri;
@@ -853,8 +925,6 @@
 	oldp = msgbufp;
 }
 
-SYSCTL_DECL(_security_bsd);
-
 static int unprivileged_read_msgbuf = 1;
 SYSCTL_INT(_security_bsd, OID_AUTO, unprivileged_read_msgbuf,
     CTLFLAG_RW, &unprivileged_read_msgbuf, 0,
@@ -869,7 +939,7 @@
 	int error, len;
 
 	if (!unprivileged_read_msgbuf) {
-		error = suser(req->td);
+		error = priv_check(req->td, PRIV_MSGBUF);
 		if (error)
 			return (error);
 	}
@@ -909,20 +979,17 @@
 
 DB_SHOW_COMMAND(msgbuf, db_show_msgbuf)
 {
-	int i, j, quit;
-
-	quit = 0;
+	int i, j;
 
 	if (!msgbufmapped) {
 		db_printf("msgbuf not mapped yet\n");
 		return;
 	}
-	db_setup_paging(db_simple_pager, &quit, db_lines_per_page);
 	db_printf("msgbufp = %p\n", msgbufp);
 	db_printf("magic = %x, size = %d, r= %u, w = %u, ptr = %p, cksum= %u\n",
 	    msgbufp->msg_magic, msgbufp->msg_size, msgbufp->msg_rseq,
 	    msgbufp->msg_wseq, msgbufp->msg_ptr, msgbufp->msg_cksum);
-	for (i = 0; i < msgbufp->msg_size && !quit; i++) {
+	for (i = 0; i < msgbufp->msg_size && !db_pager_quit; i++) {
 		j = MSGBUF_SEQ_TO_POS(msgbufp, i + msgbufp->msg_rseq);
 		db_printf("%c", msgbufp->msg_ptr[j]);
 	}
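
Two user-visible changes in subr_prf.c: kernel printf()/vprintf() can now batch console output in an on-stack buffer when the kernel is built with "options PRINTF_BUFR_SIZE=<n>" (putcons() flushes on newline or when fewer than three bytes remain), and kvprintf() gains a genuine uppercase %X by threading an "upper" flag into ksprintn(). A standalone userland rendition of the ksprintn() change, purely to show the effect of the flag (the digit table is inlined here; in the kernel it is the hex2ascii() macro):

#include <ctype.h>
#include <stdio.h>

/*
 * Userland copy of the patched ksprintn(): stores the digits least-
 * significant first and returns a pointer to the most significant one,
 * so the caller walks the buffer backwards.
 */
static char *
ksprintn(char *nbuf, unsigned long num, int base, int *lenp, int upper)
{
	const char *digits = "0123456789abcdefghijklmnopqrstuvwxyz";
	char *p, c;

	p = nbuf;
	*p = '\0';
	do {
		c = digits[num % base];
		*++p = upper ? toupper((unsigned char)c) : c;
	} while (num /= base);
	if (lenp != NULL)
		*lenp = p - nbuf;
	return (p);
}

int
main(void)
{
	char nbuf[24], *q;

	/* The %X path: consume digits back to front, as kvprintf() does. */
	for (q = ksprintn(nbuf, 0xdeadbeefUL, 16, NULL, 1); *q != '\0'; q--)
		putchar(*q);
	putchar('\n');		/* prints DEADBEEF */
	return (0);
}
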
Index: kern_time.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_time.c,v
retrieving revision 1.1.1.2
retrieving revision 1.2
diff -L sys/kern/kern_time.c -L sys/kern/kern_time.c -u -r1.1.1.2 -r1.2
--- sys/kern/kern_time.c
+++ sys/kern/kern_time.c
@@ -30,31 +30,37 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_time.c,v 1.116.2.1 2005/12/28 19:30:41 ps Exp $");
-
-#include "opt_mac.h"
+__FBSDID("$FreeBSD: src/sys/kern/kern_time.c,v 1.142 2007/06/09 21:48:44 attilio Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
+#include <sys/limits.h>
+#include <sys/clock.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/sysproto.h>
+#include <sys/eventhandler.h>
 #include <sys/resourcevar.h>
 #include <sys/signalvar.h>
 #include <sys/kernel.h>
-#include <sys/mac.h>
 #include <sys/syscallsubr.h>
+#include <sys/sysctl.h>
 #include <sys/sysent.h>
+#include <sys/priv.h>
 #include <sys/proc.h>
+#include <sys/posix4.h>
 #include <sys/time.h>
+#include <sys/timers.h>
 #include <sys/timetc.h>
 #include <sys/vnode.h>
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 
-int tz_minuteswest;
-int tz_dsttime;
+#define MAX_CLOCKS 	(CLOCK_MONOTONIC+1)
+
+static struct kclock	posix_clocks[MAX_CLOCKS];
+static uma_zone_t	itimer_zone = NULL;
 
 /*
  * Time of day and interval timer support.
@@ -70,6 +76,36 @@
 static void	timevalfix(struct timeval *);
 static void	no_lease_updatetime(int);
 
+static void	itimer_start(void);
+static int	itimer_init(void *, int, int);
+static void	itimer_fini(void *, int);
+static void	itimer_enter(struct itimer *);
+static void	itimer_leave(struct itimer *);
+static struct itimer *itimer_find(struct proc *, int);
+static void	itimers_alloc(struct proc *);
+static void	itimers_event_hook_exec(void *arg, struct proc *p, struct image_params *imgp);
+static void	itimers_event_hook_exit(void *arg, struct proc *p);
+static int	realtimer_create(struct itimer *);
+static int	realtimer_gettime(struct itimer *, struct itimerspec *);
+static int	realtimer_settime(struct itimer *, int,
+			struct itimerspec *, struct itimerspec *);
+static int	realtimer_delete(struct itimer *);
+static void	realtimer_clocktime(clockid_t, struct timespec *);
+static void	realtimer_expire(void *);
+static int	kern_timer_create(struct thread *, clockid_t,
+			struct sigevent *, int *, int);
+static int	kern_timer_delete(struct thread *, int);
+
+int		register_posix_clock(int, struct kclock *);
+void		itimer_fire(struct itimer *it);
+int		itimespecfix(struct timespec *ts);
+
+#define CLOCK_CALL(clock, call, arglist)		\
+	((*posix_clocks[clock].call) arglist)
+
+SYSINIT(posix_timer, SI_SUB_P1003_1B, SI_ORDER_FIRST+4, itimer_start, NULL);
+
+
 static void 
 no_lease_updatetime(deltat)
 	int deltat;
@@ -146,10 +182,6 @@
 	struct	timespec *tp;
 };
 #endif
-
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 clock_gettime(struct thread *td, struct clock_gettime_args *uap)
@@ -172,25 +204,44 @@
 
 	p = td->td_proc;
 	switch (clock_id) {
-	case CLOCK_REALTIME:
+	case CLOCK_REALTIME:		/* Default to precise. */
+	case CLOCK_REALTIME_PRECISE:
 		nanotime(ats);
 		break;
+	case CLOCK_REALTIME_FAST:
+		getnanotime(ats);
+		break;
 	case CLOCK_VIRTUAL:
 		PROC_LOCK(p);
+		PROC_SLOCK(p);
 		calcru(p, &user, &sys);
+		PROC_SUNLOCK(p);
 		PROC_UNLOCK(p);
 		TIMEVAL_TO_TIMESPEC(&user, ats);
 		break;
 	case CLOCK_PROF:
 		PROC_LOCK(p);
+		PROC_SLOCK(p);
 		calcru(p, &user, &sys);
+		PROC_SUNLOCK(p);
 		PROC_UNLOCK(p);
 		timevaladd(&user, &sys);
 		TIMEVAL_TO_TIMESPEC(&user, ats);
 		break;
-	case CLOCK_MONOTONIC:
+	case CLOCK_MONOTONIC:		/* Default to precise. */
+	case CLOCK_MONOTONIC_PRECISE:
+	case CLOCK_UPTIME:
+	case CLOCK_UPTIME_PRECISE:
 		nanouptime(ats);
 		break;
+	case CLOCK_UPTIME_FAST:
+	case CLOCK_MONOTONIC_FAST:
+		getnanouptime(ats);
+		break;
+	case CLOCK_SECOND:
+		ats->tv_sec = time_second;
+		ats->tv_nsec = 0;
+		break;
 	default:
 		return (EINVAL);
 	}
@@ -203,10 +254,6 @@
 	const struct	timespec *tp;
 };
 #endif
-
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 clock_settime(struct thread *td, struct clock_settime_args *uap)
@@ -225,12 +272,7 @@
 	struct timeval atv;
 	int error;
 
-#ifdef MAC
-	error = mac_check_system_settime(td->td_ucred);
-	if (error)
-		return (error);
-#endif
-	if ((error = suser(td)) != 0)
+	if ((error = priv_check(td, PRIV_CLOCK_SETTIME)) != 0)
 		return (error);
 	if (clock_id != CLOCK_REALTIME)
 		return (EINVAL);
@@ -248,7 +290,6 @@
 	struct	timespec *tp;
 };
 #endif
-
 int
 clock_getres(struct thread *td, struct clock_getres_args *uap)
 {
@@ -271,7 +312,14 @@
 	ts->tv_sec = 0;
 	switch (clock_id) {
 	case CLOCK_REALTIME:
+	case CLOCK_REALTIME_FAST:
+	case CLOCK_REALTIME_PRECISE:
 	case CLOCK_MONOTONIC:
+	case CLOCK_MONOTONIC_FAST:
+	case CLOCK_MONOTONIC_PRECISE:
+	case CLOCK_UPTIME:
+	case CLOCK_UPTIME_FAST:
+	case CLOCK_UPTIME_PRECISE:
 		/*
 		 * Round up the result of the division cheaply by adding 1.
 		 * Rounding up is especially important if rounding down
@@ -284,6 +332,10 @@
 		/* Accurately round up here because we can do so cheaply. */
 		ts->tv_nsec = (1000000000 + hz - 1) / hz;
 		break;
+	case CLOCK_SECOND:
+		ts->tv_sec = 1;
+		ts->tv_nsec = 0;
+		break;
 	default:
 		return (EINVAL);
 	}
@@ -335,10 +387,6 @@
 	struct	timespec *rmtp;
 };
 #endif
-
-/* 
- * MPSAFE
- */
 /* ARGSUSED */
 int
 nanosleep(struct thread *td, struct nanosleep_args *uap)
@@ -370,9 +418,6 @@
 	struct	timezone *tzp;
 };
 #endif
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 gettimeofday(struct thread *td, struct gettimeofday_args *uap)
@@ -399,9 +444,6 @@
 	struct	timezone *tzp;
 };
 #endif
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 settimeofday(struct thread *td, struct settimeofday_args *uap)
@@ -432,12 +474,7 @@
 {
 	int error;
 
-#ifdef MAC
-	error = mac_check_system_settime(td->td_ucred);
-	if (error)
-		return (error);
-#endif
-	error = suser(td);
+	error = priv_check(td, PRIV_SETTIMEOFDAY);
 	if (error)
 		return (error);
 	/* Verify all parameters before changing time. */
@@ -454,25 +491,25 @@
 }
 
 /*
- * Get value of an interval timer.  The process virtual and
- * profiling virtual time timers are kept in the p_stats area, since
- * they can be swapped out.  These are kept internally in the
- * way they are specified externally: in time until they expire.
+ * Get value of an interval timer.  The process virtual and profiling virtual
+ * time timers are kept in the p_stats area, since they can be swapped out.
+ * These are kept internally in the way they are specified externally: in
+ * time until they expire.
  *
- * The real time interval timer is kept in the process table slot
- * for the process, and its value (it_value) is kept as an
- * absolute time rather than as a delta, so that it is easy to keep
- * periodic real-time signals from drifting.
+ * The real time interval timer is kept in the process table slot for the
+ * process, and its value (it_value) is kept as an absolute time rather than
+ * as a delta, so that it is easy to keep periodic real-time signals from
+ * drifting.
  *
  * Virtual time timers are processed in the hardclock() routine of
- * kern_clock.c.  The real time timer is processed by a timeout
- * routine, called from the softclock() routine.  Since a callout
- * may be delayed in real time due to interrupt processing in the system,
- * it is possible for the real time timeout routine (realitexpire, given below),
- * to be delayed in real time past when it is supposed to occur.  It
- * does not suffice, therefore, to reload the real timer .it_value from the
- * real time timers .it_interval.  Rather, we compute the next time in
- * absolute time the timer should go off.
+ * kern_clock.c.  The real time timer is processed by a timeout routine,
+ * called from the softclock() routine.  Since a callout may be delayed in
+ * real time due to interrupt processing in the system, it is possible for
+ * the real time timeout routine (realitexpire, given below), to be delayed
+ * in real time past when it is supposed to occur.  It does not suffice,
+ * therefore, to reload the real timer .it_value from the real time timers
+ * .it_interval.  Rather, we compute the next time in absolute time the timer
+ * should go off.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct getitimer_args {
@@ -480,9 +517,6 @@
 	struct	itimerval *itv;
 };
 #endif
-/*
- * MPSAFE
- */
 int
 getitimer(struct thread *td, struct getitimer_args *uap)
 {
@@ -522,9 +556,9 @@
 				timevalsub(&aitv->it_value, &ctv);
 		}
 	} else {
-		mtx_lock_spin(&sched_lock);
+		PROC_SLOCK(p);
 		*aitv = p->p_stats->p_timer[which];
-		mtx_unlock_spin(&sched_lock);
+		PROC_SUNLOCK(p);
 	}
 	return (0);
 }
@@ -535,10 +569,6 @@
 	struct	itimerval *itv, *oitv;
 };
 #endif
-
-/*
- * MPSAFE
- */
 int
 setitimer(struct thread *td, struct setitimer_args *uap)
 {
@@ -597,10 +627,10 @@
 				timevalsub(&oitv->it_value, &ctv);
 		}
 	} else {
-		mtx_lock_spin(&sched_lock);
+		PROC_SLOCK(p);
 		*oitv = p->p_stats->p_timer[which];
 		p->p_stats->p_timer[which] = *aitv;
-		mtx_unlock_spin(&sched_lock);
+		PROC_SUNLOCK(p);
 	}
 	return (0);
 }
@@ -659,8 +689,7 @@
 itimerfix(struct timeval *tv)
 {
 
-	if (tv->tv_sec < 0 || tv->tv_sec > 100000000 ||
-	    tv->tv_usec < 0 || tv->tv_usec >= 1000000)
+	if (tv->tv_sec < 0 || tv->tv_usec < 0 || tv->tv_usec >= 1000000)
 		return (EINVAL);
 	if (tv->tv_sec == 0 && tv->tv_usec != 0 && tv->tv_usec < tick)
 		tv->tv_usec = tick;
@@ -807,3 +836,655 @@
 		return (maxpps < 0 || *curpps < maxpps);
 	}
 }
+
+static void
+itimer_start(void)
+{
+	struct kclock rt_clock = {
+		.timer_create  = realtimer_create,
+		.timer_delete  = realtimer_delete,
+		.timer_settime = realtimer_settime,
+		.timer_gettime = realtimer_gettime,
+		.event_hook    = NULL
+	};
+
+	itimer_zone = uma_zcreate("itimer", sizeof(struct itimer),
+		NULL, NULL, itimer_init, itimer_fini, UMA_ALIGN_PTR, 0);
+	register_posix_clock(CLOCK_REALTIME,  &rt_clock);
+	register_posix_clock(CLOCK_MONOTONIC, &rt_clock);
+	p31b_setcfg(CTL_P1003_1B_TIMERS, 200112L);
+	p31b_setcfg(CTL_P1003_1B_DELAYTIMER_MAX, INT_MAX);
+	p31b_setcfg(CTL_P1003_1B_TIMER_MAX, TIMER_MAX);
+	EVENTHANDLER_REGISTER(process_exit, itimers_event_hook_exit,
+		(void *)ITIMER_EV_EXIT, EVENTHANDLER_PRI_ANY);
+	EVENTHANDLER_REGISTER(process_exec, itimers_event_hook_exec,
+		(void *)ITIMER_EV_EXEC, EVENTHANDLER_PRI_ANY);
+}
+
+int
+register_posix_clock(int clockid, struct kclock *clk)
+{
+	if ((unsigned)clockid >= MAX_CLOCKS) {
+		printf("%s: invalid clockid\n", __func__);
+		return (0);
+	}
+	posix_clocks[clockid] = *clk;
+	return (1);
+}
+
+static int
+itimer_init(void *mem, int size, int flags)
+{
+	struct itimer *it;
+
+	it = (struct itimer *)mem;
+	mtx_init(&it->it_mtx, "itimer lock", NULL, MTX_DEF);
+	return (0);
+}
+
+static void
+itimer_fini(void *mem, int size)
+{
+	struct itimer *it;
+
+	it = (struct itimer *)mem;
+	mtx_destroy(&it->it_mtx);
+}
+
+static void
+itimer_enter(struct itimer *it)
+{
+
+	mtx_assert(&it->it_mtx, MA_OWNED);
+	it->it_usecount++;
+}
+
+static void
+itimer_leave(struct itimer *it)
+{
+
+	mtx_assert(&it->it_mtx, MA_OWNED);
+	KASSERT(it->it_usecount > 0, ("invalid it_usecount"));
+
+	if (--it->it_usecount == 0 && (it->it_flags & ITF_WANTED) != 0)
+		wakeup(it);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct ktimer_create_args {
+	clockid_t clock_id;
+	struct sigevent * evp;
+	int * timerid;
+};
+#endif
+int
+ktimer_create(struct thread *td, struct ktimer_create_args *uap)
+{
+	struct sigevent *evp1, ev;
+	int id;
+	int error;
+
+	if (uap->evp != NULL) {
+		error = copyin(uap->evp, &ev, sizeof(ev));
+		if (error != 0)
+			return (error);
+		evp1 = &ev;
+	} else
+		evp1 = NULL;
+
+	error = kern_timer_create(td, uap->clock_id, evp1, &id, -1);
+
+	if (error == 0) {
+		error = copyout(&id, uap->timerid, sizeof(int));
+		if (error != 0)
+			kern_timer_delete(td, id);
+	}
+	return (error);
+}
+
+static int
+kern_timer_create(struct thread *td, clockid_t clock_id,
+	struct sigevent *evp, int *timerid, int preset_id)
+{
+	struct proc *p = td->td_proc;
+	struct itimer *it;
+	int id;
+	int error;
+
+	if (clock_id < 0 || clock_id >= MAX_CLOCKS)
+		return (EINVAL);
+
+	if (posix_clocks[clock_id].timer_create == NULL)
+		return (EINVAL);
+
+	if (evp != NULL) {
+		if (evp->sigev_notify != SIGEV_NONE &&
+		    evp->sigev_notify != SIGEV_SIGNAL &&
+		    evp->sigev_notify != SIGEV_THREAD_ID)
+			return (EINVAL);
+		if ((evp->sigev_notify == SIGEV_SIGNAL ||
+		     evp->sigev_notify == SIGEV_THREAD_ID) &&
+			!_SIG_VALID(evp->sigev_signo))
+			return (EINVAL);
+	}
+	
+	if (p->p_itimers == NULL)
+		itimers_alloc(p);
+	
+	it = uma_zalloc(itimer_zone, M_WAITOK);
+	it->it_flags = 0;
+	it->it_usecount = 0;
+	it->it_active = 0;
+	timespecclear(&it->it_time.it_value);
+	timespecclear(&it->it_time.it_interval);
+	it->it_overrun = 0;
+	it->it_overrun_last = 0;
+	it->it_clockid = clock_id;
+	it->it_timerid = -1;
+	it->it_proc = p;
+	ksiginfo_init(&it->it_ksi);
+	it->it_ksi.ksi_flags |= KSI_INS | KSI_EXT;
+	error = CLOCK_CALL(clock_id, timer_create, (it));
+	if (error != 0)
+		goto out;
+
+	PROC_LOCK(p);
+	if (preset_id != -1) {
+		KASSERT(preset_id >= 0 && preset_id < 3, ("invalid preset_id"));
+		id = preset_id;
+		if (p->p_itimers->its_timers[id] != NULL) {
+			PROC_UNLOCK(p);
+			error = 0;
+			goto out;
+		}
+	} else {
+		/*
+		 * Find a free timer slot, skipping those reserved
+		 * for setitimer().
+		 */
+		for (id = 3; id < TIMER_MAX; id++)
+			if (p->p_itimers->its_timers[id] == NULL)
+				break;
+		if (id == TIMER_MAX) {
+			PROC_UNLOCK(p);
+			error = EAGAIN;
+			goto out;
+		}
+	}
+	it->it_timerid = id;
+	p->p_itimers->its_timers[id] = it;
+	if (evp != NULL)
+		it->it_sigev = *evp;
+	else {
+		it->it_sigev.sigev_notify = SIGEV_SIGNAL;
+		switch (clock_id) {
+		default:
+		case CLOCK_REALTIME:
+			it->it_sigev.sigev_signo = SIGALRM;
+			break;
+		case CLOCK_VIRTUAL:
+ 			it->it_sigev.sigev_signo = SIGVTALRM;
+			break;
+		case CLOCK_PROF:
+			it->it_sigev.sigev_signo = SIGPROF;
+			break;
+		}
+		it->it_sigev.sigev_value.sival_int = id;
+	}
+
+	if (it->it_sigev.sigev_notify == SIGEV_SIGNAL ||
+	    it->it_sigev.sigev_notify == SIGEV_THREAD_ID) {
+		it->it_ksi.ksi_signo = it->it_sigev.sigev_signo;
+		it->it_ksi.ksi_code = SI_TIMER;
+		it->it_ksi.ksi_value = it->it_sigev.sigev_value;
+		it->it_ksi.ksi_timerid = id;
+	}
+	PROC_UNLOCK(p);
+	*timerid = id;
+	return (0);
+
+out:
+	ITIMER_LOCK(it);
+	CLOCK_CALL(it->it_clockid, timer_delete, (it));
+	ITIMER_UNLOCK(it);
+	uma_zfree(itimer_zone, it);
+	return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct ktimer_delete_args {
+	int timerid;
+};
+#endif
+int
+ktimer_delete(struct thread *td, struct ktimer_delete_args *uap)
+{
+	return (kern_timer_delete(td, uap->timerid));
+}
+
+static struct itimer *
+itimer_find(struct proc *p, int timerid)
+{
+	struct itimer *it;
+
+	PROC_LOCK_ASSERT(p, MA_OWNED);
+	if ((p->p_itimers == NULL) || (timerid >= TIMER_MAX) ||
+	    (it = p->p_itimers->its_timers[timerid]) == NULL) {
+		return (NULL);
+	}
+	ITIMER_LOCK(it);
+	if ((it->it_flags & ITF_DELETING) != 0) {
+		ITIMER_UNLOCK(it);
+		it = NULL;
+	}
+	return (it);
+}
+
+static int
+kern_timer_delete(struct thread *td, int timerid)
+{
+	struct proc *p = td->td_proc;
+	struct itimer *it;
+
+	PROC_LOCK(p);
+	it = itimer_find(p, timerid);
+	if (it == NULL) {
+		PROC_UNLOCK(p);
+		return (EINVAL);
+	}
+	PROC_UNLOCK(p);
+
+	it->it_flags |= ITF_DELETING;
+	while (it->it_usecount > 0) {
+		it->it_flags |= ITF_WANTED;
+		msleep(it, &it->it_mtx, PPAUSE, "itimer", 0);
+	}
+	it->it_flags &= ~ITF_WANTED;
+	CLOCK_CALL(it->it_clockid, timer_delete, (it));
+	ITIMER_UNLOCK(it);
+
+	PROC_LOCK(p);
+	if (KSI_ONQ(&it->it_ksi))
+		sigqueue_take(&it->it_ksi);
+	p->p_itimers->its_timers[timerid] = NULL;
+	PROC_UNLOCK(p);
+	uma_zfree(itimer_zone, it);
+	return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct ktimer_settime_args {
+	int timerid;
+	int flags;
+	const struct itimerspec * value;
+	struct itimerspec * ovalue;
+};
+#endif
+int
+ktimer_settime(struct thread *td, struct ktimer_settime_args *uap)
+{
+	struct proc *p = td->td_proc;
+	struct itimer *it;
+	struct itimerspec val, oval, *ovalp;
+	int error;
+
+	error = copyin(uap->value, &val, sizeof(val));
+	if (error != 0)
+		return (error);
+	
+	if (uap->ovalue != NULL)
+		ovalp = &oval;
+	else
+		ovalp = NULL;
+
+	PROC_LOCK(p);
+	if (uap->timerid < 3 ||
+	    (it = itimer_find(p, uap->timerid)) == NULL) {
+		PROC_UNLOCK(p);
+		error = EINVAL;
+	} else {
+		PROC_UNLOCK(p);
+		itimer_enter(it);
+		error = CLOCK_CALL(it->it_clockid, timer_settime,
+				(it, uap->flags, &val, ovalp));
+		itimer_leave(it);
+		ITIMER_UNLOCK(it);
+	}
+	if (error == 0 && uap->ovalue != NULL)
+		error = copyout(ovalp, uap->ovalue, sizeof(*ovalp));
+	return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct ktimer_gettime_args {
+	int timerid;
+	struct itimerspec * value;
+};
+#endif
+int
+ktimer_gettime(struct thread *td, struct ktimer_gettime_args *uap)
+{
+	struct proc *p = td->td_proc;
+	struct itimer *it;
+	struct itimerspec val;
+	int error;
+
+	PROC_LOCK(p);
+	if (uap->timerid < 3 ||
+	   (it = itimer_find(p, uap->timerid)) == NULL) {
+		PROC_UNLOCK(p);
+		error = EINVAL;
+	} else {
+		PROC_UNLOCK(p);
+		itimer_enter(it);
+		error = CLOCK_CALL(it->it_clockid, timer_gettime,
+				(it, &val));
+		itimer_leave(it);
+		ITIMER_UNLOCK(it);
+	}
+	if (error == 0)
+		error = copyout(&val, uap->value, sizeof(val));
+	return (error);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct timer_getoverrun_args {
+	int timerid;
+};
+#endif
+int
+ktimer_getoverrun(struct thread *td, struct ktimer_getoverrun_args *uap)
+{
+	struct proc *p = td->td_proc;
+	struct itimer *it;
+	int error ;
+
+	PROC_LOCK(p);
+	if (uap->timerid < 3 ||
+	    (it = itimer_find(p, uap->timerid)) == NULL) {
+		PROC_UNLOCK(p);
+		error = EINVAL;
+	} else {
+		td->td_retval[0] = it->it_overrun_last;
+		ITIMER_UNLOCK(it);
+		PROC_UNLOCK(p);
+		error = 0;
+	}
+	return (error);
+}
+
+static int
+realtimer_create(struct itimer *it)
+{
+	callout_init_mtx(&it->it_callout, &it->it_mtx, 0);
+	return (0);
+}
+
+static int
+realtimer_delete(struct itimer *it)
+{
+	mtx_assert(&it->it_mtx, MA_OWNED);
+	
+	ITIMER_UNLOCK(it);
+	callout_drain(&it->it_callout);
+	ITIMER_LOCK(it);
+	return (0);
+}
+
+static int
+realtimer_gettime(struct itimer *it, struct itimerspec *ovalue)
+{
+	struct timespec cts;
+
+	mtx_assert(&it->it_mtx, MA_OWNED);
+
+	realtimer_clocktime(it->it_clockid, &cts);
+	*ovalue = it->it_time;
+	if (ovalue->it_value.tv_sec != 0 || ovalue->it_value.tv_nsec != 0) {
+		timespecsub(&ovalue->it_value, &cts);
+		if (ovalue->it_value.tv_sec < 0 ||
+		    (ovalue->it_value.tv_sec == 0 &&
+		     ovalue->it_value.tv_nsec == 0)) {
+			ovalue->it_value.tv_sec  = 0;
+			ovalue->it_value.tv_nsec = 1;
+		}
+	}
+	return (0);
+}
+
+static int
+realtimer_settime(struct itimer *it, int flags,
+	struct itimerspec *value, struct itimerspec *ovalue)
+{
+	struct timespec cts, ts;
+	struct timeval tv;
+	struct itimerspec val;
+
+	mtx_assert(&it->it_mtx, MA_OWNED);
+
+	val = *value;
+	if (itimespecfix(&val.it_value))
+		return (EINVAL);
+
+	if (timespecisset(&val.it_value)) {
+		if (itimespecfix(&val.it_interval))
+			return (EINVAL);
+	} else {
+		timespecclear(&val.it_interval);
+	}
+	
+	if (ovalue != NULL)
+		realtimer_gettime(it, ovalue);
+
+	it->it_time = val;
+	if (timespecisset(&val.it_value)) {
+		realtimer_clocktime(it->it_clockid, &cts);
+		ts = val.it_value;
+		if ((flags & TIMER_ABSTIME) == 0) {
+			/* Convert to absolute time. */
+			timespecadd(&it->it_time.it_value, &cts);
+		} else {
+			timespecsub(&ts, &cts);
+			/*
+			 * We don't care if ts is negative, tztohz will
+			 * fix it.
+			 */
+		}
+		TIMESPEC_TO_TIMEVAL(&tv, &ts);
+		callout_reset(&it->it_callout, tvtohz(&tv),
+			realtimer_expire, it);
+	} else {
+		callout_stop(&it->it_callout);
+	}
+
+	return (0);
+}
+
+static void
+realtimer_clocktime(clockid_t id, struct timespec *ts)
+{
+	if (id == CLOCK_REALTIME)
+		getnanotime(ts);
+	else	/* CLOCK_MONOTONIC */
+		getnanouptime(ts);
+}
+
+int
+itimer_accept(struct proc *p, int timerid, ksiginfo_t *ksi)
+{
+	struct itimer *it;
+
+	PROC_LOCK_ASSERT(p, MA_OWNED);
+	it = itimer_find(p, timerid);
+	if (it != NULL) {
+		ksi->ksi_overrun = it->it_overrun;
+		it->it_overrun_last = it->it_overrun;
+		it->it_overrun = 0;
+		ITIMER_UNLOCK(it);
+		return (0);
+	}
+	return (EINVAL);
+}
+
+int
+itimespecfix(struct timespec *ts)
+{
+
+	if (ts->tv_sec < 0 || ts->tv_nsec < 0 || ts->tv_nsec >= 1000000000)
+		return (EINVAL);
+	if (ts->tv_sec == 0 && ts->tv_nsec != 0 && ts->tv_nsec < tick * 1000)
+		ts->tv_nsec = tick * 1000;
+	return (0);
+}
+
+/* Timeout callback for realtime timer */
+static void
+realtimer_expire(void *arg)
+{
+	struct timespec cts, ts;
+	struct timeval tv;
+	struct itimer *it;
+	struct proc *p;
+
+	it = (struct itimer *)arg;
+	p = it->it_proc;
+
+	realtimer_clocktime(it->it_clockid, &cts);
+	/* Only fire if time is reached. */
+	if (timespeccmp(&cts, &it->it_time.it_value, >=)) {
+		if (timespecisset(&it->it_time.it_interval)) {
+			timespecadd(&it->it_time.it_value,
+				    &it->it_time.it_interval);
+			while (timespeccmp(&cts, &it->it_time.it_value, >=)) {
+				if (it->it_overrun < INT_MAX)
+					it->it_overrun++;
+				else
+					it->it_ksi.ksi_errno = ERANGE;
+				timespecadd(&it->it_time.it_value,
+					    &it->it_time.it_interval);
+			}
+		} else {
+			/* single shot timer ? */
+			timespecclear(&it->it_time.it_value);
+		}
+		if (timespecisset(&it->it_time.it_value)) {
+			ts = it->it_time.it_value;
+			timespecsub(&ts, &cts);
+			TIMESPEC_TO_TIMEVAL(&tv, &ts);
+			callout_reset(&it->it_callout, tvtohz(&tv),
+				 realtimer_expire, it);
+		}
+		ITIMER_UNLOCK(it);
+		itimer_fire(it);
+		ITIMER_LOCK(it);
+	} else if (timespecisset(&it->it_time.it_value)) {
+		ts = it->it_time.it_value;
+		timespecsub(&ts, &cts);
+		TIMESPEC_TO_TIMEVAL(&tv, &ts);
+		callout_reset(&it->it_callout, tvtohz(&tv), realtimer_expire,
+ 			it);
+	}
+}
+
+void
+itimer_fire(struct itimer *it)
+{
+	struct proc *p = it->it_proc;
+	int ret;
+
+	if (it->it_sigev.sigev_notify == SIGEV_SIGNAL ||
+	    it->it_sigev.sigev_notify == SIGEV_THREAD_ID) {
+		PROC_LOCK(p);
+		if (!KSI_ONQ(&it->it_ksi)) {
+			it->it_ksi.ksi_errno = 0;
+			ret = psignal_event(p, &it->it_sigev, &it->it_ksi);
+			if (__predict_false(ret != 0)) {
+				it->it_overrun++;
+				/*
+				 * Broken userland code, thread went
+				 * away, disarm the timer.
+				 */
+				if (ret == ESRCH) {
+					ITIMER_LOCK(it);
+					timespecclear(&it->it_time.it_value);
+					timespecclear(&it->it_time.it_interval);
+					callout_stop(&it->it_callout);
+					ITIMER_UNLOCK(it);
+				}
+			}
+		} else {
+			if (it->it_overrun < INT_MAX)
+				it->it_overrun++;
+			else
+				it->it_ksi.ksi_errno = ERANGE;
+		}
+		PROC_UNLOCK(p);
+	}
+}
+
+static void
+itimers_alloc(struct proc *p)
+{
+	struct itimers *its;
+	int i;
+
+	its = malloc(sizeof (struct itimers), M_SUBPROC, M_WAITOK | M_ZERO);
+	LIST_INIT(&its->its_virtual);
+	LIST_INIT(&its->its_prof);
+	TAILQ_INIT(&its->its_worklist);
+	for (i = 0; i < TIMER_MAX; i++)
+		its->its_timers[i] = NULL;
+	PROC_LOCK(p);
+	if (p->p_itimers == NULL) {
+		p->p_itimers = its;
+		PROC_UNLOCK(p);
+	}
+	else {
+		PROC_UNLOCK(p);
+		free(its, M_SUBPROC);
+	}
+}
+
+static void
+itimers_event_hook_exec(void *arg, struct proc *p, struct image_params *imgp __unused)
+{
+	itimers_event_hook_exit(arg, p);
+}
+
+/* Clean up timers when some process events are being triggered. */
+static void
+itimers_event_hook_exit(void *arg, struct proc *p)
+{
+	struct itimers *its;
+	struct itimer *it;
+	int event = (int)(intptr_t)arg;
+	int i;
+
+	if (p->p_itimers != NULL) {
+		its = p->p_itimers;
+		for (i = 0; i < MAX_CLOCKS; ++i) {
+			if (posix_clocks[i].event_hook != NULL)
+				CLOCK_CALL(i, event_hook, (p, i, event));
+		}
+		/*
+		 * According to susv3, XSI interval timers should be inherited
+		 * by new image.
+		 */
+		if (event == ITIMER_EV_EXEC)
+			i = 3;
+		else if (event == ITIMER_EV_EXIT)
+			i = 0;
+		else
+			panic("unhandled event");
+		for (; i < TIMER_MAX; ++i) {
+			if ((it = its->its_timers[i]) != NULL)
+				kern_timer_delete(curthread, i);
+		}
+		if (its->its_timers[0] == NULL &&
+		    its->its_timers[1] == NULL &&
+		    its->its_timers[2] == NULL) {
+			free(its, M_SUBPROC);
+			p->p_itimers = NULL;
+		}
+	}
+}
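
The bulk of the kern_time.c change is a POSIX per-process timer implementation (ktimer_create(), ktimer_settime(), ktimer_gettime(), ktimer_delete(), ktimer_getoverrun()) built on a small kclock dispatch table, plus the *_FAST/*_PRECISE, CLOCK_UPTIME and CLOCK_SECOND clock ids in clock_gettime()/clock_getres() and the switch from suser()/MAC checks to priv_check() for setting the clock. Userland reaches the new syscalls through the ordinary timer_create(2) family; a minimal one-shot example (SIGALRM is also what kern_timer_create() above defaults to for CLOCK_REALTIME when no sigevent is supplied):

#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <unistd.h>

static volatile sig_atomic_t fired;

static void
on_alarm(int sig)
{

	(void)sig;
	fired = 1;
}

int
main(void)
{
	struct sigevent ev;
	struct itimerspec its;
	timer_t tid;

	signal(SIGALRM, on_alarm);

	memset(&ev, 0, sizeof(ev));
	ev.sigev_notify = SIGEV_SIGNAL;
	ev.sigev_signo = SIGALRM;
	if (timer_create(CLOCK_REALTIME, &ev, &tid) != 0)
		return (1);

	memset(&its, 0, sizeof(its));
	its.it_value.tv_sec = 1;	/* one shot, one second from now */
	if (timer_settime(tid, 0, &its, NULL) != 0)
		return (1);

	while (!fired)
		pause();
	printf("expired, overruns: %d\n", timer_getoverrun(tid));
	timer_delete(tid);
	return (0);
}
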
Index: kern_kthread.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_kthread.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/kern_kthread.c -L sys/kern/kern_kthread.c -u -r1.2 -r1.3
--- sys/kern/kern_kthread.c
+++ sys/kern/kern_kthread.c
@@ -25,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_kthread.c,v 1.34 2005/01/06 23:35:39 imp Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_kthread.c,v 1.38 2007/06/05 00:00:54 jeff Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -38,6 +38,7 @@
 #include <sys/sx.h>
 #include <sys/unistd.h>
 #include <sys/wait.h>
+#include <sys/sched.h>
 
 #include <machine/stdarg.h>
 
@@ -112,9 +113,9 @@
 
 	/* Delay putting it on the run queue until now. */
 	if (!(flags & RFSTOPPED)) {
-		mtx_lock_spin(&sched_lock);
-		setrunqueue(td, SRQ_BORING); 
-		mtx_unlock_spin(&sched_lock);
+		thread_lock(td);
+		sched_add(td, SRQ_BORING); 
+		thread_unlock(td);
 	}
 
 	return 0;
@@ -128,11 +129,23 @@
 
 	td = curthread;
 	p = td->td_proc;
+
+	/*
+	 * Reparent curthread from proc0 to init so that the zombie
+	 * is harvested.
+	 */
 	sx_xlock(&proctree_lock);
 	PROC_LOCK(p);
 	proc_reparent(p, initproc);
 	PROC_UNLOCK(p);
 	sx_xunlock(&proctree_lock);
+
+	/*
+	 * Wakeup anyone waiting for us to exit.
+	 */
+	wakeup(p);
+
+	/* Buh-bye! */
 	exit1(td, W_EXITCODE(ecode, 0));
 }
 
Index: kern_thr.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_thr.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/kern_thr.c -L sys/kern/kern_thr.c -u -r1.2 -r1.3
--- sys/kern/kern_thr.c
+++ sys/kern/kern_thr.c
@@ -25,45 +25,64 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_thr.c,v 1.34.2.2 2006/01/16 06:25:32 davidxu Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_thr.c,v 1.62.4.1 2008/01/19 18:15:05 kib Exp $");
 
+#include "opt_compat.h"
+#include "opt_posix.h"
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
+#include <sys/priv.h>
 #include <sys/proc.h>
+#include <sys/posix4.h>
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
 #include <sys/sysctl.h>
 #include <sys/smp.h>
+#include <sys/syscallsubr.h>
 #include <sys/sysent.h>
 #include <sys/systm.h>
 #include <sys/sysproto.h>
 #include <sys/signalvar.h>
 #include <sys/ucontext.h>
 #include <sys/thr.h>
+#include <sys/rtprio.h>
 #include <sys/umtx.h>
+#include <sys/limits.h>
 
 #include <machine/frame.h>
 
-extern int max_threads_per_proc;
-extern int max_groups_per_proc;
+#include <security/audit/audit.h>
+
+#ifdef COMPAT_IA32
 
-SYSCTL_DECL(_kern_threads);
-static int thr_scope = 0;
-SYSCTL_INT(_kern_threads, OID_AUTO, thr_scope, CTLFLAG_RW,
-	&thr_scope, 0, "sys or proc scope scheduling");
-
-static int thr_concurrency = 0;
-SYSCTL_INT(_kern_threads, OID_AUTO, thr_concurrency, CTLFLAG_RW,
-	&thr_concurrency, 0, "a concurrency value if not default");
+extern struct sysentvec ia32_freebsd_sysvec;
+
+static inline int
+suword_lwpid(void *addr, lwpid_t lwpid)
+{
+	int error;
+
+	if (curproc->p_sysent != &ia32_freebsd_sysvec)
+		error = suword(addr, lwpid);
+	else
+		error = suword32(addr, lwpid);
+	return (error);
+}
+
+#else
+#define suword_lwpid	suword
+#endif
+
+extern int max_threads_per_proc;
 
 static int create_thread(struct thread *td, mcontext_t *ctx,
 			 void (*start_func)(void *), void *arg,
 			 char *stack_base, size_t stack_size,
 			 char *tls_base,
 			 long *child_tid, long *parent_tid,
-			 int flags);
+			 int flags, struct rtprio *rtp);
 
 /*
  * System call interface.
@@ -79,7 +98,7 @@
 		return (error);
 
 	error = create_thread(td, &ctx.uc_mcontext, NULL, NULL,
-		NULL, 0, NULL, uap->id, NULL, uap->flags);
+		NULL, 0, NULL, uap->id, NULL, uap->flags, NULL);
 	return (error);
 }
 
@@ -90,13 +109,29 @@
 	struct thr_param param;
 	int error;
 
-	if (uap->param_size < sizeof(param))
+	if (uap->param_size < 0 || uap->param_size > sizeof(param))
 		return (EINVAL);
-	if ((error = copyin(uap->param, &param, sizeof(param))))
+	bzero(&param, sizeof(param));
+	if ((error = copyin(uap->param, &param, uap->param_size)))
 		return (error);
-	error = create_thread(td, NULL, param.start_func, param.arg,
-		param.stack_base, param.stack_size, param.tls_base,
-		param.child_tid, param.parent_tid, param.flags);
+	return (kern_thr_new(td, &param));
+}
+
+int
+kern_thr_new(struct thread *td, struct thr_param *param)
+{
+	struct rtprio rtp, *rtpp;
+	int error;
+
+	rtpp = NULL;
+	if (param->rtp != 0) {
+		error = copyin(param->rtp, &rtp, sizeof(struct rtprio));
+		rtpp = &rtp;
+	}
+	error = create_thread(td, NULL, param->start_func, param->arg,
+		param->stack_base, param->stack_size, param->tls_base,
+		param->child_tid, param->parent_tid, param->flags,
+		rtpp);
 	return (error);
 }
 
@@ -106,36 +141,42 @@
 	    char *stack_base, size_t stack_size,
 	    char *tls_base,
 	    long *child_tid, long *parent_tid,
-	    int flags)
+	    int flags, struct rtprio *rtp)
 {
 	stack_t stack;
 	struct thread *newtd;
-	struct ksegrp *kg, *newkg;
 	struct proc *p;
-	long id;
-	int error, scope_sys, linkkg;
+	int error;
 
 	error = 0;
 	p = td->td_proc;
-	kg = td->td_ksegrp;
 
 	/* Have race condition but it is cheap. */
-	if ((p->p_numksegrps >= max_groups_per_proc) ||
-	    (p->p_numthreads >= max_threads_per_proc)) {
+	if (p->p_numthreads >= max_threads_per_proc)
 		return (EPROCLIM);
-	}
 
-	/* Check PTHREAD_SCOPE_SYSTEM */
-	scope_sys = (flags & THR_SYSTEM_SCOPE) != 0;
-
-	/* sysctl overrides user's flag */
-	if (thr_scope == 1)
-		scope_sys = 0;
-	else if (thr_scope == 2)
-		scope_sys = 1;
+	if (rtp != NULL) {
+		switch(rtp->type) {
+		case RTP_PRIO_REALTIME:
+		case RTP_PRIO_FIFO:
+			/* Only root can set scheduler policy */
+			if (priv_check(td, PRIV_SCHED_SETPOLICY) != 0)
+				return (EPERM);
+			if (rtp->prio > RTP_PRIO_MAX)
+				return (EINVAL);
+			break;
+		case RTP_PRIO_NORMAL:
+			rtp->prio = 0;
+			break;
+		default:
+			return (EINVAL);
+		}
+	}
 
-	/* Initialize our td and new ksegrp.. */
+	/* Initialize our td */
 	newtd = thread_alloc();
+	if (newtd == NULL)
+		return (ENOMEM);
 
 	/*
 	 * Try the copyout as soon as we allocate the td so we don't
@@ -146,14 +187,14 @@
 	 * its storage, because child thread may exit quickly and
 	 * memory is freed before parent thread can access it.
 	 */
-	id = newtd->td_tid;
 	if ((child_tid != NULL &&
-	    (error = copyout(&id, child_tid, sizeof(long)))) ||
+	    suword_lwpid(child_tid, newtd->td_tid)) ||
 	    (parent_tid != NULL &&
-	    (error = copyout(&id, parent_tid, sizeof(long))))) {
-	    	thread_free(newtd);
-		return (error);
+	    suword_lwpid(parent_tid, newtd->td_tid))) {
+		thread_free(newtd);
+		return (EFAULT);
 	}
+
 	bzero(&newtd->td_startzero,
 	    __rangeof(struct thread, td_startzero, td_endzero));
 	bcopy(&td->td_startcopy, &newtd->td_startcopy,
@@ -185,70 +226,29 @@
 		}
 	}
 
-	if ((td->td_proc->p_flag & P_HADTHREADS) == 0) {
-		/* Treat initial thread as it has PTHREAD_SCOPE_PROCESS. */
-		p->p_procscopegrp = kg;
-		mtx_lock_spin(&sched_lock);
-		sched_set_concurrency(kg,
-		    thr_concurrency ? thr_concurrency : (2*mp_ncpus));
-		mtx_unlock_spin(&sched_lock);
-	}
-
-	linkkg = 0;
-	if (scope_sys) {
-		linkkg = 1;
-		newkg = ksegrp_alloc();
-		bzero(&newkg->kg_startzero,
-		    __rangeof(struct ksegrp, kg_startzero, kg_endzero));
-		bcopy(&kg->kg_startcopy, &newkg->kg_startcopy,
-		    __rangeof(struct ksegrp, kg_startcopy, kg_endcopy));
-		sched_init_concurrency(newkg);
-		PROC_LOCK(td->td_proc);
-	} else {
-		/*
-		 * Try to create a KSE group which will be shared
-		 * by all PTHREAD_SCOPE_PROCESS threads.
-		 */
-retry:
-		PROC_LOCK(td->td_proc);
-		if ((newkg = p->p_procscopegrp) == NULL) {
-			PROC_UNLOCK(p);
-			newkg = ksegrp_alloc();
-			bzero(&newkg->kg_startzero,
-			    __rangeof(struct ksegrp, kg_startzero, kg_endzero));
-			bcopy(&kg->kg_startcopy, &newkg->kg_startcopy,
-			    __rangeof(struct ksegrp, kg_startcopy, kg_endcopy));
-			PROC_LOCK(p);
-			if (p->p_procscopegrp == NULL) {
-				p->p_procscopegrp = newkg;
-				sched_init_concurrency(newkg);
-				sched_set_concurrency(newkg,
-				    thr_concurrency ? thr_concurrency : (2*mp_ncpus));
-				linkkg = 1;
-			} else {
-				PROC_UNLOCK(p);
-				ksegrp_free(newkg);
-				goto retry;
-			}
-		}
-	}
-
+	PROC_LOCK(td->td_proc);
 	td->td_proc->p_flag |= P_HADTHREADS;
 	newtd->td_sigmask = td->td_sigmask;
-	mtx_lock_spin(&sched_lock);
-	if (linkkg)
-		ksegrp_link(newkg, p);
-	thread_link(newtd, newkg);
-	PROC_UNLOCK(p);
-
+	PROC_SLOCK(p);
+	thread_link(newtd, p); 
+	thread_lock(td);
 	/* let the scheduler know about these things. */
-	if (linkkg)
-		sched_fork_ksegrp(td, newkg);
 	sched_fork_thread(td, newtd);
+	thread_unlock(td);
+	PROC_SUNLOCK(p);
+	PROC_UNLOCK(p);
+	thread_lock(newtd);
+	if (rtp != NULL) {
+		if (!(td->td_pri_class == PRI_TIMESHARE &&
+		      rtp->type == RTP_PRIO_NORMAL)) {
+			rtp_to_pri(rtp, newtd);
+			sched_prio(newtd, newtd->td_user_pri);
+		} /* ignore timesharing class */
+	}
 	TD_SET_CAN_RUN(newtd);
 	/* if ((flags & THR_SUSPENDED) == 0) */
-		setrunqueue(newtd, SRQ_BORING);
-	mtx_unlock_spin(&sched_lock);
+		sched_add(newtd, SRQ_BORING);
+	thread_unlock(newtd);
 
 	return (error);
 }
@@ -257,13 +257,11 @@
 thr_self(struct thread *td, struct thr_self_args *uap)
     /* long *id */
 {
-	long id;
 	int error;
 
-	id = td->td_tid;
-	if ((error = copyout(&id, uap->id, sizeof(long))))
-		return (error);
-
+	error = suword_lwpid(uap->id, (unsigned)td->td_tid);
+	if (error == -1)
+		return (EFAULT);
 	return (0);
 }
 
@@ -277,12 +275,13 @@
 
 	/* Signal userland that it can free the stack. */
 	if ((void *)uap->state != NULL) {
-		suword((void *)uap->state, 1);
+		suword_lwpid(uap->state, 1);
 		kern_umtx_wake(td, uap->state, INT_MAX);
 	}
 
 	PROC_LOCK(p);
-	mtx_lock_spin(&sched_lock);
+	sigqueue_flush(&td->td_sigqueue);
+	PROC_SLOCK(p);
 
 	/*
 	 * Shutting down last thread in the proc.  This will actually
@@ -293,7 +292,7 @@
 		thread_exit();
 		/* NOTREACHED */
 	}
-	mtx_unlock_spin(&sched_lock);
+	PROC_SUNLOCK(p);
 	PROC_UNLOCK(p);
 	return (0);
 }
@@ -319,17 +318,14 @@
 					error = 0;
 					if (uap->sig == 0)
 						break;
-					tdsignal(ttd, uap->sig, SIGTARGET_TD);
+					tdsignal(p, ttd, uap->sig, NULL);
 				}
 			}
 		}
 	} else {
-		if (uap->id != td->td_tid) {
-			FOREACH_THREAD_IN_PROC(p, ttd) {
-				if (ttd->td_tid == uap->id)
-					break;
-			}
-		} else
+		if (uap->id != td->td_tid)
+			ttd = thread_find(p, uap->id);
+		else
 			ttd = td;
 		if (ttd == NULL)
 			error = ESRCH;
@@ -338,7 +334,60 @@
 		else if (!_SIG_VALID(uap->sig))
 			error = EINVAL;
 		else
-			tdsignal(ttd, uap->sig, SIGTARGET_TD);
+			tdsignal(p, ttd, uap->sig, NULL);
+	}
+	PROC_UNLOCK(p);
+	return (error);
+}
+
+int
+thr_kill2(struct thread *td, struct thr_kill2_args *uap)
+    /* pid_t pid, long id, int sig */
+{
+	struct thread *ttd;
+	struct proc *p;
+	int error;
+
+	AUDIT_ARG(signum, uap->sig);
+
+	if (uap->pid == td->td_proc->p_pid) {
+		p = td->td_proc;
+		PROC_LOCK(p);
+	} else if ((p = pfind(uap->pid)) == NULL) {
+		return (ESRCH);
+	}
+	AUDIT_ARG(process, p);
+
+	error = p_cansignal(td, p, uap->sig);
+	if (error == 0) {
+		if (uap->id == -1) {
+			if (uap->sig != 0 && !_SIG_VALID(uap->sig)) {
+				error = EINVAL;
+			} else {
+				error = ESRCH;
+				FOREACH_THREAD_IN_PROC(p, ttd) {
+					if (ttd != td) {
+						error = 0;
+						if (uap->sig == 0)
+							break;
+						tdsignal(p, ttd, uap->sig, NULL);
+					}
+				}
+			}
+		} else {
+			if (uap->id != td->td_tid)
+				ttd = thread_find(p, uap->id);
+			else
+				ttd = td;
+			if (ttd == NULL)
+				error = ESRCH;
+			else if (uap->sig == 0)
+				;
+			else if (!_SIG_VALID(uap->sig))
+				error = EINVAL;
+			else
+				tdsignal(p, ttd, uap->sig, NULL);
+		}
 	}
 	PROC_UNLOCK(p);
 	return (error);
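
thr_kill2() is new in this sync: like thr_kill() it signals a single thread by tid, but it takes an explicit pid so that, subject to p_cansignal(), a thread in another process can be targeted, and id == -1 signals every thread in the process other than the calling thread.  A hedged userland sketch, assuming the prototype int thr_kill2(pid_t, long, int) from <sys/thr.h>:

	#include <sys/types.h>
	#include <sys/thr.h>
	#include <signal.h>

	/*
	 * Send SIGUSR1 to thread 'tid' of process 'pid'; with sig == 0 the
	 * call only checks that the thread exists, mirroring kill(2).
	 */
	static int
	poke_thread(pid_t pid, long tid)
	{
		return (thr_kill2(pid, tid, SIGUSR1));
	}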
@@ -348,33 +397,50 @@
 thr_suspend(struct thread *td, struct thr_suspend_args *uap)
 	/* const struct timespec *timeout */
 {
-	struct timespec ts;
-	struct timeval	tv;
+	struct timespec ts, *tsp;
 	int error;
-	int hz;
 
-	hz = 0;
 	error = 0;
+	tsp = NULL;
 	if (uap->timeout != NULL) {
 		error = copyin((const void *)uap->timeout, (void *)&ts,
 		    sizeof(struct timespec));
 		if (error != 0)
 			return (error);
-		if (ts.tv_nsec < 0 || ts.tv_nsec > 1000000000)
+		tsp = &ts;
+	}
+
+	return (kern_thr_suspend(td, tsp));
+}
+
+int
+kern_thr_suspend(struct thread *td, struct timespec *tsp)
+{
+	struct timeval tv;
+	int error = 0, hz = 0;
+
+	if (tsp != NULL) {
+		if (tsp->tv_nsec < 0 || tsp->tv_nsec > 1000000000)
 			return (EINVAL);
-		if (ts.tv_sec == 0 && ts.tv_nsec == 0)
+		if (tsp->tv_sec == 0 && tsp->tv_nsec == 0)
 			return (ETIMEDOUT);
-		TIMESPEC_TO_TIMEVAL(&tv, &ts);
+		TIMESPEC_TO_TIMEVAL(&tv, tsp);
 		hz = tvtohz(&tv);
 	}
+
+	if (td->td_pflags & TDP_WAKEUP) {
+		td->td_pflags &= ~TDP_WAKEUP;
+		return (0);
+	}
+
 	PROC_LOCK(td->td_proc);
 	if ((td->td_flags & TDF_THRWAKEUP) == 0)
-		error = msleep((void *)td, &td->td_proc->p_mtx,
-		    PCATCH, "lthr", hz);
+		error = msleep((void *)td, &td->td_proc->p_mtx, PCATCH, "lthr",
+		    hz);
 	if (td->td_flags & TDF_THRWAKEUP) {
-		mtx_lock_spin(&sched_lock);
+		thread_lock(td);
 		td->td_flags &= ~TDF_THRWAKEUP;
-		mtx_unlock_spin(&sched_lock);
+		thread_unlock(td);
 		PROC_UNLOCK(td->td_proc);
 		return (0);
 	}
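
The suspend path is now split: thr_suspend() only copies in the optional timespec, while kern_thr_suspend() validates it and sleeps, and a pending TDP_WAKEUP (set when a thread calls thr_wake() on itself, see the thr_wake() hunk below) short-circuits the sleep entirely.  A hedged userland example of a bounded suspend, assuming the <sys/thr.h> prototypes:

	#include <sys/thr.h>
	#include <time.h>

	/*
	 * Park the calling thread for at most one second; another thread can
	 * end the wait early with thr_wake(our_tid).  Returns 0 on wakeup,
	 * or an error on timeout or signal.
	 */
	static int
	nap(void)
	{
		struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };

		return (thr_suspend(&ts));
	}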
@@ -392,21 +458,54 @@
 thr_wake(struct thread *td, struct thr_wake_args *uap)
 	/* long id */
 {
+	struct proc *p;
 	struct thread *ttd;
 
-	PROC_LOCK(td->td_proc);
-	FOREACH_THREAD_IN_PROC(td->td_proc, ttd) {
-		if (ttd->td_tid == uap->id)
-			break;
-	}
+	if (uap->id == td->td_tid) {
+		td->td_pflags |= TDP_WAKEUP;
+		return (0);
+	} 
+
+	p = td->td_proc;
+	PROC_LOCK(p);
+	ttd = thread_find(p, uap->id);
 	if (ttd == NULL) {
-		PROC_UNLOCK(td->td_proc);
+		PROC_UNLOCK(p);
 		return (ESRCH);
 	}
-	mtx_lock_spin(&sched_lock);
+	thread_lock(ttd);
 	ttd->td_flags |= TDF_THRWAKEUP;
-	mtx_unlock_spin(&sched_lock);
+	thread_unlock(ttd);
 	wakeup((void *)ttd);
-	PROC_UNLOCK(td->td_proc);
+	PROC_UNLOCK(p);
 	return (0);
 }
+
+int
+thr_set_name(struct thread *td, struct thr_set_name_args *uap)
+{
+	struct proc *p = td->td_proc;
+	char name[MAXCOMLEN + 1];
+	struct thread *ttd;
+	int error;
+
+	error = 0;
+	name[0] = '\0';
+	if (uap->name != NULL) {
+		error = copyinstr(uap->name, name, sizeof(name),
+			NULL);
+		if (error)
+			return (error);
+	}
+	PROC_LOCK(p);
+	if (uap->id == td->td_tid)
+		ttd = td;
+	else
+		ttd = thread_find(p, uap->id);
+	if (ttd != NULL)
+		strcpy(ttd->td_name, name);
+	else 
+		error = ESRCH;
+	PROC_UNLOCK(p);
+	return (error);
+}
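
thr_set_name() is also new: it copies in a string of at most MAXCOMLEN characters (copyinstr() fails on anything longer) and stores it in td_name for the named thread, returning ESRCH if the tid is not found in the process.  A hedged example, assuming the prototype int thr_set_name(long, const char *):

	#include <sys/thr.h>

	/* Label the calling thread, best effort. */
	static void
	name_self(const char *label)
	{
		long tid;

		if (thr_self(&tid) == 0)
			(void)thr_set_name(tid, label);
	}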
Index: kern_event.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_event.c,v
retrieving revision 1.3
retrieving revision 1.4
diff -L sys/kern/kern_event.c -L sys/kern/kern_event.c -u -r1.3 -r1.4
--- sys/kern/kern_event.c
+++ sys/kern/kern_event.c
@@ -26,7 +26,9 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_event.c,v 1.93.2.3.2.1 2006/04/19 16:00:31 jhb Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_event.c,v 1.113 2007/07/14 21:23:30 rodrigc Exp $");
+
+#include "opt_ktrace.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -57,6 +59,9 @@
 #include <sys/syscallsubr.h>
 #include <sys/taskqueue.h>
 #include <sys/uio.h>
+#ifdef KTRACE
+#include <sys/ktrace.h>
+#endif
 
 #include <vm/uma.h>
 
@@ -83,7 +88,9 @@
 
 static int	kevent_copyout(void *arg, struct kevent *kevp, int count);
 static int	kevent_copyin(void *arg, struct kevent *kevp, int count);
-static int	kqueue_aquire(struct file *fp, struct kqueue **kqp);
+static int	kqueue_register(struct kqueue *kq, struct kevent *kev,
+		    struct thread *td, int waitok);
+static int	kqueue_acquire(struct file *fp, struct kqueue **kqp);
 static void	kqueue_release(struct kqueue *kq, int locked);
 static int	kqueue_expand(struct kqueue *kq, struct filterops *fops,
 		    uintptr_t ident, int waitok);
@@ -247,6 +254,7 @@
 	{ &timer_filtops },			/* EVFILT_TIMER */
 	{ &file_filtops },			/* EVFILT_NETDEV */
 	{ &fs_filtops },			/* EVFILT_FS */
+	{ &null_filtops },			/* EVFILT_LIO */
 };
 
 /*
@@ -388,6 +396,7 @@
 		if (!(kn->kn_status & KN_DETACHED))
 			knlist_remove_inevent(&p->p_klist, kn);
 		kn->kn_flags |= (EV_EOF | EV_ONESHOT);
+		kn->kn_data = p->p_xstat;
 		kn->kn_ptr.p_proc = NULL;
 		return (1);
 	}
@@ -497,9 +506,6 @@
 	return (kn->kn_data != 0);
 }
 
-/*
- * MPSAFE
- */
 int
 kqueue(struct thread *td, struct kqueue_args *uap)
 {
@@ -521,15 +527,15 @@
 	knlist_init(&kq->kq_sel.si_note, &kq->kq_lock, NULL, NULL, NULL);
 	TASK_INIT(&kq->kq_task, 0, kqueue_task, kq);
 
-	FILEDESC_LOCK_FAST(fdp);
+	FILEDESC_XLOCK(fdp);
 	SLIST_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list);
-	FILEDESC_UNLOCK_FAST(fdp);
+	FILEDESC_XUNLOCK(fdp);
 
 	FILE_LOCK(fp);
 	fp->f_flag = FREAD | FWRITE;
 	fp->f_type = DTYPE_KQUEUE;
-	fp->f_ops = &kqueueops;
 	fp->f_data = kq;
+	fp->f_ops = &kqueueops;
 	FILE_UNLOCK(fp);
 	fdrop(fp, td);
 
@@ -548,9 +554,6 @@
 	const struct timespec *timeout;
 };
 #endif
-/*
- * MPSAFE
- */
 int
 kevent(struct thread *td, struct kevent_args *uap)
 {
@@ -559,6 +562,12 @@
 					kevent_copyout,
 					kevent_copyin};
 	int error;
+#ifdef KTRACE
+	struct uio ktruio;
+	struct iovec ktriov;
+	struct uio *ktruioin = NULL;
+	struct uio *ktruioout = NULL;
+#endif
 
 	if (uap->timeout != NULL) {
 		error = copyin(uap->timeout, &ts, sizeof(ts));
@@ -568,8 +577,33 @@
 	} else
 		tsp = NULL;
 
-	return (kern_kevent(td, uap->fd, uap->nchanges, uap->nevents,
-	    &k_ops, tsp));
+#ifdef KTRACE
+	if (KTRPOINT(td, KTR_GENIO)) {
+		ktriov.iov_base = uap->changelist;
+		ktriov.iov_len = uap->nchanges * sizeof(struct kevent);
+		ktruio = (struct uio){ .uio_iov = &ktriov, .uio_iovcnt = 1,
+		    .uio_segflg = UIO_USERSPACE, .uio_rw = UIO_READ,
+		    .uio_td = td };
+		ktruioin = cloneuio(&ktruio);
+		ktriov.iov_base = uap->eventlist;
+		ktriov.iov_len = uap->nevents * sizeof(struct kevent);
+		ktruioout = cloneuio(&ktruio);
+	}
+#endif
+
+	error = kern_kevent(td, uap->fd, uap->nchanges, uap->nevents,
+	    &k_ops, tsp);
+
+#ifdef KTRACE
+	if (ktruioin != NULL) {
+		ktruioin->uio_resid = uap->nchanges * sizeof(struct kevent);
+		ktrgenio(uap->fd, UIO_WRITE, ktruioin, 0);
+		ktruioout->uio_resid = td->td_retval[0] * sizeof(struct kevent);
+		ktrgenio(uap->fd, UIO_READ, ktruioout, error);
+	}
+#endif
+
+	return (error);
 }

 
 /*
@@ -620,7 +654,7 @@
 
 	if ((error = fget(td, fd, &fp)) != 0)
 		return (error);
-	if ((error = kqueue_aquire(fp, &kq)) != 0)
+	if ((error = kqueue_acquire(fp, &kq)) != 0)
 		goto done_norel;
 
 	nerrors = 0;
@@ -633,6 +667,8 @@
 		changes = keva;
 		for (i = 0; i < n; i++) {
 			kevp = &changes[i];
+			if (!kevp->filter)
+				continue;
 			kevp->flags &= ~EV_SYSFLAGS;
 			error = kqueue_register(kq, kevp, td, 1);
 			if (error) {
@@ -660,8 +696,7 @@
 done:
 	kqueue_release(kq, 0);
 done_norel:
-	if (fp != NULL)
-		fdrop(fp, td);
+	fdrop(fp, td);
 	return (error);
 }
 
@@ -744,22 +779,19 @@
 }
 
 /*
- * A ref to kq (obtained via kqueue_aquire) should be held.  waitok will
+ * A ref to kq (obtained via kqueue_acquire) must be held.  waitok will
  * influence if memory allocation should wait.  Make sure it is 0 if you
  * hold any mutexes.
  */
-int
+static int
 kqueue_register(struct kqueue *kq, struct kevent *kev, struct thread *td, int waitok)
 {
-	struct filedesc *fdp;
 	struct filterops *fops;
 	struct file *fp;
 	struct knote *kn, *tkn;
 	int error, filt, event;
 	int haskqglobal;
-	int fd;
 
-	fdp = NULL;
 	fp = NULL;
 	kn = NULL;
 	error = 0;
@@ -775,22 +807,13 @@
 findkn:
 	if (fops->f_isfd) {
 		KASSERT(td != NULL, ("td is NULL"));
-		fdp = td->td_proc->p_fd;
-		FILEDESC_LOCK(fdp);
-		/* validate descriptor */
-		fd = kev->ident;
-		if (fd < 0 || fd >= fdp->fd_nfiles ||
-		    (fp = fdp->fd_ofiles[fd]) == NULL) {
-			FILEDESC_UNLOCK(fdp);
-			error = EBADF;
+		error = fget(td, kev->ident, &fp);
+		if (error)
 			goto done;
-		}
-		fhold(fp);
 
 		if ((kev->flags & EV_ADD) == EV_ADD && kqueue_expand(kq, fops,
 		    kev->ident, 0) != 0) {
-			/* unlock and try again */
-			FILEDESC_UNLOCK(fdp);
+			/* try again */
 			fdrop(fp, td);
 			fp = NULL;
 			error = kqueue_expand(kq, fops, kev->ident, waitok);
@@ -808,15 +831,13 @@
 			 * they are the same thing.
 			 */
 			if (fp->f_data == kq) {
-				FILEDESC_UNLOCK(fdp);
 				error = EINVAL;
-				goto done_noglobal;
+				goto done;
 			}
 
 			KQ_GLOBAL_LOCK(&kq_global, haskqglobal);
 		}
 
-		FILEDESC_UNLOCK(fdp);
 		KQ_LOCK(kq);
 		if (kev->ident < kq->kq_knlistsize) {
 			SLIST_FOREACH(kn, &kq->kq_knlist[kev->ident], kn_link)
@@ -866,6 +887,7 @@
 			kn = tkn;
 			tkn = NULL;
 			if (kn == NULL) {
+				KQ_UNLOCK(kq);
 				error = ENOMEM;
 				goto done;
 			}
@@ -951,7 +973,6 @@
 
 done:
 	KQ_GLOBAL_UNLOCK(&kq_global, haskqglobal);
-done_noglobal:
 	if (fp != NULL)
 		fdrop(fp, td);
 	if (tkn != NULL)
@@ -962,7 +983,7 @@
 }
 
 static int
-kqueue_aquire(struct file *fp, struct kqueue **kqp)
+kqueue_acquire(struct file *fp, struct kqueue **kqp)
 {
 	int error;
 	struct kqueue *kq;
@@ -1370,7 +1391,7 @@
 	int revents = 0;
 	int error;
 
-	if ((error = kqueue_aquire(fp, &kq)))
+	if ((error = kqueue_acquire(fp, &kq)))
 		return POLLERR;
 
 	KQ_LOCK(kq);
@@ -1415,7 +1436,7 @@
 	int i;
 	int error;
 
-	if ((error = kqueue_aquire(fp, &kq)))
+	if ((error = kqueue_acquire(fp, &kq)))
 		return error;
 
 	KQ_LOCK(kq);
@@ -1471,9 +1492,9 @@
 
 	KQ_UNLOCK(kq);
 
-	FILEDESC_LOCK_FAST(fdp);
+	FILEDESC_XLOCK(fdp);
 	SLIST_REMOVE(&fdp->fd_kqlist, kq, kqueue, kq_list);
-	FILEDESC_UNLOCK_FAST(fdp);
+	FILEDESC_XUNLOCK(fdp);
 
 	knlist_destroy(&kq->kq_sel.si_note);
 	mtx_destroy(&kq->kq_lock);
@@ -1669,7 +1690,7 @@
 		knl->kl_lock = knlist_mtx_lock;
 	else
 		knl->kl_lock = kl_lock;
-	if (kl_lock == NULL)
+	if (kl_unlock == NULL)
 		knl->kl_unlock = knlist_mtx_unlock;
 	else
 		knl->kl_unlock = kl_unlock;
@@ -1705,18 +1726,18 @@
 void
 knlist_cleardel(struct knlist *knl, struct thread *td, int islocked, int killkn)
 {
-	struct knote *kn;
+	struct knote *kn, *kn2;
 	struct kqueue *kq;
 
 	if (islocked)
 		KNL_ASSERT_LOCKED(knl);
 	else {
 		KNL_ASSERT_UNLOCKED(knl);
-again:		/* need to reaquire lock since we have dropped it */
+again:		/* need to reacquire lock since we have dropped it */
 		knl->kl_lock(knl->kl_lockarg);
 	}
 
-	SLIST_FOREACH(kn, &knl->kl_list, kn_selnext) {
+	SLIST_FOREACH_SAFE(kn, &knl->kl_list, kn_selnext, kn2) {
 		kq = kn->kn_kq;
 		KQ_LOCK(kq);
 		if ((kn->kn_status & KN_INFLUX)) {
@@ -1759,9 +1780,9 @@
 }
 
 /*
- * remove all knotes referencing a specified fd
- * must be called with FILEDESC lock.  This prevents a race where a new fd
- * comes along and occupies the entry and we attach a knote to the fd.
+ * Remove all knotes referencing a specified fd; must be called with the
+ * FILEDESC lock held.  This prevents a race where a new fd comes along and
+ * occupies the entry and we attach a knote to the fd.
  */
 void
 knote_fdclose(struct thread *td, int fd)
@@ -1771,7 +1792,7 @@
 	struct knote *kn;
 	int influx;
 
-	FILEDESC_LOCK_ASSERT(fdp, MA_OWNED);
+	FILEDESC_XLOCK_ASSERT(fdp);
 
 	/*
 	 * We shouldn't have to worry about new kevents appearing on fd
@@ -1828,7 +1849,7 @@
 }
 
 /*
- * knote must already have been detatched using the f_detach method.
+ * knote must already have been detached using the f_detach method.
  * no lock need to be held, it is assumed that the KN_INFLUX flag is set
  * to prevent other removal.
  */
@@ -1850,7 +1871,8 @@
 	else
 		list = &kq->kq_knhash[KN_HASH(kn->kn_id, kq->kq_knhashmask)];
 
-	SLIST_REMOVE(list, kn, knote, kn_link);
+	if (!SLIST_EMPTY(list))
+		SLIST_REMOVE(list, kn, knote, kn_link);
 	if (kn->kn_status & KN_QUEUED)
 		knote_dequeue(kn);
 	KQ_UNLOCK_FLUX(kq);
@@ -1913,3 +1935,28 @@
 	if (kn != NULL)
 		uma_zfree(knote_zone, kn);
 }
+
+/*
+ * Register the kev w/ the kq specified by fd.
+ */
+int 
+kqfd_register(int fd, struct kevent *kev, struct thread *td, int waitok)
+{
+	struct kqueue *kq;
+	struct file *fp;
+	int error;
+
+	if ((error = fget(td, fd, &fp)) != 0)
+		return (error);
+	if ((error = kqueue_acquire(fp, &kq)) != 0)
+		goto noacquire;
+
+	error = kqueue_register(kq, kev, td, waitok);
+
+	kqueue_release(kq, 0);
+
+noacquire:
+	fdrop(fp, td);
+
+	return error;
+}
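
Most of the kern_event.c changes are internal: the kqueue_aquire() -> kqueue_acquire() rename, the FILEDESC_XLOCK conversion, ktrace GENIO records around kevent(2), the EVFILT_LIO placeholder, and kqfd_register() so that other kernel subsystems can register a kevent on a kqueue identified only by file descriptor.  The userland contract is unchanged; for reference, a minimal consumer that these paths service:

	#include <sys/types.h>
	#include <sys/event.h>
	#include <sys/time.h>
	#include <unistd.h>

	/* Block until fd becomes readable, via a throwaway kqueue. */
	static int
	wait_readable(int fd)
	{
		struct kevent change, event;
		int kq, n;

		kq = kqueue();
		if (kq == -1)
			return (-1);
		EV_SET(&change, fd, EVFILT_READ, EV_ADD | EV_ENABLE, 0, 0, NULL);
		n = kevent(kq, &change, 1, &event, 1, NULL);	/* register and wait */
		close(kq);
		return (n == 1 ? 0 : -1);
	}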
Index: uipc_usrreq.c
===================================================================
RCS file: /home/cvs/src/sys/kern/uipc_usrreq.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/uipc_usrreq.c -L sys/kern/uipc_usrreq.c -u -r1.2 -r1.3
--- sys/kern/uipc_usrreq.c
+++ sys/kern/uipc_usrreq.c
@@ -1,7 +1,7 @@
 /*-
  * Copyright (c) 1982, 1986, 1989, 1991, 1993
  *	The Regents of the University of California.
- * Copyright 2004-2005 Robert N. M. Watson
+ * Copyright (c) 2004-2007 Robert N. M. Watson
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -31,21 +31,47 @@
  *	From: @(#)uipc_usrreq.c	8.3 (Berkeley) 1/4/94
  */
 
+/*
+ * UNIX Domain (Local) Sockets
+ *
+ * This is an implementation of UNIX (local) domain sockets.  Each socket has
+ * an associated struct unpcb (UNIX protocol control block).  Stream sockets
+ * may be connected to 0 or 1 other socket.  Datagram sockets may be
+ * connected to 0, 1, or many other sockets.  Sockets may be created and
+ * connected in pairs (socketpair(2)), or bound/connected to using the file
+ * system name space.  For most purposes, only the receive socket buffer is
+ * used, as sending on one socket delivers directly to the receive socket
+ * buffer of a second socket.
+ *
+ * The implementation is substantially complicated by the fact that
+ * "ancillary data", such as file descriptors or credentials, may be passed
+ * across UNIX domain sockets.  The potential for passing UNIX domain sockets
+ * over other UNIX domain sockets requires the implementation of a simple
+ * garbage collector to find and tear down cycles of disconnected sockets.
+ *
+ * TODO:
+ *	SEQPACKET, RDM
+ *	rethink name space problems
+ *	need a proper out-of-band
+ *	lock pushdown
+ */
+
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/uipc_usrreq.c,v 1.155.2.3 2006/03/13 03:06:03 jeff Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/uipc_usrreq.c,v 1.206.4.1 2008/01/23 12:08:12 rwatson Exp $");
 
+#include "opt_ddb.h"
 #include "opt_mac.h"
 
 #include <sys/param.h>
 #include <sys/domain.h>
 #include <sys/fcntl.h>
 #include <sys/malloc.h>		/* XXX must be before <sys/file.h> */
+#include <sys/eventhandler.h>
 #include <sys/file.h>
 #include <sys/filedesc.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
-#include <sys/mac.h>
 #include <sys/mbuf.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
@@ -53,6 +79,7 @@
 #include <sys/proc.h>
 #include <sys/protosw.h>
 #include <sys/resourcevar.h>
+#include <sys/rwlock.h>
 #include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/signalvar.h>
@@ -65,315 +92,653 @@
 #include <sys/unpcb.h>
 #include <sys/vnode.h>
 
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
+#include <security/mac/mac_framework.h>
+
 #include <vm/uma.h>
 
-static uma_zone_t unp_zone;
-static	unp_gen_t unp_gencnt;
-static	u_int unp_count;
+static uma_zone_t	unp_zone;
+static unp_gen_t	unp_gencnt;
+static u_int		unp_count;	/* Count of local sockets. */
+static ino_t		unp_ino;	/* Prototype for fake inode numbers. */
+static int		unp_rights;	/* File descriptors in flight. */
+static struct unp_head	unp_shead;	/* List of local stream sockets. */
+static struct unp_head	unp_dhead;	/* List of local datagram sockets. */
 
-static	struct unp_head unp_shead, unp_dhead;
+static const struct sockaddr	sun_noname = { sizeof(sun_noname), AF_LOCAL };
 
 /*
- * Unix communications domain.
- *
- * TODO:
- *	SEQPACKET, RDM
- *	rethink name space problems
- *	need a proper out-of-band
- *	lock pushdown
+ * Garbage collection of cyclic file descriptor/socket references occurs
+ * asynchronously in a taskqueue context in order to avoid recursion and
+ * reentrance in the UNIX domain socket, file descriptor, and socket layer
+ * code.  See unp_gc() for a full description.
  */
-static const struct	sockaddr sun_noname = { sizeof(sun_noname), AF_LOCAL };
-static ino_t	unp_ino;		/* prototype for fake inode numbers */
-struct mbuf *unp_addsockcred(struct thread *, struct mbuf *);
+static struct task	unp_gc_task;
 
 /*
- * Currently, UNIX domain sockets are protected by a single subsystem lock,
- * which covers global data structures and variables, the contents of each
- * per-socket unpcb structure, and the so_pcb field in sockets attached to
- * the UNIX domain.  This provides for a moderate degree of paralellism, as
- * receive operations on UNIX domain sockets do not need to acquire the
- * subsystem lock.  Finer grained locking to permit send() without acquiring
- * a global lock would be a logical next step.
+ * Both send and receive buffers are allocated PIPSIZ bytes of buffering for
+ * stream sockets, although the total for sender and receiver is actually
+ * only PIPSIZ.
  *
- * The UNIX domain socket lock preceds all socket layer locks, including the
- * socket lock and socket buffer lock, permitting UNIX domain socket code to
- * call into socket support routines without releasing its locks.
+ * Datagram sockets really use the sendspace as the maximum datagram size,
+ * and don't really want to reserve the sendspace.  Their recvspace should be
+ * large enough for at least one max-size datagram plus address.
+ */
+#ifndef PIPSIZ
+#define	PIPSIZ	8192
+#endif
+static u_long	unpst_sendspace = PIPSIZ;
+static u_long	unpst_recvspace = PIPSIZ;
+static u_long	unpdg_sendspace = 2*1024;	/* really max datagram size */
+static u_long	unpdg_recvspace = 4*1024;
+
+SYSCTL_NODE(_net, PF_LOCAL, local, CTLFLAG_RW, 0, "Local domain");
+SYSCTL_NODE(_net_local, SOCK_STREAM, stream, CTLFLAG_RW, 0, "SOCK_STREAM");
+SYSCTL_NODE(_net_local, SOCK_DGRAM, dgram, CTLFLAG_RW, 0, "SOCK_DGRAM");
+
+SYSCTL_ULONG(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW,
+	   &unpst_sendspace, 0, "");
+SYSCTL_ULONG(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW,
+	   &unpst_recvspace, 0, "");
+SYSCTL_ULONG(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW,
+	   &unpdg_sendspace, 0, "");
+SYSCTL_ULONG(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW,
+	   &unpdg_recvspace, 0, "");
+SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0, "");
+
+/*-
+ * Locking and synchronization:
+ *
+ * The global UNIX domain socket rwlock (unp_global_rwlock) protects all
+ * global variables, including the linked lists tracking the set of allocated
+ * UNIX domain sockets.  The global rwlock also serves to prevent deadlock
+ * when more than one PCB lock is acquired at a time (i.e., during
+ * connect()).  Finally, the global rwlock protects uncounted references from
+ * vnodes to sockets bound to those vnodes: to safely dereference the
+ * v_socket pointer, the global rwlock must be held while a full reference is
+ * acquired.
  *
- * Some caution is required in areas where the UNIX domain socket code enters
- * VFS in order to create or find rendezvous points.  This results in
- * dropping of the UNIX domain socket subsystem lock, acquisition of the
- * Giant lock, and potential sleeping.  This increases the chances of races,
- * and exposes weaknesses in the socket->protocol API by offering poor
- * failure modes.
+ * UNIX domain sockets each have an unpcb hung off of their so_pcb pointer,
+ * allocated in pru_attach() and freed in pru_detach().  The validity of that
+ * pointer is an invariant, so no lock is required to dereference the so_pcb
+ * pointer if a valid socket reference is held by the caller.  In practice,
+ * this is always true during operations performed on a socket.  Each unpcb
+ * has a back-pointer to its socket, unp_socket, which will be stable under
+ * the same circumstances.
+ *
+ * This pointer may only be safely dereferenced as long as a valid reference
+ * to the unpcb is held.  Typically, this reference will be from the socket,
+ * or from another unpcb when the referring unpcb's lock is held (in order
+ * that the reference not be invalidated during use).  For example, to follow
+ * unp->unp_conn->unp_socket, you need to hold the lock on unp, not unp_conn,
+ * as unp_socket remains valid as long as the reference to unp_conn is valid.
+ *
+ * Fields of unpcbs are locked using a per-unpcb lock, unp_mtx.  Individual
+ * atomic reads without the lock may be performed "lockless", but more
+ * complex reads and read-modify-writes require the mutex to be held.  No
+ * lock order is defined between unpcb locks -- multiple unpcb locks may be
+ * acquired at the same time only when holding the global UNIX domain socket
+ * rwlock exclusively, which prevents deadlocks.
+ *
+ * Blocking with UNIX domain sockets is a tricky issue: unlike most network
+ * protocols, bind() is a non-atomic operation, and connect() requires
+ * potential sleeping in the protocol, due to potentially waiting on local or
+ * distributed file systems.  We try to separate "lookup" operations, which
+ * may sleep, and the IPC operations themselves, which typically can occur
+ * with relative atomicity as locks can be held over the entire operation.
+ *
+ * Another tricky issue is simultaneous multi-threaded or multi-process
+ * access to a single UNIX domain socket.  These are handled by the flags
+ * UNP_CONNECTING and UNP_BINDING, which prevent concurrent connecting or
+ * binding, both of which involve dropping UNIX domain socket locks in order
+ * to perform namei() and other file system operations.
  */
-static struct mtx unp_mtx;
-#define	UNP_LOCK_INIT() \
-	mtx_init(&unp_mtx, "unp", NULL, MTX_DEF)
-#define	UNP_LOCK()		mtx_lock(&unp_mtx)
-#define	UNP_UNLOCK()		mtx_unlock(&unp_mtx)
-#define	UNP_LOCK_ASSERT()	mtx_assert(&unp_mtx, MA_OWNED)
-#define	UNP_UNLOCK_ASSERT()	mtx_assert(&unp_mtx, MA_NOTOWNED)
+static struct rwlock	unp_global_rwlock;
+
+#define	UNP_GLOBAL_LOCK_INIT()		rw_init(&unp_global_rwlock,	\
+					    "unp_global_rwlock")
+
+#define	UNP_GLOBAL_LOCK_ASSERT()	rw_assert(&unp_global_rwlock,	\
+					    RA_LOCKED)
+#define	UNP_GLOBAL_UNLOCK_ASSERT()	rw_assert(&unp_global_rwlock,	\
+					    RA_UNLOCKED)
+
+#define	UNP_GLOBAL_WLOCK()		rw_wlock(&unp_global_rwlock)
+#define	UNP_GLOBAL_WUNLOCK()		rw_wunlock(&unp_global_rwlock)
+#define	UNP_GLOBAL_WLOCK_ASSERT()	rw_assert(&unp_global_rwlock,	\
+					    RA_WLOCKED)
+#define	UNP_GLOBAL_WOWNED()		rw_wowned(&unp_global_rwlock)
+
+#define	UNP_GLOBAL_RLOCK()		rw_rlock(&unp_global_rwlock)
+#define	UNP_GLOBAL_RUNLOCK()		rw_runlock(&unp_global_rwlock)
+#define	UNP_GLOBAL_RLOCK_ASSERT()	rw_assert(&unp_global_rwlock,	\
+					    RA_RLOCKED)
+
+#define UNP_PCB_LOCK_INIT(unp)		mtx_init(&(unp)->unp_mtx,	\
+					    "unp_mtx", "unp_mtx",	\
+					    MTX_DUPOK|MTX_DEF|MTX_RECURSE)
+#define	UNP_PCB_LOCK_DESTROY(unp)	mtx_destroy(&(unp)->unp_mtx)
+#define	UNP_PCB_LOCK(unp)		mtx_lock(&(unp)->unp_mtx)
+#define	UNP_PCB_UNLOCK(unp)		mtx_unlock(&(unp)->unp_mtx)
+#define	UNP_PCB_LOCK_ASSERT(unp)	mtx_assert(&(unp)->unp_mtx, MA_OWNED)
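
The long comment above is the heart of this rewrite: one global rwlock ordering everything, plus a mutex per unpcb.  A hedged sketch of the pattern the rest of the file follows whenever it must look at both ends of a connection (the global write lock is what makes holding two pcb locks deadlock-free); this mirrors uipc_close() and uipc_disconnect() below rather than being code from the patch:

	UNP_GLOBAL_WLOCK();		/* stabilizes unp_conn linkage       */
	UNP_PCB_LOCK(unp);
	unp2 = unp->unp_conn;
	if (unp2 != NULL) {
		UNP_PCB_LOCK(unp2);	/* second pcb lock, safe under W-lock */
		/* ... operate on both ends ... */
		UNP_PCB_UNLOCK(unp2);
	}
	UNP_PCB_UNLOCK(unp);
	UNP_GLOBAL_WUNLOCK();

Read-mostly paths, such as uipc_sense() and the common uipc_send() case, take UNP_GLOBAL_RLOCK() instead and touch a single pcb lock.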
+
+static int	unp_connect(struct socket *, struct sockaddr *,
+		    struct thread *);
+static int	unp_connect2(struct socket *so, struct socket *so2, int);
+static void	unp_disconnect(struct unpcb *unp, struct unpcb *unp2);
+static void	unp_shutdown(struct unpcb *);
+static void	unp_drop(struct unpcb *, int);
+static void	unp_gc(__unused void *, int);
+static void	unp_scan(struct mbuf *, void (*)(struct file *));
+static void	unp_mark(struct file *);
+static void	unp_discard(struct file *);
+static void	unp_freerights(struct file **, int);
+static int	unp_internalize(struct mbuf **, struct thread *);
+static struct mbuf	*unp_addsockcred(struct thread *, struct mbuf *);
 
 /*
- * Garbage collection of cyclic file descriptor/socket references occurs
- * asynchronously in a taskqueue context in order to avoid recursion and
- * reentrance in the UNIX domain socket, file descriptor, and socket layer
- * code.  See unp_gc() for a full description.
+ * Definitions of protocols supported in the LOCAL domain.
  */
-static struct task	unp_gc_task;
+static struct domain localdomain;
+static struct protosw localsw[] = {
+{
+	.pr_type =		SOCK_STREAM,
+	.pr_domain =		&localdomain,
+	.pr_flags =		PR_CONNREQUIRED|PR_WANTRCVD|PR_RIGHTS,
+	.pr_ctloutput =		&uipc_ctloutput,
+	.pr_usrreqs =		&uipc_usrreqs
+},
+{
+	.pr_type =		SOCK_DGRAM,
+	.pr_domain =		&localdomain,
+	.pr_flags =		PR_ATOMIC|PR_ADDR|PR_RIGHTS,
+	.pr_usrreqs =		&uipc_usrreqs
+},
+};
 
-static int     unp_attach(struct socket *);
-static void    unp_detach(struct unpcb *);
-static int     unp_bind(struct unpcb *,struct sockaddr *, struct thread *);
-static int     unp_connect(struct socket *,struct sockaddr *, struct thread *);
-static int     unp_connect2(struct socket *so, struct socket *so2, int);
-static void    unp_disconnect(struct unpcb *);
-static void    unp_shutdown(struct unpcb *);
-static void    unp_drop(struct unpcb *, int);
-static void    unp_gc(__unused void *, int);
-static void    unp_scan(struct mbuf *, void (*)(struct file *));
-static void    unp_mark(struct file *);
-static void    unp_discard(struct file *);
-static void    unp_freerights(struct file **, int);
-static int     unp_internalize(struct mbuf **, struct thread *);
-static int     unp_listen(struct socket *, struct unpcb *, struct thread *);
+static struct domain localdomain = {
+	.dom_family =		AF_LOCAL,
+	.dom_name =		"local",
+	.dom_init =		unp_init,
+	.dom_externalize =	unp_externalize,
+	.dom_dispose =		unp_dispose,
+	.dom_protosw =		localsw,
+	.dom_protoswNPROTOSW =	&localsw[sizeof(localsw)/sizeof(localsw[0])]
+};
+DOMAIN_SET(local);
 
-static int
+static void
 uipc_abort(struct socket *so)
 {
-	struct unpcb *unp;
+	struct unpcb *unp, *unp2;
 
-	UNP_LOCK();
 	unp = sotounpcb(so);
-	if (unp == NULL) {
-		UNP_UNLOCK();
-		return (EINVAL);
+	KASSERT(unp != NULL, ("uipc_abort: unp == NULL"));
+
+	UNP_GLOBAL_WLOCK();
+	UNP_PCB_LOCK(unp);
+	unp2 = unp->unp_conn;
+	if (unp2 != NULL) {
+		UNP_PCB_LOCK(unp2);
+		unp_drop(unp2, ECONNABORTED);
+		UNP_PCB_UNLOCK(unp2);
 	}
-	unp_drop(unp, ECONNABORTED);
-	unp_detach(unp);
-	UNP_UNLOCK_ASSERT();
-	ACCEPT_LOCK();
-	SOCK_LOCK(so);
-	sotryfree(so);
-	return (0);
+	UNP_PCB_UNLOCK(unp);
+	UNP_GLOBAL_WUNLOCK();
 }
 
 static int
 uipc_accept(struct socket *so, struct sockaddr **nam)
 {
-	struct unpcb *unp;
+	struct unpcb *unp, *unp2;
 	const struct sockaddr *sa;
 
 	/*
-	 * Pass back name of connected socket,
-	 * if it was bound and we are still connected
-	 * (our peer may have closed already!).
+	 * Pass back name of connected socket, if it was bound and we are
+	 * still connected (our peer may have closed already!).
 	 */
-	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
-	UNP_LOCK();
 	unp = sotounpcb(so);
-	if (unp == NULL) {
-		UNP_UNLOCK();
-		free(*nam, M_SONAME);
-		*nam = NULL;
-		return (EINVAL);
-	}
-	if (unp->unp_conn != NULL && unp->unp_conn->unp_addr != NULL)
-		sa = (struct sockaddr *) unp->unp_conn->unp_addr;
-	else
+	KASSERT(unp != NULL, ("uipc_accept: unp == NULL"));
+
+	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
+	UNP_GLOBAL_RLOCK();
+	unp2 = unp->unp_conn;
+	if (unp2 != NULL && unp2->unp_addr != NULL) {
+		UNP_PCB_LOCK(unp2);
+		sa = (struct sockaddr *) unp2->unp_addr;
+		bcopy(sa, *nam, sa->sa_len);
+		UNP_PCB_UNLOCK(unp2);
+	} else {
 		sa = &sun_noname;
-	bcopy(sa, *nam, sa->sa_len);
-	UNP_UNLOCK();
+		bcopy(sa, *nam, sa->sa_len);
+	}
+	UNP_GLOBAL_RUNLOCK();
 	return (0);
 }
 
 static int
 uipc_attach(struct socket *so, int proto, struct thread *td)
 {
-	struct unpcb *unp = sotounpcb(so);
+	u_long sendspace, recvspace;
+	struct unpcb *unp;
+	int error, locked;
+
+	KASSERT(so->so_pcb == NULL, ("uipc_attach: so_pcb != NULL"));
+	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
+		switch (so->so_type) {
+		case SOCK_STREAM:
+			sendspace = unpst_sendspace;
+			recvspace = unpst_recvspace;
+			break;
+
+		case SOCK_DGRAM:
+			sendspace = unpdg_sendspace;
+			recvspace = unpdg_recvspace;
+			break;
+
+		default:
+			panic("uipc_attach");
+		}
+		error = soreserve(so, sendspace, recvspace);
+		if (error)
+			return (error);
+	}
+	unp = uma_zalloc(unp_zone, M_NOWAIT | M_ZERO);
+	if (unp == NULL)
+		return (ENOBUFS);
+	LIST_INIT(&unp->unp_refs);
+	UNP_PCB_LOCK_INIT(unp);
+	unp->unp_socket = so;
+	so->so_pcb = unp;
+	unp->unp_refcount = 1;
+
+	/*
+	 * uipc_attach() may be called indirectly from within the UNIX domain
+	 * socket code via sonewconn() in unp_connect().  Since rwlocks can
+	 * not be recursed, we do the closest thing.
+	 */
+	locked = 0;
+	if (!UNP_GLOBAL_WOWNED()) {
+		UNP_GLOBAL_WLOCK();
+		locked = 1;
+	}
+	unp->unp_gencnt = ++unp_gencnt;
+	unp_count++;
+	LIST_INSERT_HEAD(so->so_type == SOCK_DGRAM ? &unp_dhead : &unp_shead,
+	    unp, unp_link);
+	if (locked)
+		UNP_GLOBAL_WUNLOCK();
 
-	if (unp != NULL)
-		return (EISCONN);
-	return (unp_attach(so));
+	return (0);
 }
 
 static int
 uipc_bind(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
+	struct sockaddr_un *soun = (struct sockaddr_un *)nam;
+	struct vattr vattr;
+	int error, namelen, vfslocked;
+	struct nameidata nd;
 	struct unpcb *unp;
-	int error;
+	struct vnode *vp;
+	struct mount *mp;
+	char *buf;
 
-	UNP_LOCK();
 	unp = sotounpcb(so);
-	if (unp == NULL) {
-		UNP_UNLOCK();
+	KASSERT(unp != NULL, ("uipc_bind: unp == NULL"));
+
+	namelen = soun->sun_len - offsetof(struct sockaddr_un, sun_path);
+	if (namelen <= 0)
 		return (EINVAL);
+
+	/*
+	 * We don't allow simultaneous bind() calls on a single UNIX domain
+	 * socket, so flag in-progress operations, and return an error if an
+	 * operation is already in progress.
+	 *
+	 * Historically, we have not allowed a socket to be rebound, so this
+	 * also returns an error.  Not allowing re-binding simplifies the
+	 * implementation and avoids a great many possible failure modes.
+	 */
+	UNP_PCB_LOCK(unp);
+	if (unp->unp_vnode != NULL) {
+		UNP_PCB_UNLOCK(unp);
+		return (EINVAL);
+	}
+	if (unp->unp_flags & UNP_BINDING) {
+		UNP_PCB_UNLOCK(unp);
+		return (EALREADY);
+	}
+	unp->unp_flags |= UNP_BINDING;
+	UNP_PCB_UNLOCK(unp);
+
+	buf = malloc(namelen + 1, M_TEMP, M_WAITOK);
+	strlcpy(buf, soun->sun_path, namelen + 1);
+
+restart:
+	vfslocked = 0;
+	NDINIT(&nd, CREATE, MPSAFE | NOFOLLOW | LOCKPARENT | SAVENAME,
+	    UIO_SYSSPACE, buf, td);
+/* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
+	error = namei(&nd);
+	if (error)
+		goto error;
+	vp = nd.ni_vp;
+	vfslocked = NDHASGIANT(&nd);
+	if (vp != NULL || vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
+		NDFREE(&nd, NDF_ONLY_PNBUF);
+		if (nd.ni_dvp == vp)
+			vrele(nd.ni_dvp);
+		else
+			vput(nd.ni_dvp);
+		if (vp != NULL) {
+			vrele(vp);
+			error = EADDRINUSE;
+			goto error;
+		}
+		error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH);
+		if (error)
+			goto error;
+		VFS_UNLOCK_GIANT(vfslocked);
+		goto restart;
+	}
+	VATTR_NULL(&vattr);
+	vattr.va_type = VSOCK;
+	vattr.va_mode = (ACCESSPERMS & ~td->td_proc->p_fd->fd_cmask);
+#ifdef MAC
+	error = mac_check_vnode_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
+	    &vattr);
+#endif
+	if (error == 0) {
+		VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE);
+		error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
+	}
+	NDFREE(&nd, NDF_ONLY_PNBUF);
+	vput(nd.ni_dvp);
+	if (error) {
+		vn_finished_write(mp);
+		goto error;
 	}
-	error = unp_bind(unp, nam, td);
-	UNP_UNLOCK();
+	vp = nd.ni_vp;
+	ASSERT_VOP_ELOCKED(vp, "uipc_bind");
+	soun = (struct sockaddr_un *)sodupsockaddr(nam, M_WAITOK);
+
+	UNP_GLOBAL_WLOCK();
+	UNP_PCB_LOCK(unp);
+	vp->v_socket = unp->unp_socket;
+	unp->unp_vnode = vp;
+	unp->unp_addr = soun;
+	unp->unp_flags &= ~UNP_BINDING;
+	UNP_PCB_UNLOCK(unp);
+	UNP_GLOBAL_WUNLOCK();
+	VOP_UNLOCK(vp, 0, td);
+	vn_finished_write(mp);
+	VFS_UNLOCK_GIANT(vfslocked);
+	free(buf, M_TEMP);
+	return (0);
+
+error:
+	VFS_UNLOCK_GIANT(vfslocked);
+	UNP_PCB_LOCK(unp);
+	unp->unp_flags &= ~UNP_BINDING;
+	UNP_PCB_UNLOCK(unp);
+	free(buf, M_TEMP);
 	return (error);
 }
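
uipc_bind() now absorbs what unp_bind() used to do, serializing concurrent binds with the UNP_BINDING flag (EALREADY) and still refusing to rebind an already-bound socket (EINVAL).  Nothing changes for userland; the usual pattern that exercises this path is roughly:

	#include <sys/types.h>
	#include <sys/socket.h>
	#include <sys/un.h>
	#include <string.h>
	#include <unistd.h>

	/*
	 * Create and bind a local stream socket; bind() fails with
	 * EADDRINUSE if the path already exists, exactly as returned above.
	 */
	static int
	bind_local(const char *path)
	{
		struct sockaddr_un sun;
		int s;

		s = socket(PF_LOCAL, SOCK_STREAM, 0);
		if (s == -1)
			return (-1);
		memset(&sun, 0, sizeof(sun));
		sun.sun_family = AF_LOCAL;
		strlcpy(sun.sun_path, path, sizeof(sun.sun_path));
		sun.sun_len = SUN_LEN(&sun);
		if (bind(s, (struct sockaddr *)&sun, SUN_LEN(&sun)) == -1) {
			close(s);
			return (-1);
		}
		return (s);
	}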
 
 static int
 uipc_connect(struct socket *so, struct sockaddr *nam, struct thread *td)
 {
-	struct unpcb *unp;
 	int error;
 
 	KASSERT(td == curthread, ("uipc_connect: td != curthread"));
+	UNP_GLOBAL_WLOCK();
+	error = unp_connect(so, nam, td);
+	UNP_GLOBAL_WUNLOCK();
+	return (error);
+}
+
+static void
+uipc_close(struct socket *so)
+{
+	struct unpcb *unp, *unp2;
 
-	UNP_LOCK();
 	unp = sotounpcb(so);
-	if (unp == NULL) {
-		UNP_UNLOCK();
-		return (EINVAL);
+	KASSERT(unp != NULL, ("uipc_close: unp == NULL"));
+
+	UNP_GLOBAL_WLOCK();
+	UNP_PCB_LOCK(unp);
+	unp2 = unp->unp_conn;
+	if (unp2 != NULL) {
+		UNP_PCB_LOCK(unp2);
+		unp_disconnect(unp, unp2);
+		UNP_PCB_UNLOCK(unp2);
 	}
-	error = unp_connect(so, nam, td);
-	UNP_UNLOCK();
-	return (error);
+	UNP_PCB_UNLOCK(unp);
+	UNP_GLOBAL_WUNLOCK();
 }
 
 int
 uipc_connect2(struct socket *so1, struct socket *so2)
 {
-	struct unpcb *unp;
+	struct unpcb *unp, *unp2;
 	int error;
 
-	UNP_LOCK();
-	unp = sotounpcb(so1);
-	if (unp == NULL) {
-		UNP_UNLOCK();
-		return (EINVAL);
-	}
+	UNP_GLOBAL_WLOCK();
+	unp = so1->so_pcb;
+	KASSERT(unp != NULL, ("uipc_connect2: unp == NULL"));
+	UNP_PCB_LOCK(unp);
+	unp2 = so2->so_pcb;
+	KASSERT(unp2 != NULL, ("uipc_connect2: unp2 == NULL"));
+	UNP_PCB_LOCK(unp2);
 	error = unp_connect2(so1, so2, PRU_CONNECT2);
-	UNP_UNLOCK();
+	UNP_PCB_UNLOCK(unp2);
+	UNP_PCB_UNLOCK(unp);
+	UNP_GLOBAL_WUNLOCK();
 	return (error);
 }
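
uipc_connect2() is reached when two already-created sockets are wired together directly, most visibly via socketpair(2); the new version simply asserts both pcbs and takes the locks in the global-then-pcb order described earlier.  A short userland example of the call that lands here:

	#include <sys/types.h>
	#include <sys/socket.h>

	/* A connected pair of local stream sockets, e.g. for parent/child IPC. */
	static int
	make_pair(int sv[2])
	{
		return (socketpair(PF_LOCAL, SOCK_STREAM, 0, sv));
	}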
 
 /* control is EOPNOTSUPP */
 
-static int
+static void
 uipc_detach(struct socket *so)
 {
-	struct unpcb *unp;
+	struct unpcb *unp, *unp2;
+	struct sockaddr_un *saved_unp_addr;
+	struct vnode *vp;
+	int freeunp, local_unp_rights;
 
-	UNP_LOCK();
 	unp = sotounpcb(so);
-	if (unp == NULL) {
-		UNP_UNLOCK();
-		return (EINVAL);
+	KASSERT(unp != NULL, ("uipc_detach: unp == NULL"));
+
+	UNP_GLOBAL_WLOCK();
+	UNP_PCB_LOCK(unp);
+
+	LIST_REMOVE(unp, unp_link);
+	unp->unp_gencnt = ++unp_gencnt;
+	--unp_count;
+
+	/*
+	 * XXXRW: Should assert vp->v_socket == so.
+	 */
+	if ((vp = unp->unp_vnode) != NULL) {
+		unp->unp_vnode->v_socket = NULL;
+		unp->unp_vnode = NULL;
 	}
-	unp_detach(unp);
-	UNP_UNLOCK_ASSERT();
-	return (0);
+	unp2 = unp->unp_conn;
+	if (unp2 != NULL) {
+		UNP_PCB_LOCK(unp2);
+		unp_disconnect(unp, unp2);
+		UNP_PCB_UNLOCK(unp2);
+	}
+
+	/*
+	 * We hold the global lock, so it's OK to acquire multiple pcb locks
+	 * at a time.
+	 */
+	while (!LIST_EMPTY(&unp->unp_refs)) {
+		struct unpcb *ref = LIST_FIRST(&unp->unp_refs);
+
+		UNP_PCB_LOCK(ref);
+		unp_drop(ref, ECONNRESET);
+		UNP_PCB_UNLOCK(ref);
+	}
+	UNP_GLOBAL_WUNLOCK();
+	unp->unp_socket->so_pcb = NULL;
+	local_unp_rights = unp_rights;
+	saved_unp_addr = unp->unp_addr;
+	unp->unp_addr = NULL;
+	unp->unp_refcount--;
+	freeunp = (unp->unp_refcount == 0);
+	if (saved_unp_addr != NULL)
+		FREE(saved_unp_addr, M_SONAME);
+	if (freeunp) {
+		UNP_PCB_LOCK_DESTROY(unp);
+		uma_zfree(unp_zone, unp);
+	} else
+		UNP_PCB_UNLOCK(unp);
+	if (vp) {
+		int vfslocked;
+
+		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
+		vrele(vp);
+		VFS_UNLOCK_GIANT(vfslocked);
+	}
+	if (local_unp_rights)
+		taskqueue_enqueue(taskqueue_thread, &unp_gc_task);
 }
 
 static int
 uipc_disconnect(struct socket *so)
 {
-	struct unpcb *unp;
+	struct unpcb *unp, *unp2;
 
-	UNP_LOCK();
 	unp = sotounpcb(so);
-	if (unp == NULL) {
-		UNP_UNLOCK();
-		return (EINVAL);
+	KASSERT(unp != NULL, ("uipc_disconnect: unp == NULL"));
+
+	UNP_GLOBAL_WLOCK();
+	UNP_PCB_LOCK(unp);
+	unp2 = unp->unp_conn;
+	if (unp2 != NULL) {
+		UNP_PCB_LOCK(unp2);
+		unp_disconnect(unp, unp2);
+		UNP_PCB_UNLOCK(unp2);
 	}
-	unp_disconnect(unp);
-	UNP_UNLOCK();
+	UNP_PCB_UNLOCK(unp);
+	UNP_GLOBAL_WUNLOCK();
 	return (0);
 }
 
 static int
-uipc_listen(struct socket *so, struct thread *td)
+uipc_listen(struct socket *so, int backlog, struct thread *td)
 {
 	struct unpcb *unp;
 	int error;
 
-	UNP_LOCK();
 	unp = sotounpcb(so);
-	if (unp == NULL || unp->unp_vnode == NULL) {
-		UNP_UNLOCK();
+	KASSERT(unp != NULL, ("uipc_listen: unp == NULL"));
+
+	UNP_PCB_LOCK(unp);
+	if (unp->unp_vnode == NULL) {
+		UNP_PCB_UNLOCK(unp);
 		return (EINVAL);
 	}
-	error = unp_listen(so, unp, td);
-	UNP_UNLOCK();
+
+	SOCK_LOCK(so);
+	error = solisten_proto_check(so);
+	if (error == 0) {
+		cru2x(td->td_ucred, &unp->unp_peercred);
+		unp->unp_flags |= UNP_HAVEPCCACHED;
+		solisten_proto(so, backlog);
+	}
+	SOCK_UNLOCK(so);
+	UNP_PCB_UNLOCK(unp);
 	return (error);
 }
 
 static int
 uipc_peeraddr(struct socket *so, struct sockaddr **nam)
 {
-	struct unpcb *unp;
+	struct unpcb *unp, *unp2;
 	const struct sockaddr *sa;
 
-	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
-	UNP_LOCK();
 	unp = sotounpcb(so);
-	if (unp == NULL) {
-		UNP_UNLOCK();
-		free(*nam, M_SONAME);
-		*nam = NULL;
-		return (EINVAL);
-	}
-	if (unp->unp_conn != NULL && unp->unp_conn->unp_addr!= NULL)
-		sa = (struct sockaddr *) unp->unp_conn->unp_addr;
-	else {
-		/*
-		 * XXX: It seems that this test always fails even when
-		 * connection is established.  So, this else clause is
-		 * added as workaround to return PF_LOCAL sockaddr.
-		 */
+	KASSERT(unp != NULL, ("uipc_peeraddr: unp == NULL"));
+
+	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
+	UNP_PCB_LOCK(unp);
+	/*
+	 * XXX: It seems that this test always fails even when connection is
+	 * established.  So, this else clause is added as workaround to
+	 * return PF_LOCAL sockaddr.
+	 */
+	unp2 = unp->unp_conn;
+	if (unp2 != NULL) {
+		UNP_PCB_LOCK(unp2);
+		if (unp2->unp_addr != NULL)
+			sa = (struct sockaddr *) unp->unp_conn->unp_addr;
+		else
+			sa = &sun_noname;
+		bcopy(sa, *nam, sa->sa_len);
+		UNP_PCB_UNLOCK(unp2);
+	} else {
 		sa = &sun_noname;
+		bcopy(sa, *nam, sa->sa_len);
 	}
-	bcopy(sa, *nam, sa->sa_len);
-	UNP_UNLOCK();
+	UNP_PCB_UNLOCK(unp);
 	return (0);
 }
 
 static int
 uipc_rcvd(struct socket *so, int flags)
 {
-	struct unpcb *unp;
+	struct unpcb *unp, *unp2;
 	struct socket *so2;
+	u_int mbcnt, sbcc;
 	u_long newhiwat;
 
-	UNP_LOCK();
 	unp = sotounpcb(so);
-	if (unp == NULL) {
-		UNP_UNLOCK();
-		return (EINVAL);
-	}
-	switch (so->so_type) {
-	case SOCK_DGRAM:
-		panic("uipc_rcvd DGRAM?");
-		/*NOTREACHED*/
+	KASSERT(unp != NULL, ("uipc_rcvd: unp == NULL"));
 
-	case SOCK_STREAM:
-		if (unp->unp_conn == NULL)
-			break;
-		so2 = unp->unp_conn->unp_socket;
-		SOCKBUF_LOCK(&so2->so_snd);
-		SOCKBUF_LOCK(&so->so_rcv);
-		/*
-		 * Adjust backpressure on sender
-		 * and wakeup any waiting to write.
-		 */
-		so2->so_snd.sb_mbmax += unp->unp_mbcnt - so->so_rcv.sb_mbcnt;
-		unp->unp_mbcnt = so->so_rcv.sb_mbcnt;
-		newhiwat = so2->so_snd.sb_hiwat + unp->unp_cc -
-		    so->so_rcv.sb_cc;
-		(void)chgsbsize(so2->so_cred->cr_uidinfo, &so2->so_snd.sb_hiwat,
-		    newhiwat, RLIM_INFINITY);
-		unp->unp_cc = so->so_rcv.sb_cc;
-		SOCKBUF_UNLOCK(&so->so_rcv);
-		sowwakeup_locked(so2);
-		break;
+	if (so->so_type == SOCK_DGRAM)
+		panic("uipc_rcvd DGRAM?");
 
-	default:
+	if (so->so_type != SOCK_STREAM)
 		panic("uipc_rcvd unknown socktype");
+
+	/*
+	 * Adjust backpressure on sender and wakeup any waiting to write.
+	 *
+	 * The unp lock is acquired to maintain the validity of the unp_conn
+	 * pointer; no lock on unp2 is required as unp2->unp_socket will be
+	 * static as long as we don't permit unp2 to disconnect from unp,
+	 * which is prevented by the lock on unp.  We cache values from
+	 * so_rcv to avoid holding the so_rcv lock over the entire
+	 * transaction on the remote so_snd.
+	 */
+	SOCKBUF_LOCK(&so->so_rcv);
+	mbcnt = so->so_rcv.sb_mbcnt;
+	sbcc = so->so_rcv.sb_cc;
+	SOCKBUF_UNLOCK(&so->so_rcv);
+	UNP_PCB_LOCK(unp);
+	unp2 = unp->unp_conn;
+	if (unp2 == NULL) {
+		UNP_PCB_UNLOCK(unp);
+		return (0);
 	}
-	UNP_UNLOCK();
+	so2 = unp2->unp_socket;
+	SOCKBUF_LOCK(&so2->so_snd);
+	so2->so_snd.sb_mbmax += unp->unp_mbcnt - mbcnt;
+	newhiwat = so2->so_snd.sb_hiwat + unp->unp_cc - sbcc;
+	(void)chgsbsize(so2->so_cred->cr_uidinfo, &so2->so_snd.sb_hiwat,
+	    newhiwat, RLIM_INFINITY);
+	sowwakeup_locked(so2);
+	unp->unp_mbcnt = mbcnt;
+	unp->unp_cc = sbcc;
+	UNP_PCB_UNLOCK(unp);
 	return (0);
 }
 
@@ -383,16 +748,15 @@
 uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam,
     struct mbuf *control, struct thread *td)
 {
-	int error = 0;
-	struct unpcb *unp;
+	struct unpcb *unp, *unp2;
 	struct socket *so2;
+	u_int mbcnt, sbcc;
 	u_long newhiwat;
+	int error = 0;
 
 	unp = sotounpcb(so);
-	if (unp == NULL) {
-		error = EINVAL;
-		goto release;
-	}
+	KASSERT(unp != NULL, ("uipc_send: unp == NULL"));
+
 	if (flags & PRUS_OOB) {
 		error = EOPNOTSUPP;
 		goto release;
@@ -401,40 +765,48 @@
 	if (control != NULL && (error = unp_internalize(&control, td)))
 		goto release;
 
-	UNP_LOCK();
-	unp = sotounpcb(so);
-	if (unp == NULL) {
-		UNP_UNLOCK();
-		error = EINVAL;
-		goto dispose_release;
-	}
+	if ((nam != NULL) || (flags & PRUS_EOF))
+		UNP_GLOBAL_WLOCK();
+	else
+		UNP_GLOBAL_RLOCK();
 
 	switch (so->so_type) {
 	case SOCK_DGRAM:
 	{
 		const struct sockaddr *from;
 
+		unp2 = unp->unp_conn;
 		if (nam != NULL) {
-			if (unp->unp_conn != NULL) {
+			UNP_GLOBAL_WLOCK_ASSERT();
+			if (unp2 != NULL) {
 				error = EISCONN;
 				break;
 			}
 			error = unp_connect(so, nam, td);
 			if (error)
 				break;
-		} else {
-			if (unp->unp_conn == NULL) {
-				error = ENOTCONN;
-				break;
-			}
+			unp2 = unp->unp_conn;
 		}
-		so2 = unp->unp_conn->unp_socket;
+		/*
+		 * Because connect() and send() are non-atomic in a sendto()
+		 * with a target address, it's possible that the socket will
+		 * have disconnected before the send() can run.  In that case
+		 * return the slightly counter-intuitive but otherwise
+		 * correct error that the socket is not connected.
+		 */
+		if (unp2 == NULL) {
+			error = ENOTCONN;
+			break;
+		}
+		/* Lockless read. */
+		if (unp2->unp_flags & UNP_WANTCRED)
+			control = unp_addsockcred(td, control);
+		UNP_PCB_LOCK(unp);
 		if (unp->unp_addr != NULL)
 			from = (struct sockaddr *)unp->unp_addr;
 		else
 			from = &sun_noname;
-		if (unp->unp_conn->unp_flags & UNP_WANTCRED)
-			control = unp_addsockcred(td, control);
+		so2 = unp2->unp_socket;
 		SOCKBUF_LOCK(&so2->so_rcv);
 		if (sbappendaddr_locked(&so2->so_rcv, from, m, control)) {
 			sorwakeup_locked(so2);
@@ -444,19 +816,26 @@
 			SOCKBUF_UNLOCK(&so2->so_rcv);
 			error = ENOBUFS;
 		}
-		if (nam != NULL)
-			unp_disconnect(unp);
+		if (nam != NULL) {
+			UNP_GLOBAL_WLOCK_ASSERT();
+			UNP_PCB_LOCK(unp2);
+			unp_disconnect(unp, unp2);
+			UNP_PCB_UNLOCK(unp2);
+		}
+		UNP_PCB_UNLOCK(unp);
 		break;
 	}
 
 	case SOCK_STREAM:
-		/* Connect if not connected yet. */
 		/*
-		 * Note: A better implementation would complain
-		 * if not equal to the peer's address.
+		 * Connect if not connected yet.
+		 *
+		 * Note: A better implementation would complain if not equal
+		 * to the peer's address.
 		 */
 		if ((so->so_state & SS_ISCONNECTED) == 0) {
 			if (nam != NULL) {
+				UNP_GLOBAL_WLOCK_ASSERT();
 				error = unp_connect(so, nam, td);
 				if (error)
 					break;	/* XXX */
@@ -466,45 +845,61 @@
 			}
 		}
 
-		SOCKBUF_LOCK(&so->so_snd);
+		/* Lockless read. */
 		if (so->so_snd.sb_state & SBS_CANTSENDMORE) {
-			SOCKBUF_UNLOCK(&so->so_snd);
 			error = EPIPE;
 			break;
 		}
-		if (unp->unp_conn == NULL)
-			panic("uipc_send connected but no connection?");
-		so2 = unp->unp_conn->unp_socket;
+		/*
+		 * Because connect() and send() are non-atomic in a sendto()
+		 * with a target address, it's possible that the socket will
+		 * have disconnected before the send() can run.  In that case
+		 * return the slightly counter-intuitive but otherwise
+		 * correct error that the socket is not connected.
+		 *
+		 * Locking here must be done carefully: the global lock
+		 * prevents interconnections between unpcbs from changing, so
+		 * we can traverse from unp to unp2 without acquiring unp's
+		 * lock.  Socket buffer locks follow unpcb locks, so we can
+		 * acquire both remote and local socket buffer locks.
+		 */
+		unp2 = unp->unp_conn;
+		if (unp2 == NULL) {
+			error = ENOTCONN;
+			break;
+		}
+		so2 = unp2->unp_socket;
+		UNP_PCB_LOCK(unp2);
 		SOCKBUF_LOCK(&so2->so_rcv);
-		if (unp->unp_conn->unp_flags & UNP_WANTCRED) {
+		if (unp2->unp_flags & UNP_WANTCRED) {
 			/*
-			 * Credentials are passed only once on
-			 * SOCK_STREAM.
+			 * Credentials are passed only once on SOCK_STREAM.
 			 */
-			unp->unp_conn->unp_flags &= ~UNP_WANTCRED;
+			unp2->unp_flags &= ~UNP_WANTCRED;
 			control = unp_addsockcred(td, control);
 		}
 		/*
-		 * Send to paired receive port, and then reduce
-		 * send buffer hiwater marks to maintain backpressure.
-		 * Wake up readers.
+		 * Send to paired receive port, and then reduce send buffer
+		 * hiwater marks to maintain backpressure.  Wake up readers.
 		 */
 		if (control != NULL) {
 			if (sbappendcontrol_locked(&so2->so_rcv, m, control))
 				control = NULL;
-		} else {
+		} else
 			sbappend_locked(&so2->so_rcv, m);
-		}
-		so->so_snd.sb_mbmax -=
-			so2->so_rcv.sb_mbcnt - unp->unp_conn->unp_mbcnt;
-		unp->unp_conn->unp_mbcnt = so2->so_rcv.sb_mbcnt;
-		newhiwat = so->so_snd.sb_hiwat -
-		    (so2->so_rcv.sb_cc - unp->unp_conn->unp_cc);
+		mbcnt = so2->so_rcv.sb_mbcnt - unp2->unp_mbcnt;
+		unp2->unp_mbcnt = so2->so_rcv.sb_mbcnt;
+		sbcc = so2->so_rcv.sb_cc;
+		sorwakeup_locked(so2);
+
+		SOCKBUF_LOCK(&so->so_snd);
+		newhiwat = so->so_snd.sb_hiwat - (sbcc - unp2->unp_cc);
 		(void)chgsbsize(so->so_cred->cr_uidinfo, &so->so_snd.sb_hiwat,
 		    newhiwat, RLIM_INFINITY);
+		so->so_snd.sb_mbmax -= mbcnt;
 		SOCKBUF_UNLOCK(&so->so_snd);
-		unp->unp_conn->unp_cc = so2->so_rcv.sb_cc;
-		sorwakeup_locked(so2);
+		unp2->unp_cc = sbcc;
+		UNP_PCB_UNLOCK(unp2);
 		m = NULL;
 		break;
 
@@ -513,16 +908,20 @@
 	}
 
 	/*
-	 * SEND_EOF is equivalent to a SEND followed by
-	 * a SHUTDOWN.
+	 * SEND_EOF is equivalent to a SEND followed by a SHUTDOWN.
 	 */
 	if (flags & PRUS_EOF) {
+		UNP_PCB_LOCK(unp);
 		socantsendmore(so);
 		unp_shutdown(unp);
+		UNP_PCB_UNLOCK(unp);
 	}
-	UNP_UNLOCK();
 
-dispose_release:
+	if ((nam != NULL) || (flags & PRUS_EOF))
+		UNP_GLOBAL_WUNLOCK();
+	else
+		UNP_GLOBAL_RUNLOCK();
+
 	if (control != NULL && error != 0)
 		unp_dispose(control);
 
@@ -537,25 +936,26 @@
 static int
 uipc_sense(struct socket *so, struct stat *sb)
 {
-	struct unpcb *unp;
+	struct unpcb *unp, *unp2;
 	struct socket *so2;
 
-	UNP_LOCK();
 	unp = sotounpcb(so);
-	if (unp == NULL) {
-		UNP_UNLOCK();
-		return (EINVAL);
-	}
+	KASSERT(unp != NULL, ("uipc_sense: unp == NULL"));
+
 	sb->st_blksize = so->so_snd.sb_hiwat;
-	if (so->so_type == SOCK_STREAM && unp->unp_conn != NULL) {
-		so2 = unp->unp_conn->unp_socket;
+	UNP_GLOBAL_RLOCK();
+	UNP_PCB_LOCK(unp);
+	unp2 = unp->unp_conn;
+	if (so->so_type == SOCK_STREAM && unp2 != NULL) {
+		so2 = unp2->unp_socket;
 		sb->st_blksize += so2->so_rcv.sb_cc;
 	}
 	sb->st_dev = NODEV;
 	if (unp->unp_ino == 0)
 		unp->unp_ino = (++unp_ino == 0) ? ++unp_ino : unp_ino;
 	sb->st_ino = unp->unp_ino;
-	UNP_UNLOCK();
+	UNP_PCB_UNLOCK(unp);
+	UNP_GLOBAL_RUNLOCK();
 	return (0);
 }
 
@@ -564,15 +964,15 @@
 {
 	struct unpcb *unp;
 
-	UNP_LOCK();
 	unp = sotounpcb(so);
-	if (unp == NULL) {
-		UNP_UNLOCK();
-		return (EINVAL);
-	}
+	KASSERT(unp != NULL, ("uipc_shutdown: unp == NULL"));
+
+	UNP_GLOBAL_WLOCK();
+	UNP_PCB_LOCK(unp);
 	socantsendmore(so);
 	unp_shutdown(unp);
-	UNP_UNLOCK();
+	UNP_PCB_UNLOCK(unp);
+	UNP_GLOBAL_WUNLOCK();
 	return (0);
 }
 
@@ -582,21 +982,17 @@
 	struct unpcb *unp;
 	const struct sockaddr *sa;
 
-	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
-	UNP_LOCK();
 	unp = sotounpcb(so);
-	if (unp == NULL) {
-		UNP_UNLOCK();
-		free(*nam, M_SONAME);
-		*nam = NULL;
-		return (EINVAL);
-	}
+	KASSERT(unp != NULL, ("uipc_sockaddr: unp == NULL"));
+
+	*nam = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
+	UNP_PCB_LOCK(unp);
 	if (unp->unp_addr != NULL)
 		sa = (struct sockaddr *) unp->unp_addr;
 	else
 		sa = &sun_noname;
 	bcopy(sa, *nam, sa->sa_len);
-	UNP_UNLOCK();
+	UNP_PCB_UNLOCK(unp);
 	return (0);
 }
 
@@ -616,9 +1012,7 @@
 	.pru_sense =		uipc_sense,
 	.pru_shutdown =		uipc_shutdown,
 	.pru_sockaddr =		uipc_sockaddr,
-	.pru_sosend =		sosend,
-	.pru_soreceive =	soreceive,
-	.pru_sopoll =		sopoll,
+	.pru_close =		uipc_close,
 };
 
 int
@@ -631,284 +1025,87 @@
 	if (sopt->sopt_level != 0)
 		return (EINVAL);
 
-	UNP_LOCK();
 	unp = sotounpcb(so);
-	if (unp == NULL) {
-		UNP_UNLOCK();
-		return (EINVAL);
-	}
-	error = 0;
-
-	switch (sopt->sopt_dir) {
-	case SOPT_GET:
-		switch (sopt->sopt_name) {
-		case LOCAL_PEERCRED:
-			if (unp->unp_flags & UNP_HAVEPC)
-				xu = unp->unp_peercred;
-			else {
-				if (so->so_type == SOCK_STREAM)
-					error = ENOTCONN;
-				else
-					error = EINVAL;
-			}
-			if (error == 0)
-				error = sooptcopyout(sopt, &xu, sizeof(xu));
-			break;
-		case LOCAL_CREDS:
-			optval = unp->unp_flags & UNP_WANTCRED ? 1 : 0;
-			error = sooptcopyout(sopt, &optval, sizeof(optval));
-			break;
-		case LOCAL_CONNWAIT:
-			optval = unp->unp_flags & UNP_CONNWAIT ? 1 : 0;
-			error = sooptcopyout(sopt, &optval, sizeof(optval));
-			break;
-		default:
-			error = EOPNOTSUPP;
-			break;
-		}
-		break;
-	case SOPT_SET:
-		switch (sopt->sopt_name) {
-		case LOCAL_CREDS:
-		case LOCAL_CONNWAIT:
-			error = sooptcopyin(sopt, &optval, sizeof(optval),
-					    sizeof(optval));
-			if (error)
-				break;
-
-#define	OPTSET(bit) \
-	if (optval) \
-		unp->unp_flags |= bit; \
-	else \
-		unp->unp_flags &= ~bit;
-
-			switch (sopt->sopt_name) {
-			case LOCAL_CREDS:
-				OPTSET(UNP_WANTCRED);
-				break;
-			case LOCAL_CONNWAIT:
-				OPTSET(UNP_CONNWAIT);
-				break;
-			default:
-				break;
-			}
-			break;
-#undef	OPTSET
-		default:
-			error = ENOPROTOOPT;
-			break;
-		}
-		break;
-	default:
-		error = EOPNOTSUPP;
-		break;
-	}
-	UNP_UNLOCK();
-	return (error);
-}
-
-/*
- * Both send and receive buffers are allocated PIPSIZ bytes of buffering
- * for stream sockets, although the total for sender and receiver is
- * actually only PIPSIZ.
- * Datagram sockets really use the sendspace as the maximum datagram size,
- * and don't really want to reserve the sendspace.  Their recvspace should
- * be large enough for at least one max-size datagram plus address.
- */
-#ifndef PIPSIZ
-#define	PIPSIZ	8192
-#endif
-static u_long	unpst_sendspace = PIPSIZ;
-static u_long	unpst_recvspace = PIPSIZ;
-static u_long	unpdg_sendspace = 2*1024;	/* really max datagram size */
-static u_long	unpdg_recvspace = 4*1024;
-
-static int	unp_rights;			/* file descriptors in flight */
-
-SYSCTL_DECL(_net_local_stream);
-SYSCTL_INT(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW,
-	   &unpst_sendspace, 0, "");
-SYSCTL_INT(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW,
-	   &unpst_recvspace, 0, "");
-SYSCTL_DECL(_net_local_dgram);
-SYSCTL_INT(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW,
-	   &unpdg_sendspace, 0, "");
-SYSCTL_INT(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW,
-	   &unpdg_recvspace, 0, "");
-SYSCTL_DECL(_net_local);
-SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0, "");
-
-static int
-unp_attach(struct socket *so)
-{
-	struct unpcb *unp;
-	int error;
-
-	if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
-		switch (so->so_type) {
-
-		case SOCK_STREAM:
-			error = soreserve(so, unpst_sendspace, unpst_recvspace);
-			break;
-
-		case SOCK_DGRAM:
-			error = soreserve(so, unpdg_sendspace, unpdg_recvspace);
-			break;
-
-		default:
-			panic("unp_attach");
-		}
-		if (error)
-			return (error);
-	}
-	unp = uma_zalloc(unp_zone, M_WAITOK | M_ZERO);
-	if (unp == NULL)
-		return (ENOBUFS);
-	LIST_INIT(&unp->unp_refs);
-	unp->unp_socket = so;
-	so->so_pcb = unp;
-
-	UNP_LOCK();
-	unp->unp_gencnt = ++unp_gencnt;
-	unp_count++;
-	LIST_INSERT_HEAD(so->so_type == SOCK_DGRAM ? &unp_dhead
-			 : &unp_shead, unp, unp_link);
-	UNP_UNLOCK();
-
-	return (0);
-}
-
-static void
-unp_detach(struct unpcb *unp)
-{
-	struct vnode *vp;
-	int local_unp_rights;
-
-	UNP_LOCK_ASSERT();
-
-	LIST_REMOVE(unp, unp_link);
-	unp->unp_gencnt = ++unp_gencnt;
-	--unp_count;
-	if ((vp = unp->unp_vnode) != NULL) {
-		/*
-		 * XXXRW: should v_socket be frobbed only while holding
-		 * Giant?
-		 */
-		unp->unp_vnode->v_socket = NULL;
-		unp->unp_vnode = NULL;
-	}
-	if (unp->unp_conn != NULL)
-		unp_disconnect(unp);
-	while (!LIST_EMPTY(&unp->unp_refs)) {
-		struct unpcb *ref = LIST_FIRST(&unp->unp_refs);
-		unp_drop(ref, ECONNRESET);
-	}
-	soisdisconnected(unp->unp_socket);
-	unp->unp_socket->so_pcb = NULL;
-	local_unp_rights = unp_rights;
-	UNP_UNLOCK();
-	if (unp->unp_addr != NULL)
-		FREE(unp->unp_addr, M_SONAME);
-	uma_zfree(unp_zone, unp);
-	if (vp) {
-		int vfslocked;
+	KASSERT(unp != NULL, ("uipc_ctloutput: unp == NULL"));
+	error = 0;
+	switch (sopt->sopt_dir) {
+	case SOPT_GET:
+		switch (sopt->sopt_name) {
+		case LOCAL_PEERCRED:
+			UNP_PCB_LOCK(unp);
+			if (unp->unp_flags & UNP_HAVEPC)
+				xu = unp->unp_peercred;
+			else {
+				if (so->so_type == SOCK_STREAM)
+					error = ENOTCONN;
+				else
+					error = EINVAL;
+			}
+			UNP_PCB_UNLOCK(unp);
+			if (error == 0)
+				error = sooptcopyout(sopt, &xu, sizeof(xu));
+			break;
 
-		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
-		vrele(vp);
-		VFS_UNLOCK_GIANT(vfslocked);
-	}
-	if (local_unp_rights)
-		taskqueue_enqueue(taskqueue_thread, &unp_gc_task);
-}
+		case LOCAL_CREDS:
+			/* Unlocked read. */
+			optval = unp->unp_flags & UNP_WANTCRED ? 1 : 0;
+			error = sooptcopyout(sopt, &optval, sizeof(optval));
+			break;
 
-static int
-unp_bind(struct unpcb *unp, struct sockaddr *nam, struct thread *td)
-{
-	struct sockaddr_un *soun = (struct sockaddr_un *)nam;
-	struct vnode *vp;
-	struct mount *mp;
-	struct vattr vattr;
-	int error, namelen;
-	struct nameidata nd;
-	char *buf;
+		case LOCAL_CONNWAIT:
+			/* Unlocked read. */
+			optval = unp->unp_flags & UNP_CONNWAIT ? 1 : 0;
+			error = sooptcopyout(sopt, &optval, sizeof(optval));
+			break;
 
-	UNP_LOCK_ASSERT();
+		default:
+			error = EOPNOTSUPP;
+			break;
+		}
+		break;
 
-	/*
-	 * XXXRW: This test-and-set of unp_vnode is non-atomic; the
-	 * unlocked read here is fine, but the value of unp_vnode needs
-	 * to be tested again after we do all the lookups to see if the
-	 * pcb is still unbound?
-	 */
-	if (unp->unp_vnode != NULL)
-		return (EINVAL);
+	case SOPT_SET:
+		switch (sopt->sopt_name) {
+		case LOCAL_CREDS:
+		case LOCAL_CONNWAIT:
+			error = sooptcopyin(sopt, &optval, sizeof(optval),
+					    sizeof(optval));
+			if (error)
+				break;
 
-	namelen = soun->sun_len - offsetof(struct sockaddr_un, sun_path);
-	if (namelen <= 0)
-		return (EINVAL);
+#define	OPTSET(bit) do {						\
+	UNP_PCB_LOCK(unp);						\
+	if (optval)							\
+		unp->unp_flags |= bit;					\
+	else								\
+		unp->unp_flags &= ~bit;					\
+	UNP_PCB_UNLOCK(unp);						\
+} while (0)
 
-	UNP_UNLOCK();
+			switch (sopt->sopt_name) {
+			case LOCAL_CREDS:
+				OPTSET(UNP_WANTCRED);
+				break;
 
-	buf = malloc(namelen + 1, M_TEMP, M_WAITOK);
-	strlcpy(buf, soun->sun_path, namelen + 1);
+			case LOCAL_CONNWAIT:
+				OPTSET(UNP_CONNWAIT);
+				break;
 
-	mtx_lock(&Giant);
-restart:
-	mtx_assert(&Giant, MA_OWNED);
-	NDINIT(&nd, CREATE, NOFOLLOW | LOCKPARENT | SAVENAME, UIO_SYSSPACE,
-	    buf, td);
-/* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
-	error = namei(&nd);
-	if (error)
-		goto done;
-	vp = nd.ni_vp;
-	if (vp != NULL || vn_start_write(nd.ni_dvp, &mp, V_NOWAIT) != 0) {
-		NDFREE(&nd, NDF_ONLY_PNBUF);
-		if (nd.ni_dvp == vp)
-			vrele(nd.ni_dvp);
-		else
-			vput(nd.ni_dvp);
-		if (vp != NULL) {
-			vrele(vp);
-			error = EADDRINUSE;
-			goto done;
+			default:
+				break;
+			}
+			break;
+#undef	OPTSET
+		default:
+			error = ENOPROTOOPT;
+			break;
 		}
-		error = vn_start_write(NULL, &mp, V_XSLEEP | PCATCH);
-		if (error)
-			goto done;
-		goto restart;
-	}
-	VATTR_NULL(&vattr);
-	vattr.va_type = VSOCK;
-	vattr.va_mode = (ACCESSPERMS & ~td->td_proc->p_fd->fd_cmask);
-#ifdef MAC
-	error = mac_check_vnode_create(td->td_ucred, nd.ni_dvp, &nd.ni_cnd,
-	    &vattr);
-#endif
-	if (error == 0) {
-		VOP_LEASE(nd.ni_dvp, td, td->td_ucred, LEASE_WRITE);
-		error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
-	}
-	NDFREE(&nd, NDF_ONLY_PNBUF);
-	vput(nd.ni_dvp);
-	if (error) {
-		vn_finished_write(mp);
-		goto done;
+		break;
+
+	default:
+		error = EOPNOTSUPP;
+		break;
 	}
-	vp = nd.ni_vp;
-	ASSERT_VOP_LOCKED(vp, "unp_bind");
-	soun = (struct sockaddr_un *)sodupsockaddr(nam, M_WAITOK);
-	UNP_LOCK();
-	vp->v_socket = unp->unp_socket;
-	unp->unp_vnode = vp;
-	unp->unp_addr = soun;
-	UNP_UNLOCK();
-	VOP_UNLOCK(vp, 0, td);
-	vn_finished_write(mp);
-done:
-	mtx_unlock(&Giant);
-	free(buf, M_TEMP);
-	UNP_LOCK();
 	return (error);
 }
 
@@ -919,28 +1116,40 @@
 	struct vnode *vp;
 	struct socket *so2, *so3;
 	struct unpcb *unp, *unp2, *unp3;
-	int error, len;
+	int error, len, vfslocked;
 	struct nameidata nd;
 	char buf[SOCK_MAXADDRLEN];
 	struct sockaddr *sa;
 
-	UNP_LOCK_ASSERT();
+	UNP_GLOBAL_WLOCK_ASSERT();
+
 	unp = sotounpcb(so);
+	KASSERT(unp != NULL, ("unp_connect: unp == NULL"));
 
 	len = nam->sa_len - offsetof(struct sockaddr_un, sun_path);
 	if (len <= 0)
 		return (EINVAL);
 	strlcpy(buf, soun->sun_path, len + 1);
-	UNP_UNLOCK();
+
+	UNP_PCB_LOCK(unp);
+	if (unp->unp_flags & UNP_CONNECTING) {
+		UNP_PCB_UNLOCK(unp);
+		return (EALREADY);
+	}
+	UNP_GLOBAL_WUNLOCK();
+	unp->unp_flags |= UNP_CONNECTING;
+	UNP_PCB_UNLOCK(unp);
+
 	sa = malloc(sizeof(struct sockaddr_un), M_SONAME, M_WAITOK);
-	mtx_lock(&Giant);
-	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, buf, td);
+	NDINIT(&nd, LOOKUP, MPSAFE | FOLLOW | LOCKLEAF, UIO_SYSSPACE, buf,
+	    td);
 	error = namei(&nd);
 	if (error)
 		vp = NULL;
 	else
 		vp = nd.ni_vp;
 	ASSERT_VOP_LOCKED(vp, "unp_connect");
+	vfslocked = NDHASGIANT(&nd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	if (error)
 		goto bad;
@@ -949,16 +1158,24 @@
 		error = ENOTSOCK;
 		goto bad;
 	}
+#ifdef MAC
+	error = mac_check_vnode_open(td->td_ucred, vp, VWRITE | VREAD);
+	if (error)
+		goto bad;
+#endif
 	error = VOP_ACCESS(vp, VWRITE, td->td_ucred, td);
 	if (error)
 		goto bad;
-	mtx_unlock(&Giant);
-	UNP_LOCK();
+	VFS_UNLOCK_GIANT(vfslocked);
+
 	unp = sotounpcb(so);
-	if (unp == NULL) {
-		error = EINVAL;
-		goto bad2;
-	}
+	KASSERT(unp != NULL, ("unp_connect: unp == NULL"));
+
+	/*
+	 * Lock global lock for two reasons: make sure v_socket is stable,
+	 * and to protect simultaneous locking of multiple pcbs.
+	 */
+	UNP_GLOBAL_WLOCK();
 	so2 = vp->v_socket;
 	if (so2 == NULL) {
 		error = ECONNREFUSED;
@@ -971,14 +1188,11 @@
 	if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
 		if (so2->so_options & SO_ACCEPTCONN) {
 			/*
-			 * NB: drop locks here so unp_attach is entered
-			 *     w/o locks; this avoids a recursive lock
-			 *     of the head and holding sleep locks across
-			 *     a (potentially) blocking malloc.
+			 * We can't drop the global lock here or 'so2' may
+			 * become invalid.  As a result, we need to handle
+			 * possible lock recursion in uipc_attach.
 			 */
-			UNP_UNLOCK();
 			so3 = sonewconn(so2, 0);
-			UNP_LOCK();
 		} else
 			so3 = NULL;
 		if (so3 == NULL) {
@@ -988,6 +1202,9 @@
 		unp = sotounpcb(so);
 		unp2 = sotounpcb(so2);
 		unp3 = sotounpcb(so3);
+		UNP_PCB_LOCK(unp);
+		UNP_PCB_LOCK(unp2);
+		UNP_PCB_LOCK(unp3);
 		if (unp2->unp_addr != NULL) {
 			bcopy(unp2->unp_addr, sa, unp2->unp_addr->sun_len);
 			unp3->unp_addr = (struct sockaddr_un *) sa;
@@ -996,24 +1213,27 @@
 		/*
 		 * unp_peercred management:
 		 *
-		 * The connecter's (client's) credentials are copied
-		 * from its process structure at the time of connect()
-		 * (which is now).
+		 * The connecter's (client's) credentials are copied from its
+		 * process structure at the time of connect() (which is now).
 		 */
 		cru2x(td->td_ucred, &unp3->unp_peercred);
 		unp3->unp_flags |= UNP_HAVEPC;
 		/*
-		 * The receiver's (server's) credentials are copied
-		 * from the unp_peercred member of socket on which the
-		 * former called listen(); unp_listen() cached that
-		 * process's credentials at that time so we can use
-		 * them now.
+		 * The receiver's (server's) credentials are copied from the
+		 * unp_peercred member of socket on which the former called
+		 * listen(); uipc_listen() cached that process's credentials
+		 * at that time so we can use them now.
 		 */
 		KASSERT(unp2->unp_flags & UNP_HAVEPCCACHED,
 		    ("unp_connect: listener without cached peercred"));
 		memcpy(&unp->unp_peercred, &unp2->unp_peercred,
 		    sizeof(unp->unp_peercred));
 		unp->unp_flags |= UNP_HAVEPC;
+		if (unp2->unp_flags & UNP_WANTCRED)
+			unp3->unp_flags |= UNP_WANTCRED;
+		UNP_PCB_UNLOCK(unp3);
+		UNP_PCB_UNLOCK(unp2);
+		UNP_PCB_UNLOCK(unp);
 #ifdef MAC
 		SOCK_LOCK(so);
 		mac_set_socket_peer_from_socket(so, so3);
@@ -1023,34 +1243,55 @@
 
 		so2 = so3;
 	}
+	unp = sotounpcb(so);
+	KASSERT(unp != NULL, ("unp_connect: unp == NULL"));
+	unp2 = sotounpcb(so2);
+	KASSERT(unp2 != NULL, ("unp_connect: unp2 == NULL"));
+	UNP_PCB_LOCK(unp);
+	UNP_PCB_LOCK(unp2);
 	error = unp_connect2(so, so2, PRU_CONNECT);
+	UNP_PCB_UNLOCK(unp2);
+	UNP_PCB_UNLOCK(unp);
 bad2:
-	UNP_UNLOCK();
-	mtx_lock(&Giant);
+	UNP_GLOBAL_WUNLOCK();
+	if (vfslocked)
+		/*
+		 * Giant was previously acquired because the filesystem
+		 * isn't MPSAFE.  Acquire it once again.
+		 */
+		mtx_lock(&Giant);
 bad:
-	mtx_assert(&Giant, MA_OWNED);
 	if (vp != NULL)
 		vput(vp);
-	mtx_unlock(&Giant);
+	VFS_UNLOCK_GIANT(vfslocked);
 	free(sa, M_SONAME);
-	UNP_LOCK();
+	UNP_GLOBAL_WLOCK();
+	UNP_PCB_LOCK(unp);
+	unp->unp_flags &= ~UNP_CONNECTING;
+	UNP_PCB_UNLOCK(unp);
 	return (error);
 }
 
 static int
 unp_connect2(struct socket *so, struct socket *so2, int req)
 {
-	struct unpcb *unp = sotounpcb(so);
+	struct unpcb *unp;
 	struct unpcb *unp2;
 
-	UNP_LOCK_ASSERT();
+	unp = sotounpcb(so);
+	KASSERT(unp != NULL, ("unp_connect2: unp == NULL"));
+	unp2 = sotounpcb(so2);
+	KASSERT(unp2 != NULL, ("unp_connect2: unp2 == NULL"));
+
+	UNP_GLOBAL_WLOCK_ASSERT();
+	UNP_PCB_LOCK_ASSERT(unp);
+	UNP_PCB_LOCK_ASSERT(unp2);
 
 	if (so2->so_type != so->so_type)
 		return (EPROTOTYPE);
-	unp2 = sotounpcb(so2);
 	unp->unp_conn = unp2;
-	switch (so->so_type) {
 
+	switch (so->so_type) {
 	case SOCK_DGRAM:
 		LIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_reflink);
 		soisconnected(so);
@@ -1073,18 +1314,18 @@
 }
 
 static void
-unp_disconnect(struct unpcb *unp)
+unp_disconnect(struct unpcb *unp, struct unpcb *unp2)
 {
-	struct unpcb *unp2 = unp->unp_conn;
 	struct socket *so;
 
-	UNP_LOCK_ASSERT();
+	KASSERT(unp2 != NULL, ("unp_disconnect: unp2 == NULL"));
+
+	UNP_GLOBAL_WLOCK_ASSERT();
+	UNP_PCB_LOCK_ASSERT(unp);
+	UNP_PCB_LOCK_ASSERT(unp2);
 
-	if (unp2 == NULL)
-		return;
 	unp->unp_conn = NULL;
 	switch (unp->unp_socket->so_type) {
-
 	case SOCK_DGRAM:
 		LIST_REMOVE(unp, unp_reflink);
 		so = unp->unp_socket;
@@ -1101,28 +1342,18 @@
 	}
 }
 
-#ifdef notdef
-void
-unp_abort(struct unpcb *unp)
-{
-
-	unp_detach(unp);
-	UNP_UNLOCK_ASSERT();
-}
-#endif
-
 /*
- * unp_pcblist() assumes that UNIX domain socket memory is never reclaimed
- * by the zone (UMA_ZONE_NOFREE), and as such potentially stale pointers
- * are safe to reference.  It first scans the list of struct unpcb's to
- * generate a pointer list, then it rescans its list one entry at a time to
- * externalize and copyout.  It checks the generation number to see if a
- * struct unpcb has been reused, and will skip it if so.
+ * unp_pcblist() walks the global list of struct unpcb's to generate a
+ * pointer list, bumping the refcount on each unpcb.  It then copies them out
+ * sequentially, validating the generation number on each to see if it has
+ * been detached.  All of this is necessary because copyout() may sleep on
+ * disk I/O.
  */
 static int
 unp_pcblist(SYSCTL_HANDLER_ARGS)
 {
 	int error, i, n;
+	int freeunp;
 	struct unpcb *unp, **unp_list;
 	unp_gen_t gencnt;
 	struct xunpgen *xug;
@@ -1149,10 +1380,10 @@
 	 * OK, now we're committed to doing something.
 	 */
 	xug = malloc(sizeof(*xug), M_TEMP, M_WAITOK);
-	UNP_LOCK();
+	UNP_GLOBAL_RLOCK();
 	gencnt = unp_gencnt;
 	n = unp_count;
-	UNP_UNLOCK();
+	UNP_GLOBAL_RUNLOCK();
 
 	xug->xug_len = sizeof *xug;
 	xug->xug_count = n;
@@ -1166,24 +1397,31 @@
 
 	unp_list = malloc(n * sizeof *unp_list, M_TEMP, M_WAITOK);
 
-	UNP_LOCK();
+	UNP_GLOBAL_RLOCK();
 	for (unp = LIST_FIRST(head), i = 0; unp && i < n;
 	     unp = LIST_NEXT(unp, unp_link)) {
+		UNP_PCB_LOCK(unp);
 		if (unp->unp_gencnt <= gencnt) {
 			if (cr_cansee(req->td->td_ucred,
-			    unp->unp_socket->so_cred))
+			    unp->unp_socket->so_cred)) {
+				UNP_PCB_UNLOCK(unp);
 				continue;
+			}
 			unp_list[i++] = unp;
+			unp->unp_refcount++;
 		}
+		UNP_PCB_UNLOCK(unp);
 	}
-	UNP_UNLOCK();
-	n = i;			/* in case we lost some during malloc */
+	UNP_GLOBAL_RUNLOCK();
+	n = i;			/* In case we lost some during malloc. */
 
 	error = 0;
 	xu = malloc(sizeof(*xu), M_TEMP, M_WAITOK | M_ZERO);
 	for (i = 0; i < n; i++) {
 		unp = unp_list[i];
-		if (unp->unp_gencnt <= gencnt) {
+		UNP_PCB_LOCK(unp);
+		unp->unp_refcount--;
+	        if (unp->unp_refcount != 0 && unp->unp_gencnt <= gencnt) {
 			xu->xu_len = sizeof *xu;
 			xu->xu_unpp = unp;
 			/*
@@ -1200,17 +1438,24 @@
 				      unp->unp_conn->unp_addr->sun_len);
 			bcopy(unp, &xu->xu_unp, sizeof *unp);
 			sotoxsocket(unp->unp_socket, &xu->xu_socket);
+			UNP_PCB_UNLOCK(unp);
 			error = SYSCTL_OUT(req, xu, sizeof *xu);
+		} else {
+			freeunp = (unp->unp_refcount == 0);
+			UNP_PCB_UNLOCK(unp);
+			if (freeunp) {
+				UNP_PCB_LOCK_DESTROY(unp);
+				uma_zfree(unp_zone, unp);
+			}
 		}
 	}
 	free(xu, M_TEMP);
 	if (!error) {
 		/*
-		 * Give the user an updated idea of our state.
-		 * If the generation differs from what we told
-		 * her before, she knows that something happened
-		 * while we were processing this request, and it
-		 * might be necessary to retry.
+		 * Give the user an updated idea of our state.  If the
+		 * generation differs from what we told her before, she knows
+		 * that something happened while we were processing this
+		 * request, and it might be necessary to retry.
 		 */
 		xug->xug_gen = unp_gencnt;
 		xug->xug_sogen = so_gencnt;
@@ -1232,33 +1477,38 @@
 static void
 unp_shutdown(struct unpcb *unp)
 {
+	struct unpcb *unp2;
 	struct socket *so;
 
-	UNP_LOCK_ASSERT();
+	UNP_GLOBAL_WLOCK_ASSERT();
+	UNP_PCB_LOCK_ASSERT(unp);
 
-	if (unp->unp_socket->so_type == SOCK_STREAM && unp->unp_conn &&
-	    (so = unp->unp_conn->unp_socket))
-		socantrcvmore(so);
+	unp2 = unp->unp_conn;
+	if (unp->unp_socket->so_type == SOCK_STREAM && unp2 != NULL) {
+		so = unp2->unp_socket;
+		if (so != NULL)
+			socantrcvmore(so);
+	}
 }
 
 static void
 unp_drop(struct unpcb *unp, int errno)
 {
 	struct socket *so = unp->unp_socket;
+	struct unpcb *unp2;
 
-	UNP_LOCK_ASSERT();
+	UNP_GLOBAL_WLOCK_ASSERT();
+	UNP_PCB_LOCK_ASSERT(unp);
 
 	so->so_error = errno;
-	unp_disconnect(unp);
-}
-
-#ifdef notdef
-void
-unp_drain(void)
-{
+	unp2 = unp->unp_conn;
+	if (unp2 == NULL)
+		return;
 
+	UNP_PCB_LOCK(unp2);
+	unp_disconnect(unp, unp2);
+	UNP_PCB_UNLOCK(unp2);
 }
-#endif
 
 static void
 unp_freerights(struct file **rp, int fdcount)
@@ -1267,13 +1517,14 @@
 	struct file *fp;
 
 	for (i = 0; i < fdcount; i++) {
-		fp = *rp;
 		/*
-		 * zero the pointer before calling
-		 * unp_discard since it may end up
-		 * in unp_gc()..
+		 * Zero the pointer before calling unp_discard since it may
+		 * end up in unp_gc()..
+		 *
+		 * XXXRW: This is less true than it used to be.
 		 */
-		*rp++ = 0;
+		fp = *rp;
+		*rp++ = NULL;
 		unp_discard(fp);
 	}
 }
@@ -1293,7 +1544,7 @@
 	int f;
 	u_int newlen;
 
-	UNP_UNLOCK_ASSERT();
+	UNP_GLOBAL_UNLOCK_ASSERT();
 
 	error = 0;
 	if (controlp != NULL) /* controlp == NULL => free control messages */
@@ -1318,25 +1569,25 @@
 				unp_freerights(rp, newfds);
 				goto next;
 			}
-			FILEDESC_LOCK(td->td_proc->p_fd);
+			FILEDESC_XLOCK(td->td_proc->p_fd);
 			/* if the new FD's will not fit free them.  */
 			if (!fdavail(td, newfds)) {
-				FILEDESC_UNLOCK(td->td_proc->p_fd);
+				FILEDESC_XUNLOCK(td->td_proc->p_fd);
 				error = EMSGSIZE;
 				unp_freerights(rp, newfds);
 				goto next;
 			}
 			/*
-			 * now change each pointer to an fd in the global
-			 * table to an integer that is the index to the
-			 * local fd table entry that we set up to point
-			 * to the global one we are transferring.
+			 * Now change each pointer to an fd in the global
+			 * table to an integer that is the index to the local
+			 * fd table entry that we set up to point to the
+			 * global one we are transferring.
 			 */
 			newlen = newfds * sizeof(int);
 			*controlp = sbcreatecontrol(NULL, newlen,
 			    SCM_RIGHTS, SOL_SOCKET);
 			if (*controlp == NULL) {
-				FILEDESC_UNLOCK(td->td_proc->p_fd);
+				FILEDESC_XUNLOCK(td->td_proc->p_fd);
 				error = E2BIG;
 				unp_freerights(rp, newfds);
 				goto next;
@@ -1355,8 +1606,9 @@
 				unp_rights--;
 				*fdp++ = f;
 			}
-			FILEDESC_UNLOCK(td->td_proc->p_fd);
-		} else { /* We can just copy anything else across */
+			FILEDESC_XUNLOCK(td->td_proc->p_fd);
+		} else {
+			/* We can just copy anything else across. */
 			if (error || controlp == NULL)
 				goto next;
 			*controlp = sbcreatecontrol(NULL, datalen,
@@ -1388,18 +1640,28 @@
 	return (error);
 }
 
+static void
+unp_zone_change(void *tag)
+{
+
+	uma_zone_set_max(unp_zone, maxsockets);
+}
+
 void
 unp_init(void)
 {
+
 	unp_zone = uma_zcreate("unpcb", sizeof(struct unpcb), NULL, NULL,
-	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
+	    NULL, NULL, UMA_ALIGN_PTR, 0);
 	if (unp_zone == NULL)
 		panic("unp_init");
-	uma_zone_set_max(unp_zone, nmbclusters);
+	uma_zone_set_max(unp_zone, maxsockets);
+	EVENTHANDLER_REGISTER(maxsockets_change, unp_zone_change,
+	    NULL, EVENTHANDLER_PRI_ANY);
 	LIST_INIT(&unp_dhead);
 	LIST_INIT(&unp_shead);
 	TASK_INIT(&unp_gc_task, 0, unp_gc, NULL);
-	UNP_LOCK_INIT();
+	UNP_GLOBAL_LOCK_INIT();
 }
 
 static int
@@ -1419,7 +1681,7 @@
 	int error, oldfds;
 	u_int newlen;
 
-	UNP_UNLOCK_ASSERT();
+	UNP_GLOBAL_UNLOCK_ASSERT();
 
 	error = 0;
 	*controlp = NULL;
@@ -1462,27 +1724,28 @@
 		case SCM_RIGHTS:
 			oldfds = datalen / sizeof (int);
 			/*
-			 * check that all the FDs passed in refer to legal files
-			 * If not, reject the entire operation.
+			 * Check that all the FDs passed in refer to legal
+			 * files.  If not, reject the entire operation.
 			 */
 			fdp = data;
-			FILEDESC_LOCK(fdescp);
+			FILEDESC_SLOCK(fdescp);
 			for (i = 0; i < oldfds; i++) {
 				fd = *fdp++;
 				if ((unsigned)fd >= fdescp->fd_nfiles ||
 				    fdescp->fd_ofiles[fd] == NULL) {
-					FILEDESC_UNLOCK(fdescp);
+					FILEDESC_SUNLOCK(fdescp);
 					error = EBADF;
 					goto out;
 				}
 				fp = fdescp->fd_ofiles[fd];
 				if (!(fp->f_ops->fo_flags & DFLAG_PASSABLE)) {
-					FILEDESC_UNLOCK(fdescp);
+					FILEDESC_SUNLOCK(fdescp);
 					error = EOPNOTSUPP;
 					goto out;
 				}
 
 			}
+
 			/*
 			 * Now replace the integer FDs with pointers to
 			 * the associated global file table entry..
@@ -1491,7 +1754,7 @@
 			*controlp = sbcreatecontrol(NULL, newlen,
 			    SCM_RIGHTS, SOL_SOCKET);
 			if (*controlp == NULL) {
-				FILEDESC_UNLOCK(fdescp);
+				FILEDESC_SUNLOCK(fdescp);
 				error = E2BIG;
 				goto out;
 			}
@@ -1508,7 +1771,7 @@
 				FILE_UNLOCK(fp);
 				unp_rights++;
 			}
-			FILEDESC_UNLOCK(fdescp);
+			FILEDESC_SUNLOCK(fdescp);
 			break;
 
 		case SCM_TIMESTAMP:
@@ -1546,11 +1809,12 @@
 	return (error);
 }
 
-struct mbuf *
+static struct mbuf *
 unp_addsockcred(struct thread *td, struct mbuf *control)
 {
-	struct mbuf *m, *n;
+	struct mbuf *m, *n, *n_prev;
 	struct sockcred *sc;
+	const struct cmsghdr *cm;
 	int ngroups;
 	int i;
 
@@ -1559,7 +1823,6 @@
 	m = sbcreatecontrol(NULL, SOCKCREDSIZE(ngroups), SCM_CREDS, SOL_SOCKET);
 	if (m == NULL)
 		return (control);
-	m->m_next = NULL;
 
 	sc = (struct sockcred *) CMSG_DATA(mtod(m, struct cmsghdr *));
 	sc->sc_uid = td->td_ucred->cr_ruid;
@@ -1571,16 +1834,30 @@
 		sc->sc_groups[i] = td->td_ucred->cr_groups[i];
 
 	/*
-	 * If a control message already exists, append us to the end.
+	 * Unlink SCM_CREDS control messages (struct cmsgcred), since just
+	 * created SCM_CREDS control message (struct sockcred) has another
+	 * format.
 	 */
-	if (control != NULL) {
-		for (n = control; n->m_next != NULL; n = n->m_next)
-			;
-		n->m_next = m;
-	} else
-		control = m;
+	if (control != NULL)
+		for (n = control, n_prev = NULL; n != NULL;) {
+			cm = mtod(n, struct cmsghdr *);
+    			if (cm->cmsg_level == SOL_SOCKET &&
+			    cm->cmsg_type == SCM_CREDS) {
+    				if (n_prev == NULL)
+					control = n->m_next;
+				else
+					n_prev->m_next = n->m_next;
+				n = m_free(n);
+			} else {
+				n_prev = n;
+				n = n->m_next;
+			}
+		}
+
+	/* Prepend it to the head. */
+	m->m_next = control;
 
-	return (control);
+	return (m);
 }
 
 /*
@@ -1609,13 +1886,14 @@
 	unp_taskcount++;
 	unp_defer = 0;
 	/*
-	 * before going through all this, set all FDs to
-	 * be NOT defered and NOT externally accessible
+	 * Before going through all this, set all FDs to be NOT deferred and
+	 * NOT externally accessible.
 	 */
 	sx_slock(&filelist_lock);
 	LIST_FOREACH(fp, &filehead, f_list)
 		fp->f_gcflag &= ~(FMARK|FDEFER);
 	do {
+		KASSERT(unp_defer >= 0, ("unp_gc: unp_defer %d", unp_defer));
 		LIST_FOREACH(fp, &filehead, f_list) {
 			FILE_LOCK(fp);
 			/*
@@ -1633,16 +1911,16 @@
 				continue;
 			}
 			/*
-			 * If we already marked it as 'defer'  in a
-			 * previous pass, then try process it this time
-			 * and un-mark it
+			 * If we already marked it as 'defer' in a
+			 * previous pass, then try to process it this
+			 * time and un-mark it.
 			 */
 			if (fp->f_gcflag & FDEFER) {
 				fp->f_gcflag &= ~FDEFER;
 				unp_defer--;
 			} else {
 				/*
-				 * if it's not defered, then check if it's
+				 * If it's not deferred, then check if it's
 				 * already marked.. if so skip it
 				 */
 				if (fp->f_gcflag & FMARK) {
@@ -1650,9 +1928,9 @@
 					continue;
 				}
 				/*
-				 * If all references are from messages
-				 * in transit, then skip it. it's not
-				 * externally accessible.
+				 * If all references are from messages in
+				 * transit, then skip it; it's not externally
+				 * accessible.
 				 */
 				if (fp->f_count == fp->f_msgcount) {
 					FILE_UNLOCK(fp);
@@ -1665,29 +1943,47 @@
 				fp->f_gcflag |= FMARK;
 			}
 			/*
-			 * either it was defered, or it is externally
-			 * accessible and not already marked so.
-			 * Now check if it is possibly one of OUR sockets.
+			 * Either it was deferred, or it is externally
+			 * accessible and not already marked so.  Now check
+			 * if it is possibly one of OUR sockets.
 			 */
 			if (fp->f_type != DTYPE_SOCKET ||
 			    (so = fp->f_data) == NULL) {
 				FILE_UNLOCK(fp);
 				continue;
 			}
-			FILE_UNLOCK(fp);
 			if (so->so_proto->pr_domain != &localdomain ||
-			    (so->so_proto->pr_flags&PR_RIGHTS) == 0)
+			    (so->so_proto->pr_flags & PR_RIGHTS) == 0) {
+				FILE_UNLOCK(fp);				
 				continue;
+			}
+
+			/*
+			 * Tell any other threads that do a subsequent
+			 * fdrop() that we are scanning the message
+			 * buffers.
+			 */
+			fp->f_gcflag |= FWAIT;
+			FILE_UNLOCK(fp);
+
 			/*
-			 * So, Ok, it's one of our sockets and it IS externally
-			 * accessible (or was defered). Now we look
-			 * to see if we hold any file descriptors in its
+			 * So, Ok, it's one of our sockets and it IS
+			 * externally accessible (or was deferred).  Now we
+			 * look to see if we hold any file descriptors in its
 			 * message buffers. Follow those links and mark them
 			 * as accessible too.
 			 */
 			SOCKBUF_LOCK(&so->so_rcv);
 			unp_scan(so->so_rcv.sb_mb, unp_mark);
 			SOCKBUF_UNLOCK(&so->so_rcv);
+
+			/*
+			 * Wake up any threads waiting in fdrop().
+			 */
+			FILE_LOCK(fp);
+			fp->f_gcflag &= ~FWAIT;
+			wakeup(&fp->f_gcflag);
+			FILE_UNLOCK(fp);
 		}
 	} while (unp_defer);
 	sx_sunlock(&filelist_lock);
@@ -1695,9 +1991,9 @@
 	 * XXXRW: The following comments need updating for a post-SMPng and
 	 * deferred unp_gc() world, but are still generally accurate.
 	 *
-	 * We grab an extra reference to each of the file table entries
-	 * that are not otherwise accessible and then free the rights
-	 * that are stored in messages on them.
+	 * We grab an extra reference to each of the file table entries that
+	 * are not otherwise accessible and then free the rights that are
+	 * stored in messages on them.
 	 *
 	 * The bug in the original code is a little tricky, so I'll describe
 	 * what's wrong with it here.
@@ -1711,12 +2007,12 @@
 	 * results in the following chain.  Closef calls soo_close, which
 	 * calls soclose.   Soclose calls first (through the switch
 	 * uipc_usrreq) unp_detach, which re-invokes unp_gc.  Unp_gc simply
-	 * returns because the previous instance had set unp_gcing, and
-	 * we return all the way back to soclose, which marks the socket
-	 * with SS_NOFDREF, and then calls sofree.  Sofree calls sorflush
-	 * to free up the rights that are queued in messages on the socket A,
-	 * i.e., the reference on B.  The sorflush calls via the dom_dispose
-	 * switch unp_dispose, which unp_scans with unp_discard.  This second
+	 * returns because the previous instance had set unp_gcing, and we
+	 * return all the way back to soclose, which marks the socket with
+	 * SS_NOFDREF, and then calls sofree.  Sofree calls sorflush to free
+	 * up the rights that are queued in messages on the socket A, i.e.,
+	 * the reference on B.  The sorflush calls via the dom_dispose switch
+	 * unp_dispose, which unp_scans with unp_discard.  This second
 	 * instance of unp_discard just calls closef on B.
 	 *
 	 * Well, a similar chain occurs on B, resulting in a sorflush on B,
@@ -1725,11 +2021,11 @@
 	 * SS_NOFDREF, and soclose panics at this point.
 	 *
 	 * Here, we first take an extra reference to each inaccessible
-	 * descriptor.  Then, we call sorflush ourself, since we know
-	 * it is a Unix domain socket anyhow.  After we destroy all the
-	 * rights carried in messages, we do a last closef to get rid
-	 * of our extra reference.  This is the last close, and the
-	 * unp_detach etc will shut down the socket.
+	 * descriptor.  Then, we call sorflush ourself, since we know it is a
+	 * Unix domain socket anyhow.  After we destroy all the rights
+	 * carried in messages, we do a last closef to get rid of our extra
+	 * reference.  This is the last close, and the unp_detach etc will
+	 * shut down the socket.
 	 *
 	 * 91/09/19, bsy at cs.cmu.edu
 	 */
@@ -1757,9 +2053,9 @@
 		}
 		/*
 		 * If all refs are from msgs, and it's not marked accessible
-		 * then it must be referenced from some unreachable cycle
-		 * of (shut-down) FDs, so include it in our
-		 * list of FDs to remove
+		 * then it must be referenced from some unreachable cycle of
+		 * (shut-down) FDs, so include it in our list of FDs to
+		 * remove.
 		 */
 		if (fp->f_count == fp->f_msgcount && !(fp->f_gcflag & FMARK)) {
 			*fpp++ = fp;
@@ -1770,7 +2066,7 @@
 	}
 	sx_sunlock(&filelist_lock);
 	/*
-	 * for each FD on our hit list, do the following two things
+	 * For each FD on our hit list, do the following two things:
 	 */
 	for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) {
 		struct file *tfp = *fpp;
@@ -1798,24 +2094,6 @@
 		unp_scan(m, unp_discard);
 }
 
-static int
-unp_listen(struct socket *so, struct unpcb *unp, struct thread *td)
-{
-	int error;
-
-	UNP_LOCK_ASSERT();
-
-	SOCK_LOCK(so);
-	error = solisten_proto_check(so);
-	if (error == 0) {
-		cru2x(td->td_ucred, &unp->unp_peercred);
-		unp->unp_flags |= UNP_HAVEPCCACHED;
-		solisten_proto(so);
-	}
-	SOCK_UNLOCK(so);
-	return (error);
-}
-
 static void
 unp_scan(struct mbuf *m0, void (*op)(struct file *))
 {
@@ -1868,6 +2146,9 @@
 static void
 unp_mark(struct file *fp)
 {
+
+	/* XXXRW: Should probably assert file list lock here. */
+
 	if (fp->f_gcflag & FMARK)
 		return;
 	unp_defer++;
@@ -1877,11 +2158,128 @@
 static void
 unp_discard(struct file *fp)
 {
-	UNP_LOCK();
+
+	UNP_GLOBAL_WLOCK();
 	FILE_LOCK(fp);
 	fp->f_msgcount--;
 	unp_rights--;
 	FILE_UNLOCK(fp);
-	UNP_UNLOCK();
+	UNP_GLOBAL_WUNLOCK();
 	(void) closef(fp, (struct thread *)NULL);
 }
+
+#ifdef DDB
+static void
+db_print_indent(int indent)
+{
+	int i;
+
+	for (i = 0; i < indent; i++)
+		db_printf(" ");
+}
+
+static void
+db_print_unpflags(int unp_flags)
+{
+	int comma;
+
+	comma = 0;
+	if (unp_flags & UNP_HAVEPC) {
+		db_printf("%sUNP_HAVEPC", comma ? ", " : "");
+		comma = 1;
+	}
+	if (unp_flags & UNP_HAVEPCCACHED) {
+		db_printf("%sUNP_HAVEPCCACHED", comma ? ", " : "");
+		comma = 1;
+	}
+	if (unp_flags & UNP_WANTCRED) {
+		db_printf("%sUNP_WANTCRED", comma ? ", " : "");
+		comma = 1;
+	}
+	if (unp_flags & UNP_CONNWAIT) {
+		db_printf("%sUNP_CONNWAIT", comma ? ", " : "");
+		comma = 1;
+	}
+	if (unp_flags & UNP_CONNECTING) {
+		db_printf("%sUNP_CONNECTING", comma ? ", " : "");
+		comma = 1;
+	}
+	if (unp_flags & UNP_BINDING) {
+		db_printf("%sUNP_BINDING", comma ? ", " : "");
+		comma = 1;
+	}
+}
+
+static void
+db_print_xucred(int indent, struct xucred *xu)
+{
+	int comma, i;
+
+	db_print_indent(indent);
+	db_printf("cr_version: %u   cr_uid: %u   cr_ngroups: %d\n",
+	    xu->cr_version, xu->cr_uid, xu->cr_ngroups);
+	db_print_indent(indent);
+	db_printf("cr_groups: ");
+	comma = 0;
+	for (i = 0; i < xu->cr_ngroups; i++) {
+		db_printf("%s%u", comma ? ", " : "", xu->cr_groups[i]);
+		comma = 1;
+	}
+	db_printf("\n");
+}
+
+static void
+db_print_unprefs(int indent, struct unp_head *uh)
+{
+	struct unpcb *unp;
+	int counter;
+
+	counter = 0;
+	LIST_FOREACH(unp, uh, unp_reflink) {
+		if (counter % 4 == 0)
+			db_print_indent(indent);
+		db_printf("%p  ", unp);
+		if (counter % 4 == 3)
+			db_printf("\n");
+		counter++;
+	}
+	if (counter != 0 && counter % 4 != 0)
+		db_printf("\n");
+}
+
+DB_SHOW_COMMAND(unpcb, db_show_unpcb)
+{
+	struct unpcb *unp;
+
+        if (!have_addr) {
+                db_printf("usage: show unpcb <addr>\n");
+                return;
+        }
+        unp = (struct unpcb *)addr;
+
+	db_printf("unp_socket: %p   unp_vnode: %p\n", unp->unp_socket,
+	    unp->unp_vnode);
+
+	db_printf("unp_ino: %d   unp_conn: %p\n", unp->unp_ino,
+	    unp->unp_conn);
+
+	db_printf("unp_refs:\n");
+	db_print_unprefs(2, &unp->unp_refs);
+
+	/* XXXRW: Would be nice to print the full address, if any. */
+	db_printf("unp_addr: %p\n", unp->unp_addr);
+
+	db_printf("unp_cc: %d   unp_mbcnt: %d   unp_gencnt: %llu\n",
+	    unp->unp_cc, unp->unp_mbcnt,
+	    (unsigned long long)unp->unp_gencnt);
+
+	db_printf("unp_flags: %x (", unp->unp_flags);
+	db_print_unpflags(unp->unp_flags);
+	db_printf(")\n");
+
+	db_printf("unp_peercred:\n");
+	db_print_xucred(2, &unp->unp_peercred);
+
+	db_printf("unp_refcount: %u\n", unp->unp_refcount);
+}
+#endif
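
The uipc_usrreq.c hunks above replace the old single subsystem mutex with a global rwlock (the UNP_GLOBAL_* macros) plus one mutex per unpcb (the UNP_PCB_* macros).  The ordering that unp_connect() follows when it must hold two pcbs at once can be restated as a minimal sketch, not part of the commit: connect_pair_sketch is a hypothetical name, everything else comes from the diff, and all error handling is omitted.

static void
connect_pair_sketch(struct socket *so, struct socket *so2)
{
	struct unpcb *unp = sotounpcb(so);
	struct unpcb *unp2 = sotounpcb(so2);

	UNP_GLOBAL_WLOCK();		/* keeps v_socket and pcb linkage stable */
	UNP_PCB_LOCK(unp);		/* per-pcb mutexes nest inside the global lock */
	UNP_PCB_LOCK(unp2);
	/* ... unp_connect2(so, so2, PRU_CONNECT) runs at this point ... */
	UNP_PCB_UNLOCK(unp2);
	UNP_PCB_UNLOCK(unp);
	UNP_GLOBAL_WUNLOCK();
}
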
Index: kern_poll.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_poll.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/kern_poll.c -L sys/kern/kern_poll.c -u -r1.2 -r1.3
--- sys/kern/kern_poll.c
+++ sys/kern/kern_poll.c
@@ -26,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_poll.c,v 1.19.2.2 2005/10/07 14:00:05 glebius Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_poll.c,v 1.31 2007/08/06 14:26:00 rwatson Exp $");
 
 #include "opt_device_polling.h"
 
@@ -113,7 +113,7 @@
 	uint32_t val = poll_burst_max;
 	int error;
 
-	error = sysctl_handle_int(oidp, &val, sizeof(int), req);
+	error = sysctl_handle_int(oidp, &val, 0, req);
 	if (error || !req->newptr )
 		return (error);
 	if (val < MIN_POLL_BURST_MAX || val > MAX_POLL_BURST_MAX)
@@ -137,7 +137,7 @@
 	uint32_t val = poll_each_burst;
 	int error;
 
-	error = sysctl_handle_int(oidp, &val, sizeof(int), req);
+	error = sysctl_handle_int(oidp, &val, 0, req);
 	if (error || !req->newptr )
 		return (error);
 	if (val < 1)
@@ -167,7 +167,7 @@
 	uint32_t val = user_frac;
 	int error;
 
-	error = sysctl_handle_int(oidp, &val, sizeof(int), req);
+	error = sysctl_handle_int(oidp, &val, 0, req);
 	if (error || !req->newptr )
 		return (error);
 	if (val < 0 || val > 99)
@@ -190,7 +190,7 @@
 	uint32_t val = reg_frac;
 	int error;
 
-	error = sysctl_handle_int(oidp, &val, sizeof(int), req);
+	error = sysctl_handle_int(oidp, &val, 0, req);
 	if (error || !req->newptr )
 		return (error);
 	if (val < 1 || val > hz)
@@ -329,7 +329,6 @@
 {
 	int i;
 
-	NET_LOCK_GIANT();
 	mtx_lock(&poll_mtx);
 
 	if (count > poll_each_burst)
@@ -339,7 +338,6 @@
 		pr[i].handler(pr[i].ifp, POLL_ONLY, count);
 
 	mtx_unlock(&poll_mtx);
-	NET_UNLOCK_GIANT();
 }
 
 /*
@@ -366,8 +364,6 @@
 	struct timeval t;
 	int kern_load;
 
-	NET_ASSERT_GIANT();
-
 	mtx_lock(&poll_mtx);
 	phase = 5;
 	if (residual_burst > 0) {
@@ -417,8 +413,6 @@
 	int i, cycles;
 	enum poll_cmd arg = POLL_ONLY;
 
-	NET_ASSERT_GIANT();
-
 	mtx_lock(&poll_mtx);
 	phase = 3;
 	if (residual_burst == 0) { /* first call in this tick */
@@ -456,8 +450,6 @@
 	KASSERT(h != NULL, ("%s: handler is NULL", __func__));
 	KASSERT(ifp != NULL, ("%s: ifp is NULL", __func__));
 
-	NET_ASSERT_GIANT();
-
 	mtx_lock(&poll_mtx);
 	if (poll_handlers >= POLL_LIST_LEN) {
 		/*
@@ -504,7 +496,6 @@
 
 	KASSERT(ifp != NULL, ("%s: ifp is NULL", __func__));
 
-	NET_ASSERT_GIANT();
 	mtx_lock(&poll_mtx);
 
 	for (i = 0 ; i < poll_handlers ; i++)
@@ -535,7 +526,7 @@
 	int error;
 	int val = polling;
 
-	error = sysctl_handle_int(oidp, &val, sizeof(int), req);
+	error = sysctl_handle_int(oidp, &val, 0, req);
 	if (error || !req->newptr )
 		return (error);
 
@@ -547,7 +538,6 @@
 
 	polling = val;
 
-	NET_LOCK_GIANT();
 	IFNET_RLOCK();
 	TAILQ_FOREACH(ifp, &ifnet, if_link) {
 		if (ifp->if_capabilities & IFCAP_POLLING) {
@@ -565,7 +555,6 @@
 		}
 	}
 	IFNET_RUNLOCK();
-	NET_UNLOCK_GIANT();
 
 	log(LOG_ERR, "kern.polling.enable is deprecated. Use ifconfig(8)");
 
@@ -580,17 +569,17 @@
 
 	rtp.prio = RTP_PRIO_MAX;	/* lowest priority */
 	rtp.type = RTP_PRIO_IDLE;
-	mtx_lock_spin(&sched_lock);
-	rtp_to_pri(&rtp, td->td_ksegrp);
-	mtx_unlock_spin(&sched_lock);
+	PROC_SLOCK(td->td_proc);
+	rtp_to_pri(&rtp, td);
+	PROC_SUNLOCK(td->td_proc);
 
 	for (;;) {
 		if (poll_in_idle_loop && poll_handlers > 0) {
 			idlepoll_sleeping = 0;
 			ether_poll(poll_each_burst);
-			mtx_lock_spin(&sched_lock);
+			thread_lock(td);
 			mi_switch(SW_VOL, NULL);
-			mtx_unlock_spin(&sched_lock);
+			thread_unlock(td);
 		} else {
 			idlepoll_sleeping = 1;
 			tsleep(&idlepoll_sleeping, 0, "pollid", hz * 3);
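
The kern_poll.c hunks above change every sysctl_handle_int() call to pass 0 as the third argument, so the handler works through the supplied pointer instead of arg2.  A minimal sketch of the resulting handler shape, assuming a hypothetical tunable named example_burst (poll_mtx is the real mutex from kern_poll.c):

static uint32_t example_burst = 5;	/* hypothetical tunable */

static int
example_burst_sysctl(SYSCTL_HANDLER_ARGS)
{
	uint32_t val = example_burst;
	int error;

	/* arg2 == 0: sysctl_handle_int() reads and writes through &val. */
	error = sysctl_handle_int(oidp, &val, 0, req);
	if (error || req->newptr == NULL)
		return (error);
	if (val < 1)
		return (EINVAL);
	mtx_lock(&poll_mtx);
	example_burst = val;
	mtx_unlock(&poll_mtx);
	return (0);
}
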
Index: subr_taskqueue.c
===================================================================
RCS file: /home/cvs/src/sys/kern/subr_taskqueue.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/subr_taskqueue.c -L sys/kern/subr_taskqueue.c -u -r1.2 -r1.3
--- sys/kern/subr_taskqueue.c
+++ sys/kern/subr_taskqueue.c
@@ -25,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: /repoman/r/ncvs/src/sys/kern/subr_taskqueue.c,v 1.27.2.4 2006/07/06 08:32:50 glebius Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/subr_taskqueue.c,v 1.39 2007/06/05 00:00:54 jeff Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -131,10 +131,8 @@
 
 struct taskqueue *
 taskqueue_create(const char *name, int mflags,
-		 taskqueue_enqueue_fn enqueue, void *context,
-		 struct proc **pp)
+		 taskqueue_enqueue_fn enqueue, void *context)
 {
-	(void) pp;
 	return _taskqueue_create(name, mflags, enqueue, context,
 			MTX_DEF, "taskqueue");
 }
@@ -317,32 +315,48 @@
 {
 	va_list ap;
 	struct taskqueue *tq;
+	struct thread *td;
 	char ktname[MAXCOMLEN];
-	int i;
+	int i, error;
 
 	if (count <= 0)
 		return (EINVAL);
 	tq = *tqp;
 
-	if ((tq->tq_pproc = malloc(sizeof(struct proc *) * count, M_TASKQUEUE,
-	    M_NOWAIT | M_ZERO)) == NULL)
-		return (ENOMEM);
-	
 	va_start(ap, name);
 	vsnprintf(ktname, MAXCOMLEN, name, ap);
 	va_end(ap);
 
+	tq->tq_pproc = malloc(sizeof(struct proc *) * count, M_TASKQUEUE,
+	    M_NOWAIT | M_ZERO);
+	if (tq->tq_pproc == NULL) {
+		printf("%s: no memory for %s threads\n", __func__, ktname);
+		return (ENOMEM);
+	}
+
 	for (i = 0; i < count; i++) {
 		if (count == 1)
-			kthread_create(taskqueue_thread_loop, tqp,
-			    &tq->tq_pproc[i], 0, 0, ktname);
+			error = kthread_create(taskqueue_thread_loop, tqp,
+			    &tq->tq_pproc[i], RFSTOPPED, 0, ktname);
 		else
-			kthread_create(taskqueue_thread_loop, tqp,
-			    &tq->tq_pproc[i], 0, 0, "%s_%d", ktname, i);
-		mtx_lock_spin(&sched_lock);
-		sched_prio(FIRST_THREAD_IN_PROC(tq->tq_pproc[i]), pri);
-		mtx_unlock_spin(&sched_lock);
-		tq->tq_pcount++;
+			error = kthread_create(taskqueue_thread_loop, tqp,
+			    &tq->tq_pproc[i], RFSTOPPED, 0, "%s_%d", ktname, i);
+		if (error) {
+			/* should be ok to continue, taskqueue_free will dtrt */
+			printf("%s: kthread_create(%s): error %d\n",
+				__func__, ktname, error);
+			tq->tq_pproc[i] = NULL;		/* paranoid */
+		} else
+			tq->tq_pcount++;
+	}
+	for (i = 0; i < count; i++) {
+		if (tq->tq_pproc[i] == NULL)
+			continue;
+		td = FIRST_THREAD_IN_PROC(tq->tq_pproc[i]);
+		thread_lock(td);
+		sched_prio(td, pri);
+		sched_add(td, SRQ_BORING);
+		thread_unlock(td);
 	}
 
 	return (0);
@@ -358,7 +372,7 @@
 	TQ_LOCK(tq);
 	do {
 		taskqueue_run(tq);
-		TQ_SLEEP(tq, tq, &tq->tq_mutex, curthread->td_priority, "-", 0);
+		TQ_SLEEP(tq, tq, &tq->tq_mutex, 0, "-", 0);
 	} while ((tq->tq_flags & TQ_FLAGS_ACTIVE) != 0);
 
 	/* rendezvous with thread that asked us to terminate */
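
The subr_taskqueue.c hunk above now creates the queue threads with RFSTOPPED and only hands them to the scheduler after their priority is set, so a worker can never run at the default priority first.  The same idiom in isolation, as a sketch (start_worker_sketch, worker_main, arg and pri are placeholders; the calls are the ones used in the diff):

static int
start_worker_sketch(void (*worker_main)(void *), void *arg, int pri)
{
	struct proc *p;
	struct thread *td;
	int error;

	/* Create the kernel thread stopped so it cannot run yet. */
	error = kthread_create(worker_main, arg, &p, RFSTOPPED, 0, "worker");
	if (error)
		return (error);
	td = FIRST_THREAD_IN_PROC(p);
	thread_lock(td);
	sched_prio(td, pri);		/* fix the priority first ... */
	sched_add(td, SRQ_BORING);	/* ... then make it runnable */
	thread_unlock(td);
	return (0);
}
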
Index: vfs_cache.c
===================================================================
RCS file: /home/cvs/src/sys/kern/vfs_cache.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/vfs_cache.c -L sys/kern/vfs_cache.c -u -r1.2 -r1.3
--- sys/kern/vfs_cache.c
+++ sys/kern/vfs_cache.c
@@ -33,7 +33,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/vfs_cache.c,v 1.103.2.1 2006/03/13 03:06:14 jeff Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/vfs_cache.c,v 1.114 2007/09/21 10:16:56 pjd Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -141,8 +141,8 @@
 SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, "");
 
 /* Export size information to userland */
-SYSCTL_INT(_debug, OID_AUTO, vnsize, CTLFLAG_RD, 0, sizeof(struct vnode), "");
-SYSCTL_INT(_debug, OID_AUTO, ncsize, CTLFLAG_RD, 0, sizeof(struct namecache), "");
+SYSCTL_INT(_debug_sizeof, OID_AUTO, namecache, CTLFLAG_RD, 0,
+	sizeof(struct namecache), "");
 
 /*
  * The new name cache statistics
@@ -293,37 +293,6 @@
 }
 
 /*
- * cache_leaf_test()
- *
- *      Test whether this (directory) vnode's namei cache entry contains
- *      subdirectories or not.  Used to determine whether the directory is
- *      a leaf in the namei cache or not.  Note: the directory may still
- *      contain files in the namei cache.
- *
- *      Returns 0 if the directory is a leaf, -1 if it isn't.
- */
-int
-cache_leaf_test(struct vnode *vp)
-{
-	struct namecache *ncpc;
-	int leaf;
-
-	leaf = 0;
-	CACHE_LOCK();
-	for (ncpc = LIST_FIRST(&vp->v_cache_src);
-	     ncpc != NULL;
-	     ncpc = LIST_NEXT(ncpc, nc_src)
-	 ) {
-		if (ncpc->nc_vp != NULL && ncpc->nc_vp->v_type == VDIR) {
-			leaf = -1;
-			break;
-		}
-	}
-	CACHE_UNLOCK();
-	return (leaf);
-}
-
-/*
  * Lookup an entry in the cache
  *
  * Lookup is called with dvp pointing to the directory to search,
@@ -345,13 +314,15 @@
 	struct componentname *cnp;
 {
 	struct namecache *ncp;
+	struct thread *td;
 	u_int32_t hash;
-	int error;
+	int error, ltype;
 
 	if (!doingcache) {
 		cnp->cn_flags &= ~MAKEENTRY;
 		return (0);
 	}
+	td = cnp->cn_thread;
 retry:
 	CACHE_LOCK();
 	numcalls++;
@@ -450,15 +421,29 @@
 	if (dvp == *vpp) {   /* lookup on "." */
 		VREF(*vpp);
 		CACHE_UNLOCK();
+		/*
+		 * When we lookup "." we still can be asked to lock it
+		 * differently...
+		 */
+		ltype = cnp->cn_lkflags & (LK_SHARED | LK_EXCLUSIVE);
+		if (ltype == VOP_ISLOCKED(*vpp, td))
+			return (-1);
+		else if (ltype == LK_EXCLUSIVE)
+			vn_lock(*vpp, LK_UPGRADE | LK_RETRY, td);
 		return (-1);
 	}
-	if (cnp->cn_flags & ISDOTDOT)
-		VOP_UNLOCK(dvp, 0, cnp->cn_thread);
+	ltype = 0;	/* silence gcc warning */
+	if (cnp->cn_flags & ISDOTDOT) {
+		ltype = VOP_ISLOCKED(dvp, td);
+		VOP_UNLOCK(dvp, 0, td);
+	}
 	VI_LOCK(*vpp);
 	CACHE_UNLOCK();
-	error = vget(*vpp, cnp->cn_lkflags | LK_INTERLOCK, cnp->cn_thread);
+	error = vget(*vpp, cnp->cn_lkflags | LK_INTERLOCK, td);
 	if (cnp->cn_flags & ISDOTDOT)
-		vn_lock(dvp, LK_EXCLUSIVE | LK_RETRY, cnp->cn_thread);
+		vn_lock(dvp, ltype | LK_RETRY, td);
+	if ((cnp->cn_flags & ISLASTCN) && (cnp->cn_lkflags & LK_EXCLUSIVE))
+		ASSERT_VOP_ELOCKED(*vpp, "cache_lookup");
 	if (error) {
 		*vpp = NULL;
 		goto retry;
@@ -601,9 +586,6 @@
 
 /*
  * Flush all entries referencing a particular filesystem.
- *
- * Since we need to check it anyway, we will flush all the invalid
- * entries at the same time.
  */
 void
 cache_purgevfs(mp)
@@ -611,24 +593,15 @@
 {
 	struct nchashhead *ncpp;
 	struct namecache *ncp, *nnp;
-	struct nchashhead mplist;
-
-	LIST_INIT(&mplist);
-	ncp = NULL;
 
 	/* Scan hash tables for applicable entries */
 	CACHE_LOCK();
 	for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) {
-		for (ncp = LIST_FIRST(ncpp); ncp != 0; ncp = nnp) {
-			nnp = LIST_NEXT(ncp, nc_hash);
-			if (ncp->nc_dvp->v_mount == mp) {
-				LIST_REMOVE(ncp, nc_hash);
-				LIST_INSERT_HEAD(&mplist, ncp, nc_hash);
-			}
+		LIST_FOREACH_SAFE(ncp, ncpp, nc_hash, nnp) {
+			if (ncp->nc_dvp->v_mount == mp)
+				cache_zap(ncp);
 		}
 	}
-	while (!LIST_EMPTY(&mplist))
-		cache_zap(LIST_FIRST(&mplist));
 	CACHE_UNLOCK();
 }
 
@@ -690,7 +663,7 @@
 SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0,
    "Disable the getcwd syscall");
 
-/* Implementation of the getcwd syscall */
+/* Implementation of the getcwd syscall. */
 int
 __getcwd(td, uap)
 	struct thread *td;
@@ -717,10 +690,10 @@
 	tmpbuf = malloc(buflen, M_TEMP, M_WAITOK);
 	fdp = td->td_proc->p_fd;
 	mtx_lock(&Giant);
-	FILEDESC_LOCK(fdp);
+	FILEDESC_SLOCK(fdp);
 	error = vn_fullpath1(td, fdp->fd_cdir, fdp->fd_rdir, tmpbuf,
 	    &bp, buflen);
-	FILEDESC_UNLOCK(fdp);
+	FILEDESC_SUNLOCK(fdp);
 	mtx_unlock(&Giant);
 
 	if (!error) {
@@ -771,11 +744,9 @@
 
 	buf = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
 	fdp = td->td_proc->p_fd;
-	mtx_lock(&Giant);
-	FILEDESC_LOCK(fdp);
+	FILEDESC_SLOCK(fdp);
 	error = vn_fullpath1(td, vn, fdp->fd_rdir, buf, retbuf, MAXPATHLEN);
-	FILEDESC_UNLOCK(fdp);
-	mtx_unlock(&Giant);
+	FILEDESC_SUNLOCK(fdp);
 
 	if (!error)
 		*freebuf = buf;
@@ -795,8 +766,6 @@
 	int error, i, slash_prefixed;
 	struct namecache *ncp;
 
-	mtx_assert(&Giant, MA_OWNED);
-
 	bp = buf + buflen - 1;
 	*bp = '\0';
 	error = 0;
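
The cache_lookup() changes above remember how the parent directory was locked before dropping it for a ".." lookup and restore that lock type afterwards, rather than always re-locking it exclusive.  Stripped of the name-cache bookkeeping, the pattern looks like this (a sketch only; dotdot_lookup_sketch is a hypothetical name, the vnode calls are the ones used in the diff):

static int
dotdot_lookup_sketch(struct vnode *dvp, struct vnode **vpp,
    struct componentname *cnp, struct thread *td)
{
	int error, ltype;

	ltype = VOP_ISLOCKED(dvp, td);	/* remember the caller's lock type */
	VOP_UNLOCK(dvp, 0, td);		/* drop the parent before locking the child */
	VI_LOCK(*vpp);
	error = vget(*vpp, cnp->cn_lkflags | LK_INTERLOCK, td);
	vn_lock(dvp, ltype | LK_RETRY, td);	/* restore the original lock type */
	return (error);
}
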
Index: kern_pmc.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_pmc.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/kern_pmc.c -L sys/kern/kern_pmc.c -u -r1.1.1.1 -r1.2
--- sys/kern/kern_pmc.c
+++ sys/kern/kern_pmc.c
@@ -24,7 +24,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_pmc.c,v 1.4.2.1 2005/08/15 18:46:12 jkoshy Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_pmc.c,v 1.6 2005/12/04 02:12:43 ru Exp $");
 
 #include "opt_hwpmc_hooks.h"
 
@@ -33,7 +33,7 @@
 #include <sys/pmckern.h>
 #include <sys/smp.h>
 
-#if	HWPMC_HOOKS
+#ifdef	HWPMC_HOOKS
 #define	PMC_KERNEL_VERSION	PMC_VERSION
 #else
 #define	PMC_KERNEL_VERSION	0
Index: sys_process.c
===================================================================
RCS file: /home/cvs/src/sys/kern/sys_process.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/sys_process.c -L sys/kern/sys_process.c -u -r1.2 -r1.3
--- sys/kern/sys_process.c
+++ sys/kern/sys_process.c
@@ -30,7 +30,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/sys_process.c,v 1.131.2.3 2006/03/07 18:08:09 jhb Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/sys_process.c,v 1.145 2007/10/09 00:03:39 jeff Exp $");
 
 #include "opt_compat.h"
 
@@ -49,6 +49,8 @@
 
 #include <machine/reg.h>
 
+#include <security/audit/audit.h>
+
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_extern.h>
@@ -102,7 +104,7 @@
 	int error;							\
 									\
 	PROC_LOCK_ASSERT(td->td_proc, MA_OWNED);			\
-	if ((td->td_proc->p_sflag & PS_INMEM) == 0)			\
+	if ((td->td_proc->p_flag & P_INMEM) == 0)			\
 		error = EIO;						\
 	else								\
 		error = (action);					\
@@ -366,9 +368,6 @@
 #define	COPYIN(u, k, s)		copyin(u, k, s)
 #define	COPYOUT(k, u, s)	copyout(k, u, s)
 #endif
-/*
- * MPSAFE
- */
 int
 ptrace(struct thread *td, struct ptrace_args *uap)
 {
@@ -397,6 +396,10 @@
 	if (td->td_proc->p_sysent == &ia32_freebsd_sysvec)
 		wrap32 = 1;
 #endif
+	AUDIT_ARG(pid, uap->pid);
+	AUDIT_ARG(cmd, uap->req);
+	AUDIT_ARG(addr, uap->addr);
+	AUDIT_ARG(value, uap->data);
 	addr = &r;
 	switch (uap->req) {
 	case PT_GETREGS:
@@ -524,12 +527,12 @@
 			sx_slock(&allproc_lock);
 			FOREACH_PROC_IN_SYSTEM(p) {
 				PROC_LOCK(p);
-				mtx_lock_spin(&sched_lock);
+				PROC_SLOCK(p);
 				FOREACH_THREAD_IN_PROC(p, td2) {
 					if (td2->td_tid == pid)
 						break;
 				}
-				mtx_unlock_spin(&sched_lock);
+				PROC_SUNLOCK(p);
 				if (td2 != NULL)
 					break; /* proc lock held */
 				PROC_UNLOCK(p);
@@ -544,6 +547,7 @@
 			pid = p->p_pid;
 		}
 	}
+	AUDIT_ARG(process, p);
 
 	if ((p->p_flag & P_WEXIT) != 0) {
 		error = ESRCH;
@@ -697,15 +701,15 @@
 		break;
 
 	case PT_SUSPEND:
-		mtx_lock_spin(&sched_lock);
+		thread_lock(td2);
 		td2->td_flags |= TDF_DBSUSPEND;
-		mtx_unlock_spin(&sched_lock);
+		thread_unlock(td2);
 		break;
 
 	case PT_RESUME:
-		mtx_lock_spin(&sched_lock);
+		thread_lock(td2);
 		td2->td_flags &= ~TDF_DBSUSPEND;
-		mtx_unlock_spin(&sched_lock);
+		thread_unlock(td2);
 		break;
 
 	case PT_STEP:
@@ -748,6 +752,10 @@
 			if (p->p_oppid != p->p_pptr->p_pid) {
 				struct proc *pp;
 
+				PROC_LOCK(p->p_pptr);
+				sigqueue_take(p->p_ksi);
+				PROC_UNLOCK(p->p_pptr);
+
 				PROC_UNLOCK(p);
 				pp = pfind(p->p_oppid);
 				if (pp == NULL)
@@ -763,6 +771,7 @@
 			p->p_oppid = 0;
 
 			/* should we send SIGCHLD? */
+			/* childproc_continued(p); */
 		}
 
 	sendsig:
@@ -770,36 +779,41 @@
 			sx_xunlock(&proctree_lock);
 			proctree_locked = 0;
 		}
-		/* deliver or queue signal */
-		mtx_lock_spin(&sched_lock);
-		td2->td_flags &= ~TDF_XSIG;
-		mtx_unlock_spin(&sched_lock);
-		td2->td_xsig = data;
 		p->p_xstat = data;
 		p->p_xthread = NULL;
 		if ((p->p_flag & (P_STOPPED_SIG | P_STOPPED_TRACE)) != 0) {
-			mtx_lock_spin(&sched_lock);
+			/* deliver or queue signal */
+			thread_lock(td2);
+			td2->td_flags &= ~TDF_XSIG;
+			thread_unlock(td2);
+			td2->td_xsig = data;
+
+			PROC_SLOCK(p);
 			if (req == PT_DETACH) {
 				struct thread *td3;
-				FOREACH_THREAD_IN_PROC(p, td3)
+				FOREACH_THREAD_IN_PROC(p, td3) {
+					thread_lock(td3);
 					td3->td_flags &= ~TDF_DBSUSPEND; 
+					thread_unlock(td3);
+				}
 			}
 			/*
 			 * unsuspend all threads, to not let a thread run,
 			 * you should use PT_SUSPEND to suspend it before
 			 * continuing process.
 			 */
-			mtx_unlock_spin(&sched_lock);
+#ifdef KSE
+			PROC_SUNLOCK(p);
 			thread_continued(p);
+			PROC_SLOCK(p);
+#endif
 			p->p_flag &= ~(P_STOPPED_TRACE|P_STOPPED_SIG|P_WAITED);
-			mtx_lock_spin(&sched_lock);
 			thread_unsuspend(p);
-			mtx_unlock_spin(&sched_lock);
+			PROC_SUNLOCK(p);
+		} else {
+			if (data)
+				psignal(p, data);
 		}
-
-		if (data)
-			psignal(p, data);
-
 		break;
 
 	case PT_WRITE_I:
@@ -918,7 +932,7 @@
 		break;
 
 	case PT_LWPINFO:
-		if (data == 0 || data > sizeof(*pl)) {
+		if (data <= 0 || data > sizeof(*pl)) {
 			error = EINVAL;
 			break;
 		}
@@ -928,6 +942,7 @@
 			pl->pl_event = PL_EVENT_SIGNAL;
 		else
 			pl->pl_event = 0;
+#ifdef KSE
 		if (td2->td_pflags & TDP_SA) {
 			pl->pl_flags = PL_FLAG_SA;
 			if (td2->td_upcall && !TD_CAN_UNBIND(td2))
@@ -935,6 +950,11 @@
 		} else {
 			pl->pl_flags = 0;
 		}
+#else
+		pl->pl_flags = 0;
+#endif
+		pl->pl_sigmask = td2->td_sigmask;
+		pl->pl_siglist = td2->td_siglist;
 		break;
 
 	case PT_GETNUMLWPS:
@@ -951,18 +971,18 @@
 		buf = malloc(num * sizeof(lwpid_t), M_TEMP, M_WAITOK);
 		tmp = 0;
 		PROC_LOCK(p);
-		mtx_lock_spin(&sched_lock);
+		PROC_SLOCK(p);
 		FOREACH_THREAD_IN_PROC(p, td2) {
 			if (tmp >= num)
 				break;
 			buf[tmp++] = td2->td_tid;
 		}
-		mtx_unlock_spin(&sched_lock);
+		PROC_SUNLOCK(p);
 		PROC_UNLOCK(p);
 		error = copyout(buf, addr, tmp * sizeof(lwpid_t));
 		free(buf, M_TEMP);
 		if (!error)
-			td->td_retval[0] = num;
+			td->td_retval[0] = tmp;
 		PROC_LOCK(p);
 		break;
 
Index: sysv_sem.c
===================================================================
RCS file: /home/cvs/src/sys/kern/sysv_sem.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/sysv_sem.c -L sys/kern/sysv_sem.c -u -r1.1.1.1 -r1.2
--- sys/kern/sysv_sem.c
+++ sys/kern/sysv_sem.c
@@ -37,7 +37,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/sysv_sem.c,v 1.78 2005/06/07 05:03:27 rwatson Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/sysv_sem.c,v 1.89 2007/07/03 15:58:47 kib Exp $");
 
 #include "opt_sysvipc.h"
 #include "opt_mac.h"
@@ -53,11 +53,14 @@
 #include <sys/mutex.h>
 #include <sys/sem.h>
 #include <sys/syscall.h>
+#include <sys/syscallsubr.h>
 #include <sys/sysent.h>
 #include <sys/sysctl.h>
+#include <sys/uio.h>
 #include <sys/malloc.h>
 #include <sys/jail.h>
-#include <sys/mac.h>
+
+#include <security/mac/mac_framework.h>
 
 static MALLOC_DEFINE(M_SEM, "sem", "SVID compatible semaphores");
 
@@ -66,11 +69,6 @@
 #else
 #define DPRINTF(a)
 #endif
-#ifdef MAC_DEBUG
-#define MPRINTF(a)      printf a
-#else
-#define MPRINTF(a)
-#endif
 
 static void seminit(void);
 static int sysvsem_modload(struct module *, int, void *);
@@ -196,7 +194,6 @@
                 SEMAEM          /* adjust on exit max value */
 };
 
-SYSCTL_DECL(_kern_ipc);
 SYSCTL_INT(_kern_ipc, OID_AUTO, semmap, CTLFLAG_RW, &seminfo.semmap, 0,
     "Number of entries in the semaphore map");
 SYSCTL_INT(_kern_ipc, OID_AUTO, semmni, CTLFLAG_RDTUN, &seminfo.semmni, 0,
@@ -322,9 +319,7 @@
 MODULE_VERSION(sysvsem, 1);
 
 /*
- * Entry point for all SEM calls
- *
- * MPSAFE
+ * Entry point for all SEM calls.
  */
 int
 semsys(td, uap)
@@ -536,7 +531,7 @@
 }
 
 /*
- * Note that the user-mode half of this passes a union, not a pointer
+ * Note that the user-mode half of this passes a union, not a pointer.
  */
 #ifndef _SYS_SYSPROTO_H_
 struct __semctl_args {
@@ -546,29 +541,80 @@
 	union	semun *arg;
 };
 #endif
-
-/*
- * MPSAFE
- */
 int
 __semctl(td, uap)
 	struct thread *td;
 	struct __semctl_args *uap;
 {
-	int semid = uap->semid;
-	int semnum = uap->semnum;
-	int cmd = uap->cmd;
+	struct semid_ds dsbuf;
+	union semun arg, semun;
+	register_t rval;
+	int error;
+
+	switch (uap->cmd) {
+	case SEM_STAT:
+	case IPC_SET:
+	case IPC_STAT:
+	case GETALL:
+	case SETVAL:
+	case SETALL:
+		error = copyin(uap->arg, &arg, sizeof(arg));
+		if (error)
+			return (error);
+		break;
+	}
+
+	switch (uap->cmd) {
+	case SEM_STAT:
+	case IPC_STAT:
+		semun.buf = &dsbuf;
+		break;
+	case IPC_SET:
+		error = copyin(arg.buf, &dsbuf, sizeof(dsbuf));
+		if (error)
+			return (error);
+		semun.buf = &dsbuf;
+		break;
+	case GETALL:
+	case SETALL:
+		semun.array = arg.array;
+		break;
+	case SETVAL:
+		semun.val = arg.val;
+		break;		
+	}
+
+	error = kern_semctl(td, uap->semid, uap->semnum, uap->cmd, &semun,
+	    &rval);
+	if (error)
+		return (error);
+
+	switch (uap->cmd) {
+	case SEM_STAT:
+	case IPC_STAT:
+		error = copyout(&dsbuf, arg.buf, sizeof(dsbuf));
+		break;
+	}
+
+	if (error == 0)
+		td->td_retval[0] = rval;
+	return (error);
+}
+
+int
+kern_semctl(struct thread *td, int semid, int semnum, int cmd,
+    union semun *arg, register_t *rval)
+{
 	u_short *array;
-	union semun *arg = uap->arg;
-	union semun real_arg;
 	struct ucred *cred = td->td_ucred;
-	int i, rval, error;
-	struct semid_ds sbuf;
+	int i, error;
+	struct semid_ds *sbuf;
 	struct semid_kernel *semakptr;
 	struct mtx *sema_mtxp;
 	u_short usval, count;
+	int semidx;
 
-	DPRINTF(("call to semctl(%d, %d, %d, 0x%x)\n",
+	DPRINTF(("call to semctl(%d, %d, %d, 0x%p)\n",
 	    semid, semnum, cmd, arg));
 	if (!jail_sysvipc_allowed && jailed(td->td_ucred))
 		return (ENOSYS);
@@ -577,10 +623,12 @@
 
 	switch(cmd) {
 	case SEM_STAT:
+		/*
+		 * For this command we assume semid is an array index
+		 * rather than an IPC id.
+		 */
 		if (semid < 0 || semid >= seminfo.semmni)
 			return (EINVAL);
-		if ((error = copyin(arg, &real_arg, sizeof(real_arg))) != 0)
-			return (error);
 		semakptr = &sema[semid];
 		sema_mtxp = &sema_mtx[semid];
 		mtx_lock(sema_mtxp);
@@ -592,45 +640,34 @@
 			goto done2;
 #ifdef MAC
 		error = mac_check_sysv_semctl(cred, semakptr, cmd);
-		if (error != 0) {
-			MPRINTF(("mac_check_sysv_semctl returned %d\n",
-			    error));
+		if (error != 0)
 			goto done2;
-		}
 #endif
+		bcopy(&semakptr->u, arg->buf, sizeof(struct semid_ds));
+		*rval = IXSEQ_TO_IPCID(semid, semakptr->u.sem_perm);
 		mtx_unlock(sema_mtxp);
-		error = copyout(&semakptr->u, real_arg.buf,
-		    sizeof(struct semid_ds));
-		rval = IXSEQ_TO_IPCID(semid, semakptr->u.sem_perm);
-		if (error == 0)
-			td->td_retval[0] = rval;
-		return (error);
+		return (0);
 	}
 
-	semid = IPCID_TO_IX(semid);
-	if (semid < 0 || semid >= seminfo.semmni)
+	semidx = IPCID_TO_IX(semid);
+	if (semidx < 0 || semidx >= seminfo.semmni)
 		return (EINVAL);
 
-	semakptr = &sema[semid];
-	sema_mtxp = &sema_mtx[semid];
-#ifdef MAC
+	semakptr = &sema[semidx];
+	sema_mtxp = &sema_mtx[semidx];
 	mtx_lock(sema_mtxp);
+#ifdef MAC
 	error = mac_check_sysv_semctl(cred, semakptr, cmd);
-	if (error != 0) {
-		MPRINTF(("mac_check_sysv_semctl returned %d\n", error));
-		mtx_unlock(sema_mtxp);
-		return (error);
-	}
-	mtx_unlock(sema_mtxp);
+	if (error != 0)
+		goto done2;
 #endif
 
 	error = 0;
-	rval = 0;
+	*rval = 0;
 
 	switch (cmd) {
 	case IPC_RMID:
-		mtx_lock(sema_mtxp);
-		if ((error = semvalid(uap->semid, semakptr)) != 0)
+		if ((error = semvalid(semid, semakptr)) != 0)
 			goto done2;
 		if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_M)))
 			goto done2;
@@ -649,45 +686,34 @@
 		mac_cleanup_sysv_sem(semakptr);
 #endif
 		SEMUNDO_LOCK();
-		semundo_clear(semid, -1);
+		semundo_clear(semidx, -1);
 		SEMUNDO_UNLOCK();
 		wakeup(semakptr);
 		break;
 
 	case IPC_SET:
-		if ((error = copyin(arg, &real_arg, sizeof(real_arg))) != 0)
-			goto done2;
-		if ((error = copyin(real_arg.buf, &sbuf, sizeof(sbuf))) != 0)
-			goto done2;
-		mtx_lock(sema_mtxp);
-		if ((error = semvalid(uap->semid, semakptr)) != 0)
+		if ((error = semvalid(semid, semakptr)) != 0)
 			goto done2;
 		if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_M)))
 			goto done2;
-		semakptr->u.sem_perm.uid = sbuf.sem_perm.uid;
-		semakptr->u.sem_perm.gid = sbuf.sem_perm.gid;
+		sbuf = arg->buf;
+		semakptr->u.sem_perm.uid = sbuf->sem_perm.uid;
+		semakptr->u.sem_perm.gid = sbuf->sem_perm.gid;
 		semakptr->u.sem_perm.mode = (semakptr->u.sem_perm.mode &
-		    ~0777) | (sbuf.sem_perm.mode & 0777);
+		    ~0777) | (sbuf->sem_perm.mode & 0777);
 		semakptr->u.sem_ctime = time_second;
 		break;
 
 	case IPC_STAT:
-		if ((error = copyin(arg, &real_arg, sizeof(real_arg))) != 0)
-			goto done2;
-		mtx_lock(sema_mtxp);
-		if ((error = semvalid(uap->semid, semakptr)) != 0)
+		if ((error = semvalid(semid, semakptr)) != 0)
 			goto done2;
 		if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R)))
 			goto done2;
-		sbuf = semakptr->u;
-		mtx_unlock(sema_mtxp);
-		error = copyout(&semakptr->u, real_arg.buf,
-				sizeof(struct semid_ds));
+		bcopy(&semakptr->u, arg->buf, sizeof(struct semid_ds));
 		break;
 
 	case GETNCNT:
-		mtx_lock(sema_mtxp);
-		if ((error = semvalid(uap->semid, semakptr)) != 0)
+		if ((error = semvalid(semid, semakptr)) != 0)
 			goto done2;
 		if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R)))
 			goto done2;
@@ -695,12 +721,11 @@
 			error = EINVAL;
 			goto done2;
 		}
-		rval = semakptr->u.sem_base[semnum].semncnt;
+		*rval = semakptr->u.sem_base[semnum].semncnt;
 		break;
 
 	case GETPID:
-		mtx_lock(sema_mtxp);
-		if ((error = semvalid(uap->semid, semakptr)) != 0)
+		if ((error = semvalid(semid, semakptr)) != 0)
 			goto done2;
 		if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R)))
 			goto done2;
@@ -708,12 +733,11 @@
 			error = EINVAL;
 			goto done2;
 		}
-		rval = semakptr->u.sem_base[semnum].sempid;
+		*rval = semakptr->u.sem_base[semnum].sempid;
 		break;
 
 	case GETVAL:
-		mtx_lock(sema_mtxp);
-		if ((error = semvalid(uap->semid, semakptr)) != 0)
+		if ((error = semvalid(semid, semakptr)) != 0)
 			goto done2;
 		if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R)))
 			goto done2;
@@ -721,29 +745,48 @@
 			error = EINVAL;
 			goto done2;
 		}
-		rval = semakptr->u.sem_base[semnum].semval;
+		*rval = semakptr->u.sem_base[semnum].semval;
 		break;
 
 	case GETALL:
-		if ((error = copyin(arg, &real_arg, sizeof(real_arg))) != 0)
-			goto done2;
-		array = malloc(sizeof(*array) * semakptr->u.sem_nsems, M_TEMP,
-		    M_WAITOK);
+		/*
+		 * Unfortunately, callers of this function don't know
+		 * in advance how many semaphores are in this set.
+		 * While we could just allocate the maximum size array
+		 * and pass the actual size back to the caller, that
+		 * won't work for SETALL since we can't copyin() more
+		 * data than the user specified as we may return a
+		 * spurious EFAULT.
+		 * 
+		 * Note that the number of semaphores in a set is
+		 * fixed for the life of that set.  The only way that
+		 * the 'count' could change while we are blocked in
+		 * malloc() is if this semaphore set were destroyed
+		 * and a new one created with the same index.
+		 * However, semvalid() will catch that due to the
+		 * sequence number unless exactly 0x8000 (or a
+		 * multiple thereof) semaphore sets for the same index
+		 * are created and destroyed while we are in malloc!
+		 *
+		 */
+		count = semakptr->u.sem_nsems;
+		mtx_unlock(sema_mtxp);		    
+		array = malloc(sizeof(*array) * count, M_TEMP, M_WAITOK);
 		mtx_lock(sema_mtxp);
-		if ((error = semvalid(uap->semid, semakptr)) != 0)
+		if ((error = semvalid(semid, semakptr)) != 0)
 			goto done2;
+		KASSERT(count == semakptr->u.sem_nsems, ("nsems changed"));
 		if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R)))
 			goto done2;
 		for (i = 0; i < semakptr->u.sem_nsems; i++)
 			array[i] = semakptr->u.sem_base[i].semval;
 		mtx_unlock(sema_mtxp);
-		error = copyout(array, real_arg.array,
-		    i * sizeof(real_arg.array[0]));
+		error = copyout(array, arg->array, count * sizeof(*array));
+		mtx_lock(sema_mtxp);
 		break;
 
 	case GETZCNT:
-		mtx_lock(sema_mtxp);
-		if ((error = semvalid(uap->semid, semakptr)) != 0)
+		if ((error = semvalid(semid, semakptr)) != 0)
 			goto done2;
 		if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_R)))
 			goto done2;
@@ -751,14 +794,11 @@
 			error = EINVAL;
 			goto done2;
 		}
-		rval = semakptr->u.sem_base[semnum].semzcnt;
+		*rval = semakptr->u.sem_base[semnum].semzcnt;
 		break;
 
 	case SETVAL:
-		if ((error = copyin(arg, &real_arg, sizeof(real_arg))) != 0)
-			goto done2;
-		mtx_lock(sema_mtxp);
-		if ((error = semvalid(uap->semid, semakptr)) != 0)
+		if ((error = semvalid(semid, semakptr)) != 0)
 			goto done2;
 		if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_W)))
 			goto done2;
@@ -766,39 +806,32 @@
 			error = EINVAL;
 			goto done2;
 		}
-		if (real_arg.val < 0 || real_arg.val > seminfo.semvmx) {
+		if (arg->val < 0 || arg->val > seminfo.semvmx) {
 			error = ERANGE;
 			goto done2;
 		}
-		semakptr->u.sem_base[semnum].semval = real_arg.val;
+		semakptr->u.sem_base[semnum].semval = arg->val;
 		SEMUNDO_LOCK();
-		semundo_clear(semid, semnum);
+		semundo_clear(semidx, semnum);
 		SEMUNDO_UNLOCK();
 		wakeup(semakptr);
 		break;
 
 	case SETALL:
-		mtx_lock(sema_mtxp);
-raced:
-		if ((error = semvalid(uap->semid, semakptr)) != 0)
-			goto done2;
+		/*
+		 * See comment on GETALL for why 'count' shouldn't change
+		 * and why we require a userland buffer.
+		 */
 		count = semakptr->u.sem_nsems;
-		mtx_unlock(sema_mtxp);
-		if ((error = copyin(arg, &real_arg, sizeof(real_arg))) != 0)
-			goto done2;
+		mtx_unlock(sema_mtxp);		    
 		array = malloc(sizeof(*array) * count, M_TEMP, M_WAITOK);
-		error = copyin(real_arg.array, array, count * sizeof(*array));
+		error = copyin(arg->array, array, count * sizeof(*array));
+		mtx_lock(sema_mtxp);
 		if (error)
 			break;
-		mtx_lock(sema_mtxp);
-		if ((error = semvalid(uap->semid, semakptr)) != 0)
+		if ((error = semvalid(semid, semakptr)) != 0)
 			goto done2;
-		/* we could have raced? */
-		if (count != semakptr->u.sem_nsems) {
-			free(array, M_TEMP);
-			array = NULL;
-			goto raced;
-		}
+		KASSERT(count == semakptr->u.sem_nsems, ("nsems changed"));
 		if ((error = ipcperm(td, &semakptr->u.sem_perm, IPC_W)))
 			goto done2;
 		for (i = 0; i < semakptr->u.sem_nsems; i++) {
@@ -810,7 +843,7 @@
 			semakptr->u.sem_base[i].semval = usval;
 		}
 		SEMUNDO_LOCK();
-		semundo_clear(semid, -1);
+		semundo_clear(semidx, -1);
 		SEMUNDO_UNLOCK();
 		wakeup(semakptr);
 		break;
@@ -820,11 +853,8 @@
 		break;
 	}
 
-	if (error == 0)
-		td->td_retval[0] = rval;
 done2:
-	if (mtx_owned(sema_mtxp))
-		mtx_unlock(sema_mtxp);
+	mtx_unlock(sema_mtxp);
 	if (array != NULL)
 		free(array, M_TEMP);
 	return(error);
@@ -837,10 +867,6 @@
 	int	semflg;
 };
 #endif
-
-/*
- * MPSAFE
- */
 int
 semget(td, uap)
 	struct thread *td;
@@ -881,11 +907,8 @@
 			}
 #ifdef MAC
 			error = mac_check_sysv_semget(cred, &sema[semid]);
-			if (error != 0) {
-				MPRINTF(("mac_check_sysv_semget returned %d\n",
-				    error));
+			if (error != 0)
 				goto done2;
-			}
 #endif
 			goto found;
 		}
@@ -934,7 +957,7 @@
 #ifdef MAC
 		mac_create_sysv_sem(cred, &sema[semid]);
 #endif
-		DPRINTF(("sembase = 0x%x, next = 0x%x\n",
+		DPRINTF(("sembase = %p, next = %p\n",
 		    sema[semid].u.sem_base, &sem[semtot]));
 	} else {
 		DPRINTF(("didn't find it and wasn't asked to create it\n"));
@@ -956,10 +979,6 @@
 	size_t	nsops;
 };
 #endif
-
-/*
- * MPSAFE
- */
 int
 semop(td, uap)
 	struct thread *td;
@@ -979,7 +998,10 @@
 	int error;
 	int do_wakeup, do_undos;
 
-	DPRINTF(("call to semop(%d, 0x%x, %u)\n", semid, sops, nsops));
+#ifdef SEM_DEBUG
+	sops = NULL;
+#endif
+	DPRINTF(("call to semop(%d, %p, %u)\n", semid, sops, nsops));
 
 	if (!jail_sysvipc_allowed && jailed(td->td_ucred))
 		return (ENOSYS);
@@ -1000,7 +1022,7 @@
 		return (E2BIG);
 	}
 	if ((error = copyin(uap->sops, sops, nsops * sizeof(sops[0]))) != 0) {
-		DPRINTF(("error = %d from copyin(%08x, %08x, %d)\n", error,
+		DPRINTF(("error = %d from copyin(%p, %p, %d)\n", error,
 		    uap->sops, sops, nsops * sizeof(sops[0])));
 		if (sops != small_sops)
 			free(sops, M_SEM);
@@ -1042,10 +1064,8 @@
 	}
 #ifdef MAC
 	error = mac_check_sysv_semop(td->td_ucred, semakptr, j);
-	if (error != 0) {
-		MPRINTF(("mac_check_sysv_semop returned %d\n", error));
+	if (error != 0)
 		goto done2;
-	}
 #endif
 
 	/*
@@ -1066,8 +1086,8 @@
 			semptr = &semakptr->u.sem_base[sopptr->sem_num];
 
 			DPRINTF((
-			    "semop:  semakptr=%x, sem_base=%x, "
-			    "semptr=%x, sem[%d]=%d : op=%d, flag=%s\n",
+			    "semop:  semakptr=%p, sem_base=%p, "
+			    "semptr=%p, sem[%d]=%d : op=%d, flag=%s\n",
 			    semakptr, semakptr->u.sem_base, semptr,
 			    sopptr->sem_num, semptr->semval, sopptr->sem_op,
 			    (sopptr->sem_flg & IPC_NOWAIT) ?
@@ -1267,15 +1287,17 @@
 	 */
 	SEMUNDO_LOCK();
 	SLIST_FOREACH_PREVPTR(suptr, supptr, &semu_list, un_next) {
-		if (suptr->un_proc == p)
+		if (suptr->un_proc == p) {
+			*supptr = SLIST_NEXT(suptr, un_next);
 			break;
+		}
 	}
 	SEMUNDO_UNLOCK();
 
 	if (suptr == NULL)
 		return;
 
-	DPRINTF(("proc @%08x has undo structure with %d entries\n", p,
+	DPRINTF(("proc @%p has undo structure with %d entries\n", p,
 	    suptr->un_cnt));
 
 	/*
@@ -1301,7 +1323,7 @@
 				panic("semexit - semnum out of range");
 
 			DPRINTF((
-			    "semexit:  %08x id=%d num=%d(adj=%d) ; sem=%d\n",
+			    "semexit:  %p id=%d num=%d(adj=%d) ; sem=%d\n",
 			    suptr->un_proc, suptr->un_ent[ix].un_id,
 			    suptr->un_ent[ix].un_num,
 			    suptr->un_ent[ix].un_adjval,
@@ -1328,8 +1350,9 @@
 	 * Deallocate the undo vector.
 	 */
 	DPRINTF(("removing vector\n"));
+	SEMUNDO_LOCK();
 	suptr->un_proc = NULL;
-	*supptr = SLIST_NEXT(suptr, un_next);
+	SEMUNDO_UNLOCK();
 }
 
 static int
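The GETALL/SETALL rework above follows one pattern throughout: record sem_nsems, drop sema_mtxp before the M_WAITOK malloc() and the copyin()/copyout() (both of which may sleep), reacquire the mutex, and re-validate with semvalid().  Because semvalid() checks the sequence number, a set that was destroyed and recreated while the lock was dropped is rejected, so the old "raced:" retry loop collapses into a KASSERT that the count is unchanged.  Below is a minimal userland sketch of that pattern, assuming a pthread mutex stands in for sema_mtxp and memcpy() for copyout(); the struct and function names are illustrative only, not the kernel's.

#include <assert.h>
#include <errno.h>
#include <pthread.h>
#include <stdlib.h>
#include <string.h>

struct semset {
	pthread_mutex_t	lock;
	int		valid;		/* cleared when the set is removed */
	int		nsems;
	unsigned short	*base;		/* current semaphore values */
};

/* Snapshot all semaphore values into the caller-supplied ubuf[nsems]. */
static int
getall_snapshot(struct semset *s, unsigned short *ubuf)
{
	unsigned short *array;
	int count, i;

	pthread_mutex_lock(&s->lock);
	count = s->nsems;
	/* A blocking allocation must not happen under the set's lock. */
	pthread_mutex_unlock(&s->lock);
	array = malloc(sizeof(*array) * count);
	if (array == NULL)
		return (ENOMEM);
	pthread_mutex_lock(&s->lock);
	/* Re-validate: the set may have gone away while we slept. */
	if (!s->valid) {
		pthread_mutex_unlock(&s->lock);
		free(array);
		return (EINVAL);
	}
	/*
	 * A recreated set would have failed the check above, so the
	 * size cannot have changed behind our back.
	 */
	assert(count == s->nsems);
	for (i = 0; i < count; i++)
		array[i] = s->base[i];
	pthread_mutex_unlock(&s->lock);
	/* Copy to the caller without the lock held. */
	memcpy(ubuf, array, count * sizeof(*array));
	free(array);
	return (0);
}

The same shape applies to SETALL, only with the copy done before the values are written back under the lock.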
Index: subr_witness.c
===================================================================
RCS file: /home/cvs/src/sys/kern/subr_witness.c,v
retrieving revision 1.3
retrieving revision 1.4
diff -L sys/kern/subr_witness.c -L sys/kern/subr_witness.c -u -r1.3 -r1.4
--- sys/kern/subr_witness.c
+++ sys/kern/subr_witness.c
@@ -82,9 +82,10 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/subr_witness.c,v 1.195.2.7 2006/01/04 19:27:22 truckman Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/subr_witness.c,v 1.236.2.1 2007/11/27 13:18:54 attilio Exp $");
 
 #include "opt_ddb.h"
+#include "opt_hwpmc_hooks.h"
 #include "opt_witness.h"
 
 #include <sys/param.h>
@@ -95,6 +96,7 @@
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
+#include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
@@ -103,6 +105,17 @@
 
 #include <machine/stdarg.h>
 
+/* Note that these traces do not work with KTR_ALQ. */
+#if 0
+#define	KTR_WITNESS	KTR_SUBSYS
+#else
+#define	KTR_WITNESS	0
+#endif
+
+/* Easier to stay with the old names. */
+#define	lo_list		lo_witness_data.lod_list
+#define	lo_witness	lo_witness_data.lod_witness
+
 /* Define this to check for blessed mutexes */
 #undef BLESSING
 
@@ -167,11 +180,7 @@
 static int	itismychild(struct witness *parent, struct witness *child);
 static void	removechild(struct witness *parent, struct witness *child);
 static int	sysctl_debug_witness_watch(SYSCTL_HANDLER_ARGS);
-static void	witness_displaydescendants(void(*)(const char *fmt, ...),
-					   struct witness *, int indent);
 static const char *fixup_filename(const char *file);
-static void	witness_leveldescendents(struct witness *parent, int level);
-static void	witness_levelall(void);
 static struct	witness *witness_get(void);
 static void	witness_free(struct witness *m);
 static struct	witness_child_list_entry *witness_child_get(void);
@@ -182,10 +191,14 @@
 					     struct lock_object *lock);
 static void	witness_list_lock(struct lock_instance *instance);
 #ifdef DDB
-static void	witness_list(struct thread *td);
+static void	witness_leveldescendents(struct witness *parent, int level);
+static void	witness_levelall(void);
+static void	witness_displaydescendants(void(*)(const char *fmt, ...),
+					   struct witness *, int indent);
 static void	witness_display_list(void(*prnt)(const char *fmt, ...),
 				     struct witness_list *list);
 static void	witness_display(void(*)(const char *fmt, ...));
+static void	witness_list(struct thread *td);
 #endif
 
 SYSCTL_NODE(_debug, OID_AUTO, witness, CTLFLAG_RW, 0, "Witness Locking");
@@ -205,7 +218,7 @@
 /*
  * When KDB is enabled and witness_kdb is set to 1, it will cause the system
  * to drop into kdebug() when:
- *	- a lock heirarchy violation occurs
+ *	- a lock hierarchy violation occurs
  *	- locks are held when going to sleep.
  */
 #ifdef WITNESS_KDB
@@ -219,7 +232,7 @@
 /*
  * When KDB is enabled and witness_trace is set to 1, it will cause the system
  * to print a stack trace:
- *	- a lock heirarchy violation occurs
+ *	- a lock hierarchy violation occurs
  *	- locks are held when going to sleep.
  */
 int	witness_trace = 1;
@@ -264,12 +277,12 @@
 	 */
 	{ "proctree", &lock_class_sx },
 	{ "allproc", &lock_class_sx },
+	{ "allprison", &lock_class_sx },
 	{ NULL, NULL },
 	/*
 	 * Various mutexes
 	 */
 	{ "Giant", &lock_class_mtx_sleep },
-	{ "filedesc structure", &lock_class_mtx_sleep },
 	{ "pipe mutex", &lock_class_mtx_sleep },
 	{ "sigio lock", &lock_class_mtx_sleep },
 	{ "process group", &lock_class_mtx_sleep },
@@ -277,12 +290,13 @@
 	{ "session", &lock_class_mtx_sleep },
 	{ "uidinfo hash", &lock_class_mtx_sleep },
 	{ "uidinfo struct", &lock_class_mtx_sleep },
-	{ "allprison", &lock_class_mtx_sleep },
+#ifdef	HWPMC_HOOKS
+	{ "pmc-sleep", &lock_class_mtx_sleep },
+#endif
 	{ NULL, NULL },
 	/*
 	 * Sockets
 	 */
-	{ "filedesc structure", &lock_class_mtx_sleep },
 	{ "accept", &lock_class_mtx_sleep },
 	{ "so_snd", &lock_class_mtx_sleep },
 	{ "so_rcv", &lock_class_mtx_sleep },
@@ -297,8 +311,9 @@
 	{ "ifaddr", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
-	 * Multicast - protocol locks before interface locks.
+	 * Multicast - protocol locks before interface locks, after UDP locks.
 	 */
+	{ "udpinp", &lock_class_mtx_sleep },
 	{ "in_multi_mtx", &lock_class_mtx_sleep },
 	{ "igmp_mtx", &lock_class_mtx_sleep },
 	{ "if_addr_mtx", &lock_class_mtx_sleep },
@@ -348,6 +363,24 @@
 	{ "nfsd_mtx", &lock_class_mtx_sleep },
 	{ "so_snd", &lock_class_mtx_sleep },
 	{ NULL, NULL },
+
+	/*
+	 * IEEE 802.11
+	 */
+	{ "802.11 com lock", &lock_class_mtx_sleep},
+	{ NULL, NULL },
+	/*
+	 * Network drivers
+	 */
+	{ "network driver", &lock_class_mtx_sleep},
+	{ NULL, NULL },
+
+	/*
+	 * Netgraph
+	 */
+	{ "ng_node", &lock_class_mtx_sleep },
+	{ "ng_worklist", &lock_class_mtx_sleep },
+	{ NULL, NULL },
 	/*
 	 * CDEV
 	 */
@@ -357,6 +390,13 @@
 	{ "cdev", &lock_class_mtx_sleep },
 	{ NULL, NULL },
 	/*
+	 * kqueue/VFS interaction
+	 */
+	{ "kqueue", &lock_class_mtx_sleep },
+	{ "struct mount mtx", &lock_class_mtx_sleep },
+	{ "vnode interlock", &lock_class_mtx_sleep },
+	{ NULL, NULL },
+	/*
 	 * spin locks
 	 */
 #ifdef SMP
@@ -364,42 +404,50 @@
 #endif
 	{ "rm.mutex_mtx", &lock_class_mtx_spin },
 	{ "sio", &lock_class_mtx_spin },
+	{ "scrlock", &lock_class_mtx_spin },
 #ifdef __i386__
 	{ "cy", &lock_class_mtx_spin },
 #endif
+#ifdef __sparc64__
+	{ "pcib_mtx", &lock_class_mtx_spin },
+	{ "rtc_mtx", &lock_class_mtx_spin },
+#endif
+	{ "scc_hwmtx", &lock_class_mtx_spin },
 	{ "uart_hwmtx", &lock_class_mtx_spin },
-	{ "sabtty", &lock_class_mtx_spin },
-	{ "zstty", &lock_class_mtx_spin },
-	{ "ng_node", &lock_class_mtx_spin },
-	{ "ng_worklist", &lock_class_mtx_spin },
-	{ "taskqueue_fast", &lock_class_mtx_spin },
+	{ "fast_taskqueue", &lock_class_mtx_spin },
 	{ "intr table", &lock_class_mtx_spin },
+#ifdef	HWPMC_HOOKS
+	{ "pmc-per-proc", &lock_class_mtx_spin },
+#endif
+	{ "process slock", &lock_class_mtx_spin },
 	{ "sleepq chain", &lock_class_mtx_spin },
-	{ "sched lock", &lock_class_mtx_spin },
+	{ "umtx lock", &lock_class_mtx_spin },
 	{ "turnstile chain", &lock_class_mtx_spin },
+	{ "turnstile lock", &lock_class_mtx_spin },
+	{ "sched lock", &lock_class_mtx_spin },
 	{ "td_contested", &lock_class_mtx_spin },
 	{ "callout", &lock_class_mtx_spin },
 	{ "entropy harvest mutex", &lock_class_mtx_spin },
 	{ "syscons video lock", &lock_class_mtx_spin },
+	{ "time lock", &lock_class_mtx_spin },
+#ifdef SMP
+	{ "smp rendezvous", &lock_class_mtx_spin },
+#endif
 	/*
 	 * leaf locks
 	 */
-	{ "allpmaps", &lock_class_mtx_spin },
-	{ "vm page queue free mutex", &lock_class_mtx_spin },
 	{ "icu", &lock_class_mtx_spin },
-#ifdef SMP
-	{ "smp rendezvous", &lock_class_mtx_spin },
-#if defined(__i386__) || defined(__amd64__)
-	{ "tlb", &lock_class_mtx_spin },
-#endif
-#ifdef __sparc64__
+#if defined(SMP) && defined(__sparc64__)
 	{ "ipi", &lock_class_mtx_spin },
-	{ "rtc_mtx", &lock_class_mtx_spin },
 #endif
+#ifdef __i386__
+	{ "allpmaps", &lock_class_mtx_spin },
+	{ "descriptor tables", &lock_class_mtx_spin },
 #endif
 	{ "clk", &lock_class_mtx_spin },
-	{ "mutex profiling lock", &lock_class_mtx_spin },
-	{ "kse zombie lock", &lock_class_mtx_spin },
+	{ "mprof lock", &lock_class_mtx_spin },
+	{ "kse lock", &lock_class_mtx_spin },
+	{ "zombie lock", &lock_class_mtx_spin },
 	{ "ALD Queue", &lock_class_mtx_spin },
 #ifdef __ia64__
 	{ "MCA spin lock", &lock_class_mtx_spin },
@@ -413,6 +461,10 @@
 	{ "tw_cl_io_lock", &lock_class_mtx_spin },
 	{ "tw_cl_intr_lock", &lock_class_mtx_spin },
 	{ "tw_cl_gen_lock", &lock_class_mtx_spin },
+#ifdef	HWPMC_HOOKS
+	{ "pmc-leaf", &lock_class_mtx_spin },
+#endif
+	{ "blocked lock", &lock_class_mtx_spin },
 	{ NULL, NULL },
 	{ NULL, NULL }
 };
@@ -429,19 +481,11 @@
 #endif
 
 /*
- * List of all locks in the system.
+ * List of locks initialized prior to witness being initialized whose
+ * enrollment is currently deferred.
  */
-TAILQ_HEAD(, lock_object) all_locks = TAILQ_HEAD_INITIALIZER(all_locks);
-
-static struct mtx all_mtx = {
-	{ &lock_class_mtx_sleep,	/* mtx_object.lo_class */
-	  "All locks list",		/* mtx_object.lo_name */
-	  "All locks list",		/* mtx_object.lo_type */
-	  LO_INITIALIZED,		/* mtx_object.lo_flags */
-	  { NULL, NULL },		/* mtx_object.lo_list */
-	  NULL },			/* mtx_object.lo_witness */
-	MTX_UNOWNED, 0			/* mtx_lock, mtx_recurse */
-};
+STAILQ_HEAD(, lock_object) pending_locks =
+    STAILQ_HEAD_INITIALIZER(pending_locks);
 
 /*
  * This global is set to 0 once it becomes safe to use the witness code.
@@ -455,13 +499,9 @@
 static int witness_spin_warn = 0;
 
 /*
- * Global variables for book keeping.
- */
-static int lock_cur_cnt;
-static int lock_max_cnt;
-
-/*
- * The WITNESS-enabled diagnostic code.
+ * The WITNESS-enabled diagnostic code.  Note that the witness code does
+ * assume that the early boot is single-threaded at least until after this
+ * routine is completed.
  */
 static void
 witness_initialize(void *dummy __unused)
@@ -479,9 +519,8 @@
 	mtx_assert(&Giant, MA_NOTOWNED);
 
 	CTR1(KTR_WITNESS, "%s: initializing witness", __func__);
-	TAILQ_INSERT_HEAD(&all_locks, &all_mtx.mtx_object, lo_list);
 	mtx_init(&w_mtx, "witness lock", NULL, MTX_SPIN | MTX_QUIET |
-	    MTX_NOWITNESS);
+	    MTX_NOWITNESS | MTX_NOPROFILE);
 	for (i = 0; i < WITNESS_COUNT; i++)
 		witness_free(&w_data[i]);
 	for (i = 0; i < WITNESS_CHILDCOUNT; i++)
@@ -508,15 +547,14 @@
 	witness_spin_warn = 1;
 
 	/* Iterate through all locks and add them to witness. */
-	mtx_lock(&all_mtx);
-	TAILQ_FOREACH(lock, &all_locks, lo_list) {
-		if (lock->lo_flags & LO_WITNESS)
-			lock->lo_witness = enroll(lock->lo_type,
-			    lock->lo_class);
-		else
-			lock->lo_witness = NULL;
+	while (!STAILQ_EMPTY(&pending_locks)) {
+		lock = STAILQ_FIRST(&pending_locks);
+		STAILQ_REMOVE_HEAD(&pending_locks, lo_list);
+		KASSERT(lock->lo_flags & LO_WITNESS,
+		    ("%s: lock %s is on pending list but not LO_WITNESS",
+		    __func__, lock->lo_name));
+		lock->lo_witness = enroll(lock->lo_type, LOCK_CLASS(lock));
 	}
-	mtx_unlock(&all_mtx);
 
 	/* Mark the witness code as being ready for use. */
 	witness_cold = 0;
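The hunk above replaces the old "all locks" list with a queue of locks created before witness is up: witness_init() either enrolls a lock immediately, parks it on pending_locks with LO_ENROLLPEND set, or clears lo_witness, and witness_initialize() drains the queue in one pass once its data structures exist.  The same deferred-enrollment idea in a standalone userland sketch, assuming a BSD-style <sys/queue.h>; registry_cold, my_lock and the other names are illustrative stand-ins, not kernel symbols.

#include <sys/queue.h>
#include <stdbool.h>
#include <stdio.h>

struct my_lock {
	const char		*name;
	bool			 pend;	/* analogue of LO_ENROLLPEND */
	STAILQ_ENTRY(my_lock)	 link;
};

static STAILQ_HEAD(, my_lock) pending = STAILQ_HEAD_INITIALIZER(pending);
static bool registry_cold = true;	/* analogue of witness_cold */

static void
enroll(struct my_lock *l)
{
	printf("enrolled %s\n", l->name);
}

void
my_lock_init(struct my_lock *l, const char *name)
{
	l->name = name;
	l->pend = false;
	if (registry_cold) {
		/* Too early: defer enrollment to registry_init(). */
		STAILQ_INSERT_TAIL(&pending, l, link);
		l->pend = true;
	} else
		enroll(l);
}

void
my_lock_destroy(struct my_lock *l)
{
	/* Destroyed before the registry came up: drop it from the queue. */
	if (l->pend) {
		STAILQ_REMOVE(&pending, l, my_lock, link);
		l->pend = false;
	}
}

void
registry_init(void)
{
	struct my_lock *l;

	while (!STAILQ_EMPTY(&pending)) {
		l = STAILQ_FIRST(&pending);
		STAILQ_REMOVE_HEAD(&pending, link);
		l->pend = false;
		enroll(l);
	}
	registry_cold = false;
}

This relies on early boot being single-threaded, which is exactly the assumption the new comment above witness_initialize() spells out.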
@@ -534,9 +572,6 @@
 	error = sysctl_handle_int(oidp, &value, 0, req);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
-	error = suser(req->td);
-	if (error != 0)
-		return (error);
 	if (value == witness_watch)
 		return (0);
 	if (value != 0)
@@ -550,10 +585,8 @@
 {
 	struct lock_class *class;
 
-	class = lock->lo_class;
-	if (lock->lo_flags & LO_INITIALIZED)
-		panic("%s: lock (%s) %s is already initialized", __func__,
-		    class->lc_name, lock->lo_name);
+	/* Various sanity checks. */
+	class = LOCK_CLASS(lock);
 	if ((lock->lo_flags & LO_RECURSABLE) != 0 &&
 	    (class->lc_flags & LC_RECURSABLE) == 0)
 		panic("%s: lock (%s) %s can not be recursable", __func__,
@@ -567,35 +600,38 @@
 		panic("%s: lock (%s) %s can not be upgradable", __func__,
 		    class->lc_name, lock->lo_name);
 
-	mtx_lock(&all_mtx);
-	TAILQ_INSERT_TAIL(&all_locks, lock, lo_list);
-	lock->lo_flags |= LO_INITIALIZED;
-	lock_cur_cnt++;
-	if (lock_cur_cnt > lock_max_cnt)
-		lock_max_cnt = lock_cur_cnt;
-	mtx_unlock(&all_mtx);
-	if (!witness_cold && witness_watch != 0 && panicstr == NULL &&
-	    (lock->lo_flags & LO_WITNESS) != 0)
-		lock->lo_witness = enroll(lock->lo_type, class);
-	else
+	/*
+	 * If we shouldn't watch this lock, then just clear lo_witness.
+	 * Otherwise, if witness_cold is set, then it is too early to
+	 * enroll this lock, so defer it to witness_initialize() by adding
+	 * it to the pending_locks list.  If it is not too early, then enroll
+	 * the lock now.
+	 */
+	if (witness_watch == 0 || panicstr != NULL ||
+	    (lock->lo_flags & LO_WITNESS) == 0)
 		lock->lo_witness = NULL;
+	else if (witness_cold) {
+		STAILQ_INSERT_TAIL(&pending_locks, lock, lo_list);
+		lock->lo_flags |= LO_ENROLLPEND;
+	} else
+		lock->lo_witness = enroll(lock->lo_type, class);
 }
 
 void
 witness_destroy(struct lock_object *lock)
 {
+	struct lock_class *class;
 	struct witness *w;
 
+	class = LOCK_CLASS(lock);
 	if (witness_cold)
 		panic("lock (%s) %s destroyed while witness_cold",
-		    lock->lo_class->lc_name, lock->lo_name);
-	if ((lock->lo_flags & LO_INITIALIZED) == 0)
-		panic("%s: lock (%s) %s is not initialized", __func__,
-		    lock->lo_class->lc_name, lock->lo_name);
+		    class->lc_name, lock->lo_name);
 
 	/* XXX: need to verify that no one holds the lock */
-	w = lock->lo_witness;
-	if (w != NULL) {
+	if ((lock->lo_flags & (LO_WITNESS | LO_ENROLLPEND)) == LO_WITNESS &&
+	    lock->lo_witness != NULL) {
+		w = lock->lo_witness;
 		mtx_lock_spin(&w_mtx);
 		MPASS(w->w_refcount > 0);
 		w->w_refcount--;
@@ -608,15 +644,99 @@
 			mtx_unlock_spin(&w_mtx);
 	}
 
-	mtx_lock(&all_mtx);
-	lock_cur_cnt--;
-	TAILQ_REMOVE(&all_locks, lock, lo_list);
-	lock->lo_flags &= ~LO_INITIALIZED;
-	mtx_unlock(&all_mtx);
+	/*
+	 * If this lock is destroyed before witness is up and running,
+	 * remove it from the pending list.
+	 */
+	if (lock->lo_flags & LO_ENROLLPEND) {
+		STAILQ_REMOVE(&pending_locks, lock, lock_object, lo_list);
+		lock->lo_flags &= ~LO_ENROLLPEND;
+	}
 }
 
 #ifdef DDB
 static void
+witness_levelall (void)
+{
+	struct witness_list *list;
+	struct witness *w, *w1;
+
+	/*
+	 * First clear all levels.
+	 */
+	STAILQ_FOREACH(w, &w_all, w_list) {
+		w->w_level = 0;
+	}
+
+	/*
+	 * Look for locks with no parent and level all their descendants.
+	 */
+	STAILQ_FOREACH(w, &w_all, w_list) {
+		/*
+		 * This is just an optimization, technically we could get
+		 * away just walking the all list each time.
+		 */
+		if (w->w_class->lc_flags & LC_SLEEPLOCK)
+			list = &w_sleep;
+		else
+			list = &w_spin;
+		STAILQ_FOREACH(w1, list, w_typelist) {
+			if (isitmychild(w1, w))
+				goto skip;
+		}
+		witness_leveldescendents(w, 0);
+	skip:
+		;	/* silence GCC 3.x */
+	}
+}
+
+static void
+witness_leveldescendents(struct witness *parent, int level)
+{
+	struct witness_child_list_entry *wcl;
+	int i;
+
+	if (parent->w_level < level)
+		parent->w_level = level;
+	level++;
+	for (wcl = parent->w_children; wcl != NULL; wcl = wcl->wcl_next)
+		for (i = 0; i < wcl->wcl_count; i++)
+			witness_leveldescendents(wcl->wcl_children[i], level);
+}
+
+static void
+witness_displaydescendants(void(*prnt)(const char *fmt, ...),
+			   struct witness *parent, int indent)
+{
+	struct witness_child_list_entry *wcl;
+	int i, level;
+
+	level = parent->w_level;
+	prnt("%-2d", level);
+	for (i = 0; i < indent; i++)
+		prnt(" ");
+	if (parent->w_refcount > 0)
+		prnt("%s", parent->w_name);
+	else
+		prnt("(dead)");
+	if (parent->w_displayed) {
+		prnt(" -- (already displayed)\n");
+		return;
+	}
+	parent->w_displayed = 1;
+	if (parent->w_refcount > 0) {
+		if (parent->w_file != NULL)
+			prnt(" -- last acquired @ %s:%d", parent->w_file,
+			    parent->w_line);
+	}
+	prnt("\n");
+	for (wcl = parent->w_children; wcl != NULL; wcl = wcl->wcl_next)
+		for (i = 0; i < wcl->wcl_count; i++)
+			    witness_displaydescendants(prnt,
+				wcl->wcl_children[i], indent + 1);
+}
+
+static void
 witness_display_list(void(*prnt)(const char *fmt, ...),
 		     struct witness_list *list)
 {
@@ -742,7 +862,7 @@
 		    __func__);
 
 	w = lock->lo_witness;
-	class = lock->lo_class;
+	class = LOCK_CLASS(lock);
 	td = curthread;
 	file = fixup_filename(file);
 
@@ -867,14 +987,14 @@
 			 * lock, then skip it.
 			 */
 			if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) != 0 &&
-			    lock == &Giant.mtx_object)
+			    lock == &Giant.lock_object)
 				continue;
 			/*
 			 * If we are locking a sleepable lock and this lock
 			 * is Giant, then skip it.
 			 */
 			if ((lock->lo_flags & LO_SLEEPABLE) != 0 &&
-			    lock1->li_lock == &Giant.mtx_object)
+			    lock1->li_lock == &Giant.lock_object)
 				continue;
 			/*
 			 * If we are locking a sleepable lock and this lock
@@ -890,7 +1010,7 @@
 			 * lock, then treat it as a reversal.
 			 */
 			if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0 &&
-			    lock == &Giant.mtx_object)
+			    lock == &Giant.lock_object)
 				goto reversal;
 			/*
 			 * Check the lock order hierarchy for a reversal.
@@ -912,7 +1032,7 @@
 			if (blessed(w, w1))
 				return;
 #endif
-			if (lock1->li_lock == &Giant.mtx_object) {
+			if (lock1->li_lock == &Giant.lock_object) {
 				if (w1->w_Giant_squawked)
 					return;
 				else
@@ -931,7 +1051,7 @@
 				printf(
 		"lock order reversal: (sleepable after non-sleepable)\n");
 			else if ((lock1->li_lock->lo_flags & LO_SLEEPABLE) == 0
-			    && lock == &Giant.mtx_object)
+			    && lock == &Giant.lock_object)
 				printf(
 		"lock order reversal: (Giant after non-sleepable)\n");
 			else
@@ -986,7 +1106,7 @@
 	 * always come before Giant.
 	 */
 	if (flags & LOP_NEWORDER &&
-	    !(lock1->li_lock == &Giant.mtx_object &&
+	    !(lock1->li_lock == &Giant.lock_object &&
 	    (lock->lo_flags & LO_SLEEPABLE) != 0)) {
 		CTR3(KTR_WITNESS, "%s: adding %s as a child of %s", __func__,
 		    lock->lo_type, lock1->li_lock->lo_type);
@@ -1022,7 +1142,7 @@
 	file = fixup_filename(file);
 
 	/* Determine lock list for this lock. */
-	if (lock->lo_class->lc_flags & LC_SLEEPLOCK)
+	if (LOCK_CLASS(lock)->lc_flags & LC_SLEEPLOCK)
 		lock_list = &td->td_sleeplocks;
 	else
 		lock_list = PCPU_PTR(spinlocks);
@@ -1075,7 +1195,7 @@
 	KASSERT(!witness_cold, ("%s: witness_cold", __func__));
 	if (lock->lo_witness == NULL || witness_watch == 0 || panicstr != NULL)
 		return;
-	class = lock->lo_class;
+	class = LOCK_CLASS(lock);
 	file = fixup_filename(file);
 	if ((lock->lo_flags & LO_UPGRADABLE) == 0)
 		panic("upgrade of non-upgradable lock (%s) %s @ %s:%d",
@@ -1083,7 +1203,7 @@
 	if ((flags & LOP_TRYLOCK) == 0)
 		panic("non-try upgrade of lock (%s) %s @ %s:%d", class->lc_name,
 		    lock->lo_name, file, line);
-	if ((lock->lo_class->lc_flags & LC_SLEEPLOCK) == 0)
+	if ((class->lc_flags & LC_SLEEPLOCK) == 0)
 		panic("upgrade of non-sleep lock (%s) %s @ %s:%d",
 		    class->lc_name, lock->lo_name, file, line);
 	instance = find_instance(curthread->td_sleeplocks, lock);
@@ -1110,12 +1230,12 @@
 	KASSERT(!witness_cold, ("%s: witness_cold", __func__));
 	if (lock->lo_witness == NULL || witness_watch == 0 || panicstr != NULL)
 		return;
-	class = lock->lo_class;
+	class = LOCK_CLASS(lock);
 	file = fixup_filename(file);
 	if ((lock->lo_flags & LO_UPGRADABLE) == 0)
 		panic("downgrade of non-upgradable lock (%s) %s @ %s:%d",
 		    class->lc_name, lock->lo_name, file, line);
-	if ((lock->lo_class->lc_flags & LC_SLEEPLOCK) == 0)
+	if ((class->lc_flags & LC_SLEEPLOCK) == 0)
 		panic("downgrade of non-sleep lock (%s) %s @ %s:%d",
 		    class->lc_name, lock->lo_name, file, line);
 	instance = find_instance(curthread->td_sleeplocks, lock);
@@ -1146,7 +1266,7 @@
 	    panicstr != NULL)
 		return;
 	td = curthread;
-	class = lock->lo_class;
+	class = LOCK_CLASS(lock);
 	file = fixup_filename(file);
 
 	/* Find lock instance associated with this lock. */
@@ -1238,7 +1358,7 @@
 			if (lock1->li_lock == lock)
 				continue;
 			if (flags & WARN_GIANTOK &&
-			    lock1->li_lock == &Giant.mtx_object)
+			    lock1->li_lock == &Giant.lock_object)
 				continue;
 			if (flags & WARN_SLEEPOK &&
 			    (lock1->li_lock->lo_flags & LO_SLEEPABLE) != 0)
@@ -1520,87 +1640,6 @@
 	return (0);
 }
 
-static void
-witness_levelall (void)
-{
-	struct witness_list *list;
-	struct witness *w, *w1;
-
-	/*
-	 * First clear all levels.
-	 */
-	STAILQ_FOREACH(w, &w_all, w_list) {
-		w->w_level = 0;
-	}
-
-	/*
-	 * Look for locks with no parent and level all their descendants.
-	 */
-	STAILQ_FOREACH(w, &w_all, w_list) {
-		/*
-		 * This is just an optimization, technically we could get
-		 * away just walking the all list each time.
-		 */
-		if (w->w_class->lc_flags & LC_SLEEPLOCK)
-			list = &w_sleep;
-		else
-			list = &w_spin;
-		STAILQ_FOREACH(w1, list, w_typelist) {
-			if (isitmychild(w1, w))
-				goto skip;
-		}
-		witness_leveldescendents(w, 0);
-	skip:
-		;	/* silence GCC 3.x */
-	}
-}
-
-static void
-witness_leveldescendents(struct witness *parent, int level)
-{
-	struct witness_child_list_entry *wcl;
-	int i;
-
-	if (parent->w_level < level)
-		parent->w_level = level;
-	level++;
-	for (wcl = parent->w_children; wcl != NULL; wcl = wcl->wcl_next)
-		for (i = 0; i < wcl->wcl_count; i++)
-			witness_leveldescendents(wcl->wcl_children[i], level);
-}
-
-static void
-witness_displaydescendants(void(*prnt)(const char *fmt, ...),
-			   struct witness *parent, int indent)
-{
-	struct witness_child_list_entry *wcl;
-	int i, level;
-
-	level = parent->w_level;
-	prnt("%-2d", level);
-	for (i = 0; i < indent; i++)
-		prnt(" ");
-	if (parent->w_refcount > 0)
-		prnt("%s", parent->w_name);
-	else
-		prnt("(dead)");
-	if (parent->w_displayed) {
-		prnt(" -- (already displayed)\n");
-		return;
-	}
-	parent->w_displayed = 1;
-	if (parent->w_refcount > 0) {
-		if (parent->w_file != NULL)
-			prnt(" -- last acquired @ %s:%d", parent->w_file,
-			    parent->w_line);
-	}
-	prnt("\n");
-	for (wcl = parent->w_children; wcl != NULL; wcl = wcl->wcl_next)
-		for (i = 0; i < wcl->wcl_count; i++)
-			    witness_displaydescendants(prnt,
-				wcl->wcl_children[i], indent + 1);
-}
-
 #ifdef BLESSING
 static int
 blessed(struct witness *w1, struct witness *w2)
@@ -1738,7 +1777,7 @@
 
 	lock = instance->li_lock;
 	printf("%s %s %s", (instance->li_flags & LI_EXCLUSIVE) != 0 ?
-	    "exclusive" : "shared", lock->lo_class->lc_name, lock->lo_name);
+	    "exclusive" : "shared", LOCK_CLASS(lock)->lc_name, lock->lo_name);
 	if (lock->lo_type != lock->lo_name)
 		printf(" (%s)", lock->lo_type);
 	printf(" r = %d (%p) locked @ %s:%d\n",
@@ -1806,18 +1845,25 @@
 void
 witness_save(struct lock_object *lock, const char **filep, int *linep)
 {
+	struct lock_list_entry *lock_list;
 	struct lock_instance *instance;
+	struct lock_class *class;
 
 	KASSERT(!witness_cold, ("%s: witness_cold", __func__));
 	if (lock->lo_witness == NULL || witness_watch == 0 || panicstr != NULL)
 		return;
-	if ((lock->lo_class->lc_flags & LC_SLEEPLOCK) == 0)
-		panic("%s: lock (%s) %s is not a sleep lock", __func__,
-		    lock->lo_class->lc_name, lock->lo_name);
-	instance = find_instance(curthread->td_sleeplocks, lock);
+	class = LOCK_CLASS(lock);
+	if (class->lc_flags & LC_SLEEPLOCK)
+		lock_list = curthread->td_sleeplocks;
+	else {
+		if (witness_skipspin)
+			return;
+		lock_list = PCPU_GET(spinlocks);
+	}
+	instance = find_instance(lock_list, lock);
 	if (instance == NULL)
 		panic("%s: lock (%s) %s not locked", __func__,
-		    lock->lo_class->lc_name, lock->lo_name);
+		    class->lc_name, lock->lo_name);
 	*filep = instance->li_file;
 	*linep = instance->li_line;
 }
@@ -1825,18 +1871,25 @@
 void
 witness_restore(struct lock_object *lock, const char *file, int line)
 {
+	struct lock_list_entry *lock_list;
 	struct lock_instance *instance;
+	struct lock_class *class;
 
 	KASSERT(!witness_cold, ("%s: witness_cold", __func__));
 	if (lock->lo_witness == NULL || witness_watch == 0 || panicstr != NULL)
 		return;
-	if ((lock->lo_class->lc_flags & LC_SLEEPLOCK) == 0)
-		panic("%s: lock (%s) %s is not a sleep lock", __func__,
-		    lock->lo_class->lc_name, lock->lo_name);
-	instance = find_instance(curthread->td_sleeplocks, lock);
+	class = LOCK_CLASS(lock);
+	if (class->lc_flags & LC_SLEEPLOCK)
+		lock_list = curthread->td_sleeplocks;
+	else {
+		if (witness_skipspin)
+			return;
+		lock_list = PCPU_GET(spinlocks);
+	}
+	instance = find_instance(lock_list, lock);
 	if (instance == NULL)
 		panic("%s: lock (%s) %s not locked", __func__,
-		    lock->lo_class->lc_name, lock->lo_name);
+		    class->lc_name, lock->lo_name);
 	lock->lo_witness->w_file = file;
 	lock->lo_witness->w_line = line;
 	instance->li_file = file;
@@ -1848,23 +1901,25 @@
 {
 #ifdef INVARIANT_SUPPORT
 	struct lock_instance *instance;
+	struct lock_class *class;
 
 	if (lock->lo_witness == NULL || witness_watch == 0 || panicstr != NULL)
 		return;
-	if ((lock->lo_class->lc_flags & LC_SLEEPLOCK) != 0)
+	class = LOCK_CLASS(lock);
+	if ((class->lc_flags & LC_SLEEPLOCK) != 0)
 		instance = find_instance(curthread->td_sleeplocks, lock);
-	else if ((lock->lo_class->lc_flags & LC_SPINLOCK) != 0)
+	else if ((class->lc_flags & LC_SPINLOCK) != 0)
 		instance = find_instance(PCPU_GET(spinlocks), lock);
 	else {
 		panic("Lock (%s) %s is not sleep or spin!",
-		    lock->lo_class->lc_name, lock->lo_name);
+		    class->lc_name, lock->lo_name);
 	}
 	file = fixup_filename(file);
 	switch (flags) {
 	case LA_UNLOCKED:
 		if (instance != NULL)
 			panic("Lock (%s) %s locked @ %s:%d.",
-			    lock->lo_class->lc_name, lock->lo_name, file, line);
+			    class->lc_name, lock->lo_name, file, line);
 		break;
 	case LA_LOCKED:
 	case LA_LOCKED | LA_RECURSED:
@@ -1877,25 +1932,25 @@
 	case LA_XLOCKED | LA_NOTRECURSED:
 		if (instance == NULL) {
 			panic("Lock (%s) %s not locked @ %s:%d.",
-			    lock->lo_class->lc_name, lock->lo_name, file, line);
+			    class->lc_name, lock->lo_name, file, line);
 			break;
 		}
 		if ((flags & LA_XLOCKED) != 0 &&
 		    (instance->li_flags & LI_EXCLUSIVE) == 0)
 			panic("Lock (%s) %s not exclusively locked @ %s:%d.",
-			    lock->lo_class->lc_name, lock->lo_name, file, line);
+			    class->lc_name, lock->lo_name, file, line);
 		if ((flags & LA_SLOCKED) != 0 &&
 		    (instance->li_flags & LI_EXCLUSIVE) != 0)
 			panic("Lock (%s) %s exclusively locked @ %s:%d.",
-			    lock->lo_class->lc_name, lock->lo_name, file, line);
+			    class->lc_name, lock->lo_name, file, line);
 		if ((flags & LA_RECURSED) != 0 &&
 		    (instance->li_flags & LI_RECURSEMASK) == 0)
 			panic("Lock (%s) %s not recursed @ %s:%d.",
-			    lock->lo_class->lc_name, lock->lo_name, file, line);
+			    class->lc_name, lock->lo_name, file, line);
 		if ((flags & LA_NOTRECURSED) != 0 &&
 		    (instance->li_flags & LI_RECURSEMASK) != 0)
 			panic("Lock (%s) %s recursed @ %s:%d.",
-			    lock->lo_class->lc_name, lock->lo_name, file, line);
+			    class->lc_name, lock->lo_name, file, line);
 		break;
 	default:
 		panic("Invalid lock assertion at %s:%d.", file, line);
@@ -1925,10 +1980,10 @@
 	 * td->td_oncpu to get the list of spinlocks for this thread
 	 * and "fix" this.
 	 *
-	 * That still wouldn't really fix this unless we locked sched_lock
-	 * or stopped the other CPU to make sure it wasn't changing the list
-	 * out from under us.  It is probably best to just not try to handle
-	 * threads on other CPU's for now.
+	 * That still wouldn't really fix this unless we locked the scheduler
+	 * lock or stopped the other CPU to make sure it wasn't changing the
+	 * list out from under us.  It is probably best to just not try to
+	 * handle threads on other CPU's for now.
 	 */
 	if (td == curthread && PCPU_GET(spinlocks) != NULL)
 		witness_list_locks(PCPU_PTR(spinlocks));
@@ -1937,30 +1992,12 @@
 DB_SHOW_COMMAND(locks, db_witness_list)
 {
 	struct thread *td;
-	pid_t pid;
-	struct proc *p;
 
-	if (have_addr) {
-		pid = (addr % 16) + ((addr >> 4) % 16) * 10 +
-		    ((addr >> 8) % 16) * 100 + ((addr >> 12) % 16) * 1000 +
-		    ((addr >> 16) % 16) * 10000;
-		/* sx_slock(&allproc_lock); */
-		FOREACH_PROC_IN_SYSTEM(p) {
-			if (p->p_pid == pid)
-				break;
-		}
-		/* sx_sunlock(&allproc_lock); */
-		if (p == NULL) {
-			db_printf("pid %d not found\n", pid);
-			return;
-		}
-		FOREACH_THREAD_IN_PROC(p, td) {
-			witness_list(td);
-		}
-	} else {
-		td = curthread;
-		witness_list(td);
-	}
+	if (have_addr)
+		td = db_lookup_thread(addr, TRUE);
+	else
+		td = kdb_thread;
+	witness_list(td);
 }
 
 DB_SHOW_COMMAND(alllocks, db_witness_list_all)
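One further pattern from the subr_witness.c diff: witness_save() and witness_restore() no longer assume a sleep lock but select the lock-instance list from the lock class, taking the per-thread list for sleepable locks and the per-CPU list for spin locks (skipped entirely when witness_skipspin is set).  A small sketch of that selection follows; lock_entry and ctx are stand-in types, not the kernel's lock_instance/lock_list_entry.

#include <stdbool.h>
#include <stddef.h>

#define	LC_SLEEPLOCK	0x01
#define	LC_SPINLOCK	0x02

struct lock_entry {
	const void		*obj;
	struct lock_entry	*next;
};

struct ctx {
	struct lock_entry	*td_sleeplocks;	/* per-thread held locks */
	struct lock_entry	*cpu_spinlocks;	/* per-CPU held spin locks */
	bool			 skipspin;	/* spin tracking disabled */
};

static struct lock_entry *
find_instance(struct lock_entry *list, const void *obj)
{
	for (; list != NULL; list = list->next)
		if (list->obj == obj)
			return (list);
	return (NULL);
}

/* Return the held-lock record for 'obj', or NULL if it is not tracked. */
struct lock_entry *
lookup_instance(struct ctx *c, const void *obj, int class_flags)
{
	struct lock_entry *list;

	if (class_flags & LC_SLEEPLOCK)
		list = c->td_sleeplocks;
	else {
		if (c->skipspin)
			return (NULL);
		list = c->cpu_spinlocks;
	}
	return (find_instance(list, obj));
}

Centralizing the choice of list is what lets the save/restore pair (and witness_assert) share one lookup instead of panicking on non-sleep locks.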
Index: init_sysent.c
===================================================================
RCS file: /home/cvs/src/sys/kern/init_sysent.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/init_sysent.c -L sys/kern/init_sysent.c -u -r1.2 -r1.3
--- sys/kern/init_sysent.c
+++ sys/kern/init_sysent.c
@@ -2,8 +2,8 @@
  * System call switch table.
  *
  * DO NOT EDIT-- this file is automatically generated.
- * $FreeBSD: src/sys/kern/init_sysent.c,v 1.195.2.2 2006/03/17 01:47:32 rwatson Exp $
- * created from FreeBSD: src/sys/kern/syscalls.master,v 1.198.2.2 2006/03/17 01:47:06 rwatson Exp 
+ * $FreeBSD: src/sys/kern/init_sysent.c,v 1.230 2007/08/16 05:32:25 davidxu Exp $
+ * created from FreeBSD: src/sys/kern/syscalls.master,v 1.232 2007/07/04 22:47:37 peter Exp 
  */
 
 #include "opt_compat.h"
@@ -29,460 +29,486 @@
 
 /* The casts are bogus but will do for now. */
 struct sysent sysent[] = {
-	{ SYF_MPSAFE | 0, (sy_call_t *)nosys, AUE_NULL },	/* 0 = syscall */
-	{ SYF_MPSAFE | AS(sys_exit_args), (sy_call_t *)sys_exit, AUE_NULL },	/* 1 = exit */
-	{ SYF_MPSAFE | 0, (sy_call_t *)fork, AUE_NULL },	/* 2 = fork */
-	{ SYF_MPSAFE | AS(read_args), (sy_call_t *)read, AUE_NULL },	/* 3 = read */
-	{ SYF_MPSAFE | AS(write_args), (sy_call_t *)write, AUE_NULL },	/* 4 = write */
-	{ SYF_MPSAFE | AS(open_args), (sy_call_t *)open, AUE_NULL },	/* 5 = open */
-	{ SYF_MPSAFE | AS(close_args), (sy_call_t *)close, AUE_NULL },	/* 6 = close */
-	{ SYF_MPSAFE | AS(wait_args), (sy_call_t *)wait4, AUE_NULL },	/* 7 = wait4 */
-	{ compat(SYF_MPSAFE | AS(ocreat_args),creat), AUE_NULL },	/* 8 = old creat */
-	{ SYF_MPSAFE | AS(link_args), (sy_call_t *)link, AUE_NULL },	/* 9 = link */
-	{ SYF_MPSAFE | AS(unlink_args), (sy_call_t *)unlink, AUE_NULL },	/* 10 = unlink */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 11 = obsolete execv */
-	{ SYF_MPSAFE | AS(chdir_args), (sy_call_t *)chdir, AUE_NULL },	/* 12 = chdir */
-	{ SYF_MPSAFE | AS(fchdir_args), (sy_call_t *)fchdir, AUE_NULL },	/* 13 = fchdir */
-	{ SYF_MPSAFE | AS(mknod_args), (sy_call_t *)mknod, AUE_NULL },	/* 14 = mknod */
-	{ SYF_MPSAFE | AS(chmod_args), (sy_call_t *)chmod, AUE_NULL },	/* 15 = chmod */
-	{ SYF_MPSAFE | AS(chown_args), (sy_call_t *)chown, AUE_NULL },	/* 16 = chown */
-	{ SYF_MPSAFE | AS(obreak_args), (sy_call_t *)obreak, AUE_NULL },	/* 17 = break */
-	{ compat4(SYF_MPSAFE | AS(freebsd4_getfsstat_args),getfsstat), AUE_NULL },	/* 18 = old getfsstat */
-	{ compat(SYF_MPSAFE | AS(olseek_args),lseek), AUE_NULL },	/* 19 = old lseek */
-	{ SYF_MPSAFE | 0, (sy_call_t *)getpid, AUE_NULL },	/* 20 = getpid */
-	{ AS(mount_args), (sy_call_t *)mount, AUE_NULL },	/* 21 = mount */
-	{ AS(unmount_args), (sy_call_t *)unmount, AUE_NULL },	/* 22 = unmount */
-	{ SYF_MPSAFE | AS(setuid_args), (sy_call_t *)setuid, AUE_NULL },	/* 23 = setuid */
-	{ SYF_MPSAFE | 0, (sy_call_t *)getuid, AUE_NULL },	/* 24 = getuid */
-	{ SYF_MPSAFE | 0, (sy_call_t *)geteuid, AUE_NULL },	/* 25 = geteuid */
-	{ SYF_MPSAFE | AS(ptrace_args), (sy_call_t *)ptrace, AUE_NULL },	/* 26 = ptrace */
-	{ SYF_MPSAFE | AS(recvmsg_args), (sy_call_t *)recvmsg, AUE_NULL },	/* 27 = recvmsg */
-	{ SYF_MPSAFE | AS(sendmsg_args), (sy_call_t *)sendmsg, AUE_NULL },	/* 28 = sendmsg */
-	{ SYF_MPSAFE | AS(recvfrom_args), (sy_call_t *)recvfrom, AUE_NULL },	/* 29 = recvfrom */
-	{ SYF_MPSAFE | AS(accept_args), (sy_call_t *)accept, AUE_NULL },	/* 30 = accept */
-	{ SYF_MPSAFE | AS(getpeername_args), (sy_call_t *)getpeername, AUE_NULL },	/* 31 = getpeername */
-	{ SYF_MPSAFE | AS(getsockname_args), (sy_call_t *)getsockname, AUE_NULL },	/* 32 = getsockname */
-	{ SYF_MPSAFE | AS(access_args), (sy_call_t *)access, AUE_NULL },	/* 33 = access */
-	{ SYF_MPSAFE | AS(chflags_args), (sy_call_t *)chflags, AUE_NULL },	/* 34 = chflags */
-	{ SYF_MPSAFE | AS(fchflags_args), (sy_call_t *)fchflags, AUE_NULL },	/* 35 = fchflags */
-	{ SYF_MPSAFE | 0, (sy_call_t *)sync, AUE_NULL },	/* 36 = sync */
-	{ SYF_MPSAFE | AS(kill_args), (sy_call_t *)kill, AUE_NULL },	/* 37 = kill */
-	{ compat(SYF_MPSAFE | AS(ostat_args),stat), AUE_NULL },	/* 38 = old stat */
-	{ SYF_MPSAFE | 0, (sy_call_t *)getppid, AUE_NULL },	/* 39 = getppid */
-	{ compat(SYF_MPSAFE | AS(olstat_args),lstat), AUE_NULL },	/* 40 = old lstat */
-	{ SYF_MPSAFE | AS(dup_args), (sy_call_t *)dup, AUE_NULL },	/* 41 = dup */
-	{ SYF_MPSAFE | 0, (sy_call_t *)pipe, AUE_NULL },	/* 42 = pipe */
-	{ SYF_MPSAFE | 0, (sy_call_t *)getegid, AUE_NULL },	/* 43 = getegid */
-	{ SYF_MPSAFE | AS(profil_args), (sy_call_t *)profil, AUE_NULL },	/* 44 = profil */
-	{ SYF_MPSAFE | AS(ktrace_args), (sy_call_t *)ktrace, AUE_NULL },	/* 45 = ktrace */
-	{ compat(SYF_MPSAFE | AS(osigaction_args),sigaction), AUE_NULL },	/* 46 = old sigaction */
-	{ SYF_MPSAFE | 0, (sy_call_t *)getgid, AUE_NULL },	/* 47 = getgid */
-	{ compat(SYF_MPSAFE | AS(osigprocmask_args),sigprocmask), AUE_NULL },	/* 48 = old sigprocmask */
-	{ SYF_MPSAFE | AS(getlogin_args), (sy_call_t *)getlogin, AUE_NULL },	/* 49 = getlogin */
-	{ SYF_MPSAFE | AS(setlogin_args), (sy_call_t *)setlogin, AUE_NULL },	/* 50 = setlogin */
-	{ SYF_MPSAFE | AS(acct_args), (sy_call_t *)acct, AUE_NULL },	/* 51 = acct */
-	{ compat(SYF_MPSAFE | 0,sigpending), AUE_NULL },	/* 52 = old sigpending */
-	{ SYF_MPSAFE | AS(sigaltstack_args), (sy_call_t *)sigaltstack, AUE_NULL },	/* 53 = sigaltstack */
-	{ SYF_MPSAFE | AS(ioctl_args), (sy_call_t *)ioctl, AUE_NULL },	/* 54 = ioctl */
-	{ SYF_MPSAFE | AS(reboot_args), (sy_call_t *)reboot, AUE_NULL },	/* 55 = reboot */
-	{ SYF_MPSAFE | AS(revoke_args), (sy_call_t *)revoke, AUE_NULL },	/* 56 = revoke */
-	{ SYF_MPSAFE | AS(symlink_args), (sy_call_t *)symlink, AUE_NULL },	/* 57 = symlink */
-	{ SYF_MPSAFE | AS(readlink_args), (sy_call_t *)readlink, AUE_NULL },	/* 58 = readlink */
-	{ SYF_MPSAFE | AS(execve_args), (sy_call_t *)execve, AUE_NULL },	/* 59 = execve */
-	{ SYF_MPSAFE | AS(umask_args), (sy_call_t *)umask, AUE_NULL },	/* 60 = umask */
-	{ SYF_MPSAFE | AS(chroot_args), (sy_call_t *)chroot, AUE_NULL },	/* 61 = chroot */
-	{ compat(SYF_MPSAFE | AS(ofstat_args),fstat), AUE_NULL },	/* 62 = old fstat */
-	{ compat(SYF_MPSAFE | AS(getkerninfo_args),getkerninfo), AUE_NULL },	/* 63 = old getkerninfo */
-	{ compat(SYF_MPSAFE | 0,getpagesize), AUE_NULL },	/* 64 = old getpagesize */
-	{ SYF_MPSAFE | AS(msync_args), (sy_call_t *)msync, AUE_NULL },	/* 65 = msync */
-	{ SYF_MPSAFE | 0, (sy_call_t *)vfork, AUE_NULL },	/* 66 = vfork */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 67 = obsolete vread */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 68 = obsolete vwrite */
-	{ SYF_MPSAFE | AS(sbrk_args), (sy_call_t *)sbrk, AUE_NULL },	/* 69 = sbrk */
-	{ SYF_MPSAFE | AS(sstk_args), (sy_call_t *)sstk, AUE_NULL },	/* 70 = sstk */
-	{ compat(SYF_MPSAFE | AS(ommap_args),mmap), AUE_NULL },	/* 71 = old mmap */
-	{ SYF_MPSAFE | AS(ovadvise_args), (sy_call_t *)ovadvise, AUE_NULL },	/* 72 = vadvise */
-	{ SYF_MPSAFE | AS(munmap_args), (sy_call_t *)munmap, AUE_NULL },	/* 73 = munmap */
-	{ SYF_MPSAFE | AS(mprotect_args), (sy_call_t *)mprotect, AUE_NULL },	/* 74 = mprotect */
-	{ SYF_MPSAFE | AS(madvise_args), (sy_call_t *)madvise, AUE_NULL },	/* 75 = madvise */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 76 = obsolete vhangup */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 77 = obsolete vlimit */
-	{ SYF_MPSAFE | AS(mincore_args), (sy_call_t *)mincore, AUE_NULL },	/* 78 = mincore */
-	{ SYF_MPSAFE | AS(getgroups_args), (sy_call_t *)getgroups, AUE_NULL },	/* 79 = getgroups */
-	{ SYF_MPSAFE | AS(setgroups_args), (sy_call_t *)setgroups, AUE_NULL },	/* 80 = setgroups */
-	{ SYF_MPSAFE | 0, (sy_call_t *)getpgrp, AUE_NULL },	/* 81 = getpgrp */
-	{ SYF_MPSAFE | AS(setpgid_args), (sy_call_t *)setpgid, AUE_NULL },	/* 82 = setpgid */
-	{ SYF_MPSAFE | AS(setitimer_args), (sy_call_t *)setitimer, AUE_NULL },	/* 83 = setitimer */
-	{ compat(SYF_MPSAFE | 0,wait), AUE_NULL },	/* 84 = old wait */
-	{ SYF_MPSAFE | AS(swapon_args), (sy_call_t *)swapon, AUE_NULL },	/* 85 = swapon */
-	{ SYF_MPSAFE | AS(getitimer_args), (sy_call_t *)getitimer, AUE_NULL },	/* 86 = getitimer */
-	{ compat(SYF_MPSAFE | AS(gethostname_args),gethostname), AUE_NULL },	/* 87 = old gethostname */
-	{ compat(SYF_MPSAFE | AS(sethostname_args),sethostname), AUE_NULL },	/* 88 = old sethostname */
-	{ SYF_MPSAFE | 0, (sy_call_t *)getdtablesize, AUE_NULL },	/* 89 = getdtablesize */
-	{ SYF_MPSAFE | AS(dup2_args), (sy_call_t *)dup2, AUE_NULL },	/* 90 = dup2 */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 91 = getdopt */
-	{ SYF_MPSAFE | AS(fcntl_args), (sy_call_t *)fcntl, AUE_NULL },	/* 92 = fcntl */
-	{ SYF_MPSAFE | AS(select_args), (sy_call_t *)select, AUE_NULL },	/* 93 = select */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 94 = setdopt */
-	{ SYF_MPSAFE | AS(fsync_args), (sy_call_t *)fsync, AUE_NULL },	/* 95 = fsync */
-	{ SYF_MPSAFE | AS(setpriority_args), (sy_call_t *)setpriority, AUE_NULL },	/* 96 = setpriority */
-	{ SYF_MPSAFE | AS(socket_args), (sy_call_t *)socket, AUE_NULL },	/* 97 = socket */
-	{ SYF_MPSAFE | AS(connect_args), (sy_call_t *)connect, AUE_NULL },	/* 98 = connect */
-	{ compat(SYF_MPSAFE | AS(accept_args),accept), AUE_NULL },	/* 99 = old accept */
-	{ SYF_MPSAFE | AS(getpriority_args), (sy_call_t *)getpriority, AUE_NULL },	/* 100 = getpriority */
-	{ compat(SYF_MPSAFE | AS(osend_args),send), AUE_NULL },	/* 101 = old send */
-	{ compat(SYF_MPSAFE | AS(orecv_args),recv), AUE_NULL },	/* 102 = old recv */
-	{ compat(SYF_MPSAFE | AS(osigreturn_args),sigreturn), AUE_NULL },	/* 103 = old sigreturn */
-	{ SYF_MPSAFE | AS(bind_args), (sy_call_t *)bind, AUE_NULL },	/* 104 = bind */
-	{ SYF_MPSAFE | AS(setsockopt_args), (sy_call_t *)setsockopt, AUE_NULL },	/* 105 = setsockopt */
-	{ SYF_MPSAFE | AS(listen_args), (sy_call_t *)listen, AUE_NULL },	/* 106 = listen */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 107 = obsolete vtimes */
-	{ compat(SYF_MPSAFE | AS(osigvec_args),sigvec), AUE_NULL },	/* 108 = old sigvec */
-	{ compat(SYF_MPSAFE | AS(osigblock_args),sigblock), AUE_NULL },	/* 109 = old sigblock */
-	{ compat(SYF_MPSAFE | AS(osigsetmask_args),sigsetmask), AUE_NULL },	/* 110 = old sigsetmask */
-	{ compat(SYF_MPSAFE | AS(osigsuspend_args),sigsuspend), AUE_NULL },	/* 111 = old sigsuspend */
-	{ compat(SYF_MPSAFE | AS(osigstack_args),sigstack), AUE_NULL },	/* 112 = old sigstack */
-	{ compat(SYF_MPSAFE | AS(orecvmsg_args),recvmsg), AUE_NULL },	/* 113 = old recvmsg */
-	{ compat(SYF_MPSAFE | AS(osendmsg_args),sendmsg), AUE_NULL },	/* 114 = old sendmsg */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 115 = obsolete vtrace */
-	{ SYF_MPSAFE | AS(gettimeofday_args), (sy_call_t *)gettimeofday, AUE_NULL },	/* 116 = gettimeofday */
-	{ SYF_MPSAFE | AS(getrusage_args), (sy_call_t *)getrusage, AUE_NULL },	/* 117 = getrusage */
-	{ SYF_MPSAFE | AS(getsockopt_args), (sy_call_t *)getsockopt, AUE_NULL },	/* 118 = getsockopt */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 119 = resuba */
-	{ SYF_MPSAFE | AS(readv_args), (sy_call_t *)readv, AUE_NULL },	/* 120 = readv */
-	{ SYF_MPSAFE | AS(writev_args), (sy_call_t *)writev, AUE_NULL },	/* 121 = writev */
-	{ SYF_MPSAFE | AS(settimeofday_args), (sy_call_t *)settimeofday, AUE_NULL },	/* 122 = settimeofday */
-	{ SYF_MPSAFE | AS(fchown_args), (sy_call_t *)fchown, AUE_NULL },	/* 123 = fchown */
-	{ SYF_MPSAFE | AS(fchmod_args), (sy_call_t *)fchmod, AUE_NULL },	/* 124 = fchmod */
-	{ compat(SYF_MPSAFE | AS(recvfrom_args),recvfrom), AUE_NULL },	/* 125 = old recvfrom */
-	{ SYF_MPSAFE | AS(setreuid_args), (sy_call_t *)setreuid, AUE_NULL },	/* 126 = setreuid */
-	{ SYF_MPSAFE | AS(setregid_args), (sy_call_t *)setregid, AUE_NULL },	/* 127 = setregid */
-	{ SYF_MPSAFE | AS(rename_args), (sy_call_t *)rename, AUE_NULL },	/* 128 = rename */
-	{ compat(SYF_MPSAFE | AS(otruncate_args),truncate), AUE_NULL },	/* 129 = old truncate */
-	{ compat(SYF_MPSAFE | AS(oftruncate_args),ftruncate), AUE_NULL },	/* 130 = old ftruncate */
-	{ SYF_MPSAFE | AS(flock_args), (sy_call_t *)flock, AUE_NULL },	/* 131 = flock */
-	{ SYF_MPSAFE | AS(mkfifo_args), (sy_call_t *)mkfifo, AUE_NULL },	/* 132 = mkfifo */
-	{ SYF_MPSAFE | AS(sendto_args), (sy_call_t *)sendto, AUE_NULL },	/* 133 = sendto */
-	{ SYF_MPSAFE | AS(shutdown_args), (sy_call_t *)shutdown, AUE_NULL },	/* 134 = shutdown */
-	{ SYF_MPSAFE | AS(socketpair_args), (sy_call_t *)socketpair, AUE_NULL },	/* 135 = socketpair */
-	{ SYF_MPSAFE | AS(mkdir_args), (sy_call_t *)mkdir, AUE_NULL },	/* 136 = mkdir */
-	{ SYF_MPSAFE | AS(rmdir_args), (sy_call_t *)rmdir, AUE_NULL },	/* 137 = rmdir */
-	{ SYF_MPSAFE | AS(utimes_args), (sy_call_t *)utimes, AUE_NULL },	/* 138 = utimes */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 139 = obsolete 4.2 sigreturn */
-	{ SYF_MPSAFE | AS(adjtime_args), (sy_call_t *)adjtime, AUE_NULL },	/* 140 = adjtime */
-	{ compat(SYF_MPSAFE | AS(ogetpeername_args),getpeername), AUE_NULL },	/* 141 = old getpeername */
-	{ compat(SYF_MPSAFE | 0,gethostid), AUE_NULL },	/* 142 = old gethostid */
-	{ compat(SYF_MPSAFE | AS(osethostid_args),sethostid), AUE_NULL },	/* 143 = old sethostid */
-	{ compat(SYF_MPSAFE | AS(ogetrlimit_args),getrlimit), AUE_NULL },	/* 144 = old getrlimit */
-	{ compat(SYF_MPSAFE | AS(osetrlimit_args),setrlimit), AUE_NULL },	/* 145 = old setrlimit */
-	{ compat(SYF_MPSAFE | AS(okillpg_args),killpg), AUE_NULL },	/* 146 = old killpg */
-	{ SYF_MPSAFE | 0, (sy_call_t *)setsid, AUE_NULL },	/* 147 = setsid */
-	{ SYF_MPSAFE | AS(quotactl_args), (sy_call_t *)quotactl, AUE_NULL },	/* 148 = quotactl */
-	{ compat(SYF_MPSAFE | 0,quota), AUE_NULL },	/* 149 = old quota */
-	{ compat(SYF_MPSAFE | AS(getsockname_args),getsockname), AUE_NULL },	/* 150 = old getsockname */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 151 = sem_lock */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 152 = sem_wakeup */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 153 = asyncdaemon */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 154 = nosys */
-	{ SYF_MPSAFE | AS(nfssvc_args), (sy_call_t *)nosys, AUE_NULL },	/* 155 = nfssvc */
-	{ compat(AS(ogetdirentries_args),getdirentries), AUE_NULL },	/* 156 = old getdirentries */
-	{ compat4(SYF_MPSAFE | AS(freebsd4_statfs_args),statfs), AUE_NULL },	/* 157 = old statfs */
-	{ compat4(SYF_MPSAFE | AS(freebsd4_fstatfs_args),fstatfs), AUE_NULL },	/* 158 = old fstatfs */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 159 = nosys */
-	{ SYF_MPSAFE | AS(lgetfh_args), (sy_call_t *)lgetfh, AUE_NULL },	/* 160 = lgetfh */
-	{ SYF_MPSAFE | AS(getfh_args), (sy_call_t *)getfh, AUE_NULL },	/* 161 = getfh */
-	{ SYF_MPSAFE | AS(getdomainname_args), (sy_call_t *)getdomainname, AUE_NULL },	/* 162 = getdomainname */
-	{ SYF_MPSAFE | AS(setdomainname_args), (sy_call_t *)setdomainname, AUE_NULL },	/* 163 = setdomainname */
-	{ SYF_MPSAFE | AS(uname_args), (sy_call_t *)uname, AUE_NULL },	/* 164 = uname */
-	{ SYF_MPSAFE | AS(sysarch_args), (sy_call_t *)sysarch, AUE_NULL },	/* 165 = sysarch */
-	{ SYF_MPSAFE | AS(rtprio_args), (sy_call_t *)rtprio, AUE_NULL },	/* 166 = rtprio */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 167 = nosys */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 168 = nosys */
-	{ SYF_MPSAFE | AS(semsys_args), (sy_call_t *)lkmressys, AUE_NULL },	/* 169 = semsys */
-	{ SYF_MPSAFE | AS(msgsys_args), (sy_call_t *)lkmressys, AUE_NULL },	/* 170 = msgsys */
-	{ SYF_MPSAFE | AS(shmsys_args), (sy_call_t *)lkmressys, AUE_NULL },	/* 171 = shmsys */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 172 = nosys */
-	{ SYF_MPSAFE | AS(pread_args), (sy_call_t *)pread, AUE_NULL },	/* 173 = pread */
-	{ SYF_MPSAFE | AS(pwrite_args), (sy_call_t *)pwrite, AUE_NULL },	/* 174 = pwrite */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 175 = nosys */
-	{ SYF_MPSAFE | AS(ntp_adjtime_args), (sy_call_t *)ntp_adjtime, AUE_NULL },	/* 176 = ntp_adjtime */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 177 = sfork */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 178 = getdescriptor */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 179 = setdescriptor */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 180 = nosys */
-	{ SYF_MPSAFE | AS(setgid_args), (sy_call_t *)setgid, AUE_NULL },	/* 181 = setgid */
-	{ SYF_MPSAFE | AS(setegid_args), (sy_call_t *)setegid, AUE_NULL },	/* 182 = setegid */
-	{ SYF_MPSAFE | AS(seteuid_args), (sy_call_t *)seteuid, AUE_NULL },	/* 183 = seteuid */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 184 = lfs_bmapv */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 185 = lfs_markv */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 186 = lfs_segclean */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 187 = lfs_segwait */
-	{ SYF_MPSAFE | AS(stat_args), (sy_call_t *)stat, AUE_NULL },	/* 188 = stat */
-	{ SYF_MPSAFE | AS(fstat_args), (sy_call_t *)fstat, AUE_NULL },	/* 189 = fstat */
-	{ SYF_MPSAFE | AS(lstat_args), (sy_call_t *)lstat, AUE_NULL },	/* 190 = lstat */
-	{ SYF_MPSAFE | AS(pathconf_args), (sy_call_t *)pathconf, AUE_NULL },	/* 191 = pathconf */
-	{ SYF_MPSAFE | AS(fpathconf_args), (sy_call_t *)fpathconf, AUE_NULL },	/* 192 = fpathconf */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 193 = nosys */
-	{ SYF_MPSAFE | AS(__getrlimit_args), (sy_call_t *)getrlimit, AUE_NULL },	/* 194 = getrlimit */
-	{ SYF_MPSAFE | AS(__setrlimit_args), (sy_call_t *)setrlimit, AUE_NULL },	/* 195 = setrlimit */
-	{ SYF_MPSAFE | AS(getdirentries_args), (sy_call_t *)getdirentries, AUE_NULL },	/* 196 = getdirentries */
-	{ SYF_MPSAFE | AS(mmap_args), (sy_call_t *)mmap, AUE_NULL },	/* 197 = mmap */
-	{ SYF_MPSAFE | 0, (sy_call_t *)nosys, AUE_NULL },	/* 198 = __syscall */
-	{ SYF_MPSAFE | AS(lseek_args), (sy_call_t *)lseek, AUE_NULL },	/* 199 = lseek */
-	{ SYF_MPSAFE | AS(truncate_args), (sy_call_t *)truncate, AUE_NULL },	/* 200 = truncate */
-	{ SYF_MPSAFE | AS(ftruncate_args), (sy_call_t *)ftruncate, AUE_NULL },	/* 201 = ftruncate */
-	{ SYF_MPSAFE | AS(sysctl_args), (sy_call_t *)__sysctl, AUE_NULL },	/* 202 = __sysctl */
-	{ SYF_MPSAFE | AS(mlock_args), (sy_call_t *)mlock, AUE_NULL },	/* 203 = mlock */
-	{ SYF_MPSAFE | AS(munlock_args), (sy_call_t *)munlock, AUE_NULL },	/* 204 = munlock */
-	{ SYF_MPSAFE | AS(undelete_args), (sy_call_t *)undelete, AUE_NULL },	/* 205 = undelete */
-	{ SYF_MPSAFE | AS(futimes_args), (sy_call_t *)futimes, AUE_NULL },	/* 206 = futimes */
-	{ SYF_MPSAFE | AS(getpgid_args), (sy_call_t *)getpgid, AUE_NULL },	/* 207 = getpgid */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 208 = newreboot */
-	{ SYF_MPSAFE | AS(poll_args), (sy_call_t *)poll, AUE_NULL },	/* 209 = poll */
-	{ AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL },	/* 210 = lkmnosys */
-	{ AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL },	/* 211 = lkmnosys */
-	{ AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL },	/* 212 = lkmnosys */
-	{ AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL },	/* 213 = lkmnosys */
-	{ AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL },	/* 214 = lkmnosys */
-	{ AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL },	/* 215 = lkmnosys */
-	{ AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL },	/* 216 = lkmnosys */
-	{ AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL },	/* 217 = lkmnosys */
-	{ AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL },	/* 218 = lkmnosys */
-	{ AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL },	/* 219 = lkmnosys */
-	{ SYF_MPSAFE | AS(__semctl_args), (sy_call_t *)lkmressys, AUE_NULL },	/* 220 = __semctl */
-	{ SYF_MPSAFE | AS(semget_args), (sy_call_t *)lkmressys, AUE_NULL },	/* 221 = semget */
-	{ SYF_MPSAFE | AS(semop_args), (sy_call_t *)lkmressys, AUE_NULL },	/* 222 = semop */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 223 = semconfig */
-	{ SYF_MPSAFE | AS(msgctl_args), (sy_call_t *)lkmressys, AUE_NULL },	/* 224 = msgctl */
-	{ SYF_MPSAFE | AS(msgget_args), (sy_call_t *)lkmressys, AUE_NULL },	/* 225 = msgget */
-	{ SYF_MPSAFE | AS(msgsnd_args), (sy_call_t *)lkmressys, AUE_NULL },	/* 226 = msgsnd */
-	{ SYF_MPSAFE | AS(msgrcv_args), (sy_call_t *)lkmressys, AUE_NULL },	/* 227 = msgrcv */
-	{ SYF_MPSAFE | AS(shmat_args), (sy_call_t *)lkmressys, AUE_NULL },	/* 228 = shmat */
-	{ SYF_MPSAFE | AS(shmctl_args), (sy_call_t *)lkmressys, AUE_NULL },	/* 229 = shmctl */
-	{ SYF_MPSAFE | AS(shmdt_args), (sy_call_t *)lkmressys, AUE_NULL },	/* 230 = shmdt */
-	{ SYF_MPSAFE | AS(shmget_args), (sy_call_t *)lkmressys, AUE_NULL },	/* 231 = shmget */
-	{ SYF_MPSAFE | AS(clock_gettime_args), (sy_call_t *)clock_gettime, AUE_NULL },	/* 232 = clock_gettime */
-	{ SYF_MPSAFE | AS(clock_settime_args), (sy_call_t *)clock_settime, AUE_NULL },	/* 233 = clock_settime */
-	{ SYF_MPSAFE | AS(clock_getres_args), (sy_call_t *)clock_getres, AUE_NULL },	/* 234 = clock_getres */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 235 = timer_create */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 236 = timer_delete */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 237 = timer_settime */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 238 = timer_gettime */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 239 = timer_getoverrun */
-	{ SYF_MPSAFE | AS(nanosleep_args), (sy_call_t *)nanosleep, AUE_NULL },	/* 240 = nanosleep */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 241 = nosys */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 242 = nosys */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 243 = nosys */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 244 = nosys */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 245 = nosys */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 246 = nosys */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 247 = nosys */
-	{ SYF_MPSAFE | AS(ntp_gettime_args), (sy_call_t *)ntp_gettime, AUE_NULL },	/* 248 = ntp_gettime */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 249 = nosys */
-	{ SYF_MPSAFE | AS(minherit_args), (sy_call_t *)minherit, AUE_NULL },	/* 250 = minherit */
-	{ SYF_MPSAFE | AS(rfork_args), (sy_call_t *)rfork, AUE_NULL },	/* 251 = rfork */
-	{ SYF_MPSAFE | AS(openbsd_poll_args), (sy_call_t *)openbsd_poll, AUE_NULL },	/* 252 = openbsd_poll */
-	{ SYF_MPSAFE | 0, (sy_call_t *)issetugid, AUE_NULL },	/* 253 = issetugid */
-	{ SYF_MPSAFE | AS(lchown_args), (sy_call_t *)lchown, AUE_NULL },	/* 254 = lchown */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 255 = nosys */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 256 = nosys */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 257 = nosys */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 258 = nosys */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 259 = nosys */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 260 = nosys */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 261 = nosys */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 262 = nosys */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 263 = nosys */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 264 = nosys */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 265 = nosys */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 266 = nosys */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 267 = nosys */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 268 = nosys */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 269 = nosys */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 270 = nosys */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 271 = nosys */
-	{ SYF_MPSAFE | AS(getdents_args), (sy_call_t *)getdents, AUE_NULL },	/* 272 = getdents */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 273 = nosys */
-	{ SYF_MPSAFE | AS(lchmod_args), (sy_call_t *)lchmod, AUE_NULL },	/* 274 = lchmod */
-	{ SYF_MPSAFE | AS(lchown_args), (sy_call_t *)lchown, AUE_NULL },	/* 275 = netbsd_lchown */
-	{ SYF_MPSAFE | AS(lutimes_args), (sy_call_t *)lutimes, AUE_NULL },	/* 276 = lutimes */
-	{ SYF_MPSAFE | AS(msync_args), (sy_call_t *)msync, AUE_NULL },	/* 277 = netbsd_msync */
-	{ SYF_MPSAFE | AS(nstat_args), (sy_call_t *)nstat, AUE_NULL },	/* 278 = nstat */
-	{ SYF_MPSAFE | AS(nfstat_args), (sy_call_t *)nfstat, AUE_NULL },	/* 279 = nfstat */
-	{ SYF_MPSAFE | AS(nlstat_args), (sy_call_t *)nlstat, AUE_NULL },	/* 280 = nlstat */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 281 = nosys */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 282 = nosys */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 283 = nosys */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 284 = nosys */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 285 = nosys */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 286 = nosys */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 287 = nosys */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 288 = nosys */
-	{ SYF_MPSAFE | AS(preadv_args), (sy_call_t *)preadv, AUE_NULL },	/* 289 = preadv */
-	{ SYF_MPSAFE | AS(pwritev_args), (sy_call_t *)pwritev, AUE_NULL },	/* 290 = pwritev */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 291 = nosys */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 292 = nosys */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 293 = nosys */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 294 = nosys */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 295 = nosys */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 296 = nosys */
-	{ compat4(SYF_MPSAFE | AS(freebsd4_fhstatfs_args),fhstatfs), AUE_NULL },	/* 297 = old fhstatfs */
-	{ SYF_MPSAFE | AS(fhopen_args), (sy_call_t *)fhopen, AUE_NULL },	/* 298 = fhopen */
-	{ SYF_MPSAFE | AS(fhstat_args), (sy_call_t *)fhstat, AUE_NULL },	/* 299 = fhstat */
-	{ SYF_MPSAFE | AS(modnext_args), (sy_call_t *)modnext, AUE_NULL },	/* 300 = modnext */
-	{ SYF_MPSAFE | AS(modstat_args), (sy_call_t *)modstat, AUE_NULL },	/* 301 = modstat */
-	{ SYF_MPSAFE | AS(modfnext_args), (sy_call_t *)modfnext, AUE_NULL },	/* 302 = modfnext */
-	{ SYF_MPSAFE | AS(modfind_args), (sy_call_t *)modfind, AUE_NULL },	/* 303 = modfind */
-	{ SYF_MPSAFE | AS(kldload_args), (sy_call_t *)kldload, AUE_NULL },	/* 304 = kldload */
-	{ SYF_MPSAFE | AS(kldunload_args), (sy_call_t *)kldunload, AUE_NULL },	/* 305 = kldunload */
-	{ SYF_MPSAFE | AS(kldfind_args), (sy_call_t *)kldfind, AUE_NULL },	/* 306 = kldfind */
-	{ SYF_MPSAFE | AS(kldnext_args), (sy_call_t *)kldnext, AUE_NULL },	/* 307 = kldnext */
-	{ SYF_MPSAFE | AS(kldstat_args), (sy_call_t *)kldstat, AUE_NULL },	/* 308 = kldstat */
-	{ SYF_MPSAFE | AS(kldfirstmod_args), (sy_call_t *)kldfirstmod, AUE_NULL },	/* 309 = kldfirstmod */
-	{ SYF_MPSAFE | AS(getsid_args), (sy_call_t *)getsid, AUE_NULL },	/* 310 = getsid */
-	{ SYF_MPSAFE | AS(setresuid_args), (sy_call_t *)setresuid, AUE_NULL },	/* 311 = setresuid */
-	{ SYF_MPSAFE | AS(setresgid_args), (sy_call_t *)setresgid, AUE_NULL },	/* 312 = setresgid */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 313 = obsolete signanosleep */
-	{ AS(aio_return_args), (sy_call_t *)lkmressys, AUE_NULL },	/* 314 = aio_return */
-	{ AS(aio_suspend_args), (sy_call_t *)lkmressys, AUE_NULL },	/* 315 = aio_suspend */
-	{ AS(aio_cancel_args), (sy_call_t *)lkmressys, AUE_NULL },	/* 316 = aio_cancel */
-	{ AS(aio_error_args), (sy_call_t *)lkmressys, AUE_NULL },	/* 317 = aio_error */
-	{ AS(aio_read_args), (sy_call_t *)lkmressys, AUE_NULL },	/* 318 = aio_read */
-	{ AS(aio_write_args), (sy_call_t *)lkmressys, AUE_NULL },	/* 319 = aio_write */
-	{ AS(lio_listio_args), (sy_call_t *)lkmressys, AUE_NULL },	/* 320 = lio_listio */
-	{ SYF_MPSAFE | 0, (sy_call_t *)yield, AUE_NULL },	/* 321 = yield */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 322 = obsolete thr_sleep */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 323 = obsolete thr_wakeup */
-	{ SYF_MPSAFE | AS(mlockall_args), (sy_call_t *)mlockall, AUE_NULL },	/* 324 = mlockall */
-	{ SYF_MPSAFE | 0, (sy_call_t *)munlockall, AUE_NULL },	/* 325 = munlockall */
-	{ SYF_MPSAFE | AS(__getcwd_args), (sy_call_t *)__getcwd, AUE_NULL },	/* 326 = __getcwd */
-	{ SYF_MPSAFE | AS(sched_setparam_args), (sy_call_t *)sched_setparam, AUE_NULL },	/* 327 = sched_setparam */
-	{ SYF_MPSAFE | AS(sched_getparam_args), (sy_call_t *)sched_getparam, AUE_NULL },	/* 328 = sched_getparam */
-	{ SYF_MPSAFE | AS(sched_setscheduler_args), (sy_call_t *)sched_setscheduler, AUE_NULL },	/* 329 = sched_setscheduler */
-	{ SYF_MPSAFE | AS(sched_getscheduler_args), (sy_call_t *)sched_getscheduler, AUE_NULL },	/* 330 = sched_getscheduler */
-	{ SYF_MPSAFE | 0, (sy_call_t *)sched_yield, AUE_NULL },	/* 331 = sched_yield */
-	{ SYF_MPSAFE | AS(sched_get_priority_max_args), (sy_call_t *)sched_get_priority_max, AUE_NULL },	/* 332 = sched_get_priority_max */
-	{ SYF_MPSAFE | AS(sched_get_priority_min_args), (sy_call_t *)sched_get_priority_min, AUE_NULL },	/* 333 = sched_get_priority_min */
-	{ SYF_MPSAFE | AS(sched_rr_get_interval_args), (sy_call_t *)sched_rr_get_interval, AUE_NULL },	/* 334 = sched_rr_get_interval */
-	{ SYF_MPSAFE | AS(utrace_args), (sy_call_t *)utrace, AUE_NULL },	/* 335 = utrace */
-	{ compat4(SYF_MPSAFE | AS(freebsd4_sendfile_args),sendfile), AUE_NULL },	/* 336 = old sendfile */
-	{ SYF_MPSAFE | AS(kldsym_args), (sy_call_t *)kldsym, AUE_NULL },	/* 337 = kldsym */
-	{ SYF_MPSAFE | AS(jail_args), (sy_call_t *)jail, AUE_NULL },	/* 338 = jail */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 339 = pioctl */
-	{ SYF_MPSAFE | AS(sigprocmask_args), (sy_call_t *)sigprocmask, AUE_NULL },	/* 340 = sigprocmask */
-	{ SYF_MPSAFE | AS(sigsuspend_args), (sy_call_t *)sigsuspend, AUE_NULL },	/* 341 = sigsuspend */
-	{ compat4(SYF_MPSAFE | AS(freebsd4_sigaction_args),sigaction), AUE_NULL },	/* 342 = old sigaction */
-	{ SYF_MPSAFE | AS(sigpending_args), (sy_call_t *)sigpending, AUE_NULL },	/* 343 = sigpending */
-	{ compat4(SYF_MPSAFE | AS(freebsd4_sigreturn_args),sigreturn), AUE_NULL },	/* 344 = old sigreturn */
-	{ SYF_MPSAFE | AS(sigtimedwait_args), (sy_call_t *)sigtimedwait, AUE_NULL },	/* 345 = sigtimedwait */
-	{ SYF_MPSAFE | AS(sigwaitinfo_args), (sy_call_t *)sigwaitinfo, AUE_NULL },	/* 346 = sigwaitinfo */
-	{ SYF_MPSAFE | AS(__acl_get_file_args), (sy_call_t *)__acl_get_file, AUE_NULL },	/* 347 = __acl_get_file */
-	{ SYF_MPSAFE | AS(__acl_set_file_args), (sy_call_t *)__acl_set_file, AUE_NULL },	/* 348 = __acl_set_file */
-	{ SYF_MPSAFE | AS(__acl_get_fd_args), (sy_call_t *)__acl_get_fd, AUE_NULL },	/* 349 = __acl_get_fd */
-	{ SYF_MPSAFE | AS(__acl_set_fd_args), (sy_call_t *)__acl_set_fd, AUE_NULL },	/* 350 = __acl_set_fd */
-	{ SYF_MPSAFE | AS(__acl_delete_file_args), (sy_call_t *)__acl_delete_file, AUE_NULL },	/* 351 = __acl_delete_file */
-	{ SYF_MPSAFE | AS(__acl_delete_fd_args), (sy_call_t *)__acl_delete_fd, AUE_NULL },	/* 352 = __acl_delete_fd */
-	{ SYF_MPSAFE | AS(__acl_aclcheck_file_args), (sy_call_t *)__acl_aclcheck_file, AUE_NULL },	/* 353 = __acl_aclcheck_file */
-	{ SYF_MPSAFE | AS(__acl_aclcheck_fd_args), (sy_call_t *)__acl_aclcheck_fd, AUE_NULL },	/* 354 = __acl_aclcheck_fd */
-	{ SYF_MPSAFE | AS(extattrctl_args), (sy_call_t *)extattrctl, AUE_NULL },	/* 355 = extattrctl */
-	{ SYF_MPSAFE | AS(extattr_set_file_args), (sy_call_t *)extattr_set_file, AUE_NULL },	/* 356 = extattr_set_file */
-	{ SYF_MPSAFE | AS(extattr_get_file_args), (sy_call_t *)extattr_get_file, AUE_NULL },	/* 357 = extattr_get_file */
-	{ SYF_MPSAFE | AS(extattr_delete_file_args), (sy_call_t *)extattr_delete_file, AUE_NULL },	/* 358 = extattr_delete_file */
-	{ AS(aio_waitcomplete_args), (sy_call_t *)lkmressys, AUE_NULL },	/* 359 = aio_waitcomplete */
-	{ SYF_MPSAFE | AS(getresuid_args), (sy_call_t *)getresuid, AUE_NULL },	/* 360 = getresuid */
-	{ SYF_MPSAFE | AS(getresgid_args), (sy_call_t *)getresgid, AUE_NULL },	/* 361 = getresgid */
-	{ SYF_MPSAFE | 0, (sy_call_t *)kqueue, AUE_NULL },	/* 362 = kqueue */
-	{ SYF_MPSAFE | AS(kevent_args), (sy_call_t *)kevent, AUE_NULL },	/* 363 = kevent */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 364 = __cap_get_proc */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 365 = __cap_set_proc */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 366 = __cap_get_fd */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 367 = __cap_get_file */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 368 = __cap_set_fd */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 369 = __cap_set_file */
-	{ AS(nosys_args), (sy_call_t *)lkmressys, AUE_NULL },	/* 370 = lkmressys */
-	{ SYF_MPSAFE | AS(extattr_set_fd_args), (sy_call_t *)extattr_set_fd, AUE_NULL },	/* 371 = extattr_set_fd */
-	{ SYF_MPSAFE | AS(extattr_get_fd_args), (sy_call_t *)extattr_get_fd, AUE_NULL },	/* 372 = extattr_get_fd */
-	{ SYF_MPSAFE | AS(extattr_delete_fd_args), (sy_call_t *)extattr_delete_fd, AUE_NULL },	/* 373 = extattr_delete_fd */
-	{ SYF_MPSAFE | AS(__setugid_args), (sy_call_t *)__setugid, AUE_NULL },	/* 374 = __setugid */
-	{ AS(nfsclnt_args), (sy_call_t *)nosys, AUE_NULL },	/* 375 = nfsclnt */
-	{ SYF_MPSAFE | AS(eaccess_args), (sy_call_t *)eaccess, AUE_NULL },	/* 376 = eaccess */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 377 = afs_syscall */
-	{ AS(nmount_args), (sy_call_t *)nmount, AUE_NULL },	/* 378 = nmount */
-	{ SYF_MPSAFE | 0, (sy_call_t *)kse_exit, AUE_NULL },	/* 379 = kse_exit */
-	{ SYF_MPSAFE | AS(kse_wakeup_args), (sy_call_t *)kse_wakeup, AUE_NULL },	/* 380 = kse_wakeup */
-	{ SYF_MPSAFE | AS(kse_create_args), (sy_call_t *)kse_create, AUE_NULL },	/* 381 = kse_create */
-	{ SYF_MPSAFE | AS(kse_thr_interrupt_args), (sy_call_t *)kse_thr_interrupt, AUE_NULL },	/* 382 = kse_thr_interrupt */
-	{ SYF_MPSAFE | AS(kse_release_args), (sy_call_t *)kse_release, AUE_NULL },	/* 383 = kse_release */
-	{ SYF_MPSAFE | AS(__mac_get_proc_args), (sy_call_t *)__mac_get_proc, AUE_NULL },	/* 384 = __mac_get_proc */
-	{ SYF_MPSAFE | AS(__mac_set_proc_args), (sy_call_t *)__mac_set_proc, AUE_NULL },	/* 385 = __mac_set_proc */
-	{ SYF_MPSAFE | AS(__mac_get_fd_args), (sy_call_t *)__mac_get_fd, AUE_NULL },	/* 386 = __mac_get_fd */
-	{ SYF_MPSAFE | AS(__mac_get_file_args), (sy_call_t *)__mac_get_file, AUE_NULL },	/* 387 = __mac_get_file */
-	{ SYF_MPSAFE | AS(__mac_set_fd_args), (sy_call_t *)__mac_set_fd, AUE_NULL },	/* 388 = __mac_set_fd */
-	{ SYF_MPSAFE | AS(__mac_set_file_args), (sy_call_t *)__mac_set_file, AUE_NULL },	/* 389 = __mac_set_file */
-	{ SYF_MPSAFE | AS(kenv_args), (sy_call_t *)kenv, AUE_NULL },	/* 390 = kenv */
-	{ SYF_MPSAFE | AS(lchflags_args), (sy_call_t *)lchflags, AUE_NULL },	/* 391 = lchflags */
-	{ SYF_MPSAFE | AS(uuidgen_args), (sy_call_t *)uuidgen, AUE_NULL },	/* 392 = uuidgen */
-	{ SYF_MPSAFE | AS(sendfile_args), (sy_call_t *)sendfile, AUE_NULL },	/* 393 = sendfile */
-	{ SYF_MPSAFE | AS(mac_syscall_args), (sy_call_t *)mac_syscall, AUE_NULL },	/* 394 = mac_syscall */
-	{ SYF_MPSAFE | AS(getfsstat_args), (sy_call_t *)getfsstat, AUE_NULL },	/* 395 = getfsstat */
-	{ SYF_MPSAFE | AS(statfs_args), (sy_call_t *)statfs, AUE_NULL },	/* 396 = statfs */
-	{ SYF_MPSAFE | AS(fstatfs_args), (sy_call_t *)fstatfs, AUE_NULL },	/* 397 = fstatfs */
-	{ SYF_MPSAFE | AS(fhstatfs_args), (sy_call_t *)fhstatfs, AUE_NULL },	/* 398 = fhstatfs */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 399 = nosys */
-	{ SYF_MPSAFE | AS(ksem_close_args), (sy_call_t *)lkmressys, AUE_NULL },	/* 400 = ksem_close */
-	{ SYF_MPSAFE | AS(ksem_post_args), (sy_call_t *)lkmressys, AUE_NULL },	/* 401 = ksem_post */
-	{ SYF_MPSAFE | AS(ksem_wait_args), (sy_call_t *)lkmressys, AUE_NULL },	/* 402 = ksem_wait */
-	{ SYF_MPSAFE | AS(ksem_trywait_args), (sy_call_t *)lkmressys, AUE_NULL },	/* 403 = ksem_trywait */
-	{ SYF_MPSAFE | AS(ksem_init_args), (sy_call_t *)lkmressys, AUE_NULL },	/* 404 = ksem_init */
-	{ SYF_MPSAFE | AS(ksem_open_args), (sy_call_t *)lkmressys, AUE_NULL },	/* 405 = ksem_open */
-	{ SYF_MPSAFE | AS(ksem_unlink_args), (sy_call_t *)lkmressys, AUE_NULL },	/* 406 = ksem_unlink */
-	{ SYF_MPSAFE | AS(ksem_getvalue_args), (sy_call_t *)lkmressys, AUE_NULL },	/* 407 = ksem_getvalue */
-	{ SYF_MPSAFE | AS(ksem_destroy_args), (sy_call_t *)lkmressys, AUE_NULL },	/* 408 = ksem_destroy */
-	{ SYF_MPSAFE | AS(__mac_get_pid_args), (sy_call_t *)__mac_get_pid, AUE_NULL },	/* 409 = __mac_get_pid */
-	{ SYF_MPSAFE | AS(__mac_get_link_args), (sy_call_t *)__mac_get_link, AUE_NULL },	/* 410 = __mac_get_link */
-	{ SYF_MPSAFE | AS(__mac_set_link_args), (sy_call_t *)__mac_set_link, AUE_NULL },	/* 411 = __mac_set_link */
-	{ SYF_MPSAFE | AS(extattr_set_link_args), (sy_call_t *)extattr_set_link, AUE_NULL },	/* 412 = extattr_set_link */
-	{ SYF_MPSAFE | AS(extattr_get_link_args), (sy_call_t *)extattr_get_link, AUE_NULL },	/* 413 = extattr_get_link */
-	{ SYF_MPSAFE | AS(extattr_delete_link_args), (sy_call_t *)extattr_delete_link, AUE_NULL },	/* 414 = extattr_delete_link */
-	{ SYF_MPSAFE | AS(__mac_execve_args), (sy_call_t *)__mac_execve, AUE_NULL },	/* 415 = __mac_execve */
-	{ SYF_MPSAFE | AS(sigaction_args), (sy_call_t *)sigaction, AUE_NULL },	/* 416 = sigaction */
-	{ SYF_MPSAFE | AS(sigreturn_args), (sy_call_t *)sigreturn, AUE_NULL },	/* 417 = sigreturn */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 418 = __xstat */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 419 = __xfstat */
-	{ 0, (sy_call_t *)nosys, AUE_NULL },			/* 420 = __xlstat */
-	{ SYF_MPSAFE | AS(getcontext_args), (sy_call_t *)getcontext, AUE_NULL },	/* 421 = getcontext */
-	{ SYF_MPSAFE | AS(setcontext_args), (sy_call_t *)setcontext, AUE_NULL },	/* 422 = setcontext */
-	{ SYF_MPSAFE | AS(swapcontext_args), (sy_call_t *)swapcontext, AUE_NULL },	/* 423 = swapcontext */
-	{ SYF_MPSAFE | AS(swapoff_args), (sy_call_t *)swapoff, AUE_NULL },	/* 424 = swapoff */
-	{ SYF_MPSAFE | AS(__acl_get_link_args), (sy_call_t *)__acl_get_link, AUE_NULL },	/* 425 = __acl_get_link */
-	{ SYF_MPSAFE | AS(__acl_set_link_args), (sy_call_t *)__acl_set_link, AUE_NULL },	/* 426 = __acl_set_link */
-	{ SYF_MPSAFE | AS(__acl_delete_link_args), (sy_call_t *)__acl_delete_link, AUE_NULL },	/* 427 = __acl_delete_link */
-	{ SYF_MPSAFE | AS(__acl_aclcheck_link_args), (sy_call_t *)__acl_aclcheck_link, AUE_NULL },	/* 428 = __acl_aclcheck_link */
-	{ SYF_MPSAFE | AS(sigwait_args), (sy_call_t *)sigwait, AUE_NULL },	/* 429 = sigwait */
-	{ SYF_MPSAFE | AS(thr_create_args), (sy_call_t *)thr_create, AUE_NULL },	/* 430 = thr_create */
-	{ SYF_MPSAFE | AS(thr_exit_args), (sy_call_t *)thr_exit, AUE_NULL },	/* 431 = thr_exit */
-	{ SYF_MPSAFE | AS(thr_self_args), (sy_call_t *)thr_self, AUE_NULL },	/* 432 = thr_self */
-	{ SYF_MPSAFE | AS(thr_kill_args), (sy_call_t *)thr_kill, AUE_NULL },	/* 433 = thr_kill */
-	{ SYF_MPSAFE | AS(_umtx_lock_args), (sy_call_t *)_umtx_lock, AUE_NULL },	/* 434 = _umtx_lock */
-	{ SYF_MPSAFE | AS(_umtx_unlock_args), (sy_call_t *)_umtx_unlock, AUE_NULL },	/* 435 = _umtx_unlock */
-	{ SYF_MPSAFE | AS(jail_attach_args), (sy_call_t *)jail_attach, AUE_NULL },	/* 436 = jail_attach */
-	{ SYF_MPSAFE | AS(extattr_list_fd_args), (sy_call_t *)extattr_list_fd, AUE_NULL },	/* 437 = extattr_list_fd */
-	{ SYF_MPSAFE | AS(extattr_list_file_args), (sy_call_t *)extattr_list_file, AUE_NULL },	/* 438 = extattr_list_file */
-	{ SYF_MPSAFE | AS(extattr_list_link_args), (sy_call_t *)extattr_list_link, AUE_NULL },	/* 439 = extattr_list_link */
-	{ SYF_MPSAFE | AS(kse_switchin_args), (sy_call_t *)kse_switchin, AUE_NULL },	/* 440 = kse_switchin */
-	{ SYF_MPSAFE | AS(ksem_timedwait_args), (sy_call_t *)lkmressys, AUE_NULL },	/* 441 = ksem_timedwait */
-	{ SYF_MPSAFE | AS(thr_suspend_args), (sy_call_t *)thr_suspend, AUE_NULL },	/* 442 = thr_suspend */
-	{ SYF_MPSAFE | AS(thr_wake_args), (sy_call_t *)thr_wake, AUE_NULL },	/* 443 = thr_wake */
-	{ SYF_MPSAFE | AS(kldunloadf_args), (sy_call_t *)kldunloadf, AUE_NULL },	/* 444 = kldunloadf */
-	{ SYF_MPSAFE | AS(audit_args), (sy_call_t *)audit, AUE_NULL },	/* 445 = audit */
-	{ SYF_MPSAFE | AS(auditon_args), (sy_call_t *)auditon, AUE_NULL },	/* 446 = auditon */
-	{ SYF_MPSAFE | AS(getauid_args), (sy_call_t *)getauid, AUE_NULL },	/* 447 = getauid */
-	{ SYF_MPSAFE | AS(setauid_args), (sy_call_t *)setauid, AUE_NULL },	/* 448 = setauid */
-	{ SYF_MPSAFE | AS(getaudit_args), (sy_call_t *)getaudit, AUE_NULL },	/* 449 = getaudit */
-	{ SYF_MPSAFE | AS(setaudit_args), (sy_call_t *)setaudit, AUE_NULL },	/* 450 = setaudit */
-	{ SYF_MPSAFE | AS(getaudit_addr_args), (sy_call_t *)getaudit_addr, AUE_NULL },	/* 451 = getaudit_addr */
-	{ SYF_MPSAFE | AS(setaudit_addr_args), (sy_call_t *)setaudit_addr, AUE_NULL },	/* 452 = setaudit_addr */
-	{ SYF_MPSAFE | AS(auditctl_args), (sy_call_t *)auditctl, AUE_NULL },	/* 453 = auditctl */
-	{ SYF_MPSAFE | AS(_umtx_op_args), (sy_call_t *)_umtx_op, AUE_NULL },	/* 454 = _umtx_op */
-	{ SYF_MPSAFE | AS(thr_new_args), (sy_call_t *)thr_new, AUE_NULL },	/* 455 = thr_new */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },		/* 0 = syscall */
+	{ AS(sys_exit_args), (sy_call_t *)sys_exit, AUE_EXIT, NULL, 0, 0 },	/* 1 = exit */
+	{ 0, (sy_call_t *)fork, AUE_FORK, NULL, 0, 0 },		/* 2 = fork */
+	{ AS(read_args), (sy_call_t *)read, AUE_NULL, NULL, 0, 0 },	/* 3 = read */
+	{ AS(write_args), (sy_call_t *)write, AUE_NULL, NULL, 0, 0 },	/* 4 = write */
+	{ AS(open_args), (sy_call_t *)open, AUE_OPEN_RWTC, NULL, 0, 0 },	/* 5 = open */
+	{ AS(close_args), (sy_call_t *)close, AUE_CLOSE, NULL, 0, 0 },	/* 6 = close */
+	{ AS(wait_args), (sy_call_t *)wait4, AUE_WAIT4, NULL, 0, 0 },	/* 7 = wait4 */
+	{ compat(AS(ocreat_args),creat), AUE_CREAT, NULL, 0, 0 },	/* 8 = old creat */
+	{ AS(link_args), (sy_call_t *)link, AUE_LINK, NULL, 0, 0 },	/* 9 = link */
+	{ AS(unlink_args), (sy_call_t *)unlink, AUE_UNLINK, NULL, 0, 0 },	/* 10 = unlink */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 11 = obsolete execv */
+	{ AS(chdir_args), (sy_call_t *)chdir, AUE_CHDIR, NULL, 0, 0 },	/* 12 = chdir */
+	{ AS(fchdir_args), (sy_call_t *)fchdir, AUE_FCHDIR, NULL, 0, 0 },	/* 13 = fchdir */
+	{ AS(mknod_args), (sy_call_t *)mknod, AUE_MKNOD, NULL, 0, 0 },	/* 14 = mknod */
+	{ AS(chmod_args), (sy_call_t *)chmod, AUE_CHMOD, NULL, 0, 0 },	/* 15 = chmod */
+	{ AS(chown_args), (sy_call_t *)chown, AUE_CHOWN, NULL, 0, 0 },	/* 16 = chown */
+	{ AS(obreak_args), (sy_call_t *)obreak, AUE_NULL, NULL, 0, 0 },	/* 17 = break */
+	{ compat4(AS(freebsd4_getfsstat_args),getfsstat), AUE_GETFSSTAT, NULL, 0, 0 },	/* 18 = old getfsstat */
+	{ compat(AS(olseek_args),lseek), AUE_LSEEK, NULL, 0, 0 },	/* 19 = old lseek */
+	{ 0, (sy_call_t *)getpid, AUE_GETPID, NULL, 0, 0 },		/* 20 = getpid */
+	{ AS(mount_args), (sy_call_t *)mount, AUE_MOUNT, NULL, 0, 0 },	/* 21 = mount */
+	{ AS(unmount_args), (sy_call_t *)unmount, AUE_UMOUNT, NULL, 0, 0 },	/* 22 = unmount */
+	{ AS(setuid_args), (sy_call_t *)setuid, AUE_SETUID, NULL, 0, 0 },	/* 23 = setuid */
+	{ 0, (sy_call_t *)getuid, AUE_GETUID, NULL, 0, 0 },		/* 24 = getuid */
+	{ 0, (sy_call_t *)geteuid, AUE_GETEUID, NULL, 0, 0 },		/* 25 = geteuid */
+	{ AS(ptrace_args), (sy_call_t *)ptrace, AUE_PTRACE, NULL, 0, 0 },	/* 26 = ptrace */
+	{ AS(recvmsg_args), (sy_call_t *)recvmsg, AUE_RECVMSG, NULL, 0, 0 },	/* 27 = recvmsg */
+	{ AS(sendmsg_args), (sy_call_t *)sendmsg, AUE_SENDMSG, NULL, 0, 0 },	/* 28 = sendmsg */
+	{ AS(recvfrom_args), (sy_call_t *)recvfrom, AUE_RECVFROM, NULL, 0, 0 },	/* 29 = recvfrom */
+	{ AS(accept_args), (sy_call_t *)accept, AUE_ACCEPT, NULL, 0, 0 },	/* 30 = accept */
+	{ AS(getpeername_args), (sy_call_t *)getpeername, AUE_GETPEERNAME, NULL, 0, 0 },	/* 31 = getpeername */
+	{ AS(getsockname_args), (sy_call_t *)getsockname, AUE_GETSOCKNAME, NULL, 0, 0 },	/* 32 = getsockname */
+	{ AS(access_args), (sy_call_t *)access, AUE_ACCESS, NULL, 0, 0 },	/* 33 = access */
+	{ AS(chflags_args), (sy_call_t *)chflags, AUE_CHFLAGS, NULL, 0, 0 },	/* 34 = chflags */
+	{ AS(fchflags_args), (sy_call_t *)fchflags, AUE_FCHFLAGS, NULL, 0, 0 },	/* 35 = fchflags */
+	{ 0, (sy_call_t *)sync, AUE_SYNC, NULL, 0, 0 },		/* 36 = sync */
+	{ AS(kill_args), (sy_call_t *)kill, AUE_KILL, NULL, 0, 0 },	/* 37 = kill */
+	{ compat(AS(ostat_args),stat), AUE_STAT, NULL, 0, 0 },	/* 38 = old stat */
+	{ 0, (sy_call_t *)getppid, AUE_GETPPID, NULL, 0, 0 },		/* 39 = getppid */
+	{ compat(AS(olstat_args),lstat), AUE_LSTAT, NULL, 0, 0 },	/* 40 = old lstat */
+	{ AS(dup_args), (sy_call_t *)dup, AUE_DUP, NULL, 0, 0 },	/* 41 = dup */
+	{ 0, (sy_call_t *)pipe, AUE_PIPE, NULL, 0, 0 },		/* 42 = pipe */
+	{ 0, (sy_call_t *)getegid, AUE_GETEGID, NULL, 0, 0 },		/* 43 = getegid */
+	{ AS(profil_args), (sy_call_t *)profil, AUE_PROFILE, NULL, 0, 0 },	/* 44 = profil */
+	{ AS(ktrace_args), (sy_call_t *)ktrace, AUE_KTRACE, NULL, 0, 0 },	/* 45 = ktrace */
+	{ compat(AS(osigaction_args),sigaction), AUE_SIGACTION, NULL, 0, 0 },	/* 46 = old sigaction */
+	{ 0, (sy_call_t *)getgid, AUE_GETGID, NULL, 0, 0 },		/* 47 = getgid */
+	{ compat(AS(osigprocmask_args),sigprocmask), AUE_SIGPROCMASK, NULL, 0, 0 },	/* 48 = old sigprocmask */
+	{ AS(getlogin_args), (sy_call_t *)getlogin, AUE_GETLOGIN, NULL, 0, 0 },	/* 49 = getlogin */
+	{ AS(setlogin_args), (sy_call_t *)setlogin, AUE_SETLOGIN, NULL, 0, 0 },	/* 50 = setlogin */
+	{ AS(acct_args), (sy_call_t *)acct, AUE_ACCT, NULL, 0, 0 },	/* 51 = acct */
+	{ compat(0,sigpending), AUE_SIGPENDING, NULL, 0, 0 },		/* 52 = old sigpending */
+	{ AS(sigaltstack_args), (sy_call_t *)sigaltstack, AUE_SIGALTSTACK, NULL, 0, 0 },	/* 53 = sigaltstack */
+	{ AS(ioctl_args), (sy_call_t *)ioctl, AUE_IOCTL, NULL, 0, 0 },	/* 54 = ioctl */
+	{ AS(reboot_args), (sy_call_t *)reboot, AUE_REBOOT, NULL, 0, 0 },	/* 55 = reboot */
+	{ AS(revoke_args), (sy_call_t *)revoke, AUE_REVOKE, NULL, 0, 0 },	/* 56 = revoke */
+	{ AS(symlink_args), (sy_call_t *)symlink, AUE_SYMLINK, NULL, 0, 0 },	/* 57 = symlink */
+	{ AS(readlink_args), (sy_call_t *)readlink, AUE_READLINK, NULL, 0, 0 },	/* 58 = readlink */
+	{ AS(execve_args), (sy_call_t *)execve, AUE_EXECVE, NULL, 0, 0 },	/* 59 = execve */
+	{ AS(umask_args), (sy_call_t *)umask, AUE_UMASK, NULL, 0, 0 },	/* 60 = umask */
+	{ AS(chroot_args), (sy_call_t *)chroot, AUE_CHROOT, NULL, 0, 0 },	/* 61 = chroot */
+	{ compat(AS(ofstat_args),fstat), AUE_FSTAT, NULL, 0, 0 },	/* 62 = old fstat */
+	{ compat(AS(getkerninfo_args),getkerninfo), AUE_NULL, NULL, 0, 0 },	/* 63 = old getkerninfo */
+	{ compat(0,getpagesize), AUE_NULL, NULL, 0, 0 },		/* 64 = old getpagesize */
+	{ AS(msync_args), (sy_call_t *)msync, AUE_MSYNC, NULL, 0, 0 },	/* 65 = msync */
+	{ 0, (sy_call_t *)vfork, AUE_VFORK, NULL, 0, 0 },		/* 66 = vfork */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 67 = obsolete vread */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 68 = obsolete vwrite */
+	{ AS(sbrk_args), (sy_call_t *)sbrk, AUE_SBRK, NULL, 0, 0 },	/* 69 = sbrk */
+	{ AS(sstk_args), (sy_call_t *)sstk, AUE_SSTK, NULL, 0, 0 },	/* 70 = sstk */
+	{ compat(AS(ommap_args),mmap), AUE_MMAP, NULL, 0, 0 },	/* 71 = old mmap */
+	{ AS(ovadvise_args), (sy_call_t *)ovadvise, AUE_O_VADVISE, NULL, 0, 0 },	/* 72 = vadvise */
+	{ AS(munmap_args), (sy_call_t *)munmap, AUE_MUNMAP, NULL, 0, 0 },	/* 73 = munmap */
+	{ AS(mprotect_args), (sy_call_t *)mprotect, AUE_MPROTECT, NULL, 0, 0 },	/* 74 = mprotect */
+	{ AS(madvise_args), (sy_call_t *)madvise, AUE_MADVISE, NULL, 0, 0 },	/* 75 = madvise */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 76 = obsolete vhangup */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 77 = obsolete vlimit */
+	{ AS(mincore_args), (sy_call_t *)mincore, AUE_MINCORE, NULL, 0, 0 },	/* 78 = mincore */
+	{ AS(getgroups_args), (sy_call_t *)getgroups, AUE_GETGROUPS, NULL, 0, 0 },	/* 79 = getgroups */
+	{ AS(setgroups_args), (sy_call_t *)setgroups, AUE_SETGROUPS, NULL, 0, 0 },	/* 80 = setgroups */
+	{ 0, (sy_call_t *)getpgrp, AUE_GETPGRP, NULL, 0, 0 },		/* 81 = getpgrp */
+	{ AS(setpgid_args), (sy_call_t *)setpgid, AUE_SETPGRP, NULL, 0, 0 },	/* 82 = setpgid */
+	{ AS(setitimer_args), (sy_call_t *)setitimer, AUE_SETITIMER, NULL, 0, 0 },	/* 83 = setitimer */
+	{ compat(0,wait), AUE_WAIT4, NULL, 0, 0 },			/* 84 = old wait */
+	{ AS(swapon_args), (sy_call_t *)swapon, AUE_SWAPON, NULL, 0, 0 },	/* 85 = swapon */
+	{ AS(getitimer_args), (sy_call_t *)getitimer, AUE_GETITIMER, NULL, 0, 0 },	/* 86 = getitimer */
+	{ compat(AS(gethostname_args),gethostname), AUE_SYSCTL, NULL, 0, 0 },	/* 87 = old gethostname */
+	{ compat(AS(sethostname_args),sethostname), AUE_SYSCTL, NULL, 0, 0 },	/* 88 = old sethostname */
+	{ 0, (sy_call_t *)getdtablesize, AUE_GETDTABLESIZE, NULL, 0, 0 },	/* 89 = getdtablesize */
+	{ AS(dup2_args), (sy_call_t *)dup2, AUE_DUP2, NULL, 0, 0 },	/* 90 = dup2 */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 91 = getdopt */
+	{ AS(fcntl_args), (sy_call_t *)fcntl, AUE_FCNTL, NULL, 0, 0 },	/* 92 = fcntl */
+	{ AS(select_args), (sy_call_t *)select, AUE_SELECT, NULL, 0, 0 },	/* 93 = select */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 94 = setdopt */
+	{ AS(fsync_args), (sy_call_t *)fsync, AUE_FSYNC, NULL, 0, 0 },	/* 95 = fsync */
+	{ AS(setpriority_args), (sy_call_t *)setpriority, AUE_SETPRIORITY, NULL, 0, 0 },	/* 96 = setpriority */
+	{ AS(socket_args), (sy_call_t *)socket, AUE_SOCKET, NULL, 0, 0 },	/* 97 = socket */
+	{ AS(connect_args), (sy_call_t *)connect, AUE_CONNECT, NULL, 0, 0 },	/* 98 = connect */
+	{ compat(AS(accept_args),accept), AUE_ACCEPT, NULL, 0, 0 },	/* 99 = old accept */
+	{ AS(getpriority_args), (sy_call_t *)getpriority, AUE_GETPRIORITY, NULL, 0, 0 },	/* 100 = getpriority */
+	{ compat(AS(osend_args),send), AUE_SEND, NULL, 0, 0 },	/* 101 = old send */
+	{ compat(AS(orecv_args),recv), AUE_RECV, NULL, 0, 0 },	/* 102 = old recv */
+	{ compat(AS(osigreturn_args),sigreturn), AUE_SIGRETURN, NULL, 0, 0 },	/* 103 = old sigreturn */
+	{ AS(bind_args), (sy_call_t *)bind, AUE_BIND, NULL, 0, 0 },	/* 104 = bind */
+	{ AS(setsockopt_args), (sy_call_t *)setsockopt, AUE_SETSOCKOPT, NULL, 0, 0 },	/* 105 = setsockopt */
+	{ AS(listen_args), (sy_call_t *)listen, AUE_LISTEN, NULL, 0, 0 },	/* 106 = listen */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 107 = obsolete vtimes */
+	{ compat(AS(osigvec_args),sigvec), AUE_NULL, NULL, 0, 0 },	/* 108 = old sigvec */
+	{ compat(AS(osigblock_args),sigblock), AUE_NULL, NULL, 0, 0 },	/* 109 = old sigblock */
+	{ compat(AS(osigsetmask_args),sigsetmask), AUE_NULL, NULL, 0, 0 },	/* 110 = old sigsetmask */
+	{ compat(AS(osigsuspend_args),sigsuspend), AUE_NULL, NULL, 0, 0 },	/* 111 = old sigsuspend */
+	{ compat(AS(osigstack_args),sigstack), AUE_NULL, NULL, 0, 0 },	/* 112 = old sigstack */
+	{ compat(AS(orecvmsg_args),recvmsg), AUE_RECVMSG, NULL, 0, 0 },	/* 113 = old recvmsg */
+	{ compat(AS(osendmsg_args),sendmsg), AUE_SENDMSG, NULL, 0, 0 },	/* 114 = old sendmsg */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 115 = obsolete vtrace */
+	{ AS(gettimeofday_args), (sy_call_t *)gettimeofday, AUE_GETTIMEOFDAY, NULL, 0, 0 },	/* 116 = gettimeofday */
+	{ AS(getrusage_args), (sy_call_t *)getrusage, AUE_GETRUSAGE, NULL, 0, 0 },	/* 117 = getrusage */
+	{ AS(getsockopt_args), (sy_call_t *)getsockopt, AUE_GETSOCKOPT, NULL, 0, 0 },	/* 118 = getsockopt */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 119 = resuba */
+	{ AS(readv_args), (sy_call_t *)readv, AUE_READV, NULL, 0, 0 },	/* 120 = readv */
+	{ AS(writev_args), (sy_call_t *)writev, AUE_WRITEV, NULL, 0, 0 },	/* 121 = writev */
+	{ AS(settimeofday_args), (sy_call_t *)settimeofday, AUE_SETTIMEOFDAY, NULL, 0, 0 },	/* 122 = settimeofday */
+	{ AS(fchown_args), (sy_call_t *)fchown, AUE_FCHOWN, NULL, 0, 0 },	/* 123 = fchown */
+	{ AS(fchmod_args), (sy_call_t *)fchmod, AUE_FCHMOD, NULL, 0, 0 },	/* 124 = fchmod */
+	{ compat(AS(recvfrom_args),recvfrom), AUE_RECVFROM, NULL, 0, 0 },	/* 125 = old recvfrom */
+	{ AS(setreuid_args), (sy_call_t *)setreuid, AUE_SETREUID, NULL, 0, 0 },	/* 126 = setreuid */
+	{ AS(setregid_args), (sy_call_t *)setregid, AUE_SETREGID, NULL, 0, 0 },	/* 127 = setregid */
+	{ AS(rename_args), (sy_call_t *)rename, AUE_RENAME, NULL, 0, 0 },	/* 128 = rename */
+	{ compat(AS(otruncate_args),truncate), AUE_TRUNCATE, NULL, 0, 0 },	/* 129 = old truncate */
+	{ compat(AS(oftruncate_args),ftruncate), AUE_FTRUNCATE, NULL, 0, 0 },	/* 130 = old ftruncate */
+	{ AS(flock_args), (sy_call_t *)flock, AUE_FLOCK, NULL, 0, 0 },	/* 131 = flock */
+	{ AS(mkfifo_args), (sy_call_t *)mkfifo, AUE_MKFIFO, NULL, 0, 0 },	/* 132 = mkfifo */
+	{ AS(sendto_args), (sy_call_t *)sendto, AUE_SENDTO, NULL, 0, 0 },	/* 133 = sendto */
+	{ AS(shutdown_args), (sy_call_t *)shutdown, AUE_SHUTDOWN, NULL, 0, 0 },	/* 134 = shutdown */
+	{ AS(socketpair_args), (sy_call_t *)socketpair, AUE_SOCKETPAIR, NULL, 0, 0 },	/* 135 = socketpair */
+	{ AS(mkdir_args), (sy_call_t *)mkdir, AUE_MKDIR, NULL, 0, 0 },	/* 136 = mkdir */
+	{ AS(rmdir_args), (sy_call_t *)rmdir, AUE_RMDIR, NULL, 0, 0 },	/* 137 = rmdir */
+	{ AS(utimes_args), (sy_call_t *)utimes, AUE_UTIMES, NULL, 0, 0 },	/* 138 = utimes */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 139 = obsolete 4.2 sigreturn */
+	{ AS(adjtime_args), (sy_call_t *)adjtime, AUE_ADJTIME, NULL, 0, 0 },	/* 140 = adjtime */
+	{ compat(AS(ogetpeername_args),getpeername), AUE_GETPEERNAME, NULL, 0, 0 },	/* 141 = old getpeername */
+	{ compat(0,gethostid), AUE_SYSCTL, NULL, 0, 0 },		/* 142 = old gethostid */
+	{ compat(AS(osethostid_args),sethostid), AUE_SYSCTL, NULL, 0, 0 },	/* 143 = old sethostid */
+	{ compat(AS(ogetrlimit_args),getrlimit), AUE_GETRLIMIT, NULL, 0, 0 },	/* 144 = old getrlimit */
+	{ compat(AS(osetrlimit_args),setrlimit), AUE_SETRLIMIT, NULL, 0, 0 },	/* 145 = old setrlimit */
+	{ compat(AS(okillpg_args),killpg), AUE_KILLPG, NULL, 0, 0 },	/* 146 = old killpg */
+	{ 0, (sy_call_t *)setsid, AUE_SETSID, NULL, 0, 0 },		/* 147 = setsid */
+	{ AS(quotactl_args), (sy_call_t *)quotactl, AUE_QUOTACTL, NULL, 0, 0 },	/* 148 = quotactl */
+	{ compat(0,quota), AUE_O_QUOTA, NULL, 0, 0 },			/* 149 = old quota */
+	{ compat(AS(getsockname_args),getsockname), AUE_GETSOCKNAME, NULL, 0, 0 },	/* 150 = old getsockname */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 151 = sem_lock */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 152 = sem_wakeup */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 153 = asyncdaemon */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 154 = nosys */
+	{ AS(nfssvc_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 },	/* 155 = nfssvc */
+	{ compat(AS(ogetdirentries_args),getdirentries), AUE_GETDIRENTRIES, NULL, 0, 0 },	/* 156 = old getdirentries */
+	{ compat4(AS(freebsd4_statfs_args),statfs), AUE_STATFS, NULL, 0, 0 },	/* 157 = old statfs */
+	{ compat4(AS(freebsd4_fstatfs_args),fstatfs), AUE_FSTATFS, NULL, 0, 0 },	/* 158 = old fstatfs */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 159 = nosys */
+	{ AS(lgetfh_args), (sy_call_t *)lgetfh, AUE_LGETFH, NULL, 0, 0 },	/* 160 = lgetfh */
+	{ AS(getfh_args), (sy_call_t *)getfh, AUE_NFS_GETFH, NULL, 0, 0 },	/* 161 = getfh */
+	{ AS(getdomainname_args), (sy_call_t *)getdomainname, AUE_SYSCTL, NULL, 0, 0 },	/* 162 = getdomainname */
+	{ AS(setdomainname_args), (sy_call_t *)setdomainname, AUE_SYSCTL, NULL, 0, 0 },	/* 163 = setdomainname */
+	{ AS(uname_args), (sy_call_t *)uname, AUE_NULL, NULL, 0, 0 },	/* 164 = uname */
+	{ AS(sysarch_args), (sy_call_t *)sysarch, AUE_SYSARCH, NULL, 0, 0 },	/* 165 = sysarch */
+	{ AS(rtprio_args), (sy_call_t *)rtprio, AUE_RTPRIO, NULL, 0, 0 },	/* 166 = rtprio */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 167 = nosys */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 168 = nosys */
+	{ AS(semsys_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 },	/* 169 = semsys */
+	{ AS(msgsys_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 },	/* 170 = msgsys */
+	{ AS(shmsys_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 },	/* 171 = shmsys */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 172 = nosys */
+	{ AS(freebsd6_pread_args), (sy_call_t *)freebsd6_pread, AUE_PREAD, NULL, 0, 0 },	/* 173 = freebsd6_pread */
+	{ AS(freebsd6_pwrite_args), (sy_call_t *)freebsd6_pwrite, AUE_PWRITE, NULL, 0, 0 },	/* 174 = freebsd6_pwrite */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 175 = nosys */
+	{ AS(ntp_adjtime_args), (sy_call_t *)ntp_adjtime, AUE_NTP_ADJTIME, NULL, 0, 0 },	/* 176 = ntp_adjtime */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 177 = sfork */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 178 = getdescriptor */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 179 = setdescriptor */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 180 = nosys */
+	{ AS(setgid_args), (sy_call_t *)setgid, AUE_SETGID, NULL, 0, 0 },	/* 181 = setgid */
+	{ AS(setegid_args), (sy_call_t *)setegid, AUE_SETEGID, NULL, 0, 0 },	/* 182 = setegid */
+	{ AS(seteuid_args), (sy_call_t *)seteuid, AUE_SETEUID, NULL, 0, 0 },	/* 183 = seteuid */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 184 = lfs_bmapv */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 185 = lfs_markv */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 186 = lfs_segclean */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 187 = lfs_segwait */
+	{ AS(stat_args), (sy_call_t *)stat, AUE_STAT, NULL, 0, 0 },	/* 188 = stat */
+	{ AS(fstat_args), (sy_call_t *)fstat, AUE_FSTAT, NULL, 0, 0 },	/* 189 = fstat */
+	{ AS(lstat_args), (sy_call_t *)lstat, AUE_LSTAT, NULL, 0, 0 },	/* 190 = lstat */
+	{ AS(pathconf_args), (sy_call_t *)pathconf, AUE_PATHCONF, NULL, 0, 0 },	/* 191 = pathconf */
+	{ AS(fpathconf_args), (sy_call_t *)fpathconf, AUE_FPATHCONF, NULL, 0, 0 },	/* 192 = fpathconf */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 193 = nosys */
+	{ AS(__getrlimit_args), (sy_call_t *)getrlimit, AUE_GETRLIMIT, NULL, 0, 0 },	/* 194 = getrlimit */
+	{ AS(__setrlimit_args), (sy_call_t *)setrlimit, AUE_SETRLIMIT, NULL, 0, 0 },	/* 195 = setrlimit */
+	{ AS(getdirentries_args), (sy_call_t *)getdirentries, AUE_GETDIRENTRIES, NULL, 0, 0 },	/* 196 = getdirentries */
+	{ AS(freebsd6_mmap_args), (sy_call_t *)freebsd6_mmap, AUE_MMAP, NULL, 0, 0 },	/* 197 = freebsd6_mmap */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },		/* 198 = __syscall */
+	{ AS(freebsd6_lseek_args), (sy_call_t *)freebsd6_lseek, AUE_LSEEK, NULL, 0, 0 },	/* 199 = freebsd6_lseek */
+	{ AS(freebsd6_truncate_args), (sy_call_t *)freebsd6_truncate, AUE_TRUNCATE, NULL, 0, 0 },	/* 200 = freebsd6_truncate */
+	{ AS(freebsd6_ftruncate_args), (sy_call_t *)freebsd6_ftruncate, AUE_FTRUNCATE, NULL, 0, 0 },	/* 201 = freebsd6_ftruncate */
+	{ AS(sysctl_args), (sy_call_t *)__sysctl, AUE_SYSCTL, NULL, 0, 0 },	/* 202 = __sysctl */
+	{ AS(mlock_args), (sy_call_t *)mlock, AUE_MLOCK, NULL, 0, 0 },	/* 203 = mlock */
+	{ AS(munlock_args), (sy_call_t *)munlock, AUE_MUNLOCK, NULL, 0, 0 },	/* 204 = munlock */
+	{ AS(undelete_args), (sy_call_t *)undelete, AUE_UNDELETE, NULL, 0, 0 },	/* 205 = undelete */
+	{ AS(futimes_args), (sy_call_t *)futimes, AUE_FUTIMES, NULL, 0, 0 },	/* 206 = futimes */
+	{ AS(getpgid_args), (sy_call_t *)getpgid, AUE_GETPGID, NULL, 0, 0 },	/* 207 = getpgid */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 208 = newreboot */
+	{ AS(poll_args), (sy_call_t *)poll, AUE_POLL, NULL, 0, 0 },	/* 209 = poll */
+	{ AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL, NULL, 0, 0 },	/* 210 = lkmnosys */
+	{ AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL, NULL, 0, 0 },	/* 211 = lkmnosys */
+	{ AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL, NULL, 0, 0 },	/* 212 = lkmnosys */
+	{ AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL, NULL, 0, 0 },	/* 213 = lkmnosys */
+	{ AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL, NULL, 0, 0 },	/* 214 = lkmnosys */
+	{ AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL, NULL, 0, 0 },	/* 215 = lkmnosys */
+	{ AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL, NULL, 0, 0 },	/* 216 = lkmnosys */
+	{ AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL, NULL, 0, 0 },	/* 217 = lkmnosys */
+	{ AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL, NULL, 0, 0 },	/* 218 = lkmnosys */
+	{ AS(nosys_args), (sy_call_t *)lkmnosys, AUE_NULL, NULL, 0, 0 },	/* 219 = lkmnosys */
+	{ AS(__semctl_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 },	/* 220 = __semctl */
+	{ AS(semget_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 },	/* 221 = semget */
+	{ AS(semop_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 },	/* 222 = semop */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 223 = semconfig */
+	{ AS(msgctl_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 },	/* 224 = msgctl */
+	{ AS(msgget_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 },	/* 225 = msgget */
+	{ AS(msgsnd_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 },	/* 226 = msgsnd */
+	{ AS(msgrcv_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 },	/* 227 = msgrcv */
+	{ AS(shmat_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 },	/* 228 = shmat */
+	{ AS(shmctl_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 },	/* 229 = shmctl */
+	{ AS(shmdt_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 },	/* 230 = shmdt */
+	{ AS(shmget_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 },	/* 231 = shmget */
+	{ AS(clock_gettime_args), (sy_call_t *)clock_gettime, AUE_NULL, NULL, 0, 0 },	/* 232 = clock_gettime */
+	{ AS(clock_settime_args), (sy_call_t *)clock_settime, AUE_CLOCK_SETTIME, NULL, 0, 0 },	/* 233 = clock_settime */
+	{ AS(clock_getres_args), (sy_call_t *)clock_getres, AUE_NULL, NULL, 0, 0 },	/* 234 = clock_getres */
+	{ AS(ktimer_create_args), (sy_call_t *)ktimer_create, AUE_NULL, NULL, 0, 0 },	/* 235 = ktimer_create */
+	{ AS(ktimer_delete_args), (sy_call_t *)ktimer_delete, AUE_NULL, NULL, 0, 0 },	/* 236 = ktimer_delete */
+	{ AS(ktimer_settime_args), (sy_call_t *)ktimer_settime, AUE_NULL, NULL, 0, 0 },	/* 237 = ktimer_settime */
+	{ AS(ktimer_gettime_args), (sy_call_t *)ktimer_gettime, AUE_NULL, NULL, 0, 0 },	/* 238 = ktimer_gettime */
+	{ AS(ktimer_getoverrun_args), (sy_call_t *)ktimer_getoverrun, AUE_NULL, NULL, 0, 0 },	/* 239 = ktimer_getoverrun */
+	{ AS(nanosleep_args), (sy_call_t *)nanosleep, AUE_NULL, NULL, 0, 0 },	/* 240 = nanosleep */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 241 = nosys */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 242 = nosys */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 243 = nosys */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 244 = nosys */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 245 = nosys */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 246 = nosys */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 247 = nosys */
+	{ AS(ntp_gettime_args), (sy_call_t *)ntp_gettime, AUE_NULL, NULL, 0, 0 },	/* 248 = ntp_gettime */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 249 = nosys */
+	{ AS(minherit_args), (sy_call_t *)minherit, AUE_MINHERIT, NULL, 0, 0 },	/* 250 = minherit */
+	{ AS(rfork_args), (sy_call_t *)rfork, AUE_RFORK, NULL, 0, 0 },	/* 251 = rfork */
+	{ AS(openbsd_poll_args), (sy_call_t *)openbsd_poll, AUE_POLL, NULL, 0, 0 },	/* 252 = openbsd_poll */
+	{ 0, (sy_call_t *)issetugid, AUE_ISSETUGID, NULL, 0, 0 },	/* 253 = issetugid */
+	{ AS(lchown_args), (sy_call_t *)lchown, AUE_LCHOWN, NULL, 0, 0 },	/* 254 = lchown */
+	{ AS(aio_read_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 },	/* 255 = aio_read */
+	{ AS(aio_write_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 },	/* 256 = aio_write */
+	{ AS(lio_listio_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 },	/* 257 = lio_listio */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 258 = nosys */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 259 = nosys */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 260 = nosys */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 261 = nosys */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 262 = nosys */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 263 = nosys */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 264 = nosys */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 265 = nosys */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 266 = nosys */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 267 = nosys */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 268 = nosys */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 269 = nosys */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 270 = nosys */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 271 = nosys */
+	{ AS(getdents_args), (sy_call_t *)getdents, AUE_O_GETDENTS, NULL, 0, 0 },	/* 272 = getdents */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 273 = nosys */
+	{ AS(lchmod_args), (sy_call_t *)lchmod, AUE_LCHMOD, NULL, 0, 0 },	/* 274 = lchmod */
+	{ AS(lchown_args), (sy_call_t *)lchown, AUE_LCHOWN, NULL, 0, 0 },	/* 275 = netbsd_lchown */
+	{ AS(lutimes_args), (sy_call_t *)lutimes, AUE_LUTIMES, NULL, 0, 0 },	/* 276 = lutimes */
+	{ AS(msync_args), (sy_call_t *)msync, AUE_MSYNC, NULL, 0, 0 },	/* 277 = netbsd_msync */
+	{ AS(nstat_args), (sy_call_t *)nstat, AUE_STAT, NULL, 0, 0 },	/* 278 = nstat */
+	{ AS(nfstat_args), (sy_call_t *)nfstat, AUE_FSTAT, NULL, 0, 0 },	/* 279 = nfstat */
+	{ AS(nlstat_args), (sy_call_t *)nlstat, AUE_LSTAT, NULL, 0, 0 },	/* 280 = nlstat */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 281 = nosys */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 282 = nosys */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 283 = nosys */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 284 = nosys */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 285 = nosys */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 286 = nosys */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 287 = nosys */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 288 = nosys */
+	{ AS(preadv_args), (sy_call_t *)preadv, AUE_PREADV, NULL, 0, 0 },	/* 289 = preadv */
+	{ AS(pwritev_args), (sy_call_t *)pwritev, AUE_PWRITEV, NULL, 0, 0 },	/* 290 = pwritev */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 291 = nosys */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 292 = nosys */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 293 = nosys */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 294 = nosys */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 295 = nosys */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 296 = nosys */
+	{ compat4(AS(freebsd4_fhstatfs_args),fhstatfs), AUE_FHSTATFS, NULL, 0, 0 },	/* 297 = old fhstatfs */
+	{ AS(fhopen_args), (sy_call_t *)fhopen, AUE_FHOPEN, NULL, 0, 0 },	/* 298 = fhopen */
+	{ AS(fhstat_args), (sy_call_t *)fhstat, AUE_FHSTAT, NULL, 0, 0 },	/* 299 = fhstat */
+	{ AS(modnext_args), (sy_call_t *)modnext, AUE_NULL, NULL, 0, 0 },	/* 300 = modnext */
+	{ AS(modstat_args), (sy_call_t *)modstat, AUE_NULL, NULL, 0, 0 },	/* 301 = modstat */
+	{ AS(modfnext_args), (sy_call_t *)modfnext, AUE_NULL, NULL, 0, 0 },	/* 302 = modfnext */
+	{ AS(modfind_args), (sy_call_t *)modfind, AUE_NULL, NULL, 0, 0 },	/* 303 = modfind */
+	{ AS(kldload_args), (sy_call_t *)kldload, AUE_MODLOAD, NULL, 0, 0 },	/* 304 = kldload */
+	{ AS(kldunload_args), (sy_call_t *)kldunload, AUE_MODUNLOAD, NULL, 0, 0 },	/* 305 = kldunload */
+	{ AS(kldfind_args), (sy_call_t *)kldfind, AUE_NULL, NULL, 0, 0 },	/* 306 = kldfind */
+	{ AS(kldnext_args), (sy_call_t *)kldnext, AUE_NULL, NULL, 0, 0 },	/* 307 = kldnext */
+	{ AS(kldstat_args), (sy_call_t *)kldstat, AUE_NULL, NULL, 0, 0 },	/* 308 = kldstat */
+	{ AS(kldfirstmod_args), (sy_call_t *)kldfirstmod, AUE_NULL, NULL, 0, 0 },	/* 309 = kldfirstmod */
+	{ AS(getsid_args), (sy_call_t *)getsid, AUE_GETSID, NULL, 0, 0 },	/* 310 = getsid */
+	{ AS(setresuid_args), (sy_call_t *)setresuid, AUE_SETRESUID, NULL, 0, 0 },	/* 311 = setresuid */
+	{ AS(setresgid_args), (sy_call_t *)setresgid, AUE_SETRESGID, NULL, 0, 0 },	/* 312 = setresgid */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 313 = obsolete signanosleep */
+	{ AS(aio_return_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 },	/* 314 = aio_return */
+	{ AS(aio_suspend_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 },	/* 315 = aio_suspend */
+	{ AS(aio_cancel_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 },	/* 316 = aio_cancel */
+	{ AS(aio_error_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 },	/* 317 = aio_error */
+	{ AS(oaio_read_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 },	/* 318 = oaio_read */
+	{ AS(oaio_write_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 },	/* 319 = oaio_write */
+	{ AS(olio_listio_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 },	/* 320 = olio_listio */
+	{ 0, (sy_call_t *)yield, AUE_NULL, NULL, 0, 0 },		/* 321 = yield */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 322 = obsolete thr_sleep */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 323 = obsolete thr_wakeup */
+	{ AS(mlockall_args), (sy_call_t *)mlockall, AUE_MLOCKALL, NULL, 0, 0 },	/* 324 = mlockall */
+	{ 0, (sy_call_t *)munlockall, AUE_MUNLOCKALL, NULL, 0, 0 },	/* 325 = munlockall */
+	{ AS(__getcwd_args), (sy_call_t *)__getcwd, AUE_GETCWD, NULL, 0, 0 },	/* 326 = __getcwd */
+	{ AS(sched_setparam_args), (sy_call_t *)sched_setparam, AUE_NULL, NULL, 0, 0 },	/* 327 = sched_setparam */
+	{ AS(sched_getparam_args), (sy_call_t *)sched_getparam, AUE_NULL, NULL, 0, 0 },	/* 328 = sched_getparam */
+	{ AS(sched_setscheduler_args), (sy_call_t *)sched_setscheduler, AUE_NULL, NULL, 0, 0 },	/* 329 = sched_setscheduler */
+	{ AS(sched_getscheduler_args), (sy_call_t *)sched_getscheduler, AUE_NULL, NULL, 0, 0 },	/* 330 = sched_getscheduler */
+	{ 0, (sy_call_t *)sched_yield, AUE_NULL, NULL, 0, 0 },	/* 331 = sched_yield */
+	{ AS(sched_get_priority_max_args), (sy_call_t *)sched_get_priority_max, AUE_NULL, NULL, 0, 0 },	/* 332 = sched_get_priority_max */
+	{ AS(sched_get_priority_min_args), (sy_call_t *)sched_get_priority_min, AUE_NULL, NULL, 0, 0 },	/* 333 = sched_get_priority_min */
+	{ AS(sched_rr_get_interval_args), (sy_call_t *)sched_rr_get_interval, AUE_NULL, NULL, 0, 0 },	/* 334 = sched_rr_get_interval */
+	{ AS(utrace_args), (sy_call_t *)utrace, AUE_NULL, NULL, 0, 0 },	/* 335 = utrace */
+	{ compat4(AS(freebsd4_sendfile_args),sendfile), AUE_SENDFILE, NULL, 0, 0 },	/* 336 = old sendfile */
+	{ AS(kldsym_args), (sy_call_t *)kldsym, AUE_NULL, NULL, 0, 0 },	/* 337 = kldsym */
+	{ AS(jail_args), (sy_call_t *)jail, AUE_JAIL, NULL, 0, 0 },	/* 338 = jail */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 339 = pioctl */
+	{ AS(sigprocmask_args), (sy_call_t *)sigprocmask, AUE_SIGPROCMASK, NULL, 0, 0 },	/* 340 = sigprocmask */
+	{ AS(sigsuspend_args), (sy_call_t *)sigsuspend, AUE_SIGSUSPEND, NULL, 0, 0 },	/* 341 = sigsuspend */
+	{ compat4(AS(freebsd4_sigaction_args),sigaction), AUE_SIGACTION, NULL, 0, 0 },	/* 342 = old sigaction */
+	{ AS(sigpending_args), (sy_call_t *)sigpending, AUE_SIGPENDING, NULL, 0, 0 },	/* 343 = sigpending */
+	{ compat4(AS(freebsd4_sigreturn_args),sigreturn), AUE_SIGRETURN, NULL, 0, 0 },	/* 344 = old sigreturn */
+	{ AS(sigtimedwait_args), (sy_call_t *)sigtimedwait, AUE_SIGWAIT, NULL, 0, 0 },	/* 345 = sigtimedwait */
+	{ AS(sigwaitinfo_args), (sy_call_t *)sigwaitinfo, AUE_NULL, NULL, 0, 0 },	/* 346 = sigwaitinfo */
+	{ AS(__acl_get_file_args), (sy_call_t *)__acl_get_file, AUE_NULL, NULL, 0, 0 },	/* 347 = __acl_get_file */
+	{ AS(__acl_set_file_args), (sy_call_t *)__acl_set_file, AUE_NULL, NULL, 0, 0 },	/* 348 = __acl_set_file */
+	{ AS(__acl_get_fd_args), (sy_call_t *)__acl_get_fd, AUE_NULL, NULL, 0, 0 },	/* 349 = __acl_get_fd */
+	{ AS(__acl_set_fd_args), (sy_call_t *)__acl_set_fd, AUE_NULL, NULL, 0, 0 },	/* 350 = __acl_set_fd */
+	{ AS(__acl_delete_file_args), (sy_call_t *)__acl_delete_file, AUE_NULL, NULL, 0, 0 },	/* 351 = __acl_delete_file */
+	{ AS(__acl_delete_fd_args), (sy_call_t *)__acl_delete_fd, AUE_NULL, NULL, 0, 0 },	/* 352 = __acl_delete_fd */
+	{ AS(__acl_aclcheck_file_args), (sy_call_t *)__acl_aclcheck_file, AUE_NULL, NULL, 0, 0 },	/* 353 = __acl_aclcheck_file */
+	{ AS(__acl_aclcheck_fd_args), (sy_call_t *)__acl_aclcheck_fd, AUE_NULL, NULL, 0, 0 },	/* 354 = __acl_aclcheck_fd */
+	{ AS(extattrctl_args), (sy_call_t *)extattrctl, AUE_EXTATTRCTL, NULL, 0, 0 },	/* 355 = extattrctl */
+	{ AS(extattr_set_file_args), (sy_call_t *)extattr_set_file, AUE_EXTATTR_SET_FILE, NULL, 0, 0 },	/* 356 = extattr_set_file */
+	{ AS(extattr_get_file_args), (sy_call_t *)extattr_get_file, AUE_EXTATTR_GET_FILE, NULL, 0, 0 },	/* 357 = extattr_get_file */
+	{ AS(extattr_delete_file_args), (sy_call_t *)extattr_delete_file, AUE_EXTATTR_DELETE_FILE, NULL, 0, 0 },	/* 358 = extattr_delete_file */
+	{ AS(aio_waitcomplete_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 },	/* 359 = aio_waitcomplete */
+	{ AS(getresuid_args), (sy_call_t *)getresuid, AUE_GETRESUID, NULL, 0, 0 },	/* 360 = getresuid */
+	{ AS(getresgid_args), (sy_call_t *)getresgid, AUE_GETRESGID, NULL, 0, 0 },	/* 361 = getresgid */
+	{ 0, (sy_call_t *)kqueue, AUE_KQUEUE, NULL, 0, 0 },		/* 362 = kqueue */
+	{ AS(kevent_args), (sy_call_t *)kevent, AUE_NULL, NULL, 0, 0 },	/* 363 = kevent */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 364 = __cap_get_proc */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 365 = __cap_set_proc */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 366 = __cap_get_fd */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 367 = __cap_get_file */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 368 = __cap_set_fd */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 369 = __cap_set_file */
+	{ AS(nosys_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 },	/* 370 = lkmressys */
+	{ AS(extattr_set_fd_args), (sy_call_t *)extattr_set_fd, AUE_EXTATTR_SET_FD, NULL, 0, 0 },	/* 371 = extattr_set_fd */
+	{ AS(extattr_get_fd_args), (sy_call_t *)extattr_get_fd, AUE_EXTATTR_GET_FD, NULL, 0, 0 },	/* 372 = extattr_get_fd */
+	{ AS(extattr_delete_fd_args), (sy_call_t *)extattr_delete_fd, AUE_EXTATTR_DELETE_FD, NULL, 0, 0 },	/* 373 = extattr_delete_fd */
+	{ AS(__setugid_args), (sy_call_t *)__setugid, AUE_NULL, NULL, 0, 0 },	/* 374 = __setugid */
+	{ AS(nfsclnt_args), (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },	/* 375 = nfsclnt */
+	{ AS(eaccess_args), (sy_call_t *)eaccess, AUE_EACCESS, NULL, 0, 0 },	/* 376 = eaccess */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 377 = afs_syscall */
+	{ AS(nmount_args), (sy_call_t *)nmount, AUE_NMOUNT, NULL, 0, 0 },	/* 378 = nmount */
+	{ 0, (sy_call_t *)kse_exit, AUE_NULL, NULL, 0, 0 },		/* 379 = kse_exit */
+	{ AS(kse_wakeup_args), (sy_call_t *)kse_wakeup, AUE_NULL, NULL, 0, 0 },	/* 380 = kse_wakeup */
+	{ AS(kse_create_args), (sy_call_t *)kse_create, AUE_NULL, NULL, 0, 0 },	/* 381 = kse_create */
+	{ AS(kse_thr_interrupt_args), (sy_call_t *)kse_thr_interrupt, AUE_NULL, NULL, 0, 0 },	/* 382 = kse_thr_interrupt */
+	{ AS(kse_release_args), (sy_call_t *)kse_release, AUE_NULL, NULL, 0, 0 },	/* 383 = kse_release */
+	{ AS(__mac_get_proc_args), (sy_call_t *)__mac_get_proc, AUE_NULL, NULL, 0, 0 },	/* 384 = __mac_get_proc */
+	{ AS(__mac_set_proc_args), (sy_call_t *)__mac_set_proc, AUE_NULL, NULL, 0, 0 },	/* 385 = __mac_set_proc */
+	{ AS(__mac_get_fd_args), (sy_call_t *)__mac_get_fd, AUE_NULL, NULL, 0, 0 },	/* 386 = __mac_get_fd */
+	{ AS(__mac_get_file_args), (sy_call_t *)__mac_get_file, AUE_NULL, NULL, 0, 0 },	/* 387 = __mac_get_file */
+	{ AS(__mac_set_fd_args), (sy_call_t *)__mac_set_fd, AUE_NULL, NULL, 0, 0 },	/* 388 = __mac_set_fd */
+	{ AS(__mac_set_file_args), (sy_call_t *)__mac_set_file, AUE_NULL, NULL, 0, 0 },	/* 389 = __mac_set_file */
+	{ AS(kenv_args), (sy_call_t *)kenv, AUE_NULL, NULL, 0, 0 },	/* 390 = kenv */
+	{ AS(lchflags_args), (sy_call_t *)lchflags, AUE_LCHFLAGS, NULL, 0, 0 },	/* 391 = lchflags */
+	{ AS(uuidgen_args), (sy_call_t *)uuidgen, AUE_NULL, NULL, 0, 0 },	/* 392 = uuidgen */
+	{ AS(sendfile_args), (sy_call_t *)sendfile, AUE_SENDFILE, NULL, 0, 0 },	/* 393 = sendfile */
+	{ AS(mac_syscall_args), (sy_call_t *)mac_syscall, AUE_NULL, NULL, 0, 0 },	/* 394 = mac_syscall */
+	{ AS(getfsstat_args), (sy_call_t *)getfsstat, AUE_GETFSSTAT, NULL, 0, 0 },	/* 395 = getfsstat */
+	{ AS(statfs_args), (sy_call_t *)statfs, AUE_STATFS, NULL, 0, 0 },	/* 396 = statfs */
+	{ AS(fstatfs_args), (sy_call_t *)fstatfs, AUE_FSTATFS, NULL, 0, 0 },	/* 397 = fstatfs */
+	{ AS(fhstatfs_args), (sy_call_t *)fhstatfs, AUE_FHSTATFS, NULL, 0, 0 },	/* 398 = fhstatfs */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 399 = nosys */
+	{ AS(ksem_close_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 },	/* 400 = ksem_close */
+	{ AS(ksem_post_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 },	/* 401 = ksem_post */
+	{ AS(ksem_wait_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 },	/* 402 = ksem_wait */
+	{ AS(ksem_trywait_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 },	/* 403 = ksem_trywait */
+	{ AS(ksem_init_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 },	/* 404 = ksem_init */
+	{ AS(ksem_open_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 },	/* 405 = ksem_open */
+	{ AS(ksem_unlink_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 },	/* 406 = ksem_unlink */
+	{ AS(ksem_getvalue_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 },	/* 407 = ksem_getvalue */
+	{ AS(ksem_destroy_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 },	/* 408 = ksem_destroy */
+	{ AS(__mac_get_pid_args), (sy_call_t *)__mac_get_pid, AUE_NULL, NULL, 0, 0 },	/* 409 = __mac_get_pid */
+	{ AS(__mac_get_link_args), (sy_call_t *)__mac_get_link, AUE_NULL, NULL, 0, 0 },	/* 410 = __mac_get_link */
+	{ AS(__mac_set_link_args), (sy_call_t *)__mac_set_link, AUE_NULL, NULL, 0, 0 },	/* 411 = __mac_set_link */
+	{ AS(extattr_set_link_args), (sy_call_t *)extattr_set_link, AUE_EXTATTR_SET_LINK, NULL, 0, 0 },	/* 412 = extattr_set_link */
+	{ AS(extattr_get_link_args), (sy_call_t *)extattr_get_link, AUE_EXTATTR_GET_LINK, NULL, 0, 0 },	/* 413 = extattr_get_link */
+	{ AS(extattr_delete_link_args), (sy_call_t *)extattr_delete_link, AUE_EXTATTR_DELETE_LINK, NULL, 0, 0 },	/* 414 = extattr_delete_link */
+	{ AS(__mac_execve_args), (sy_call_t *)__mac_execve, AUE_NULL, NULL, 0, 0 },	/* 415 = __mac_execve */
+	{ AS(sigaction_args), (sy_call_t *)sigaction, AUE_SIGACTION, NULL, 0, 0 },	/* 416 = sigaction */
+	{ AS(sigreturn_args), (sy_call_t *)sigreturn, AUE_SIGRETURN, NULL, 0, 0 },	/* 417 = sigreturn */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 418 = __xstat */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 419 = __xfstat */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 420 = __xlstat */
+	{ AS(getcontext_args), (sy_call_t *)getcontext, AUE_NULL, NULL, 0, 0 },	/* 421 = getcontext */
+	{ AS(setcontext_args), (sy_call_t *)setcontext, AUE_NULL, NULL, 0, 0 },	/* 422 = setcontext */
+	{ AS(swapcontext_args), (sy_call_t *)swapcontext, AUE_NULL, NULL, 0, 0 },	/* 423 = swapcontext */
+	{ AS(swapoff_args), (sy_call_t *)swapoff, AUE_SWAPOFF, NULL, 0, 0 },	/* 424 = swapoff */
+	{ AS(__acl_get_link_args), (sy_call_t *)__acl_get_link, AUE_NULL, NULL, 0, 0 },	/* 425 = __acl_get_link */
+	{ AS(__acl_set_link_args), (sy_call_t *)__acl_set_link, AUE_NULL, NULL, 0, 0 },	/* 426 = __acl_set_link */
+	{ AS(__acl_delete_link_args), (sy_call_t *)__acl_delete_link, AUE_NULL, NULL, 0, 0 },	/* 427 = __acl_delete_link */
+	{ AS(__acl_aclcheck_link_args), (sy_call_t *)__acl_aclcheck_link, AUE_NULL, NULL, 0, 0 },	/* 428 = __acl_aclcheck_link */
+	{ AS(sigwait_args), (sy_call_t *)sigwait, AUE_SIGWAIT, NULL, 0, 0 },	/* 429 = sigwait */
+	{ AS(thr_create_args), (sy_call_t *)thr_create, AUE_NULL, NULL, 0, 0 },	/* 430 = thr_create */
+	{ AS(thr_exit_args), (sy_call_t *)thr_exit, AUE_NULL, NULL, 0, 0 },	/* 431 = thr_exit */
+	{ AS(thr_self_args), (sy_call_t *)thr_self, AUE_NULL, NULL, 0, 0 },	/* 432 = thr_self */
+	{ AS(thr_kill_args), (sy_call_t *)thr_kill, AUE_NULL, NULL, 0, 0 },	/* 433 = thr_kill */
+	{ AS(_umtx_lock_args), (sy_call_t *)_umtx_lock, AUE_NULL, NULL, 0, 0 },	/* 434 = _umtx_lock */
+	{ AS(_umtx_unlock_args), (sy_call_t *)_umtx_unlock, AUE_NULL, NULL, 0, 0 },	/* 435 = _umtx_unlock */
+	{ AS(jail_attach_args), (sy_call_t *)jail_attach, AUE_NULL, NULL, 0, 0 },	/* 436 = jail_attach */
+	{ AS(extattr_list_fd_args), (sy_call_t *)extattr_list_fd, AUE_EXTATTR_LIST_FD, NULL, 0, 0 },	/* 437 = extattr_list_fd */
+	{ AS(extattr_list_file_args), (sy_call_t *)extattr_list_file, AUE_EXTATTR_LIST_FILE, NULL, 0, 0 },	/* 438 = extattr_list_file */
+	{ AS(extattr_list_link_args), (sy_call_t *)extattr_list_link, AUE_EXTATTR_LIST_LINK, NULL, 0, 0 },	/* 439 = extattr_list_link */
+	{ AS(kse_switchin_args), (sy_call_t *)kse_switchin, AUE_NULL, NULL, 0, 0 },	/* 440 = kse_switchin */
+	{ AS(ksem_timedwait_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 },	/* 441 = ksem_timedwait */
+	{ AS(thr_suspend_args), (sy_call_t *)thr_suspend, AUE_NULL, NULL, 0, 0 },	/* 442 = thr_suspend */
+	{ AS(thr_wake_args), (sy_call_t *)thr_wake, AUE_NULL, NULL, 0, 0 },	/* 443 = thr_wake */
+	{ AS(kldunloadf_args), (sy_call_t *)kldunloadf, AUE_MODUNLOAD, NULL, 0, 0 },	/* 444 = kldunloadf */
+	{ AS(audit_args), (sy_call_t *)audit, AUE_AUDIT, NULL, 0, 0 },	/* 445 = audit */
+	{ AS(auditon_args), (sy_call_t *)auditon, AUE_AUDITON, NULL, 0, 0 },	/* 446 = auditon */
+	{ AS(getauid_args), (sy_call_t *)getauid, AUE_GETAUID, NULL, 0, 0 },	/* 447 = getauid */
+	{ AS(setauid_args), (sy_call_t *)setauid, AUE_SETAUID, NULL, 0, 0 },	/* 448 = setauid */
+	{ AS(getaudit_args), (sy_call_t *)getaudit, AUE_GETAUDIT, NULL, 0, 0 },	/* 449 = getaudit */
+	{ AS(setaudit_args), (sy_call_t *)setaudit, AUE_SETAUDIT, NULL, 0, 0 },	/* 450 = setaudit */
+	{ AS(getaudit_addr_args), (sy_call_t *)getaudit_addr, AUE_GETAUDIT_ADDR, NULL, 0, 0 },	/* 451 = getaudit_addr */
+	{ AS(setaudit_addr_args), (sy_call_t *)setaudit_addr, AUE_SETAUDIT_ADDR, NULL, 0, 0 },	/* 452 = setaudit_addr */
+	{ AS(auditctl_args), (sy_call_t *)auditctl, AUE_AUDITCTL, NULL, 0, 0 },	/* 453 = auditctl */
+	{ AS(_umtx_op_args), (sy_call_t *)_umtx_op, AUE_NULL, NULL, 0, 0 },	/* 454 = _umtx_op */
+	{ AS(thr_new_args), (sy_call_t *)thr_new, AUE_NULL, NULL, 0, 0 },	/* 455 = thr_new */
+	{ AS(sigqueue_args), (sy_call_t *)sigqueue, AUE_NULL, NULL, 0, 0 },	/* 456 = sigqueue */
+	{ AS(kmq_open_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 },	/* 457 = kmq_open */
+	{ AS(kmq_setattr_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 },	/* 458 = kmq_setattr */
+	{ AS(kmq_timedreceive_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 },	/* 459 = kmq_timedreceive */
+	{ AS(kmq_timedsend_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 },	/* 460 = kmq_timedsend */
+	{ AS(kmq_notify_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 },	/* 461 = kmq_notify */
+	{ AS(kmq_unlink_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 },	/* 462 = kmq_unlink */
+	{ AS(abort2_args), (sy_call_t *)abort2, AUE_NULL, NULL, 0, 0 },	/* 463 = abort2 */
+	{ AS(thr_set_name_args), (sy_call_t *)thr_set_name, AUE_NULL, NULL, 0, 0 },	/* 464 = thr_set_name */
+	{ AS(aio_fsync_args), (sy_call_t *)lkmressys, AUE_NULL, NULL, 0, 0 },	/* 465 = aio_fsync */
+	{ AS(rtprio_thread_args), (sy_call_t *)rtprio_thread, AUE_RTPRIO, NULL, 0, 0 },	/* 466 = rtprio_thread */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 467 = nosys */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 468 = nosys */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 469 = __getpath_fromfd */
+	{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },			/* 470 = __getpath_fromaddr */
+	{ AS(sctp_peeloff_args), (sy_call_t *)sctp_peeloff, AUE_NULL, NULL, 0, 0 },	/* 471 = sctp_peeloff */
+	{ AS(sctp_generic_sendmsg_args), (sy_call_t *)sctp_generic_sendmsg, AUE_NULL, NULL, 0, 0 },	/* 472 = sctp_generic_sendmsg */
+	{ AS(sctp_generic_sendmsg_iov_args), (sy_call_t *)sctp_generic_sendmsg_iov, AUE_NULL, NULL, 0, 0 },	/* 473 = sctp_generic_sendmsg_iov */
+	{ AS(sctp_generic_recvmsg_args), (sy_call_t *)sctp_generic_recvmsg, AUE_NULL, NULL, 0, 0 },	/* 474 = sctp_generic_recvmsg */
+	{ AS(pread_args), (sy_call_t *)pread, AUE_PREAD, NULL, 0, 0 },	/* 475 = pread */
+	{ AS(pwrite_args), (sy_call_t *)pwrite, AUE_PWRITE, NULL, 0, 0 },	/* 476 = pwrite */
+	{ AS(mmap_args), (sy_call_t *)mmap, AUE_MMAP, NULL, 0, 0 },	/* 477 = mmap */
+	{ AS(lseek_args), (sy_call_t *)lseek, AUE_LSEEK, NULL, 0, 0 },	/* 478 = lseek */
+	{ AS(truncate_args), (sy_call_t *)truncate, AUE_TRUNCATE, NULL, 0, 0 },	/* 479 = truncate */
+	{ AS(ftruncate_args), (sy_call_t *)ftruncate, AUE_FTRUNCATE, NULL, 0, 0 },	/* 480 = ftruncate */
+	{ AS(thr_kill2_args), (sy_call_t *)thr_kill2, AUE_KILL, NULL, 0, 0 },	/* 481 = thr_kill2 */
 };
Index: sched_ule.c
===================================================================
RCS file: /home/cvs/src/sys/kern/sched_ule.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/sched_ule.c -L sys/kern/sched_ule.c -u -r1.1.1.1 -r1.2
--- sys/kern/sched_ule.c
+++ sys/kern/sched_ule.c
@@ -1,5 +1,5 @@
 /*-
- * Copyright (c) 2002-2005, Jeffrey Roberson <jeff at freebsd.org>
+ * Copyright (c) 2002-2007, Jeffrey Roberson <jeff at freebsd.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -24,14 +24,23 @@
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
+/*
+ * This file implements the ULE scheduler.  ULE supports independent CPU
+ * run queues and fine grain locking.  It has superior interactive
+ * performance under load even on uni-processor systems.
+ *
+ * etymology:
+ *   ULE is the last three letters in schedule.  It owes its name to a
+ * generic user created for a scheduling system by Paul Mikesell at
+ * Isilon Systems and a general lack of creativity on the part of the author.
+ */
+
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/sched_ule.c,v 1.153.2.3 2005/09/27 12:00:31 davidxu Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/sched_ule.c,v 1.214.2.2 2007/12/20 07:15:40 davidxu Exp $");
 
 #include "opt_hwpmc_hooks.h"
 #include "opt_sched.h"
 
-#define kse td_sched
-
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kdb.h>
@@ -48,6 +57,7 @@
 #include <sys/sysctl.h>
 #include <sys/sysproto.h>
 #include <sys/turnstile.h>
+#include <sys/umtx.h>
 #include <sys/vmmeter.h>
 #ifdef KTRACE
 #include <sys/uio.h>
@@ -61,111 +71,84 @@
 #include <machine/cpu.h>
 #include <machine/smp.h>
 
-/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
-/* XXX This is bogus compatability crap for ps */
-static fixpt_t  ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */
-SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
-
-static void sched_setup(void *dummy);
-SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)
-
-static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0, "Scheduler");
-
-SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ule", 0,
-    "Scheduler name");
-
-static int slice_min = 1;
-SYSCTL_INT(_kern_sched, OID_AUTO, slice_min, CTLFLAG_RW, &slice_min, 0, "");
-
-static int slice_max = 10;
-SYSCTL_INT(_kern_sched, OID_AUTO, slice_max, CTLFLAG_RW, &slice_max, 0, "");
+#if !defined(__i386__) && !defined(__amd64__) && !defined(__arm__)
+#error "This architecture is not currently compatible with ULE"
+#endif
 
-int realstathz;
-int tickincr = 1;
+#define	KTR_ULE	0
 
 /*
- * The following datastructures are allocated within their parent structure
- * but are scheduler specific.
- */
-/*
- * The schedulable entity that can be given a context to run.  A process may
- * have several of these.
- */
-struct kse {
-	TAILQ_ENTRY(kse) ke_procq;	/* (j/z) Run queue. */
-	int		ke_flags;	/* (j) KEF_* flags. */
-	struct thread	*ke_thread;	/* (*) Active associated thread. */
-	fixpt_t		ke_pctcpu;	/* (j) %cpu during p_swtime. */
-	char		ke_rqindex;	/* (j) Run queue index. */
-	enum {
-		KES_THREAD = 0x0,	/* slaved to thread state */
-		KES_ONRUNQ
-	} ke_state;			/* (j) thread sched specific status. */
-	int		ke_slptime;
-	int		ke_slice;
-	struct runq	*ke_runq;
-	u_char		ke_cpu;		/* CPU that we have affinity for. */
+ * Thread scheduler specific section.  All fields are protected
+ * by the thread lock.
+ */
+struct td_sched {	
+	TAILQ_ENTRY(td_sched) ts_procq;	/* Run queue. */
+	struct thread	*ts_thread;	/* Active associated thread. */
+	struct runq	*ts_runq;	/* Run-queue we're queued on. */
+	short		ts_flags;	/* TSF_* flags. */
+	u_char		ts_rqindex;	/* Run queue index. */
+	u_char		ts_cpu;		/* CPU that we have affinity for. */
+	int		ts_slice;	/* Ticks of slice remaining. */
+	u_int		ts_slptime;	/* Number of ticks we vol. slept */
+	u_int		ts_runtime;	/* Number of ticks we were running */
 	/* The following variables are only used for pctcpu calculation */
-	int		ke_ltick;	/* Last tick that we were running on */
-	int		ke_ftick;	/* First tick that we were running on */
-	int		ke_ticks;	/* Tick count */
-
-};
-#define	td_kse			td_sched
-#define	td_slptime		td_kse->ke_slptime
-#define ke_proc			ke_thread->td_proc
-#define ke_ksegrp		ke_thread->td_ksegrp
-#define	ke_assign		ke_procq.tqe_next
-/* flags kept in ke_flags */
-#define	KEF_ASSIGNED	0x0001		/* Thread is being migrated. */
-#define	KEF_BOUND	0x0002		/* Thread can not migrate. */
-#define	KEF_XFERABLE	0x0004		/* Thread was added as transferable. */
-#define	KEF_HOLD	0x0008		/* Thread is temporarily bound. */
-#define	KEF_REMOVED	0x0010		/* Thread was removed while ASSIGNED */
-#define	KEF_INTERNAL	0x0020		/* Thread added due to migration. */
-#define	KEF_PREEMPTED	0x0040		/* Thread was preempted */
-#define	KEF_DIDRUN	0x02000		/* Thread actually ran. */
-#define	KEF_EXIT	0x04000		/* Thread is being killed. */
-
-struct kg_sched {
-	struct thread	*skg_last_assigned; /* (j) Last thread assigned to */
-					   /* the system scheduler */
-	int	skg_slptime;		/* Number of ticks we vol. slept */
-	int	skg_runtime;		/* Number of ticks we were running */
-	int	skg_avail_opennings;	/* (j) Num unfilled slots in group.*/
-	int	skg_concurrency;	/* (j) Num threads requested in group.*/
+	int		ts_ltick;	/* Last tick that we were running on */
+	int		ts_ftick;	/* First tick that we were running on */
+	int		ts_ticks;	/* Tick count */
+#ifdef SMP
+	int		ts_rltick;	/* Real last tick, for affinity. */
+#endif
 };
-#define kg_last_assigned	kg_sched->skg_last_assigned
-#define kg_avail_opennings	kg_sched->skg_avail_opennings
-#define kg_concurrency		kg_sched->skg_concurrency
-#define kg_runtime		kg_sched->skg_runtime
-#define kg_slptime		kg_sched->skg_slptime
-
-#define SLOT_RELEASE(kg)	(kg)->kg_avail_opennings++
-#define	SLOT_USE(kg)		(kg)->kg_avail_opennings--
-
-static struct kse kse0;
-static struct kg_sched kg_sched0;
-
-/*
- * The priority is primarily determined by the interactivity score.  Thus, we
- * give lower(better) priorities to kse groups that use less CPU.  The nice
- * value is then directly added to this to allow nice to have some effect
- * on latency.
+/* flags kept in ts_flags */
+#define	TSF_BOUND	0x0001		/* Thread can not migrate. */
+#define	TSF_XFERABLE	0x0002		/* Thread was added as transferable. */
+
+static struct td_sched td_sched0;
+
+/*
+ * Cpu percentage computation macros and defines.
+ *
+ * SCHED_TICK_SECS:	Number of seconds to average the cpu usage across.
+ * SCHED_TICK_TARG:	Number of hz ticks to average the cpu usage across.
+ * SCHED_TICK_MAX:	Maximum number of ticks before scaling back.
+ * SCHED_TICK_SHIFT:	Shift factor to avoid rounding away results.
+ * SCHED_TICK_HZ:	Compute the number of hz ticks for a given ticks count.
+ * SCHED_TICK_TOTAL:	Gives the amount of time we've been recording ticks.
+ */
+#define	SCHED_TICK_SECS		10
+#define	SCHED_TICK_TARG		(hz * SCHED_TICK_SECS)
+#define	SCHED_TICK_MAX		(SCHED_TICK_TARG + hz)
+#define	SCHED_TICK_SHIFT	10
+#define	SCHED_TICK_HZ(ts)	((ts)->ts_ticks >> SCHED_TICK_SHIFT)
+#define	SCHED_TICK_TOTAL(ts)	(max((ts)->ts_ltick - (ts)->ts_ftick, hz))
+
+/*
+ * These macros determine priorities for non-interactive threads.  They are
+ * assigned a priority based on their recent cpu utilization as expressed
+ * by the ratio of ticks to the tick total.  NHALF priorities at the start
+ * and end of the MIN to MAX timeshare range are only reachable with negative
+ * or positive nice respectively.
  *
- * PRI_RANGE:	Total priority range for timeshare threads.
+ * PRI_RANGE:	Priority range for utilization dependent priorities.
  * PRI_NRESV:	Number of nice values.
- * PRI_BASE:	The start of the dynamic range.
+ * PRI_TICKS:	Compute a priority in PRI_RANGE from the ticks count and total.
+ * PRI_NICE:	Determines the part of the priority inherited from nice.
  */
-#define	SCHED_PRI_RANGE		(PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE + 1)
-#define	SCHED_PRI_NRESV		((PRIO_MAX - PRIO_MIN) + 1)
+#define	SCHED_PRI_NRESV		(PRIO_MAX - PRIO_MIN)
 #define	SCHED_PRI_NHALF		(SCHED_PRI_NRESV / 2)
-#define	SCHED_PRI_BASE		(PRI_MIN_TIMESHARE)
-#define	SCHED_PRI_INTERACT(score)					\
-    ((score) * SCHED_PRI_RANGE / SCHED_INTERACT_MAX)
+#define	SCHED_PRI_MIN		(PRI_MIN_TIMESHARE + SCHED_PRI_NHALF)
+#define	SCHED_PRI_MAX		(PRI_MAX_TIMESHARE - SCHED_PRI_NHALF)
+#define	SCHED_PRI_RANGE		(SCHED_PRI_MAX - SCHED_PRI_MIN)
+#define	SCHED_PRI_TICKS(ts)						\
+    (SCHED_TICK_HZ((ts)) /						\
+    (roundup(SCHED_TICK_TOTAL((ts)), SCHED_PRI_RANGE) / SCHED_PRI_RANGE))
+#define	SCHED_PRI_NICE(nice)	(nice)
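
To get a feel for the new priority macros, here is a sketch of the magnitude SCHED_PRI_TICKS() produces for a thread that was on-CPU for roughly half of a full averaging window.  The hz value, the range width, and the helper name below are assumptions for illustration, not values taken from the patch:

	/* Sketch only: rough magnitude of SCHED_PRI_TICKS(). */
	static int
	example_pri_ticks(void)
	{
		int hz = 1000;			/* assumed clock rate */
		int range = 23;			/* assumed SCHED_PRI_RANGE */
		int total = 10 * hz;		/* SCHED_TICK_TOTAL(): full window */
		int tick_hz = 5 * hz;		/* SCHED_TICK_HZ(): on-CPU ~half the time */
		int bucket;

		/* roundup(total, range) / range, as in SCHED_PRI_TICKS(). */
		bucket = ((total + range - 1) / range * range) / range;
		return (tick_hz / bucket);	/* ~11, mid-range; nice is added on top */
	}
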
 
 /*
- * These determine the interactivity of a process.
+ * These determine the interactivity of a process.  Interactivity differs from
+ * cpu utilization in that it expresses the voluntary time slept vs time ran
+ * while cpu utilization includes all time not running.  This more accurately
+ * models the intent of the thread.
  *
  * SLP_RUN_MAX:	Maximum amount of sleep time + run time we'll accumulate
  *		before throttling back.
@@ -173,280 +156,351 @@
  * INTERACT_MAX:	Maximum interactivity value.  Smaller is better.
 * INTERACT_THRESH:	Threshold for placement on the current runq.
  */
-#define	SCHED_SLP_RUN_MAX	((hz * 5) << 10)
-#define	SCHED_SLP_RUN_FORK	((hz / 2) << 10)
+#define	SCHED_SLP_RUN_MAX	((hz * 5) << SCHED_TICK_SHIFT)
+#define	SCHED_SLP_RUN_FORK	((hz / 2) << SCHED_TICK_SHIFT)
 #define	SCHED_INTERACT_MAX	(100)
 #define	SCHED_INTERACT_HALF	(SCHED_INTERACT_MAX / 2)
 #define	SCHED_INTERACT_THRESH	(30)
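
The scoring function itself (sched_interact_score()) is declared further down but its body falls outside this hunk.  As a rough, hypothetical sketch of the idea only, not the patch's exact formula: the voluntary sleep/run ratio is folded into a 0..SCHED_INTERACT_MAX scale where smaller means more interactive, and anything under SCHED_INTERACT_THRESH is placed with the interactive threads:

	/* Hypothetical sketch of an interactivity score; smaller = more interactive. */
	static int
	example_interact_score(unsigned int slp, unsigned int run)
	{
		if (run >= slp)			/* mostly running: upper half of the scale */
			return (50 + (50 - (slp * 50) / (run ? run : 1)));
		return ((run * 50) / slp);	/* mostly sleeping: lower half */
	}
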
 
 /*
- * These parameters and macros determine the size of the time slice that is
- * granted to each thread.
- *
- * SLICE_MIN:	Minimum time slice granted, in units of ticks.
- * SLICE_MAX:	Maximum time slice granted.
- * SLICE_RANGE:	Range of available time slices scaled by hz.
- * SLICE_SCALE:	The number slices granted per val in the range of [0, max].
- * SLICE_NICE:  Determine the amount of slice granted to a scaled nice.
- * SLICE_NTHRESH:	The nice cutoff point for slice assignment.
- */
-#define	SCHED_SLICE_MIN			(slice_min)
-#define	SCHED_SLICE_MAX			(slice_max)
-#define	SCHED_SLICE_INTERACTIVE		(slice_max)
-#define	SCHED_SLICE_NTHRESH	(SCHED_PRI_NHALF - 1)
-#define	SCHED_SLICE_RANGE		(SCHED_SLICE_MAX - SCHED_SLICE_MIN + 1)
-#define	SCHED_SLICE_SCALE(val, max)	(((val) * SCHED_SLICE_RANGE) / (max))
-#define	SCHED_SLICE_NICE(nice)						\
-    (SCHED_SLICE_MAX - SCHED_SLICE_SCALE((nice), SCHED_SLICE_NTHRESH))
-
-/*
- * This macro determines whether or not the thread belongs on the current or
- * next run queue.
- */
-#define	SCHED_INTERACTIVE(kg)						\
-    (sched_interact_score(kg) < SCHED_INTERACT_THRESH)
-#define	SCHED_CURR(kg, ke)						\
-    ((ke->ke_thread->td_flags & TDF_BORROWING) ||			\
-     (ke->ke_flags & KEF_PREEMPTED) || SCHED_INTERACTIVE(kg))
-
-/*
- * Cpu percentage computation macros and defines.
- *
- * SCHED_CPU_TIME:	Number of seconds to average the cpu usage across.
- * SCHED_CPU_TICKS:	Number of hz ticks to average the cpu usage across.
- */
-
-#define	SCHED_CPU_TIME	10
-#define	SCHED_CPU_TICKS	(hz * SCHED_CPU_TIME)
+ * tickincr:		Converts a stathz tick into a hz domain scaled by
+ *			the shift factor.  Without the shift the error rate
+ *			due to rounding would be unacceptably high.
+ * realstathz:		stathz is sometimes 0, in which case we run off of hz.
+ * sched_slice:		Runtime of each thread before rescheduling.
+ * preempt_thresh:	Priority threshold for preemption and remote IPIs.
+ */
+static int sched_interact = SCHED_INTERACT_THRESH;
+static int realstathz;
+static int tickincr;
+static int sched_slice;
+#ifdef PREEMPTION
+#ifdef FULL_PREEMPTION
+static int preempt_thresh = PRI_MAX_IDLE;
+#else
+static int preempt_thresh = PRI_MIN_KERN;
+#endif
+#else 
+static int preempt_thresh = 0;
+#endif
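
sched_initticks() at the tail of this diff derives tickincr as (hz << SCHED_TICK_SHIFT) / realstathz.  A hypothetical numeric example (hz and stathz values assumed) of why the shift is carried along:

	/* Sketch: the shifted tickincr avoids a large truncation error. */
	static int
	example_tickincr(void)
	{
		int hz = 1000, stathz = 128;	/* assumed clock rates */
		int shift = 10;			/* SCHED_TICK_SHIFT */
		int incr;

		incr = (hz << shift) / stathz;	/* 8000, i.e. ~7.8 hz ticks, scaled */
		/* The unshifted 1000 / 128 would truncate to 7 and drift ~10% low. */
		return (incr);
	}
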
 
 /*
- * kseq - per processor runqs and statistics.
- */
-struct kseq {
-	struct runq	ksq_idle;		/* Queue of IDLE threads. */
-	struct runq	ksq_timeshare[2];	/* Run queues for !IDLE. */
-	struct runq	*ksq_next;		/* Next timeshare queue. */
-	struct runq	*ksq_curr;		/* Current queue. */
-	int		ksq_load_timeshare;	/* Load for timeshare. */
-	int		ksq_load;		/* Aggregate load. */
-	short		ksq_nice[SCHED_PRI_NRESV]; /* KSEs in each nice bin. */
-	short		ksq_nicemin;		/* Least nice. */
-#ifdef SMP
-	int			ksq_transferable;
-	LIST_ENTRY(kseq)	ksq_siblings;	/* Next in kseq group. */
-	struct kseq_group	*ksq_group;	/* Our processor group. */
-	volatile struct kse	*ksq_assigned;	/* assigned by another CPU. */
+ * tdq - per processor runqs and statistics.  All fields are protected by the
+ * tdq_lock.  The load and lowpri may be accessed without it to avoid excess
+ * locking in sched_pickcpu().
+ */
+struct tdq {
+	struct mtx	*tdq_lock;		/* Pointer to group lock. */
+	struct runq	tdq_realtime;		/* real-time run queue. */
+	struct runq	tdq_timeshare;		/* timeshare run queue. */
+	struct runq	tdq_idle;		/* Queue of IDLE threads. */
+	int		tdq_load;		/* Aggregate load. */
+	u_char		tdq_idx;		/* Current insert index. */
+	u_char		tdq_ridx;		/* Current removal index. */
+#ifdef SMP
+	u_char		tdq_lowpri;		/* Lowest priority thread. */
+	int		tdq_transferable;	/* Transferable thread count. */
+	LIST_ENTRY(tdq)	tdq_siblings;		/* Next in tdq group. */
+	struct tdq_group *tdq_group;		/* Our processor group. */
 #else
-	int		ksq_sysload;		/* For loadavg, !ITHD load. */
+	int		tdq_sysload;		/* For loadavg, !ITHD load. */
 #endif
-};
+} __aligned(64);
+
 
 #ifdef SMP
 /*
- * kseq groups are groups of processors which can cheaply share threads.  When
+ * tdq groups are groups of processors which can cheaply share threads.  When
  * one processor in the group goes idle it will check the runqs of the other
  * processors in its group prior to halting and waiting for an interrupt.
 * These groups are suitable for SMT (Symmetric Multi-Threading) and not NUMA.
  * In a numa environment we'd want an idle bitmap per group and a two tiered
  * load balancer.
  */
-struct kseq_group {
-	int	ksg_cpus;		/* Count of CPUs in this kseq group. */
-	cpumask_t ksg_cpumask;		/* Mask of cpus in this group. */
-	cpumask_t ksg_idlemask;		/* Idle cpus in this group. */
-	cpumask_t ksg_mask;		/* Bit mask for first cpu. */
-	int	ksg_load;		/* Total load of this group. */
-	int	ksg_transferable;	/* Transferable load of this group. */
-	LIST_HEAD(, kseq)	ksg_members; /* Linked list of all members. */
-};
-#endif
-
-/*
- * One kse queue per processor.
- */
-#ifdef SMP
-static cpumask_t kseq_idle;
-static int ksg_maxid;
-static struct kseq	kseq_cpu[MAXCPU];
-static struct kseq_group kseq_groups[MAXCPU];
-static int bal_tick;
-static int gbal_tick;
-static int balance_groups;
-
-#define	KSEQ_SELF()	(&kseq_cpu[PCPU_GET(cpuid)])
-#define	KSEQ_CPU(x)	(&kseq_cpu[(x)])
-#define	KSEQ_ID(x)	((x) - kseq_cpu)
-#define	KSEQ_GROUP(x)	(&kseq_groups[(x)])
+struct tdq_group {
+	struct mtx	tdg_lock;	/* Protects all fields below. */
+	int		tdg_cpus;	/* Count of CPUs in this tdq group. */
+	cpumask_t 	tdg_cpumask;	/* Mask of cpus in this group. */
+	cpumask_t 	tdg_idlemask;	/* Idle cpus in this group. */
+	cpumask_t 	tdg_mask;	/* Bit mask for first cpu. */
+	int		tdg_load;	/* Total load of this group. */
+	int	tdg_transferable;	/* Transferable load of this group. */
+	LIST_HEAD(, tdq) tdg_members;	/* Linked list of all members. */
+	char		tdg_name[16];	/* lock name. */
+} __aligned(64);
+
+#define	SCHED_AFFINITY_DEFAULT	(max(1, hz / 300))
+#define	SCHED_AFFINITY(ts)	((ts)->ts_rltick > ticks - affinity)
+
+/*
+ * Run-time tunables.
+ */
+static int rebalance = 1;
+static int balance_interval = 128;	/* Default set in sched_initticks(). */
+static int pick_pri = 1;
+static int affinity;
+static int tryself = 1;
+static int steal_htt = 1;
+static int steal_idle = 1;
+static int steal_thresh = 2;
+static int topology = 0;
+
+/*
+ * One thread queue per processor.
+ */
+static volatile cpumask_t tdq_idle;
+static int tdg_maxid;
+static struct tdq	tdq_cpu[MAXCPU];
+static struct tdq_group tdq_groups[MAXCPU];
+static struct tdq	*balance_tdq;
+static int balance_group_ticks;
+static int balance_ticks;
+
+#define	TDQ_SELF()	(&tdq_cpu[PCPU_GET(cpuid)])
+#define	TDQ_CPU(x)	(&tdq_cpu[(x)])
+#define	TDQ_ID(x)	((int)((x) - tdq_cpu))
+#define	TDQ_GROUP(x)	(&tdq_groups[(x)])
+#define	TDG_ID(x)	((int)((x) - tdq_groups))
 #else	/* !SMP */
-static struct kseq	kseq_cpu;
+static struct tdq	tdq_cpu;
+static struct mtx	tdq_lock;
 
-#define	KSEQ_SELF()	(&kseq_cpu)
-#define	KSEQ_CPU(x)	(&kseq_cpu)
+#define	TDQ_ID(x)	(0)
+#define	TDQ_SELF()	(&tdq_cpu)
+#define	TDQ_CPU(x)	(&tdq_cpu)
 #endif
 
-static void slot_fill(struct ksegrp *);
-static struct kse *sched_choose(void);		/* XXX Should be thread * */
-static void sched_slice(struct kse *);
-static void sched_priority(struct ksegrp *);
+#define	TDQ_LOCK_ASSERT(t, type)	mtx_assert(TDQ_LOCKPTR((t)), (type))
+#define	TDQ_LOCK(t)		mtx_lock_spin(TDQ_LOCKPTR((t)))
+#define	TDQ_LOCK_FLAGS(t, f)	mtx_lock_spin_flags(TDQ_LOCKPTR((t)), (f))
+#define	TDQ_UNLOCK(t)		mtx_unlock_spin(TDQ_LOCKPTR((t)))
+#define	TDQ_LOCKPTR(t)		((t)->tdq_lock)
+
+static void sched_priority(struct thread *);
 static void sched_thread_priority(struct thread *, u_char);
-static int sched_interact_score(struct ksegrp *);
-static void sched_interact_update(struct ksegrp *);
-static void sched_interact_fork(struct ksegrp *);
-static void sched_pctcpu_update(struct kse *);
+static int sched_interact_score(struct thread *);
+static void sched_interact_update(struct thread *);
+static void sched_interact_fork(struct thread *);
+static void sched_pctcpu_update(struct td_sched *);
 
 /* Operations on per processor queues */
-static struct kse * kseq_choose(struct kseq *);
-static void kseq_setup(struct kseq *);
-static void kseq_load_add(struct kseq *, struct kse *);
-static void kseq_load_rem(struct kseq *, struct kse *);
-static __inline void kseq_runq_add(struct kseq *, struct kse *, int);
-static __inline void kseq_runq_rem(struct kseq *, struct kse *);
-static void kseq_nice_add(struct kseq *, int);
-static void kseq_nice_rem(struct kseq *, int);
-void kseq_print(int cpu);
-#ifdef SMP
-static int kseq_transfer(struct kseq *, struct kse *, int);
-static struct kse *runq_steal(struct runq *);
+static struct td_sched * tdq_choose(struct tdq *);
+static void tdq_setup(struct tdq *);
+static void tdq_load_add(struct tdq *, struct td_sched *);
+static void tdq_load_rem(struct tdq *, struct td_sched *);
+static __inline void tdq_runq_add(struct tdq *, struct td_sched *, int);
+static __inline void tdq_runq_rem(struct tdq *, struct td_sched *);
+void tdq_print(int cpu);
+static void runq_print(struct runq *rq);
+static void tdq_add(struct tdq *, struct thread *, int);
+#ifdef SMP
+static void tdq_move(struct tdq *, struct tdq *);
+static int tdq_idled(struct tdq *);
+static void tdq_notify(struct td_sched *);
+static struct td_sched *tdq_steal(struct tdq *);
+static struct td_sched *runq_steal(struct runq *);
+static int sched_pickcpu(struct td_sched *, int);
 static void sched_balance(void);
 static void sched_balance_groups(void);
-static void sched_balance_group(struct kseq_group *);
-static void sched_balance_pair(struct kseq *, struct kseq *);
-static void kseq_move(struct kseq *, int);
-static int kseq_idled(struct kseq *);
-static void kseq_notify(struct kse *, int);
-static void kseq_assign(struct kseq *);
-static struct kse *kseq_steal(struct kseq *, int);
-#define	KSE_CAN_MIGRATE(ke)						\
-    ((ke)->ke_thread->td_pinned == 0 && ((ke)->ke_flags & KEF_BOUND) == 0)
+static void sched_balance_group(struct tdq_group *);
+static void sched_balance_pair(struct tdq *, struct tdq *);
+static inline struct tdq *sched_setcpu(struct td_sched *, int, int);
+static inline struct mtx *thread_block_switch(struct thread *);
+static inline void thread_unblock_switch(struct thread *, struct mtx *);
+static struct mtx *sched_switch_migrate(struct tdq *, struct thread *, int);
+
+#define	THREAD_CAN_MIGRATE(td)	 ((td)->td_pinned == 0)
 #endif
 
-void
-kseq_print(int cpu)
+static void sched_setup(void *dummy);
+SYSINIT(sched_setup, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, sched_setup, NULL)
+
+static void sched_initticks(void *dummy);
+SYSINIT(sched_initticks, SI_SUB_CLOCKS, SI_ORDER_THIRD, sched_initticks, NULL)
+
+/*
+ * Print the threads waiting on a run-queue.
+ */
+static void
+runq_print(struct runq *rq)
 {
-	struct kseq *kseq;
+	struct rqhead *rqh;
+	struct td_sched *ts;
+	int pri;
+	int j;
 	int i;
 
-	kseq = KSEQ_CPU(cpu);
+	for (i = 0; i < RQB_LEN; i++) {
+		printf("\t\trunq bits %d 0x%zx\n",
+		    i, rq->rq_status.rqb_bits[i]);
+		for (j = 0; j < RQB_BPW; j++)
+			if (rq->rq_status.rqb_bits[i] & (1ul << j)) {
+				pri = j + (i << RQB_L2BPW);
+				rqh = &rq->rq_queues[pri];
+				TAILQ_FOREACH(ts, rqh, ts_procq) {
+					printf("\t\t\ttd %p(%s) priority %d rqindex %d pri %d\n",
+					    ts->ts_thread, ts->ts_thread->td_proc->p_comm, ts->ts_thread->td_priority, ts->ts_rqindex, pri);
+				}
+			}
+	}
+}
+
+/*
+ * Print the status of a per-cpu thread queue.  Should be a ddb show cmd.
+ */
+void
+tdq_print(int cpu)
+{
+	struct tdq *tdq;
+
+	tdq = TDQ_CPU(cpu);
 
-	printf("kseq:\n");
-	printf("\tload:           %d\n", kseq->ksq_load);
-	printf("\tload TIMESHARE: %d\n", kseq->ksq_load_timeshare);
-#ifdef SMP
-	printf("\tload transferable: %d\n", kseq->ksq_transferable);
-#endif
-	printf("\tnicemin:\t%d\n", kseq->ksq_nicemin);
-	printf("\tnice counts:\n");
-	for (i = 0; i < SCHED_PRI_NRESV; i++)
-		if (kseq->ksq_nice[i])
-			printf("\t\t%d = %d\n",
-			    i - SCHED_PRI_NHALF, kseq->ksq_nice[i]);
+	printf("tdq %d:\n", TDQ_ID(tdq));
+	printf("\tlockptr         %p\n", TDQ_LOCKPTR(tdq));
+	printf("\tload:           %d\n", tdq->tdq_load);
+	printf("\ttimeshare idx:  %d\n", tdq->tdq_idx);
+	printf("\ttimeshare ridx: %d\n", tdq->tdq_ridx);
+	printf("\trealtime runq:\n");
+	runq_print(&tdq->tdq_realtime);
+	printf("\ttimeshare runq:\n");
+	runq_print(&tdq->tdq_timeshare);
+	printf("\tidle runq:\n");
+	runq_print(&tdq->tdq_idle);
+#ifdef SMP
+	printf("\tload transferable: %d\n", tdq->tdq_transferable);
+	printf("\tlowest priority:   %d\n", tdq->tdq_lowpri);
+	printf("\tgroup:             %d\n", TDG_ID(tdq->tdq_group));
+	printf("\tLock name:         %s\n", tdq->tdq_group->tdg_name);
+#endif
 }
 
+#define	TS_RQ_PPQ	(((PRI_MAX_TIMESHARE - PRI_MIN_TIMESHARE) + 1) / RQ_NQS)
+/*
+ * Add a thread to the actual run-queue.  Keeps transferable counts up to
+ * date with what is actually on the run-queue.  Selects the correct
+ * queue position for timeshare threads.
+ */
 static __inline void
-kseq_runq_add(struct kseq *kseq, struct kse *ke, int flags)
+tdq_runq_add(struct tdq *tdq, struct td_sched *ts, int flags)
 {
+	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
+	THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED);
 #ifdef SMP
-	if (KSE_CAN_MIGRATE(ke)) {
-		kseq->ksq_transferable++;
-		kseq->ksq_group->ksg_transferable++;
-		ke->ke_flags |= KEF_XFERABLE;
+	if (THREAD_CAN_MIGRATE(ts->ts_thread)) {
+		tdq->tdq_transferable++;
+		tdq->tdq_group->tdg_transferable++;
+		ts->ts_flags |= TSF_XFERABLE;
 	}
 #endif
-	if (ke->ke_flags & KEF_PREEMPTED)
-		flags |= SRQ_PREEMPTED;
-	runq_add(ke->ke_runq, ke, flags);
+	if (ts->ts_runq == &tdq->tdq_timeshare) {
+		u_char pri;
+
+		pri = ts->ts_thread->td_priority;
+		KASSERT(pri <= PRI_MAX_TIMESHARE && pri >= PRI_MIN_TIMESHARE,
+			("Invalid priority %d on timeshare runq", pri));
+		/*
+		 * This queue contains only priorities between MIN and MAX
+		 * realtime.  Use the whole queue to represent these values.
+		 */
+		if ((flags & (SRQ_BORROWING|SRQ_PREEMPTED)) == 0) {
+			pri = (pri - PRI_MIN_TIMESHARE) / TS_RQ_PPQ;
+			pri = (pri + tdq->tdq_idx) % RQ_NQS;
+			/*
+			 * This effectively shortens the queue by one so we
+			 * can have a one slot difference between idx and
+			 * ridx while we wait for threads to drain.
+			 */
+			if (tdq->tdq_ridx != tdq->tdq_idx &&
+			    pri == tdq->tdq_ridx)
+				pri = (unsigned char)(pri - 1) % RQ_NQS;
+		} else
+			pri = tdq->tdq_ridx;
+		runq_add_pri(ts->ts_runq, ts, pri, flags);
+	} else
+		runq_add(ts->ts_runq, ts, flags);
 }
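
A worked sketch of the circular timeshare placement done above: worse priorities land further ahead of the removal index, so CPU hogs wait longer before the rotating head reaches them.  The constants used here (a PRI_MIN_TIMESHARE of 160, RQ_NQS of 64, TS_RQ_PPQ of 1) are assumed typical values, not taken from this patch:

	/* Sketch: which circular-queue slot a timeshare thread is inserted into. */
	static int
	example_ts_slot(int td_pri, int idx, int ridx)
	{
		int pri_min = 160, nqs = 64, ppq = 1;	/* assumed constants */
		int slot;

		slot = (td_pri - pri_min) / ppq;	/* offset within the timeshare range */
		slot = (slot + idx) % nqs;		/* rotate by the current insert index */
		if (ridx != idx && slot == ridx)	/* keep one slot of slack, as above */
			slot = (unsigned char)(slot - 1) % nqs;
		return (slot);
	}
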
 
+/* 
+ * Remove a thread from a run-queue.  This typically happens when a thread
+ * is selected to run.  Running threads are not on the queue and the
+ * transferable count does not reflect them.
+ */
 static __inline void
-kseq_runq_rem(struct kseq *kseq, struct kse *ke)
+tdq_runq_rem(struct tdq *tdq, struct td_sched *ts)
 {
+	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
+	KASSERT(ts->ts_runq != NULL,
+	    ("tdq_runq_remove: thread %p null ts_runq", ts->ts_thread));
 #ifdef SMP
-	if (ke->ke_flags & KEF_XFERABLE) {
-		kseq->ksq_transferable--;
-		kseq->ksq_group->ksg_transferable--;
-		ke->ke_flags &= ~KEF_XFERABLE;
+	if (ts->ts_flags & TSF_XFERABLE) {
+		tdq->tdq_transferable--;
+		tdq->tdq_group->tdg_transferable--;
+		ts->ts_flags &= ~TSF_XFERABLE;
 	}
 #endif
-	runq_remove(ke->ke_runq, ke);
+	if (ts->ts_runq == &tdq->tdq_timeshare) {
+		if (tdq->tdq_idx != tdq->tdq_ridx)
+			runq_remove_idx(ts->ts_runq, ts, &tdq->tdq_ridx);
+		else
+			runq_remove_idx(ts->ts_runq, ts, NULL);
+		/*
+		 * For timeshare threads we update the priority here so
+		 * the priority reflects the time we've been sleeping.
+		 */
+		ts->ts_ltick = ticks;
+		sched_pctcpu_update(ts);
+		sched_priority(ts->ts_thread);
+	} else
+		runq_remove(ts->ts_runq, ts);
 }
 
+/*
+ * Load is maintained for all threads RUNNING and ON_RUNQ.  Add the load
+ * for this thread to the referenced thread queue.
+ */
 static void
-kseq_load_add(struct kseq *kseq, struct kse *ke)
+tdq_load_add(struct tdq *tdq, struct td_sched *ts)
 {
 	int class;
-	mtx_assert(&sched_lock, MA_OWNED);
-	class = PRI_BASE(ke->ke_ksegrp->kg_pri_class);
-	if (class == PRI_TIMESHARE)
-		kseq->ksq_load_timeshare++;
-	kseq->ksq_load++;
-	CTR1(KTR_SCHED, "load: %d", kseq->ksq_load);
-	if (class != PRI_ITHD && (ke->ke_proc->p_flag & P_NOLOAD) == 0)
+
+	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
+	THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED);
+	class = PRI_BASE(ts->ts_thread->td_pri_class);
+	tdq->tdq_load++;
+	CTR2(KTR_SCHED, "cpu %d load: %d", TDQ_ID(tdq), tdq->tdq_load);
+	if (class != PRI_ITHD &&
+	    (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0)
 #ifdef SMP
-		kseq->ksq_group->ksg_load++;
+		tdq->tdq_group->tdg_load++;
 #else
-		kseq->ksq_sysload++;
+		tdq->tdq_sysload++;
 #endif
-	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
-		kseq_nice_add(kseq, ke->ke_proc->p_nice);
 }
 
+/*
+ * Remove the load from a thread that is transitioning to a sleep state or
+ * exiting.
+ */
 static void
-kseq_load_rem(struct kseq *kseq, struct kse *ke)
+tdq_load_rem(struct tdq *tdq, struct td_sched *ts)
 {
 	int class;
-	mtx_assert(&sched_lock, MA_OWNED);
-	class = PRI_BASE(ke->ke_ksegrp->kg_pri_class);
-	if (class == PRI_TIMESHARE)
-		kseq->ksq_load_timeshare--;
-	if (class != PRI_ITHD  && (ke->ke_proc->p_flag & P_NOLOAD) == 0)
+
+	THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED);
+	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
+	class = PRI_BASE(ts->ts_thread->td_pri_class);
+	if (class != PRI_ITHD &&
+	    (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0)
 #ifdef SMP
-		kseq->ksq_group->ksg_load--;
+		tdq->tdq_group->tdg_load--;
 #else
-		kseq->ksq_sysload--;
+		tdq->tdq_sysload--;
 #endif
-	kseq->ksq_load--;
-	CTR1(KTR_SCHED, "load: %d", kseq->ksq_load);
-	ke->ke_runq = NULL;
-	if (ke->ke_ksegrp->kg_pri_class == PRI_TIMESHARE)
-		kseq_nice_rem(kseq, ke->ke_proc->p_nice);
-}
-
-static void
-kseq_nice_add(struct kseq *kseq, int nice)
-{
-	mtx_assert(&sched_lock, MA_OWNED);
-	/* Normalize to zero. */
-	kseq->ksq_nice[nice + SCHED_PRI_NHALF]++;
-	if (nice < kseq->ksq_nicemin || kseq->ksq_load_timeshare == 1)
-		kseq->ksq_nicemin = nice;
-}
-
-static void
-kseq_nice_rem(struct kseq *kseq, int nice) 
-{
-	int n;
-
-	mtx_assert(&sched_lock, MA_OWNED);
-	/* Normalize to zero. */
-	n = nice + SCHED_PRI_NHALF;
-	kseq->ksq_nice[n]--;
-	KASSERT(kseq->ksq_nice[n] >= 0, ("Negative nice count."));
-
-	/*
-	 * If this wasn't the smallest nice value or there are more in
-	 * this bucket we can just return.  Otherwise we have to recalculate
-	 * the smallest nice.
-	 */
-	if (nice != kseq->ksq_nicemin ||
-	    kseq->ksq_nice[n] != 0 ||
-	    kseq->ksq_load_timeshare == 0)
-		return;
-
-	for (; n < SCHED_PRI_NRESV; n++)
-		if (kseq->ksq_nice[n]) {
-			kseq->ksq_nicemin = n - SCHED_PRI_NHALF;
-			return;
-		}
+	KASSERT(tdq->tdq_load != 0,
+	    ("tdq_load_rem: Removing with 0 load on queue %d", TDQ_ID(tdq)));
+	tdq->tdq_load--;
+	CTR1(KTR_SCHED, "load: %d", tdq->tdq_load);
+	ts->ts_runq = NULL;
 }
 
 #ifdef SMP
@@ -459,83 +513,131 @@
  * installations will only have 2 cpus.  Secondly, load balancing too much at
  * once can have an unpleasant effect on the system.  The scheduler rarely has
  * enough information to make perfect decisions.  So this algorithm chooses
- * algorithm simplicity and more gradual effects on load in larger systems.
- *
- * It could be improved by considering the priorities and slices assigned to
- * each task prior to balancing them.  There are many pathological cases with
- * any approach and so the semi random algorithm below may work as well as any.
+ * simplicity and more gradual effects on load in larger systems.
  *
  */
 static void
-sched_balance(void)
+sched_balance()
 {
-	struct kseq_group *high;
-	struct kseq_group *low;
-	struct kseq_group *ksg;
+	struct tdq_group *high;
+	struct tdq_group *low;
+	struct tdq_group *tdg;
+	struct tdq *tdq;
 	int cnt;
 	int i;
 
-	bal_tick = ticks + (random() % (hz * 2));
-	if (smp_started == 0)
+	/*
+	 * Select a random time between .5 * balance_interval and
+	 * 1.5 * balance_interval.
+	 */
+	balance_ticks = max(balance_interval / 2, 1);
+	balance_ticks += random() % balance_interval;
+	if (smp_started == 0 || rebalance == 0)
 		return;
+	tdq = TDQ_SELF();
+	TDQ_UNLOCK(tdq);
 	low = high = NULL;
-	i = random() % (ksg_maxid + 1);
-	for (cnt = 0; cnt <= ksg_maxid; cnt++) {
-		ksg = KSEQ_GROUP(i);
+	i = random() % (tdg_maxid + 1);
+	for (cnt = 0; cnt <= tdg_maxid; cnt++) {
+		tdg = TDQ_GROUP(i);
 		/*
 		 * Find the CPU with the highest load that has some
 		 * threads to transfer.
 		 */
-		if ((high == NULL || ksg->ksg_load > high->ksg_load)
-		    && ksg->ksg_transferable)
-			high = ksg;
-		if (low == NULL || ksg->ksg_load < low->ksg_load)
-			low = ksg;
-		if (++i > ksg_maxid)
+		if ((high == NULL || tdg->tdg_load > high->tdg_load)
+		    && tdg->tdg_transferable)
+			high = tdg;
+		if (low == NULL || tdg->tdg_load < low->tdg_load)
+			low = tdg;
+		if (++i > tdg_maxid)
 			i = 0;
 	}
 	if (low != NULL && high != NULL && high != low)
-		sched_balance_pair(LIST_FIRST(&high->ksg_members),
-		    LIST_FIRST(&low->ksg_members));
+		sched_balance_pair(LIST_FIRST(&high->tdg_members),
+		    LIST_FIRST(&low->tdg_members));
+	TDQ_LOCK(tdq);
 }
 
+/*
+ * Balance load between CPUs in a group.  Will only migrate within the group.
+ */
 static void
-sched_balance_groups(void)
+sched_balance_groups()
 {
+	struct tdq *tdq;
 	int i;
 
-	gbal_tick = ticks + (random() % (hz * 2));
-	mtx_assert(&sched_lock, MA_OWNED);
-	if (smp_started)
-		for (i = 0; i <= ksg_maxid; i++)
-			sched_balance_group(KSEQ_GROUP(i));
+	/*
+	 * Select a random time between .5 * balance_interval and
+	 * 1.5 * balance_interval.
+	 */
+	balance_group_ticks = max(balance_interval / 2, 1);
+	balance_group_ticks += random() % balance_interval;
+	if (smp_started == 0 || rebalance == 0)
+		return;
+	tdq = TDQ_SELF();
+	TDQ_UNLOCK(tdq);
+	for (i = 0; i <= tdg_maxid; i++)
+		sched_balance_group(TDQ_GROUP(i));
+	TDQ_LOCK(tdq);
 }
 
+/*
+ * Finds the greatest imbalance between two tdqs in a group.
+ */
 static void
-sched_balance_group(struct kseq_group *ksg)
+sched_balance_group(struct tdq_group *tdg)
 {
-	struct kseq *kseq;
-	struct kseq *high;
-	struct kseq *low;
+	struct tdq *tdq;
+	struct tdq *high;
+	struct tdq *low;
 	int load;
 
-	if (ksg->ksg_transferable == 0)
+	if (tdg->tdg_transferable == 0)
 		return;
 	low = NULL;
 	high = NULL;
-	LIST_FOREACH(kseq, &ksg->ksg_members, ksq_siblings) {
-		load = kseq->ksq_load;
-		if (high == NULL || load > high->ksq_load)
-			high = kseq;
-		if (low == NULL || load < low->ksq_load)
-			low = kseq;
+	LIST_FOREACH(tdq, &tdg->tdg_members, tdq_siblings) {
+		load = tdq->tdq_load;
+		if (high == NULL || load > high->tdq_load)
+			high = tdq;
+		if (low == NULL || load < low->tdq_load)
+			low = tdq;
 	}
 	if (high != NULL && low != NULL && high != low)
 		sched_balance_pair(high, low);
 }
 
+/*
+ * Lock two thread queues using their address to maintain lock order.
+ */
+static void
+tdq_lock_pair(struct tdq *one, struct tdq *two)
+{
+	if (one < two) {
+		TDQ_LOCK(one);
+		TDQ_LOCK_FLAGS(two, MTX_DUPOK);
+	} else {
+		TDQ_LOCK(two);
+		TDQ_LOCK_FLAGS(one, MTX_DUPOK);
+	}
+}
+
+/*
+ * Unlock two thread queues.  Order is not important here.
+ */
+static void
+tdq_unlock_pair(struct tdq *one, struct tdq *two)
+{
+	TDQ_UNLOCK(one);
+	TDQ_UNLOCK(two);
+}
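
The point of ordering on the queue address, rather than on which queue is "ours", is that two CPUs may balance against each other at the same time.  A usage sketch (the CPU numbers are illustrative):

	/* Two CPUs balancing against each other concurrently: */
	tdq_lock_pair(TDQ_CPU(0), TDQ_CPU(1));	/* CPU 0: lower address locked first */
	tdq_lock_pair(TDQ_CPU(1), TDQ_CPU(0));	/* CPU 1: same global order, no deadlock */
	/* Both acquire TDQ_CPU(0)'s lock before TDQ_CPU(1)'s. */
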
+
+/*
+ * Transfer load between two imbalanced thread queues.
+ */
 static void
-sched_balance_pair(struct kseq *high, struct kseq *low)
+sched_balance_pair(struct tdq *high, struct tdq *low)
 {
 	int transferable;
 	int high_load;
@@ -544,186 +646,268 @@
 	int diff;
 	int i;
 
+	tdq_lock_pair(high, low);
 	/*
 	 * If we're transfering within a group we have to use this specific
-	 * kseq's transferable count, otherwise we can steal from other members
+	 * tdq's transferable count, otherwise we can steal from other members
 	 * of the group.
 	 */
-	if (high->ksq_group == low->ksq_group) {
-		transferable = high->ksq_transferable;
-		high_load = high->ksq_load;
-		low_load = low->ksq_load;
+	if (high->tdq_group == low->tdq_group) {
+		transferable = high->tdq_transferable;
+		high_load = high->tdq_load;
+		low_load = low->tdq_load;
 	} else {
-		transferable = high->ksq_group->ksg_transferable;
-		high_load = high->ksq_group->ksg_load;
-		low_load = low->ksq_group->ksg_load;
+		transferable = high->tdq_group->tdg_transferable;
+		high_load = high->tdq_group->tdg_load;
+		low_load = low->tdq_group->tdg_load;
 	}
-	if (transferable == 0)
-		return;
 	/*
 	 * Determine what the imbalance is and then adjust that to how many
-	 * kses we actually have to give up (transferable).
+	 * threads we actually have to give up (transferable).
 	 */
-	diff = high_load - low_load;
-	move = diff / 2;
-	if (diff & 0x1)
-		move++;
-	move = min(move, transferable);
-	for (i = 0; i < move; i++)
-		kseq_move(high, KSEQ_ID(low));
+	if (transferable != 0) {
+		diff = high_load - low_load;
+		move = diff / 2;
+		if (diff & 0x1)
+			move++;
+		move = min(move, transferable);
+		for (i = 0; i < move; i++)
+			tdq_move(high, low);
+		/*
+		 * IPI the target cpu to force it to reschedule with the new
+		 * workload.
+		 */
+		ipi_selected(1 << TDQ_ID(low), IPI_PREEMPT);
+	}
+	tdq_unlock_pair(high, low);
 	return;
 }
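
A small sketch of the migration count sched_balance_pair() settles on: half the imbalance, rounded up, and never more than the transferable count.  The example loads in the final comment are made up:

	/* Sketch: how many threads an imbalanced pair would exchange. */
	static int
	example_move_count(int high_load, int low_load, int transferable)
	{
		int diff, move;

		diff = high_load - low_load;
		move = diff / 2;
		if (diff & 0x1)			/* round the imbalance up */
			move++;
		if (move > transferable)	/* only unpinned threads can move */
			move = transferable;
		return (move);			/* e.g. loads 7 vs 2, 3 transferable -> 3 */
	}
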
 
+/*
+ * Move a thread from one thread queue to another.
+ */
 static void
-kseq_move(struct kseq *from, int cpu)
+tdq_move(struct tdq *from, struct tdq *to)
 {
-	struct kseq *kseq;
-	struct kseq *to;
-	struct kse *ke;
-
-	kseq = from;
-	to = KSEQ_CPU(cpu);
-	ke = kseq_steal(kseq, 1);
-	if (ke == NULL) {
-		struct kseq_group *ksg;
-
-		ksg = kseq->ksq_group;
-		LIST_FOREACH(kseq, &ksg->ksg_members, ksq_siblings) {
-			if (kseq == from || kseq->ksq_transferable == 0)
+	struct td_sched *ts;
+	struct thread *td;
+	struct tdq *tdq;
+	int cpu;
+
+	TDQ_LOCK_ASSERT(from, MA_OWNED);
+	TDQ_LOCK_ASSERT(to, MA_OWNED);
+
+	tdq = from;
+	cpu = TDQ_ID(to);
+	ts = tdq_steal(tdq);
+	if (ts == NULL) {
+		struct tdq_group *tdg;
+
+		tdg = tdq->tdq_group;
+		LIST_FOREACH(tdq, &tdg->tdg_members, tdq_siblings) {
+			if (tdq == from || tdq->tdq_transferable == 0)
 				continue;
-			ke = kseq_steal(kseq, 1);
+			ts = tdq_steal(tdq);
 			break;
 		}
-		if (ke == NULL)
-			panic("kseq_move: No KSEs available with a "
-			    "transferable count of %d\n", 
-			    ksg->ksg_transferable);
-	}
-	if (kseq == to)
-		return;
-	ke->ke_state = KES_THREAD;
-	kseq_runq_rem(kseq, ke);
-	kseq_load_rem(kseq, ke);
-	kseq_notify(ke, cpu);
+		if (ts == NULL)
+			return;
+	}
+	if (tdq == to)
+		return;
+	td = ts->ts_thread;
+	/*
+	 * Although the run queue is locked the thread may be blocked.  Lock
+	 * it to clear this and acquire the run-queue lock.
+	 */
+	thread_lock(td);
+	/* Drop recursive lock on from acquired via thread_lock(). */
+	TDQ_UNLOCK(from);
+	sched_rem(td);
+	ts->ts_cpu = cpu;
+	td->td_lock = TDQ_LOCKPTR(to);
+	tdq_add(to, td, SRQ_YIELDING);
 }
 
+/*
+ * This tdq has idled.  Try to steal a thread from another cpu and switch
+ * to it.
+ */
 static int
-kseq_idled(struct kseq *kseq)
+tdq_idled(struct tdq *tdq)
 {
-	struct kseq_group *ksg;
-	struct kseq *steal;
-	struct kse *ke;
+	struct tdq_group *tdg;
+	struct tdq *steal;
+	int highload;
+	int highcpu;
+	int cpu;
 
-	ksg = kseq->ksq_group;
+	if (smp_started == 0 || steal_idle == 0)
+		return (1);
+	/* We don't want to be preempted while we're iterating over tdqs */
+	spinlock_enter();
+	tdg = tdq->tdq_group;
+	/*
+	 * If we're in a cpu group, try and steal threads from another cpu in
+	 * the group before idling.  In a HTT group all cpus share the same
+	 * run-queue lock, however, we still need a recursive lock to
+	 * call tdq_move().
+	 */
+	if (steal_htt && tdg->tdg_cpus > 1 && tdg->tdg_transferable) {
+		TDQ_LOCK(tdq);
+		LIST_FOREACH(steal, &tdg->tdg_members, tdq_siblings) {
+			if (steal == tdq || steal->tdq_transferable == 0)
+				continue;
+			TDQ_LOCK(steal);
+			goto steal;
+		}
+		TDQ_UNLOCK(tdq);
+	}
 	/*
-	 * If we're in a cpu group, try and steal kses from another cpu in
-	 * the group before idling.
+	 * Find the most loaded CPU with a transferable thread and attempt
+	 * to steal it.  We make a lockless pass and then verify that the
+	 * thread is still available after locking.
 	 */
-	if (ksg->ksg_cpus > 1 && ksg->ksg_transferable) {
-		LIST_FOREACH(steal, &ksg->ksg_members, ksq_siblings) {
-			if (steal == kseq || steal->ksq_transferable == 0)
+	for (;;) {
+		highcpu = 0;
+		highload = 0;
+		for (cpu = 0; cpu <= mp_maxid; cpu++) {
+			if (CPU_ABSENT(cpu))
 				continue;
-			ke = kseq_steal(steal, 0);
-			if (ke == NULL)
+			steal = TDQ_CPU(cpu);
+			if (steal->tdq_transferable == 0)
 				continue;
-			ke->ke_state = KES_THREAD;
-			kseq_runq_rem(steal, ke);
-			kseq_load_rem(steal, ke);
-			ke->ke_cpu = PCPU_GET(cpuid);
-			ke->ke_flags |= KEF_INTERNAL | KEF_HOLD;
-			sched_add(ke->ke_thread, SRQ_YIELDING);
-			return (0);
+			if (steal->tdq_load < highload)
+				continue;
+			highload = steal->tdq_load;
+			highcpu = cpu;
 		}
+		if (highload < steal_thresh)
+			break;
+		steal = TDQ_CPU(highcpu);
+		if (steal == tdq)
+			break;
+		tdq_lock_pair(tdq, steal);
+		if (steal->tdq_load >= steal_thresh && steal->tdq_transferable)
+			goto steal;
+		tdq_unlock_pair(tdq, steal);
 	}
-	/*
-	 * We only set the idled bit when all of the cpus in the group are
-	 * idle.  Otherwise we could get into a situation where a KSE bounces
-	 * back and forth between two idle cores on seperate physical CPUs.
-	 */
-	ksg->ksg_idlemask |= PCPU_GET(cpumask);
-	if (ksg->ksg_idlemask != ksg->ksg_cpumask)
-		return (1);
-	atomic_set_int(&kseq_idle, ksg->ksg_mask);
+	spinlock_exit();
 	return (1);
+steal:
+	spinlock_exit();
+	tdq_move(steal, tdq);
+	TDQ_UNLOCK(steal);
+	mi_switch(SW_VOL, NULL);
+	thread_unlock(curthread);
+
+	return (0);
 }
 
+/*
+ * Notify a remote cpu of new work.  Sends an IPI if criteria are met.
+ */
 static void
-kseq_assign(struct kseq *kseq)
+tdq_notify(struct td_sched *ts)
 {
-	struct kse *nke;
-	struct kse *ke;
+	struct thread *ctd;
+	struct pcpu *pcpu;
+	int cpri;
+	int pri;
+	int cpu;
 
-	do {
-		*(volatile struct kse **)&ke = kseq->ksq_assigned;
-	} while(!atomic_cmpset_ptr((volatile uintptr_t *)&kseq->ksq_assigned,
-		(uintptr_t)ke, (uintptr_t)NULL));
-	for (; ke != NULL; ke = nke) {
-		nke = ke->ke_assign;
-		kseq->ksq_group->ksg_load--;
-		kseq->ksq_load--;
-		ke->ke_flags &= ~KEF_ASSIGNED;
-		if (ke->ke_flags & KEF_REMOVED) {
-			ke->ke_flags &= ~KEF_REMOVED;
-			continue;
-		}
-		ke->ke_flags |= KEF_INTERNAL | KEF_HOLD;
-		sched_add(ke->ke_thread, SRQ_YIELDING);
-	}
+	cpu = ts->ts_cpu;
+	pri = ts->ts_thread->td_priority;
+	pcpu = pcpu_find(cpu);
+	ctd = pcpu->pc_curthread;
+	cpri = ctd->td_priority;
+
+	/*
+	 * If our priority is not better than the current priority there is
+	 * nothing to do.
+	 */
+	if (pri > cpri)
+		return;
+	/*
+	 * Always IPI idle.
+	 */
+	if (cpri > PRI_MIN_IDLE)
+		goto sendipi;
+	/*
+	 * If we're realtime or better and there is timeshare or worse running
+	 * send an IPI.
+	 */
+	if (pri < PRI_MAX_REALTIME && cpri > PRI_MAX_REALTIME)
+		goto sendipi;
+	/*
+	 * Otherwise only IPI if we exceed the threshold.
+	 */
+	if (pri > preempt_thresh)
+		return;
+sendipi:
+	ctd->td_flags |= TDF_NEEDRESCHED;
+	ipi_selected(1 << cpu, IPI_PREEMPT);
 }
 
-static void
-kseq_notify(struct kse *ke, int cpu)
+/*
+ * Steals load from a timeshare queue.  Honors the rotating queue head
+ * index.
+ */
+static struct td_sched *
+runq_steal_from(struct runq *rq, u_char start)
 {
-	struct kseq *kseq;
-	struct thread *td;
-	struct pcpu *pcpu;
-	int class;
-	int prio;
+	struct td_sched *ts;
+	struct rqbits *rqb;
+	struct rqhead *rqh;
+	int first;
+	int bit;
+	int pri;
+	int i;
 
-	kseq = KSEQ_CPU(cpu);
-	/* XXX */
-	class = PRI_BASE(ke->ke_ksegrp->kg_pri_class);
-	if ((class == PRI_TIMESHARE || class == PRI_REALTIME) &&
-	    (kseq_idle & kseq->ksq_group->ksg_mask)) 
-		atomic_clear_int(&kseq_idle, kseq->ksq_group->ksg_mask);
-	kseq->ksq_group->ksg_load++;
-	kseq->ksq_load++;
-	ke->ke_cpu = cpu;
-	ke->ke_flags |= KEF_ASSIGNED;
-	prio = ke->ke_thread->td_priority;
-
-	/*
-	 * Place a KSE on another cpu's queue and force a resched.
-	 */
-	do {
-		*(volatile struct kse **)&ke->ke_assign = kseq->ksq_assigned;
-	} while(!atomic_cmpset_ptr((volatile uintptr_t *)&kseq->ksq_assigned,
-		(uintptr_t)ke->ke_assign, (uintptr_t)ke));
-	/*
-	 * Without sched_lock we could lose a race where we set NEEDRESCHED
-	 * on a thread that is switched out before the IPI is delivered.  This
-	 * would lead us to miss the resched.  This will be a problem once
-	 * sched_lock is pushed down.
-	 */
-	pcpu = pcpu_find(cpu);
-	td = pcpu->pc_curthread;
-	if (ke->ke_thread->td_priority < td->td_priority ||
-	    td == pcpu->pc_idlethread) {
-		td->td_flags |= TDF_NEEDRESCHED;
-		ipi_selected(1 << cpu, IPI_AST);
+	rqb = &rq->rq_status;
+	bit = start & (RQB_BPW -1);
+	pri = 0;
+	first = 0;
+again:
+	for (i = RQB_WORD(start); i < RQB_LEN; bit = 0, i++) {
+		if (rqb->rqb_bits[i] == 0)
+			continue;
+		if (bit != 0) {
+			for (pri = bit; pri < RQB_BPW; pri++)
+				if (rqb->rqb_bits[i] & (1ul << pri))
+					break;
+			if (pri >= RQB_BPW)
+				continue;
+		} else
+			pri = RQB_FFS(rqb->rqb_bits[i]);
+		pri += (i << RQB_L2BPW);
+		rqh = &rq->rq_queues[pri];
+		TAILQ_FOREACH(ts, rqh, ts_procq) {
+			if (first && THREAD_CAN_MIGRATE(ts->ts_thread))
+				return (ts);
+			first = 1;
+		}
 	}
+	if (start != 0) {
+		start = 0;
+		goto again;
+	}
+
+	return (NULL);
 }
 
-static struct kse *
+/*
+ * Steals load from a standard linear queue.
+ */
+static struct td_sched *
 runq_steal(struct runq *rq)
 {
 	struct rqhead *rqh;
 	struct rqbits *rqb;
-	struct kse *ke;
+	struct td_sched *ts;
 	int word;
 	int bit;
 
-	mtx_assert(&sched_lock, MA_OWNED);
 	rqb = &rq->rq_status;
 	for (word = 0; word < RQB_LEN; word++) {
 		if (rqb->rqb_bits[word] == 0)
@@ -732,523 +916,717 @@
 			if ((rqb->rqb_bits[word] & (1ul << bit)) == 0)
 				continue;
 			rqh = &rq->rq_queues[bit + (word << RQB_L2BPW)];
-			TAILQ_FOREACH(ke, rqh, ke_procq) {
-				if (KSE_CAN_MIGRATE(ke))
-					return (ke);
-			}
+			TAILQ_FOREACH(ts, rqh, ts_procq)
+				if (THREAD_CAN_MIGRATE(ts->ts_thread))
+					return (ts);
 		}
 	}
 	return (NULL);
 }
 
-static struct kse *
-kseq_steal(struct kseq *kseq, int stealidle)
+/*
+ * Attempt to steal a thread in priority order from a thread queue.
+ */
+static struct td_sched *
+tdq_steal(struct tdq *tdq)
 {
-	struct kse *ke;
+	struct td_sched *ts;
 
-	/*
-	 * Steal from next first to try to get a non-interactive task that
-	 * may not have run for a while.
-	 */
-	if ((ke = runq_steal(kseq->ksq_next)) != NULL)
-		return (ke);
-	if ((ke = runq_steal(kseq->ksq_curr)) != NULL)
-		return (ke);
-	if (stealidle)
-		return (runq_steal(&kseq->ksq_idle));
-	return (NULL);
+	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
+	if ((ts = runq_steal(&tdq->tdq_realtime)) != NULL)
+		return (ts);
+	if ((ts = runq_steal_from(&tdq->tdq_timeshare, tdq->tdq_ridx)) != NULL)
+		return (ts);
+	return (runq_steal(&tdq->tdq_idle));
 }
 
-int
-kseq_transfer(struct kseq *kseq, struct kse *ke, int class)
+/*
+ * Sets the thread lock and ts_cpu to match the requested cpu.  Unlocks the
+ * current lock and returns with the assigned queue locked.
+ */
+static inline struct tdq *
+sched_setcpu(struct td_sched *ts, int cpu, int flags)
 {
-	struct kseq_group *nksg;
-	struct kseq_group *ksg;
-	struct kseq *old;
-	int cpu;
-	int idx;
+	struct thread *td;
+	struct tdq *tdq;
 
-	if (smp_started == 0)
-		return (0);
-	cpu = 0;
-	/*
-	 * If our load exceeds a certain threshold we should attempt to
-	 * reassign this thread.  The first candidate is the cpu that
-	 * originally ran the thread.  If it is idle, assign it there, 
-	 * otherwise, pick an idle cpu.
-	 *
-	 * The threshold at which we start to reassign kses has a large impact
-	 * on the overall performance of the system.  Tuned too high and
-	 * some CPUs may idle.  Too low and there will be excess migration
-	 * and context switches.
-	 */
-	old = KSEQ_CPU(ke->ke_cpu);
-	nksg = old->ksq_group;
-	ksg = kseq->ksq_group;
-	if (kseq_idle) {
-		if (kseq_idle & nksg->ksg_mask) {
-			cpu = ffs(nksg->ksg_idlemask);
-			if (cpu) {
-				CTR2(KTR_SCHED,
-				    "kseq_transfer: %p found old cpu %X " 
-				    "in idlemask.", ke, cpu);
-				goto migrate;
-			}
-		}
-		/*
-		 * Multiple cpus could find this bit simultaneously
-		 * but the race shouldn't be terrible.
-		 */
-		cpu = ffs(kseq_idle);
-		if (cpu) {
-			CTR2(KTR_SCHED, "kseq_transfer: %p found %X " 
-			    "in idlemask.", ke, cpu);
-			goto migrate;
-		}
-	}
-	idx = 0;
-#if 0
-	if (old->ksq_load < kseq->ksq_load) {
-		cpu = ke->ke_cpu + 1;
-		CTR2(KTR_SCHED, "kseq_transfer: %p old cpu %X " 
-		    "load less than ours.", ke, cpu);
-		goto migrate;
-	}
-	/*
-	 * No new CPU was found, look for one with less load.
-	 */
-	for (idx = 0; idx <= ksg_maxid; idx++) {
-		nksg = KSEQ_GROUP(idx);
-		if (nksg->ksg_load /*+ (nksg->ksg_cpus  * 2)*/ < ksg->ksg_load) {
-			cpu = ffs(nksg->ksg_cpumask);
-			CTR2(KTR_SCHED, "kseq_transfer: %p cpu %X load less " 
-			    "than ours.", ke, cpu);
-			goto migrate;
-		}
-	}
-#endif
+	THREAD_LOCK_ASSERT(ts->ts_thread, MA_OWNED);
+
+	tdq = TDQ_CPU(cpu);
+	td = ts->ts_thread;
+	ts->ts_cpu = cpu;
+
+	/* If the lock matches just return the queue. */
+	if (td->td_lock == TDQ_LOCKPTR(tdq))
+		return (tdq);
+#ifdef notyet
 	/*
-	 * If another cpu in this group has idled, assign a thread over
-	 * to them after checking to see if there are idled groups.
+	 * If the thread isn't running its lockptr is a
+	 * turnstile or a sleepqueue.  We can just lock_set without
+	 * blocking.
 	 */
-	if (ksg->ksg_idlemask) {
-		cpu = ffs(ksg->ksg_idlemask);
-		if (cpu) {
-			CTR2(KTR_SCHED, "kseq_transfer: %p cpu %X idle in " 
-			    "group.", ke, cpu);
-			goto migrate;
-		}
+	if (TD_CAN_RUN(td)) {
+		TDQ_LOCK(tdq);
+		thread_lock_set(td, TDQ_LOCKPTR(tdq));
+		return (tdq);
 	}
-	return (0);
-migrate:
+#endif
 	/*
-	 * Now that we've found an idle CPU, migrate the thread.
+	 * The hard case, migration, we need to block the thread first to
+	 * prevent order reversals with other cpus locks.
 	 */
-	cpu--;
-	ke->ke_runq = NULL;
-	kseq_notify(ke, cpu);
-
-	return (1);
+	thread_lock_block(td);
+	TDQ_LOCK(tdq);
+	thread_lock_unblock(td, TDQ_LOCKPTR(tdq));
+	return (tdq);
 }
 
-#endif	/* SMP */
-
 /*
- * Pick the highest priority task we have and return it.
+ * Find the thread queue running the lowest priority thread.
  */
-
-static struct kse *
-kseq_choose(struct kseq *kseq)
+static int
+tdq_lowestpri(void)
 {
-	struct runq *swap;
-	struct kse *ke;
-	int nice;
-
-	mtx_assert(&sched_lock, MA_OWNED);
-	swap = NULL;
+	struct tdq *tdq;
+	int lowpri;
+	int lowcpu;
+	int lowload;
+	int load;
+	int cpu;
+	int pri;
 
-	for (;;) {
-		ke = runq_choose(kseq->ksq_curr);
-		if (ke == NULL) {
-			/*
-			 * We already swapped once and didn't get anywhere.
-			 */
-			if (swap)
-				break;
-			swap = kseq->ksq_curr;
-			kseq->ksq_curr = kseq->ksq_next;
-			kseq->ksq_next = swap;
+	lowload = 0;
+	lowpri = lowcpu = 0;
+	for (cpu = 0; cpu <= mp_maxid; cpu++) {
+		if (CPU_ABSENT(cpu))
 			continue;
-		}
-		/*
-		 * If we encounter a slice of 0 the kse is in a
-		 * TIMESHARE kse group and its nice was too far out
-		 * of the range that receives slices. 
-		 */
-		nice = ke->ke_proc->p_nice + (0 - kseq->ksq_nicemin);
-#if 0
-		if (ke->ke_slice == 0 || (nice > SCHED_SLICE_NTHRESH &&
-		    ke->ke_proc->p_nice != 0)) {
-			runq_remove(ke->ke_runq, ke);
-			sched_slice(ke);
-			ke->ke_runq = kseq->ksq_next;
-			runq_add(ke->ke_runq, ke, 0);
+		tdq = TDQ_CPU(cpu);
+		pri = tdq->tdq_lowpri;
+		load = TDQ_CPU(cpu)->tdq_load;
+		CTR4(KTR_ULE,
+		    "cpu %d pri %d lowcpu %d lowpri %d",
+		    cpu, pri, lowcpu, lowpri);
+		if (pri < lowpri)
 			continue;
-		}
-#endif
-		return (ke);
+		if (lowpri && lowpri == pri && load > lowload)
+			continue;
+		lowpri = pri;
+		lowcpu = cpu;
+		lowload = load;
 	}
 
-	return (runq_choose(&kseq->ksq_idle));
+	return (lowcpu);
 }
 
-static void
-kseq_setup(struct kseq *kseq)
-{
-	runq_init(&kseq->ksq_timeshare[0]);
-	runq_init(&kseq->ksq_timeshare[1]);
-	runq_init(&kseq->ksq_idle);
-	kseq->ksq_curr = &kseq->ksq_timeshare[0];
-	kseq->ksq_next = &kseq->ksq_timeshare[1];
-	kseq->ksq_load = 0;
-	kseq->ksq_load_timeshare = 0;
+/*
+ * Find the thread queue with the least load.
+ */
+static int
+tdq_lowestload(void)
+{
+	struct tdq *tdq;
+	int lowload;
+	int lowpri;
+	int lowcpu;
+	int load;
+	int cpu;
+	int pri;
+
+	lowcpu = 0;
+	lowload = TDQ_CPU(0)->tdq_load;
+	lowpri = TDQ_CPU(0)->tdq_lowpri;
+	for (cpu = 1; cpu <= mp_maxid; cpu++) {
+		if (CPU_ABSENT(cpu))
+			continue;
+		tdq = TDQ_CPU(cpu);
+		load = tdq->tdq_load;
+		pri = tdq->tdq_lowpri;
+		CTR4(KTR_ULE, "cpu %d load %d lowcpu %d lowload %d",
+		    cpu, load, lowcpu, lowload);
+		if (load > lowload)
+			continue;
+		if (load == lowload && pri < lowpri)
+			continue;
+		lowcpu = cpu;
+		lowload = load;
+		lowpri = pri;
+	}
+
+	return (lowcpu);
+}
+
+/*
+ * Pick the destination cpu for sched_add().  Respects affinity and makes
+ * a determination based on load or priority of available processors.
+ */
+static int
+sched_pickcpu(struct td_sched *ts, int flags)
+{
+	struct tdq *tdq;
+	int self;
+	int pri;
+	int cpu;
+
+	cpu = self = PCPU_GET(cpuid);
+	if (smp_started == 0)
+		return (self);
+	/*
+	 * Don't migrate a running thread from sched_switch().
+	 */
+	if (flags & SRQ_OURSELF) {
+		CTR1(KTR_ULE, "YIELDING %d",
+		    curthread->td_priority);
+		return (self);
+	}
+	pri = ts->ts_thread->td_priority;
+	cpu = ts->ts_cpu;
+	/*
+	 * Regardless of affinity, if the last cpu is idle send it there.
+	 */
+	tdq = TDQ_CPU(cpu);
+	if (tdq->tdq_lowpri > PRI_MIN_IDLE) {
+		CTR5(KTR_ULE,
+		    "ts_cpu %d idle, ltick %d ticks %d pri %d curthread %d",
+		    ts->ts_cpu, ts->ts_rltick, ticks, pri,
+		    tdq->tdq_lowpri);
+		return (ts->ts_cpu);
+	}
+	/*
+	 * If we have affinity, try to place it on the cpu we last ran on.
+	 */
+	if (SCHED_AFFINITY(ts) && tdq->tdq_lowpri > pri) {
+		CTR5(KTR_ULE,
+		    "affinity for %d, ltick %d ticks %d pri %d curthread %d",
+		    ts->ts_cpu, ts->ts_rltick, ticks, pri,
+		    tdq->tdq_lowpri);
+		return (ts->ts_cpu);
+	}
+	/*
+	 * Look for an idle group.
+	 */
+	CTR1(KTR_ULE, "tdq_idle %X", tdq_idle);
+	cpu = ffs(tdq_idle);
+	if (cpu)
+		return (--cpu);
+	/*
+	 * If there are no idle cores see if we can run the thread locally.
+	 * This may improve locality among sleepers and wakers when there
+	 * is shared data.
+	 */
+	if (tryself && pri < curthread->td_priority) {
+		CTR1(KTR_ULE, "tryself %d",
+		    curthread->td_priority);
+		return (self);
+	}
+	/*
+ 	 * Now search for the cpu running the lowest priority thread with
+	 * the least load.
+	 */
+	if (pick_pri)
+		cpu = tdq_lowestpri();
+	else
+		cpu = tdq_lowestload();
+	return (cpu);
 }
 
+#endif	/* SMP */
+
+/*
+ * Pick the highest priority task we have and return it.
+ */
+static struct td_sched *
+tdq_choose(struct tdq *tdq)
+{
+	struct td_sched *ts;
+
+	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
+	ts = runq_choose(&tdq->tdq_realtime);
+	if (ts != NULL)
+		return (ts);
+	ts = runq_choose_from(&tdq->tdq_timeshare, tdq->tdq_ridx);
+	if (ts != NULL) {
+		KASSERT(ts->ts_thread->td_priority >= PRI_MIN_TIMESHARE,
+		    ("tdq_choose: Invalid priority on timeshare queue %d",
+		    ts->ts_thread->td_priority));
+		return (ts);
+	}
+
+	ts = runq_choose(&tdq->tdq_idle);
+	if (ts != NULL) {
+		KASSERT(ts->ts_thread->td_priority >= PRI_MIN_IDLE,
+		    ("tdq_choose: Invalid priority on idle queue %d",
+		    ts->ts_thread->td_priority));
+		return (ts);
+	}
+
+	return (NULL);
+}
+
+/*
+ * Initialize a thread queue.
+ */
 static void
-sched_setup(void *dummy)
+tdq_setup(struct tdq *tdq)
 {
-#ifdef SMP
-	int i;
-#endif
 
-	slice_min = (hz/100);	/* 10ms */
-	slice_max = (hz/7);	/* ~140ms */
+	if (bootverbose)
+		printf("ULE: setup cpu %d\n", TDQ_ID(tdq));
+	runq_init(&tdq->tdq_realtime);
+	runq_init(&tdq->tdq_timeshare);
+	runq_init(&tdq->tdq_idle);
+	tdq->tdq_load = 0;
+}
 
 #ifdef SMP
-	balance_groups = 0;
-	/*
-	 * Initialize the kseqs.
-	 */
-	for (i = 0; i < MAXCPU; i++) {
-		struct kseq *ksq;
+static void
+tdg_setup(struct tdq_group *tdg)
+{
+	if (bootverbose)
+		printf("ULE: setup cpu group %d\n", TDG_ID(tdg));
+	snprintf(tdg->tdg_name, sizeof(tdg->tdg_name),
+	    "sched lock %d", (int)TDG_ID(tdg));
+	mtx_init(&tdg->tdg_lock, tdg->tdg_name, "sched lock",
+	    MTX_SPIN | MTX_RECURSE);
+	LIST_INIT(&tdg->tdg_members);
+	tdg->tdg_load = 0;
+	tdg->tdg_transferable = 0;
+	tdg->tdg_cpus = 0;
+	tdg->tdg_mask = 0;
+	tdg->tdg_cpumask = 0;
+	tdg->tdg_idlemask = 0;
+}
 
-		ksq = &kseq_cpu[i];
-		ksq->ksq_assigned = NULL;
-		kseq_setup(&kseq_cpu[i]);
-	}
-	if (smp_topology == NULL) {
-		struct kseq_group *ksg;
-		struct kseq *ksq;
-		int cpus;
+static void
+tdg_add(struct tdq_group *tdg, struct tdq *tdq)
+{
+	if (tdg->tdg_mask == 0)
+		tdg->tdg_mask |= 1 << TDQ_ID(tdq);
+	tdg->tdg_cpumask |= 1 << TDQ_ID(tdq);
+	tdg->tdg_cpus++;
+	tdq->tdq_group = tdg;
+	tdq->tdq_lock = &tdg->tdg_lock;
+	LIST_INSERT_HEAD(&tdg->tdg_members, tdq, tdq_siblings);
+	if (bootverbose)
+		printf("ULE: adding cpu %d to group %d: cpus %d mask 0x%X\n",
+		    TDQ_ID(tdq), TDG_ID(tdg), tdg->tdg_cpus, tdg->tdg_cpumask);
+}
 
-		for (cpus = 0, i = 0; i < MAXCPU; i++) {
-			if (CPU_ABSENT(i))
-				continue;
-			ksq = &kseq_cpu[cpus];
-			ksg = &kseq_groups[cpus];
-			/*
-			 * Setup a kseq group with one member.
-			 */
-			ksq->ksq_transferable = 0;
-			ksq->ksq_group = ksg;
-			ksg->ksg_cpus = 1;
-			ksg->ksg_idlemask = 0;
-			ksg->ksg_cpumask = ksg->ksg_mask = 1 << i;
-			ksg->ksg_load = 0;
-			ksg->ksg_transferable = 0;
-			LIST_INIT(&ksg->ksg_members);
-			LIST_INSERT_HEAD(&ksg->ksg_members, ksq, ksq_siblings);
-			cpus++;
-		}
-		ksg_maxid = cpus - 1;
-	} else {
-		struct kseq_group *ksg;
-		struct cpu_group *cg;
-		int j;
-
-		for (i = 0; i < smp_topology->ct_count; i++) {
-			cg = &smp_topology->ct_group[i];
-			ksg = &kseq_groups[i];
-			/*
-			 * Initialize the group.
-			 */
-			ksg->ksg_idlemask = 0;
-			ksg->ksg_load = 0;
-			ksg->ksg_transferable = 0;
-			ksg->ksg_cpus = cg->cg_count;
-			ksg->ksg_cpumask = cg->cg_mask;
-			LIST_INIT(&ksg->ksg_members);
-			/*
-			 * Find all of the group members and add them.
-			 */
-			for (j = 0; j < MAXCPU; j++) {
-				if ((cg->cg_mask & (1 << j)) != 0) {
-					if (ksg->ksg_mask == 0)
-						ksg->ksg_mask = 1 << j;
-					kseq_cpu[j].ksq_transferable = 0;
-					kseq_cpu[j].ksq_group = ksg;
-					LIST_INSERT_HEAD(&ksg->ksg_members,
-					    &kseq_cpu[j], ksq_siblings);
-				}
+static void
+sched_setup_topology(void)
+{
+	struct tdq_group *tdg;
+	struct cpu_group *cg;
+	int balance_groups;
+	struct tdq *tdq;
+	int i;
+	int j;
+
+	topology = 1;
+	balance_groups = 0;
+	for (i = 0; i < smp_topology->ct_count; i++) {
+		cg = &smp_topology->ct_group[i];
+		tdg = &tdq_groups[i];
+		/*
+		 * Initialize the group.
+		 */
+		tdg_setup(tdg);
+		/*
+		 * Find all of the group members and add them.
+		 */
+		for (j = 0; j < MAXCPU; j++) { 
+			if ((cg->cg_mask & (1 << j)) != 0) {
+				tdq = TDQ_CPU(j);
+				tdq_setup(tdq);
+				tdg_add(tdg, tdq);
 			}
-			if (ksg->ksg_cpus > 1)
-				balance_groups = 1;
 		}
-		ksg_maxid = smp_topology->ct_count - 1;
+		if (tdg->tdg_cpus > 1)
+			balance_groups = 1;
 	}
+	tdg_maxid = smp_topology->ct_count - 1;
+	if (balance_groups)
+		sched_balance_groups();
+}
+
+static void
+sched_setup_smp(void)
+{
+	struct tdq_group *tdg;
+	struct tdq *tdq;
+	int cpus;
+	int i;
+
+	for (cpus = 0, i = 0; i < MAXCPU; i++) {
+		if (CPU_ABSENT(i))
+			continue;
+		tdq = &tdq_cpu[i];
+		tdg = &tdq_groups[i];
+		/*
+		 * Setup a tdq group with one member.
+		 */
+		tdg_setup(tdg);
+		tdq_setup(tdq);
+		tdg_add(tdg, tdq);
+		cpus++;
+	}
+	tdg_maxid = cpus - 1;
+}
+
+/*
+ * Fake a topology with one group containing all CPUs.
+ */
+static void
+sched_fake_topo(void)
+{
+#ifdef SCHED_FAKE_TOPOLOGY
+	static struct cpu_top top;
+	static struct cpu_group group;
+
+	top.ct_count = 1;
+	top.ct_group = &group;
+	group.cg_mask = all_cpus;
+	group.cg_count = mp_ncpus;
+	group.cg_children = 0;
+	smp_topology = &top;
+#endif
+}
+#endif
+
+/*
+ * Set up the thread queues and initialize the topology based on MD
+ * information.
+ */
+static void
+sched_setup(void *dummy)
+{
+	struct tdq *tdq;
+
+	tdq = TDQ_SELF();
+#ifdef SMP
+	sched_fake_topo();
 	/*
-	 * Stagger the group and global load balancer so they do not
-	 * interfere with each other.
+	 * Set up tdqs based on a topology configuration or vanilla SMP based
+	 * on mp_maxid.
 	 */
-	bal_tick = ticks + hz;
-	if (balance_groups)
-		gbal_tick = ticks + (hz / 2);
+	if (smp_topology == NULL)
+		sched_setup_smp();
+	else 
+		sched_setup_topology();
+	balance_tdq = tdq;
+	sched_balance();
 #else
-	kseq_setup(KSEQ_SELF());
+	tdq_setup(tdq);
+	mtx_init(&tdq_lock, "sched lock", "sched lock", MTX_SPIN | MTX_RECURSE);
+	tdq->tdq_lock = &tdq_lock;
 #endif
-	mtx_lock_spin(&sched_lock);
-	kseq_load_add(KSEQ_SELF(), &kse0);
-	mtx_unlock_spin(&sched_lock);
+	/*
+	 * To avoid divide-by-zero, we set realstathz to a dummy value
+	 * in case sched_clock() is called before sched_initticks().
+	 */
+	realstathz = hz;
+	sched_slice = (realstathz/10);	/* ~100ms */
+	tickincr = 1 << SCHED_TICK_SHIFT;
+
+	/* Add thread0's load since it's running. */
+	TDQ_LOCK(tdq);
+	thread0.td_lock = TDQ_LOCKPTR(TDQ_SELF());
+	tdq_load_add(tdq, &td_sched0);
+	TDQ_UNLOCK(tdq);
 }
 
 /*
- * Scale the scheduling priority according to the "interactivity" of this
- * process.
+ * This routine determines the tickincr after stathz and hz are setup.
  */
+/* ARGSUSED */
 static void
-sched_priority(struct ksegrp *kg)
+sched_initticks(void *dummy)
 {
-	int pri;
+	int incr;
+
+	realstathz = stathz ? stathz : hz;
+	sched_slice = (realstathz/10);	/* ~100ms */
+
+	/*
+	 * tickincr is shifted out by 10 to avoid rounding errors due to
+	 * hz not being evenly divisible by stathz on all platforms.
+	 */
+	incr = (hz << SCHED_TICK_SHIFT) / realstathz;
+	/*
+	 * This does not work for values of stathz that are more than
+	 * 1 << SCHED_TICK_SHIFT * hz.  In practice this does not happen.
+	 */
+	if (incr == 0)
+		incr = 1;
+	tickincr = incr;
+#ifdef SMP
+	/*
+	 * Set the default balance interval now that we know
+	 * what realstathz is.
+	 */
+	balance_interval = realstathz;
+	/*
+	 * Set steal_thresh to log2(mp_ncpus) but no greater than 4.  This
+	 * prevents excess thrashing on large machines and excess idle on
+	 * smaller machines.
+	 */
+	steal_thresh = min(ffs(mp_ncpus) - 1, 4);
+	affinity = SCHED_AFFINITY_DEFAULT;
+#endif
+}
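A worked illustration of the fixed-point setup in sched_initticks(), assuming SCHED_TICK_SHIFT is 10 and a typical hz = 1000, stathz = 128 configuration (all numbers are illustrative, not taken from this file):

	/*
	 * hz = 1000, stathz = 128, SCHED_TICK_SHIFT = 10 (assumed):
	 *   sched_slice = 128 / 10           = 12 stathz ticks (~94ms)
	 *   tickincr    = (1000 << 10) / 128 = 8000
	 * Each stathz tick therefore credits 8000/1024 ~= 7.8 "hz ticks"
	 * of run time to ts_runtime without losing the fractional part.
	 */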
 
-	if (kg->kg_pri_class != PRI_TIMESHARE)
-		return;
 
-	pri = SCHED_PRI_INTERACT(sched_interact_score(kg));
-	pri += SCHED_PRI_BASE;
-	pri += kg->kg_proc->p_nice;
+/*
+ * This is the core of the interactivity algorithm.  Determines a score based
+ * on past behavior.  It is the ratio of sleep time to run time scaled to
+ * a [0, 100] integer.  This is the voluntary sleep time of a process, which
+ * differs from the cpu usage because it does not account for time spent
+ * waiting on a run-queue.  Would be prettier if we had floating point.
+ */
+static int
+sched_interact_score(struct thread *td)
+{
+	struct td_sched *ts;
+	int div;
 
-	if (pri > PRI_MAX_TIMESHARE)
-		pri = PRI_MAX_TIMESHARE;
-	else if (pri < PRI_MIN_TIMESHARE)
-		pri = PRI_MIN_TIMESHARE;
+	ts = td->td_sched;
+	/*
+	 * The score is only needed if this is likely to be an interactive
+	 * task.  Don't go through the expense of computing it if there's
+	 * no chance.
+	 */
+	if (sched_interact <= SCHED_INTERACT_HALF &&
+		ts->ts_runtime >= ts->ts_slptime)
+			return (SCHED_INTERACT_HALF);
 
-	kg->kg_user_pri = pri;
+	if (ts->ts_runtime > ts->ts_slptime) {
+		div = max(1, ts->ts_runtime / SCHED_INTERACT_HALF);
+		return (SCHED_INTERACT_HALF +
+		    (SCHED_INTERACT_HALF - (ts->ts_slptime / div)));
+	}
+	if (ts->ts_slptime > ts->ts_runtime) {
+		div = max(1, ts->ts_slptime / SCHED_INTERACT_HALF);
+		return (ts->ts_runtime / div);
+	}
+	/* runtime == slptime */
+	if (ts->ts_runtime)
+		return (SCHED_INTERACT_HALF);
+
+	/*
+	 * This can happen if slptime and runtime are 0.
+	 */
+	return (0);
 
-	return;
 }
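Restating the branches above as approximate formulas, assuming SCHED_INTERACT_HALF is 50 (an assumption; the define is not part of this hunk):

	/*
	 * sleeping more than running:  score ~= 50 * runtime / slptime  (0..50)
	 * running more than sleeping:  score ~= 100 - 50 * slptime / runtime  (50..100)
	 * e.g. a thread that sleeps three times as long as it runs scores
	 * about 50/3 ~= 16, while one that runs three times as long as it
	 * sleeps scores about 100 - 50/3 ~= 83.
	 */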
 
 /*
- * Calculate a time slice based on the properties of the kseg and the runq
- * that we're on.  This is only for PRI_TIMESHARE ksegrps.
+ * Scale the scheduling priority according to the "interactivity" of this
+ * process.
  */
 static void
-sched_slice(struct kse *ke)
+sched_priority(struct thread *td)
 {
-	struct kseq *kseq;
-	struct ksegrp *kg;
-
-	kg = ke->ke_ksegrp;
-	kseq = KSEQ_CPU(ke->ke_cpu);
+	int score;
+	int pri;
 
-	if (ke->ke_thread->td_flags & TDF_BORROWING) {
-		ke->ke_slice = SCHED_SLICE_MIN;
+	if (td->td_pri_class != PRI_TIMESHARE)
 		return;
-	}
-
 	/*
-	 * Rationale:
-	 * KSEs in interactive ksegs get a minimal slice so that we
-	 * quickly notice if it abuses its advantage.
-	 *
-	 * KSEs in non-interactive ksegs are assigned a slice that is
-	 * based on the ksegs nice value relative to the least nice kseg
-	 * on the run queue for this cpu.
-	 *
-	 * If the KSE is less nice than all others it gets the maximum
-	 * slice and other KSEs will adjust their slice relative to
-	 * this when they first expire.
+	 * If the score is interactive we place the thread in the realtime
+	 * queue with a priority that is less than kernel and interrupt
+	 * priorities.  These threads are not subject to nice restrictions.
 	 *
-	 * There is 20 point window that starts relative to the least
-	 * nice kse on the run queue.  Slice size is determined by
-	 * the kse distance from the last nice ksegrp.
+	 * Scores greater than this are placed on the normal timeshare queue
+	 * where the priority is partially decided by the most recent cpu
+	 * utilization and the rest is decided by nice value.
 	 *
-	 * If the kse is outside of the window it will get no slice
-	 * and will be reevaluated each time it is selected on the
-	 * run queue.  The exception to this is nice 0 ksegs when
-	 * a nice -20 is running.  They are always granted a minimum
-	 * slice.
-	 */
-	if (!SCHED_INTERACTIVE(kg)) {
-		int nice;
-
-		nice = kg->kg_proc->p_nice + (0 - kseq->ksq_nicemin);
-		if (kseq->ksq_load_timeshare == 0 ||
-		    kg->kg_proc->p_nice < kseq->ksq_nicemin)
-			ke->ke_slice = SCHED_SLICE_MAX;
-		else if (nice <= SCHED_SLICE_NTHRESH)
-			ke->ke_slice = SCHED_SLICE_NICE(nice);
-		else if (kg->kg_proc->p_nice == 0)
-			ke->ke_slice = SCHED_SLICE_MIN;
-		else
-			ke->ke_slice = SCHED_SLICE_MIN; /* 0 */
-	} else
-		ke->ke_slice = SCHED_SLICE_INTERACTIVE;
+	 * The nice value of the process has a linear effect on the calculated
+	 * score.  Negative nice values make it easier for a thread to be
+	 * considered interactive.
+	 */
+	score = imax(0, sched_interact_score(td) - td->td_proc->p_nice);
+	if (score < sched_interact) {
+		pri = PRI_MIN_REALTIME;
+		pri += ((PRI_MAX_REALTIME - PRI_MIN_REALTIME) / sched_interact)
+		    * score;
+		KASSERT(pri >= PRI_MIN_REALTIME && pri <= PRI_MAX_REALTIME,
+		    ("sched_priority: invalid interactive priority %d score %d",
+		    pri, score));
+	} else {
+		pri = SCHED_PRI_MIN;
+		if (td->td_sched->ts_ticks)
+			pri += SCHED_PRI_TICKS(td->td_sched);
+		pri += SCHED_PRI_NICE(td->td_proc->p_nice);
+		KASSERT(pri >= PRI_MIN_TIMESHARE && pri <= PRI_MAX_TIMESHARE,
+		    ("sched_priority: invalid priority %d: nice %d, " 
+		    "ticks %d ftick %d ltick %d tick pri %d",
+		    pri, td->td_proc->p_nice, td->td_sched->ts_ticks,
+		    td->td_sched->ts_ftick, td->td_sched->ts_ltick,
+		    SCHED_PRI_TICKS(td->td_sched)));
+	}
+	sched_user_prio(td, pri);
 
 	return;
 }
 
 /*
  * This routine enforces a maximum limit on the amount of scheduling history
- * kept.  It is called after either the slptime or runtime is adjusted.
- * This routine will not operate correctly when slp or run times have been
- * adjusted to more than double their maximum.
+ * kept.  It is called after either the slptime or runtime is adjusted.  This
+ * function is ugly due to integer math.
  */
 static void
-sched_interact_update(struct ksegrp *kg)
+sched_interact_update(struct thread *td)
 {
-	int sum;
+	struct td_sched *ts;
+	u_int sum;
 
-	sum = kg->kg_runtime + kg->kg_slptime;
+	ts = td->td_sched;
+	sum = ts->ts_runtime + ts->ts_slptime;
 	if (sum < SCHED_SLP_RUN_MAX)
 		return;
 	/*
+	 * This only happens from two places:
+	 * 1) We have added an unusual amount of run time from fork_exit.
+	 * 2) We have added an unusual amount of sleep time from sched_sleep().
+	 */
+	if (sum > SCHED_SLP_RUN_MAX * 2) {
+		if (ts->ts_runtime > ts->ts_slptime) {
+			ts->ts_runtime = SCHED_SLP_RUN_MAX;
+			ts->ts_slptime = 1;
+		} else {
+			ts->ts_slptime = SCHED_SLP_RUN_MAX;
+			ts->ts_runtime = 1;
+		}
+		return;
+	}
+	/*
 	 * If we have exceeded by more than 1/5th then the algorithm below
 	 * will not bring us back into range.  Dividing by two here forces
 	 * us into the range of [4/5 * SCHED_INTERACT_MAX, SCHED_INTERACT_MAX]
 	 */
 	if (sum > (SCHED_SLP_RUN_MAX / 5) * 6) {
-		kg->kg_runtime /= 2;
-		kg->kg_slptime /= 2;
+		ts->ts_runtime /= 2;
+		ts->ts_slptime /= 2;
 		return;
 	}
-	kg->kg_runtime = (kg->kg_runtime / 5) * 4;
-	kg->kg_slptime = (kg->kg_slptime / 5) * 4;
+	ts->ts_runtime = (ts->ts_runtime / 5) * 4;
+	ts->ts_slptime = (ts->ts_slptime / 5) * 4;
 }
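The three branches above can be summarized in terms of SCHED_SLP_RUN_MAX ("max"); this is only a restatement of the code:

	/*
	 * sum >  2 * max          -> clamp: dominant side = max, other = 1
	 * sum >  6/5 * max        -> halve both sides
	 * max <= sum <= 6/5 * max -> scale both sides by 4/5
	 * so each update pulls the combined history back to roughly max
	 * or below while preserving the sleep/run ratio.
	 */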
 
+/*
+ * Scale back the interactivity history when a child thread is created.  The
+ * history is inherited from the parent but the thread may behave totally
+ * differently.  For example, a shell spawning a compiler process.  We want
+ * to learn that the compiler is behaving badly very quickly.
+ */
 static void
-sched_interact_fork(struct ksegrp *kg)
+sched_interact_fork(struct thread *td)
 {
 	int ratio;
 	int sum;
 
-	sum = kg->kg_runtime + kg->kg_slptime;
+	sum = td->td_sched->ts_runtime + td->td_sched->ts_slptime;
 	if (sum > SCHED_SLP_RUN_FORK) {
 		ratio = sum / SCHED_SLP_RUN_FORK;
-		kg->kg_runtime /= ratio;
-		kg->kg_slptime /= ratio;
+		td->td_sched->ts_runtime /= ratio;
+		td->td_sched->ts_slptime /= ratio;
 	}
 }
 
-static int
-sched_interact_score(struct ksegrp *kg)
-{
-	int div;
-
-	if (kg->kg_runtime > kg->kg_slptime) {
-		div = max(1, kg->kg_runtime / SCHED_INTERACT_HALF);
-		return (SCHED_INTERACT_HALF +
-		    (SCHED_INTERACT_HALF - (kg->kg_slptime / div)));
-	} if (kg->kg_slptime > kg->kg_runtime) {
-		div = max(1, kg->kg_slptime / SCHED_INTERACT_HALF);
-		return (kg->kg_runtime / div);
-	}
-
-	/*
-	 * This can happen if slptime and runtime are 0.
-	 */
-	return (0);
-
-}
-
 /*
- * Very early in the boot some setup of scheduler-specific
- * parts of proc0 and of soem scheduler resources needs to be done.
- * Called from:
- *  proc0_init()
+ * Called from proc0_init() to setup the scheduler fields.
  */
 void
 schedinit(void)
 {
+
 	/*
 	 * Set up the scheduler specific parts of proc0.
 	 */
 	proc0.p_sched = NULL; /* XXX */
-	ksegrp0.kg_sched = &kg_sched0;
-	thread0.td_sched = &kse0;
-	kse0.ke_thread = &thread0;
-	kse0.ke_state = KES_THREAD;
-	kg_sched0.skg_concurrency = 1;
-	kg_sched0.skg_avail_opennings = 0; /* we are already running */
+	thread0.td_sched = &td_sched0;
+	td_sched0.ts_ltick = ticks;
+	td_sched0.ts_ftick = ticks;
+	td_sched0.ts_thread = &thread0;
 }
 
 /*
  * This is only somewhat accurate since given many processes of the same
  * priority they will switch when their slices run out, which will be
- * at most SCHED_SLICE_MAX.
+ * at most sched_slice stathz ticks.
  */
 int
 sched_rr_interval(void)
 {
-	return (SCHED_SLICE_MAX);
+
+	/* Convert sched_slice to hz */
+	return (hz/(realstathz/sched_slice));
 }
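A quick arithmetic check of the conversion above, with illustrative hz = 1000 and stathz = 128 (not values taken from this file):

	/*
	 * sched_slice            = 128 / 10 = 12 stathz ticks
	 * realstathz/sched_slice = 128 / 12 = 10
	 * returned interval      = 1000 / 10 = 100 hz ticks (~100ms)
	 */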
 
+/*
+ * Update the percent cpu tracking information when it is requested or
+ * the total history exceeds the maximum.  We keep a sliding history of
+ * tick counts that slowly decays.  This is less precise than the 4BSD
+ * mechanism since it happens with less regular and frequent events.
+ */
 static void
-sched_pctcpu_update(struct kse *ke)
+sched_pctcpu_update(struct td_sched *ts)
 {
+
+	if (ts->ts_ticks == 0)
+		return;
+	if (ticks - (hz / 10) < ts->ts_ltick &&
+	    SCHED_TICK_TOTAL(ts) < SCHED_TICK_MAX)
+		return;
 	/*
 	 * Adjust counters and watermark for pctcpu calc.
 	 */
-	if (ke->ke_ltick > ticks - SCHED_CPU_TICKS) {
-		/*
-		 * Shift the tick count out so that the divide doesn't
-		 * round away our results.
-		 */
-		ke->ke_ticks <<= 10;
-		ke->ke_ticks = (ke->ke_ticks / (ticks - ke->ke_ftick)) *
-			    SCHED_CPU_TICKS;
-		ke->ke_ticks >>= 10;
-	} else
-		ke->ke_ticks = 0;
-	ke->ke_ltick = ticks;
-	ke->ke_ftick = ke->ke_ltick - SCHED_CPU_TICKS;
+	if (ts->ts_ltick > ticks - SCHED_TICK_TARG)
+		ts->ts_ticks = (ts->ts_ticks / (ticks - ts->ts_ftick)) *
+			    SCHED_TICK_TARG;
+	else
+		ts->ts_ticks = 0;
+	ts->ts_ltick = ticks;
+	ts->ts_ftick = ts->ts_ltick - SCHED_TICK_TARG;
 }
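The rescale in sched_pctcpu_update() projects the accumulated count onto a fixed window; in short:

	/*
	 * ts_ticks = (ts_ticks / (ticks - ts_ftick)) * SCHED_TICK_TARG
	 * keeps the average tick rate over the recorded interval but lets
	 * history beyond the target window decay away; if the thread has
	 * not run within the target window at all, the count is reset to 0.
	 */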
 
-void
+/*
+ * Adjust the priority of a thread.  Move it to the appropriate run-queue
+ * if necessary.  This is the back-end for several priority related
+ * functions.
+ */
+static void
 sched_thread_priority(struct thread *td, u_char prio)
 {
-	struct kse *ke;
+	struct td_sched *ts;
 
 	CTR6(KTR_SCHED, "sched_prio: %p(%s) prio %d newprio %d by %p(%s)",
 	    td, td->td_proc->p_comm, td->td_priority, prio, curthread,
 	    curthread->td_proc->p_comm);
-	ke = td->td_kse;
-	mtx_assert(&sched_lock, MA_OWNED);
+	ts = td->td_sched;
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	if (td->td_priority == prio)
 		return;
-	if (TD_ON_RUNQ(td)) {
+
+	if (TD_ON_RUNQ(td) && prio < td->td_priority) {
 		/*
 		 * If the priority has been elevated due to priority
 		 * propagation, we may have to move ourselves to a new
-		 * queue.  We still call adjustrunqueue below in case kse
-		 * needs to fix things up.
+		 * queue.  This could be optimized to not re-add in some
+		 * cases.
 		 */
-		if (prio < td->td_priority && ke->ke_runq != NULL &&
-		    (ke->ke_flags & KEF_ASSIGNED) == 0 &&
-		    ke->ke_runq != KSEQ_CPU(ke->ke_cpu)->ksq_curr) {
-			runq_remove(ke->ke_runq, ke);
-			ke->ke_runq = KSEQ_CPU(ke->ke_cpu)->ksq_curr;
-			runq_add(ke->ke_runq, ke, 0);
-		}
-		/*
-		 * Hold this kse on this cpu so that sched_prio() doesn't
-		 * cause excessive migration.  We only want migration to
-		 * happen as the result of a wakeup.
-		 */
-		ke->ke_flags |= KEF_HOLD;
-		adjustrunqueue(td, prio);
-		ke->ke_flags &= ~KEF_HOLD;
-	} else
+		sched_rem(td);
+		td->td_priority = prio;
+		sched_add(td, SRQ_BORROWING);
+	} else {
+#ifdef SMP
+		struct tdq *tdq;
+
+		tdq = TDQ_CPU(ts->ts_cpu);
+		if (prio < tdq->tdq_lowpri)
+			tdq->tdq_lowpri = prio;
+#endif
 		td->td_priority = prio;
+	}
 }
 
 /*
@@ -1278,7 +1656,7 @@
 
 	if (td->td_base_pri >= PRI_MIN_TIMESHARE &&
 	    td->td_base_pri <= PRI_MAX_TIMESHARE)
-		base_pri = td->td_ksegrp->kg_user_pri;
+		base_pri = td->td_user_pri;
 	else
 		base_pri = td->td_base_pri;
 	if (prio >= base_pri) {
@@ -1288,6 +1666,9 @@
 		sched_lend_prio(td, prio);
 }
 
+/*
+ * Standard entry for setting the priority to an absolute value.
+ */
 void
 sched_prio(struct thread *td, u_char prio)
 {
@@ -1315,157 +1696,293 @@
 		turnstile_adjust(td, oldprio);
 }
 
+/*
+ * Set the base user priority; this does not affect the current running priority.
+ */
 void
-sched_switch(struct thread *td, struct thread *newtd, int flags)
+sched_user_prio(struct thread *td, u_char prio)
+{
+	u_char oldprio;
+
+	td->td_base_user_pri = prio;
+	if (td->td_flags & TDF_UBORROWING && td->td_user_pri <= prio)
+                return;
+	oldprio = td->td_user_pri;
+	td->td_user_pri = prio;
+}
+
+void
+sched_lend_user_prio(struct thread *td, u_char prio)
 {
-	struct kseq *ksq;
-	struct kse *ke;
+	u_char oldprio;
 
-	mtx_assert(&sched_lock, MA_OWNED);
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
+	td->td_flags |= TDF_UBORROWING;
+	oldprio = td->td_user_pri;
+	td->td_user_pri = prio;
+}
 
-	ke = td->td_kse;
-	ksq = KSEQ_SELF();
+void
+sched_unlend_user_prio(struct thread *td, u_char prio)
+{
+	u_char base_pri;
 
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
+	base_pri = td->td_base_user_pri;
+	if (prio >= base_pri) {
+		td->td_flags &= ~TDF_UBORROWING;
+		sched_user_prio(td, base_pri);
+	} else {
+		sched_lend_user_prio(td, prio);
+	}
+}
+
+/*
+ * Add the thread passed as 'newtd' to the run queue before selecting
+ * the next thread to run.  This is only used for KSE.
+ */
+static void
+sched_switchin(struct tdq *tdq, struct thread *td)
+{
+#ifdef SMP
+	spinlock_enter();
+	TDQ_UNLOCK(tdq);
+	thread_lock(td);
+	spinlock_exit();
+	sched_setcpu(td->td_sched, TDQ_ID(tdq), SRQ_YIELDING);
+#else
+	td->td_lock = TDQ_LOCKPTR(tdq);
+#endif
+	tdq_add(tdq, td, SRQ_YIELDING);
+	MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
+}
+
+/*
+ * Handle migration from sched_switch().  This happens only for
+ * cpu binding.
+ */
+static struct mtx *
+sched_switch_migrate(struct tdq *tdq, struct thread *td, int flags)
+{
+	struct tdq *tdn;
+
+	tdn = TDQ_CPU(td->td_sched->ts_cpu);
+#ifdef SMP
+	/*
+	 * Do the lock dance required to avoid LOR.  We grab an extra
+	 * spinlock nesting to prevent preemption while we're
+	 * not holding either run-queue lock.
+	 */
+	spinlock_enter();
+	thread_block_switch(td);	/* This releases the lock on tdq. */
+	TDQ_LOCK(tdn);
+	tdq_add(tdn, td, flags);
+	tdq_notify(td->td_sched);
+	/*
+	 * After we unlock tdn the new cpu still can't switch into this
+	 * thread until we've unblocked it in cpu_switch().  The lock
+	 * pointers may match in the case of HTT cores.  Don't unlock here
+	 * or we can deadlock when the other CPU runs the IPI handler.
+	 */
+	if (TDQ_LOCKPTR(tdn) != TDQ_LOCKPTR(tdq)) {
+		TDQ_UNLOCK(tdn);
+		TDQ_LOCK(tdq);
+	}
+	spinlock_exit();
+#endif
+	return (TDQ_LOCKPTR(tdn));
+}
+
+/*
+ * Block a thread for switching.  Similar to thread_block() but does not
+ * bump the spin count.
+ */
+static inline struct mtx *
+thread_block_switch(struct thread *td)
+{
+	struct mtx *lock;
+
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
+	lock = td->td_lock;
+	td->td_lock = &blocked_lock;
+	mtx_unlock_spin(lock);
+
+	return (lock);
+}
+
+/*
+ * Release a thread that was blocked with thread_block_switch().
+ */
+static inline void
+thread_unblock_switch(struct thread *td, struct mtx *mtx)
+{
+	atomic_store_rel_ptr((volatile uintptr_t *)&td->td_lock,
+	    (uintptr_t)mtx);
+}
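The two helpers above are used as a pair around the context switch; a condensed view of the pattern, as it appears in sched_switch() below:

	/*
	 *   mtx = thread_block_switch(td);
	 * parks td->td_lock on the global blocked_lock while the thread is
	 * moved between queues; the saved mtx is then either handed to
	 * cpu_switch() or, if no switch happens, restored with
	 *   thread_unblock_switch(td, mtx);
	 */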
+
+/*
+ * Switch threads.  This function has to handle threads coming in while
+ * blocked for some reason, running, or idle.  It also must deal with
+ * migrating a thread from one queue to another as running threads may
+ * be assigned elsewhere via binding.
+ */
+void
+sched_switch(struct thread *td, struct thread *newtd, int flags)
+{
+	struct tdq *tdq;
+	struct td_sched *ts;
+	struct mtx *mtx;
+	int srqflag;
+	int cpuid;
+
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
+
+	cpuid = PCPU_GET(cpuid);
+	tdq = TDQ_CPU(cpuid);
+	ts = td->td_sched;
+	mtx = td->td_lock;
+#ifdef SMP
+	ts->ts_rltick = ticks;
+	if (newtd && newtd->td_priority < tdq->tdq_lowpri)
+		tdq->tdq_lowpri = newtd->td_priority;
+#endif
 	td->td_lastcpu = td->td_oncpu;
 	td->td_oncpu = NOCPU;
 	td->td_flags &= ~TDF_NEEDRESCHED;
 	td->td_owepreempt = 0;
-
 	/*
-	 * If the KSE has been assigned it may be in the process of switching
-	 * to the new cpu.  This is the case in sched_bind().
+	 * The lock pointer in an idle thread should never change.  Reset it
+	 * to CAN_RUN as well.
 	 */
-	if (td == PCPU_GET(idlethread)) {
+	if (TD_IS_IDLETHREAD(td)) {
+		MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
 		TD_SET_CAN_RUN(td);
-	} else if ((ke->ke_flags & KEF_ASSIGNED) == 0) {
-		/* We are ending our run so make our slot available again */
-		SLOT_RELEASE(td->td_ksegrp);
-		kseq_load_rem(ksq, ke);
-		if (TD_IS_RUNNING(td)) {
-			/*
-			 * Don't allow the thread to migrate
-			 * from a preemption.
-			 */
-			ke->ke_flags |= KEF_HOLD;
-			setrunqueue(td, (flags & SW_PREEMPT) ?
-			    SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
-			    SRQ_OURSELF|SRQ_YIELDING);
-			ke->ke_flags &= ~KEF_HOLD;
-		} else if ((td->td_proc->p_flag & P_HADTHREADS) &&
-		    (newtd == NULL || newtd->td_ksegrp != td->td_ksegrp))
-			/*
-			 * We will not be on the run queue.
-			 * So we must be sleeping or similar.
-			 * Don't use the slot if we will need it 
-			 * for newtd.
-			 */
-			slot_fill(td->td_ksegrp);
+	} else if (TD_IS_RUNNING(td)) {
+		MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
+		tdq_load_rem(tdq, ts);
+		srqflag = (flags & SW_PREEMPT) ?
+		    SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
+		    SRQ_OURSELF|SRQ_YIELDING;
+		if (ts->ts_cpu == cpuid)
+			tdq_add(tdq, td, srqflag);
+		else
+			mtx = sched_switch_migrate(tdq, td, srqflag);
+	} else {
+		/* This thread must be going to sleep. */
+		TDQ_LOCK(tdq);
+		mtx = thread_block_switch(td);
+		tdq_load_rem(tdq, ts);
 	}
-	if (newtd != NULL) {
-		/*
-		 * If we bring in a thread account for it as if it had been
-		 * added to the run queue and then chosen.
-		 */
-		newtd->td_kse->ke_flags |= KEF_DIDRUN;
-		newtd->td_kse->ke_runq = ksq->ksq_curr;
-		TD_SET_RUNNING(newtd);
-		kseq_load_add(KSEQ_SELF(), newtd->td_kse);
-		/*
-		 * XXX When we preempt, we've already consumed a slot because
-		 * we got here through sched_add().  However, newtd can come
-		 * from thread_switchout() which can't SLOT_USE() because
-		 * the SLOT code is scheduler dependent.  We must use the
-		 * slot here otherwise.
-		 */
-		if ((flags & SW_PREEMPT) == 0)
-			SLOT_USE(newtd->td_ksegrp);
-	} else
-		newtd = choosethread();
+	/*
+	 * We enter here with the thread blocked and assigned to the
+	 * appropriate cpu run-queue or sleep-queue and with the current
+	 * thread-queue locked.
+	 */
+	TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED);
+	/*
+	 * If KSE assigned a new thread just add it here and let choosethread
+	 * select the best one.
+	 */
+	if (newtd != NULL)
+		sched_switchin(tdq, newtd);
+	newtd = choosethread();
+	/*
+	 * Call the MD code to switch contexts if necessary.
+	 */
 	if (td != newtd) {
 #ifdef	HWPMC_HOOKS
 		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
 			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
 #endif
-		cpu_switch(td, newtd);
+		TDQ_LOCKPTR(tdq)->mtx_lock = (uintptr_t)newtd;
+		cpu_switch(td, newtd, mtx);
+		/*
+		 * We may return from cpu_switch on a different cpu.  However,
+		 * we always return with td_lock pointing to the current cpu's
+		 * run queue lock.
+		 */
+		cpuid = PCPU_GET(cpuid);
+		tdq = TDQ_CPU(cpuid);
 #ifdef	HWPMC_HOOKS
 		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
 			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
 #endif
-	}
-
-	sched_lock.mtx_lock = (uintptr_t)td;
-
-	td->td_oncpu = PCPU_GET(cpuid);
+	} else
+		thread_unblock_switch(td, mtx);
+	/*
+	 * Assert that all went well and return.
+	 */
+#ifdef SMP
+	/* We should always get here with the lowest priority td possible */
+	tdq->tdq_lowpri = td->td_priority;
+#endif
+	TDQ_LOCK_ASSERT(tdq, MA_OWNED|MA_NOTRECURSED);
+	MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
+	td->td_oncpu = cpuid;
 }
 
+/*
+ * Adjust thread priorities as a result of a nice request.
+ */
 void
 sched_nice(struct proc *p, int nice)
 {
-	struct ksegrp *kg;
-	struct kse *ke;
 	struct thread *td;
-	struct kseq *kseq;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
-	mtx_assert(&sched_lock, MA_OWNED);
-	/*
-	 * We need to adjust the nice counts for running KSEs.
-	 */
-	FOREACH_KSEGRP_IN_PROC(p, kg) {
-		if (kg->kg_pri_class == PRI_TIMESHARE) {
-			FOREACH_THREAD_IN_GROUP(kg, td) {
-				ke = td->td_kse;
-				if (ke->ke_runq == NULL)
-					continue;
-				kseq = KSEQ_CPU(ke->ke_cpu);
-				kseq_nice_rem(kseq, p->p_nice);
-				kseq_nice_add(kseq, nice);
-			}
-		}
-	}
+	PROC_SLOCK_ASSERT(p, MA_OWNED);
+
 	p->p_nice = nice;
-	FOREACH_KSEGRP_IN_PROC(p, kg) {
-		sched_priority(kg);
-		FOREACH_THREAD_IN_GROUP(kg, td)
-			td->td_flags |= TDF_NEEDRESCHED;
+	FOREACH_THREAD_IN_PROC(p, td) {
+		thread_lock(td);
+		sched_priority(td);
+		sched_prio(td, td->td_base_user_pri);
+		thread_unlock(td);
 	}
 }
 
+/*
+ * Record the sleep time for the interactivity scorer.
+ */
 void
 sched_sleep(struct thread *td)
 {
-	mtx_assert(&sched_lock, MA_OWNED);
 
-	td->td_slptime = ticks;
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
+
+	td->td_slptick = ticks;
 }
 
+/*
+ * Schedule a thread to resume execution and record how long it voluntarily
+ * slept.  We also update the pctcpu, interactivity, and priority.
+ */
 void
 sched_wakeup(struct thread *td)
 {
-	mtx_assert(&sched_lock, MA_OWNED);
+	struct td_sched *ts;
+	int slptick;
 
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
+	ts = td->td_sched;
 	/*
-	 * Let the kseg know how long we slept for.  This is because process
-	 * interactivity behavior is modeled in the kseg.
+	 * If we slept for more than a tick update our interactivity and
+	 * priority.
 	 */
-	if (td->td_slptime) {
-		struct ksegrp *kg;
-		int hzticks;
-
-		kg = td->td_ksegrp;
-		hzticks = (ticks - td->td_slptime) << 10;
-		if (hzticks >= SCHED_SLP_RUN_MAX) {
-			kg->kg_slptime = SCHED_SLP_RUN_MAX;
-			kg->kg_runtime = 1;
-		} else {
-			kg->kg_slptime += hzticks;
-			sched_interact_update(kg);
-		}
-		sched_priority(kg);
-		sched_slice(td->td_kse);
-		td->td_slptime = 0;
-	}
-	setrunqueue(td, SRQ_BORING);
+	slptick = td->td_slptick;
+	td->td_slptick = 0;
+	if (slptick && slptick != ticks) {
+		u_int hzticks;
+
+		hzticks = (ticks - slptick) << SCHED_TICK_SHIFT;
+		ts->ts_slptime += hzticks;
+		sched_interact_update(td);
+		sched_pctcpu_update(ts);
+		sched_priority(td);
+	}
+	/* Reset the slice value after we sleep. */
+	ts->ts_slice = sched_slice;
+	sched_add(td, SRQ_BORING);
 }
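An illustrative instance of the sleep-time bookkeeping above, assuming SCHED_TICK_SHIFT is 10 and hz = 1000 (assumed values):

	/*
	 * A thread that slept for 500 hz ticks (~0.5s) gets
	 *   hzticks = 500 << 10 = 512000
	 * added to ts_slptime; sched_interact_update() then trims the
	 * history back into range before the new priority is computed.
	 */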
 
 /*
@@ -1473,495 +1990,566 @@
  * priority.
  */
 void
-sched_fork(struct thread *td, struct thread *childtd)
-{
-
-	mtx_assert(&sched_lock, MA_OWNED);
-
-	sched_fork_ksegrp(td, childtd->td_ksegrp);
-	sched_fork_thread(td, childtd);
-}
-
-void
-sched_fork_ksegrp(struct thread *td, struct ksegrp *child)
+sched_fork(struct thread *td, struct thread *child)
 {
-	struct ksegrp *kg = td->td_ksegrp;
-	mtx_assert(&sched_lock, MA_OWNED);
-
-	child->kg_slptime = kg->kg_slptime;
-	child->kg_runtime = kg->kg_runtime;
-	child->kg_user_pri = kg->kg_user_pri;
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
+	sched_fork_thread(td, child);
+	/*
+	 * Penalize the parent and child for forking.
+	 */
 	sched_interact_fork(child);
-	kg->kg_runtime += tickincr << 10;
-	sched_interact_update(kg);
+	sched_priority(child);
+	td->td_sched->ts_runtime += tickincr;
+	sched_interact_update(td);
+	sched_priority(td);
 }
 
+/*
+ * Fork a new thread, may be within the same process.
+ */
 void
 sched_fork_thread(struct thread *td, struct thread *child)
 {
-	struct kse *ke;
-	struct kse *ke2;
+	struct td_sched *ts;
+	struct td_sched *ts2;
 
+	/*
+	 * Initialize child.
+	 */
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	sched_newthread(child);
-	ke = td->td_kse;
-	ke2 = child->td_kse;
-	ke2->ke_slice = 1;	/* Attempt to quickly learn interactivity. */
-	ke2->ke_cpu = ke->ke_cpu;
-	ke2->ke_runq = NULL;
-
-	/* Grab our parents cpu estimation information. */
-	ke2->ke_ticks = ke->ke_ticks;
-	ke2->ke_ltick = ke->ke_ltick;
-	ke2->ke_ftick = ke->ke_ftick;
+	child->td_lock = TDQ_LOCKPTR(TDQ_SELF());
+	ts = td->td_sched;
+	ts2 = child->td_sched;
+	ts2->ts_cpu = ts->ts_cpu;
+	ts2->ts_runq = NULL;
+	/*
+	 * Grab our parents cpu estimation information and priority.
+	 */
+	ts2->ts_ticks = ts->ts_ticks;
+	ts2->ts_ltick = ts->ts_ltick;
+	ts2->ts_ftick = ts->ts_ftick;
+	child->td_user_pri = td->td_user_pri;
+	child->td_base_user_pri = td->td_base_user_pri;
+	/*
+	 * And update interactivity score.
+	 */
+	ts2->ts_slptime = ts->ts_slptime;
+	ts2->ts_runtime = ts->ts_runtime;
+	ts2->ts_slice = 1;	/* Attempt to quickly learn interactivity. */
 }
 
+/*
+ * Adjust the priority class of a thread.
+ */
 void
-sched_class(struct ksegrp *kg, int class)
+sched_class(struct thread *td, int class)
 {
-	struct kseq *kseq;
-	struct kse *ke;
-	struct thread *td;
-	int nclass;
-	int oclass;
 
-	mtx_assert(&sched_lock, MA_OWNED);
-	if (kg->kg_pri_class == class)
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
+	if (td->td_pri_class == class)
 		return;
 
-	nclass = PRI_BASE(class);
-	oclass = PRI_BASE(kg->kg_pri_class);
-	FOREACH_THREAD_IN_GROUP(kg, td) {
-		ke = td->td_kse;
-		if ((ke->ke_state != KES_ONRUNQ &&
-		    ke->ke_state != KES_THREAD) || ke->ke_runq == NULL)
-			continue;
-		kseq = KSEQ_CPU(ke->ke_cpu);
-
 #ifdef SMP
-		/*
-		 * On SMP if we're on the RUNQ we must adjust the transferable
-		 * count because could be changing to or from an interrupt
-		 * class.
-		 */
-		if (ke->ke_state == KES_ONRUNQ) {
-			if (KSE_CAN_MIGRATE(ke)) {
-				kseq->ksq_transferable--;
-				kseq->ksq_group->ksg_transferable--;
-			}
-			if (KSE_CAN_MIGRATE(ke)) {
-				kseq->ksq_transferable++;
-				kseq->ksq_group->ksg_transferable++;
-			}
-		}
-#endif
-		if (oclass == PRI_TIMESHARE) {
-			kseq->ksq_load_timeshare--;
-			kseq_nice_rem(kseq, kg->kg_proc->p_nice);
+	/*
+	 * On SMP if we're on the RUNQ we must adjust the transferable
+	 * count because could be changing to or from an interrupt
+	 * count because we could be changing to or from an interrupt
+	 */
+	if (TD_ON_RUNQ(td)) {
+		struct tdq *tdq;
+
+		tdq = TDQ_CPU(td->td_sched->ts_cpu);
+		if (THREAD_CAN_MIGRATE(td)) {
+			tdq->tdq_transferable--;
+			tdq->tdq_group->tdg_transferable--;
 		}
-		if (nclass == PRI_TIMESHARE) {
-			kseq->ksq_load_timeshare++;
-			kseq_nice_add(kseq, kg->kg_proc->p_nice);
+		td->td_pri_class = class;
+		if (THREAD_CAN_MIGRATE(td)) {
+			tdq->tdq_transferable++;
+			tdq->tdq_group->tdg_transferable++;
 		}
 	}
-
-	kg->kg_pri_class = class;
+#endif
+	td->td_pri_class = class;
 }
 
 /*
  * Return some of the child's priority and interactivity to the parent.
  */
 void
-sched_exit(struct proc *p, struct thread *childtd)
+sched_exit(struct proc *p, struct thread *child)
 {
-	mtx_assert(&sched_lock, MA_OWNED);
-	sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), childtd);
-	sched_exit_thread(NULL, childtd);
+	struct thread *td;
+	
+	CTR3(KTR_SCHED, "sched_exit: %p(%s) prio %d",
+	    child, child->td_proc->p_comm, child->td_priority);
+
+	PROC_SLOCK_ASSERT(p, MA_OWNED);
+	td = FIRST_THREAD_IN_PROC(p);
+	sched_exit_thread(td, child);
 }
 
+/*
+ * Penalize another thread for the time spent on this one.  This helps to
+ * worsen the priority and interactivity of processes which schedule batch
+ * jobs such as make.  This has little effect on the make process itself but
+ * causes new processes spawned by it to receive worse scores immediately.
+ */
 void
-sched_exit_ksegrp(struct ksegrp *kg, struct thread *td)
+sched_exit_thread(struct thread *td, struct thread *child)
 {
-	/* kg->kg_slptime += td->td_ksegrp->kg_slptime; */
-	kg->kg_runtime += td->td_ksegrp->kg_runtime;
-	sched_interact_update(kg);
+
+	CTR3(KTR_SCHED, "sched_exit_thread: %p(%s) prio %d",
+	    child, child->td_proc->p_comm, child->td_priority);
+
+#ifdef KSE
+	/*
+	 * KSE forks and exits so often that this penalty causes short-lived
+	 * threads to always be non-interactive.  This causes mozilla to
+	 * crawl under load.
+	 */
+	if ((td->td_pflags & TDP_SA) && td->td_proc == child->td_proc)
+		return;
+#endif
+	/*
+	 * Give the child's runtime to the parent without returning the
+	 * sleep time as a penalty to the parent.  This causes shells that
+	 * launch expensive things to mark their children as expensive.
+	 */
+	thread_lock(td);
+	td->td_sched->ts_runtime += child->td_sched->ts_runtime;
+	sched_interact_update(td);
+	sched_priority(td);
+	thread_unlock(td);
 }
 
+/*
+ * Fix priorities on return to user-space.  Priorities may be elevated due
+ * to static priorities in msleep() or similar.
+ */
 void
-sched_exit_thread(struct thread *td, struct thread *childtd)
+sched_userret(struct thread *td)
 {
-	CTR3(KTR_SCHED, "sched_exit_thread: %p(%s) prio %d",
-	    childtd, childtd->td_proc->p_comm, childtd->td_priority);
-	kseq_load_rem(KSEQ_CPU(childtd->td_kse->ke_cpu), childtd->td_kse);
+	/*
+	 * XXX we cheat slightly on the locking here to avoid locking in  
+	 * the usual case.  Setting td_priority here is essentially an
+	 * incomplete workaround for not setting it properly elsewhere.
+	 * Now that some interrupt handlers are threads, not setting it
+	 * properly elsewhere can clobber it in the window between setting
+	 * it here and returning to user mode, so don't waste time setting
+	 * it perfectly here.
+	 */
+	KASSERT((td->td_flags & TDF_BORROWING) == 0,
+	    ("thread with borrowed priority returning to userland"));
+	if (td->td_priority != td->td_user_pri) {
+		thread_lock(td);
+		td->td_priority = td->td_user_pri;
+		td->td_base_pri = td->td_user_pri;
+		thread_unlock(td);
+        }
 }
 
+/*
+ * Handle a stathz tick.  This is really only relevant for timeshare
+ * threads.
+ */
 void
 sched_clock(struct thread *td)
 {
-	struct kseq *kseq;
-	struct ksegrp *kg;
-	struct kse *ke;
-
-	mtx_assert(&sched_lock, MA_OWNED);
-	kseq = KSEQ_SELF();
-#ifdef SMP
-	if (ticks >= bal_tick)
-		sched_balance();
-	if (ticks >= gbal_tick && balance_groups)
-		sched_balance_groups();
+	struct tdq *tdq;
+	struct td_sched *ts;
+
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
+	tdq = TDQ_SELF();
+#ifdef SMP
 	/*
-	 * We could have been assigned a non real-time thread without an
-	 * IPI.
+	 * We run the long term load balancer infrequently on the first cpu.
 	 */
-	if (kseq->ksq_assigned)
-		kseq_assign(kseq);	/* Potentially sets NEEDRESCHED */
+	if (balance_tdq == tdq) {
+		if (balance_ticks && --balance_ticks == 0)
+			sched_balance();
+		if (balance_group_ticks && --balance_group_ticks == 0)
+			sched_balance_groups();
+	}
 #endif
 	/*
-	 * sched_setup() apparently happens prior to stathz being set.  We
-	 * need to resolve the timers earlier in the boot so we can avoid
-	 * calculating this here.
+	 * Advance the insert index once for each tick to ensure that all
+	 * threads get a chance to run.
 	 */
-	if (realstathz == 0) {
-		realstathz = stathz ? stathz : hz;
-		tickincr = hz / realstathz;
-		/*
-		 * XXX This does not work for values of stathz that are much
-		 * larger than hz.
-		 */
-		if (tickincr == 0)
-			tickincr = 1;
+	if (tdq->tdq_idx == tdq->tdq_ridx) {
+		tdq->tdq_idx = (tdq->tdq_idx + 1) % RQ_NQS;
+		if (TAILQ_EMPTY(&tdq->tdq_timeshare.rq_queues[tdq->tdq_ridx]))
+			tdq->tdq_ridx = tdq->tdq_idx;
 	}
-
-	ke = td->td_kse;
-	kg = ke->ke_ksegrp;
-
-	/* Adjust ticks for pctcpu */
-	ke->ke_ticks++;
-	ke->ke_ltick = ticks;
-
-	/* Go up to one second beyond our max and then trim back down */
-	if (ke->ke_ftick + SCHED_CPU_TICKS + hz < ke->ke_ltick)
-		sched_pctcpu_update(ke);
-
-	if (td->td_flags & TDF_IDLETD)
-		return;
+	ts = td->td_sched;
 	/*
-	 * We only do slicing code for TIMESHARE ksegrps.
+	 * We only do slicing code for TIMESHARE threads.
 	 */
-	if (kg->kg_pri_class != PRI_TIMESHARE)
+	if (td->td_pri_class != PRI_TIMESHARE)
 		return;
 	/*
-	 * We used a tick charge it to the ksegrp so that we can compute our
+	 * We used a tick; charge it to the thread so that we can compute our
 	 * interactivity.
 	 */
-	kg->kg_runtime += tickincr << 10;
-	sched_interact_update(kg);
-
+	td->td_sched->ts_runtime += tickincr;
+	sched_interact_update(td);
 	/*
 	 * We used up one time slice.
 	 */
-	if (--ke->ke_slice > 0)
+	if (--ts->ts_slice > 0)
 		return;
 	/*
 	 * We're out of time, recompute priorities and requeue.
 	 */
-	kseq_load_rem(kseq, ke);
-	sched_priority(kg);
-	sched_slice(ke);
-	if (SCHED_CURR(kg, ke))
-		ke->ke_runq = kseq->ksq_curr;
-	else
-		ke->ke_runq = kseq->ksq_next;
-	kseq_load_add(kseq, ke);
+	sched_priority(td);
 	td->td_flags |= TDF_NEEDRESCHED;
 }
 
+/*
+ * Called once per hz tick.  Used for cpu utilization information.  This
+ * is easier than trying to scale based on stathz.
+ */
+void
+sched_tick(void)
+{
+	struct td_sched *ts;
+
+	ts = curthread->td_sched;
+	/* Adjust ticks for pctcpu */
+	ts->ts_ticks += 1 << SCHED_TICK_SHIFT;
+	ts->ts_ltick = ticks;
+	/*
+	 * Update if we've exceeded our desired tick threshold by over one
+	 * second.
+	 */
+	if (ts->ts_ftick + SCHED_TICK_MAX < ts->ts_ltick)
+		sched_pctcpu_update(ts);
+}
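A note on the fixed-point accounting in sched_tick(), with the shift value assumed to be 10:

	/*
	 * Each hz tick adds 1 << SCHED_TICK_SHIFT (1024 if the shift is 10)
	 * to ts_ticks, so per-tick accounting carries a fixed-point
	 * fraction; sched_pctcpu_update() is only forced from here once
	 * the window grows past SCHED_TICK_MAX, i.e. roughly one second
	 * beyond the target history length.
	 */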
+
+/*
+ * Return whether the current CPU has runnable tasks.  Used for in-kernel
+ * cooperative idle threads.
+ */
 int
 sched_runnable(void)
 {
-	struct kseq *kseq;
+	struct tdq *tdq;
 	int load;
 
 	load = 1;
 
-	kseq = KSEQ_SELF();
-#ifdef SMP
-	if (kseq->ksq_assigned) {
-		mtx_lock_spin(&sched_lock);
-		kseq_assign(kseq);
-		mtx_unlock_spin(&sched_lock);
-	}
-#endif
+	tdq = TDQ_SELF();
 	if ((curthread->td_flags & TDF_IDLETD) != 0) {
-		if (kseq->ksq_load > 0)
+		if (tdq->tdq_load > 0)
 			goto out;
 	} else
-		if (kseq->ksq_load - 1 > 0)
+		if (tdq->tdq_load - 1 > 0)
 			goto out;
 	load = 0;
 out:
 	return (load);
 }
 
-void
-sched_userret(struct thread *td)
+/*
+ * Choose the highest priority thread to run.  The thread is removed from
+ * the run-queue while running however the load remains.  For SMP we set
+ * the tdq in the global idle bitmask if it idles here.
+ */
+struct thread *
+sched_choose(void)
 {
-	struct ksegrp *kg;
+#ifdef SMP
+	struct tdq_group *tdg;
+#endif
+	struct td_sched *ts;
+	struct tdq *tdq;
 
-	KASSERT((td->td_flags & TDF_BORROWING) == 0,
-	    ("thread with borrowed priority returning to userland"));
-	kg = td->td_ksegrp;	
-	if (td->td_priority != kg->kg_user_pri) {
-		mtx_lock_spin(&sched_lock);
-		td->td_priority = kg->kg_user_pri;
-		td->td_base_pri = kg->kg_user_pri;
-		mtx_unlock_spin(&sched_lock);
+	tdq = TDQ_SELF();
+	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
+	ts = tdq_choose(tdq);
+	if (ts) {
+		tdq_runq_rem(tdq, ts);
+		return (ts->ts_thread);
 	}
+#ifdef SMP
+	/*
+	 * We only set the idled bit when all of the cpus in the group are
+	 * idle.  Otherwise we could get into a situation where a thread bounces
+	 * back and forth between two idle cores on separate physical CPUs.
+	 */
+	tdg = tdq->tdq_group;
+	tdg->tdg_idlemask |= PCPU_GET(cpumask);
+	if (tdg->tdg_idlemask == tdg->tdg_cpumask)
+		atomic_set_int(&tdq_idle, tdg->tdg_mask);
+	tdq->tdq_lowpri = PRI_MAX_IDLE;
+#endif
+	return (PCPU_GET(idlethread));
 }
 
-struct kse *
-sched_choose(void)
+/*
+ * Set owepreempt if necessary.  Preemption never happens directly in ULE;
+ * we always request it once we exit a critical section.
+ */
+static inline void
+sched_setpreempt(struct thread *td)
 {
-	struct kseq *kseq;
-	struct kse *ke;
+	struct thread *ctd;
+	int cpri;
+	int pri;
+
+	ctd = curthread;
+	pri = td->td_priority;
+	cpri = ctd->td_priority;
+	if (td->td_priority < ctd->td_priority)
+		curthread->td_flags |= TDF_NEEDRESCHED;
+	if (panicstr != NULL || pri >= cpri || cold || TD_IS_INHIBITED(ctd))
+		return;
+	/*
+	 * Always preempt IDLE threads.  Otherwise only if the preempting
+	 * thread is an ithread.
+	 */
+	if (pri > preempt_thresh && cpri < PRI_MIN_IDLE)
+		return;
+	ctd->td_owepreempt = 1;
+	return;
+}
 
-	mtx_assert(&sched_lock, MA_OWNED);
-	kseq = KSEQ_SELF();
+/*
+ * Add a thread to a thread queue.  Initializes priority, slice, runq, and
+ * add it to the appropriate queue.  This is the internal function called
+ * when the tdq is predetermined.
+ */
+void
+tdq_add(struct tdq *tdq, struct thread *td, int flags)
+{
+	struct td_sched *ts;
+	int class;
 #ifdef SMP
-restart:
-	if (kseq->ksq_assigned)
-		kseq_assign(kseq);
+	int cpumask;
 #endif
-	ke = kseq_choose(kseq);
-	if (ke) {
+
+	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
+	KASSERT((td->td_inhibitors == 0),
+	    ("sched_add: trying to run inhibited thread"));
+	KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
+	    ("sched_add: bad thread state"));
+	KASSERT(td->td_flags & TDF_INMEM,
+	    ("sched_add: thread swapped out"));
+
+	ts = td->td_sched;
+	class = PRI_BASE(td->td_pri_class);
+        TD_SET_RUNQ(td);
+	if (ts->ts_slice == 0)
+		ts->ts_slice = sched_slice;
+	/*
+	 * Pick the run queue based on priority.
+	 */
+	if (td->td_priority <= PRI_MAX_REALTIME)
+		ts->ts_runq = &tdq->tdq_realtime;
+	else if (td->td_priority <= PRI_MAX_TIMESHARE)
+		ts->ts_runq = &tdq->tdq_timeshare;
+	else
+		ts->ts_runq = &tdq->tdq_idle;
 #ifdef SMP
-		if (ke->ke_ksegrp->kg_pri_class == PRI_IDLE)
-			if (kseq_idled(kseq) == 0)
-				goto restart;
-#endif
-		kseq_runq_rem(kseq, ke);
-		ke->ke_state = KES_THREAD;
-		ke->ke_flags &= ~KEF_PREEMPTED;
-		return (ke);
+	cpumask = 1 << ts->ts_cpu;
+	/*
+	 * If we had been idle, clear our bit in the group and potentially
+	 * the global bitmap.
+	 */
+	if ((class != PRI_IDLE && class != PRI_ITHD) &&
+	    (tdq->tdq_group->tdg_idlemask & cpumask) != 0) {
+		/*
+		 * Check to see if our group is unidling, and if so, remove it
+		 * from the global idle mask.
+		 */
+		if (tdq->tdq_group->tdg_idlemask ==
+		    tdq->tdq_group->tdg_cpumask)
+			atomic_clear_int(&tdq_idle, tdq->tdq_group->tdg_mask);
+		/*
+		 * Now remove ourselves from the group specific idle mask.
+		 */
+		tdq->tdq_group->tdg_idlemask &= ~cpumask;
 	}
-#ifdef SMP
-	if (kseq_idled(kseq) == 0)
-		goto restart;
+	if (td->td_priority < tdq->tdq_lowpri)
+		tdq->tdq_lowpri = td->td_priority;
 #endif
-	return (NULL);
+	tdq_runq_add(tdq, ts, flags);
+	tdq_load_add(tdq, ts);
 }
 
+/*
+ * Select the target thread queue and add a thread to it.  Request
+ * preemption or IPI a remote processor if required.
+ */
 void
 sched_add(struct thread *td, int flags)
 {
-	struct kseq *kseq;
-	struct ksegrp *kg;
-	struct kse *ke;
-	int preemptive;
-	int canmigrate;
-	int class;
-
+	struct td_sched *ts;
+	struct tdq *tdq;
+#ifdef SMP
+	int cpuid;
+	int cpu;
+#endif
 	CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)",
 	    td, td->td_proc->p_comm, td->td_priority, curthread,
 	    curthread->td_proc->p_comm);
-	mtx_assert(&sched_lock, MA_OWNED);
-	ke = td->td_kse;
-	kg = td->td_ksegrp;
-	canmigrate = 1;
-	preemptive = !(flags & SRQ_YIELDING);
-	class = PRI_BASE(kg->kg_pri_class);
-	kseq = KSEQ_SELF();
-	if ((ke->ke_flags & KEF_INTERNAL) == 0)
-		SLOT_USE(td->td_ksegrp);
-	ke->ke_flags &= ~KEF_INTERNAL;
-#ifdef SMP
-	if (ke->ke_flags & KEF_ASSIGNED) {
-		if (ke->ke_flags & KEF_REMOVED)
-			ke->ke_flags &= ~KEF_REMOVED;
-		return;
-	}
-	canmigrate = KSE_CAN_MIGRATE(ke);
-	/*
-	 * Don't migrate running threads here.  Force the long term balancer
-	 * to do it.
-	 */
-	if (ke->ke_flags & KEF_HOLD) {
-		ke->ke_flags &= ~KEF_HOLD;
-		canmigrate = 0;
-	}
-#endif
-	KASSERT(ke->ke_state != KES_ONRUNQ,
-	    ("sched_add: kse %p (%s) already in run queue", ke,
-	    ke->ke_proc->p_comm));
-	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
-	    ("sched_add: process swapped out"));
-	KASSERT(ke->ke_runq == NULL,
-	    ("sched_add: KSE %p is still assigned to a run queue", ke));
-	if (flags & SRQ_PREEMPTED)
-		ke->ke_flags |= KEF_PREEMPTED;
-	switch (class) {
-	case PRI_ITHD:
-	case PRI_REALTIME:
-		ke->ke_runq = kseq->ksq_curr;
-		ke->ke_slice = SCHED_SLICE_MAX;
-		if (canmigrate)
-			ke->ke_cpu = PCPU_GET(cpuid);
-		break;
-	case PRI_TIMESHARE:
-		if (SCHED_CURR(kg, ke))
-			ke->ke_runq = kseq->ksq_curr;
-		else
-			ke->ke_runq = kseq->ksq_next;
-		break;
-	case PRI_IDLE:
-		/*
-		 * This is for priority prop.
-		 */
-		if (ke->ke_thread->td_priority < PRI_MIN_IDLE)
-			ke->ke_runq = kseq->ksq_curr;
-		else
-			ke->ke_runq = &kseq->ksq_idle;
-		ke->ke_slice = SCHED_SLICE_MIN;
-		break;
-	default:
-		panic("Unknown pri class.");
-		break;
-	}
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
+	ts = td->td_sched;
+	/*
+	 * Recalculate the priority before we select the target cpu or
+	 * run-queue.
+	 */
+	if (PRI_BASE(td->td_pri_class) == PRI_TIMESHARE)
+		sched_priority(td);
 #ifdef SMP
+	cpuid = PCPU_GET(cpuid);
 	/*
-	 * If this thread is pinned or bound, notify the target cpu.
+	 * Pick the destination cpu and if it isn't ours transfer to the
+	 * target cpu.
 	 */
-	if (!canmigrate && ke->ke_cpu != PCPU_GET(cpuid) ) {
-		ke->ke_runq = NULL;
-		kseq_notify(ke, ke->ke_cpu);
+	if (td->td_priority <= PRI_MAX_ITHD && THREAD_CAN_MIGRATE(td))
+		cpu = cpuid;
+	else if (!THREAD_CAN_MIGRATE(td))
+		cpu = ts->ts_cpu;
+	else
+		cpu = sched_pickcpu(ts, flags);
+	tdq = sched_setcpu(ts, cpu, flags);
+	tdq_add(tdq, td, flags);
+	if (cpu != cpuid) {
+		tdq_notify(ts);
 		return;
 	}
+#else
+	tdq = TDQ_SELF();
+	TDQ_LOCK(tdq);
 	/*
-	 * If we had been idle, clear our bit in the group and potentially
-	 * the global bitmap.  If not, see if we should transfer this thread.
+	 * Now that the thread is moving to the run-queue, set the lock
+	 * to the scheduler's lock.
 	 */
-	if ((class == PRI_TIMESHARE || class == PRI_REALTIME) &&
-	    (kseq->ksq_group->ksg_idlemask & PCPU_GET(cpumask)) != 0) {
-		/*
-		 * Check to see if our group is unidling, and if so, remove it
-		 * from the global idle mask.
-		 */
-		if (kseq->ksq_group->ksg_idlemask ==
-		    kseq->ksq_group->ksg_cpumask)
-			atomic_clear_int(&kseq_idle, kseq->ksq_group->ksg_mask);
-		/*
-		 * Now remove ourselves from the group specific idle mask.
-		 */
-		kseq->ksq_group->ksg_idlemask &= ~PCPU_GET(cpumask);
-	} else if (canmigrate && kseq->ksq_load > 1 && class != PRI_ITHD)
-		if (kseq_transfer(kseq, ke, class))
-			return;
-	ke->ke_cpu = PCPU_GET(cpuid);
+	thread_lock_set(td, TDQ_LOCKPTR(tdq));
+	tdq_add(tdq, td, flags);
 #endif
-	if (td->td_priority < curthread->td_priority &&
-	    ke->ke_runq == kseq->ksq_curr)
-		curthread->td_flags |= TDF_NEEDRESCHED;
-	if (preemptive && maybe_preempt(td))
-		return;
-	ke->ke_state = KES_ONRUNQ;
-
-	kseq_runq_add(kseq, ke, flags);
-	kseq_load_add(kseq, ke);
+	if (!(flags & SRQ_YIELDING))
+		sched_setpreempt(td);
 }
 
+/*
+ * Remove a thread from a run-queue without running it.  This is used
+ * when we're stealing a thread from a remote queue.  Otherwise all threads
+ * exit by calling sched_exit_thread() and sched_throw() themselves.
+ */
 void
 sched_rem(struct thread *td)
 {
-	struct kseq *kseq;
-	struct kse *ke;
+	struct tdq *tdq;
+	struct td_sched *ts;
 
 	CTR5(KTR_SCHED, "sched_rem: %p(%s) prio %d by %p(%s)",
 	    td, td->td_proc->p_comm, td->td_priority, curthread,
 	    curthread->td_proc->p_comm);
-	mtx_assert(&sched_lock, MA_OWNED);
-	ke = td->td_kse;
-	SLOT_RELEASE(td->td_ksegrp);
-	ke->ke_flags &= ~KEF_PREEMPTED;
-	if (ke->ke_flags & KEF_ASSIGNED) {
-		ke->ke_flags |= KEF_REMOVED;
-		return;
-	}
-	KASSERT((ke->ke_state == KES_ONRUNQ),
-	    ("sched_rem: KSE not on run queue"));
-
-	ke->ke_state = KES_THREAD;
-	kseq = KSEQ_CPU(ke->ke_cpu);
-	kseq_runq_rem(kseq, ke);
-	kseq_load_rem(kseq, ke);
+	ts = td->td_sched;
+	tdq = TDQ_CPU(ts->ts_cpu);
+	TDQ_LOCK_ASSERT(tdq, MA_OWNED);
+	MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
+	KASSERT(TD_ON_RUNQ(td),
+	    ("sched_rem: thread not on run queue"));
+	tdq_runq_rem(tdq, ts);
+	tdq_load_rem(tdq, ts);
+	TD_SET_CAN_RUN(td);
 }
 
+/*
+ * Fetch cpu utilization information.  Updates on demand.
+ */
 fixpt_t
 sched_pctcpu(struct thread *td)
 {
 	fixpt_t pctcpu;
-	struct kse *ke;
+	struct td_sched *ts;
 
 	pctcpu = 0;
-	ke = td->td_kse;
-	if (ke == NULL)
+	ts = td->td_sched;
+	if (ts == NULL)
 		return (0);
 
-	mtx_lock_spin(&sched_lock);
-	if (ke->ke_ticks) {
+	thread_lock(td);
+	if (ts->ts_ticks) {
 		int rtick;
 
-		/*
-		 * Don't update more frequently than twice a second.  Allowing
-		 * this causes the cpu usage to decay away too quickly due to
-		 * rounding errors.
-		 */
-		if (ke->ke_ftick + SCHED_CPU_TICKS < ke->ke_ltick ||
-		    ke->ke_ltick < (ticks - (hz / 2)))
-			sched_pctcpu_update(ke);
+		sched_pctcpu_update(ts);
 		/* How many rtick per second ? */
-		rtick = min(ke->ke_ticks / SCHED_CPU_TIME, SCHED_CPU_TICKS);
-		pctcpu = (FSCALE * ((FSCALE * rtick)/realstathz)) >> FSHIFT;
+		rtick = min(SCHED_TICK_HZ(ts) / SCHED_TICK_SECS, hz);
+		pctcpu = (FSCALE * ((FSCALE * rtick)/hz)) >> FSHIFT;
 	}
-
-	ke->ke_proc->p_swtime = ke->ke_ltick - ke->ke_ftick;
-	mtx_unlock_spin(&sched_lock);
+	thread_unlock(td);
 
 	return (pctcpu);
 }
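A worked example of the fixed-point conversion in sched_pctcpu(), assuming FSCALE == 1 << FSHIFT with FSHIFT = 11 (the usual FreeBSD value, assumed here):

	/*
	 * fully busy thread: rtick == hz
	 *   pctcpu = (FSCALE * ((FSCALE * hz) / hz)) >> FSHIFT = FSCALE  (100%)
	 * half busy thread:  rtick == hz / 2
	 *   pctcpu = FSCALE / 2                                          (50%)
	 */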
 
+/*
+ * Bind a thread to a target cpu.
+ */
 void
 sched_bind(struct thread *td, int cpu)
 {
-	struct kse *ke;
+	struct td_sched *ts;
 
-	mtx_assert(&sched_lock, MA_OWNED);
-	ke = td->td_kse;
-	ke->ke_flags |= KEF_BOUND;
+	THREAD_LOCK_ASSERT(td, MA_OWNED|MA_NOTRECURSED);
+	ts = td->td_sched;
+	if (ts->ts_flags & TSF_BOUND)
+		sched_unbind(td);
+	ts->ts_flags |= TSF_BOUND;
 #ifdef SMP
+	sched_pin();
 	if (PCPU_GET(cpuid) == cpu)
 		return;
-	/* sched_rem without the runq_remove */
-	ke->ke_state = KES_THREAD;
-	kseq_load_rem(KSEQ_CPU(ke->ke_cpu), ke);
-	kseq_notify(ke, cpu);
+	ts->ts_cpu = cpu;
 	/* When we return from mi_switch we'll be on the correct cpu. */
 	mi_switch(SW_VOL, NULL);
 #endif
 }
 
+/*
+ * Release a bound thread.
+ */
 void
 sched_unbind(struct thread *td)
 {
-	mtx_assert(&sched_lock, MA_OWNED);
-	td->td_kse->ke_flags &= ~KEF_BOUND;
+	struct td_sched *ts;
+
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
+	ts = td->td_sched;
+	if ((ts->ts_flags & TSF_BOUND) == 0)
+		return;
+	ts->ts_flags &= ~TSF_BOUND;
+#ifdef SMP
+	sched_unpin();
+#endif
 }
 
 int
 sched_is_bound(struct thread *td)
 {
-	mtx_assert(&sched_lock, MA_OWNED);
-	return (td->td_kse->ke_flags & KEF_BOUND);
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
+	return (td->td_sched->ts_flags & TSF_BOUND);
+}
+
+/*
+ * Basic yield call.
+ */
+void
+sched_relinquish(struct thread *td)
+{
+	thread_lock(td);
+	SCHED_STAT_INC(switch_relinquish);
+	mi_switch(SW_VOL, NULL);
+	thread_unlock(td);
 }
 
+/*
+ * Return the total system load.
+ */
 int
 sched_load(void)
 {
@@ -1970,21 +2558,15 @@
 	int i;
 
 	total = 0;
-	for (i = 0; i <= ksg_maxid; i++)
-		total += KSEQ_GROUP(i)->ksg_load;
+	for (i = 0; i <= tdg_maxid; i++)
+		total += TDQ_GROUP(i)->tdg_load;
 	return (total);
 #else
-	return (KSEQ_SELF()->ksq_sysload);
+	return (TDQ_SELF()->tdq_sysload);
 #endif
 }
 
 int
-sched_sizeof_ksegrp(void)
-{
-	return (sizeof(struct ksegrp) + sizeof(struct kg_sched));
-}
-
-int
 sched_sizeof_proc(void)
 {
 	return (sizeof(struct proc));
@@ -1995,5 +2577,116 @@
 {
 	return (sizeof(struct thread) + sizeof(struct td_sched));
 }
+
+/*
+ * The actual idle process.
+ */
+void
+sched_idletd(void *dummy)
+{
+	struct thread *td;
+	struct tdq *tdq;
+
+	td = curthread;
+	tdq = TDQ_SELF();
+	mtx_assert(&Giant, MA_NOTOWNED);
+	/* ULE relies on preemption for idle interruption. */
+	for (;;) {
+#ifdef SMP
+		if (tdq_idled(tdq))
+			cpu_idle();
+#else
+		cpu_idle();
+#endif
+	}
+}
+
+/*
+ * A CPU is entering for the first time or a thread is exiting.
+ */
+void
+sched_throw(struct thread *td)
+{
+	struct thread *newtd;
+	struct tdq *tdq;
+
+	tdq = TDQ_SELF();
+	if (td == NULL) {
+		/* Correct spinlock nesting and acquire the correct lock. */
+		TDQ_LOCK(tdq);
+		spinlock_exit();
+	} else {
+		MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
+		tdq_load_rem(tdq, td->td_sched);
+	}
+	KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count"));
+	newtd = choosethread();
+	TDQ_LOCKPTR(tdq)->mtx_lock = (uintptr_t)newtd;
+	PCPU_SET(switchtime, cpu_ticks());
+	PCPU_SET(switchticks, ticks);
+	cpu_throw(td, newtd);		/* doesn't return */
+}
+
+/*
+ * This is called from fork_exit().  Just acquire the correct locks and
+ * let fork do the rest of the work.
+ */
+void
+sched_fork_exit(struct thread *td)
+{
+	struct td_sched *ts;
+	struct tdq *tdq;
+	int cpuid;
+
+	/*
+	 * Finish setting up thread glue so that it begins execution in a
+	 * non-nested critical section with the scheduler lock held.
+	 */
+	cpuid = PCPU_GET(cpuid);
+	tdq = TDQ_CPU(cpuid);
+	ts = td->td_sched;
+	if (TD_IS_IDLETHREAD(td))
+		td->td_lock = TDQ_LOCKPTR(tdq);
+	MPASS(td->td_lock == TDQ_LOCKPTR(tdq));
+	td->td_oncpu = cpuid;
+	TDQ_LOCK_ASSERT(tdq, MA_OWNED | MA_NOTRECURSED);
+}
+
+static SYSCTL_NODE(_kern, OID_AUTO, sched, CTLFLAG_RW, 0,
+    "Scheduler");
+SYSCTL_STRING(_kern_sched, OID_AUTO, name, CTLFLAG_RD, "ULE", 0,
+    "Scheduler name");
+SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0,
+    "Slice size for timeshare threads");
+SYSCTL_INT(_kern_sched, OID_AUTO, interact, CTLFLAG_RW, &sched_interact, 0,
+     "Interactivity score threshold");
+SYSCTL_INT(_kern_sched, OID_AUTO, preempt_thresh, CTLFLAG_RW, &preempt_thresh,
+     0,"Min priority for preemption, lower priorities have greater precedence");
+#ifdef SMP
+SYSCTL_INT(_kern_sched, OID_AUTO, pick_pri, CTLFLAG_RW, &pick_pri, 0,
+    "Pick the target cpu based on priority rather than load.");
+SYSCTL_INT(_kern_sched, OID_AUTO, affinity, CTLFLAG_RW, &affinity, 0,
+    "Number of hz ticks to keep thread affinity for");
+SYSCTL_INT(_kern_sched, OID_AUTO, tryself, CTLFLAG_RW, &tryself, 0, "");
+SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &rebalance, 0,
+    "Enables the long-term load balancer");
+SYSCTL_INT(_kern_sched, OID_AUTO, balance_interval, CTLFLAG_RW,
+    &balance_interval, 0,
+    "Average frequency in stathz ticks to run the long-term balancer");
+SYSCTL_INT(_kern_sched, OID_AUTO, steal_htt, CTLFLAG_RW, &steal_htt, 0,
+    "Steals work from another hyper-threaded core on idle");
+SYSCTL_INT(_kern_sched, OID_AUTO, steal_idle, CTLFLAG_RW, &steal_idle, 0,
+    "Attempts to steal work from other cores before idling");
+SYSCTL_INT(_kern_sched, OID_AUTO, steal_thresh, CTLFLAG_RW, &steal_thresh, 0,
+    "Minimum load on remote cpu before we'll steal");
+SYSCTL_INT(_kern_sched, OID_AUTO, topology, CTLFLAG_RD, &topology, 0,
+    "True when a topology has been specified by the MD code.");
+#endif
+
+/* ps compat.  All cpu percentages from ULE are weighted. */
+static int ccpu = 0;
+SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
+
+
 #define KERN_SWITCH_INCLUDE 1
 #include "kern/kern_switch.c"
Index: uipc_mbuf.c
===================================================================
RCS file: /home/cvs/src/sys/kern/uipc_mbuf.c,v
retrieving revision 1.5
retrieving revision 1.6
diff -L sys/kern/uipc_mbuf.c -L sys/kern/uipc_mbuf.c -u -r1.5 -r1.6
--- sys/kern/uipc_mbuf.c
+++ sys/kern/uipc_mbuf.c
@@ -30,7 +30,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/uipc_mbuf.c,v 1.148.2.6 2006/03/23 23:24:32 sam Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/uipc_mbuf.c,v 1.174 2007/10/06 21:42:39 kmacy Exp $");
 
 #include "opt_mac.h"
 #include "opt_param.h"
@@ -41,7 +41,6 @@
 #include <sys/kernel.h>
 #include <sys/limits.h>
 #include <sys/lock.h>
-#include <sys/mac.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/sysctl.h>
@@ -49,6 +48,8 @@
 #include <sys/protosw.h>
 #include <sys/uio.h>
 
+#include <security/mac/mac_framework.h>
+
 int	max_linkhdr;
 int	max_protohdr;
 int	max_hdr;
@@ -64,7 +65,6 @@
 /*
  * sysctl(8) exported objects
  */
-SYSCTL_DECL(_kern_ipc);
 SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RD,
 	   &max_linkhdr, 0, "Size of largest link layer header");
 SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RD,
@@ -87,11 +87,6 @@
 #endif
 
 /*
- * Malloc-type for external ext_buf ref counts.
- */
-static MALLOC_DEFINE(M_MBUF, "mbextcnt", "mbuf external ref counts");
-
-/*
  * Allocate a given length worth of mbufs and/or clusters (whatever fits
  * best) and return a pointer to the top of the allocated chain.  If an
  * existing mbuf chain is provided, then we will append the new chain
@@ -99,61 +94,61 @@
  * chain.
  */
 struct mbuf *
-m_getm(struct mbuf *m, int len, int how, short type)
+m_getm2(struct mbuf *m, int len, int how, short type, int flags)
 {
-	struct mbuf *mb, *top, *cur, *mtail;
-	int num, rem;
-	int i;
+	struct mbuf *mb, *nm = NULL, *mtail = NULL;
 
-	KASSERT(len >= 0, ("m_getm(): len is < 0"));
+	KASSERT(len >= 0, ("%s: len is < 0", __func__));
 
-	/* If m != NULL, we will append to the end of that chain. */
-	if (m != NULL)
-		for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next);
-	else
-		mtail = NULL;
+	/* Validate flags. */
+	flags &= (M_PKTHDR | M_EOR);
 
-	/*
-	 * Calculate how many mbufs+clusters ("packets") we need and how much
-	 * leftover there is after that and allocate the first mbuf+cluster
-	 * if required.
-	 */
-	num = len / MCLBYTES;
-	rem = len % MCLBYTES;
-	top = cur = NULL;
-	if (num > 0) {
-		if ((top = cur = m_getcl(how, type, 0)) == NULL)
-			goto failed;
-		top->m_len = 0;
-	}
-	num--;
-
-	for (i = 0; i < num; i++) {
-		mb = m_getcl(how, type, 0);
-		if (mb == NULL)
-			goto failed;
-		mb->m_len = 0;
-		cur = (cur->m_next = mb);
-	}
-	if (rem > 0) {
-		mb = (rem > MINCLSIZE) ?
-		    m_getcl(how, type, 0) : m_get(how, type);
-		if (mb == NULL)
-			goto failed;
-		mb->m_len = 0;
-		if (cur == NULL)
-			top = mb;
+	/* Packet header mbuf must be first in chain. */
+	if ((flags & M_PKTHDR) && m != NULL)
+		flags &= ~M_PKTHDR;
+
+	/* Loop and append maximum sized mbufs to the chain tail. */
+	while (len > 0) {
+		if (len > MCLBYTES)
+			mb = m_getjcl(how, type, (flags & M_PKTHDR),
+			    MJUMPAGESIZE);
+		else if (len >= MINCLSIZE)
+			mb = m_getcl(how, type, (flags & M_PKTHDR));
+		else if (flags & M_PKTHDR)
+			mb = m_gethdr(how, type);
 		else
-			cur->m_next = mb;
-	}
+			mb = m_get(how, type);
 
-	if (mtail != NULL)
-		mtail->m_next = top;
-	return top;
-failed:
-	if (top != NULL)
-		m_freem(top);
-	return NULL;
+		/* Fail the whole operation if one mbuf can't be allocated. */
+		if (mb == NULL) {
+			if (nm != NULL)
+				m_freem(nm);
+			return (NULL);
+		}
+
+		/* Book keeping. */
+		len -= (mb->m_flags & M_EXT) ? mb->m_ext.ext_size :
+			((mb->m_flags & M_PKTHDR) ? MHLEN : MLEN);
+		if (mtail != NULL)
+			mtail->m_next = mb;
+		else
+			nm = mb;
+		mtail = mb;
+		flags &= ~M_PKTHDR;	/* Only valid on the first mbuf. */
+	}
+	if (flags & M_EOR)
+		mtail->m_flags |= M_EOR;  /* Only valid on the last mbuf. */
+
+	/* If mbuf was supplied, append new chain to the end of it. */
+	if (m != NULL) {
+		for (mtail = m; mtail->m_next != NULL; mtail = mtail->m_next)
+			;
+		mtail->m_next = nm;
+		mtail->m_flags &= ~M_EOR;
+	} else
+		m = nm;
+
+	return (m);
 }
 
 /*
@@ -193,16 +188,10 @@
 m_extadd(struct mbuf *mb, caddr_t buf, u_int size,
     void (*freef)(void *, void *), void *args, int flags, int type)
 {
-	u_int *ref_cnt = NULL;
+	KASSERT(type != EXT_CLUSTER, ("%s: EXT_CLUSTER not allowed", __func__));
 
-	/* XXX Shouldn't be adding EXT_CLUSTER with this API */
-	if (type == EXT_CLUSTER)
-		ref_cnt = (u_int *)uma_find_refcnt(zone_clust,
-		    mb->m_ext.ext_buf);
-	else if (type == EXT_EXTREF)
-		ref_cnt = __DEVOLATILE(u_int *, mb->m_ext.ref_cnt);
-	mb->m_ext.ref_cnt = (ref_cnt == NULL) ?
-	    malloc(sizeof(u_int), M_MBUF, M_NOWAIT) : (u_int *)ref_cnt;
+	if (type != EXT_EXTREF)
+		mb->m_ext.ref_cnt = (u_int *)uma_zalloc(zone_ext_refcnt, M_NOWAIT);
 	if (mb->m_ext.ref_cnt != NULL) {
 		*(mb->m_ext.ref_cnt) = 1;
 		mb->m_flags |= (M_EXT | flags);
@@ -217,45 +206,33 @@
 
 /*
  * Non-directly-exported function to clean up after mbufs with M_EXT
- * storage attached to them if the reference count hits 0.
+ * storage attached to them if the reference count hits 1.
  */
 void
 mb_free_ext(struct mbuf *m)
 {
-	u_int cnt;
-	int dofree;
+	int skipmbuf;
+	
+	KASSERT((m->m_flags & M_EXT) == M_EXT, ("%s: M_EXT not set", __func__));
+	KASSERT(m->m_ext.ref_cnt != NULL, ("%s: ref_cnt not set", __func__));
 
-	/* Account for lazy ref count assign. */
-	if (m->m_ext.ref_cnt == NULL)
-		dofree = 1;
-	else
-		dofree = 0;
 
 	/*
-	 * This is tricky.  We need to make sure to decrement the
-	 * refcount in a safe way but to also clean up if we're the
-	 * last reference.  This method seems to do it without race.
-	 */
-	while (dofree == 0) {
-		cnt = *(m->m_ext.ref_cnt);
-		if (atomic_cmpset_int(m->m_ext.ref_cnt, cnt, cnt - 1)) {
-			if (cnt == 1)
-				dofree = 1;
-			break;
-		}
-	}
-
-	if (dofree) {
-		/*
-		 * Do the free, should be safe.
-		 */
+	 * check if the header is embedded in the cluster
+	 */     
+	skipmbuf = (m->m_flags & M_NOFREE);
+	
+	/* Free attached storage if this mbuf is the only reference to it. */
+	if (*(m->m_ext.ref_cnt) == 1 ||
+	    atomic_fetchadd_int(m->m_ext.ref_cnt, -1) == 1) {
 		switch (m->m_ext.ext_type) {
-		case EXT_PACKET:
+		case EXT_PACKET:	/* The packet zone is special. */
+			if (*(m->m_ext.ref_cnt) == 0)
+				*(m->m_ext.ref_cnt) = 1;
 			uma_zfree(zone_pack, m);
-			return;
+			return;		/* Job done. */
 		case EXT_CLUSTER:
 			uma_zfree(zone_clust, m->m_ext.ext_buf);
-			m->m_ext.ext_buf = NULL;
 			break;
 		case EXT_JUMBOP:
 			uma_zfree(zone_jumbop, m->m_ext.ext_buf);
@@ -266,24 +243,180 @@
 		case EXT_JUMBO16:
 			uma_zfree(zone_jumbo16, m->m_ext.ext_buf);
 			break;
-		default:
+		case EXT_SFBUF:
+		case EXT_NET_DRV:
+		case EXT_MOD_TYPE:
+		case EXT_DISPOSABLE:
+			*(m->m_ext.ref_cnt) = 0;
+			uma_zfree(zone_ext_refcnt, __DEVOLATILE(u_int *,
+				m->m_ext.ref_cnt));
+			/* FALLTHROUGH */
+		case EXT_EXTREF:
 			KASSERT(m->m_ext.ext_free != NULL,
-			    ("%s: external free pointer not set", __func__));
+				("%s: ext_free not set", __func__));
 			(*(m->m_ext.ext_free))(m->m_ext.ext_buf,
 			    m->m_ext.ext_args);
-			if (m->m_ext.ext_type != EXT_EXTREF) {
-				if (m->m_ext.ref_cnt != NULL)
-					free(__DEVOLATILE(u_int *,
-					    m->m_ext.ref_cnt), M_MBUF);
-				m->m_ext.ref_cnt = NULL;
-			}
-			m->m_ext.ext_buf = NULL;
+			break;
+		default:
+			KASSERT(m->m_ext.ext_type == 0,
+				("%s: unknown ext_type", __func__));
 		}
 	}
+	if (skipmbuf)
+		return;
+	
+	/*
+	 * Free this mbuf back to the mbuf zone with all m_ext
+	 * information purged.
+	 */
+	m->m_ext.ext_buf = NULL;
+	m->m_ext.ext_free = NULL;
+	m->m_ext.ext_args = NULL;
+	m->m_ext.ref_cnt = NULL;
+	m->m_ext.ext_size = 0;
+	m->m_ext.ext_type = 0;
+	m->m_flags &= ~M_EXT;
 	uma_zfree(zone_mbuf, m);
 }
 
 /*
+ * Attach the cluster from *m to *n, set up m_ext in *n
+ * and bump the refcount of the cluster.
+ */
+static void
+mb_dupcl(struct mbuf *n, struct mbuf *m)
+{
+	KASSERT((m->m_flags & M_EXT) == M_EXT, ("%s: M_EXT not set", __func__));
+	KASSERT(m->m_ext.ref_cnt != NULL, ("%s: ref_cnt not set", __func__));
+	KASSERT((n->m_flags & M_EXT) == 0, ("%s: M_EXT set", __func__));
+
+	if (*(m->m_ext.ref_cnt) == 1)
+		*(m->m_ext.ref_cnt) += 1;
+	else
+		atomic_add_int(m->m_ext.ref_cnt, 1);
+	n->m_ext.ext_buf = m->m_ext.ext_buf;
+	n->m_ext.ext_free = m->m_ext.ext_free;
+	n->m_ext.ext_args = m->m_ext.ext_args;
+	n->m_ext.ext_size = m->m_ext.ext_size;
+	n->m_ext.ref_cnt = m->m_ext.ref_cnt;
+	n->m_ext.ext_type = m->m_ext.ext_type;
+	n->m_flags |= M_EXT;
+}
+
+/*
+ * Clean up mbuf (chain) from any tags and packet headers.
+ * If "all" is set then the first mbuf in the chain will be
+ * cleaned too.
+ */
+void
+m_demote(struct mbuf *m0, int all)
+{
+	struct mbuf *m;
+
+	for (m = all ? m0 : m0->m_next; m != NULL; m = m->m_next) {
+		if (m->m_flags & M_PKTHDR) {
+			m_tag_delete_chain(m, NULL);
+			m->m_flags &= ~M_PKTHDR;
+			bzero(&m->m_pkthdr, sizeof(struct pkthdr));
+		}
+		if (m->m_type == MT_HEADER)
+			m->m_type = MT_DATA;
+		if (m != m0 && m->m_nextpkt != NULL)
+			m->m_nextpkt = NULL;
+		m->m_flags = m->m_flags & (M_EXT|M_EOR|M_RDONLY|M_FREELIST);
+	}
+}
+
+/*
+ * Sanity checks on mbuf (chain) for use in KASSERT() and general
+ * debugging.
+ * Returns 0 or panics when bad and 1 on all tests passed.
+ * Sanitize, 0 to run M_SANITY_ACTION, 1 to garble things so they
+ * blow up later.
+ */
+int
+m_sanity(struct mbuf *m0, int sanitize)
+{
+	struct mbuf *m;
+	caddr_t a, b;
+	int pktlen = 0;
+
+#ifdef INVARIANTS
+#define	M_SANITY_ACTION(s)	panic("mbuf %p: " s, m)
+#else 
+#define	M_SANITY_ACTION(s)	printf("mbuf %p: " s, m)
+#endif
+
+	for (m = m0; m != NULL; m = m->m_next) {
+		/*
+		 * Basic pointer checks.  If any of these fails then some
+		 * unrelated kernel memory before or after us is trashed.
+		 * No way to recover from that.
+		 */
+		a = ((m->m_flags & M_EXT) ? m->m_ext.ext_buf :
+			((m->m_flags & M_PKTHDR) ? (caddr_t)(&m->m_pktdat) :
+			 (caddr_t)(&m->m_dat)) );
+		b = (caddr_t)(a + (m->m_flags & M_EXT ? m->m_ext.ext_size :
+			((m->m_flags & M_PKTHDR) ? MHLEN : MLEN)));
+		if ((caddr_t)m->m_data < a)
+			M_SANITY_ACTION("m_data outside mbuf data range left");
+		if ((caddr_t)m->m_data > b)
+			M_SANITY_ACTION("m_data outside mbuf data range right");
+		if ((caddr_t)m->m_data + m->m_len > b)
+			M_SANITY_ACTION("m_data + m_len exeeds mbuf space");
+		if ((m->m_flags & M_PKTHDR) && m->m_pkthdr.header) {
+			if ((caddr_t)m->m_pkthdr.header < a ||
+			    (caddr_t)m->m_pkthdr.header > b)
+				M_SANITY_ACTION("m_pkthdr.header outside mbuf data range");
+		}
+
+		/* m->m_nextpkt may only be set on first mbuf in chain. */
+		if (m != m0 && m->m_nextpkt != NULL) {
+			if (sanitize) {
+				m_freem(m->m_nextpkt);
+				m->m_nextpkt = (struct mbuf *)0xDEADC0DE;
+			} else
+				M_SANITY_ACTION("m->m_nextpkt on in-chain mbuf");
+		}
+
+		/* packet length (not mbuf length!) calculation */
+		if (m0->m_flags & M_PKTHDR)
+			pktlen += m->m_len;
+
+		/* m_tags may only be attached to first mbuf in chain. */
+		if (m != m0 && m->m_flags & M_PKTHDR &&
+		    !SLIST_EMPTY(&m->m_pkthdr.tags)) {
+			if (sanitize) {
+				m_tag_delete_chain(m, NULL);
+				/* put in 0xDEADC0DE perhaps? */
+			} else
+				M_SANITY_ACTION("m_tags on in-chain mbuf");
+		}
+
+		/* M_PKTHDR may only be set on first mbuf in chain */
+		if (m != m0 && m->m_flags & M_PKTHDR) {
+			if (sanitize) {
+				bzero(&m->m_pkthdr, sizeof(m->m_pkthdr));
+				m->m_flags &= ~M_PKTHDR;
+				/* put in 0xDEADCODE and leave hdr flag in */
+			} else
+				M_SANITY_ACTION("M_PKTHDR on in-chain mbuf");
+		}
+	}
+	m = m0;
+	if (pktlen && pktlen != m->m_pkthdr.len) {
+		if (sanitize)
+			m->m_pkthdr.len = 0;
+		else
+			M_SANITY_ACTION("m_pkthdr.len != mbuf chain length");
+	}
+	return 1;
+
+#undef	M_SANITY_ACTION
+}
+
+
+/*
  * "Move" mbuf pkthdr from "from" to "to".
  * "from" must have M_PKTHDR set, and "to" must be empty.
  */
@@ -369,8 +502,13 @@
 		M_MOVE_PKTHDR(mn, m);
 	mn->m_next = m;
 	m = mn;
-	if (len < MHLEN)
-		MH_ALIGN(m, len);
+	if(m->m_flags & M_PKTHDR) {
+		if (len < MHLEN)
+			MH_ALIGN(m, len);
+	} else {
+		if (len < MLEN) 
+			M_ALIGN(m, len);
+	}
 	m->m_len = len;
 	return (m);
 }
@@ -429,10 +567,7 @@
 		n->m_len = min(len, m->m_len - off);
 		if (m->m_flags & M_EXT) {
 			n->m_data = m->m_data + off;
-			n->m_ext = m->m_ext;
-			n->m_flags |= M_EXT;
-			MEXT_ADD_REF(m);
-			n->m_ext.ref_cnt = m->m_ext.ref_cnt;
+			mb_dupcl(n, m);
 		} else
 			bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t),
 			    (u_int)n->m_len);
@@ -453,6 +588,154 @@
 }
 
 /*
+ * Returns mbuf chain with new head for the prepending case.
+ * Copies from mbuf (chain) n from off for len to mbuf (chain) m
+ * either prepending or appending the data.
+ * The resulting mbuf (chain) m is fully writeable.
+ * m is destination (is made writeable)
+ * n is source, off is offset in source, len is len from offset
+ * dir, 0 append, 1 prepend
+ * how, wait or nowait
+ */
+
+static int
+m_bcopyxxx(void *s, void *t, u_int len)
+{
+	bcopy(s, t, (size_t)len);
+	return 0;
+}
+
+struct mbuf *
+m_copymdata(struct mbuf *m, struct mbuf *n, int off, int len,
+    int prep, int how)
+{
+	struct mbuf *mm, *x, *z, *prev = NULL;
+	caddr_t p;
+	int i, nlen = 0;
+	caddr_t buf[MLEN];
+
+	KASSERT(m != NULL && n != NULL, ("m_copymdata, no target or source"));
+	KASSERT(off >= 0, ("m_copymdata, negative off %d", off));
+	KASSERT(len >= 0, ("m_copymdata, negative len %d", len));
+	KASSERT(prep == 0 || prep == 1, ("m_copymdata, unknown direction %d", prep));
+
+	mm = m;
+	if (!prep) {
+		while(mm->m_next) {
+			prev = mm;
+			mm = mm->m_next;
+		}
+	}
+	for (z = n; z != NULL; z = z->m_next)
+		nlen += z->m_len;
+	if (len == M_COPYALL)
+		len = nlen - off;
+	if (off + len > nlen || len < 1)
+		return NULL;
+
+	if (!M_WRITABLE(mm)) {
+		/* XXX: Use proper m_xxx function instead. */
+		x = m_getcl(how, MT_DATA, mm->m_flags);
+		if (x == NULL)
+			return NULL;
+		bcopy(mm->m_ext.ext_buf, x->m_ext.ext_buf, x->m_ext.ext_size);
+		p = x->m_ext.ext_buf + (mm->m_data - mm->m_ext.ext_buf);
+		x->m_data = p;
+		mm->m_next = NULL;
+		if (mm != m)
+			prev->m_next = x;
+		m_free(mm);
+		mm = x;
+	}
+
+	/*
+	 * Append/prepend the data.  Allocating mbufs as necessary.
+	 */
+	/* Shortcut if enough free space in first/last mbuf. */
+	if (!prep && M_TRAILINGSPACE(mm) >= len) {
+		m_apply(n, off, len, m_bcopyxxx, mtod(mm, caddr_t) +
+			 mm->m_len);
+		mm->m_len += len;
+		mm->m_pkthdr.len += len;
+		return m;
+	}
+	if (prep && M_LEADINGSPACE(mm) >= len) {
+		mm->m_data = mtod(mm, caddr_t) - len;
+		m_apply(n, off, len, m_bcopyxxx, mtod(mm, caddr_t));
+		mm->m_len += len;
+		mm->m_pkthdr.len += len;
+		return mm;
+	}
+
+	/* Expand first/last mbuf to cluster if possible. */
+	if (!prep && !(mm->m_flags & M_EXT) && len > M_TRAILINGSPACE(mm)) {
+		bcopy(mm->m_data, &buf, mm->m_len);
+		m_clget(mm, how);
+		if (!(mm->m_flags & M_EXT))
+			return NULL;
+		bcopy(&buf, mm->m_ext.ext_buf, mm->m_len);
+		mm->m_data = mm->m_ext.ext_buf;
+		mm->m_pkthdr.header = NULL;
+	}
+	if (prep && !(mm->m_flags & M_EXT) && len > M_LEADINGSPACE(mm)) {
+		bcopy(mm->m_data, &buf, mm->m_len);
+		m_clget(mm, how);
+		if (!(mm->m_flags & M_EXT))
+			return NULL;
+		bcopy(&buf, (caddr_t *)mm->m_ext.ext_buf +
+		       mm->m_ext.ext_size - mm->m_len, mm->m_len);
+		mm->m_data = (caddr_t)mm->m_ext.ext_buf +
+			      mm->m_ext.ext_size - mm->m_len;
+		mm->m_pkthdr.header = NULL;
+	}
+
+	/* Append/prepend as many mbuf (clusters) as necessary to fit len. */
+	if (!prep && len > M_TRAILINGSPACE(mm)) {
+		if (!m_getm(mm, len - M_TRAILINGSPACE(mm), how, MT_DATA))
+			return NULL;
+	}
+	if (prep && len > M_LEADINGSPACE(mm)) {
+		if (!(z = m_getm(NULL, len - M_LEADINGSPACE(mm), how, MT_DATA)))
+			return NULL;
+		i = 0;
+		for (x = z; x != NULL; x = x->m_next) {
+			i += x->m_flags & M_EXT ? x->m_ext.ext_size :
+			      (x->m_flags & M_PKTHDR ? MHLEN : MLEN);
+			if (!x->m_next)
+				break;
+		}
+		z->m_data += i - len;
+		m_move_pkthdr(mm, z);
+		x->m_next = mm;
+		mm = z;
+	}
+
+	/* Seek to start position in source mbuf. Optimization for long chains. */
+	while (off > 0) {
+		if (off < n->m_len)
+			break;
+		off -= n->m_len;
+		n = n->m_next;
+	}
+
+	/* Copy data into target mbuf. */
+	z = mm;
+	while (len > 0) {
+		KASSERT(z != NULL, ("m_copymdata, falling off target edge"));
+		i = M_TRAILINGSPACE(z);
+		m_apply(n, off, i, m_bcopyxxx, mtod(z, caddr_t) + z->m_len);
+		z->m_len += i;
+		/* fixup pkthdr.len if necessary */
+		if ((prep ? mm : m)->m_flags & M_PKTHDR)
+			(prep ? mm : m)->m_pkthdr.len += i;
+		off += i;
+		len -= i;
+		z = z->m_next;
+	}
+	return (prep ? mm : m);
+}
+
+/*
  * Copy an entire packet, including header (which must be present).
  * An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'.
  * Note that the copy is read-only, because clusters are not copied,
@@ -477,10 +760,7 @@
 	n->m_len = m->m_len;
 	if (m->m_flags & M_EXT) {
 		n->m_data = m->m_data;
-		n->m_ext = m->m_ext;
-		n->m_flags |= M_EXT;
-		MEXT_ADD_REF(m);
-		n->m_ext.ref_cnt = m->m_ext.ref_cnt;
+		mb_dupcl(n, m);
 	} else {
 		n->m_data = n->m_pktdat + (m->m_data - m->m_pktdat );
 		bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
@@ -498,10 +778,7 @@
 		n->m_len = m->m_len;
 		if (m->m_flags & M_EXT) {
 			n->m_data = m->m_data;
-			n->m_ext = m->m_ext;
-			n->m_flags |= M_EXT;
-			MEXT_ADD_REF(m);
-			n->m_ext.ref_cnt = m->m_ext.ref_cnt;
+			mb_dupcl(n, m);
 		} else {
 			bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
 		}
@@ -885,11 +1162,8 @@
 	}
 extpacket:
 	if (m->m_flags & M_EXT) {
-		n->m_flags |= M_EXT;
-		n->m_ext = m->m_ext;
-		MEXT_ADD_REF(m);
-		n->m_ext.ref_cnt = m->m_ext.ref_cnt;
 		n->m_data = m->m_data + len;
+		mb_dupcl(n, m);
 	} else {
 		bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain);
 	}
@@ -906,7 +1180,7 @@
  */
 struct mbuf *
 m_devget(char *buf, int totlen, int off, struct ifnet *ifp,
-	 void (*copy)(char *from, caddr_t to, u_int len))
+    void (*copy)(char *from, caddr_t to, u_int len))
 {
 	struct mbuf *m;
 	struct mbuf *top = NULL, **mp = &top;
@@ -1143,7 +1417,7 @@
 		    "\11proto5\10proto4\7proto3\6proto2\5proto1\4rdonly"
 		    "\3eor\2pkthdr\1ext", pdata ? "" : "\n");
 		if (pdata)
-			printf(", %*D\n", m2->m_len, (u_char *)m2->m_data, "-");
+			printf(", %*D\n", pdata, (u_char *)m2->m_data, "-");
 		if (len != -1)
 			len -= m2->m_len;
 		m2 = m2->m_next;
@@ -1347,55 +1621,61 @@
 
 #endif
 
+/*
+ * Copy the contents of uio into a properly sized mbuf chain.
+ */
 struct mbuf *
-m_uiotombuf(struct uio *uio, int how, int len, int align)
+m_uiotombuf(struct uio *uio, int how, int len, int align, int flags)
 {
-	struct mbuf *m_new = NULL, *m_final = NULL;
-	int progress = 0, error = 0, length, total;
+	struct mbuf *m, *mb;
+	int error, length, total;
+	int progress = 0;
 
+	/*
+	 * len can be zero or an arbitrary large value bound by
+	 * the total data supplied by the uio.
+	 */
 	if (len > 0)
 		total = min(uio->uio_resid, len);
 	else
 		total = uio->uio_resid;
+
+	/*
+	 * The smallest unit returned by m_getm2() is a single mbuf
+	 * with pkthdr.  We can't align past it.  Align align itself.
+	 */
+	if (align)
+		align &= ~(sizeof(long) - 1);
 	if (align >= MHLEN)
-		goto nospace;
-	if (total + align > MHLEN)
-		m_final = m_getcl(how, MT_DATA, M_PKTHDR);
-	else
-		m_final = m_gethdr(how, MT_DATA);
-	if (m_final == NULL)
-		goto nospace;
-	m_final->m_data += align;
-	m_new = m_final;
-	while (progress < total) {
-		length = total - progress;
-		if (length > MCLBYTES)
-			length = MCLBYTES;
-		if (m_new == NULL) {
-			if (length > MLEN)
-				m_new = m_getcl(how, MT_DATA, 0);
-			else
-				m_new = m_get(how, MT_DATA);
-			if (m_new == NULL)
-				goto nospace;
+		return (NULL);
+
+	/*
+	 * Give us the full allocation or nothing.
+	 * If len is zero return the smallest empty mbuf.
+	 */
+	m = m_getm2(NULL, max(total + align, 1), how, MT_DATA, flags);
+	if (m == NULL)
+		return (NULL);
+	m->m_data += align;
+
+	/* Fill all mbufs with uio data and update header information. */
+	for (mb = m; mb != NULL; mb = mb->m_next) {
+		length = min(M_TRAILINGSPACE(mb), total - progress);
+
+		error = uiomove(mtod(mb, void *), length, uio);
+		if (error) {
+			m_freem(m);
+			return (NULL);
 		}
-		error = uiomove(mtod(m_new, void *), length, uio);
-		if (error)
-			goto nospace;
+
+		mb->m_len = length;
 		progress += length;
-		m_new->m_len = length;
-		if (m_new != m_final)
-			m_cat(m_final, m_new);
-		m_new = NULL;
+		if (flags & M_PKTHDR)
+			m->m_pkthdr.len += length;
 	}
-	m_fixhdr(m_final);
-	return (m_final);
-nospace:
-	if (m_new)
-		m_free(m_new);
-	if (m_final)
-		m_freem(m_final);
-	return (NULL);
+	KASSERT(progress == total, ("%s: progress != total", __func__));
+
+	return (m);
 }
 
 /*
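
The rework above replaces m_getm() with m_getm2(): it walks the remaining length down from
page-sized jumbo clusters (MJUMPAGESIZE) to regular clusters to plain mbufs, applies
M_PKTHDR only to the first mbuf, and frees the partial chain if any allocation fails.  A
minimal kernel-side sketch of a caller (the wrapper name is hypothetical):

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>

/*
 * Hypothetical helper: allocate a packet-header mbuf chain big enough to
 * hold "len" bytes without sleeping.  The caller copies its data in and
 * sets m_len/m_pkthdr.len afterwards.
 */
static struct mbuf *
alloc_pkt_chain(int len)
{
	struct mbuf *m;

	m = m_getm2(NULL, len, M_DONTWAIT, MT_DATA, M_PKTHDR);
	if (m == NULL)
		return (NULL);		/* the whole allocation failed */
	return (m);
}
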
Index: kern_ktr.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_ktr.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/kern_ktr.c -L sys/kern/kern_ktr.c -u -r1.1.1.1 -r1.2
--- sys/kern/kern_ktr.c
+++ sys/kern/kern_ktr.c
@@ -33,7 +33,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_ktr.c,v 1.48 2005/06/10 23:21:29 jeff Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_ktr.c,v 1.53 2006/09/09 16:09:01 rwatson Exp $");
 
 #include "opt_ddb.h"
 #include "opt_ktr.h"
@@ -55,8 +55,10 @@
 #include <machine/ktr.h>
 #endif
 
-
+#ifdef DDB
 #include <ddb/ddb.h>
+#include <ddb/db_output.h>
+#endif
 
 #ifndef KTR_ENTRIES
 #define	KTR_ENTRIES	1024
@@ -100,6 +102,26 @@
 volatile int	ktr_idx = 0;
 struct	ktr_entry ktr_buf[KTR_ENTRIES];
 
+static int
+sysctl_debug_ktr_clear(SYSCTL_HANDLER_ARGS)
+{
+	int clear, error;
+
+	clear = 0;
+	error = sysctl_handle_int(oidp, &clear, 0, req);
+	if (error || !req->newptr)
+		return (error);
+
+	if (clear) {
+		bzero(ktr_buf, sizeof(ktr_buf));
+		ktr_idx = 0;
+	}
+
+	return (error);
+}
+SYSCTL_PROC(_debug_ktr, OID_AUTO, clear, CTLTYPE_INT|CTLFLAG_RW, 0, 0,
+    sysctl_debug_ktr_clear, "I", "Clear KTR Buffer");
+
 #ifdef KTR_VERBOSE
 int	ktr_verbose = KTR_VERBOSE;
 TUNABLE_INT("debug.ktr.verbose", &ktr_verbose);
@@ -134,21 +156,17 @@
 
 	enable = ktr_alq_enabled;
 
-        error = sysctl_handle_int(oidp, &enable, 0, req);
-        if (error || !req->newptr)
-                return (error);
+	error = sysctl_handle_int(oidp, &enable, 0, req);
+	if (error || !req->newptr)
+		return (error);
 
 	if (enable) {
 		if (ktr_alq_enabled)
 			return (0);
-		error = suser(curthread);
-		if (error)
-			return (error);
 		error = alq_open(&ktr_alq, (const char *)ktr_alq_file,
 		    req->td->td_ucred, ALQ_DEFAULT_CMODE,
 		    sizeof(struct ktr_entry), ktr_alq_depth);
 		if (error == 0) {
-			ktr_mask &= ~KTR_ALQ_MASK;
 			ktr_alq_cnt = 0;
 			ktr_alq_failed = 0;
 			ktr_alq_enabled = 1;
@@ -269,22 +287,17 @@
 
 DB_SHOW_COMMAND(ktr, db_ktr_all)
 {
-	int quit;
 	
-	quit = 0;
 	tstate.cur = (ktr_idx - 1) & (KTR_ENTRIES - 1);
 	tstate.first = -1;
-	if (strcmp(modif, "v") == 0)
-		db_ktr_verbose = 1;
-	else
-		db_ktr_verbose = 0;
-	if (strcmp(modif, "a") == 0) {
+	db_ktr_verbose = index(modif, 'v') != NULL;
+	if (index(modif, 'a') != NULL) {
+		db_disable_pager();
 		while (cncheckc() != -1)
 			if (db_mach_vtrace() == 0)
 				break;
 	} else {
-		db_setup_paging(db_simple_pager, &quit, db_lines_per_page);
-		while (!quit)
+		while (!db_pager_quit)
 			if (db_mach_vtrace() == 0)
 				break;
 	}
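
The debug.ktr.clear handler added above acts as a write-only trigger: storing any non-zero
value zeroes ktr_buf and resets ktr_idx.  A minimal userland sketch of poking it, assuming
a KTR-enabled kernel where the OID is compiled in:

#include <sys/types.h>
#include <sys/sysctl.h>

#include <stdio.h>

int
main(void)
{
	int one = 1;

	/* Equivalent to setting debug.ktr.clear=1. */
	if (sysctlbyname("debug.ktr.clear", NULL, NULL, &one, sizeof(one)) == -1) {
		perror("sysctlbyname");
		return (1);
	}
	return (0);
}
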
Index: kern_cpu.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_cpu.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/kern_cpu.c -L sys/kern/kern_cpu.c -u -r1.2 -r1.3
--- sys/kern/kern_cpu.c
+++ sys/kern/kern_cpu.c
@@ -1,5 +1,5 @@
 /*-
- * Copyright (c) 2004-2005 Nate Lawson (SDG)
+ * Copyright (c) 2004-2007 Nate Lawson (SDG)
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -25,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_cpu.c,v 1.14.2.4 2006/03/05 00:03:29 mnag Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_cpu.c,v 1.27.4.1 2008/01/19 20:30:59 njl Exp $");
 
 #include <sys/param.h>
 #include <sys/bus.h>
@@ -37,12 +37,14 @@
 #include <sys/module.h>
 #include <sys/proc.h>
 #include <sys/queue.h>
+#include <sys/sbuf.h>
 #include <sys/sched.h>
+#include <sys/smp.h>
 #include <sys/sysctl.h>
 #include <sys/systm.h>
-#include <sys/sbuf.h>
 #include <sys/sx.h>
 #include <sys/timetc.h>
+#include <sys/taskqueue.h>
 
 #include "cpufreq_if.h"
 
@@ -73,6 +75,7 @@
 	int				max_mhz;
 	device_t			dev;
 	struct sysctl_ctx_list		sysctl_ctx;
+	struct task			startup_task;
 };
 
 struct cf_setting_array {
@@ -94,8 +97,8 @@
 	} while (0)
 
 static int	cpufreq_attach(device_t dev);
+static void	cpufreq_startup_task(void *ctx, int pending);
 static int	cpufreq_detach(device_t dev);
-static void	cpufreq_evaluate(void *arg);
 static int	cf_set_method(device_t dev, const struct cf_level *level,
 		    int priority);
 static int	cf_get_method(device_t dev, struct cf_level *level);
@@ -127,8 +130,6 @@
 static devclass_t cpufreq_dc;
 DRIVER_MODULE(cpufreq, cpu, cpufreq_driver, cpufreq_dc, 0, 0);
 
-static eventhandler_tag	cf_ev_tag;
-
 static int		cf_lowest_freq;
 static int		cf_verbose;
 TUNABLE_INT("debug.cpufreq.lowest", &cf_lowest_freq);
@@ -176,12 +177,25 @@
 	    SYSCTL_CHILDREN(device_get_sysctl_tree(parent)),
 	    OID_AUTO, "freq_levels", CTLTYPE_STRING | CTLFLAG_RD, sc, 0,
 	    cpufreq_levels_sysctl, "A", "CPU frequency levels");
-	cf_ev_tag = EVENTHANDLER_REGISTER(cpufreq_changed, cpufreq_evaluate,
-	    NULL, EVENTHANDLER_PRI_ANY);
+
+	/*
+	 * Queue a one-shot broadcast that levels have changed.
+	 * It will run once the system has completed booting.
+	 */
+	TASK_INIT(&sc->startup_task, 0, cpufreq_startup_task, dev);
+	taskqueue_enqueue(taskqueue_thread, &sc->startup_task);
 
 	return (0);
 }
 
+/* Handle any work to be done for all drivers that attached during boot. */
+static void 
+cpufreq_startup_task(void *ctx, int pending)
+{
+
+	cpufreq_settings_changed((device_t)ctx);
+}
+
 static int
 cpufreq_detach(device_t dev)
 {
@@ -202,18 +216,11 @@
 	numdevs = devclass_get_count(cpufreq_dc);
 	if (numdevs == 1) {
 		CF_DEBUG("final shutdown for %s\n", device_get_nameunit(dev));
-		EVENTHANDLER_DEREGISTER(cpufreq_changed, cf_ev_tag);
 	}
 
 	return (0);
 }
 
-static void
-cpufreq_evaluate(void *arg)
-{
-	/* TODO: Re-evaluate when notified of changes to drivers. */
-}
-
 static int
 cf_set_method(device_t dev, const struct cf_level *level, int priority)
 {
@@ -221,30 +228,37 @@
 	const struct cf_setting *set;
 	struct cf_saved_freq *saved_freq, *curr_freq;
 	struct pcpu *pc;
-	int cpu_id, error, i;
-	static int once;
+	int error, i;
 
 	sc = device_get_softc(dev);
 	error = 0;
 	set = NULL;
 	saved_freq = NULL;
 
-	/*
-	 * Check that the TSC isn't being used as a timecounter.
-	 * If it is, then return EBUSY and refuse to change the
-	 * clock speed.
-	 */
-	if (strcmp(timecounter->tc_name, "TSC") == 0) {
-		if (!once) {
-			printf("cpufreq: frequency change with timecounter"
-				" TSC not allowed, see cpufreq(4)\n");
-			once = 1;
-		}
-		return (EBUSY);
+	/* We are going to change levels so notify the pre-change handler. */
+	EVENTHANDLER_INVOKE(cpufreq_pre_change, level, &error);
+	if (error != 0) {
+		EVENTHANDLER_INVOKE(cpufreq_post_change, level, error);
+		return (error);
 	}
 
 	CF_MTX_LOCK(&sc->lock);
 
+#ifdef SMP
+	/*
+	 * If still booting and secondary CPUs not started yet, don't allow
+	 * changing the frequency until they're online.  This is because we
+	 * can't switch to them using sched_bind() and thus we'd only be
+	 * switching the main CPU.  XXXTODO: Need to think more about how to
+	 * handle having different CPUs at different frequencies.  
+	 */
+	if (mp_ncpus > 1 && !smp_active) {
+		device_printf(dev, "rejecting change, SMP not started yet\n");
+		error = ENXIO;
+		goto out;
+	}
+#endif /* SMP */
+
 	/*
 	 * If the requested level has a lower priority, don't allow
 	 * the new level right now.
@@ -296,22 +310,17 @@
 			goto out;
 		}
 
-		/* Bind to the target CPU before switching, if necessary. */
-		cpu_id = PCPU_GET(cpuid);
+		/* Bind to the target CPU before switching. */
 		pc = cpu_get_pcpu(set->dev);
-		if (cpu_id != pc->pc_cpuid) {
-			mtx_lock_spin(&sched_lock);
-			sched_bind(curthread, pc->pc_cpuid);
-			mtx_unlock_spin(&sched_lock);
-		}
+		thread_lock(curthread);
+		sched_bind(curthread, pc->pc_cpuid);
+		thread_unlock(curthread);
 		CF_DEBUG("setting abs freq %d on %s (cpu %d)\n", set->freq,
 		    device_get_nameunit(set->dev), PCPU_GET(cpuid));
 		error = CPUFREQ_DRV_SET(set->dev, set);
-		if (cpu_id != pc->pc_cpuid) {
-			mtx_lock_spin(&sched_lock);
-			sched_unbind(curthread);
-			mtx_unlock_spin(&sched_lock);
-		}
+		thread_lock(curthread);
+		sched_unbind(curthread);
+		thread_unlock(curthread);
 		if (error) {
 			goto out;
 		}
@@ -325,22 +334,17 @@
 			goto out;
 		}
 
-		/* Bind to the target CPU before switching, if necessary. */
-		cpu_id = PCPU_GET(cpuid);
+		/* Bind to the target CPU before switching. */
 		pc = cpu_get_pcpu(set->dev);
-		if (cpu_id != pc->pc_cpuid) {
-			mtx_lock_spin(&sched_lock);
-			sched_bind(curthread, pc->pc_cpuid);
-			mtx_unlock_spin(&sched_lock);
-		}
+		thread_lock(curthread);
+		sched_bind(curthread, pc->pc_cpuid);
+		thread_unlock(curthread);
 		CF_DEBUG("setting rel freq %d on %s (cpu %d)\n", set->freq,
 		    device_get_nameunit(set->dev), PCPU_GET(cpuid));
 		error = CPUFREQ_DRV_SET(set->dev, set);
-		if (cpu_id != pc->pc_cpuid) {
-			mtx_lock_spin(&sched_lock);
-			sched_unbind(curthread);
-			mtx_unlock_spin(&sched_lock);
-		}
+		thread_lock(curthread);
+		sched_unbind(curthread);
+		thread_unlock(curthread);
 		if (error) {
 			/* XXX Back out any successful setting? */
 			goto out;
@@ -378,8 +382,15 @@
 
 out:
 	CF_MTX_UNLOCK(&sc->lock);
+
+	/*
+	 * We changed levels (or attempted to) so notify the post-change
+	 * handler of new frequency or error.
+	 */
+	EVENTHANDLER_INVOKE(cpufreq_post_change, level, error);
 	if (error && set)
 		device_printf(set->dev, "set freq failed, err %d\n", error);
+
 	return (error);
 }
 
@@ -391,7 +402,7 @@
 	struct cf_setting *curr_set, set;
 	struct pcpu *pc;
 	device_t *devs;
-	int count, error, i, numdevs;
+	int count, error, i, n, numdevs;
 	uint64_t rate;
 
 	sc = device_get_softc(dev);
@@ -438,10 +449,10 @@
 	 * The estimation code below catches this case though.
 	 */
 	CF_MTX_LOCK(&sc->lock);
-	for (i = 0; i < numdevs && curr_set->freq == CPUFREQ_VAL_UNKNOWN; i++) {
-		if (!device_is_attached(devs[i]))
+	for (n = 0; n < numdevs && curr_set->freq == CPUFREQ_VAL_UNKNOWN; n++) {
+		if (!device_is_attached(devs[n]))
 			continue;
-		error = CPUFREQ_DRV_GET(devs[i], &set);
+		error = CPUFREQ_DRV_GET(devs[n], &set);
 		if (error)
 			continue;
 		for (i = 0; i < count; i++) {
@@ -595,6 +606,17 @@
 	/* Finally, output the list of levels. */
 	i = 0;
 	TAILQ_FOREACH(lev, &sc->all_levels, link) {
+		/*
+		 * Skip levels that are too close in frequency to the
+		 * previous levels.  Some systems report bogus duplicate
+		 * settings (i.e., for acpi_perf).
+		 */
+		if (i > 0 && CPUFREQ_CMP(lev->total_set.freq,
+		    levels[i - 1].total_set.freq)) {
+			sc->all_count--;
+			continue;
+		}
+
 		/* Skip levels that have a frequency that is too low. */
 		if (lev->total_set.freq < cf_lowest_freq) {
 			sc->all_count--;
@@ -1021,3 +1043,12 @@
 
 	return (0);
 }
+
+int
+cpufreq_settings_changed(device_t dev)
+{
+
+	EVENTHANDLER_INVOKE(cpufreq_levels_changed,
+	    device_get_unit(device_get_parent(dev)));
+	return (0);
+}
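
The cf_set_method() changes above drop the sched_lock juggling and always bind to the
target CPU using the per-thread thread_lock()/thread_unlock() interface around
sched_bind().  The pattern in isolation looks like the sketch below (the helper and
callback names are hypothetical):

#include <sys/param.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/pcpu.h>

/*
 * Hypothetical helper: run fn(arg) while bound to the CPU described by pc,
 * then allow the thread to migrate again.
 */
static int
run_on_cpu(struct pcpu *pc, int (*fn)(void *), void *arg)
{
	int error;

	thread_lock(curthread);
	sched_bind(curthread, pc->pc_cpuid);	/* migrate to the target CPU */
	thread_unlock(curthread);

	error = fn(arg);			/* now executing on pc->pc_cpuid */

	thread_lock(curthread);
	sched_unbind(curthread);		/* release the binding */
	thread_unlock(curthread);
	return (error);
}
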
Index: subr_pcpu.c
===================================================================
RCS file: /home/cvs/src/sys/kern/subr_pcpu.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/subr_pcpu.c -L sys/kern/subr_pcpu.c -u -r1.1.1.1 -r1.2
--- sys/kern/subr_pcpu.c
+++ sys/kern/subr_pcpu.c
@@ -43,7 +43,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/subr_pcpu.c,v 1.6.2.2 2005/11/11 18:50:45 jhb Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/subr_pcpu.c,v 1.8 2005/11/03 21:06:29 jhb Exp $");
 
 #include "opt_ddb.h"
 
Index: syscalls.c
===================================================================
RCS file: /home/cvs/src/sys/kern/syscalls.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/syscalls.c -L sys/kern/syscalls.c -u -r1.2 -r1.3
--- sys/kern/syscalls.c
+++ sys/kern/syscalls.c
@@ -2,8 +2,8 @@
  * System call names.
  *
  * DO NOT EDIT-- this file is automatically generated.
- * $FreeBSD: src/sys/kern/syscalls.c,v 1.181.2.2 2006/03/17 01:47:32 rwatson Exp $
- * created from FreeBSD: src/sys/kern/syscalls.master,v 1.198.2.2 2006/03/17 01:47:06 rwatson Exp 
+ * $FreeBSD: src/sys/kern/syscalls.c,v 1.214 2007/08/16 05:32:26 davidxu Exp $
+ * created from FreeBSD: src/sys/kern/syscalls.master,v 1.232 2007/07/04 22:47:37 peter Exp 
  */
 
 const char *syscallnames[] = {
@@ -15,7 +15,7 @@
 	"open",			/* 5 = open */
 	"close",			/* 6 = close */
 	"wait4",			/* 7 = wait4 */
-	"old.creat",		/* 8 = old creat */
+	"compat.creat",		/* 8 = old creat */
 	"link",			/* 9 = link */
 	"unlink",			/* 10 = unlink */
 	"obs_execv",			/* 11 = obsolete execv */
@@ -25,8 +25,8 @@
 	"chmod",			/* 15 = chmod */
 	"chown",			/* 16 = chown */
 	"break",			/* 17 = break */
-	"old.getfsstat",		/* 18 = old getfsstat */
-	"old.lseek",		/* 19 = old lseek */
+	"compat4.getfsstat",		/* 18 = old getfsstat */
+	"compat.lseek",		/* 19 = old lseek */
 	"getpid",			/* 20 = getpid */
 	"mount",			/* 21 = mount */
 	"unmount",			/* 22 = unmount */
@@ -45,21 +45,21 @@
 	"fchflags",			/* 35 = fchflags */
 	"sync",			/* 36 = sync */
 	"kill",			/* 37 = kill */
-	"old.stat",		/* 38 = old stat */
+	"compat.stat",		/* 38 = old stat */
 	"getppid",			/* 39 = getppid */
-	"old.lstat",		/* 40 = old lstat */
+	"compat.lstat",		/* 40 = old lstat */
 	"dup",			/* 41 = dup */
 	"pipe",			/* 42 = pipe */
 	"getegid",			/* 43 = getegid */
 	"profil",			/* 44 = profil */
 	"ktrace",			/* 45 = ktrace */
-	"old.sigaction",		/* 46 = old sigaction */
+	"compat.sigaction",		/* 46 = old sigaction */
 	"getgid",			/* 47 = getgid */
-	"old.sigprocmask",		/* 48 = old sigprocmask */
+	"compat.sigprocmask",		/* 48 = old sigprocmask */
 	"getlogin",			/* 49 = getlogin */
 	"setlogin",			/* 50 = setlogin */
 	"acct",			/* 51 = acct */
-	"old.sigpending",		/* 52 = old sigpending */
+	"compat.sigpending",		/* 52 = old sigpending */
 	"sigaltstack",			/* 53 = sigaltstack */
 	"ioctl",			/* 54 = ioctl */
 	"reboot",			/* 55 = reboot */
@@ -69,16 +69,16 @@
 	"execve",			/* 59 = execve */
 	"umask",			/* 60 = umask */
 	"chroot",			/* 61 = chroot */
-	"old.fstat",		/* 62 = old fstat */
-	"old.getkerninfo",		/* 63 = old getkerninfo */
-	"old.getpagesize",		/* 64 = old getpagesize */
+	"compat.fstat",		/* 62 = old fstat */
+	"compat.getkerninfo",		/* 63 = old getkerninfo */
+	"compat.getpagesize",		/* 64 = old getpagesize */
 	"msync",			/* 65 = msync */
 	"vfork",			/* 66 = vfork */
 	"obs_vread",			/* 67 = obsolete vread */
 	"obs_vwrite",			/* 68 = obsolete vwrite */
 	"sbrk",			/* 69 = sbrk */
 	"sstk",			/* 70 = sstk */
-	"old.mmap",		/* 71 = old mmap */
+	"compat.mmap",		/* 71 = old mmap */
 	"vadvise",			/* 72 = vadvise */
 	"munmap",			/* 73 = munmap */
 	"mprotect",			/* 74 = mprotect */
@@ -91,11 +91,11 @@
 	"getpgrp",			/* 81 = getpgrp */
 	"setpgid",			/* 82 = setpgid */
 	"setitimer",			/* 83 = setitimer */
-	"old.wait",		/* 84 = old wait */
+	"compat.wait",		/* 84 = old wait */
 	"swapon",			/* 85 = swapon */
 	"getitimer",			/* 86 = getitimer */
-	"old.gethostname",		/* 87 = old gethostname */
-	"old.sethostname",		/* 88 = old sethostname */
+	"compat.gethostname",		/* 87 = old gethostname */
+	"compat.sethostname",		/* 88 = old sethostname */
 	"getdtablesize",			/* 89 = getdtablesize */
 	"dup2",			/* 90 = dup2 */
 	"#91",			/* 91 = getdopt */
@@ -106,22 +106,22 @@
 	"setpriority",			/* 96 = setpriority */
 	"socket",			/* 97 = socket */
 	"connect",			/* 98 = connect */
-	"old.accept",		/* 99 = old accept */
+	"compat.accept",		/* 99 = old accept */
 	"getpriority",			/* 100 = getpriority */
-	"old.send",		/* 101 = old send */
-	"old.recv",		/* 102 = old recv */
-	"old.sigreturn",		/* 103 = old sigreturn */
+	"compat.send",		/* 101 = old send */
+	"compat.recv",		/* 102 = old recv */
+	"compat.sigreturn",		/* 103 = old sigreturn */
 	"bind",			/* 104 = bind */
 	"setsockopt",			/* 105 = setsockopt */
 	"listen",			/* 106 = listen */
 	"obs_vtimes",			/* 107 = obsolete vtimes */
-	"old.sigvec",		/* 108 = old sigvec */
-	"old.sigblock",		/* 109 = old sigblock */
-	"old.sigsetmask",		/* 110 = old sigsetmask */
-	"old.sigsuspend",		/* 111 = old sigsuspend */
-	"old.sigstack",		/* 112 = old sigstack */
-	"old.recvmsg",		/* 113 = old recvmsg */
-	"old.sendmsg",		/* 114 = old sendmsg */
+	"compat.sigvec",		/* 108 = old sigvec */
+	"compat.sigblock",		/* 109 = old sigblock */
+	"compat.sigsetmask",		/* 110 = old sigsetmask */
+	"compat.sigsuspend",		/* 111 = old sigsuspend */
+	"compat.sigstack",		/* 112 = old sigstack */
+	"compat.recvmsg",		/* 113 = old recvmsg */
+	"compat.sendmsg",		/* 114 = old sendmsg */
 	"obs_vtrace",			/* 115 = obsolete vtrace */
 	"gettimeofday",			/* 116 = gettimeofday */
 	"getrusage",			/* 117 = getrusage */
@@ -132,12 +132,12 @@
 	"settimeofday",			/* 122 = settimeofday */
 	"fchown",			/* 123 = fchown */
 	"fchmod",			/* 124 = fchmod */
-	"old.recvfrom",		/* 125 = old recvfrom */
+	"compat.recvfrom",		/* 125 = old recvfrom */
 	"setreuid",			/* 126 = setreuid */
 	"setregid",			/* 127 = setregid */
 	"rename",			/* 128 = rename */
-	"old.truncate",		/* 129 = old truncate */
-	"old.ftruncate",		/* 130 = old ftruncate */
+	"compat.truncate",		/* 129 = old truncate */
+	"compat.ftruncate",		/* 130 = old ftruncate */
 	"flock",			/* 131 = flock */
 	"mkfifo",			/* 132 = mkfifo */
 	"sendto",			/* 133 = sendto */
@@ -148,24 +148,24 @@
 	"utimes",			/* 138 = utimes */
 	"obs_4.2",			/* 139 = obsolete 4.2 sigreturn */
 	"adjtime",			/* 140 = adjtime */
-	"old.getpeername",		/* 141 = old getpeername */
-	"old.gethostid",		/* 142 = old gethostid */
-	"old.sethostid",		/* 143 = old sethostid */
-	"old.getrlimit",		/* 144 = old getrlimit */
-	"old.setrlimit",		/* 145 = old setrlimit */
-	"old.killpg",		/* 146 = old killpg */
+	"compat.getpeername",		/* 141 = old getpeername */
+	"compat.gethostid",		/* 142 = old gethostid */
+	"compat.sethostid",		/* 143 = old sethostid */
+	"compat.getrlimit",		/* 144 = old getrlimit */
+	"compat.setrlimit",		/* 145 = old setrlimit */
+	"compat.killpg",		/* 146 = old killpg */
 	"setsid",			/* 147 = setsid */
 	"quotactl",			/* 148 = quotactl */
-	"old.quota",		/* 149 = old quota */
-	"old.getsockname",		/* 150 = old getsockname */
+	"compat.quota",		/* 149 = old quota */
+	"compat.getsockname",		/* 150 = old getsockname */
 	"#151",			/* 151 = sem_lock */
 	"#152",			/* 152 = sem_wakeup */
 	"#153",			/* 153 = asyncdaemon */
 	"#154",			/* 154 = nosys */
 	"nfssvc",			/* 155 = nfssvc */
-	"old.getdirentries",		/* 156 = old getdirentries */
-	"old.statfs",		/* 157 = old statfs */
-	"old.fstatfs",		/* 158 = old fstatfs */
+	"compat.getdirentries",		/* 156 = old getdirentries */
+	"compat4.statfs",		/* 157 = old statfs */
+	"compat4.fstatfs",		/* 158 = old fstatfs */
 	"#159",			/* 159 = nosys */
 	"lgetfh",			/* 160 = lgetfh */
 	"getfh",			/* 161 = getfh */
@@ -180,8 +180,8 @@
 	"msgsys",			/* 170 = msgsys */
 	"shmsys",			/* 171 = shmsys */
 	"#172",			/* 172 = nosys */
-	"pread",			/* 173 = pread */
-	"pwrite",			/* 174 = pwrite */
+	"freebsd6_pread",			/* 173 = freebsd6_pread */
+	"freebsd6_pwrite",			/* 174 = freebsd6_pwrite */
 	"#175",			/* 175 = nosys */
 	"ntp_adjtime",			/* 176 = ntp_adjtime */
 	"#177",			/* 177 = sfork */
@@ -204,11 +204,11 @@
 	"getrlimit",			/* 194 = getrlimit */
 	"setrlimit",			/* 195 = setrlimit */
 	"getdirentries",			/* 196 = getdirentries */
-	"mmap",			/* 197 = mmap */
+	"freebsd6_mmap",			/* 197 = freebsd6_mmap */
 	"__syscall",			/* 198 = __syscall */
-	"lseek",			/* 199 = lseek */
-	"truncate",			/* 200 = truncate */
-	"ftruncate",			/* 201 = ftruncate */
+	"freebsd6_lseek",			/* 199 = freebsd6_lseek */
+	"freebsd6_truncate",			/* 200 = freebsd6_truncate */
+	"freebsd6_ftruncate",			/* 201 = freebsd6_ftruncate */
 	"__sysctl",			/* 202 = __sysctl */
 	"mlock",			/* 203 = mlock */
 	"munlock",			/* 204 = munlock */
@@ -242,11 +242,11 @@
 	"clock_gettime",			/* 232 = clock_gettime */
 	"clock_settime",			/* 233 = clock_settime */
 	"clock_getres",			/* 234 = clock_getres */
-	"#235",			/* 235 = timer_create */
-	"#236",			/* 236 = timer_delete */
-	"#237",			/* 237 = timer_settime */
-	"#238",			/* 238 = timer_gettime */
-	"#239",			/* 239 = timer_getoverrun */
+	"ktimer_create",			/* 235 = ktimer_create */
+	"ktimer_delete",			/* 236 = ktimer_delete */
+	"ktimer_settime",			/* 237 = ktimer_settime */
+	"ktimer_gettime",			/* 238 = ktimer_gettime */
+	"ktimer_getoverrun",			/* 239 = ktimer_getoverrun */
 	"nanosleep",			/* 240 = nanosleep */
 	"#241",			/* 241 = nosys */
 	"#242",			/* 242 = nosys */
@@ -262,9 +262,9 @@
 	"openbsd_poll",			/* 252 = openbsd_poll */
 	"issetugid",			/* 253 = issetugid */
 	"lchown",			/* 254 = lchown */
-	"#255",			/* 255 = nosys */
-	"#256",			/* 256 = nosys */
-	"#257",			/* 257 = nosys */
+	"aio_read",			/* 255 = aio_read */
+	"aio_write",			/* 256 = aio_write */
+	"lio_listio",			/* 257 = lio_listio */
 	"#258",			/* 258 = nosys */
 	"#259",			/* 259 = nosys */
 	"#260",			/* 260 = nosys */
@@ -304,7 +304,7 @@
 	"#294",			/* 294 = nosys */
 	"#295",			/* 295 = nosys */
 	"#296",			/* 296 = nosys */
-	"old.fhstatfs",		/* 297 = old fhstatfs */
+	"compat4.fhstatfs",		/* 297 = old fhstatfs */
 	"fhopen",			/* 298 = fhopen */
 	"fhstat",			/* 299 = fhstat */
 	"modnext",			/* 300 = modnext */
@@ -325,9 +325,9 @@
 	"aio_suspend",			/* 315 = aio_suspend */
 	"aio_cancel",			/* 316 = aio_cancel */
 	"aio_error",			/* 317 = aio_error */
-	"aio_read",			/* 318 = aio_read */
-	"aio_write",			/* 319 = aio_write */
-	"lio_listio",			/* 320 = lio_listio */
+	"oaio_read",			/* 318 = oaio_read */
+	"oaio_write",			/* 319 = oaio_write */
+	"olio_listio",			/* 320 = olio_listio */
 	"yield",			/* 321 = yield */
 	"obs_thr_sleep",			/* 322 = obsolete thr_sleep */
 	"obs_thr_wakeup",			/* 323 = obsolete thr_wakeup */
@@ -343,15 +343,15 @@
 	"sched_get_priority_min",			/* 333 = sched_get_priority_min */
 	"sched_rr_get_interval",			/* 334 = sched_rr_get_interval */
 	"utrace",			/* 335 = utrace */
-	"old.sendfile",		/* 336 = old sendfile */
+	"compat4.sendfile",		/* 336 = old sendfile */
 	"kldsym",			/* 337 = kldsym */
 	"jail",			/* 338 = jail */
 	"#339",			/* 339 = pioctl */
 	"sigprocmask",			/* 340 = sigprocmask */
 	"sigsuspend",			/* 341 = sigsuspend */
-	"old.sigaction",		/* 342 = old sigaction */
+	"compat4.sigaction",		/* 342 = old sigaction */
 	"sigpending",			/* 343 = sigpending */
-	"old.sigreturn",		/* 344 = old sigreturn */
+	"compat4.sigreturn",		/* 344 = old sigreturn */
 	"sigtimedwait",			/* 345 = sigtimedwait */
 	"sigwaitinfo",			/* 346 = sigwaitinfo */
 	"__acl_get_file",			/* 347 = __acl_get_file */
@@ -463,4 +463,30 @@
 	"auditctl",			/* 453 = auditctl */
 	"_umtx_op",			/* 454 = _umtx_op */
 	"thr_new",			/* 455 = thr_new */
+	"sigqueue",			/* 456 = sigqueue */
+	"kmq_open",			/* 457 = kmq_open */
+	"kmq_setattr",			/* 458 = kmq_setattr */
+	"kmq_timedreceive",			/* 459 = kmq_timedreceive */
+	"kmq_timedsend",			/* 460 = kmq_timedsend */
+	"kmq_notify",			/* 461 = kmq_notify */
+	"kmq_unlink",			/* 462 = kmq_unlink */
+	"abort2",			/* 463 = abort2 */
+	"thr_set_name",			/* 464 = thr_set_name */
+	"aio_fsync",			/* 465 = aio_fsync */
+	"rtprio_thread",			/* 466 = rtprio_thread */
+	"#467",			/* 467 = nosys */
+	"#468",			/* 468 = nosys */
+	"#469",			/* 469 = __getpath_fromfd */
+	"#470",			/* 470 = __getpath_fromaddr */
+	"sctp_peeloff",			/* 471 = sctp_peeloff */
+	"sctp_generic_sendmsg",			/* 472 = sctp_generic_sendmsg */
+	"sctp_generic_sendmsg_iov",			/* 473 = sctp_generic_sendmsg_iov */
+	"sctp_generic_recvmsg",			/* 474 = sctp_generic_recvmsg */
+	"pread",			/* 475 = pread */
+	"pwrite",			/* 476 = pwrite */
+	"mmap",			/* 477 = mmap */
+	"lseek",			/* 478 = lseek */
+	"truncate",			/* 479 = truncate */
+	"ftruncate",			/* 480 = ftruncate */
+	"thr_kill2",			/* 481 = thr_kill2 */
 };
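
syscallnames[] above is a generated table indexed by syscall number; kernel tracing and
debugging code can index it directly to turn a raw code into a readable name.  A minimal
kernel-side sketch (the helper is hypothetical):

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/syscall.h>

extern const char *syscallnames[];	/* generated in syscalls.c */

static void
print_syscall_name(u_int code)
{
	if (code < SYS_MAXSYSCALL)
		printf("syscall %u = %s\n", code, syscallnames[code]);
	else
		printf("syscall %u is out of range\n", code);
}
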
Index: kern_timeout.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_timeout.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/kern_timeout.c -L sys/kern/kern_timeout.c -u -r1.1.1.1 -r1.2
--- sys/kern/kern_timeout.c
+++ sys/kern/kern_timeout.c
@@ -35,7 +35,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_timeout.c,v 1.97.2.2 2005/09/26 19:49:12 jhb Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_timeout.c,v 1.106 2007/09/15 12:33:23 rwatson Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -46,6 +46,7 @@
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
+#include <sys/sleepqueue.h>
 #include <sys/sysctl.h>
 
 static int avg_depth;
@@ -78,37 +79,22 @@
 /**
  * Locked by callout_lock:
  *   curr_callout    - If a callout is in progress, it is curr_callout.
- *                     If curr_callout is non-NULL, threads waiting on
- *                     callout_wait will be woken up as soon as the 
+ *                     If curr_callout is non-NULL, threads waiting in
+ *                     callout_drain() will be woken up as soon as the 
  *                     relevant callout completes.
  *   curr_cancelled  - Changing to 1 with both callout_lock and c_mtx held
  *                     guarantees that the current callout will not run.
  *                     The softclock() function sets this to 0 before it
  *                     drops callout_lock to acquire c_mtx, and it calls
- *                     the handler only if curr_cancelled still 0 when
+ *                     the handler only if curr_cancelled is still 0 after
  *                     c_mtx is successfully acquired.
- *   wakeup_ctr      - Incremented every time a thread wants to wait
- *                     for a callout to complete.  Modified only when
+ *   callout_wait    - If a thread is waiting in callout_drain(), then
+ *                     callout_wait is nonzero.  Set only when
  *                     curr_callout is non-NULL.
- *   wakeup_needed   - If a thread is waiting on callout_wait, then
- *                     wakeup_needed is nonzero.  Increased only when
- *                     cutt_callout is non-NULL.
  */
 static struct callout *curr_callout;
 static int curr_cancelled;
-static int wakeup_ctr;
-static int wakeup_needed;
-
-/**
- * Locked by callout_wait_lock:
- *   callout_wait    - If wakeup_needed is set, callout_wait will be
- *                     triggered after the current callout finishes.
- *   wakeup_done_ctr - Set to the current value of wakeup_ctr after
- *                     callout_wait is triggered.
- */
-static struct mtx callout_wait_lock;
-static struct cv callout_wait;
-static int wakeup_done_ctr;
+static int callout_wait;
 
 /*
  * kern_timeout_callwheel_alloc() - kernel low level callwheel initialization 
@@ -157,8 +143,6 @@
 		TAILQ_INIT(&callwheel[i]);
 	}
 	mtx_init(&callout_lock, "callout", NULL, MTX_SPIN | MTX_RECURSE);
-	mtx_init(&callout_wait_lock, "callout_wait_lock", NULL, MTX_DEF);
-	cv_init(&callout_wait, "callout_wait");
 }
 
 /*
@@ -188,7 +172,6 @@
 	int mpcalls;
 	int mtxcalls;
 	int gcalls;
-	int wakeup_cookie;
 #ifdef DIAGNOSTIC
 	struct bintime bt1, bt2;
 	struct timespec ts2;
@@ -262,26 +245,27 @@
 					 */
 					if (curr_cancelled) {
 						mtx_unlock(c_mtx);
-						mtx_lock_spin(&callout_lock);
-						goto done_locked;
+						goto skip;
 					}
 					/* The callout cannot be stopped now. */
 					curr_cancelled = 1;
 
 					if (c_mtx == &Giant) {
 						gcalls++;
-						CTR1(KTR_CALLOUT, "callout %p",
-						    c_func);
+						CTR3(KTR_CALLOUT,
+						    "callout %p func %p arg %p",
+						    c, c_func, c_arg);
 					} else {
 						mtxcalls++;
-						CTR1(KTR_CALLOUT,
-						    "callout mtx %p",
-						    c_func);
+						CTR3(KTR_CALLOUT, "callout mtx"
+						    " %p func %p arg %p",
+						    c, c_func, c_arg);
 					}
 				} else {
 					mpcalls++;
-					CTR1(KTR_CALLOUT, "callout mpsafe %p",
-					    c_func);
+					CTR3(KTR_CALLOUT,
+					    "callout mpsafe %p func %p arg %p",
+					    c, c_func, c_arg);
 				}
 #ifdef DIAGNOSTIC
 				binuptime(&bt1);
@@ -308,22 +292,18 @@
 #endif
 				if ((c_flags & CALLOUT_RETURNUNLOCKED) == 0)
 					mtx_unlock(c_mtx);
+			skip:
 				mtx_lock_spin(&callout_lock);
-done_locked:
 				curr_callout = NULL;
-				if (wakeup_needed) {
+				if (callout_wait) {
 					/*
-					 * There might be someone waiting
+					 * There is someone waiting
 					 * for the callout to complete.
 					 */
-					wakeup_cookie = wakeup_ctr;
+					callout_wait = 0;
 					mtx_unlock_spin(&callout_lock);
-					mtx_lock(&callout_wait_lock);
-					cv_broadcast(&callout_wait);
-					wakeup_done_ctr = wakeup_cookie;
-					mtx_unlock(&callout_wait_lock);
+					wakeup(&callout_wait);
 					mtx_lock_spin(&callout_lock);
-					wakeup_needed = 0;
 				}
 				steps = 0;
 				c = nextsoftcheck;
@@ -445,11 +425,14 @@
 		 */
 		if (c->c_mtx != NULL && !curr_cancelled)
 			cancelled = curr_cancelled = 1;
-		if (wakeup_needed) {
+		if (callout_wait) {
 			/*
 			 * Someone has called callout_drain to kill this
 			 * callout.  Don't reschedule.
 			 */
+			CTR4(KTR_CALLOUT, "%s %p func %p arg %p",
+			    cancelled ? "cancelled" : "failed to cancel",
+			    c, c->c_func, c->c_arg);
 			mtx_unlock_spin(&callout_lock);
 			return (cancelled);
 		}
@@ -487,6 +470,8 @@
 	c->c_time = ticks + to_ticks;
 	TAILQ_INSERT_TAIL(&callwheel[c->c_time & callwheelmask], 
 			  c, c_links.tqe);
+	CTR5(KTR_CALLOUT, "%sscheduled %p func %p arg %p in %d",
+	    cancelled ? "re" : "", c, c->c_func, c->c_arg, to_ticks);
 	mtx_unlock_spin(&callout_lock);
 
 	return (cancelled);
@@ -497,7 +482,7 @@
 	struct	callout *c;
 	int	safe;
 {
-	int use_mtx, wakeup_cookie;
+	int use_mtx, sq_locked;
 
 	if (!safe && c->c_mtx != NULL) {
 #ifdef notyet /* Some callers do not hold Giant for Giant-locked callouts. */
@@ -510,41 +495,100 @@
 		use_mtx = 0;
 	}
 
+	sq_locked = 0;
+again:
 	mtx_lock_spin(&callout_lock);
 	/*
-	 * Don't attempt to delete a callout that's not on the queue.
+	 * If the callout isn't pending, it's not on the queue, so
+	 * don't attempt to remove it from the queue.  We can try to
+	 * stop it by other means however.
 	 */
 	if (!(c->c_flags & CALLOUT_PENDING)) {
 		c->c_flags &= ~CALLOUT_ACTIVE;
+
+		/*
+		 * If it wasn't on the queue and it isn't the current
+		 * callout, then we can't stop it, so just bail.
+		 */
 		if (c != curr_callout) {
+			CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p",
+			    c, c->c_func, c->c_arg);
 			mtx_unlock_spin(&callout_lock);
+			if (sq_locked)
+				sleepq_release(&callout_wait);
 			return (0);
 		}
-		if (safe) {
-			/* We need to wait until the callout is finished. */
-			wakeup_needed = 1;
-			wakeup_cookie = wakeup_ctr++;
-			mtx_unlock_spin(&callout_lock);
-			mtx_lock(&callout_wait_lock);
 
+		if (safe) {
 			/*
-			 * Check to make sure that softclock() didn't
-			 * do the wakeup in between our dropping
-			 * callout_lock and picking up callout_wait_lock
+			 * The current callout is running (or just
+			 * about to run) and blocking is allowed, so
+			 * just wait for the current invocation to
+			 * finish.
 			 */
-			if (wakeup_cookie - wakeup_done_ctr > 0)
-				cv_wait(&callout_wait, &callout_wait_lock);
+			while (c == curr_callout) {
+
+				/*
+				 * Use direct calls to sleepqueue interface
+				 * instead of cv/msleep in order to avoid
+				 * a LOR between callout_lock and sleepqueue
+				 * chain spinlocks.  This piece of code
+				 * emulates a msleep_spin() call actually.
+				 *
+				 * If we already have the sleepqueue chain
+				 * locked, then we can safely block.  If we
+				 * don't already have it locked, however,
+				 * we have to drop the callout_lock to lock
+				 * it.  This opens several races, so we
+				 * restart at the beginning once we have
+				 * both locks.  If nothing has changed, then
+				 * we will end up back here with sq_locked
+				 * set.
+				 */
+				if (!sq_locked) {
+					mtx_unlock_spin(&callout_lock);
+					sleepq_lock(&callout_wait);
+					sq_locked = 1;
+					goto again;
+				}
+
+				callout_wait = 1;
+				DROP_GIANT();
+				mtx_unlock_spin(&callout_lock);
+				sleepq_add(&callout_wait,
+				    &callout_lock.lock_object, "codrain",
+				    SLEEPQ_SLEEP, 0);
+				sleepq_wait(&callout_wait);
+				sq_locked = 0;
 
-			mtx_unlock(&callout_wait_lock);
+				/* Reacquire locks previously released. */
+				PICKUP_GIANT();
+				mtx_lock_spin(&callout_lock);
+			}
 		} else if (use_mtx && !curr_cancelled) {
-			/* We can stop the callout before it runs. */
+			/*
+			 * The current callout is waiting for its
+			 * mutex which we hold.  Cancel the callout
+			 * and return.  After our caller drops the
+			 * mutex, the callout will be skipped in
+			 * softclock().
+			 */
 			curr_cancelled = 1;
+			CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p",
+			    c, c->c_func, c->c_arg);
 			mtx_unlock_spin(&callout_lock);
+			KASSERT(!sq_locked, ("sleepqueue chain locked"));
 			return (1);
-		} else
-			mtx_unlock_spin(&callout_lock);
+		}
+		CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p",
+		    c, c->c_func, c->c_arg);
+		mtx_unlock_spin(&callout_lock);
+		KASSERT(!sq_locked, ("sleepqueue chain still locked"));
 		return (0);
 	}
+	if (sq_locked)
+		sleepq_release(&callout_wait);
+
 	c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING);
 
 	if (nextsoftcheck == c) {
@@ -552,6 +596,9 @@
 	}
 	TAILQ_REMOVE(&callwheel[c->c_time & callwheelmask], c, c_links.tqe);
 
+	CTR3(KTR_CALLOUT, "cancelled %p func %p arg %p",
+	    c, c->c_func, c->c_arg);
+
 	if (c->c_flags & CALLOUT_LOCAL_ALLOC) {
 		c->c_func = NULL;
 		SLIST_INSERT_HEAD(&callfree, c, c_links.sle);
@@ -583,12 +630,12 @@
 {
 	bzero(c, sizeof *c);
 	c->c_mtx = mtx;
-	KASSERT((flags & ~CALLOUT_RETURNUNLOCKED) == 0,
+	KASSERT((flags & ~(CALLOUT_RETURNUNLOCKED)) == 0,
 	    ("callout_init_mtx: bad flags %d", flags));
 	/* CALLOUT_RETURNUNLOCKED makes no sense without a mutex. */
 	KASSERT(mtx != NULL || (flags & CALLOUT_RETURNUNLOCKED) == 0,
 	    ("callout_init_mtx: CALLOUT_RETURNUNLOCKED with no mutex"));
-	c->c_flags = flags & CALLOUT_RETURNUNLOCKED;
+	c->c_flags = flags & (CALLOUT_RETURNUNLOCKED);
 }
 
 #ifdef APM_FIXUP_CALLTODO
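
The callout_drain() rework above replaces the condition-variable handshake with direct
sleepqueue calls keyed on callout_wait, but the contract seen by a driver is unchanged:
initialize the callout together with its mutex, and drain it (without that mutex held,
since callout_drain() may sleep) before destroying the mutex.  A minimal sketch, with the
softc and function names being hypothetical:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/callout.h>

struct foo_softc {
	struct mtx	foo_mtx;
	struct callout	foo_timer;
};

static void
foo_tick(void *arg)
{
	struct foo_softc *sc = arg;

	mtx_assert(&sc->foo_mtx, MA_OWNED);	/* softclock() took foo_mtx for us */
	/* ... periodic work ... */
	callout_reset(&sc->foo_timer, hz, foo_tick, sc);
}

static void
foo_attach(struct foo_softc *sc)
{

	mtx_init(&sc->foo_mtx, "foo", NULL, MTX_DEF);
	callout_init_mtx(&sc->foo_timer, &sc->foo_mtx, 0);
	mtx_lock(&sc->foo_mtx);
	callout_reset(&sc->foo_timer, hz, foo_tick, sc);
	mtx_unlock(&sc->foo_mtx);
}

static void
foo_detach(struct foo_softc *sc)
{

	/* foo_mtx must not be held here; callout_drain() can sleep. */
	callout_drain(&sc->foo_timer);
	mtx_destroy(&sc->foo_mtx);
}
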
Index: subr_param.c
===================================================================
RCS file: /home/cvs/src/sys/kern/subr_param.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/subr_param.c -L sys/kern/subr_param.c -u -r1.1.1.1 -r1.2
--- sys/kern/subr_param.c
+++ sys/kern/subr_param.c
@@ -35,7 +35,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/subr_param.c,v 1.71.2.1 2005/10/17 00:16:54 kris Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/subr_param.c,v 1.73 2005/10/16 03:58:10 kris Exp $");
 
 #include "opt_param.h"
 #include "opt_maxusers.h"
Index: kern_kse.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_kse.c,v
retrieving revision 1.3
retrieving revision 1.4
diff -L sys/kern/kern_kse.c -L sys/kern/kern_kse.c -u -r1.3 -r1.4
--- sys/kern/kern_kse.c
+++ sys/kern/kern_kse.c
@@ -27,7 +27,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_kse.c,v 1.214.2.6 2006/03/07 18:08:09 jhb Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_kse.c,v 1.235.4.1 2008/01/19 18:15:05 kib Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -43,13 +43,12 @@
 #include <sys/sched.h>
 #include <sys/signalvar.h>
 #include <sys/sleepqueue.h>
+#include <sys/syslog.h>
 #include <sys/kse.h>
 #include <sys/ktr.h>
 #include <vm/uma.h>
 
-/*
- * KSEGRP related storage.
- */
+#ifdef KSE
 static uma_zone_t upcall_zone;
 
 /* DEBUG ONLY */
@@ -59,14 +58,20 @@
 extern int max_threads_per_proc;
 extern int max_groups_per_proc;
 extern int max_threads_hits;
-extern struct mtx kse_zombie_lock;
+extern struct mtx kse_lock;
 
 
 TAILQ_HEAD(, kse_upcall) zombie_upcalls =
 	TAILQ_HEAD_INITIALIZER(zombie_upcalls);
 
 static int thread_update_usr_ticks(struct thread *td);
-static void thread_alloc_spare(struct thread *td);
+static int thread_alloc_spare(struct thread *td);
+static struct thread *thread_schedule_upcall(struct thread *td, struct kse_upcall *ku);
+static struct kse_upcall *upcall_alloc(void);
+
+
+struct mtx kse_lock;
+MTX_SYSINIT(kse_lock, &kse_lock, "kse lock", MTX_SPIN);
 
 struct kse_upcall *
 upcall_alloc(void)
@@ -78,45 +83,45 @@
 }
 
 void
-upcall_free(struct kse_upcall *ku)
-{
-
-	uma_zfree(upcall_zone, ku);
-}
-
-void
-upcall_link(struct kse_upcall *ku, struct ksegrp *kg)
-{
-
-	mtx_assert(&sched_lock, MA_OWNED);
-	TAILQ_INSERT_TAIL(&kg->kg_upcalls, ku, ku_link);
-	ku->ku_ksegrp = kg;
-	kg->kg_numupcalls++;
-}
-
-void
-upcall_unlink(struct kse_upcall *ku)
+upcall_reap(void)
 {
-	struct ksegrp *kg = ku->ku_ksegrp;
+	TAILQ_HEAD(, kse_upcall) zupcalls;
+	struct kse_upcall *ku_item, *ku_tmp;
 
-	mtx_assert(&sched_lock, MA_OWNED);
-	KASSERT(ku->ku_owner == NULL, ("%s: have owner", __func__));
-	TAILQ_REMOVE(&kg->kg_upcalls, ku, ku_link);
-	kg->kg_numupcalls--;
-	upcall_stash(ku);
+	TAILQ_INIT(&zupcalls);
+	mtx_lock_spin(&kse_lock);
+	if (!TAILQ_EMPTY(&zombie_upcalls)) {
+		TAILQ_CONCAT(&zupcalls, &zombie_upcalls, ku_link);
+		TAILQ_INIT(&zombie_upcalls);
+	}
+	mtx_unlock_spin(&kse_lock);
+	TAILQ_FOREACH_SAFE(ku_item, &zupcalls, ku_link, ku_tmp)
+		uma_zfree(upcall_zone, ku_item);
 }
 
 void
 upcall_remove(struct thread *td)
 {
 
-	mtx_assert(&sched_lock, MA_OWNED);
+	PROC_SLOCK_ASSERT(td->td_proc, MA_OWNED);
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	if (td->td_upcall != NULL) {
+		/*
+	 	* If we are not a bound thread then decrement the count of
+	 	* possible upcall sources
+	 	*/
+		if (td->td_pflags & TDP_SA) 
+			td->td_proc->p_numupcalls--;
+		mtx_lock_spin(&kse_lock);
 		td->td_upcall->ku_owner = NULL;
-		upcall_unlink(td->td_upcall);
+		TAILQ_REMOVE(&td->td_upcall->ku_proc->p_upcalls, td->td_upcall,
+		    ku_link);
+		TAILQ_INSERT_HEAD(&zombie_upcalls, td->td_upcall, ku_link);
+		mtx_unlock_spin(&kse_lock);
 		td->td_upcall = NULL;
 	}
 }
+#endif
 
 #ifndef _SYS_SYSPROTO_H_
 struct kse_switchin_args {
@@ -125,15 +130,31 @@
 };
 #endif
 
+#ifdef KSE
+void
+kse_unlink(struct thread *td)
+{
+	mtx_lock_spin(&kse_lock);
+	thread_unlink(td);
+	mtx_unlock_spin(&kse_lock);
+	upcall_remove(td);
+}
+#endif
+
 int
 kse_switchin(struct thread *td, struct kse_switchin_args *uap)
 {
+#ifdef KSE
 	struct kse_thr_mailbox tmbx;
 	struct kse_upcall *ku;
 	int error;
 
-	if ((ku = td->td_upcall) == NULL || TD_CAN_UNBIND(td))
+	thread_lock(td);
+	if ((ku = td->td_upcall) == NULL || TD_CAN_UNBIND(td)) {
+		thread_unlock(td);
 		return (EINVAL);
+	}
+	thread_unlock(td);
 	error = (uap->tmbx == NULL) ? EINVAL : 0;
 	if (!error)
 		error = copyin(uap->tmbx, &tmbx, sizeof(tmbx));
@@ -156,17 +177,20 @@
 			else
 				ptrace_clear_single_step(td);
 			if (tmbx.tm_dflags & TMDF_SUSPEND) {
-				mtx_lock_spin(&sched_lock);
+				thread_lock(td);
 				/* fuword can block, check again */
 				if (td->td_upcall)
 					ku->ku_flags |= KUF_DOUPCALL;
-				mtx_unlock_spin(&sched_lock);
+				thread_unlock(td);
 			}
 			_PRELE(td->td_proc);
 		}
 		PROC_UNLOCK(td->td_proc);
 	}
 	return ((error == 0) ? EJUSTRETURN : error);
+#else /* !KSE */
+	return (EOPNOTSUPP);
+#endif
 }
 
 /*
@@ -179,6 +203,7 @@
 int
 kse_thr_interrupt(struct thread *td, struct kse_thr_interrupt_args *uap)
 {
+#ifdef KSE
 	struct kse_execve_args args;
 	struct image_args iargs;
 	struct proc *p;
@@ -190,8 +215,12 @@
 
 	p = td->td_proc;
 
-	if (!(p->p_flag & P_SA))
+	PROC_LOCK(p);
+	if (!(p->p_flag & P_SA)) {
+		PROC_UNLOCK(p);
 		return (EINVAL);
+	}
+	PROC_UNLOCK(p);
 
 	switch (uap->cmd) {
 	case KSE_INTR_SENDSIG:
@@ -200,23 +229,25 @@
 	case KSE_INTR_INTERRUPT:
 	case KSE_INTR_RESTART:
 		PROC_LOCK(p);
-		mtx_lock_spin(&sched_lock);
+		PROC_SLOCK(p);
 		FOREACH_THREAD_IN_PROC(p, td2) {
 			if (td2->td_mailbox == uap->tmbx)
 				break;
 		}
 		if (td2 == NULL) {
-			mtx_unlock_spin(&sched_lock);
+			PROC_SUNLOCK(p);
 			PROC_UNLOCK(p);
 			return (ESRCH);
 		}
+		thread_lock(td2);
+		PROC_SUNLOCK(p);
 		if (uap->cmd == KSE_INTR_SENDSIG) {
 			if (uap->data > 0) {
 				td2->td_flags &= ~TDF_INTERRUPT;
-				mtx_unlock_spin(&sched_lock);
-				tdsignal(td2, (int)uap->data, SIGTARGET_TD);
+				thread_unlock(td2);
+				tdsignal(p, td2, (int)uap->data, NULL);
 			} else {
-				mtx_unlock_spin(&sched_lock);
+				thread_unlock(td2);
 			}
 		} else {
 			td2->td_flags |= TDF_INTERRUPT | TDF_ASTPENDING;
@@ -228,7 +259,7 @@
 				td2->td_intrval = ERESTART;
 			if (TD_ON_SLEEPQ(td2) && (td2->td_flags & TDF_SINTR))
 				sleepq_abort(td2, td2->td_intrval);
-			mtx_unlock_spin(&sched_lock);
+			thread_unlock(td2);
 		}
 		PROC_UNLOCK(p);
 		break;
@@ -243,23 +274,29 @@
 		/* this sub-function is only for bound thread */
 		if (td->td_pflags & TDP_SA)
 			return (EINVAL);
+		thread_lock(td);
 		ku = td->td_upcall;
+		thread_unlock(td);
 		tmbx = (void *)fuword((void *)&ku->ku_mailbox->km_curthread);
 		if (tmbx == NULL || tmbx == (void *)-1)
 			return (EINVAL);
 		flags = 0;
+		PROC_LOCK(p);
 		while ((p->p_flag & P_TRACED) && !(p->p_flag & P_SINGLE_EXIT)) {
 			flags = fuword32(&tmbx->tm_dflags);
 			if (!(flags & TMDF_SUSPEND))
 				break;
-			PROC_LOCK(p);
-			mtx_lock_spin(&sched_lock);
+			PROC_SLOCK(p);
 			thread_stopped(p);
-			thread_suspend_one(td);
 			PROC_UNLOCK(p);
+			thread_lock(td);
+			thread_suspend_one(td);
+			PROC_SUNLOCK(p);
 			mi_switch(SW_VOL, NULL);
-			mtx_unlock_spin(&sched_lock);
+			thread_unlock(td);
+			PROC_LOCK(p);
 		}
+		PROC_UNLOCK(p);
 		return (0);
 
 	case KSE_INTR_EXECVE:
@@ -270,7 +307,6 @@
 		    args.argv, args.envp);
 		if (error == 0)
 			error = kern_execve(td, &iargs, NULL);
-		exec_free_args(&iargs);
 		if (error == 0) {
 			PROC_LOCK(p);
 			SIGSETOR(td->td_siglist, args.sigpend);
@@ -284,6 +320,9 @@
 		return (EINVAL);
 	}
 	return (0);
+#else /* !KSE */
+	return (EOPNOTSUPP);
+#endif
 }
 
 /*
@@ -294,8 +333,8 @@
 int
 kse_exit(struct thread *td, struct kse_exit_args *uap)
 {
+#ifdef KSE
 	struct proc *p;
-	struct ksegrp *kg;
 	struct kse_upcall *ku, *ku2;
 	int    error, count;
 
@@ -303,35 +342,39 @@
 	/* 
 	 * Ensure that this is only called from the UTS
 	 */
-	if ((ku = td->td_upcall) == NULL || TD_CAN_UNBIND(td))
+	thread_lock(td);
+	if ((ku = td->td_upcall) == NULL || TD_CAN_UNBIND(td)) {
+		thread_unlock(td);
 		return (EINVAL);
-
-	kg = td->td_ksegrp;
-	count = 0;
+	}
+	thread_unlock(td);
 
 	/*
-	 * Calculate the existing non-exiting upcalls in this ksegroup.
+	 * Calculate the existing non-exiting upcalls in this process.
 	 * If we are the last upcall but there are still other threads,
 	 * then do not exit. We need the other threads to be able to 
 	 * complete whatever they are doing.
 	 * XXX This relies on the userland knowing what to do if we return.
 	 * It may be a better choice to convert ourselves into a kse_release
 	 * ( or similar) and wait in the kernel to be needed.
+	 * XXX Where are those other threads? I suppose they are waiting in
+	 * the kernel. We should wait for them all at the user boundary after
+	 * turning into an exit.
 	 */
+	count = 0;
 	PROC_LOCK(p);
-	mtx_lock_spin(&sched_lock);
-	FOREACH_UPCALL_IN_GROUP(kg, ku2) {
-		if (ku2->ku_flags & KUF_EXITING)
+	PROC_SLOCK(p);
+	FOREACH_UPCALL_IN_PROC(p, ku2) {
+		if ((ku2->ku_flags & KUF_EXITING) == 0)
 			count++;
 	}
-	if ((kg->kg_numupcalls - count) == 1 &&
-	    (kg->kg_numthreads > 1)) {
-		mtx_unlock_spin(&sched_lock);
+	if (count == 1 && (p->p_numthreads > 1)) {
+		PROC_SUNLOCK(p);
 		PROC_UNLOCK(p);
 		return (EDEADLK);
 	}
 	ku->ku_flags |= KUF_EXITING;
-	mtx_unlock_spin(&sched_lock);
+	PROC_SUNLOCK(p);
 	PROC_UNLOCK(p);
 
 	/* 
@@ -346,23 +389,18 @@
 	PROC_LOCK(p);
 	if (error)
 		psignal(p, SIGSEGV);
-	mtx_lock_spin(&sched_lock);
+	sigqueue_flush(&td->td_sigqueue);
+	PROC_SLOCK(p);
+	thread_lock(td);
 	upcall_remove(td);
+	thread_unlock(td);
 	if (p->p_numthreads != 1) {
-		/*
-		 * If we are not the last thread, but we are the last
-		 * thread in this ksegrp, then by definition this is not
-		 * the last group and we need to clean it up as well.
-		 * thread_exit will clean up the kseg as needed.
-		 */
 		thread_stopped(p);
 		thread_exit();
 		/* NOTREACHED */
 	}
 	/*
 	 * This is the last thread. Just return to the user.
-	 * We know that there is only one ksegrp too, as any others
-	 * would have been discarded in previous calls to thread_exit().
 	 * Effectively we have left threading mode..
 	 * The only real thing left to do is ensure that the
 	 * scheduler sets out concurrency back to 1 as that may be a
@@ -372,13 +410,17 @@
 	 * The other possibility would be to let the process exit.
 	 */
 	thread_unthread(td);
-	mtx_unlock_spin(&sched_lock);
+	PROC_SUNLOCK(p);
 	PROC_UNLOCK(p);
-#if 1
+#if 0
 	return (0);
 #else
+	printf("kse_exit: called on last thread. Calling exit1()");
 	exit1(td, 0);
 #endif
+#else /* !KSE */
+	return (EOPNOTSUPP);
+#endif
 }
 
 /*
@@ -393,8 +435,8 @@
 int
 kse_release(struct thread *td, struct kse_release_args *uap)
 {
+#ifdef KSE
 	struct proc *p;
-	struct ksegrp *kg;
 	struct kse_upcall *ku;
 	struct timespec timeout;
 	struct timeval tv;
@@ -402,9 +444,13 @@
 	int error;
 
 	p = td->td_proc;
-	kg = td->td_ksegrp;
-	if ((ku = td->td_upcall) == NULL || TD_CAN_UNBIND(td))
-		return (EINVAL);
+	thread_lock(td);
+	if ((ku = td->td_upcall) == NULL || TD_CAN_UNBIND(td)) {
+		thread_unlock(td);
+		printf("kse_release: called outside of threading. exiting");
+		exit1(td, 0);
+	}
+	thread_unlock(td);
 	if (uap->timeout != NULL) {
 		if ((error = copyin(uap->timeout, &timeout, sizeof(timeout))))
 			return (error);
@@ -437,23 +483,26 @@
 	} else {
 		if ((ku->ku_flags & KUF_DOUPCALL) == 0 &&
 		    ((ku->ku_mflags & KMF_NOCOMPLETED) ||
-		     (kg->kg_completed == NULL))) {
-			kg->kg_upsleeps++;
+		     (p->p_completed == NULL))) {
+			p->p_upsleeps++;
 			td->td_kflags |= TDK_KSEREL;
-			error = msleep(&kg->kg_completed, &p->p_mtx,
+			error = msleep(&p->p_completed, &p->p_mtx,
 				PPAUSE|PCATCH, "kserel",
 				(uap->timeout ? tvtohz(&tv) : 0));
 			td->td_kflags &= ~(TDK_KSEREL | TDK_WAKEUP);
-			kg->kg_upsleeps--;
+			p->p_upsleeps--;
 		}
 		PROC_UNLOCK(p);
 	}
 	if (ku->ku_flags & KUF_DOUPCALL) {
-		mtx_lock_spin(&sched_lock);
+		PROC_SLOCK(p);
 		ku->ku_flags &= ~KUF_DOUPCALL;
-		mtx_unlock_spin(&sched_lock);
+		PROC_SUNLOCK(p);
 	}
 	return (0);
+#else /* !KSE */
+	return (EOPNOTSUPP);
+#endif
 }
 
 /* struct kse_wakeup_args {
@@ -462,8 +511,8 @@
 int
 kse_wakeup(struct thread *td, struct kse_wakeup_args *uap)
 {
+#ifdef KSE
 	struct proc *p;
-	struct ksegrp *kg;
 	struct kse_upcall *ku;
 	struct thread *td2;
 
@@ -471,60 +520,64 @@
 	td2 = NULL;
 	ku = NULL;
 	/* KSE-enabled processes only, please. */
-	if (!(p->p_flag & P_SA))
-		return (EINVAL);
 	PROC_LOCK(p);
-	mtx_lock_spin(&sched_lock);
+	if (!(p->p_flag & P_SA)) {
+		PROC_UNLOCK(p);
+		return (EINVAL);
+	}
+	PROC_SLOCK(p);
 	if (uap->mbx) {
-		FOREACH_KSEGRP_IN_PROC(p, kg) {
-			FOREACH_UPCALL_IN_GROUP(kg, ku) {
-				if (ku->ku_mailbox == uap->mbx)
-					break;
-			}
-			if (ku)
+		FOREACH_UPCALL_IN_PROC(p, ku) {
+			if (ku->ku_mailbox == uap->mbx)
 				break;
 		}
 	} else {
-		kg = td->td_ksegrp;
-		if (kg->kg_upsleeps) {
-			mtx_unlock_spin(&sched_lock);
-			wakeup(&kg->kg_completed);
+		if (p->p_upsleeps) {
+			PROC_SUNLOCK(p);
+			wakeup(&p->p_completed);
 			PROC_UNLOCK(p);
 			return (0);
 		}
-		ku = TAILQ_FIRST(&kg->kg_upcalls);
+		ku = TAILQ_FIRST(&p->p_upcalls);
 	}
 	if (ku == NULL) {
-		mtx_unlock_spin(&sched_lock);
+		PROC_SUNLOCK(p);
 		PROC_UNLOCK(p);
 		return (ESRCH);
 	}
+	mtx_lock_spin(&kse_lock);
 	if ((td2 = ku->ku_owner) == NULL) {
-		mtx_unlock_spin(&sched_lock);
+		mtx_unlock_spin(&kse_lock);
+		PROC_SUNLOCK(p);
+		PROC_UNLOCK(p);
 		panic("%s: no owner", __func__);
 	} else if (td2->td_kflags & (TDK_KSEREL | TDK_KSERELSIG)) {
-		mtx_unlock_spin(&sched_lock);
+		mtx_unlock_spin(&kse_lock);
 		if (!(td2->td_kflags & TDK_WAKEUP)) {
 			td2->td_kflags |= TDK_WAKEUP;
 			if (td2->td_kflags & TDK_KSEREL)
-				sleepq_remove(td2, &kg->kg_completed);
+				sleepq_remove(td2, &p->p_completed);
 			else
 				sleepq_remove(td2, &p->p_siglist);
 		}
 	} else {
 		ku->ku_flags |= KUF_DOUPCALL;
-		mtx_unlock_spin(&sched_lock);
+		mtx_unlock_spin(&kse_lock);
 	}
+	PROC_SUNLOCK(p);
 	PROC_UNLOCK(p);
 	return (0);
+#else /* !KSE */
+	return (EOPNOTSUPP);
+#endif
 }
 
 /*
- * No new KSEG: first call: use current KSE, don't schedule an upcall
+ * newgroup == 0: first call: use current KSE, don't schedule an upcall
  * All other situations, do allocate max new KSEs and schedule an upcall.
  *
  * XXX should be changed so that 'first' behaviour lasts for as long
- * as you have not made a kse in this ksegrp. i.e. as long as we do not have
+ * as you have not made a thread in this proc. i.e. as long as we do not have
  * a mailbox..
  */
 /* struct kse_create_args {
@@ -534,8 +587,7 @@
 int
 kse_create(struct thread *td, struct kse_create_args *uap)
 {
-	struct ksegrp *newkg;
-	struct ksegrp *kg;
+#ifdef KSE
 	struct proc *p;
 	struct kse_mailbox mbx;
 	struct kse_upcall *newku;
@@ -543,187 +595,117 @@
 	struct thread *newtd;
 
 	p = td->td_proc;
-	kg = td->td_ksegrp;
-	if ((err = copyin(uap->mbx, &mbx, sizeof(mbx))))
-		return (err);
-
-	ncpus = mp_ncpus;
-	if (virtual_cpu != 0)
-		ncpus = virtual_cpu;
-	/*
-	 * If the new UTS mailbox says that this
-	 * will be a BOUND lwp, then it had better
-	 * have its thread mailbox already there.
-	 * In addition, this ksegrp will be limited to
-	 * a concurrency of 1. There is more on this later.
-	 */
-	if (mbx.km_flags & KMF_BOUND) {
-		if (mbx.km_curthread == NULL) 
-			return (EINVAL);
-		ncpus = 1;
-	} else {
-		sa = TDP_SA;
-	}
 
-	PROC_LOCK(p);
 	/*
 	 * Processes using the other threading model can't
 	 * suddenly start calling this one
+	 * XXX  maybe...
 	 */
+	PROC_LOCK(p);
 	if ((p->p_flag & (P_SA|P_HADTHREADS)) == P_HADTHREADS) {
 		PROC_UNLOCK(p);
 		return (EINVAL);
 	}
-
-	/*
-	 * Limit it to NCPU upcall contexts per ksegrp in any case.
-	 * There is a small race here as we don't hold proclock
-	 * until we inc the ksegrp count, but it's not really a big problem
-	 * if we get one too many, but we save a proc lock.
-	 */
-	if ((!uap->newgroup) && (kg->kg_numupcalls >= ncpus)) {
-		PROC_UNLOCK(p);
-		return (EPROCLIM);
-	}
-
 	if (!(p->p_flag & P_SA)) {
 		first = 1;
 		p->p_flag |= P_SA|P_HADTHREADS;
 	}
-
 	PROC_UNLOCK(p);
+
+	if ((err = copyin(uap->mbx, &mbx, sizeof(mbx))))
+		return (err);
+
+	ncpus = mp_ncpus;
+	if (virtual_cpu != 0)
+		ncpus = virtual_cpu;
 	/*
-	 * Now pay attention!
-	 * If we are going to be bound, then we need to be either
-	 * a new group, or the first call ever. In either
-	 * case we will be creating (or be) the only thread in a group.
-	 * and the concurrency will be set to 1.
-	 * This is not quite right, as we may still make ourself 
-	 * bound after making other ksegrps but it will do for now.
-	 * The library will only try do this much.
+	 * If the new UTS mailbox says that this
+	 * will be a BOUND lwp, then it had better
+	 * have its thread mailbox already there.
 	 */
-	if (!sa && !(uap->newgroup || first))
-		return (EINVAL);
-
-	if (uap->newgroup) {
-		newkg = ksegrp_alloc();
-		bzero(&newkg->kg_startzero,
-		    __rangeof(struct ksegrp, kg_startzero, kg_endzero));
-		bcopy(&kg->kg_startcopy, &newkg->kg_startcopy,
-		    __rangeof(struct ksegrp, kg_startcopy, kg_endcopy));
-		sched_init_concurrency(newkg);
+	if ((mbx.km_flags & KMF_BOUND) || uap->newgroup) {
+		/* It's a bound thread (1:1) */
+		if (mbx.km_curthread == NULL) 
+			return (EINVAL);
+		ncpus = 1;
+		if (!(uap->newgroup || first))
+			return (EINVAL);
+	} else {
+		/* It's an upcall capable thread */
+		sa = TDP_SA;
 		PROC_LOCK(p);
-		if (p->p_numksegrps >= max_groups_per_proc) {
+		/*
+		 * Limit it to NCPU upcall contexts per proc in any case.
+		 * numupcalls will soon be numkse or something
+		 * as it will represent the number of 
+		 * non-bound upcalls available.  (i.e. ones that can 
+		 * actually call up).
+		 */
+		if (p->p_numupcalls >= ncpus) {
 			PROC_UNLOCK(p);
-			ksegrp_free(newkg);
 			return (EPROCLIM);
 		}
-		ksegrp_link(newkg, p);
-		mtx_lock_spin(&sched_lock);
-		sched_fork_ksegrp(td, newkg);
-		mtx_unlock_spin(&sched_lock);
+		p->p_numupcalls++;
 		PROC_UNLOCK(p);
-	} else {
-		/*
-		 * We want to make a thread in our own ksegrp.
-		 * If we are just the first call, either kind
-		 * is ok, but if not then either we must be 
-		 * already an upcallable thread to make another,
-		 * or a bound thread to make one of those.
-		 * Once again, not quite right but good enough for now.. XXXKSE
-		 */
-		if (!first && ((td->td_pflags & TDP_SA) != sa))
-			return (EINVAL);
-
-		newkg = kg;
 	}
 
-	/* 
-	 * This test is a bit "indirect".
-	 * It might simplify things if we made a direct way of testing
-	 * if a ksegrp has been worked on before.
-	 * In the case of a bound request and the concurrency being set to 
-	 * one, the concurrency will already be 1 so it's just inefficient
-	 * but not dangerous to call this again. XXX
+	/*
+	 * For the first call this may not have been set.
+	 * Of course nor may it actually be needed.
+	 * thread_schedule_upcall() will look for it.
 	 */
-	if (newkg->kg_numupcalls == 0) {
-		/*
-		 * Initialize KSE group with the appropriate
-		 * concurrency.
-		 *
-		 * For a multiplexed group, create as as much concurrency
-		 * as the number of physical cpus.
-		 * This increases concurrency in the kernel even if the
-		 * userland is not MP safe and can only run on a single CPU.
-		 * In an ideal world, every physical cpu should execute a
-		 * thread.  If there is enough concurrency, threads in the
-		 * kernel can be executed parallel on different cpus at
-		 * full speed without being restricted by the number of
-		 * upcalls the userland provides.
-		 * Adding more upcall structures only increases concurrency
-		 * in userland.
-		 *
-		 * For a bound thread group, because there is only one thread
-		 * in the group, we only set the concurrency for the group 
-		 * to 1.  A thread in this kind of group will never schedule
-		 * an upcall when blocked.  This simulates pthread system
-		 * scope thread behaviour.
-		 */
-		sched_set_concurrency(newkg, ncpus);
+	if (td->td_standin == NULL) {
+		if (!thread_alloc_spare(td))
+			return (ENOMEM);
 	}
+
 	/* 
 	 * Even bound LWPs get a mailbox and an upcall to hold it.
+	 * XXX This should change.
 	 */
 	newku = upcall_alloc();
 	newku->ku_mailbox = uap->mbx;
 	newku->ku_func = mbx.km_func;
 	bcopy(&mbx.km_stack, &newku->ku_stack, sizeof(stack_t));
 
-	/*
-	 * For the first call this may not have been set.
-	 * Of course nor may it actually be needed.
-	 */
-	if (td->td_standin == NULL)
-		thread_alloc_spare(td);
-
 	PROC_LOCK(p);
-	mtx_lock_spin(&sched_lock);
-	if (newkg->kg_numupcalls >= ncpus) {
-		mtx_unlock_spin(&sched_lock);
-		PROC_UNLOCK(p);
-		upcall_free(newku);
-		return (EPROCLIM);
-	}
-
+	PROC_SLOCK(p);
 	/*
 	 * If we are the first time, and a normal thread,
 	 * then transfer all the signals back to the 'process'.
 	 * SA threading will make a special thread to handle them.
 	 */
-	if (first && sa) {
-		SIGSETOR(p->p_siglist, td->td_siglist);
-		SIGEMPTYSET(td->td_siglist);
+	if (first) {
+		sigqueue_move_set(&td->td_sigqueue, &p->p_sigqueue, 
+			&td->td_sigqueue.sq_signals);
 		SIGFILLSET(td->td_sigmask);
 		SIG_CANTMASK(td->td_sigmask);
 	}
 
 	/*
-	 * Make the new upcall available to the ksegrp.
+	 * Make the new upcall available to the process.
 	 * It may or may not use it, but it's available.
 	 */
-	upcall_link(newku, newkg);
+	TAILQ_INSERT_TAIL(&p->p_upcalls, newku, ku_link);
+	newku->ku_proc = p;
 	PROC_UNLOCK(p);
 	if (mbx.km_quantum)
-		newkg->kg_upquantum = max(1, mbx.km_quantum / tick);
+/* XXX should this be in the thread? */
+		p->p_upquantum = max(1, mbx.km_quantum / tick);
 
 	/*
 	 * Each upcall structure has an owner thread, find which
 	 * one owns it.
 	 */
+	thread_lock(td);
+	mtx_lock_spin(&kse_lock);
 	if (uap->newgroup) {
 		/*
-		 * Because the new ksegrp hasn't a thread,
-		 * create an initial upcall thread to own it.
+		 * The newgroup parameter now means
+		 * "bound, non SA, system scope"
+		 * It is only used for the interrupt thread at the
+		 * moment I think.. (or system scope threads dopey).
+		 * We'll rename it later.
 		 */
 		newtd = thread_schedule_upcall(td, newku);
 	} else {
@@ -743,11 +725,14 @@
 			newtd = thread_schedule_upcall(td, newku);
 		}
 	}
-	mtx_unlock_spin(&sched_lock);
+	mtx_unlock_spin(&kse_lock);
+	thread_unlock(td);
+	PROC_SUNLOCK(p);
 
 	/*
 	 * Let the UTS instance know its LWPID.
 	 * It doesn't really care. But the debugger will.
+	 * XXX warning.. remember that this moves.
 	 */
 	suword32(&newku->ku_mailbox->km_lwp, newtd->td_tid);
 
@@ -755,13 +740,22 @@
 	 * In the same manner, if the UTS has a current user thread, 
 	 * then it is also running on this LWP so set it as well.
 	 * The library could do that of course.. but why not..
+	 * XXX I'm not sure this can ever happen but ...
+	 * XXX does the UTS ever set this in the mailbox before calling this?
 	 */
 	if (mbx.km_curthread)
 		suword32(&mbx.km_curthread->tm_lwp, newtd->td_tid);
-
 	
 	if (sa) {
 		newtd->td_pflags |= TDP_SA;
+		/* 
+		 * If we are starting a new thread, kick it off.
+		 */
+		if (newtd != td) {
+			thread_lock(newtd);
+			sched_add(newtd, SRQ_BORING);
+			thread_unlock(newtd);
+		}
 	} else {
 		newtd->td_pflags &= ~TDP_SA;
 
@@ -793,20 +787,18 @@
 				_PRELE(p);
 			}
 			PROC_UNLOCK(p);
+			thread_lock(newtd);
+			sched_add(newtd, SRQ_BORING);
+			thread_unlock(newtd);
 		}
 	}
-	
-	/* 
-	 * If we are starting a new thread, kick it off.
-	 */
-	if (newtd != td) {
-		mtx_lock_spin(&sched_lock);
-		setrunqueue(newtd, SRQ_BORING);
-		mtx_unlock_spin(&sched_lock);
-	}
 	return (0);
+#else /* !KSE */
+	return (EOPNOTSUPP);
+#endif
 }
 
+#ifdef KSE
 /*
  * Initialize global thread allocation resources.
  */
@@ -819,60 +811,20 @@
 }
 
 /*
- * Stash an embarasingly extra upcall into the zombie upcall queue.
- */
-
-void
-upcall_stash(struct kse_upcall *ku)
-{
-	mtx_lock_spin(&kse_zombie_lock);
-	TAILQ_INSERT_HEAD(&zombie_upcalls, ku, ku_link);
-	mtx_unlock_spin(&kse_zombie_lock);
-}
-
-/*
- * Reap zombie kse resource.
- */
-void
-kse_GC(void)
-{
-	struct kse_upcall *ku_first, *ku_next;
-
-	/*
-	 * Don't even bother to lock if none at this instant,
-	 * we really don't care about the next instant..
-	 */
-	if (!TAILQ_EMPTY(&zombie_upcalls)) {
-		mtx_lock_spin(&kse_zombie_lock);
-		ku_first = TAILQ_FIRST(&zombie_upcalls);
-		if (ku_first)
-			TAILQ_INIT(&zombie_upcalls);
-		mtx_unlock_spin(&kse_zombie_lock);
-		while (ku_first) {
-			ku_next = TAILQ_NEXT(ku_first, ku_link);
-			upcall_free(ku_first);
-			ku_first = ku_next;
-		}
-	}
-}
-
-/*
  * Store the thread context in the UTS's mailbox.
  * then add the mailbox at the head of a list we are building in user space.
- * The list is anchored in the ksegrp structure.
+ * The list is anchored in the proc structure.
  */
 int
 thread_export_context(struct thread *td, int willexit)
 {
 	struct proc *p;
-	struct ksegrp *kg;
 	uintptr_t mbx;
 	void *addr;
 	int error = 0, sig;
 	mcontext_t mc;
 
 	p = td->td_proc;
-	kg = td->td_ksegrp;
 
 	/*
 	 * Post sync signal, or process SIGKILL and SIGSTOP.
@@ -881,9 +833,9 @@
 	 */
 	PROC_LOCK(p);
 	if (td->td_flags & TDF_NEEDSIGCHK) {
-		mtx_lock_spin(&sched_lock);
+		thread_lock(td);
 		td->td_flags &= ~TDF_NEEDSIGCHK;
-		mtx_unlock_spin(&sched_lock);
+		thread_unlock(td);
 		mtx_lock(&p->p_sigacts->ps_mtx);
 		while ((sig = cursig(td)) != 0)
 			postsig(sig);
@@ -913,14 +865,15 @@
 	 * entry into this one
 	 */
 	for (;;) {
-		mbx = (uintptr_t)kg->kg_completed;
+		mbx = (uintptr_t)p->p_completed;
 		if (suword(addr, mbx)) {
 			error = EFAULT;
 			goto bad;
 		}
 		PROC_LOCK(p);
-		if (mbx == (uintptr_t)kg->kg_completed) {
-			kg->kg_completed = td->td_mailbox;
+		if (mbx == (uintptr_t)p->p_completed) {
+			thread_lock(td);
+			p->p_completed = td->td_mailbox;
 			/*
 			 * The thread context may be taken away by
 			 * other upcall threads when we unlock
@@ -928,6 +881,7 @@
 			 * use it again in any other places.
 			 */
 			td->td_mailbox = NULL;
+			thread_unlock(td);
 			PROC_UNLOCK(p);
 			break;
 		}
@@ -943,19 +897,18 @@
 }
 
 /*
- * Take the list of completed mailboxes for this KSEGRP and put them on this
+ * Take the list of completed mailboxes for this Process and put them on this
  * upcall's mailbox as it's the next one going up.
  */
 static int
-thread_link_mboxes(struct ksegrp *kg, struct kse_upcall *ku)
+thread_link_mboxes(struct proc *p, struct kse_upcall *ku)
 {
-	struct proc *p = kg->kg_proc;
 	void *addr;
 	uintptr_t mbx;
 
 	addr = (void *)(&ku->ku_mailbox->km_completed);
 	for (;;) {
-		mbx = (uintptr_t)kg->kg_completed;
+		mbx = (uintptr_t)p->p_completed;
 		if (suword(addr, mbx)) {
 			PROC_LOCK(p);
 			psignal(p, SIGSEGV);
@@ -963,8 +916,8 @@
 			return (EFAULT);
 		}
 		PROC_LOCK(p);
-		if (mbx == (uintptr_t)kg->kg_completed) {
-			kg->kg_completed = NULL;
+		if (mbx == (uintptr_t)p->p_completed) {
+			p->p_completed = NULL;
 			PROC_UNLOCK(p);
 			break;
 		}
@@ -985,9 +938,9 @@
 		return (0);
 	if (user) {
 		/* Current always do via ast() */
-		mtx_lock_spin(&sched_lock);
+		thread_lock(td);
 		td->td_flags |= TDF_ASTPENDING;
-		mtx_unlock_spin(&sched_lock);
+		thread_unlock(td);
 		td->td_uuticks++;
 	} else if (td->td_mailbox != NULL)
 		td->td_usticks++;
@@ -1004,8 +957,12 @@
 	caddr_t addr;
 	u_int uticks;
 
-	if (td->td_mailbox == NULL)
+	thread_lock(td);
+	if (td->td_mailbox == NULL) {
+		thread_unlock(td);
 		return (-1);
+	}
+	thread_unlock(td);
 
 	if ((uticks = td->td_uuticks) != 0) {
 		td->td_uuticks = 0;
@@ -1030,25 +987,29 @@
 
 /*
  * This function is intended to be used to initialize a spare thread
- * for upcall. Initialize thread's large data area outside sched_lock
+ * for upcall. Initialize thread's large data area outside the thread lock
  * for thread_schedule_upcall(). The crhold is also here to get it out
  * from the schedlock as it has a mutex op itself.
  * XXX BUG.. we need to get the cr ref after the thread has 
  * checked and chenged its own, not 6 months before...  
  */
-void
+int
 thread_alloc_spare(struct thread *td)
 {
 	struct thread *spare;
 
 	if (td->td_standin)
-		return;
+		return (1);
 	spare = thread_alloc();
+	if (spare == NULL)
+		return (0);
 	td->td_standin = spare;
 	bzero(&spare->td_startzero,
 	    __rangeof(struct thread, td_startzero, td_endzero));
 	spare->td_proc = td->td_proc;
 	spare->td_ucred = crhold(td->td_ucred);
+	spare->td_flags = TDF_INMEM;
+	return (1);
 }
 
 /*
@@ -1060,8 +1021,8 @@
 {
 	struct thread *td2;
 
-	mtx_assert(&sched_lock, MA_OWNED);
-
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
+	mtx_assert(&kse_lock, MA_OWNED);
 	/*
 	 * Schedule an upcall thread on specified kse_upcall,
 	 * the kse_upcall must be free.
@@ -1082,19 +1043,18 @@
 	 */
 	bcopy(&td->td_startcopy, &td2->td_startcopy,
 	    __rangeof(struct thread, td_startcopy, td_endcopy));
-	thread_link(td2, ku->ku_ksegrp);
+	sched_fork_thread(td, td2);
+	thread_link(td2, ku->ku_proc);
 	/* inherit parts of blocked thread's context as a good template */
 	cpu_set_upcall(td2, td);
 	/* Let the new thread become owner of the upcall */
 	ku->ku_owner   = td2;
 	td2->td_upcall = ku;
-	td2->td_flags  = 0;
 	td2->td_pflags = TDP_SA|TDP_UPCALLING;
 	td2->td_state  = TDS_CAN_RUN;
 	td2->td_inhibitors = 0;
 	SIGFILLSET(td2->td_sigmask);
 	SIG_CANTMASK(td2->td_sigmask);
-	sched_fork_thread(td, td2);
 	return (td2);	/* bogus.. should be a void function */
 }
 
@@ -1103,10 +1063,9 @@
  * debugged.
  */
 void
-thread_signal_add(struct thread *td, int sig)
+thread_signal_add(struct thread *td, ksiginfo_t *ksi)
 {
 	struct proc *p;
-	siginfo_t siginfo;
 	struct sigacts *ps;
 	int error;
 
@@ -1115,11 +1074,11 @@
 	ps = p->p_sigacts;
 	mtx_assert(&ps->ps_mtx, MA_OWNED);
 
-	cpu_thread_siginfo(sig, 0, &siginfo);
 	mtx_unlock(&ps->ps_mtx);
-	SIGADDSET(td->td_sigmask, sig);
+	SIGADDSET(td->td_sigmask, ksi->ksi_signo);
 	PROC_UNLOCK(p);
-	error = copyout(&siginfo, &td->td_mailbox->tm_syncsig, sizeof(siginfo));
+	error = copyout(&ksi->ksi_info, &td->td_mailbox->tm_syncsig,
+			sizeof(siginfo_t));
 	if (error) {
 		PROC_LOCK(p);
 		sigexit(td, SIGSEGV);
@@ -1134,7 +1093,7 @@
 	struct kse_upcall *ku;
 	struct thread *td2;
 
-	mtx_assert(&sched_lock, MA_OWNED);
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
 
 	/*
 	 * If the outgoing thread is in threaded group and has never
@@ -1160,13 +1119,17 @@
 		 * start up immediatly, or at least before us if
 		 * we release our slot.
 		 */
+		mtx_lock_spin(&kse_lock);
 		ku = td->td_upcall;
 		ku->ku_owner = NULL;
 		td->td_upcall = NULL;
 		td->td_pflags &= ~TDP_CAN_UNBIND;
 		td2 = thread_schedule_upcall(td, ku);
+		mtx_unlock_spin(&kse_lock);
 		if (flags & SW_INVOL || nextthread) {
-			setrunqueue(td2, SRQ_YIELDING);
+			thread_lock(td2);
+			sched_add(td2, SRQ_YIELDING);
+			thread_unlock(td2);
 		} else {
 			/* Keep up with reality.. we have one extra thread 
 			 * in the picture.. and it's 'running'.
@@ -1184,7 +1147,6 @@
 thread_user_enter(struct thread *td)
 {
 	struct proc *p = td->td_proc;
-	struct ksegrp *kg;
 	struct kse_upcall *ku;
 	struct kse_thr_mailbox *tmbx;
 	uint32_t flags;
@@ -1207,15 +1169,26 @@
 	 * note where our mailbox is.
 	 */
 
-	kg = td->td_ksegrp;
+	thread_lock(td);
 	ku = td->td_upcall;
+	thread_unlock(td);
 
 	KASSERT(ku != NULL, ("no upcall owned"));
 	KASSERT(ku->ku_owner == td, ("wrong owner"));
 	KASSERT(!TD_CAN_UNBIND(td), ("can unbind"));
 
-	if (td->td_standin == NULL)
-		thread_alloc_spare(td);
+	if (td->td_standin == NULL) {
+		if (!thread_alloc_spare(td)) {
+			PROC_LOCK(p);
+			if (kern_logsigexit)
+				log(LOG_INFO,
+				    "pid %d (%s), uid %d: thread_alloc_spare failed\n",
+				    p->p_pid, p->p_comm,
+				    td->td_ucred ? td->td_ucred->cr_uid : -1);
+			sigexit(td, SIGSEGV);	/* XXX ? */
+			/* panic("thread_user_enter: thread_alloc_spare failed"); */
+		}
+	}
 	ku->ku_mflags = fuword32((void *)&ku->ku_mailbox->km_flags);
 	tmbx = (void *)fuword((void *)&ku->ku_mailbox->km_curthread);
 	if ((tmbx == NULL) || (tmbx == (void *)-1L) ||
@@ -1235,16 +1208,18 @@
 		} else {
 			td->td_mailbox = tmbx;
 			td->td_pflags |= TDP_CAN_UNBIND;
+			PROC_LOCK(p);
 			if (__predict_false(p->p_flag & P_TRACED)) {
 				flags = fuword32(&tmbx->tm_dflags);
 				if (flags & TMDF_SUSPEND) {
-					mtx_lock_spin(&sched_lock);
+					thread_lock(td);
 					/* fuword can block, check again */
 					if (td->td_upcall)
 						ku->ku_flags |= KUF_DOUPCALL;
-					mtx_unlock_spin(&sched_lock);
+					thread_unlock(td);
 				}
 			}
+			PROC_UNLOCK(p);
 		}
 	}
 }
@@ -1265,10 +1240,9 @@
 thread_userret(struct thread *td, struct trapframe *frame)
 {
 	struct kse_upcall *ku;
-	struct ksegrp *kg, *kg2;
 	struct proc *p;
 	struct timespec ts;
-	int error = 0, upcalls, uts_crit;
+	int error = 0, uts_crit;
 
 	/* Nothing to do with bound thread */
 	if (!(td->td_pflags & TDP_SA))
@@ -1285,7 +1259,7 @@
 	}
 
 	p = td->td_proc;
-	kg = td->td_ksegrp;
+	thread_lock(td);
 	ku = td->td_upcall;
 
 	/*
@@ -1295,11 +1269,12 @@
 	 * then it can return direct to userland.
 	 */
 	if (TD_CAN_UNBIND(td)) {
+		thread_unlock(td);
 		td->td_pflags &= ~TDP_CAN_UNBIND;
 		if ((td->td_flags & TDF_NEEDSIGCHK) == 0 &&
-		    (kg->kg_completed == NULL) &&
+		    (p->p_completed == NULL) &&
 		    (ku->ku_flags & KUF_DOUPCALL) == 0 &&
-		    (kg->kg_upquantum && ticks < kg->kg_nextupcall)) {
+		    (p->p_upquantum && ticks < p->p_nextupcall)) {
 			nanotime(&ts);
 			error = copyout(&ts,
 				(caddr_t)&ku->ku_mailbox->km_timeofday,
@@ -1318,53 +1293,46 @@
 		 */
 		td->td_pflags |= TDP_UPCALLING;
 	} else if (td->td_mailbox && (ku == NULL)) {
+		thread_unlock(td);
 		thread_export_context(td, 1);
 		PROC_LOCK(p);
-		if (kg->kg_upsleeps)
-			wakeup(&kg->kg_completed);
-		WITNESS_WARN(WARN_PANIC, &p->p_mtx.mtx_object,
+		if (p->p_upsleeps)
+			wakeup(&p->p_completed);
+		WITNESS_WARN(WARN_PANIC, &p->p_mtx.lock_object,
 		    "thread exiting in userret");
-		mtx_lock_spin(&sched_lock);
+		sigqueue_flush(&td->td_sigqueue);
+		PROC_SLOCK(p);
 		thread_stopped(p);
 		thread_exit();
 		/* NOTREACHED */
-	}
+	} else
+		thread_unlock(td);
 
 	KASSERT(ku != NULL, ("upcall is NULL"));
 	KASSERT(TD_CAN_UNBIND(td) == 0, ("can unbind"));
 
+	PROC_LOCK(p);
+	PROC_SLOCK(p);
 	if (p->p_numthreads > max_threads_per_proc) {
 		max_threads_hits++;
-		PROC_LOCK(p);
-		mtx_lock_spin(&sched_lock);
-		p->p_maxthrwaits++;
 		while (p->p_numthreads > max_threads_per_proc) {
-			upcalls = 0;
-			FOREACH_KSEGRP_IN_PROC(p, kg2) {
-				if (kg2->kg_numupcalls == 0)
-					upcalls++;
-				else
-					upcalls += kg2->kg_numupcalls;
-			}
-			if (upcalls >= max_threads_per_proc)
+			if (p->p_numupcalls >= max_threads_per_proc)
 				break;
-			mtx_unlock_spin(&sched_lock);
+			PROC_SUNLOCK(p);
 			if (msleep(&p->p_numthreads, &p->p_mtx, PPAUSE|PCATCH,
 			    "maxthreads", hz/10) != EWOULDBLOCK) {
-				mtx_lock_spin(&sched_lock);
+				PROC_SLOCK(p);
 				break;
-			} else {
-				mtx_lock_spin(&sched_lock);
-			}
+			} else
+				PROC_SLOCK(p);
 		}
-		p->p_maxthrwaits--;
-		mtx_unlock_spin(&sched_lock);
-		PROC_UNLOCK(p);
 	}
+	PROC_SUNLOCK(p);
+	PROC_UNLOCK(p);
 
 	if (td->td_pflags & TDP_UPCALLING) {
 		uts_crit = 0;
-		kg->kg_nextupcall = ticks + kg->kg_upquantum;
+		p->p_nextupcall = ticks + p->p_upquantum;
 		/*
 		 * There is no more work to do and we are going to ride
 		 * this thread up to userland as an upcall.
@@ -1375,9 +1343,9 @@
 
 		td->td_pflags &= ~TDP_UPCALLING;
 		if (ku->ku_flags & KUF_DOUPCALL) {
-			mtx_lock_spin(&sched_lock);
+			PROC_SLOCK(p);
 			ku->ku_flags &= ~KUF_DOUPCALL;
-			mtx_unlock_spin(&sched_lock);
+			PROC_SUNLOCK(p);
 		}
 		/*
 		 * Set user context to the UTS
@@ -1409,7 +1377,7 @@
 		 * this KSE's mailbox.
 		 */
 		if (!(ku->ku_mflags & KMF_NOCOMPLETED) &&
-		    (error = thread_link_mboxes(kg, ku)) != 0)
+		    (error = thread_link_mboxes(p, ku)) != 0)
 			goto out;
 	}
 	if (!uts_crit) {
@@ -1434,7 +1402,7 @@
 		 * for when we re-enter the kernel.
 		 */
 		if (td->td_standin == NULL)
-			thread_alloc_spare(td);
+			thread_alloc_spare(td); /* XXX care of failure ? */
 	}
 
 	ku->ku_mflags = 0;
@@ -1452,7 +1420,6 @@
 void
 thread_continued(struct proc *p)
 {
-	struct ksegrp *kg;
 	struct kse_upcall *ku;
 	struct thread *td;
 
@@ -1463,19 +1430,15 @@
 		return;
 
 	if (p->p_flag & P_TRACED) {
-		FOREACH_KSEGRP_IN_PROC(p, kg) {
-			td = TAILQ_FIRST(&kg->kg_threads);
-			if (td == NULL)
-				continue;
-			/* not a SA group, nothing to do */
-			if (!(td->td_pflags & TDP_SA))
-				continue;
-			FOREACH_UPCALL_IN_GROUP(kg, ku) {
-				mtx_lock_spin(&sched_lock);
+		td = TAILQ_FIRST(&p->p_threads);
+		if (td && (td->td_pflags & TDP_SA)) {
+			FOREACH_UPCALL_IN_PROC(p, ku) {
+				PROC_SLOCK(p);
 				ku->ku_flags |= KUF_DOUPCALL;
-				mtx_unlock_spin(&sched_lock);
-				wakeup(&kg->kg_completed);
+				PROC_SUNLOCK(p);
+				wakeup(&p->p_completed);
 			}
 		}
 	}
 }
+#endif
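
The kern_kse.c changes above replace the old upcall_stash()/kse_GC() pair with
upcall_reap(): zombie upcalls are spliced onto a private list while kse_lock
(a spin mutex) is held and are only freed after the lock is dropped, since
uma_zfree() may itself sleep on a regular mutex. The following standalone
userland sketch shows the same detach-then-free pattern; it is not taken from
the diff, assumes a BSD-style <sys/queue.h> providing TAILQ_CONCAT and
TAILQ_FOREACH_SAFE, and substitutes a pthread mutex for the kernel spin mutex.

/*
 * reap_sketch.c -- detach the whole zombie list under the lock,
 * free the entries only after dropping it.
 */
#include <sys/queue.h>
#include <pthread.h>
#include <stdlib.h>

struct zombie {
	TAILQ_ENTRY(zombie) link;
};

static TAILQ_HEAD(zhead, zombie) zombies = TAILQ_HEAD_INITIALIZER(zombies);
static pthread_mutex_t zombie_lock = PTHREAD_MUTEX_INITIALIZER;

/* Stash an entry; freeing it here would mean freeing under the lock. */
void
zombie_stash(struct zombie *z)
{

	pthread_mutex_lock(&zombie_lock);
	TAILQ_INSERT_HEAD(&zombies, z, link);
	pthread_mutex_unlock(&zombie_lock);
}

/* Reap later, from a context where freeing is safe. */
void
zombie_reap(void)
{
	struct zhead tmp;
	struct zombie *z, *ztmp;

	TAILQ_INIT(&tmp);
	pthread_mutex_lock(&zombie_lock);
	if (!TAILQ_EMPTY(&zombies)) {
		TAILQ_CONCAT(&tmp, &zombies, link);	/* steal the list */
		TAILQ_INIT(&zombies);
	}
	pthread_mutex_unlock(&zombie_lock);

	TAILQ_FOREACH_SAFE(z, &tmp, link, ztmp)		/* free unlocked */
		free(z);
}
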
Index: subr_rman.c
===================================================================
RCS file: /home/cvs/src/sys/kern/subr_rman.c,v
retrieving revision 1.1.1.2
retrieving revision 1.2
diff -L sys/kern/subr_rman.c -L sys/kern/subr_rman.c -u -r1.1.1.2 -r1.2
--- sys/kern/subr_rman.c
+++ sys/kern/subr_rman.c
@@ -55,13 +55,15 @@
  * permitted.
  */
 
+#include "opt_ddb.h"
+
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/subr_rman.c,v 1.43.2.1 2006/01/20 07:38:01 yongari Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/subr_rman.c,v 1.57 2007/04/28 07:37:49 jmg Exp $");
 
-#define __RMAN_RESOURCE_VISIBLE
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
+#include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
@@ -70,6 +72,33 @@
 #include <sys/rman.h>
 #include <sys/sysctl.h>
 
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
+/*
+ * We use a linked list rather than a bitmap because we need to be able to
+ * represent potentially huge objects (like all of a processor's physical
+ * address space).  That is also why the indices are defined to have type
+ * `unsigned long' -- that being the largest integral type in ISO C (1990).
+ * The 1999 version of C allows `long long'; we may need to switch to that
+ * at some point in the future, particularly if we want to support 36-bit
+ * addresses on IA32 hardware.
+ */
+struct resource_i {
+	struct resource		r_r;
+	TAILQ_ENTRY(resource_i)	r_link;
+	LIST_ENTRY(resource_i)	r_sharelink;
+	LIST_HEAD(, resource_i)	*r_sharehead;
+	u_long	r_start;	/* index of the first entry in this resource */
+	u_long	r_end;		/* index of the last entry (inclusive) */
+	u_int	r_flags;
+	void	*r_virtual;	/* virtual address of this resource */
+	struct	device *r_dev;	/* device which has allocated this resource */
+	struct	rman *r_rm;	/* resource manager from whence this came */
+	int	r_rid;		/* optional rid for this resource. */
+};
+
 int     rman_debug = 0;
 TUNABLE_INT("debug.rman_debug", &rman_debug);
 SYSCTL_INT(_debug, OID_AUTO, rman_debug, CTLFLAG_RW,
@@ -81,10 +110,22 @@
 
 struct	rman_head rman_head;
 static	struct mtx rman_mtx; /* mutex to protect rman_head */
-static	int int_rman_activate_resource(struct rman *rm, struct resource *r,
-				       struct resource **whohas);
-static	int int_rman_deactivate_resource(struct resource *r);
-static	int int_rman_release_resource(struct rman *rm, struct resource *r);
+static	int int_rman_activate_resource(struct rman *rm, struct resource_i *r,
+				       struct resource_i **whohas);
+static	int int_rman_deactivate_resource(struct resource_i *r);
+static	int int_rman_release_resource(struct rman *rm, struct resource_i *r);
+
+static __inline struct resource_i *
+int_alloc_resource(int malloc_flag)
+{
+	struct resource_i *r;
+
+	r = malloc(sizeof *r, M_RMAN, malloc_flag | M_ZERO);
+	if (r != NULL) {
+		r->r_r.__r_i = r;
+	}
+	return (r);
+}
 
 int
 rman_init(struct rman *rm)
@@ -114,18 +155,14 @@
 	return 0;
 }
 
-/*
- * NB: this interface is not robust against programming errors which
- * add multiple copies of the same region.
- */
 int
 rman_manage_region(struct rman *rm, u_long start, u_long end)
 {
-	struct resource *r, *s;
+	struct resource_i *r, *s, *t;
 
 	DPRINTF(("rman_manage_region: <%s> request: start %#lx, end %#lx\n",
 	    rm->rm_descr, start, end));
-	r = malloc(sizeof *r, M_RMAN, M_NOWAIT | M_ZERO);
+	r = int_alloc_resource(M_NOWAIT);
 	if (r == NULL)
 		return ENOMEM;
 	r->r_start = start;
@@ -133,15 +170,56 @@
 	r->r_rm = rm;
 
 	mtx_lock(rm->rm_mtx);
-	for (s = TAILQ_FIRST(&rm->rm_list);
-	     s && s->r_end < r->r_start;
-	     s = TAILQ_NEXT(s, r_link))
-		;
 
+	/* Skip entries before us. */
+	TAILQ_FOREACH(s, &rm->rm_list, r_link) {
+		if (s->r_end == ULONG_MAX)
+			break;
+		if (s->r_end + 1 >= r->r_start)
+			break;
+	}
+
+	/* If we ran off the end of the list, insert at the tail. */
 	if (s == NULL) {
 		TAILQ_INSERT_TAIL(&rm->rm_list, r, r_link);
 	} else {
-		TAILQ_INSERT_BEFORE(s, r, r_link);
+		/* Check for any overlap with the current region. */
+		if (r->r_start <= s->r_end && r->r_end >= s->r_start)
+			return EBUSY;
+
+		/* Check for any overlap with the next region. */
+		t = TAILQ_NEXT(s, r_link);
+		if (t && r->r_start <= t->r_end && r->r_end >= t->r_start)
+			return EBUSY;
+
+		/*
+		 * See if this region can be merged with the next region.  If
+		 * not, clear the pointer.
+		 */
+		if (t && (r->r_end + 1 != t->r_start || t->r_flags != 0))
+			t = NULL;
+
+		/* See if we can merge with the current region. */
+		if (s->r_end + 1 == r->r_start && s->r_flags == 0) {
+			/* Can we merge all 3 regions? */
+			if (t != NULL) {
+				s->r_end = t->r_end;
+				TAILQ_REMOVE(&rm->rm_list, t, r_link);
+				free(r, M_RMAN);
+				free(t, M_RMAN);
+			} else {
+				s->r_end = r->r_end;
+				free(r, M_RMAN);
+			}
+		} else if (t != NULL) {
+			/* Can we merge with just the next region? */
+			t->r_start = r->r_start;
+			free(r, M_RMAN);
+		} else if (s->r_end < r->r_start) {
+			TAILQ_INSERT_AFTER(&rm->rm_list, s, r, r_link);
+		} else {
+			TAILQ_INSERT_BEFORE(s, r, r_link);
+		}
 	}
 
 	mtx_unlock(rm->rm_mtx);
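
The rewritten rman_manage_region() above keeps the region list sorted, rejects
requests that overlap an existing region (EBUSY), and coalesces a new region
with a neighbour that exactly abuts it and is unallocated. A small standalone
sketch of that adjacency/overlap test follows; struct region and region_add()
are invented names for illustration, and the ULONG_MAX wrap-around guard of
the real code is omitted.

/*
 * merge_sketch.c -- overlap is rejected, exact adjacency is merged.
 */
#include <stdio.h>

struct region {
	unsigned long start, end;	/* inclusive, like r_start/r_end */
};

/*
 * Try to add [start, end] next to the existing free region r.
 * Returns 1 if merged, 0 if disjoint, -1 if the ranges overlap
 * (the case the diff now fails with EBUSY).
 */
static int
region_add(struct region *r, unsigned long start, unsigned long end)
{

	if (start <= r->end && end >= r->start)
		return (-1);			/* overlap: reject */
	if (r->end + 1 == start) {		/* abuts on the right */
		r->end = end;
		return (1);
	}
	if (end + 1 == r->start) {		/* abuts on the left */
		r->start = start;
		return (1);
	}
	return (0);				/* disjoint: keep separate */
}

int
main(void)
{
	struct region r = { 0x1000, 0x1fff };

	printf("add 0x2000-0x2fff -> %d\n", region_add(&r, 0x2000, 0x2fff));
	printf("region is now 0x%lx-0x%lx\n", r.start, r.end);
	printf("add 0x1800-0x18ff -> %d\n", region_add(&r, 0x1800, 0x18ff));
	return (0);
}
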
@@ -149,9 +227,19 @@
 }
 
 int
+rman_init_from_resource(struct rman *rm, struct resource *r)
+{
+	int rv;
+
+	if ((rv = rman_init(rm)) != 0)
+		return (rv);
+	return (rman_manage_region(rm, r->__r_i->r_start, r->__r_i->r_end));
+}
+
+int
 rman_fini(struct rman *rm)
 {
-	struct resource *r;
+	struct resource_i *r;
 
 	mtx_lock(rm->rm_mtx);
 	TAILQ_FOREACH(r, &rm->rm_list, r_link) {
@@ -186,14 +274,15 @@
 		      struct device *dev)
 {
 	u_int	want_activate;
-	struct	resource *r, *s, *rv;
+	struct	resource_i *r, *s, *rv;
 	u_long	rstart, rend, amask, bmask;
 
 	rv = NULL;
 
-	DPRINTF(("rman_reserve_resource: <%s> request: [%#lx, %#lx], length "
-	       "%#lx, flags %u, device %s\n", rm->rm_descr, start, end, count,
-	       flags, dev == NULL ? "<null>" : device_get_nameunit(dev)));
+	DPRINTF(("rman_reserve_resource_bound: <%s> request: [%#lx, %#lx], "
+	       "length %#lx, flags %u, device %s\n", rm->rm_descr, start, end,
+	       count, flags,
+	       dev == NULL ? "<null>" : device_get_nameunit(dev)));
 	want_activate = (flags & RF_ACTIVE);
 	flags &= ~RF_ACTIVE;
 
@@ -267,7 +356,7 @@
 			 * split it in two.  The first case requires
 			 * two new allocations; the second requires but one.
 			 */
-			rv = malloc(sizeof *rv, M_RMAN, M_NOWAIT | M_ZERO);
+			rv = int_alloc_resource(M_NOWAIT);
 			if (rv == NULL)
 				goto out;
 			rv->r_start = rstart;
@@ -285,7 +374,7 @@
 				/*
 				 * We are allocating in the middle.
 				 */
-				r = malloc(sizeof *r, M_RMAN, M_NOWAIT|M_ZERO);
+				r = int_alloc_resource(M_NOWAIT);
 				if (r == NULL) {
 					free(rv, M_RMAN);
 					rv = NULL;
@@ -343,7 +432,7 @@
 		    && (s->r_end - s->r_start + 1) == count &&
 		    (s->r_start & amask) == 0 &&
 		    ((s->r_start ^ s->r_end) & bmask) == 0) {
-			rv = malloc(sizeof *rv, M_RMAN, M_NOWAIT | M_ZERO);
+			rv = int_alloc_resource(M_NOWAIT);
 			if (rv == NULL)
 				goto out;
 			rv->r_start = s->r_start;
@@ -383,7 +472,7 @@
 	 * make sense for RF_TIMESHARE-type resources.)
 	 */
 	if (rv && want_activate) {
-		struct resource *whohas;
+		struct resource_i *whohas;
 		if (int_rman_activate_resource(rm, rv, &whohas)) {
 			int_rman_release_resource(rm, rv);
 			rv = NULL;
@@ -391,7 +480,7 @@
 	}
 
 	mtx_unlock(rm->rm_mtx);
-	return (rv);
+	return (rv == NULL ? NULL : &rv->r_r);
 }
 
 struct resource *
@@ -404,10 +493,10 @@
 }
 
 static int
-int_rman_activate_resource(struct rman *rm, struct resource *r,
-			   struct resource **whohas)
+int_rman_activate_resource(struct rman *rm, struct resource_i *r,
+			   struct resource_i **whohas)
 {
-	struct resource *s;
+	struct resource_i *s;
 	int ok;
 
 	/*
@@ -439,12 +528,13 @@
 }
 
 int
-rman_activate_resource(struct resource *r)
+rman_activate_resource(struct resource *re)
 {
 	int rv;
-	struct resource *whohas;
+	struct resource_i *r, *whohas;
 	struct rman *rm;
 
+	r = re->__r_i;
 	rm = r->r_rm;
 	mtx_lock(rm->rm_mtx);
 	rv = int_rman_activate_resource(rm, r, &whohas);
@@ -453,12 +543,13 @@
 }
 
 int
-rman_await_resource(struct resource *r, int pri, int timo)
+rman_await_resource(struct resource *re, int pri, int timo)
 {
 	int	rv;
-	struct	resource *whohas;
+	struct	resource_i *r, *whohas;
 	struct	rman *rm;
 
+	r = re->__r_i;
 	rm = r->r_rm;
 	mtx_lock(rm->rm_mtx);
 	for (;;) {
@@ -478,7 +569,7 @@
 }
 
 static int
-int_rman_deactivate_resource(struct resource *r)
+int_rman_deactivate_resource(struct resource_i *r)
 {
 
 	r->r_flags &= ~RF_ACTIVE;
@@ -494,17 +585,17 @@
 {
 	struct	rman *rm;
 
-	rm = r->r_rm;
+	rm = r->__r_i->r_rm;
 	mtx_lock(rm->rm_mtx);
-	int_rman_deactivate_resource(r);
+	int_rman_deactivate_resource(r->__r_i);
 	mtx_unlock(rm->rm_mtx);
 	return 0;
 }
 
 static int
-int_rman_release_resource(struct rman *rm, struct resource *r)
+int_rman_release_resource(struct rman *rm, struct resource_i *r)
 {
-	struct	resource *s, *t;
+	struct	resource_i *s, *t;
 
 	if (r->r_flags & RF_ACTIVE)
 		int_rman_deactivate_resource(r);
@@ -595,11 +686,14 @@
 }
 
 int
-rman_release_resource(struct resource *r)
+rman_release_resource(struct resource *re)
 {
 	int	rv;
-	struct	rman *rm = r->r_rm;
+	struct	resource_i *r;
+	struct	rman *rm;
 
+	r = re->__r_i;
+	rm = r->r_rm;
 	mtx_lock(rm->rm_mtx);
 	rv = int_rman_release_resource(rm, r);
 	mtx_unlock(rm->rm_mtx);
@@ -627,37 +721,37 @@
 u_long
 rman_get_start(struct resource *r)
 {
-	return (r->r_start);
+	return (r->__r_i->r_start);
 }
 
 u_long
 rman_get_end(struct resource *r)
 {
-	return (r->r_end);
+	return (r->__r_i->r_end);
 }
 
 u_long
 rman_get_size(struct resource *r)
 {
-	return (r->r_end - r->r_start + 1);
+	return (r->__r_i->r_end - r->__r_i->r_start + 1);
 }
 
 u_int
 rman_get_flags(struct resource *r)
 {
-	return (r->r_flags);
+	return (r->__r_i->r_flags);
 }
 
 void
 rman_set_virtual(struct resource *r, void *v)
 {
-	r->r_virtual = v;
+	r->__r_i->r_virtual = v;
 }
 
 void *
 rman_get_virtual(struct resource *r)
 {
-	return (r->r_virtual);
+	return (r->__r_i->r_virtual);
 }
 
 void
@@ -687,37 +781,44 @@
 void
 rman_set_rid(struct resource *r, int rid)
 {
-	r->r_rid = rid;
+	r->__r_i->r_rid = rid;
 }
 
 void
 rman_set_start(struct resource *r, u_long start)
 {
-	r->r_start = start;
+	r->__r_i->r_start = start;
 }
 
 void
 rman_set_end(struct resource *r, u_long end)
 {
-	r->r_end = end;
+	r->__r_i->r_end = end;
 }
 
 int
 rman_get_rid(struct resource *r)
 {
-	return (r->r_rid);
+	return (r->__r_i->r_rid);
 }
 
 struct device *
 rman_get_device(struct resource *r)
 {
-	return (r->r_dev);
+	return (r->__r_i->r_dev);
 }
 
 void
 rman_set_device(struct resource *r, struct device *dev)
 {
-	r->r_dev = dev;
+	r->__r_i->r_dev = dev;
+}
+
+int
+rman_is_region_manager(struct resource *r, struct rman *rm)
+{
+
+	return (r->__r_i->r_rm == rm);
 }
 
 /*
@@ -733,7 +834,7 @@
 	u_int			namelen = arg2;
 	int			rman_idx, res_idx;
 	struct rman		*rm;
-	struct resource		*res;
+	struct resource_i	*res;
 	struct u_rman		urm;
 	struct u_resource	ures;
 	int			error;
@@ -777,7 +878,7 @@
 	/*
 	 * Find the indexed resource and return it.
 	 */
-	mtx_lock(&rman_mtx);
+	mtx_lock(rm->rm_mtx);
 	TAILQ_FOREACH(res, &rm->rm_list, r_link) {
 		if (res_idx-- == 0) {
 			bzero(&ures, sizeof(ures));
@@ -801,14 +902,58 @@
 			ures.r_size = res->r_end - res->r_start + 1;
 			ures.r_flags = res->r_flags;
 
-			mtx_unlock(&rman_mtx);
+			mtx_unlock(rm->rm_mtx);
 			error = SYSCTL_OUT(req, &ures, sizeof(ures));
 			return (error);
 		}
 	}
-	mtx_unlock(&rman_mtx);
+	mtx_unlock(rm->rm_mtx);
 	return (ENOENT);
 }
 
 SYSCTL_NODE(_hw_bus, OID_AUTO, rman, CTLFLAG_RD, sysctl_rman,
     "kernel resource manager");
+
+#ifdef DDB
+static void
+dump_rman(struct rman *rm)
+{
+	struct resource_i *r;
+	const char *devname;
+
+	if (db_pager_quit)
+		return;
+	db_printf("rman: %s\n", rm->rm_descr);
+	db_printf("    0x%lx-0x%lx (full range)\n", rm->rm_start, rm->rm_end);
+	TAILQ_FOREACH(r, &rm->rm_list, r_link) {
+		if (r->r_dev != NULL) {
+			devname = device_get_nameunit(r->r_dev);
+			if (devname == NULL)
+				devname = "nomatch";
+		} else
+			devname = NULL;
+		db_printf("    0x%lx-0x%lx ", r->r_start, r->r_end);
+		if (devname != NULL)
+			db_printf("(%s)\n", devname);
+		else
+			db_printf("----\n");
+		if (db_pager_quit)
+			return;
+	}
+}
+
+DB_SHOW_COMMAND(rman, db_show_rman)
+{
+
+	if (have_addr)
+		dump_rman((struct rman *)addr);
+}
+
+DB_SHOW_COMMAND(allrman, db_show_all_rman)
+{
+	struct rman *rm;
+
+	TAILQ_FOREACH(rm, &rman_head, rm_link)
+		dump_rman(rm);
+}
+#endif
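
Most of the subr_rman.c diff is mechanical fallout from splitting struct
resource into a small public handle and an allocator-private struct
resource_i: the handle only carries the __r_i back-pointer, the private
structure embeds the handle (r_r) and points it back at itself in
int_alloc_resource(), and every accessor (rman_get_start() and friends)
follows __r_i. A minimal standalone sketch of that handle/implementation
split follows; res_public/res_private are invented names, and only the shape
of the pattern matches the diff.

/*
 * handle_sketch.c -- public handle with a back-pointer into private state.
 */
#include <stdio.h>
#include <stdlib.h>

struct res_private;

struct res_public {			/* what consumers see */
	struct res_private *priv;	/* plays the role of __r_i */
};

struct res_private {			/* what the manager keeps */
	struct res_public pub;		/* embedded public view (r_r) */
	unsigned long start, end;
};

static struct res_private *
res_alloc(unsigned long start, unsigned long end)
{
	struct res_private *r;

	r = calloc(1, sizeof(*r));
	if (r != NULL) {
		r->pub.priv = r;	/* same trick as r->r_r.__r_i = r */
		r->start = start;
		r->end = end;
	}
	return (r);
}

static unsigned long
res_get_start(struct res_public *p)
{

	return (p->priv->start);	/* accessor follows the back-pointer */
}

int
main(void)
{
	struct res_private *r;

	r = res_alloc(0xd000, 0xdfff);
	if (r != NULL) {
		printf("start 0x%lx\n", res_get_start(&r->pub));
		free(r);
	}
	return (0);
}
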
Index: subr_firmware.c
===================================================================
RCS file: /home/cvs/src/sys/kern/subr_firmware.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/subr_firmware.c -L sys/kern/subr_firmware.c -u -r1.1.1.1 -r1.2
--- sys/kern/subr_firmware.c
+++ sys/kern/subr_firmware.c
@@ -25,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/subr_firmware.c,v 1.1.2.1 2006/02/23 02:13:31 mlaier Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/subr_firmware.c,v 1.9 2007/02/15 17:21:31 luigi Exp $");
 
 #include <sys/param.h>
 #include <sys/kernel.h>
@@ -38,44 +38,148 @@
 #include <sys/errno.h>
 #include <sys/linker.h>
 #include <sys/firmware.h>
+#include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/module.h>
 
+/*
+ * Loadable firmware support. See sys/sys/firmware.h and firmware(9)
+ * for more details on the subsystem.
+ *
+ * 'struct firmware' is the user-visible part of the firmware table.
+ * Additional internal information is stored in a 'struct priv_fw'
+ * (currently a static array). A slot is in use if FW_INUSE is true:
+ */
+
+#define FW_INUSE(p)	((p)->file != NULL || (p)->fw.name != NULL)
+
+/*
+ * fw.name != NULL when an image is registered; file != NULL for
+ * autoloaded images whose handling has not been completed.
+ *
+ * The state of a slot evolves as follows:
+ *	firmware_register	-->  fw.name = image_name
+ *	(autoloaded image)	-->  file = module reference
+ *	firmware_unregister	-->  fw.name = NULL
+ *	(unloadentry complete)	-->  file = NULL
+ *
+ * In order for the above to work, the 'file' field must remain
+ * unchanged in firmware_unregister().
+ *
+ * Images residing in the same module are linked to each other
+ * through the 'parent' argument of firmware_register().
+ * One image (typically, one with the same name as the module to let
+ * the autoloading mechanism work) is considered the parent image for
+ * all other images in the same module. Children affect the refcount
+ * on the parent image preventing improper unloading of the image itself.
+ */
+
+struct priv_fw {
+	int		refcnt;		/* reference count */
+
+	/*
+	 * parent entry, see above. Set on firmware_register(),
+	 * cleared on firmware_unregister().
+	 */
+	struct priv_fw	*parent;
+
+	int 		flags;	/* record FIRMWARE_UNLOAD requests */
+#define FW_UNLOAD	0x100
+
+	/*
+	 * 'file' is private info managed by the autoload/unload code.
+	 * Set at the end of firmware_get(), cleared only in the
+	 * firmware_task, so the latter can depend on its value even
+	 * while the lock is not held.
+	 */
+	linker_file_t   file;	/* module file, if autoloaded */
+
+	/*
+	 * 'fw' is the externally visible image information.
+	 * We do not make it the first field in priv_fw, to avoid the
+	 * temptation of casting pointers to each other.
+ * Use PRIV_FW(fw) to get a pointer to the container of fw.
+	 * Beware, PRIV_FW does not work for a NULL pointer.
+	 */
+	struct firmware	fw;	/* externally visible information */
+};
+
+/*
+ * PRIV_FW returns the pointer to the container of struct firmware *x.
+ * Cast to intptr_t to override the 'const' attribute of x
+ */
+#define PRIV_FW(x)	((struct priv_fw *)		\
+	((intptr_t)(x) - offsetof(struct priv_fw, fw)) )
+
+/*
+ * At the moment we use a static array as backing store for the registry.
+ * Should we move to a dynamic structure, keep in mind that we cannot
+ * reallocate the array because pointers are held externally.
+ * A list may work, though.
+ */
 #define	FIRMWARE_MAX	30
-static char *name_unload = "UNLOADING";
-static struct firmware firmware_table[FIRMWARE_MAX];
+static struct priv_fw firmware_table[FIRMWARE_MAX];
+
+/*
+ * Module releases are handled in a separate task as they might sleep.
+ */
 struct task firmware_task;
+
+/*
+ * This mutex protects accesses to the firmware table.
+ */
 struct mtx firmware_mtx;
 MTX_SYSINIT(firmware, &firmware_mtx, "firmware table", MTX_DEF);
 
 /*
+ * Helper function to lookup a name.
+ * As a side effect, it sets the pointer to a free slot, if any.
+ * This way we can concentrate most of the registry scanning in
+ * this function, which makes it easier to replace the registry
+ * with some other data structure.
+ */
+static struct priv_fw *
+lookup(const char *name, struct priv_fw **empty_slot)
+{
+	struct priv_fw *fp = NULL;
+	struct priv_fw *dummy;
+	int i;
+
+	if (empty_slot == NULL)
+		empty_slot = &dummy;
+	*empty_slot = NULL;
+	for (i = 0; i < FIRMWARE_MAX; i++) {
+		fp = &firmware_table[i];
+		if (fp->fw.name != NULL && strcasecmp(name, fp->fw.name) == 0)
+			break;
+		else if (!FW_INUSE(fp))
+			*empty_slot = fp;
+	}
+	return (i < FIRMWARE_MAX) ? fp : NULL;
+}
+
+/*
  * Register a firmware image with the specified name.  The
  * image name must not already be registered.  If this is a
  * subimage then parent refers to a previously registered
  * image that this should be associated with.
  */
-struct firmware *
+const struct firmware *
 firmware_register(const char *imagename, const void *data, size_t datasize,
-    unsigned int version, struct firmware *parent)
+    unsigned int version, const struct firmware *parent)
 {
-	struct firmware *frp = NULL;
-	int i;
+	struct priv_fw *match, *frp;
 
 	mtx_lock(&firmware_mtx);
-	for (i = 0; i < FIRMWARE_MAX; i++) {
-		struct firmware *fp = &firmware_table[i];
-
-		if (fp->name == NULL) {
-			if (frp == NULL)
-				frp = fp;
-			continue;
-		}
-		if (strcasecmp(imagename, fp->name) == 0) {
-			mtx_unlock(&firmware_mtx);
-			printf("%s: image %s already registered!\n",
-				__func__, imagename);
-			return NULL;
-		}
+	/*
+	 * Do a lookup to make sure the name is unique or find a free slot.
+	 */
+	match = lookup(imagename, &frp);
+	if (match != NULL) {
+		mtx_unlock(&firmware_mtx);
+		printf("%s: image %s already registered!\n",
+			__func__, imagename);
+		return NULL;
 	}
 	if (frp == NULL) {
 		mtx_unlock(&firmware_mtx);
@@ -83,49 +187,20 @@
 		    __func__, imagename);
 		return NULL;
 	}
-	frp->name = imagename;
-	frp->data = data;
-	frp->datasize = datasize;
-	frp->version = version;
-	frp->refcnt = 0;
-	if (parent != NULL)
-		parent->refcnt++;
-	frp->parent = parent;
-	frp->file = NULL;
-	mtx_unlock(&firmware_mtx);
-	return frp;
-}
-
-static void
-clearentry(struct firmware *fp, int keep_file)
-{
-	KASSERT(fp->refcnt == 0, ("image %s refcnt %u", fp->name, fp->refcnt));
-	if (keep_file && (fp->file != NULL))
-		fp->name = name_unload;
-	else {
-		fp->name = NULL;
-		fp->file = NULL;
-	}
-	fp->data = NULL;
-	fp->datasize = 0;
-	fp->version = 0;
-	if (fp->parent != NULL) {	/* release parent reference */
-		fp->parent->refcnt--;
-		fp->parent = NULL;
+	bzero(frp, sizeof(*frp));	/* start from a clean record */
+	frp->fw.name = imagename;
+	frp->fw.data = data;
+	frp->fw.datasize = datasize;
+	frp->fw.version = version;
+	if (parent != NULL) {
+		frp->parent = PRIV_FW(parent);
+		frp->parent->refcnt++;
 	}
-}
-
-static struct firmware *
-lookup(const char *name)
-{
-	int i;
-
-	for (i = 0; i < FIRMWARE_MAX; i++) {
-		struct firmware * fp = &firmware_table[i];
-		if (fp->name != NULL && strcasecmp(name, fp->name) == 0)
-			return fp;
-	}
-	return NULL;
+	mtx_unlock(&firmware_mtx);
+	if (bootverbose)
+		printf("firmware: '%s' version %u: %zu bytes loaded at %p\n",
+		    imagename, version, datasize, data);
+	return &frp->fw;
 }
 
 /*
@@ -136,111 +211,168 @@
 int
 firmware_unregister(const char *imagename)
 {
-	struct firmware *fp;
-	int refcnt = 0;
+	struct priv_fw *fp;
+	int err;
 
 	mtx_lock(&firmware_mtx);
-	/*
-	 * NB: it is ok for the lookup to fail; this can happen
-	 * when a module is unloaded on last reference and the
-	 * module unload handler unregister's each of it's
-	 * firmware images.
-	 */
-	fp = lookup(imagename);
-	if (fp != NULL) {
-		refcnt = fp->refcnt;
-		if (refcnt == 0)
-			clearentry(fp, 0);
+	fp = lookup(imagename, NULL);
+	if (fp == NULL) {
+		/*
+		 * It is ok for the lookup to fail; this can happen
+		 * when a module is unloaded on last reference and the
+		 * module unload handler unregisters each of its
+		 * firmware images.
+		 */
+		err = 0;
+	} else if (fp->refcnt != 0) {	/* cannot unregister */
+		err = EBUSY;
+	} else {
+		linker_file_t   x = fp->file;	/* save value */
+
+		if (fp->parent != NULL)	/* release parent reference */
+			fp->parent->refcnt--;
+		/*
+		 * Clear the whole entry with bzero to make sure we
+		 * do not forget anything. Then restore 'file' which is
+		 * non-null for autoloaded images.
+		 */
+		bzero(fp, sizeof(struct priv_fw));
+		fp->file = x;
+		err = 0;
 	}
 	mtx_unlock(&firmware_mtx);
-	return (refcnt != 0 ? EBUSY : 0);
+	return err;
 }
 
 /*
  * Lookup and potentially load the specified firmware image.
- * If the firmware is not found in the registry attempt to
- * load a kernel module with the image name.  If the firmware
- * is located a reference is returned.  The caller must release
- * this reference for the image to be eligible for removal/unload.
+ * If the firmware is not found in the registry, try to load a kernel
+ * module named as the image name.
+ * If the firmware is located, a reference is returned. The caller must
+ * release this reference for the image to be eligible for removal/unload.
  */
-struct firmware *
+const struct firmware *
 firmware_get(const char *imagename)
 {
 	struct thread *td;
-	struct firmware *fp;
+	struct priv_fw *fp;
 	linker_file_t result;
-	int requested_load = 0;
 
-again:
 	mtx_lock(&firmware_mtx);
-	fp = lookup(imagename);
-	if (fp != NULL) {
-		if (requested_load)
-			fp->file = result;
-		fp->refcnt++;
-		mtx_unlock(&firmware_mtx);
-		return fp;
-	}
+	fp = lookup(imagename, NULL);
+	if (fp != NULL)
+		goto found;
 	/*
-	 * Image not present, try to load the module holding it
-	 * or if we already tried give up.
+	 * Image not present, try to load the module holding it.
 	 */
 	mtx_unlock(&firmware_mtx);
-	if (requested_load) {
-		printf("%s: failed to load firmware image %s\n",
-		    __func__, imagename);
-		return NULL;
-	}
 	td = curthread;
-	if (suser(td) != 0 || securelevel_gt(td->td_ucred, 0) != 0) {
+	if (priv_check(td, PRIV_FIRMWARE_LOAD) != 0 ||
+	    securelevel_gt(td->td_ucred, 0) != 0) {
 		printf("%s: insufficient privileges to "
 		    "load firmware image %s\n", __func__, imagename);
 		return NULL;
 	}
-	mtx_lock(&Giant);		/* XXX */
 	(void) linker_reference_module(imagename, NULL, &result);
-	mtx_unlock(&Giant);		/* XXX */
-	requested_load = 1;
-	goto again;		/* sort of an Algol-style for loop */
+	/*
+	 * After loading the module, see if the image is registered now.
+	 */
+	mtx_lock(&firmware_mtx);
+	fp = lookup(imagename, NULL);
+	if (fp == NULL) {
+		mtx_unlock(&firmware_mtx);
+		printf("%s: failed to load firmware image %s\n",
+			__func__, imagename);
+		(void) linker_release_module(imagename, NULL, NULL);
+		return NULL;
+	}
+	fp->file = result;	/* record the module identity */
+
+found:				/* common exit point on success */
+	fp->refcnt++;
+	mtx_unlock(&firmware_mtx);
+	return &fp->fw;
 }
 
-static void
-unloadentry(void *unused1, int unused2)
+/*
+ * Release a reference to a firmware image returned by firmware_get.
+ * The caller may specify, with the FIRMWARE_UNLOAD flag, its desire
+ * to release the resource, but the flag is only advisory.
+ *
+ * If this is the last reference to the firmware image, and this is an
+ * autoloaded module, wake up the firmware_task to figure out what to do
+ * with the associated module.
+ */
+void
+firmware_put(const struct firmware *p, int flags)
 {
-	struct firmware *fp;
+	struct priv_fw *fp = PRIV_FW(p);
 
 	mtx_lock(&firmware_mtx);
-	while ((fp = lookup(name_unload))) {
-		/*
-		 * XXX: ugly, we should be able to lookup unlocked here if
-		 * we properly lock around clearentry below to avoid double
-		 * unload.  Play it safe for now.
-		 */
-		mtx_unlock(&firmware_mtx);
-
-		linker_file_unload(fp->file, LINKER_UNLOAD_NORMAL);
-
-		mtx_lock(&firmware_mtx);
-		clearentry(fp, 0);
+	fp->refcnt--;
+	if (fp->refcnt == 0) {
+		if (flags & FIRMWARE_UNLOAD)
+			fp->flags |= FW_UNLOAD;
+		if (fp->file)
+			taskqueue_enqueue(taskqueue_thread, &firmware_task);
 	}
 	mtx_unlock(&firmware_mtx);
 }
 
 /*
- * Release a reference to a firmware image returned by
- * firmware_get.  The reference is released and if this is
- * the last reference to the firmware image the associated
- * module may be released/unloaded.
+ * The body of the task in charge of unloading autoloaded modules
+ * that are not needed anymore.
+ * Images can be cross-linked so we may need to make multiple passes,
+ * but the time we spend in the loop is bounded because we clear entries
+ * as we touch them.
  */
-void
-firmware_put(struct firmware *fp, int flags)
+static void
+unloadentry(void *unused1, int unused2)
 {
+	int limit = FIRMWARE_MAX;
+	int i;	/* current cycle */
+
 	mtx_lock(&firmware_mtx);
-	fp->refcnt--;
-	if (fp->refcnt == 0 && (flags & FIRMWARE_UNLOAD))
-		clearentry(fp, 1);
-	if (fp->file)
-		taskqueue_enqueue(taskqueue_thread, &firmware_task);
+	/*
+	 * Scan the table. limit is set to make sure we make another
+	 * full sweep after matching an entry that requires unloading.
+	 */
+	for (i = 0; i < limit; i++) {
+		struct priv_fw *fp;
+		int err;
+
+		fp = &firmware_table[i % FIRMWARE_MAX];
+		if (fp->fw.name == NULL || fp->file == NULL ||
+		    fp->refcnt != 0 || (fp->flags & FW_UNLOAD) == 0)
+			continue;
+
+		/*
+		 * Found an entry. Now:
+		 * 1. bump up limit to make sure we make another full round;
+		 * 2. clear FW_UNLOAD so we don't try this entry again.
+		 * 3. release the lock while trying to unload the module.
+		 * 'file' remains set so that the entry cannot be reused
+		 * in the meantime (it also means that fp->file will
+		 * not change while we release the lock).
+		 */
+		limit = i + FIRMWARE_MAX;	/* make another full round */
+		fp->flags &= ~FW_UNLOAD;	/* do not try again */
+
+		mtx_unlock(&firmware_mtx);
+		err = linker_release_module(NULL, NULL, fp->file);
+		mtx_lock(&firmware_mtx);
+
+		/*
+		 * We rely on the module to call firmware_unregister()
+		 * on unload to actually release the entry.
+		 * If err = 0 we can drop our reference as the system
+		 * accepted it. Otherwise unloading failed (e.g. the
+		 * module itself gave an error) so our reference is
+		 * still valid.
+		 */
+		if (err == 0)
+			fp->file = NULL; 
+	}
 	mtx_unlock(&firmware_mtx);
 }
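
The sweep above terminates even though unloading one autoloaded image can make
another cross-linked image eligible: whenever an entry is acted on, 'limit' is
pushed out by a full table length, so at worst the loop ends after one clean
pass over the table. The same bounded-rescan idiom in isolation, as a userland
sketch (the table size and work flags are made up for illustration):

	#include <stdio.h>

	#define	TABLE_MAX	4

	int
	main(void)
	{
		int work[TABLE_MAX] = { 1, 0, 1, 0 };	/* slots still needing action */
		int limit = TABLE_MAX;
		int i;

		for (i = 0; i < limit; i++) {
			int slot = i % TABLE_MAX;

			if (!work[slot])
				continue;
			work[slot] = 0;			/* do not try this slot again */
			limit = i + TABLE_MAX;		/* guarantee one more full round */
			printf("acted on slot %d, limit now %d\n", slot, limit);
			if (slot == 0)
				work[3] = 1;		/* acting on 0 made 3 eligible */
		}
		return (0);
	}
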
 
@@ -250,13 +382,34 @@
 static int
 firmware_modevent(module_t mod, int type, void *unused)
 {
+	struct priv_fw *fp;
+	int i, err = EINVAL;
+
 	switch (type) {
 	case MOD_LOAD:
 		TASK_INIT(&firmware_task, 0, unloadentry, NULL);
 		return 0;
+
 	case MOD_UNLOAD:
+		/* request all autoloaded modules to be released */
+		mtx_lock(&firmware_mtx);
+		for (i = 0; i < FIRMWARE_MAX; i++) {
+			fp = &firmware_table[i];
+			fp->flags |= FW_UNLOAD;
+		}
+		mtx_unlock(&firmware_mtx);
+		taskqueue_enqueue(taskqueue_thread, &firmware_task);
 		taskqueue_drain(taskqueue_thread, &firmware_task);
-		return 0;
+		for (i = 0; i < FIRMWARE_MAX; i++) {
+			fp = &firmware_table[i];
+			if (fp->fw.name != NULL) {
+				printf("%s: image %p ref %d still active slot %d\n",
+					__func__, fp->fw.name,
+					fp->refcnt,  i);
+				err = EINVAL;
+			}
+		}
+		return err;
 	}
 	return EINVAL;
 }
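
For reference, the reworked firmware(9) interface above is used in
firmware_get()/firmware_put() pairs; FIRMWARE_UNLOAD only advises the unload
task, which disposes of an autoloaded image module once the last reference is
dropped. A minimal sketch of a hypothetical consumer (the driver and image
names are made up for illustration):

	#include <sys/param.h>
	#include <sys/systm.h>
	#include <sys/errno.h>
	#include <sys/firmware.h>

	/* Hypothetical attach path of a driver that needs the image "foo_fw". */
	static int
	foo_load_firmware(void)
	{
		const struct firmware *fp;

		/* May autoload a kernel module named foo_fw via the linker. */
		fp = firmware_get("foo_fw");
		if (fp == NULL)
			return (ENOENT);

		/* fp->data and fp->datasize describe the image contents. */
		/* ... download the image to the device here ... */

		/*
		 * Drop the reference; FIRMWARE_UNLOAD lets unloadentry()
		 * release the autoloaded module once nothing uses it.
		 */
		firmware_put(fp, FIRMWARE_UNLOAD);
		return (0);
	}

Requesting the image can pull in a module of the same name through
linker_reference_module(), which is why firmware_get() re-checks the registry
after the load attempt.
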
Index: kern_xxx.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_xxx.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/kern_xxx.c -L sys/kern/kern_xxx.c -u -r1.1.1.1 -r1.2
--- sys/kern/kern_xxx.c
+++ sys/kern/kern_xxx.c
@@ -30,7 +30,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_xxx.c,v 1.46 2005/01/06 23:35:39 imp Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_xxx.c,v 1.49 2007/03/05 13:10:57 rwatson Exp $");
 
 #include "opt_compat.h"
 
@@ -38,6 +38,7 @@
 #include <sys/systm.h>
 #include <sys/sysproto.h>
 #include <sys/kernel.h>
+#include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
@@ -53,9 +54,6 @@
 	u_int	len;
 };
 #endif
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 ogethostname(td, uap)
@@ -81,9 +79,6 @@
 	u_int	len;
 };
 #endif
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 osethostname(td, uap)
@@ -107,9 +102,6 @@
 	int	dummy;
 };
 #endif
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 ogethostid(td, uap)
@@ -128,9 +120,6 @@
 	long	hostid;
 };
 #endif
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 osethostid(td, uap)
@@ -139,7 +128,8 @@
 {
 	int error;
 
-	if ((error = suser(td)))
+	error = priv_check(td, PRIV_SETHOSTID);
+	if (error)
 		return (error);
 	mtx_lock(&Giant);
 	hostid = uap->hostid;
@@ -147,22 +137,20 @@
 	return (0);
 }
 
-/*
- * MPSAFE
- */
 int
 oquota(td, uap)
 	struct thread *td;
 	struct oquota_args *uap;
 {
+
 	return (ENOSYS);
 }
 #endif /* COMPAT_43 */
 
 /*
- * This is the FreeBSD-1.1 compatable uname(2) interface.  These
- * days it is done in libc as a wrapper around a bunch of sysctl's.
- * This must maintain the old 1.1 binary ABI.
+ * This is the FreeBSD-1.1 compatible uname(2) interface.  These days it is
+ * done in libc as a wrapper around a bunch of sysctl's.  This must maintain
+ * the old 1.1 binary ABI.
  */
 #if SYS_NMLN != 32
 #error "FreeBSD-1.1 uname syscall has been broken"
@@ -172,10 +160,6 @@
         struct utsname  *name;
 };
 #endif
-
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 uname(td, uap)
@@ -255,10 +239,6 @@
         int     len;
 };
 #endif
-
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 getdomainname(td, uap)
@@ -283,10 +263,6 @@
         int     len;
 };
 #endif
-
-/*
- * MPSAFE
- */
 /* ARGSUSED */
 int
 setdomainname(td, uap)
@@ -295,9 +271,10 @@
 {
         int error, domainnamelen;
 
+	error = priv_check(td, PRIV_SETDOMAINNAME);
+	if (error)
+		return (error);
 	mtx_lock(&Giant);
-        if ((error = suser(td)))
-		goto done2;
         if ((u_int)uap->len > sizeof (domainname) - 1) {
 		error = EINVAL;
 		goto done2;
@@ -309,4 +286,3 @@
 	mtx_unlock(&Giant);
         return (error);
 }
-
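
The kern_xxx.c hunks are part of the tree-wide suser(9) to priv(9) conversion:
each former suser() call now names the privilege being exercised, and the check
is performed before Giant is taken so the failure path never touches the lock.
A generic sketch of the resulting idiom (the function and the stored value are
hypothetical; PRIV_SETHOSTID is one of the privileges used above):

	#include <sys/param.h>
	#include <sys/systm.h>
	#include <sys/lock.h>
	#include <sys/mutex.h>
	#include <sys/priv.h>
	#include <sys/proc.h>

	static long example_value;

	static int
	example_set_value(struct thread *td, long value)
	{
		int error;

		/* Named privilege: jail and MAC policies can key off it. */
		error = priv_check(td, PRIV_SETHOSTID);
		if (error)
			return (error);
		mtx_lock(&Giant);
		example_value = value;
		mtx_unlock(&Giant);
		return (0);
	}
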
Index: tty.c
===================================================================
RCS file: /home/cvs/src/sys/kern/tty.c,v
retrieving revision 1.3
retrieving revision 1.4
diff -L sys/kern/tty.c -L sys/kern/tty.c -u -r1.3 -r1.4
--- sys/kern/tty.c
+++ sys/kern/tty.c
@@ -71,7 +71,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/tty.c,v 1.250.2.1 2005/11/06 16:09:32 jhb Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/tty.c,v 1.273.4.1 2008/01/12 00:20:06 jhb Exp $");
 
 #include "opt_compat.h"
 #include "opt_tty.h"
@@ -83,11 +83,10 @@
 #include <sys/mutex.h>
 #include <sys/namei.h>
 #include <sys/sx.h>
-#ifndef BURN_BRIDGES
-#if defined(COMPAT_43)
+#if defined(COMPAT_43TTY)
 #include <sys/ioctl_compat.h>
 #endif
-#endif
+#include <sys/priv.h>
 #include <sys/proc.h>
 #define	TTYDEFCHARS
 #include <sys/tty.h>
@@ -148,7 +147,9 @@
 	.d_flags =	D_TTY | D_NEEDGIANT,
 };
 
-static int	proc_compare(struct proc *p1, struct proc *p2);
+static int	proc_sum(struct proc *, int *);
+static int	proc_compare(struct proc *, struct proc *);
+static int	thread_compare(struct thread *, struct thread *);
 static int	ttnread(struct tty *tp);
 static void	ttyecho(int c, struct tty *tp);
 static int	ttyoutput(int c, struct tty *tp);
@@ -253,6 +254,7 @@
  */
 static	TAILQ_HEAD(, tty) tty_list = TAILQ_HEAD_INITIALIZER(tty_list);
 static struct mtx tty_list_mutex;
+MTX_SYSINIT(tty_list, &tty_list_mutex, "ttylist", MTX_DEF);
 
 static struct unrhdr *tty_unit;
 
@@ -331,7 +333,7 @@
 	tp->t_hotchar = 0;
 	tp->t_pgrp = NULL;
 	tp->t_session = NULL;
-	ostate= tp->t_state;
+	ostate = tp->t_state;
 	tp->t_state = 0;
 	knlist_clear(&tp->t_rsel.si_note, 0);
 	knlist_clear(&tp->t_wsel.si_note, 0);
@@ -517,7 +519,7 @@
 			if (CCEQ(cc[VSTOP], c)) {
 				if (!ISSET(tp->t_state, TS_TTSTOP)) {
 					SET(tp->t_state, TS_TTSTOP);
-					(*tp->t_stop)(tp, 0);
+					tt_stop(tp, 0);
 					return (0);
 				}
 				if (!CCEQ(cc[VSTART], c))
@@ -834,8 +836,7 @@
 	case  TIOCSTI:
 	case  TIOCSTOP:
 	case  TIOCSWINSZ:
-#ifndef BURN_BRIDGES
-#if defined(COMPAT_43)
+#if defined(COMPAT_43TTY)
 	case  TIOCLBIC:
 	case  TIOCLBIS:
 	case  TIOCLSET:
@@ -845,7 +846,6 @@
 	case  TIOCSETP:
 	case  TIOCSLTC:
 #endif
-#endif
 		sx_slock(&proctree_lock);
 		PROC_LOCK(p);
 		while (isbackground(p, tp) && !(p->p_flag & P_PPWAIT) &&
@@ -873,45 +873,33 @@
 		break;
 	}
 
-	if (tp->t_break != NULL) {
-		switch (cmd) {
-		case TIOCSBRK:
-			tp->t_break(tp, 1);
-			return (0);
-		case TIOCCBRK:
-			tp->t_break(tp, 0);
-			return (0);
-		default:
-			break;
-		}
-	}
 
 	if (tp->t_modem != NULL) {
 		switch (cmd) {
 		case TIOCSDTR:
-			tp->t_modem(tp, SER_DTR, 0);
+			tt_modem(tp, SER_DTR, 0);
 			return (0);
 		case TIOCCDTR:
-			tp->t_modem(tp, 0, SER_DTR);
+			tt_modem(tp, 0, SER_DTR);
 			return (0);
 		case TIOCMSET:
 			bits = *(int *)data;
 			sig = (bits & (TIOCM_DTR | TIOCM_RTS)) >> 1;
 			sig2 = ((~bits) & (TIOCM_DTR | TIOCM_RTS)) >> 1;
-			tp->t_modem(tp, sig, sig2);
+			tt_modem(tp, sig, sig2);
 			return (0);
 		case TIOCMBIS:
 			bits = *(int *)data;
 			sig = (bits & (TIOCM_DTR | TIOCM_RTS)) >> 1;
-			tp->t_modem(tp, sig, 0);
+			tt_modem(tp, sig, 0);
 			return (0);
 		case TIOCMBIC:
 			bits = *(int *)data;
 			sig = (bits & (TIOCM_DTR | TIOCM_RTS)) >> 1;
-			tp->t_modem(tp, 0, sig);
+			tt_modem(tp, 0, sig);
 			return (0);
 		case TIOCMGET:
-			sig = tp->t_modem(tp, 0, 0);
+			sig = tt_modem(tp, 0, 0);
 			/* See <sys/serial.h. for the "<< 1" stuff */
 			bits = TIOCM_LE + (sig << 1);
 			*(int *)data = bits;
@@ -1034,7 +1022,7 @@
 		break;
 	case TIOCMSDTRWAIT:
 		/* must be root since the wait applies to following logins */
-		error = suser(td);
+		error = priv_check(td, PRIV_TTY_DTRWAIT);
 		if (error)
 			return (error);
 		tp->t_dtr_wait = *(int *)data * hz / 100;
@@ -1072,7 +1060,8 @@
 			/*
 			 * Set device hardware.
 			 */
-			if (tp->t_param && (error = (*tp->t_param)(tp, t))) {
+			error = tt_param(tp, t);
+			if (error) {
 				splx(s);
 				return (error);
 			}
@@ -1182,9 +1171,9 @@
 		splx(s);
 		break;
 	case TIOCSTI:			/* simulate terminal input */
-		if ((flag & FREAD) == 0 && suser(td))
+		if ((flag & FREAD) == 0 && priv_check(td, PRIV_TTY_STI))
 			return (EPERM);
-		if (!isctty(p, tp) && suser(td))
+		if (!isctty(p, tp) && priv_check(td, PRIV_TTY_STI))
 			return (EACCES);
 		s = spltty();
 		ttyld_rint(tp, *(u_char *)data);
@@ -1194,7 +1183,7 @@
 		s = spltty();
 		if (!ISSET(tp->t_state, TS_TTSTOP)) {
 			SET(tp->t_state, TS_TTSTOP);
-			(*tp->t_stop)(tp, 0);
+			tt_stop(tp, 0);
 		}
 		splx(s);
 		break;
@@ -1257,7 +1246,7 @@
 		}
 		break;
 	case TIOCSDRAINWAIT:
-		error = suser(td);
+		error = priv_check(td, PRIV_TTY_DRAINWAIT);
 		if (error)
 			return (error);
 		tp->t_timeout = *(int *)data * hz;
@@ -1267,16 +1256,16 @@
 	case TIOCGDRAINWAIT:
 		*(int *)data = tp->t_timeout / hz;
 		break;
+	case TIOCSBRK:
+		return (tt_break(tp, 1));
+	case TIOCCBRK:
+		return (tt_break(tp, 0));
 	default:
-#if defined(COMPAT_43)
-#ifndef BURN_BRIDGES
+#if defined(COMPAT_43TTY)
 		return (ttcompat(tp, cmd, data, flag));
 #else
 		return (ENOIOCTL);
 #endif
-#else
-		return (ENOIOCTL);
-#endif
 	}
 	return (0);
 }
@@ -1330,6 +1319,8 @@
 	int s;
 
 	tp = tty_gettp(dev);
+	if (tp->t_state & TS_GONE)
+		return (ENODEV);
 
 	switch (kn->kn_filter) {
 	case EVFILT_READ:
@@ -1344,7 +1335,7 @@
 		return (EINVAL);
 	}
 
-	kn->kn_hook = (caddr_t)dev;
+	kn->kn_hook = (caddr_t)tp;
 
 	s = spltty();
 	knlist_add(klist, kn, 0);
@@ -1356,7 +1347,7 @@
 static void
 filt_ttyrdetach(struct knote *kn)
 {
-	struct tty *tp = ((struct cdev *)kn->kn_hook)->si_tty;
+	struct tty *tp = (struct tty *)kn->kn_hook;
 	int s = spltty();
 
 	knlist_remove(&tp->t_rsel.si_note, kn, 0);
@@ -1366,10 +1357,10 @@
 static int
 filt_ttyread(struct knote *kn, long hint)
 {
-	struct tty *tp = ((struct cdev *)kn->kn_hook)->si_tty;
+	struct tty *tp = (struct tty *)kn->kn_hook;
 
 	kn->kn_data = ttnread(tp);
-	if (ISSET(tp->t_state, TS_ZOMBIE)) {
+	if ((tp->t_state & TS_GONE) || ISSET(tp->t_state, TS_ZOMBIE)) {
 		kn->kn_flags |= EV_EOF;
 		return (1);
 	}
@@ -1379,7 +1370,7 @@
 static void
 filt_ttywdetach(struct knote *kn)
 {
-	struct tty *tp = ((struct cdev *)kn->kn_hook)->si_tty;
+	struct tty *tp = (struct tty *)kn->kn_hook;
 	int s = spltty();
 
 	knlist_remove(&tp->t_wsel.si_note, kn, 0);
@@ -1389,10 +1380,10 @@
 static int
 filt_ttywrite(struct knote *kn, long hint)
 {
-	struct tty *tp = ((struct cdev *)kn->kn_hook)->si_tty;
+	struct tty *tp = (struct tty *)kn->kn_hook;
 
 	kn->kn_data = tp->t_outq.c_cc;
-	if (ISSET(tp->t_state, TS_ZOMBIE))
+	if ((tp->t_state & TS_GONE) || ISSET(tp->t_state, TS_ZOMBIE))
 		return (1);
 	return (kn->kn_data <= tp->t_olowat &&
 	    ISSET(tp->t_state, TS_CONNECTED));
@@ -1429,7 +1420,7 @@
 	s = spltty();
 	while ((tp->t_outq.c_cc || ISSET(tp->t_state, TS_BUSY)) &&
 	       ISSET(tp->t_state, TS_CONNECTED) && tp->t_oproc) {
-		(*tp->t_oproc)(tp);
+		tt_oproc(tp);
 		if ((tp->t_outq.c_cc || ISSET(tp->t_state, TS_BUSY)) &&
 		    ISSET(tp->t_state, TS_CONNECTED)) {
 			SET(tp->t_state, TS_SO_OCOMPLETE);
@@ -1479,7 +1470,7 @@
 		FLUSHQ(&tp->t_outq);
 		CLR(tp->t_state, TS_TTSTOP);
 	}
-	(*tp->t_stop)(tp, rw);
+	tt_stop(tp, rw);
 	if (rw & FREAD) {
 		FLUSHQ(&tp->t_canq);
 		FLUSHQ(&tp->t_rawq);
@@ -1611,8 +1602,7 @@
 ttstart(struct tty *tp)
 {
 
-	if (tp->t_oproc != NULL)	/* XXX: Kludge for pty. */
-		(*tp->t_oproc)(tp);
+	tt_oproc(tp);
 	return (0);
 }
 
@@ -1650,7 +1640,7 @@
 		} else if (!ISSET(tp->t_state, TS_CAR_OFLOW)) {
 			SET(tp->t_state, TS_CAR_OFLOW);
 			SET(tp->t_state, TS_TTSTOP);
-			(*tp->t_stop)(tp, 0);
+			tt_stop(tp, 0);
 		}
 	} else if (flag == 0) {
 		/*
@@ -1732,7 +1722,7 @@
 	int s, first, error = 0;
 	int has_stime = 0, last_cc = 0;
 	long slp = 0;		/* XXX this should be renamed `timo'. */
-	struct timeval stime;
+	struct timeval stime = { 0, 0 };
 	struct pgrp *pg;
 
 	td = curthread;
@@ -2542,12 +2532,13 @@
 {
 	struct timeval utime, stime;
 	struct proc *p, *pick;
-	struct thread *td;
+	struct thread *td, *picktd;
 	const char *stateprefix, *state;
 	long rss;
 	int load, pctcpu;
 	pid_t pid;
 	char comm[MAXCOMLEN + 1];
+	struct rusage ru;
 
 	if (ttycheckoutq(tp,0) == 0)
 		return;
@@ -2580,31 +2571,25 @@
 
 	/*
 	 * Pick the most interesting process and copy some of its
-	 * state for printing later.  sched_lock must be held for
-	 * most parts of this.  Holding it throughout is simplest
-	 * and prevents even unimportant inconsistencies in the
-	 * copy of the state, but may increase interrupt latency
-	 * too much.
+	 * state for printing later.  This operation could rely on stale
+	 * data as we can't hold the proc slock or thread locks over the
+	 * whole list. However, we're guaranteed not to reference an exited
+	 * thread or proc since we hold the tty locked.
 	 */
 	pick = NULL;
-	mtx_lock_spin(&sched_lock);
 	LIST_FOREACH(p, &tp->t_pgrp->pg_members, p_pglist)
 		if (proc_compare(pick, p))
 			pick = p;
 
-	td = FIRST_THREAD_IN_PROC(pick);	/* XXXKSE */
-#if 0
-	KASSERT(td != NULL, ("ttyinfo: no thread"));
-#else
-	if (td == NULL) {
-		mtx_unlock_spin(&sched_lock);
-		PGRP_UNLOCK(tp->t_pgrp);
-		ttyprintf(tp, "foreground process without thread\n");
-		tp->t_rocount = 0;
-		return;
-	}
-#endif
+	PROC_SLOCK(pick);
+	picktd = NULL;
+	td = FIRST_THREAD_IN_PROC(pick);
+	FOREACH_THREAD_IN_PROC(pick, td)
+		if (thread_compare(picktd, td))
+			picktd = td;
+	td = picktd;
 	stateprefix = "";
+	thread_lock(td);
 	if (TD_IS_RUNNING(td))
 		state = "running";
 	else if (TD_ON_RUNQ(td) || TD_CAN_RUN(td))
@@ -2625,14 +2610,15 @@
 	else
 		state = "unknown";
 	pctcpu = (sched_pctcpu(td) * 10000 + FSCALE / 2) >> FSHIFT;
+	thread_unlock(td);
 	if (pick->p_state == PRS_NEW || pick->p_state == PRS_ZOMBIE)
 		rss = 0;
 	else
 		rss = pgtok(vmspace_resident_count(pick->p_vmspace));
-	mtx_unlock_spin(&sched_lock);
+	PROC_SUNLOCK(pick);
 	PROC_LOCK(pick);
 	PGRP_UNLOCK(tp->t_pgrp);
-	calcru(pick, &utime, &stime);
+	rufetchcalc(pick, &ru, &utime, &stime);
 	pid = pick->p_pid;
 	bcopy(pick->p_comm, comm, sizeof(comm));
 	PROC_UNLOCK(pick);
@@ -2660,18 +2646,6 @@
  *	   we pick out just "short-term" sleepers (P_SINTR == 0).
  *	4) Further ties are broken by picking the highest pid.
  */
-#define ISRUN(p, val)						\
-do {								\
-	struct thread *td;					\
-	val = 0;						\
-	FOREACH_THREAD_IN_PROC(p, td) {				\
-		if (TD_ON_RUNQ(td) ||				\
-		    TD_IS_RUNNING(td)) {			\
-			val = 1;				\
-			break;					\
-		}						\
-	}							\
-} while (0)
 
 #define TESTAB(a, b)    ((a)<<1 | (b))
 #define ONLYA   2
@@ -2679,71 +2653,134 @@
 #define BOTH    3
 
 static int
-proc_compare(struct proc *p1, struct proc *p2)
+proc_sum(struct proc *p, int *estcpup)
 {
+	struct thread *td;
+	int estcpu;
+	int val;
 
-	int esta, estb;
-	struct ksegrp *kg;
-	mtx_assert(&sched_lock, MA_OWNED);
-	if (p1 == NULL)
+	val = 0;
+	estcpu = 0;
+	FOREACH_THREAD_IN_PROC(p, td) {
+		thread_lock(td);
+		if (TD_ON_RUNQ(td) ||
+		    TD_IS_RUNNING(td))
+			val = 1;
+		estcpu += sched_pctcpu(td);
+		thread_unlock(td);
+	}
+	*estcpup = estcpu;
+
+	return (val);
+}
+
+static int
+thread_compare(struct thread *td, struct thread *td2)
+{
+	int runa, runb;
+	int slpa, slpb;
+	fixpt_t esta, estb;
+
+	if (td == NULL)
 		return (1);
 
-	ISRUN(p1, esta);
-	ISRUN(p2, estb);
-	
+	/*
+	 * Fetch running stats, pctcpu usage, and interruptible flag.
+ 	 */
+	thread_lock(td);
+	runa = TD_IS_RUNNING(td) | TD_ON_RUNQ(td);
+	slpa = td->td_flags & TDF_SINTR;
+	esta = sched_pctcpu(td);
+	thread_unlock(td);
+	thread_lock(td2);
+	runb = TD_IS_RUNNING(td2) | TD_ON_RUNQ(td2);
+	estb = sched_pctcpu(td2);
+	slpb = td2->td_flags & TDF_SINTR;
+	thread_unlock(td2);
 	/*
 	 * see if at least one of them is runnable
 	 */
-	switch (TESTAB(esta, estb)) {
+	switch (TESTAB(runa, runb)) {
 	case ONLYA:
 		return (0);
 	case ONLYB:
 		return (1);
 	case BOTH:
-		/*
-		 * tie - favor one with highest recent cpu utilization
-		 */
-		esta = estb = 0;
-		FOREACH_KSEGRP_IN_PROC(p1,kg) {
-			esta += kg->kg_estcpu;
-		}
-		FOREACH_KSEGRP_IN_PROC(p2,kg) {
-			estb += kg->kg_estcpu;
-		}
-		if (estb > esta)
-			return (1);
-		if (esta > estb)
-			return (0);
-		return (p2->p_pid > p1->p_pid);	/* tie - return highest pid */
+		break;
 	}
 	/*
-	 * weed out zombies
+	 *  favor one with highest recent cpu utilization
 	 */
-	switch (TESTAB(p1->p_state == PRS_ZOMBIE, p2->p_state == PRS_ZOMBIE)) {
-	case ONLYA:
+	if (estb > esta)
 		return (1);
-	case ONLYB:
+	if (esta > estb)
 		return (0);
+	/*
+	 * favor one sleeping in a non-interruptible sleep
+	 */
+	switch (TESTAB(slpa, slpb)) {
+	case ONLYA:
+		return (0);
+	case ONLYB:
+		return (1);
 	case BOTH:
-		return (p2->p_pid > p1->p_pid); /* tie - return highest pid */
+		break;
 	}
 
-#if 0 /* XXXKSE */
+	return (td < td2);
+}
+
+static int
+proc_compare(struct proc *p1, struct proc *p2)
+{
+
+	int runa, runb;
+	fixpt_t esta, estb;
+
+	if (p1 == NULL)
+		return (1);
+
 	/*
-	 * pick the one with the smallest sleep time
+	 * Fetch various stats about these processes.  After we drop the
+	 * lock the information could be stale but the race is unimportant.
+	 */
+	PROC_SLOCK(p1);
+	runa = proc_sum(p1, &esta);
+	PROC_SUNLOCK(p1);
+	PROC_SLOCK(p2);
+	runb = proc_sum(p2, &estb);
+	PROC_SUNLOCK(p2);
+	
+	/*
+	 * see if at least one of them is runnable
 	 */
-	if (p2->p_slptime > p1->p_slptime)
+	switch (TESTAB(runa, runb)) {
+	case ONLYA:
 		return (0);
-	if (p1->p_slptime > p2->p_slptime)
+	case ONLYB:
 		return (1);
+	case BOTH:
+		break;
+	}
 	/*
-	 * favor one sleeping in a non-interruptible sleep
+	 *  favor one with highest recent cpu utilization
 	 */
-	if (p1->p_sflag & PS_SINTR && (p2->p_sflag & PS_SINTR) == 0)
+	if (estb > esta)
 		return (1);
-	if (p2->p_sflag & PS_SINTR && (p1->p_sflag & PS_SINTR) == 0)
+	if (esta > estb)
 		return (0);
-#endif
+	/*
+	 * weed out zombies
+	 */
+	switch (TESTAB(p1->p_state == PRS_ZOMBIE, p2->p_state == PRS_ZOMBIE)) {
+	case ONLYA:
+		return (1);
+	case ONLYB:
+		return (0);
+	case BOTH:
+		break;
+	}
+
 	return (p2->p_pid > p1->p_pid);		/* tie - return highest pid */
 }
 
@@ -2841,23 +2878,10 @@
  * tty_open().
  */
 struct tty *
-ttymalloc(struct tty *tp)
+ttyalloc()
 {
-	static int once;
-
-	if (!once) {
-		mtx_init(&tty_list_mutex, "ttylist", NULL, MTX_DEF);
-		once++;
-	}
+	struct tty *tp;
 
-	if (tp) {
-		/*
-		 * XXX: Either this argument should go away, or we should
-		 * XXX: require it and do a ttyrel(tp) here and allocate
-		 * XXX: a new tty.  For now do nothing.
-		 */
-		return(tp);
-	}
 	tp = malloc(sizeof *tp, M_TTYS, M_WAITOK | M_ZERO);
 	mtx_init(&tp->t_mtx, "tty", NULL, MTX_DEF);
 
@@ -2882,13 +2906,6 @@
 	return (tp);
 }
 
-struct tty *
-ttyalloc()
-{
-
-	return (ttymalloc(NULL));
-}
-
 static void
 ttypurge(struct cdev *dev)
 {
@@ -2912,9 +2929,11 @@
  */
 
 int 
-ttycreate(struct tty *tp, struct cdevsw *csw, int unit, int flags, const char *fmt, ...)
+ttycreate(struct tty *tp, int flags, const char *fmt, ...)
 {
 	char namebuf[SPECNAMELEN - 3];		/* XXX space for "tty" */
+	struct cdevsw *csw = NULL;
+	int unit = 0;
 	va_list ap;
 	struct cdev *cp;
 	int i, minor, sminor, sunit;
@@ -2964,7 +2983,7 @@
 	cp->si_drv2 = &tp->t_lock_in;
 	cp->si_tty = tp;
 
-	if (flags & MINOR_CALLOUT) {
+	if (flags & TS_CALLOUT) {
 		cp = make_dev(csw, minor | MINOR_CALLOUT,
 		    UID_UUCP, GID_DIALER, 0660, "cua%s", namebuf);
 		dev_depends(tp->t_dev, cp);
@@ -2998,13 +3017,20 @@
 {
 
 	tp->t_state |= TS_GONE;
+	if (SEL_WAITING(&tp->t_rsel))
+		selwakeuppri(&tp->t_rsel, TTIPRI);
+	if (SEL_WAITING(&tp->t_wsel))
+		selwakeuppri(&tp->t_wsel, TTOPRI);
+	if (ISSET(tp->t_state, TS_ASYNC) && tp->t_sigio != NULL)
+		pgsigio(&tp->t_sigio, SIGIO, (tp->t_session != NULL));
 	wakeup(&tp->t_dtr_wait);
 	wakeup(TSA_CARR_ON(tp));
 	wakeup(TSA_HUP_OR_INPUT(tp));
 	wakeup(TSA_OCOMPLETE(tp));
 	wakeup(TSA_OLOWAT(tp));
-	if (tp->t_purge != NULL)
-		tp->t_purge(tp);
+	KNOTE_UNLOCKED(&tp->t_rsel.si_note, 0);
+	KNOTE_UNLOCKED(&tp->t_wsel.si_note, 0);
+	tt_purge(tp);
 }
 
 /*
@@ -3014,16 +3040,19 @@
  *
  * XXX: This shall sleep until all threads have left the driver.
  */
- 
 void
 ttyfree(struct tty *tp)
 {
+	struct cdev *dev;
 	u_int unit;
  
 	mtx_assert(&Giant, MA_OWNED);
 	ttygone(tp);
 	unit = tp->t_devunit;
-	destroy_dev(tp->t_mdev);
+	dev = tp->t_mdev;
+	tp->t_dev = NULL;
+	ttyrel(tp);
+	destroy_dev(dev);
 	free_unr(tty_unit, unit);
 }
 
@@ -3039,7 +3068,6 @@
 	tp = TAILQ_FIRST(&tty_list);
 	if (tp != NULL)
 		ttyref(tp);
-	mtx_unlock(&tty_list_mutex);
 	while (tp != NULL) {
 		bzero(&xt, sizeof xt);
 		xt.xt_size = sizeof xt;
@@ -3048,6 +3076,18 @@
 		xt.xt_cancc = tp->t_canq.c_cc;
 		xt.xt_outcc = tp->t_outq.c_cc;
 		XT_COPY(line);
+
+		/*
+		 * XXX: We hold the tty list lock while doing this to
+		 * work around a race with pty/pts tty destruction.
+		 * They set t_dev to NULL and then call ttyrel() to
+		 * free the structure which will block on the list
+		 * lock before they call destroy_dev() on the cdev
+		 * backing t_dev.
+		 *
+		 * XXX: ttyfree() now does the same since it has been
+		 * fixed to not leak ttys.
+		 */
 		if (tp->t_dev != NULL)
 			xt.xt_dev = dev2udev(tp->t_dev);
 		XT_COPY(state);
@@ -3070,6 +3110,7 @@
 		XT_COPY(olowat);
 		XT_COPY(ospeedwat);
 #undef XT_COPY
+		mtx_unlock(&tty_list_mutex);
 		error = SYSCTL_OUT(req, &xt, sizeof xt);
 		if (error != 0) {
 			ttyrel(tp);
@@ -3082,7 +3123,9 @@
 		mtx_unlock(&tty_list_mutex);
 		ttyrel(tp);
 		tp = tp2;
+		mtx_lock(&tty_list_mutex);
 	}
+	mtx_unlock(&tty_list_mutex);
 	return (0);
 }
 
@@ -3108,6 +3151,7 @@
 	struct tty	*tp;
 
 	tp = dev->si_tty;
+
 	s = spltty();
 	/*
 	 * We jump to this label after all non-interrupted sleeps to pick
@@ -3135,7 +3179,8 @@
 				goto out;
 			goto open_top;
 		}
-		if (tp->t_state & TS_XCLUDE && suser(td))
+		if (tp->t_state & TS_XCLUDE && priv_check(td,
+		    PRIV_TTY_EXCLUSIVE))
 			return (EBUSY);
 	} else {
 		/*
@@ -3147,16 +3192,15 @@
 		tp->t_termios = ISCALLOUT(dev) ? tp->t_init_out : tp->t_init_in;
 		tp->t_cflag = tp->t_termios.c_cflag;
 		if (tp->t_modem != NULL)
-			tp->t_modem(tp, SER_DTR | SER_RTS, 0);
+			tt_modem(tp, SER_DTR | SER_RTS, 0);
 		++tp->t_wopeners;
-		error = tp->t_param(tp, &tp->t_termios);
+		error = tt_param(tp, &tp->t_termios);
 		--tp->t_wopeners;
-		if (error == 0 && tp->t_open != NULL)
-			error = tp->t_open(tp, dev);
+		if (error == 0)
+			error = tt_open(tp, dev);
 		if (error != 0)
 			goto out;
-		if (ISCALLOUT(dev) || (tp->t_modem != NULL &&
-		    (tp->t_modem(tp, 0, 0) & SER_DCD)))
+		if (ISCALLOUT(dev) || (tt_modem(tp, 0, 0) & SER_DCD))
 			ttyld_modem(tp, 1);
 	}
 	/*
@@ -3177,9 +3221,8 @@
 		tp->t_actout = TRUE;
 out:
 	splx(s);
-	if (!(tp->t_state & TS_ISOPEN) && tp->t_wopeners == 0 &&
-	    tp->t_close != NULL)
-		tp->t_close(tp);
+	if (!(tp->t_state & TS_ISOPEN) && tp->t_wopeners == 0)
+		tt_close(tp);
 	return (error);
 }
 
@@ -3191,8 +3234,7 @@
 	tp = dev->si_tty;
 	ttyld_close(tp, flag);
 	ttyldoptim(tp);
-	if (tp->t_close != NULL)
-		tp->t_close(tp);
+	tt_close(tp);
 	tp->t_do_timestamp = 0;
 	if (tp->t_pps != NULL)
 		tp->t_pps->ppsparam.mode = 0;
@@ -3364,7 +3406,7 @@
 	ct = dev->si_drv2;
 	switch (cmd) {
 	case TIOCSETA:
-		error = suser(td);
+		error = priv_check(td, PRIV_TTY_SETA);
 		if (error != 0)
 			return (error);
 		*ct = *(struct termios *)data;
@@ -3424,6 +3466,7 @@
 	tp->t_lock_in.c_ispeed = tp->t_lock_in.c_ospeed = speed;
 	tp->t_init_out = tp->t_init_in;
 	tp->t_termios = tp->t_init_in;
+	ttsetwater(tp);
 }
 
 /*
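
The rewritten ttyinfo() path above picks the "most interesting" process and
thread with proc_compare()/thread_compare(), which walk the selection criteria
pairwise using the TESTAB/ONLYA/ONLYB/BOTH encoding: two booleans packed into
two bits so a switch can distinguish "only the first holds", "only the second
holds", and a tie. A standalone userland illustration of the encoding (not
kernel code):

	#include <stdio.h>

	#define	TESTAB(a, b)	((a) << 1 | (b))
	#define	ONLYA	2
	#define	ONLYB	1
	#define	BOTH	3

	/*
	 * Decide one criterion: 0 keeps candidate a, 1 picks candidate b,
	 * -1 means a tie, so the caller falls through to the next criterion.
	 */
	static int
	prefer_b(int a_holds, int b_holds)
	{
		switch (TESTAB(a_holds != 0, b_holds != 0)) {
		case ONLYA:
			return (0);
		case ONLYB:
			return (1);
		case BOTH:
		default:
			return (-1);
		}
	}

	int
	main(void)
	{
		printf("%d %d %d\n", prefer_b(1, 0), prefer_b(0, 1), prefer_b(1, 1));
		return (0);
	}

In the kernel versions a tie on "runnable" falls through to recent %cpu, then
to the interruptible-sleep flag, and finally to the pid (or the thread pointer)
as the last tie breaker.
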
Index: uipc_sem.c
===================================================================
RCS file: /home/cvs/src/sys/kern/uipc_sem.c,v
retrieving revision 1.1.1.2
retrieving revision 1.2
diff -L sys/kern/uipc_sem.c -L sys/kern/uipc_sem.c -u -r1.1.1.2 -r1.2
--- sys/kern/uipc_sem.c
+++ sys/kern/uipc_sem.c
@@ -32,7 +32,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/uipc_sem.c,v 1.20.2.1 2006/02/13 23:51:19 rwatson Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/uipc_sem.c,v 1.28.4.1 2008/01/17 19:52:01 rwatson Exp $");
 
 #include "opt_mac.h"
 #include "opt_posix.h"
@@ -42,26 +42,27 @@
 #include <sys/sysproto.h>
 #include <sys/eventhandler.h>
 #include <sys/kernel.h>
+#include <sys/ksem.h>
+#include <sys/priv.h>
 #include <sys/proc.h>
+#include <sys/posix4.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/module.h>
 #include <sys/condvar.h>
 #include <sys/sem.h>
 #include <sys/uio.h>
+#include <sys/semaphore.h>
 #include <sys/syscall.h>
 #include <sys/stat.h>
 #include <sys/sysent.h>
 #include <sys/sysctl.h>
 #include <sys/time.h>
-#include <sys/mac.h>
 #include <sys/malloc.h>
 #include <sys/fcntl.h>
+#include <sys/_semaphore.h>
 
-#include <posix4/ksem.h>
-#include <posix4/posix4.h>
-#include <posix4/semaphore.h>
-#include <posix4/_semaphore.h>
+#include <security/mac/mac_framework.h>
 
 static int sem_count_proc(struct proc *p);
 static struct ksem *sem_lookup_byname(const char *name);
@@ -71,6 +72,7 @@
 static int sem_perm(struct thread *td, struct ksem *ks);
 static void sem_enter(struct proc *p, struct ksem *ks);
 static int sem_leave(struct proc *p, struct ksem *ks);
+static void sem_exechook(void *arg, struct proc *p, struct image_params *imgp);
 static void sem_exithook(void *arg, struct proc *p);
 static void sem_forkhook(void *arg, struct proc *p1, struct proc *p2,
     int flags);
@@ -417,21 +419,32 @@
 {
 	struct ucred *uc;
 
+	/*
+	 * XXXRW: This permission routine appears to be incorrect.  If the
+	 * user matches, we shouldn't go on to the group if the user
+	 * permissions don't allow the action?  Not changed for now.  To fix,
+	 * change from a series of if (); if (); to if () else if () else...
+	 */
 	uc = td->td_ucred;
 	DP(("sem_perm: uc(%d,%d) ks(%d,%d,%o)\n",
 	    uc->cr_uid, uc->cr_gid,
 	     ks->ks_uid, ks->ks_gid, ks->ks_mode));
-	if ((uc->cr_uid == ks->ks_uid && (ks->ks_mode & S_IWUSR) != 0) ||
-	    (uc->cr_gid == ks->ks_gid && (ks->ks_mode & S_IWGRP) != 0) ||
-	    (ks->ks_mode & S_IWOTH) != 0 || suser(td) == 0)
+	if ((uc->cr_uid == ks->ks_uid) && (ks->ks_mode & S_IWUSR) != 0)
+		return (0);
+	if ((uc->cr_gid == ks->ks_gid) && (ks->ks_mode & S_IWGRP) != 0)
+		return (0);
+	if ((ks->ks_mode & S_IWOTH) != 0)
 		return (0);
-	return (EPERM);
+	return (priv_check(td, PRIV_SEM_WRITE));
 }
 
 static void
 sem_free(struct ksem *ks)
 {
 
+#ifdef MAC
+	mac_destroy_posix_sem(ks);
+#endif
 	nsems--;
 	if (ks->ks_onlist)
 		LIST_REMOVE(ks, ks_entry);
@@ -508,7 +521,6 @@
 };
 int ksem_unlink(struct thread *td, struct ksem_unlink_args *uap);
 #endif
-	
 int
 ksem_unlink(struct thread *td, struct ksem_unlink_args *uap)
 {
@@ -556,7 +568,6 @@
 };
 int ksem_close(struct thread *td, struct ksem_close_args *uap);
 #endif
-
 int
 ksem_close(struct thread *td, struct ksem_close_args *uap)
 {
@@ -629,7 +640,6 @@
 };
 int ksem_wait(struct thread *td, struct ksem_wait_args *uap);
 #endif
-
 int
 ksem_wait(struct thread *td, struct ksem_wait_args *uap)
 {
@@ -640,7 +650,7 @@
 #ifndef _SYS_SYSPROTO_H_
 struct ksem_timedwait_args {
 	semid_t id;
-	struct timespec *abstime;
+	const struct timespec *abstime;
 };
 int ksem_timedwait(struct thread *td, struct ksem_timedwait_args *uap);
 #endif
@@ -919,6 +929,12 @@
 }
 
 static void
+sem_exechook(void *arg, struct proc *p, struct image_params *imgp __unused)
+{
+	sem_exithook(arg, p);
+}
+
+static void
 sem_exithook(void *arg, struct proc *p)
 {
 	struct ksem *ks, *ksnext;
@@ -951,7 +967,7 @@
 		p31b_setcfg(CTL_P1003_1B_SEM_VALUE_MAX, SEM_VALUE_MAX);
 		sem_exit_tag = EVENTHANDLER_REGISTER(process_exit, sem_exithook,
 		    NULL, EVENTHANDLER_PRI_ANY);
-		sem_exec_tag = EVENTHANDLER_REGISTER(process_exec, sem_exithook,
+		sem_exec_tag = EVENTHANDLER_REGISTER(process_exec, sem_exechook,
 		    NULL, EVENTHANDLER_PRI_ANY);
 		sem_fork_tag = EVENTHANDLER_REGISTER(process_fork, sem_forkhook, NULL, EVENTHANDLER_PRI_ANY);
                 break;
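
On the XXXRW comment in sem_perm() above: the committed code keeps the
historical behaviour of independent user, group and other tests, so an owner
whose owner bits deny write can still pass via the group or other bits. The
stricter variant the comment suggests would select exactly one identity class,
roughly as in this sketch (an illustration of the comment's suggestion, relying
on the surrounding uipc_sem.c declarations; it is not code from the tree):

	static int
	sem_perm_strict(struct thread *td, struct ksem *ks)
	{
		struct ucred *uc = td->td_ucred;
		mode_t bit;

		/* Pick one identity class, as chmod-style checks normally do. */
		if (uc->cr_uid == ks->ks_uid)
			bit = S_IWUSR;
		else if (uc->cr_gid == ks->ks_gid)
			bit = S_IWGRP;
		else
			bit = S_IWOTH;

		if (ks->ks_mode & bit)
			return (0);
		return (priv_check(td, PRIV_SEM_WRITE));
	}
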
Index: bus_if.m
===================================================================
RCS file: /home/cvs/src/sys/kern/bus_if.m,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/bus_if.m -L sys/kern/bus_if.m -u -r1.1.1.1 -r1.2
--- sys/kern/bus_if.m
+++ sys/kern/bus_if.m
@@ -23,7 +23,7 @@
 # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 # SUCH DAMAGE.
 #
-# $FreeBSD: src/sys/kern/bus_if.m,v 1.29 2005/01/06 23:35:38 imp Exp $
+# $FreeBSD: src/sys/kern/bus_if.m,v 1.34 2007/02/23 12:19:01 piso Exp $
 #
 
 #include <sys/bus.h>
@@ -326,6 +326,7 @@
 	device_t	_child;
 	struct resource *_irq;
 	int		_flags;
+	driver_filter_t	*_filter;
 	driver_intr_t	*_intr;
 	void		*_arg;
 	void		**_cookiep;
@@ -507,3 +508,36 @@
 	enum intr_trigger _trig;
 	enum intr_polarity _pol;
 } DEFAULT bus_generic_config_intr;
+
+/**
+ * @brief Notify a (bus) driver about a child that the hints mechanism
+ * believes it has discovered.
+ *
+ * The bus is responsible for then adding the child in the right order
+ * and discovering other things about the child.  The bus driver is
+ * free to ignore this hint, to do special things, etc.  It is all up
+ * to the bus driver to interpret.
+ *
+ * This method is only called in response to the parent bus asking for
+ * hinted devices to be enumerated.
+ *
+ * @param _dev		the bus device
+ * @param _dname	the name of the device w/o unit numbers
+ * @param _dunit	the unit number of the device
+ */
+METHOD void hinted_child {
+	device_t	_dev;
+	const char *	_dname;
+	int		_dunit;
+};
+
+/**
+ * @brief Returns bus_dma_tag_t for use w/ devices on the bus.
+ *
+ * @param _dev		the parent device of @p _child
+ * @param _child	the device to which the tag will belong
+ */
+METHOD bus_dma_tag_t get_dma_tag {
+	device_t	_dev;
+	device_t	_child;
+} DEFAULT bus_generic_get_dma_tag;
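
BUS_HINTED_CHILD above has no DEFAULT, so only buses that want hint-driven
enumeration implement it, while BUS_GET_DMA_TAG falls back to
bus_generic_get_dma_tag() and walks toward the root bus. A rough sketch of how
a hypothetical bus driver might honour a hint such as
hint.<dname>.<dunit>.at="foobus0" (all identifiers here are illustrative):

	#include <sys/param.h>
	#include <sys/kernel.h>
	#include <sys/module.h>
	#include <sys/bus.h>

	#include "bus_if.h"

	static void
	foobus_hinted_child(device_t bus, const char *dname, int dunit)
	{
		device_t child;

		/* Create the child the hints described; probing follows later. */
		child = device_add_child(bus, dname, dunit);
		if (child == NULL)
			device_printf(bus, "could not add hinted child %s%d\n",
			    dname, dunit);
	}

	static device_method_t foobus_methods[] = {
		/* ... probe/attach and the rest of the bus interface ... */
		DEVMETHOD(bus_hinted_child,	foobus_hinted_child),
		{ 0, 0 }
	};
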
Index: uipc_mbuf2.c
===================================================================
RCS file: /home/cvs/src/sys/kern/uipc_mbuf2.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/uipc_mbuf2.c -L sys/kern/uipc_mbuf2.c -u -r1.1.1.1 -r1.2
--- sys/kern/uipc_mbuf2.c
+++ sys/kern/uipc_mbuf2.c
@@ -61,7 +61,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/uipc_mbuf2.c,v 1.31.2.1 2005/07/25 00:08:12 rwatson Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/uipc_mbuf2.c,v 1.33 2006/10/22 11:52:13 rwatson Exp $");
 
 /*#define PULLDOWN_DEBUG*/
 
@@ -71,11 +71,12 @@
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
-#include <sys/mac.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/mutex.h>
 
+#include <security/mac/mac_framework.h>
+
 static MALLOC_DEFINE(M_PACKET_TAGS, MBUF_TAG_MEM_NAME,
     "packet-attached information");
 
Index: imgact_elf.c
===================================================================
RCS file: /home/cvs/src/sys/kern/imgact_elf.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/imgact_elf.c -L sys/kern/imgact_elf.c -u -r1.2 -r1.3
--- sys/kern/imgact_elf.c
+++ sys/kern/imgact_elf.c
@@ -29,7 +29,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/imgact_elf.c,v 1.162.2.3 2006/03/16 00:25:31 alc Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/imgact_elf.c,v 1.178.2.2.2.1 2008/01/19 18:15:05 kib Exp $");
 
 #include "opt_compat.h"
 
@@ -106,6 +106,10 @@
 
 static Elf_Brandinfo *elf_brand_list[MAX_BRANDS];
 
+#define	trunc_page_ps(va, ps)	((va) & ~(ps - 1))
+#define	round_page_ps(va, ps)	(((va) + (ps - 1)) & ~(ps - 1))
+#define	aligned(a, t)	(trunc_page_ps((u_long)(a), sizeof(t)) == (u_long)(a))
+
 int
 __elfN(insert_brand_entry)(Elf_Brandinfo *entry)
 {
@@ -145,7 +149,7 @@
 	int rval = FALSE;
 
 	sx_slock(&allproc_lock);
-	LIST_FOREACH(p, &allproc, p_list) {
+	FOREACH_PROC_IN_SYSTEM(p) {
 		if (p->p_sysent == entry->sysvec) {
 			rval = TRUE;
 			break;
@@ -360,9 +364,6 @@
 		return (ENOEXEC);
 	}
 
-#define trunc_page_ps(va, ps)	((va) & ~(ps - 1))
-#define round_page_ps(va, ps)	(((va) + (ps - 1)) & ~(ps - 1))
-
 	map_addr = trunc_page_ps((vm_offset_t)vmaddr, pagesize);
 	file_addr = trunc_page_ps(offset, pagesize);
 
@@ -549,6 +550,10 @@
 	}
 
 	phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff);
+	if (!aligned(phdr, Elf_Addr)) {
+		error = ENOEXEC;
+		goto fail;
+	}
 
 	for (i = 0, numsegs = 0; i < hdr->e_phnum; i++) {
 		if (phdr[i].p_type == PT_LOAD) {	/* Loadable segment */
@@ -592,11 +597,13 @@
 	return (error);
 }
 
+static const char FREEBSD_ABI_VENDOR[] = "FreeBSD";
+
 static int
 __CONCAT(exec_, __elfN(imgact))(struct image_params *imgp)
 {
 	const Elf_Ehdr *hdr = (const Elf_Ehdr *)imgp->image_header;
-	const Elf_Phdr *phdr;
+	const Elf_Phdr *phdr, *pnote = NULL;
 	Elf_Auxargs *elf_auxargs;
 	struct vmspace *vmspace;
 	vm_prot_t prot;
@@ -607,7 +614,9 @@
 	int error = 0, i;
 	const char *interp = NULL;
 	Elf_Brandinfo *brand_info;
+	const Elf_Note *note, *note_end;
 	char *path;
+	const char *note_name;
 	struct thread *td = curthread;
 	struct sysentvec *sv;
 
@@ -632,6 +641,8 @@
 		return (ENOEXEC);
 	}
 	phdr = (const Elf_Phdr *)(imgp->image_header + hdr->e_phoff);
+	if (!aligned(phdr, Elf_Addr))
+		return (ENOEXEC);
 	for (i = 0; i < hdr->e_phnum; i++) {
 		if (phdr[i].p_type == PT_INTERP) {
 			/* Path to interpreter */
@@ -649,7 +660,8 @@
 		    hdr->e_ident[EI_OSABI]);
 		return (ENOEXEC);
 	}
-	if (hdr->e_type == ET_DYN && brand_info->brand != ELFOSABI_LINUX)
+	if (hdr->e_type == ET_DYN &&
+	    (brand_info->flags & BI_CAN_EXEC_DYN) == 0)
 		return (ENOEXEC);
 	sv = brand_info->sysvec;
 	if (interp != NULL && brand_info->interp_newpath != NULL)
@@ -665,9 +677,12 @@
 	 */
 	VOP_UNLOCK(imgp->vp, 0, td);
 
-	exec_new_vmspace(imgp, sv);
+	error = exec_new_vmspace(imgp, sv);
+	imgp->proc->p_sysent = sv;
 
 	vn_lock(imgp->vp, LK_EXCLUSIVE | LK_RETRY, td);
+	if (error)
+		return (error);
 
 	vmspace = imgp->proc->p_vmspace;
 
@@ -743,6 +758,9 @@
 		case PT_PHDR: 	/* Program header table info */
 			proghdr = phdr[i].p_vaddr;
 			break;
+		case PT_NOTE:
+			pnote = &phdr[i];
+			break;
 		default:
 			break;
 		}
@@ -783,7 +801,6 @@
 
 	imgp->entry_addr = entry;
 
-	imgp->proc->p_sysent = sv;
 	if (interp != NULL) {
 		VOP_UNLOCK(imgp->vp, 0, td);
 		if (brand_info->emul_path != NULL &&
@@ -825,6 +842,41 @@
 	imgp->auxargs = elf_auxargs;
 	imgp->interpreted = 0;
 
+	/*
+	 * Try to fetch the osreldate for FreeBSD binary from the ELF
+	 * OSABI-note. Only the first page of the image is searched,
+	 * the same as for headers.
+	 */
+	if (pnote != NULL && pnote->p_offset < PAGE_SIZE &&
+	    pnote->p_offset + pnote->p_filesz < PAGE_SIZE ) {
+		note = (const Elf_Note *)(imgp->image_header + pnote->p_offset);
+		if (!aligned(note, Elf32_Addr)) {
+			free(imgp->auxargs, M_TEMP);
+			imgp->auxargs = NULL;
+			return (ENOEXEC);
+		}
+		note_end = (const Elf_Note *)(imgp->image_header + pnote->p_offset +
+		    pnote->p_filesz);
+		while (note < note_end) {
+			if (note->n_namesz == sizeof(FREEBSD_ABI_VENDOR) &&
+			    note->n_descsz == sizeof(int32_t) &&
+			    note->n_type == 1 /* ABI_NOTETYPE */) {
+				note_name = (const char *)(note + 1);
+				if (strncmp(FREEBSD_ABI_VENDOR, note_name,
+				    sizeof(FREEBSD_ABI_VENDOR)) == 0) {
+					imgp->proc->p_osrel = *(const int32_t *)
+					    (note_name +
+					    round_page_ps(sizeof(FREEBSD_ABI_VENDOR),
+						sizeof(Elf32_Addr)));
+					break;
+				}
+			}
+			note = (const Elf_Note *)((const char *)(note + 1) +
+			    round_page_ps(note->n_namesz, sizeof(Elf32_Addr)) +
+			    round_page_ps(note->n_descsz, sizeof(Elf32_Addr)));
+		}
+	}
+
 	return (error);
 }
 
@@ -891,8 +943,6 @@
 static void __elfN(putnote)(void *, size_t *, const char *, int,
     const void *, size_t);
 
-extern int osreldate;
-
 int
 __elfN(coredump)(td, vp, limit)
 	struct thread *td;
@@ -1017,11 +1067,12 @@
 	struct proc *p = td->td_proc;
 	vm_map_t map = &p->p_vmspace->vm_map;
 	vm_map_entry_t entry;
+	vm_object_t backing_object, object;
+	boolean_t ignore_entry;
 
+	vm_map_lock_read(map);
 	for (entry = map->header.next; entry != &map->header;
 	    entry = entry->next) {
-		vm_object_t obj;
-
 		/*
 		 * Don't dump inaccessible mappings, deal with legacy
 		 * coredump mode.
@@ -1047,21 +1098,25 @@
 		if (entry->eflags & (MAP_ENTRY_NOCOREDUMP|MAP_ENTRY_IS_SUB_MAP))
 			continue;
 
-		if ((obj = entry->object.vm_object) == NULL)
+		if ((object = entry->object.vm_object) == NULL)
 			continue;
 
-		/* Find the deepest backing object. */
-		while (obj->backing_object != NULL)
-			obj = obj->backing_object;
-
 		/* Ignore memory-mapped devices and such things. */
-		if (obj->type != OBJT_DEFAULT &&
-		    obj->type != OBJT_SWAP &&
-		    obj->type != OBJT_VNODE)
+		VM_OBJECT_LOCK(object);
+		while ((backing_object = object->backing_object) != NULL) {
+			VM_OBJECT_LOCK(backing_object);
+			VM_OBJECT_UNLOCK(object);
+			object = backing_object;
+		}
+		ignore_entry = object->type != OBJT_DEFAULT &&
+		    object->type != OBJT_SWAP && object->type != OBJT_VNODE;
+		VM_OBJECT_UNLOCK(object);
+		if (ignore_entry)
 			continue;
 
 		(*func)(entry, closure);
 	}
+	vm_map_unlock_read(map);
 }
 
 /*
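
The new osreldate detection in imgact_elf.c walks the PT_NOTE segment: each
Elf_Note header is followed by the vendor name and then the descriptor, each
padded to 4-byte (Elf32_Addr) alignment, which is what the
round_page_ps(..., sizeof(Elf32_Addr)) arithmetic accounts for. The same walk
as a self-contained userland sketch over an in-memory note region (types
simplified, bounds checks abbreviated; this is not the kernel code):

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	/* Simplified stand-ins for the ELF note header and 4-byte rounding. */
	struct note_hdr {
		uint32_t n_namesz;
		uint32_t n_descsz;
		uint32_t n_type;
	};

	#define	ROUND4(x)	(((x) + 3) & ~(uint32_t)3)

	/*
	 * Scan a PT_NOTE region for a "FreeBSD" note of type 1 (ABI_NOTETYPE)
	 * and return the 32-bit osreldate stored in its descriptor, or 0.
	 */
	static int32_t
	find_osreldate(const char *p, size_t len)
	{
		const char *end = p + len;

		while (p + sizeof(struct note_hdr) <= end) {
			const struct note_hdr *n = (const void *)p;
			const char *name = p + sizeof(*n);
			const char *desc = name + ROUND4(n->n_namesz);

			if (n->n_type == 1 && n->n_namesz == sizeof("FreeBSD") &&
			    n->n_descsz == sizeof(int32_t) &&
			    strncmp(name, "FreeBSD", sizeof("FreeBSD")) == 0) {
				int32_t osrel;

				memcpy(&osrel, desc, sizeof(osrel));
				return (osrel);
			}
			p = desc + ROUND4(n->n_descsz);
		}
		return (0);
	}
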
Index: vfs_hash.c
===================================================================
RCS file: /home/cvs/src/sys/kern/vfs_hash.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/vfs_hash.c -L sys/kern/vfs_hash.c -u -r1.1.1.1 -r1.2
--- sys/kern/vfs_hash.c
+++ sys/kern/vfs_hash.c
@@ -26,7 +26,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/vfs_hash.c,v 1.9.2.1 2005/09/12 15:53:58 tegge Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/vfs_hash.c,v 1.13 2007/03/13 01:50:26 tegge Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -35,7 +35,7 @@
 #include <sys/mount.h>
 #include <sys/vnode.h>
 
-static MALLOC_DEFINE(M_VFS_HASH, "VFS hash", "VFS hash table");
+static MALLOC_DEFINE(M_VFS_HASH, "vfs_hash", "VFS hash table");
 
 static LIST_HEAD(vfs_hash_head, vnode)	*vfs_hash_tbl;
 static LIST_HEAD(,vnode)		vfs_hash_side;
@@ -55,14 +55,14 @@
 SYSINIT(vfs_hash, SI_SUB_VFS, SI_ORDER_SECOND, vfs_hashinit, NULL)
 
 static struct vfs_hash_head *
-vfs_hash_index(struct mount *mp, u_int hash)
+vfs_hash_index(const struct mount *mp, u_int hash)
 {
 
 	return(&vfs_hash_tbl[(hash + mp->mnt_hashseed) & vfs_hash_mask]);
 }
 
 int
-vfs_hash_get(struct mount *mp, u_int hash, int flags, struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg)
+vfs_hash_get(const struct mount *mp, u_int hash, int flags, struct thread *td, struct vnode **vpp, vfs_hash_cmp_t *fn, void *arg)
 {
 	struct vnode *vp;
 	int error;
@@ -109,7 +109,6 @@
 	struct vnode *vp2;
 	int error;
 
-	lockmgr(vp->v_vnlock, flags & LK_TYPE_MASK, NULL, td);
 	*vpp = NULL;
 	while (1) {
 		mtx_lock(&vfs_hash_mtx);
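
vfs_hash_get() above (now taking a const struct mount *) is the lookup half of
the per-mount vnode hash; passing a NULL comparator matches on the hash value
alone, which is how inode-number keyed filesystems typically use it. A sketch
of that usage inside a hypothetical VFS_VGET() implementation (the filesystem
name is illustrative):

	#include <sys/param.h>
	#include <sys/systm.h>
	#include <sys/mount.h>
	#include <sys/proc.h>
	#include <sys/vnode.h>

	/* Sketch of the lookup half of a hypothetical ino-keyed VFS_VGET(). */
	static int
	examplefs_vget_cached(struct mount *mp, ino_t ino, int flags,
	    struct vnode **vpp)
	{
		int error;

		/* NULL comparator: match on the hash (the inode number) alone. */
		error = vfs_hash_get(mp, ino, flags, curthread, vpp, NULL, NULL);
		if (error != 0 || *vpp != NULL)
			return (error);
		/* Miss: the caller allocates a vnode and vfs_hash_insert()s it. */
		return (0);
	}
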
Index: sched_4bsd.c
===================================================================
RCS file: /home/cvs/src/sys/kern/sched_4bsd.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/sched_4bsd.c -L sys/kern/sched_4bsd.c -u -r1.2 -r1.3
--- sys/kern/sched_4bsd.c
+++ sys/kern/sched_4bsd.c
@@ -33,12 +33,10 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/sched_4bsd.c,v 1.77 2005/06/24 00:16:57 peter Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/sched_4bsd.c,v 1.106.2.1 2007/12/20 07:15:40 davidxu Exp $");
 
 #include "opt_hwpmc_hooks.h"
 
-#define kse td_sched
-
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
@@ -53,6 +51,8 @@
 #include <sys/sysctl.h>
 #include <sys/sx.h>
 #include <sys/turnstile.h>
+#include <sys/umtx.h>
+#include <machine/pcb.h>
 #include <machine/smp.h>
 
 #ifdef HWPMC_HOOKS
@@ -74,84 +74,35 @@
 #define	NICE_WEIGHT		1	/* Priorities per nice level. */
 
 /*
- * The schedulable entity that can be given a context to run.
- * A process may have several of these. Probably one per processor
- * but posibly a few more. In this universe they are grouped
- * with a KSEG that contains the priority and niceness
- * for the group.
- */
-struct kse {
-	TAILQ_ENTRY(kse) ke_procq;	/* (j/z) Run queue. */
-	struct thread	*ke_thread;	/* (*) Active associated thread. */
-	fixpt_t		ke_pctcpu;	/* (j) %cpu during p_swtime. */
-	char		ke_rqindex;	/* (j) Run queue index. */
-	enum {
-		KES_THREAD = 0x0,	/* slaved to thread state */
-		KES_ONRUNQ
-	} ke_state;			/* (j) KSE status. */
-	int		ke_cpticks;	/* (j) Ticks of cpu time. */
-	struct runq	*ke_runq;	/* runq the kse is currently on */
+ * The schedulable entity that runs a context.
+ * This is an extension to the thread structure and is tailored to
+ * the requirements of this scheduler.
+ */
+struct td_sched {
+	TAILQ_ENTRY(td_sched) ts_procq;	/* (j/z) Run queue. */
+	struct thread	*ts_thread;	/* (*) Active associated thread. */
+	fixpt_t		ts_pctcpu;	/* (j) %cpu during p_swtime. */
+	u_char		ts_rqindex;	/* (j) Run queue index. */
+	int		ts_cpticks;	/* (j) Ticks of cpu time. */
+	int		ts_slptime;	/* (j) Seconds !RUNNING. */
+	struct runq	*ts_runq;	/* runq the thread is currently on */
 };
 
-#define ke_proc		ke_thread->td_proc
-#define ke_ksegrp	ke_thread->td_ksegrp
-
-#define td_kse td_sched
-
 /* flags kept in td_flags */
-#define TDF_DIDRUN	TDF_SCHED0	/* KSE actually ran. */
-#define TDF_EXIT	TDF_SCHED1	/* KSE is being killed. */
+#define TDF_DIDRUN	TDF_SCHED0	/* thread actually ran. */
+#define TDF_EXIT	TDF_SCHED1	/* thread is being killed. */
 #define TDF_BOUND	TDF_SCHED2
 
-#define ke_flags	ke_thread->td_flags
-#define KEF_DIDRUN	TDF_DIDRUN /* KSE actually ran. */
-#define KEF_EXIT	TDF_EXIT /* KSE is being killed. */
-#define KEF_BOUND	TDF_BOUND /* stuck to one CPU */
-
-#define SKE_RUNQ_PCPU(ke)						\
-    ((ke)->ke_runq != 0 && (ke)->ke_runq != &runq)
-
-struct kg_sched {
-	struct thread	*skg_last_assigned; /* (j) Last thread assigned to */
-					   /* the system scheduler. */
-	int	skg_avail_opennings;	/* (j) Num KSEs requested in group. */
-	int	skg_concurrency;	/* (j) Num KSEs requested in group. */
-};
-#define kg_last_assigned	kg_sched->skg_last_assigned
-#define kg_avail_opennings	kg_sched->skg_avail_opennings
-#define kg_concurrency		kg_sched->skg_concurrency
-
-#define SLOT_RELEASE(kg)						\
-do {									\
-	kg->kg_avail_opennings++; 					\
-	CTR3(KTR_RUNQ, "kg %p(%d) Slot released (->%d)",		\
-	kg,								\
-	kg->kg_concurrency,						\
-	 kg->kg_avail_opennings);					\
-/*	KASSERT((kg->kg_avail_opennings <= kg->kg_concurrency),		\
-	    ("slots out of whack"));*/					\
-} while (0)
-
-#define SLOT_USE(kg)							\
-do {									\
-	kg->kg_avail_opennings--; 					\
-	CTR3(KTR_RUNQ, "kg %p(%d) Slot used (->%d)",			\
-	kg,								\
-	kg->kg_concurrency,						\
-	 kg->kg_avail_opennings);					\
-/*	KASSERT((kg->kg_avail_opennings >= 0),				\
-	    ("slots out of whack"));*/					\
-} while (0)
+#define ts_flags	ts_thread->td_flags
+#define TSF_DIDRUN	TDF_DIDRUN /* thread actually ran. */
+#define TSF_EXIT	TDF_EXIT /* thread is being killed. */
+#define TSF_BOUND	TDF_BOUND /* stuck to one CPU */
 
-/*
- * KSE_CAN_MIGRATE macro returns true if the kse can migrate between
- * cpus.
- */
-#define KSE_CAN_MIGRATE(ke)						\
-    ((ke)->ke_thread->td_pinned == 0 && ((ke)->ke_flags & KEF_BOUND) == 0)
+#define SKE_RUNQ_PCPU(ts)						\
+    ((ts)->ts_runq != 0 && (ts)->ts_runq != &runq)
 
-static struct kse kse0;
-static struct kg_sched kg_sched0;
+static struct td_sched td_sched0;
+struct mtx sched_lock;
 
 static int	sched_tdcnt;	/* Total runnable threads in the system. */
 static int	sched_quantum;	/* Roundrobin scheduling quantum in ticks. */
@@ -159,9 +110,6 @@
 
 static struct callout roundrobin_callout;
 
-static void	slot_fill(struct ksegrp *kg);
-static struct kse *sched_choose(void);		/* XXX Should be thread * */
-
 static void	setup_runqs(void);
 static void	roundrobin(void *arg);
 static void	schedcpu(void);
@@ -169,9 +117,9 @@
 static void	sched_priority(struct thread *td, u_char prio);
 static void	sched_setup(void *dummy);
 static void	maybe_resched(struct thread *td);
-static void	updatepri(struct ksegrp *kg);
-static void	resetpriority(struct ksegrp *kg);
-static void	resetpriority_thread(struct thread *td, struct ksegrp *kg);
+static void	updatepri(struct thread *td);
+static void	resetpriority(struct thread *td);
+static void	resetpriority_thread(struct thread *td);
 #ifdef SMP
 static int	forward_wakeup(int  cpunum);
 #endif
@@ -274,20 +222,12 @@
 	   "account for htt");
 
 #endif
+#if 0
 static int sched_followon = 0;
 SYSCTL_INT(_kern_sched, OID_AUTO, followon, CTLFLAG_RW,
 	   &sched_followon, 0,
 	   "allow threads to share a quantum");
-
-static int sched_pfollowons = 0;
-SYSCTL_INT(_kern_sched, OID_AUTO, pfollowons, CTLFLAG_RD,
-	   &sched_pfollowons, 0,
-	   "number of followons done to a different ksegrp");
-
-static int sched_kgfollowons = 0;
-SYSCTL_INT(_kern_sched, OID_AUTO, kgfollowons, CTLFLAG_RD,
-	   &sched_kgfollowons, 0,
-	   "number of followons done in a ksegrp");
+#endif
 
 static __inline void
 sched_load_add(void)
@@ -310,7 +250,7 @@
 maybe_resched(struct thread *td)
 {
 
-	mtx_assert(&sched_lock, MA_OWNED);
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	if (td->td_priority < curthread->td_priority)
 		curthread->td_flags |= TDF_NEEDRESCHED;
 }
@@ -338,20 +278,20 @@
 
 /*
  * Constants for digital decay and forget:
- *	90% of (kg_estcpu) usage in 5 * loadav time
- *	95% of (ke_pctcpu) usage in 60 seconds (load insensitive)
+ *	90% of (td_estcpu) usage in 5 * loadav time
+ *	95% of (ts_pctcpu) usage in 60 seconds (load insensitive)
  *          Note that, as ps(1) mentions, this can let percentages
  *          total over 100% (I've seen 137.9% for 3 processes).
  *
- * Note that schedclock() updates kg_estcpu and p_cpticks asynchronously.
+ * Note that schedclock() updates td_estcpu and p_cpticks asynchronously.
  *
- * We wish to decay away 90% of kg_estcpu in (5 * loadavg) seconds.
+ * We wish to decay away 90% of td_estcpu in (5 * loadavg) seconds.
  * That is, the system wants to compute a value of decay such
  * that the following for loop:
  * 	for (i = 0; i < (5 * loadavg); i++)
- * 		kg_estcpu *= decay;
+ * 		td_estcpu *= decay;
  * will compute
- * 	kg_estcpu *= 0.1;
+ * 	td_estcpu *= 0.1;
  * for all values of loadavg:
  *
  * Mathematically this loop can be expressed by saying:
@@ -404,7 +344,7 @@
 #define	loadfactor(loadav)	(2 * (loadav))
 #define	decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))
 
-/* decay 95% of `ke_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
+/* decay 95% of `ts_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */
 static fixpt_t	ccpu = 0.95122942450071400909 * FSCALE;	/* exp(-1/20) */
 SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, "");
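
The decay discussion above carries over from the original 4BSD scheduler, now
phrased in terms of td_estcpu and ts_pctcpu: decay_cpu() multiplies by
(2*loadav)/(2*loadav + 1) once per schedcpu() second, and ccpu = exp(-1/20)
scaled by FSCALE makes an idle thread's ts_pctcpu lose about 95% of its value
over 60 such passes. A small userland check of the two constants (floating
point here; the kernel keeps everything in FSHIFT fixed point):

	#include <math.h>
	#include <stdio.h>

	int
	main(void)
	{
		double loadav = 2.0;			/* sample load average */
		double decay = (2.0 * loadav) / (2.0 * loadav + 1.0);
		double estcpu = 255.0;			/* maximum td_estcpu */
		double pctcpu = 1.0;			/* 100% recent cpu */
		double ccpu = exp(-1.0 / 20.0);		/* the kernel's 0.951229... */
		int i;

		/* schedcpu() applies decay_cpu() once per second while runnable. */
		for (i = 0; i < 5 * (int)loadav; i++)
			estcpu *= decay;
		printf("estcpu left after 5*loadav seconds: %.1f%%\n",
		    100.0 * estcpu / 255.0);		/* ~10%, i.e. ~90% decayed */

		/* An idle thread's ts_pctcpu is multiplied by ccpu each second. */
		for (i = 0; i < 60; i++)
			pctcpu *= ccpu;
		printf("pctcpu left after 60 idle seconds: %.1f%%\n",
		    100.0 * pctcpu);			/* exp(-3) ~ 5%, i.e. ~95% decayed */
		return (0);
	}
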
 
@@ -433,79 +373,70 @@
 	register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]);
 	struct thread *td;
 	struct proc *p;
-	struct kse *ke;
-	struct ksegrp *kg;
+	struct td_sched *ts;
 	int awake, realstathz;
 
 	realstathz = stathz ? stathz : hz;
 	sx_slock(&allproc_lock);
 	FOREACH_PROC_IN_SYSTEM(p) {
-		/*
-		 * Prevent state changes and protect run queue.
-		 */
-		mtx_lock_spin(&sched_lock);
-		/*
-		 * Increment time in/out of memory.  We ignore overflow; with
-		 * 16-bit int's (remember them?) overflow takes 45 days.
-		 */
-		p->p_swtime++;
-		FOREACH_KSEGRP_IN_PROC(p, kg) { 
+		PROC_SLOCK(p);
+		FOREACH_THREAD_IN_PROC(p, td) { 
 			awake = 0;
-			FOREACH_THREAD_IN_GROUP(kg, td) {
-				ke = td->td_kse;
-				/*
-				 * Increment sleep time (if sleeping).  We
-				 * ignore overflow, as above.
-				 */
-				/*
-				 * The kse slptimes are not touched in wakeup
-				 * because the thread may not HAVE a KSE.
-				 */
-				if (ke->ke_state == KES_ONRUNQ) {
-					awake = 1;
-					ke->ke_flags &= ~KEF_DIDRUN;
-				} else if ((ke->ke_state == KES_THREAD) &&
-				    (TD_IS_RUNNING(td))) {
-					awake = 1;
-					/* Do not clear KEF_DIDRUN */
-				} else if (ke->ke_flags & KEF_DIDRUN) {
-					awake = 1;
-					ke->ke_flags &= ~KEF_DIDRUN;
-				}
+			thread_lock(td);
+			ts = td->td_sched;
+			/*
+			 * Increment sleep time (if sleeping).  We
+			 * ignore overflow, as above.
+			 */
+			/*
+			 * The td_sched slptimes are not touched in wakeup
+			 * because the thread may not HAVE everything in
+			 * memory? XXX I think this is out of date.
+			 */
+			if (TD_ON_RUNQ(td)) {
+				awake = 1;
+				ts->ts_flags &= ~TSF_DIDRUN;
+			} else if (TD_IS_RUNNING(td)) {
+				awake = 1;
+				/* Do not clear TSF_DIDRUN */
+			} else if (ts->ts_flags & TSF_DIDRUN) {
+				awake = 1;
+				ts->ts_flags &= ~TSF_DIDRUN;
+			}
 
-				/*
-				 * ke_pctcpu is only for ps and ttyinfo().
-				 * Do it per kse, and add them up at the end?
-				 * XXXKSE
-				 */
-				ke->ke_pctcpu = (ke->ke_pctcpu * ccpu) >>
-				    FSHIFT;
-				/*
-				 * If the kse has been idle the entire second,
-				 * stop recalculating its priority until
-				 * it wakes up.
-				 */
-				if (ke->ke_cpticks == 0)
-					continue;
+			/*
+			 * ts_pctcpu is only for ps and ttyinfo().
+			 * Do it per td_sched, and add them up at the end?
+			 * XXXKSE
+			 */
+			ts->ts_pctcpu = (ts->ts_pctcpu * ccpu) >> FSHIFT;
+			/*
+			 * If the td_sched has been idle the entire second,
+			 * stop recalculating its priority until
+			 * it wakes up.
+			 */
+			if (ts->ts_cpticks != 0) {
 #if	(FSHIFT >= CCPU_SHIFT)
-				ke->ke_pctcpu += (realstathz == 100)
-				    ? ((fixpt_t) ke->ke_cpticks) <<
+				ts->ts_pctcpu += (realstathz == 100)
+				    ? ((fixpt_t) ts->ts_cpticks) <<
 				    (FSHIFT - CCPU_SHIFT) :
-				    100 * (((fixpt_t) ke->ke_cpticks)
+				    100 * (((fixpt_t) ts->ts_cpticks)
 				    << (FSHIFT - CCPU_SHIFT)) / realstathz;
 #else
-				ke->ke_pctcpu += ((FSCALE - ccpu) *
-				    (ke->ke_cpticks *
+				ts->ts_pctcpu += ((FSCALE - ccpu) *
+				    (ts->ts_cpticks *
 				    FSCALE / realstathz)) >> FSHIFT;
 #endif
-				ke->ke_cpticks = 0;
-			} /* end of kse loop */
+				ts->ts_cpticks = 0;
+			}
 			/* 
-			 * If there are ANY running threads in this KSEGRP,
+			 * If there are ANY running threads in this process,
 			 * then don't count it as sleeping.
+XXX  this is broken
+
 			 */
 			if (awake) {
-				if (kg->kg_slptime > 1) {
+				if (ts->ts_slptime > 1) {
 					/*
 					 * In an ideal world, this should not
 					 * happen, because whoever woke us
@@ -515,20 +446,21 @@
 					 * priority.  Should KASSERT at some
 					 * point when all the cases are fixed.
 					 */
-					updatepri(kg);
+					updatepri(td);
 				}
-				kg->kg_slptime = 0;
+				ts->ts_slptime = 0;
 			} else
-				kg->kg_slptime++;
-			if (kg->kg_slptime > 1)
+				ts->ts_slptime++;
+			if (ts->ts_slptime > 1) {
+				thread_unlock(td);
 				continue;
-			kg->kg_estcpu = decay_cpu(loadfac, kg->kg_estcpu);
-		      	resetpriority(kg);
-			FOREACH_THREAD_IN_GROUP(kg, td) {
-				resetpriority_thread(td, kg);
 			}
-		} /* end of ksegrp loop */
-		mtx_unlock_spin(&sched_lock);
+			td->td_estcpu = decay_cpu(loadfac, td->td_estcpu);
+		      	resetpriority(td);
+			resetpriority_thread(td);
+			thread_unlock(td);
+		} /* end of thread loop */
+		PROC_SUNLOCK(p);
 	} /* end of process loop */
 	sx_sunlock(&allproc_lock);
 }
@@ -539,34 +471,35 @@
 static void
 schedcpu_thread(void)
 {
-	int nowake;
 
 	for (;;) {
 		schedcpu();
-		tsleep(&nowake, 0, "-", hz);
+		pause("-", hz);
 	}
 }
 
 /*
  * Recalculate the priority of a process after it has slept for a while.
- * For all load averages >= 1 and max kg_estcpu of 255, sleeping for at
- * least six times the loadfactor will decay kg_estcpu to zero.
+ * For all load averages >= 1 and max td_estcpu of 255, sleeping for at
+ * least six times the loadfactor will decay td_estcpu to zero.
  */
 static void
-updatepri(struct ksegrp *kg)
+updatepri(struct thread *td)
 {
-	register fixpt_t loadfac;
-	register unsigned int newcpu;
+	struct td_sched *ts;
+	fixpt_t loadfac;
+	unsigned int newcpu;
 
+	ts = td->td_sched;
 	loadfac = loadfactor(averunnable.ldavg[0]);
-	if (kg->kg_slptime > 5 * loadfac)
-		kg->kg_estcpu = 0;
+	if (ts->ts_slptime > 5 * loadfac)
+		td->td_estcpu = 0;
 	else {
-		newcpu = kg->kg_estcpu;
-		kg->kg_slptime--;	/* was incremented in schedcpu() */
-		while (newcpu && --kg->kg_slptime)
+		newcpu = td->td_estcpu;
+		ts->ts_slptime--;	/* was incremented in schedcpu() */
+		while (newcpu && --ts->ts_slptime)
 			newcpu = decay_cpu(loadfac, newcpu);
-		kg->kg_estcpu = newcpu;
+		td->td_estcpu = newcpu;
 	}
 }
 
@@ -576,25 +509,25 @@
  * than that of the current process.
  */
 static void
-resetpriority(struct ksegrp *kg)
+resetpriority(struct thread *td)
 {
 	register unsigned int newpriority;
 
-	if (kg->kg_pri_class == PRI_TIMESHARE) {
-		newpriority = PUSER + kg->kg_estcpu / INVERSE_ESTCPU_WEIGHT +
-		    NICE_WEIGHT * (kg->kg_proc->p_nice - PRIO_MIN);
+	if (td->td_pri_class == PRI_TIMESHARE) {
+		newpriority = PUSER + td->td_estcpu / INVERSE_ESTCPU_WEIGHT +
+		    NICE_WEIGHT * (td->td_proc->p_nice - PRIO_MIN);
 		newpriority = min(max(newpriority, PRI_MIN_TIMESHARE),
 		    PRI_MAX_TIMESHARE);
-		kg->kg_user_pri = newpriority;
+		sched_user_prio(td, newpriority);
 	}
 }
 
 /*
- * Update the thread's priority when the associated ksegroup's user
+ * Update the thread's priority when the associated process's user
  * priority changes.
  */
 static void
-resetpriority_thread(struct thread *td, struct ksegrp *kg)
+resetpriority_thread(struct thread *td)
 {
 
 	/* Only change threads with a time sharing user priority. */
@@ -605,7 +538,7 @@
 	/* XXX the whole needresched thing is broken, but not silly. */
 	maybe_resched(td);
 
-	sched_prio(td, kg->kg_user_pri);
+	sched_prio(td, td->td_user_pri);
 }
 
 /* ARGSUSED */
@@ -641,12 +574,10 @@
 	 * Set up the scheduler specific parts of proc0.
 	 */
 	proc0.p_sched = NULL; /* XXX */
-	ksegrp0.kg_sched = &kg_sched0;
-	thread0.td_sched = &kse0;
-	kse0.ke_thread = &thread0;
-	kse0.ke_state = KES_THREAD;
-	kg_sched0.skg_concurrency = 1;
-	kg_sched0.skg_avail_opennings = 0; /* we are already running */
+	thread0.td_sched = &td_sched0;
+	thread0.td_lock = &sched_lock;
+	td_sched0.ts_thread = &thread0;
+	mtx_init(&sched_lock, "sched lock", NULL, MTX_SPIN | MTX_RECURSE);
 }
 
 int
@@ -670,8 +601,8 @@
 /*
  * We adjust the priority of the current process.  The priority of
  * a process gets worse as it accumulates CPU time.  The cpu usage
- * estimator (kg_estcpu) is increased here.  resetpriority() will
- * compute a different priority each time kg_estcpu increases by
+ * estimator (td_estcpu) is increased here.  resetpriority() will
+ * compute a different priority each time td_estcpu increases by
  * INVERSE_ESTCPU_WEIGHT
  * (until MAXPRI is reached).  The cpu usage estimator ramps up
  * quite quickly when the process is running (linearly), and decays
@@ -684,102 +615,86 @@
 void
 sched_clock(struct thread *td)
 {
-	struct ksegrp *kg;
-	struct kse *ke;
+	struct td_sched *ts;
 
-	mtx_assert(&sched_lock, MA_OWNED);
-	kg = td->td_ksegrp;
-	ke = td->td_kse;
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
+	ts = td->td_sched;
 
-	ke->ke_cpticks++;
-	kg->kg_estcpu = ESTCPULIM(kg->kg_estcpu + 1);
-	if ((kg->kg_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) {
-		resetpriority(kg);
-		resetpriority_thread(td, kg);
+	ts->ts_cpticks++;
+	td->td_estcpu = ESTCPULIM(td->td_estcpu + 1);
+	if ((td->td_estcpu % INVERSE_ESTCPU_WEIGHT) == 0) {
+		resetpriority(td);
+		resetpriority_thread(td);
 	}
 }
 
 /*
  * charge child's scheduling cpu usage to parent.
- *
- * XXXKSE assume only one thread & kse & ksegrp keep estcpu in each ksegrp.
- * Charge it to the ksegrp that did the wait since process estcpu is sum of
- * all ksegrps, this is strictly as expected.  Assume that the child process
- * aggregated all the estcpu into the 'built-in' ksegrp.
  */
 void
 sched_exit(struct proc *p, struct thread *td)
 {
-	sched_exit_ksegrp(FIRST_KSEGRP_IN_PROC(p), td);
-	sched_exit_thread(FIRST_THREAD_IN_PROC(p), td);
-}
 
-void
-sched_exit_ksegrp(struct ksegrp *kg, struct thread *childtd)
-{
-
-	mtx_assert(&sched_lock, MA_OWNED);
-	kg->kg_estcpu = ESTCPULIM(kg->kg_estcpu + childtd->td_ksegrp->kg_estcpu);
+	CTR3(KTR_SCHED, "sched_exit: %p(%s) prio %d",
+	    td, td->td_proc->p_comm, td->td_priority);
+	PROC_SLOCK_ASSERT(p, MA_OWNED);
+	sched_exit_thread(FIRST_THREAD_IN_PROC(p), td);
 }
 
 void
 sched_exit_thread(struct thread *td, struct thread *child)
 {
+
 	CTR3(KTR_SCHED, "sched_exit_thread: %p(%s) prio %d",
 	    child, child->td_proc->p_comm, child->td_priority);
+	thread_lock(td);
+	td->td_estcpu = ESTCPULIM(td->td_estcpu + child->td_estcpu);
+	thread_unlock(td);
+	mtx_lock_spin(&sched_lock);
 	if ((child->td_proc->p_flag & P_NOLOAD) == 0)
 		sched_load_rem();
+	mtx_unlock_spin(&sched_lock);
 }
 
 void
 sched_fork(struct thread *td, struct thread *childtd)
 {
-	sched_fork_ksegrp(td, childtd->td_ksegrp);
 	sched_fork_thread(td, childtd);
 }
 
 void
-sched_fork_ksegrp(struct thread *td, struct ksegrp *child)
-{
-	mtx_assert(&sched_lock, MA_OWNED);
-	child->kg_estcpu = td->td_ksegrp->kg_estcpu;
-}
-
-void
 sched_fork_thread(struct thread *td, struct thread *childtd)
 {
+	childtd->td_estcpu = td->td_estcpu;
+	childtd->td_lock = &sched_lock;
 	sched_newthread(childtd);
 }
 
 void
 sched_nice(struct proc *p, int nice)
 {
-	struct ksegrp *kg;
 	struct thread *td;
 
 	PROC_LOCK_ASSERT(p, MA_OWNED);
-	mtx_assert(&sched_lock, MA_OWNED);
+	PROC_SLOCK_ASSERT(p, MA_OWNED);
 	p->p_nice = nice;
-	FOREACH_KSEGRP_IN_PROC(p, kg) {
-		resetpriority(kg);
-		FOREACH_THREAD_IN_GROUP(kg, td) {
-			resetpriority_thread(td, kg);
-		}
+	FOREACH_THREAD_IN_PROC(p, td) {
+		thread_lock(td);
+		resetpriority(td);
+		resetpriority_thread(td);
+		thread_unlock(td);
 	}
 }
 
 void
-sched_class(struct ksegrp *kg, int class)
+sched_class(struct thread *td, int class)
 {
-	mtx_assert(&sched_lock, MA_OWNED);
-	kg->kg_pri_class = class;
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
+	td->td_pri_class = class;
 }
 
 /*
  * Adjust the priority of a thread.
- * This may include moving the thread within the KSEGRP,
- * changing the assignment of a kse to the thread,
- * and moving a KSE in the system run queue.
  */
 static void
 sched_priority(struct thread *td, u_char prio)
@@ -788,13 +703,14 @@
 	    td, td->td_proc->p_comm, td->td_priority, prio, curthread, 
 	    curthread->td_proc->p_comm);
 
-	mtx_assert(&sched_lock, MA_OWNED);
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	if (td->td_priority == prio)
 		return;
-	if (TD_ON_RUNQ(td)) {
-		adjustrunqueue(td, prio);
-	} else {
-		td->td_priority = prio;
+	td->td_priority = prio;
+	if (TD_ON_RUNQ(td) && 
+	    td->td_sched->ts_rqindex != (prio / RQ_PPQ)) {
+		sched_rem(td);
+		sched_add(td, SRQ_BORING);
 	}
 }
 
@@ -825,7 +741,7 @@
 
 	if (td->td_base_pri >= PRI_MIN_TIMESHARE &&
 	    td->td_base_pri <= PRI_MAX_TIMESHARE)
-		base_pri = td->td_ksegrp->kg_user_pri;
+		base_pri = td->td_user_pri;
 	else
 		base_pri = td->td_base_pri;
 	if (prio >= base_pri) {
@@ -863,54 +779,75 @@
 }
 
 void
-sched_sleep(struct thread *td)
+sched_user_prio(struct thread *td, u_char prio)
 {
+	u_char oldprio;
 
-	mtx_assert(&sched_lock, MA_OWNED);
-	td->td_ksegrp->kg_slptime = 0;
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
+	td->td_base_user_pri = prio;
+	if (td->td_flags & TDF_UBORROWING && td->td_user_pri <= prio)
+		return;
+	oldprio = td->td_user_pri;
+	td->td_user_pri = prio;
+}
+
+void
+sched_lend_user_prio(struct thread *td, u_char prio)
+{
+	u_char oldprio;
+
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
+	td->td_flags |= TDF_UBORROWING;
+
+	oldprio = td->td_user_pri;
+	td->td_user_pri = prio;
+}
+
+void
+sched_unlend_user_prio(struct thread *td, u_char prio)
+{
+	u_char base_pri;
+
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
+	base_pri = td->td_base_user_pri;
+	if (prio >= base_pri) {
+		td->td_flags &= ~TDF_UBORROWING;
+		sched_user_prio(td, base_pri);
+	} else {
+		sched_lend_user_prio(td, prio);
+	}
 }
 
-static void remrunqueue(struct thread *td);
+void
+sched_sleep(struct thread *td)
+{
+
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
+	td->td_slptick = ticks;
+	td->td_sched->ts_slptime = 0;
+}
 
 void
 sched_switch(struct thread *td, struct thread *newtd, int flags)
 {
-	struct kse *ke;
-	struct ksegrp *kg;
+	struct td_sched *ts;
 	struct proc *p;
 
-	ke = td->td_kse;
+	ts = td->td_sched;
 	p = td->td_proc;
 
-	mtx_assert(&sched_lock, MA_OWNED);
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
+	/*  
+	 * Switch to the sched lock to fix things up and pick
+	 * a new thread.
+	 */
+	if (td->td_lock != &sched_lock) {
+		mtx_lock_spin(&sched_lock);
+		thread_unlock(td);
+	}
 
 	if ((p->p_flag & P_NOLOAD) == 0)
 		sched_load_rem();
-	/* 
-	 * We are volunteering to switch out so we get to nominate
-	 * a successor for the rest of our quantum
-	 * First try another thread in our ksegrp, and then look for 
-	 * other ksegrps in our process.
-	 */
-	if (sched_followon &&
-	    (p->p_flag & P_HADTHREADS) &&
-	    (flags & SW_VOL) &&
-	    newtd == NULL) {
-		/* lets schedule another thread from this process */
-		 kg = td->td_ksegrp;
-		 if ((newtd = TAILQ_FIRST(&kg->kg_runq))) {
-			remrunqueue(newtd);
-			sched_kgfollowons++;
-		 } else {
-			FOREACH_KSEGRP_IN_PROC(p, kg) {
-				if ((newtd = TAILQ_FIRST(&kg->kg_runq))) {
-					sched_pfollowons++;
-					remrunqueue(newtd);
-					break;
-				}
-			}
-		}
-	}
 
 	if (newtd) 
 		newtd->td_flags |= (td->td_flags & TDF_NEEDRESCHED);
@@ -925,24 +862,17 @@
 	 * or stopped or anything else similar.  We never put the idle
 	 * threads on the run queue, however.
 	 */
-	if (td == PCPU_GET(idlethread))
+	if (td->td_flags & TDF_IDLETD) {
 		TD_SET_CAN_RUN(td);
-	else {
-		SLOT_RELEASE(td->td_ksegrp);
+#ifdef SMP
+		idle_cpus_mask &= ~PCPU_GET(cpumask);
+#endif
+	} else {
 		if (TD_IS_RUNNING(td)) {
-			/* Put us back on the run queue (kse and all). */
-			setrunqueue(td, (flags & SW_PREEMPT) ?
+			/* Put us back on the run queue. */
+			sched_add(td, (flags & SW_PREEMPT) ?
 			    SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
 			    SRQ_OURSELF|SRQ_YIELDING);
-		} else if (p->p_flag & P_HADTHREADS) {
-			/*
-			 * We will not be on the run queue. So we must be
-			 * sleeping or similar. As it's available,
-			 * someone else can use the KSE if they need it.
-			 * It's NOT available if we are about to need it
-			 */
-			if (newtd == NULL || newtd->td_ksegrp != td->td_ksegrp)
-				slot_fill(td->td_ksegrp);
 		}
 	}
 	if (newtd) {
@@ -955,45 +885,68 @@
 		 * * A followon
 		 */
 		KASSERT((newtd->td_inhibitors == 0),
-			("trying to run inhibitted thread"));
-		SLOT_USE(newtd->td_ksegrp);
-		newtd->td_kse->ke_flags |= KEF_DIDRUN;
+			("trying to run inhibited thread"));
+		newtd->td_sched->ts_flags |= TSF_DIDRUN;
         	TD_SET_RUNNING(newtd);
 		if ((newtd->td_proc->p_flag & P_NOLOAD) == 0)
 			sched_load_add();
 	} else {
 		newtd = choosethread();
 	}
+	MPASS(newtd->td_lock == &sched_lock);
 
 	if (td != newtd) {
 #ifdef	HWPMC_HOOKS
 		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
 			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_OUT);
 #endif
-		cpu_switch(td, newtd);
+
+		/* I feel sleepy */
+		cpu_switch(td, newtd, td->td_lock);
+		/*
+		 * Where am I?  What year is it?
+		 * We are in the same thread that went to sleep above,
+		 * but any amount of time may have passed. All our context
+		 * will still be available as will local variables.
+		 * PCPU values however may have changed as we may have
+		 * changed CPU so don't trust cached values of them.
+		 * New threads will go to fork_exit() instead of here
+		 * so if you change things here you may need to change
+		 * things there too.
+		 * If the thread above was exiting it will never wake
+		 * up again here, so either it has saved everything it
+		 * needed to, or the thread_wait() or wait() will
+		 * need to reap it.
+		 */
 #ifdef	HWPMC_HOOKS
 		if (PMC_PROC_IS_USING_PMCS(td->td_proc))
 			PMC_SWITCH_CONTEXT(td, PMC_FN_CSW_IN);
 #endif
 	}
 
+#ifdef SMP
+	if (td->td_flags & TDF_IDLETD)
+		idle_cpus_mask |= PCPU_GET(cpumask);
+#endif
 	sched_lock.mtx_lock = (uintptr_t)td;
 	td->td_oncpu = PCPU_GET(cpuid);
+	MPASS(td->td_lock == &sched_lock);
 }
 
 void
 sched_wakeup(struct thread *td)
 {
-	struct ksegrp *kg;
+	struct td_sched *ts;
 
-	mtx_assert(&sched_lock, MA_OWNED);
-	kg = td->td_ksegrp;
-	if (kg->kg_slptime > 1) {
-		updatepri(kg);
-		resetpriority(kg);
-	}
-	kg->kg_slptime = 0;
-	setrunqueue(td, SRQ_BORING);
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
+	ts = td->td_sched;
+	if (ts->ts_slptime > 1) {
+		updatepri(td);
+		resetpriority(td);
+	}
+	td->td_slptick = ticks;
+	ts->ts_slptime = 0;
+	sched_add(td, SRQ_BORING);
 }
 
 #ifdef SMP
@@ -1123,41 +1076,50 @@
 sched_add(struct thread *td, int flags)
 #ifdef SMP
 {
-	struct kse *ke;
+	struct td_sched *ts;
 	int forwarded = 0;
 	int cpu;
 	int single_cpu = 0;
 
-	ke = td->td_kse;
-	mtx_assert(&sched_lock, MA_OWNED);
-	KASSERT(ke->ke_state != KES_ONRUNQ,
-	    ("sched_add: kse %p (%s) already in run queue", ke,
-	    ke->ke_proc->p_comm));
-	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
-	    ("sched_add: process swapped out"));
+	ts = td->td_sched;
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
+	KASSERT((td->td_inhibitors == 0),
+	    ("sched_add: trying to run inhibited thread"));
+	KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
+	    ("sched_add: bad thread state"));
+	KASSERT(td->td_flags & TDF_INMEM,
+	    ("sched_add: thread swapped out"));
 	CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)",
 	    td, td->td_proc->p_comm, td->td_priority, curthread,
 	    curthread->td_proc->p_comm);
-
+	/*
+	 * Now that the thread is moving to the run-queue, set the lock
+	 * to the scheduler's lock.
+	 */
+	if (td->td_lock != &sched_lock) {
+		mtx_lock_spin(&sched_lock);
+		thread_lock_set(td, &sched_lock);
+	}
+	TD_SET_RUNQ(td);
 
 	if (td->td_pinned != 0) {
 		cpu = td->td_lastcpu;
-		ke->ke_runq = &runq_pcpu[cpu];
+		ts->ts_runq = &runq_pcpu[cpu];
 		single_cpu = 1;
 		CTR3(KTR_RUNQ,
-		    "sched_add: Put kse:%p(td:%p) on cpu%d runq", ke, td, cpu);
-	} else if ((ke)->ke_flags & KEF_BOUND) {
+		    "sched_add: Put td_sched:%p(td:%p) on cpu%d runq", ts, td, cpu);
+	} else if ((ts)->ts_flags & TSF_BOUND) {
 		/* Find CPU from bound runq */
-		KASSERT(SKE_RUNQ_PCPU(ke),("sched_add: bound kse not on cpu runq"));
-		cpu = ke->ke_runq - &runq_pcpu[0];
+		KASSERT(SKE_RUNQ_PCPU(ts),("sched_add: bound td_sched not on cpu runq"));
+		cpu = ts->ts_runq - &runq_pcpu[0];
 		single_cpu = 1;
 		CTR3(KTR_RUNQ,
-		    "sched_add: Put kse:%p(td:%p) on cpu%d runq", ke, td, cpu);
+		    "sched_add: Put td_sched:%p(td:%p) on cpu%d runq", ts, td, cpu);
 	} else {	
 		CTR2(KTR_RUNQ,
-		    "sched_add: adding kse:%p (td:%p) to gbl runq", ke, td);
+		    "sched_add: adding td_sched:%p (td:%p) to gbl runq", ts, td);
 		cpu = NOCPU;
-		ke->ke_runq = &runq;
+		ts->ts_runq = &runq;
 	}
 	
 	if (single_cpu && (cpu != PCPU_GET(cpuid))) {
@@ -1183,25 +1145,33 @@
 	
 	if ((td->td_proc->p_flag & P_NOLOAD) == 0)
 		sched_load_add();
-	SLOT_USE(td->td_ksegrp);
-	runq_add(ke->ke_runq, ke, flags);
-	ke->ke_state = KES_ONRUNQ;
+	runq_add(ts->ts_runq, ts, flags);
 }
 #else /* SMP */
 {
-	struct kse *ke;
-	ke = td->td_kse;
-	mtx_assert(&sched_lock, MA_OWNED);
-	KASSERT(ke->ke_state != KES_ONRUNQ,
-	    ("sched_add: kse %p (%s) already in run queue", ke,
-	    ke->ke_proc->p_comm));
-	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
-	    ("sched_add: process swapped out"));
+	struct td_sched *ts;
+	ts = td->td_sched;
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
+	KASSERT((td->td_inhibitors == 0),
+	    ("sched_add: trying to run inhibited thread"));
+	KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
+	    ("sched_add: bad thread state"));
+	KASSERT(td->td_flags & TDF_INMEM,
+	    ("sched_add: thread swapped out"));
 	CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)",
 	    td, td->td_proc->p_comm, td->td_priority, curthread,
 	    curthread->td_proc->p_comm);
-	CTR2(KTR_RUNQ, "sched_add: adding kse:%p (td:%p) to runq", ke, td);
-	ke->ke_runq = &runq;
+	/*
+	 * Now that the thread is moving to the run-queue, set the lock
+	 * to the scheduler's lock.
+	 */
+	if (td->td_lock != &sched_lock) {
+		mtx_lock_spin(&sched_lock);
+		thread_lock_set(td, &sched_lock);
+	}
+	TD_SET_RUNQ(td);
+	CTR2(KTR_RUNQ, "sched_add: adding td_sched:%p (td:%p) to runq", ts, td);
+	ts->ts_runq = &runq;
 
 	/* 
 	 * If we are yielding (on the way out anyhow) 
@@ -1220,9 +1190,7 @@
 	}	
 	if ((td->td_proc->p_flag & P_NOLOAD) == 0)
 		sched_load_add();
-	SLOT_USE(td->td_ksegrp);
-	runq_add(ke->ke_runq, ke, flags);
-	ke->ke_state = KES_ONRUNQ;
+	runq_add(ts->ts_runq, ts, flags);
 	maybe_resched(td);
 }
 #endif /* SMP */
@@ -1230,13 +1198,13 @@
 void
 sched_rem(struct thread *td)
 {
-	struct kse *ke;
+	struct td_sched *ts;
 
-	ke = td->td_kse;
-	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
-	    ("sched_rem: process swapped out"));
-	KASSERT((ke->ke_state == KES_ONRUNQ),
-	    ("sched_rem: KSE not on run queue"));
+	ts = td->td_sched;
+	KASSERT(td->td_flags & TDF_INMEM,
+	    ("sched_rem: thread swapped out"));
+	KASSERT(TD_ON_RUNQ(td),
+	    ("sched_rem: thread not on run queue"));
 	mtx_assert(&sched_lock, MA_OWNED);
 	CTR5(KTR_SCHED, "sched_rem: %p(%s) prio %d by %p(%s)",
 	    td, td->td_proc->p_comm, td->td_priority, curthread,
@@ -1244,59 +1212,58 @@
 
 	if ((td->td_proc->p_flag & P_NOLOAD) == 0)
 		sched_load_rem();
-	SLOT_RELEASE(td->td_ksegrp);
-	runq_remove(ke->ke_runq, ke);
-
-	ke->ke_state = KES_THREAD;
+	runq_remove(ts->ts_runq, ts);
+	TD_SET_CAN_RUN(td);
 }
 
 /*
  * Select threads to run.
  * Notice that the running threads still consume a slot.
  */
-struct kse *
+struct thread *
 sched_choose(void)
 {
-	struct kse *ke;
+	struct td_sched *ts;
 	struct runq *rq;
 
+	mtx_assert(&sched_lock, MA_OWNED);
 #ifdef SMP
-	struct kse *kecpu;
+	struct td_sched *kecpu;
 
 	rq = &runq;
-	ke = runq_choose(&runq);
+	ts = runq_choose(&runq);
 	kecpu = runq_choose(&runq_pcpu[PCPU_GET(cpuid)]);
 
-	if (ke == NULL || 
+	if (ts == NULL || 
 	    (kecpu != NULL && 
-	     kecpu->ke_thread->td_priority < ke->ke_thread->td_priority)) {
-		CTR2(KTR_RUNQ, "choosing kse %p from pcpu runq %d", kecpu,
+	     kecpu->ts_thread->td_priority < ts->ts_thread->td_priority)) {
+		CTR2(KTR_RUNQ, "choosing td_sched %p from pcpu runq %d", kecpu,
 		     PCPU_GET(cpuid));
-		ke = kecpu;
+		ts = kecpu;
 		rq = &runq_pcpu[PCPU_GET(cpuid)];
 	} else { 
-		CTR1(KTR_RUNQ, "choosing kse %p from main runq", ke);
+		CTR1(KTR_RUNQ, "choosing td_sched %p from main runq", ts);
 	}
 
 #else
 	rq = &runq;
-	ke = runq_choose(&runq);
+	ts = runq_choose(&runq);
 #endif
 
-	if (ke != NULL) {
-		runq_remove(rq, ke);
-		ke->ke_state = KES_THREAD;
-
-		KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
-		    ("sched_choose: process swapped out"));
-	}
-	return (ke);
+	if (ts) {
+		runq_remove(rq, ts);
+		ts->ts_flags |= TSF_DIDRUN;
+
+		KASSERT(ts->ts_thread->td_flags & TDF_INMEM,
+		    ("sched_choose: thread swapped out"));
+		return (ts->ts_thread);
+	} 
+	return (PCPU_GET(idlethread));
 }
 
 void
 sched_userret(struct thread *td)
 {
-	struct ksegrp *kg;
 	/*
 	 * XXX we cheat slightly on the locking here to avoid locking in
 	 * the usual case.  Setting td_priority here is essentially an
@@ -1308,34 +1275,31 @@
 	 */
 	KASSERT((td->td_flags & TDF_BORROWING) == 0,
 	    ("thread with borrowed priority returning to userland"));
-	kg = td->td_ksegrp;
-	if (td->td_priority != kg->kg_user_pri) {
-		mtx_lock_spin(&sched_lock);
-		td->td_priority = kg->kg_user_pri;
-		td->td_base_pri = kg->kg_user_pri;
-		mtx_unlock_spin(&sched_lock);
+	if (td->td_priority != td->td_user_pri) {
+		thread_lock(td);
+		td->td_priority = td->td_user_pri;
+		td->td_base_pri = td->td_user_pri;
+		thread_unlock(td);
 	}
 }
 
 void
 sched_bind(struct thread *td, int cpu)
 {
-	struct kse *ke;
+	struct td_sched *ts;
 
-	mtx_assert(&sched_lock, MA_OWNED);
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	KASSERT(TD_IS_RUNNING(td),
 	    ("sched_bind: cannot bind non-running thread"));
 
-	ke = td->td_kse;
+	ts = td->td_sched;
 
-	ke->ke_flags |= KEF_BOUND;
+	ts->ts_flags |= TSF_BOUND;
 #ifdef SMP
-	ke->ke_runq = &runq_pcpu[cpu];
+	ts->ts_runq = &runq_pcpu[cpu];
 	if (PCPU_GET(cpuid) == cpu)
 		return;
 
-	ke->ke_state = KES_THREAD;
-
 	mi_switch(SW_VOL, NULL);
 #endif
 }
@@ -1343,48 +1307,121 @@
 void
 sched_unbind(struct thread* td)
 {
-	mtx_assert(&sched_lock, MA_OWNED);
-	td->td_kse->ke_flags &= ~KEF_BOUND;
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
+	td->td_sched->ts_flags &= ~TSF_BOUND;
 }
 
 int
 sched_is_bound(struct thread *td)
 {
-	mtx_assert(&sched_lock, MA_OWNED);
-	return (td->td_kse->ke_flags & KEF_BOUND);
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
+	return (td->td_sched->ts_flags & TSF_BOUND);
 }
 
-int
-sched_load(void)
+void
+sched_relinquish(struct thread *td)
 {
-	return (sched_tdcnt);
+	thread_lock(td);
+	SCHED_STAT_INC(switch_relinquish);
+	mi_switch(SW_VOL, NULL);
+	thread_unlock(td);
 }
 
 int
-sched_sizeof_ksegrp(void)
+sched_load(void)
 {
-	return (sizeof(struct ksegrp) + sizeof(struct kg_sched));
+	return (sched_tdcnt);
 }
+
 int
 sched_sizeof_proc(void)
 {
 	return (sizeof(struct proc));
 }
+
 int
 sched_sizeof_thread(void)
 {
-	return (sizeof(struct thread) + sizeof(struct kse));
+	return (sizeof(struct thread) + sizeof(struct td_sched));
 }
 
 fixpt_t
 sched_pctcpu(struct thread *td)
 {
-	struct kse *ke;
+	struct td_sched *ts;
 
-	ke = td->td_kse;
-	return (ke->ke_pctcpu);
+	ts = td->td_sched;
+	return (ts->ts_pctcpu);
+}
 
-	return (0);
+void
+sched_tick(void)
+{
+}
+
+/*
+ * The actual idle process.
+ */
+void
+sched_idletd(void *dummy)
+{
+	struct proc *p;
+	struct thread *td;
+
+	td = curthread;
+	p = td->td_proc;
+	for (;;) {
+		mtx_assert(&Giant, MA_NOTOWNED);
+
+		while (sched_runnable() == 0)
+			cpu_idle();
+
+		mtx_lock_spin(&sched_lock);
+		mi_switch(SW_VOL, NULL);
+		mtx_unlock_spin(&sched_lock);
+	}
+}
+
+/*
+ * A CPU is entering for the first time or a thread is exiting.
+ */
+void
+sched_throw(struct thread *td)
+{
+	/*
+	 * Correct spinlock nesting.  The idle thread context that we are
+	 * borrowing was created so that it would start out with a single
+	 * spin lock (sched_lock) held in fork_trampoline().  Since we've
+	 * explicitly acquired locks in this function, the nesting count
+	 * is now 2 rather than 1.  Since we are nested, calling
+	 * spinlock_exit() will simply adjust the counts without allowing
+	 * spin lock using code to interrupt us.
+	 */
+	if (td == NULL) {
+		mtx_lock_spin(&sched_lock);
+		spinlock_exit();
+	} else {
+		MPASS(td->td_lock == &sched_lock);
+	}
+	mtx_assert(&sched_lock, MA_OWNED);
+	KASSERT(curthread->td_md.md_spinlock_count == 1, ("invalid count"));
+	PCPU_SET(switchtime, cpu_ticks());
+	PCPU_SET(switchticks, ticks);
+	cpu_throw(td, choosethread());	/* doesn't return */
 }
+
+void
+sched_fork_exit(struct thread *td)
+{
+
+	/*
+	 * Finish setting up thread glue so that it begins execution in a
+	 * non-nested critical section with sched_lock held but not recursed.
+	 */
+	td->td_oncpu = PCPU_GET(cpuid);
+	sched_lock.mtx_lock = (uintptr_t)td;
+	THREAD_LOCK_ASSERT(td, MA_OWNED | MA_NOTRECURSED);
+}
+
 #define KERN_SWITCH_INCLUDE 1
 #include "kern/kern_switch.c"
Index: imgact_gzip.c
===================================================================
RCS file: /home/cvs/src/sys/kern/imgact_gzip.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/imgact_gzip.c -L sys/kern/imgact_gzip.c -u -r1.2 -r1.3
--- sys/kern/imgact_gzip.c
+++ sys/kern/imgact_gzip.c
@@ -22,7 +22,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/imgact_gzip.c,v 1.54.2.1 2006/03/16 00:25:32 alc Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/imgact_gzip.c,v 1.55.4.1 2008/01/19 18:15:05 kib Exp $");
 
 #include <sys/param.h>
 #include <sys/exec.h>
@@ -239,9 +239,13 @@
 	/*
 	 * Destroy old process VM and create a new one (with a new stack)
 	 */
-	exec_new_vmspace(gz->ip, &aout_sysvec);
+	error = exec_new_vmspace(gz->ip, &aout_sysvec);
 
 	vn_lock(gz->ip->vp, LK_EXCLUSIVE | LK_RETRY, td);
+	if (error) {
+		gz->where = __LINE__;
+		return (error);
+	}
 
 	vmspace = gz->ip->proc->p_vmspace;
 
Index: makesyscalls.sh
===================================================================
RCS file: /home/cvs/src/sys/kern/makesyscalls.sh,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/makesyscalls.sh -L sys/kern/makesyscalls.sh -u -r1.1.1.1 -r1.2
--- sys/kern/makesyscalls.sh
+++ sys/kern/makesyscalls.sh
@@ -1,12 +1,13 @@
 #! /bin/sh -
 #	@(#)makesyscalls.sh	8.1 (Berkeley) 6/10/93
-# $FreeBSD: src/sys/kern/makesyscalls.sh,v 1.62 2005/05/30 15:09:15 rwatson Exp $
+# $FreeBSD: src/sys/kern/makesyscalls.sh,v 1.68 2007/07/04 22:38:28 peter Exp $
 
 set -e
 
 # name of compat options:
 compat=COMPAT_43
 compat4=COMPAT_FREEBSD4
+compat6=COMPAT_FREEBSD6
 
 # output files:
 sysnames="syscalls.c"
@@ -18,21 +19,25 @@
 syscallprefix="SYS_"
 switchname="sysent"
 namesname="syscallnames"
+systrace="systrace_args.c"
 
 # tmp files:
+sysaue="sysent.aue.$$"
 sysdcl="sysent.dcl.$$"
 syscompat="sysent.compat.$$"
 syscompatdcl="sysent.compatdcl.$$"
 syscompat4="sysent.compat4.$$"
 syscompat4dcl="sysent.compat4dcl.$$"
+syscompat6="sysent.compat6.$$"
+syscompat6dcl="sysent.compat6dcl.$$"
 sysent="sysent.switch.$$"
 sysinc="sysinc.switch.$$"
 sysarg="sysarg.switch.$$"
 sysprotoend="sysprotoend.$$"
 
-trap "rm $sysdcl $syscompat $syscompatdcl $syscompat4 $syscompat4dcl $sysent $sysinc $sysarg $sysprotoend" 0
+trap "rm $sysaue $sysdcl $syscompat $syscompatdcl $syscompat4 $syscompat4dcl $syscompat6 $syscompat6dcl $sysent $sysinc $sysarg $sysprotoend" 0
 
-touch $sysdcl $syscompat $syscompatdcl $syscompat4 $syscompat4dcl $sysent $sysinc $sysarg $sysprotoend
+touch $sysaue $sysdcl $syscompat $syscompatdcl $syscompat4 $syscompat4dcl $syscompat6 $syscompat6dcl $sysent $sysinc $sysarg $sysprotoend
 
 case $# in
     0)	echo "usage: $0 input-file <config-file>" 1>&2
@@ -58,6 +63,7 @@
 }
 ' < $1 | awk "
 	BEGIN {
+		sysaue = \"$sysaue\"
 		sysdcl = \"$sysdcl\"
 		sysproto = \"$sysproto\"
 		sysprotoend = \"$sysprotoend\"
@@ -66,6 +72,8 @@
 		syscompatdcl = \"$syscompatdcl\"
 		syscompat4 = \"$syscompat4\"
 		syscompat4dcl = \"$syscompat4dcl\"
+		syscompat6 = \"$syscompat6\"
+		syscompat6dcl = \"$syscompat6dcl\"
 		sysent = \"$sysent\"
 		syssw = \"$syssw\"
 		sysinc = \"$sysinc\"
@@ -73,8 +81,10 @@
 		sysnames = \"$sysnames\"
 		syshdr = \"$syshdr\"
 		sysmk = \"$sysmk\"
+		systrace = \"$systrace\"
 		compat = \"$compat\"
 		compat4 = \"$compat4\"
+		compat6 = \"$compat6\"
 		syscallprefix = \"$syscallprefix\"
 		switchname = \"$switchname\"
 		namesname = \"$namesname\"
@@ -91,6 +101,7 @@
 
 		printf "\n#ifdef %s\n\n", compat > syscompat
 		printf "\n#ifdef %s\n\n", compat4 > syscompat4
+		printf "\n#ifdef %s\n\n", compat6 > syscompat6
 
 		printf "/*\n * System call names.\n *\n" > sysnames
 		printf " * DO NOT EDIT-- this file is automatically generated.\n" > sysnames
@@ -102,6 +113,10 @@
 		printf "# FreeBSD system call names.\n" > sysmk
 		printf "# DO NOT EDIT-- this file is automatically generated.\n" > sysmk
 		printf "# $%s$\n", "FreeBSD" > sysmk
+
+		printf "/*\n * System call argument to DTrace register array conversion.\n *\n" > systrace
+		printf " * DO NOT EDIT-- this file is automatically generated.\n" > systrace
+		printf " * $%s$\n", "FreeBSD" > systrace
 	}
 	NR == 1 {
 		gsub("[$]FreeBSD: ", "", $0)
@@ -117,10 +132,9 @@
 		printf "#define\t%s\n\n", sysproto_h > sysarg
 		printf "#include <sys/signal.h>\n" > sysarg
 		printf "#include <sys/acl.h>\n" > sysarg
-		printf "#include <sys/thr.h>\n" > sysarg
-		printf "#include <sys/umtx.h>\n" > sysarg
-		printf "#include <posix4/_semaphore.h>\n\n" > sysarg
+		printf "#include <sys/_semaphore.h>\n" > sysarg
 		printf "#include <sys/ucontext.h>\n\n" > sysarg
+		printf "#include <bsm/audit_kevents.h>\n\n" > sysarg
 		printf "struct proc;\n\n" > sysarg
 		printf "struct thread;\n\n" > sysarg
 		printf "#define\tPAD_(t)\t(sizeof(register_t) <= sizeof(t) ? \\\n" > sysarg
@@ -140,6 +154,11 @@
 
 		printf "# created from%s\nMIASM = ", $0 > sysmk
 
+		printf " * This file is part of the DTrace syscall provider.\n */\n\n" > systrace
+		printf "static void\nsystrace_args(int sysnum, void *params, u_int64_t *uarg, int *n_args)\n{\n" > systrace
+		printf "\tint64_t *iarg  = (int64_t *) uarg;\n" > systrace
+		printf "\tswitch (sysnum) {\n" > systrace
+
 		next
 	}
 	NF == 0 || $1 ~ /^;/ {
@@ -155,6 +174,7 @@
 		print > sysarg
 		print > syscompat
 		print > syscompat4
+		print > syscompat6
 		print > sysnames
 		savesyscall = syscall
 		next
@@ -165,6 +185,7 @@
 		print > sysarg
 		print > syscompat
 		print > syscompat4
+		print > syscompat6
 		print > sysnames
 		syscall = savesyscall
 		next
@@ -175,6 +196,7 @@
 		print > sysarg
 		print > syscompat
 		print > syscompat4
+		print > syscompat6
 		print > sysnames
 		next
 	}
@@ -243,6 +265,8 @@
 				argalias = "o" argalias
 			if ($3 == "COMPAT4")
 				argalias = "freebsd4_" argalias
+			if ($3 == "COMPAT6")
+				argalias = "freebsd6_" argalias
 		}
 		f++
 
@@ -288,41 +312,28 @@
 		auditev = $2;
 	}
 
-	# The 'M' type prefix
-	#
-	{
-		mpsafe = "SYF_MPSAFE | ";
-		if ($3 == "MSTD") {
-			$3 = "STD";
-		} else if ($3 == "MNODEF") {
-			$3 = "NODEF";
-		} else if ($3 == "MNOARGS") {
-			$3 = "NOARGS";
-		} else if ($3 == "MNOPROTO") {
-			$3 = "NOPROTO";
-		} else if ($3 == "MNOIMPL") {
-			$3 = "NOIMPL";
-		} else if ($3 == "MNOSTD") {
-			$3 = "NOSTD";
-		} else if ($3 == "MCOMPAT") {
-			$3 = "COMPAT";
-		} else if ($3 == "MCOMPAT4") {
-			$3 = "COMPAT4";
-		} else if ($3 == "MCPT_NOA") {
-			$3 = "CPT_NOA";
-		} else if ($3 == "MLIBCOMPAT") {
-			$3 = "LIBCOMPAT";
-		} else if ($3 == "MOBSOL") {
-			$3 = "OBSOL";
-		} else if ($3 == "MUNIMPL") {
-			$3 = "UNIMPL";
-		} else {
-			mpsafe = "";
-		}
-	}
 	$3 == "STD" || $3 == "NODEF" || $3 == "NOARGS"  || $3 == "NOPROTO" \
 	    || $3 == "NOIMPL" || $3 == "NOSTD" {
 		parseline()
+		printf("\t/* %s */\n\tcase %d: {\n", funcname, syscall) > systrace
+		if (argc > 0) {
+			printf("\t\tstruct %s *p = params;\n", argalias) > systrace
+			for (i = 1; i <= argc; i++) {
+				if (index(argtype[i], "*") > 0 || argtype[i] == "caddr_t")
+					printf("\t\tuarg[%d] = (intptr_t) p->%s; /* %s */\n", \
+					     i - 1, \
+					     argname[i], argtype[i]) > systrace
+				else if (substr(argtype[i], 1, 1) == "u" || argtype[i] == "size_t")
+					printf("\t\tuarg[%d] = p->%s; /* %s */\n", \
+					     i - 1, \
+					     argname[i], argtype[i]) > systrace
+				else
+					printf("\t\tiarg[%d] = p->%s; /* %s */\n", \
+					     i - 1, \
+					     argname[i], argtype[i]) > systrace
+			}
+		}
+		printf("\t\t*n_args = %d;\n\t\tbreak;\n\t}\n", argc) > systrace
 		if ((!nosys || funcname != "nosys") && \
 		    (funcname != "lkmnosys") && (funcname != "lkmressys")) {
 			if (argc != 0 && $3 != "NOARGS" && $3 != "NOPROTO") {
@@ -347,21 +358,23 @@
 			printf("%s\t%s(struct thread *, struct %s *)",
 			    rettype, funcname, argalias) > sysdcl
 			printf(";\n") > sysdcl
+			printf("#define\t%sAUE_%s\t%s\n", syscallprefix,
+			    funcalias, auditev) > sysaue
 		}
 		if (funcname == "nosys")
 			nosys = 1
 		if (funcname == "lkmnosys")
 			lkmnosys = 1
-		printf("\t{ %s%s, (sy_call_t *)", mpsafe, argssize) > sysent
-		column = 8 + 2 + length(mpsafe) + length(argssize) + 15
+		printf("\t{ %s, (sy_call_t *)", argssize) > sysent
+		column = 8 + 2 + length(argssize) + 15
 		if ($3 == "NOIMPL") {
-			printf("%s },", "nosys, AUE_NULL") > sysent
+			printf("%s },", "nosys, AUE_NULL, NULL, 0, 0") > sysent
 			column = column + length("nosys") + 3
 		} else if ($3 == "NOSTD") {
-			printf("%s },", "lkmressys, AUE_NULL") > sysent
+			printf("%s },", "lkmressys, AUE_NULL, NULL, 0, 0") > sysent
 			column = column + length("lkmressys") + 3
 		} else {
-			printf("%s, %s },", funcname, auditev) > sysent
+			printf("%s, %s, NULL, 0, 0 },", funcname, auditev) > sysent
 			column = column + length(funcname) + length(auditev) + 3
 		} 
 		align_sysent_comment(column)
@@ -376,7 +389,7 @@
 		syscall++
 		next
 	}
-	$3 == "COMPAT" || $3 == "COMPAT4" || $3 == "CPT_NOA" {
+	$3 == "COMPAT" || $3 == "COMPAT4" || $3 == "COMPAT6" || $3 == "CPT_NOA" {
 		if ($3 == "COMPAT" || $3 == "CPT_NOA") {
 			ncompat++
 			out = syscompat
@@ -389,6 +402,12 @@
 			outdcl = syscompat4dcl
 			wrap = "compat4"
 			prefix = "freebsd4_"
+		} else if ($3 == "COMPAT6") {
+			ncompat6++
+			out = syscompat6
+			outdcl = syscompat6dcl
+			wrap = "compat6"
+			prefix = "freebsd6_"
 		}
 		parseline()
 		if (argc != 0 && $3 != "CPT_NOA") {
@@ -406,15 +425,21 @@
 			    argalias) > sysarg
 		printf("%s\t%s%s(struct thread *, struct %s *);\n",
 		    rettype, prefix, funcname, argalias) > outdcl
-		printf("\t{ %s(%s%s,%s), %s },",
-		    wrap, mpsafe, argssize, funcname, auditev) > sysent
-		align_sysent_comment(8 + 9 + length(mpsafe) + \
+		printf("\t{ %s(%s,%s), %s, NULL, 0, 0 },",
+		    wrap, argssize, funcname, auditev) > sysent
+		align_sysent_comment(8 + 9 + \
 		    length(argssize) + 1 + length(funcname) + length(auditev) + 4)
 		printf("/* %d = old %s */\n", syscall, funcalias) > sysent
-		printf("\t\"old.%s\",\t\t/* %d = old %s */\n",
-		    funcalias, syscall, funcalias) > sysnames
-		printf("\t\t\t\t/* %d is old %s */\n",
-		    syscall, funcalias) > syshdr
+		printf("\t\"%s.%s\",\t\t/* %d = old %s */\n",
+		    wrap, funcalias, syscall, funcalias) > sysnames
+		if ($3 == "COMPAT" || $3 == "CPT_NOA") {
+			printf("\t\t\t\t/* %d is old %s */\n",
+			    syscall, funcalias) > syshdr
+		} else {
+			printf("#define\t%s%s%s\t%d\n", syscallprefix,
+			    prefix, funcalias, syscall) > syshdr
+			printf(" \\\n\t%s%s.o", prefix, funcalias) > sysmk
+		}
 		syscall++
 		next
 	}
@@ -422,9 +447,9 @@
 		ncompat++
 		parseline()
 		printf("%s\to%s();\n", rettype, funcname) > syscompatdcl
-		printf("\t{ compat(%s%s,%s), %s },",
-		    mpsafe, argssize, funcname, auditev) > sysent
-		align_sysent_comment(8 + 9 + length(mpsafe) + \
+		printf("\t{ compat(%s,%s), %s, NULL, 0, 0 },",
+		    argssize, funcname, auditev) > sysent
+		align_sysent_comment(8 + 9 + \
 		    length(argssize) + 1 + length(funcname) + length(auditev) + 4)
 		printf("/* %d = old %s */\n", syscall, funcalias) > sysent
 		printf("\t\"old.%s\",\t\t/* %d = old %s */\n",
@@ -436,7 +461,7 @@
 		next
 	}
 	$3 == "OBSOL" {
-		printf("\t{ 0, (sy_call_t *)nosys, AUE_NULL },") > sysent
+		printf("\t{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },") > sysent
 		align_sysent_comment(34)
 		printf("/* %d = obsolete %s */\n", syscall, comment) > sysent
 		printf("\t\"obs_%s\",\t\t\t/* %d = obsolete %s */\n",
@@ -447,7 +472,7 @@
 		next
 	}
 	$3 == "UNIMPL" {
-		printf("\t{ 0, (sy_call_t *)nosys, AUE_NULL },\t\t\t/* %d = %s */\n",
+		printf("\t{ 0, (sy_call_t *)nosys, AUE_NULL, NULL, 0, 0 },\t\t\t/* %d = %s */\n",
 		    syscall, comment) > sysent
 		printf("\t\"#%d\",\t\t\t/* %d = %s */\n",
 		    syscall, syscall, comment) > sysnames
@@ -461,7 +486,7 @@
 	END {
 		printf "\n#define AS(name) (sizeof(struct name) / sizeof(register_t))\n" > sysinc
 
-		if (ncompat != 0 || ncompat4 != 0)
+		if (ncompat != 0 || ncompat4 != 0 || ncompat6 != 0)
 			printf "#include \"opt_compat.h\"\n\n" > syssw
 		printf "#include \<bsm/audit_kevents.h\>\n" > syssw
 
@@ -481,11 +506,19 @@
 			printf "#endif\n" > sysinc
 		}
 
-		printf("\n#endif /* %s */\n\n", compat) > syscompatdcl
+		if (ncompat6 != 0) {
+			printf "\n#ifdef %s\n", compat6 > sysinc
+			printf "#define compat6(n, name) n, (sy_call_t *)__CONCAT(freebsd6_,name)\n" > sysinc
+			printf "#else\n" > sysinc
+			printf "#define compat6(n, name) 0, (sy_call_t *)nosys\n" > sysinc
+			printf "#endif\n" > sysinc
+		}
 
+		printf("\n#endif /* %s */\n\n", compat) > syscompatdcl
 		printf("\n#endif /* %s */\n\n", compat4) > syscompat4dcl
+		printf("\n#endif /* %s */\n\n", compat6) > syscompat6dcl
 
-		printf("#undef PAD_\n") > sysprotoend
+		printf("\n#undef PAD_\n") > sysprotoend
 		printf("#undef PADL_\n") > sysprotoend
 		printf("#undef PADR_\n") > sysprotoend
 		printf("\n#endif /* !%s */\n", sysproto_h) > sysprotoend
@@ -495,11 +528,13 @@
 		printf("};\n") > sysnames
 		printf("#define\t%sMAXSYSCALL\t%d\n", syscallprefix, syscall) \
 		    > syshdr
+		printf "\tdefault:\n\t\t*n_args = 0;\n\t\tbreak;\n\t};\n}\n" > systrace
 	} '
 
 cat $sysinc $sysent >> $syssw
 cat $sysarg $sysdcl \
 	$syscompat $syscompatdcl \
 	$syscompat4 $syscompat4dcl \
-	$sysprotoend > $sysproto
+	$syscompat6 $syscompat6dcl \
+	$sysaue $sysprotoend > $sysproto
 
--- /dev/null
+++ sys/kern/serdev_if.m
@@ -0,0 +1,94 @@
+#-
+# Copyright (c) 2006 Marcel Moolenaar
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# $FreeBSD: src/sys/kern/serdev_if.m,v 1.2 2006/04/23 22:12:39 marcel Exp $
+#
+
+#include <sys/bus.h>
+#include <sys/serial.h>
+
+# The serdev interface is used by umbrella drivers and children thereof to
+# establish a more intimate relationship, necessary for efficient handling
+# of multiple (concurrent) serial communication channels.  Examples include
+# serial communications controller (SCC) drivers, multi-I/O adapter drivers
+# and intelligent multi-port serial drivers.  Methods specifically deal
+# with interrupt handling and configuration.  Conceptually, the umbrella
+# driver is responsible for the overall operation of the hardware and uses
+# child drivers to handle each individual channel.
+# The serdev interface is intended to inherit the device interface.
+
+INTERFACE serdev;
+
+# Default implementations of some methods.
+CODE {
+	static serdev_intr_t *
+	default_ihand(device_t dev, int ipend)
+	{
+		return (NULL);
+	}
+
+	static int
+	default_ipend(device_t dev)
+	{
+		return (-1);
+	}
+
+	static int
+	default_sysdev(device_t dev)
+	{
+		return (0);
+	}
+};
+
+# ihand() - Query serial device interrupt handler.
+# This method is called by the umbrella driver to obtain function pointers
+# to interrupt handlers for each individual interrupt source. This allows
+# the umbrella driver to control the servicing of interrupts between the
+# different channels in the most flexible way.
+METHOD serdev_intr_t* ihand {
+	device_t dev;
+	int ipend;
+} DEFAULT default_ihand;
+
+# ipend() - Query pending interrupt status.
+# This method is called by the umbrella driver to obtain interrupt status
+# for the UART in question. This allows the umbrella driver to build a
+# matrix and service the interrupts in the most flexible way by calling
+# interrupt handlers collected with the ihand() method.
+METHOD int ipend {
+	device_t dev;
+} DEFAULT default_ipend;
+
+# sysdev() - Query system device status.
+# This method may be called by the umbrella driver for each child driver
+# to establish if a particular channel and mode is currently being used
+# for system specific usage. If this is the case, the hardware is not
+# reset and the channel will not change its operation mode.
+# The return value is !0 if the channel and mode are used for a system
+# device and 0 otherwise.
+METHOD int sysdev {
+	device_t dev;
+} DEFAULT default_sysdev;
+
Index: kern_linker.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_linker.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/kern_linker.c -L sys/kern/kern_linker.c -u -r1.1.1.1 -r1.2
--- sys/kern/kern_linker.c
+++ sys/kern/kern_linker.c
@@ -25,9 +25,10 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_linker.c,v 1.117.2.1 2005/11/04 17:05:13 jdp Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_linker.c,v 1.149 2007/05/31 11:51:51 kib Exp $");
 
 #include "opt_ddb.h"
+#include "opt_hwpmc_hooks.h"
 #include "opt_mac.h"
 
 #include <sys/param.h>
@@ -36,31 +37,57 @@
 #include <sys/malloc.h>
 #include <sys/sysproto.h>
 #include <sys/sysent.h>
+#include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/sx.h>
-#include <sys/mac.h>
 #include <sys/module.h>
+#include <sys/mount.h>
 #include <sys/linker.h>
 #include <sys/fcntl.h>
 #include <sys/libkern.h>
 #include <sys/namei.h>
 #include <sys/vnode.h>
+#include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 
+#include <security/mac/mac_framework.h>
+
 #include "linker_if.h"
 
+#ifdef HWPMC_HOOKS
+#include <sys/pmckern.h>
+#endif
+
 #ifdef KLD_DEBUG
 int kld_debug = 0;
 #endif
 
+#define	KLD_LOCK()		sx_xlock(&kld_sx)
+#define	KLD_UNLOCK()		sx_xunlock(&kld_sx)
+#define	KLD_LOCKED()		sx_xlocked(&kld_sx)
+#define	KLD_LOCK_ASSERT() do {						\
+	if (!cold)							\
+		sx_assert(&kld_sx, SX_XLOCKED);				\
+} while (0)
+
 /*
  * static char *linker_search_path(const char *name, struct mod_depend
  * *verinfo);
  */
 static const char 	*linker_basename(const char *path);
 
+/*
+ * Find a currently loaded file given its filename.
+ */
+static linker_file_t linker_find_file_by_name(const char* _filename);
+
+/*
+ * Find a currently loaded file given its file id.
+ */
+static linker_file_t linker_find_file_by_id(int _fileid);
+
 /* Metadata from the static kernel */
 SET_DECLARE(modmetadata_set, struct mod_metadata);
 
@@ -68,7 +95,7 @@
 
 linker_file_t linker_kernel_file;
 
-static struct mtx kld_mtx;	/* kernel linker mutex */
+static struct sx kld_sx;	/* kernel linker lock */
 
 static linker_class_list_t classes;
 static linker_file_list_t linker_files;
@@ -78,17 +105,15 @@
 #define	LINKER_GET_NEXT_FILE_ID(a) do {					\
 	linker_file_t lftmp;						\
 									\
+	KLD_LOCK_ASSERT();						\
 retry:									\
-	mtx_lock(&kld_mtx);						\
 	TAILQ_FOREACH(lftmp, &linker_files, link) {			\
 		if (next_file_id == lftmp->id) {			\
 			next_file_id++;					\
-			mtx_unlock(&kld_mtx);				\
 			goto retry;					\
 		}							\
 	}								\
 	(a) = next_file_id;						\
-	mtx_unlock(&kld_mtx);	/* Hold for safe read of id variable */	\
 } while(0)
 
 
@@ -103,8 +128,14 @@
 typedef struct modlist *modlist_t;
 static modlisthead_t found_modules;
 
-static modlist_t	modlist_lookup2(const char *name,
-			    struct mod_depend *verinfo);
+static int	linker_file_add_dependency(linker_file_t file,
+		    linker_file_t dep);
+static caddr_t	linker_file_lookup_symbol_internal(linker_file_t file,
+		    const char* name, int deps);
+static int	linker_load_module(const char *kldname,
+		    const char *modname, struct linker_file *parent,
+		    struct mod_depend *verinfo, struct linker_file **lfpp);
+static modlist_t modlist_lookup2(const char *name, struct mod_depend *verinfo);
 
 static char *
 linker_strdup(const char *str)
@@ -120,7 +151,7 @@
 linker_init(void *arg)
 {
 
-	mtx_init(&kld_mtx, "kernel linker", NULL, MTX_DEF);
+	sx_init(&kld_sx, "kernel linker");
 	TAILQ_INIT(&classes);
 	TAILQ_INIT(&linker_files);
 }
@@ -166,7 +197,7 @@
 	/*
 	 * Perform a bubble sort of the system initialization objects by
 	 * their subsystem (primary key) and order (secondary key).
-	 * 
+	 *
 	 * Since some things care about execution order, this is the operation
 	 * which ensures continued function.
 	 */
@@ -186,6 +217,7 @@
 	 * Traverse the (now) ordered list of system initialization tasks.
 	 * Perform each task, and continue on to the next task.
 	 */
+	mtx_lock(&Giant);
 	for (sipp = start; sipp < stop; sipp++) {
 		if ((*sipp)->subsystem == SI_SUB_DUMMY)
 			continue;	/* skip dummy task(s) */
@@ -193,6 +225,7 @@
 		/* Call function */
 		(*((*sipp)->func)) ((*sipp)->udata);
 	}
+	mtx_unlock(&Giant);
 }
 
 static void
@@ -210,7 +243,7 @@
 	/*
 	 * Perform a reverse bubble sort of the system initialization objects
 	 * by their subsystem (primary key) and order (secondary key).
-	 * 
+	 *
 	 * Since some things care about execution order, this is the operation
 	 * which ensures continued function.
 	 */
@@ -230,6 +263,7 @@
 	 * Traverse the (now) ordered list of system initialization tasks.
 	 * Perform each task, and continue on to the next task.
 	 */
+	mtx_lock(&Giant);
 	for (sipp = start; sipp < stop; sipp++) {
 		if ((*sipp)->subsystem == SI_SUB_DUMMY)
 			continue;	/* skip dummy task(s) */
@@ -237,6 +271,7 @@
 		/* Call function */
 		(*((*sipp)->func)) ((*sipp)->udata);
 	}
+	mtx_unlock(&Giant);
 }
 
 static void
@@ -251,8 +286,10 @@
 	if (linker_file_lookup_set(lf, "sysctl_set", &start, &stop, NULL) != 0)
 		return;
 
+	mtx_lock(&Giant);
 	for (oidp = start; oidp < stop; oidp++)
 		sysctl_register_oid(*oidp);
+	mtx_unlock(&Giant);
 }
 
 static void
@@ -266,8 +303,10 @@
 	if (linker_file_lookup_set(lf, "sysctl_set", &start, &stop, NULL) != 0)
 		return;
 
+	mtx_lock(&Giant);
 	for (oidp = start; oidp < stop; oidp++)
 		sysctl_unregister_oid(*oidp);
+	mtx_unlock(&Giant);
 }
 
 static int
@@ -281,7 +320,7 @@
 	    " in %s\n", lf->filename));
 
 	if (linker_file_lookup_set(lf, "modmetadata_set", &start,
-	    &stop, 0) != 0) {
+	    &stop, NULL) != 0) {
 		/*
 		 * This fallback should be unnecessary, but if we get booted
 		 * from boot2 instead of loader and we are missing our
@@ -325,22 +364,23 @@
 {
 	linker_class_t lc;
 	linker_file_t lf;
-	int foundfile, error = 0;
+	int foundfile, error;
 
 	/* Refuse to load modules if securelevel raised */
 	if (securelevel > 0)
 		return (EPERM);
 
+	KLD_LOCK_ASSERT();
 	lf = linker_find_file_by_name(filename);
 	if (lf) {
 		KLD_DPF(FILE, ("linker_load_file: file %s is already loaded,"
 		    " incrementing refs\n", filename));
 		*result = lf;
 		lf->refs++;
-		goto out;
+		return (0);
 	}
-	lf = NULL;
 	foundfile = 0;
+	error = 0;
 
 	/*
 	 * We do not need to protect (lock) classes here because there is
@@ -361,14 +401,15 @@
 			error = linker_file_register_modules(lf);
 			if (error == EEXIST) {
 				linker_file_unload(lf, LINKER_UNLOAD_FORCE);
-				goto out;
+				return (error);
 			}
+			KLD_UNLOCK();
 			linker_file_register_sysctls(lf);
 			linker_file_sysinit(lf);
+			KLD_LOCK();
 			lf->flags |= LINKER_FILE_LINKED;
 			*result = lf;
-			error = 0;
-			goto out;
+			return (0);
 		}
 	}
 	/*
@@ -388,7 +429,6 @@
 			error = ENOEXEC;
 	} else
 		error = ENOENT;		/* Nothing found */
-out:
 	return (error);
 }
 
@@ -397,67 +437,107 @@
     linker_file_t *result)
 {
 	modlist_t mod;
+	int error;
 
+	KLD_LOCK();
 	if ((mod = modlist_lookup2(modname, verinfo)) != NULL) {
 		*result = mod->container;
 		(*result)->refs++;
+		KLD_UNLOCK();
 		return (0);
 	}
 
-	return (linker_load_module(NULL, modname, NULL, verinfo, result));
+	error = linker_load_module(NULL, modname, NULL, verinfo, result);
+	KLD_UNLOCK();
+	return (error);
 }
 
-linker_file_t
+int
+linker_release_module(const char *modname, struct mod_depend *verinfo,
+    linker_file_t lf)
+{
+	modlist_t mod;
+	int error;
+
+	KLD_LOCK();
+	if (lf == NULL) {
+		KASSERT(modname != NULL,
+		    ("linker_release_module: no file or name"));
+		mod = modlist_lookup2(modname, verinfo);
+		if (mod == NULL) {
+			KLD_UNLOCK();
+			return (ESRCH);
+		}
+		lf = mod->container;
+	} else
+		KASSERT(modname == NULL && verinfo == NULL,
+		    ("linker_release_module: both file and name"));
+	error =	linker_file_unload(lf, LINKER_UNLOAD_NORMAL);
+	KLD_UNLOCK();
+	return (error);
+}
+
+static linker_file_t
 linker_find_file_by_name(const char *filename)
 {
-	linker_file_t lf = 0;
+	linker_file_t lf;
 	char *koname;
 
 	koname = malloc(strlen(filename) + 4, M_LINKER, M_WAITOK);
-	if (koname == NULL)
-		goto out;
 	sprintf(koname, "%s.ko", filename);
 
-	mtx_lock(&kld_mtx);
+	KLD_LOCK_ASSERT();
 	TAILQ_FOREACH(lf, &linker_files, link) {
 		if (strcmp(lf->filename, koname) == 0)
 			break;
 		if (strcmp(lf->filename, filename) == 0)
 			break;
 	}
-	mtx_unlock(&kld_mtx);
-out:
-	if (koname)
-		free(koname, M_LINKER);
+	free(koname, M_LINKER);
 	return (lf);
 }
 
-linker_file_t
+static linker_file_t
 linker_find_file_by_id(int fileid)
 {
-	linker_file_t lf = 0;
-	
-	mtx_lock(&kld_mtx);
+	linker_file_t lf;
+
+	KLD_LOCK_ASSERT();
 	TAILQ_FOREACH(lf, &linker_files, link)
-		if (lf->id == fileid)
+		if (lf->id == fileid && lf->flags & LINKER_FILE_LINKED)
 			break;
-	mtx_unlock(&kld_mtx);
 	return (lf);
 }
 
+int
+linker_file_foreach(linker_predicate_t *predicate, void *context)
+{
+	linker_file_t lf;
+	int retval = 0;
+
+	KLD_LOCK();
+	TAILQ_FOREACH(lf, &linker_files, link) {
+		retval = predicate(lf, context);
+		if (retval != 0)
+			break;
+	}
+	KLD_UNLOCK();
+	return (retval);
+}
+
 linker_file_t
 linker_make_file(const char *pathname, linker_class_t lc)
 {
 	linker_file_t lf;
 	const char *filename;
 
-	lf = NULL;
+	KLD_LOCK_ASSERT();
 	filename = linker_basename(pathname);
 
 	KLD_DPF(FILE, ("linker_make_file: new file, filename=%s\n", filename));
 	lf = (linker_file_t)kobj_create((kobj_class_t)lc, M_LINKER, M_WAITOK);
 	if (lf == NULL)
-		goto out;
+		return (NULL);
 	lf->refs = 1;
 	lf->userrefs = 0;
 	lf->flags = 0;
@@ -467,10 +547,7 @@
 	lf->deps = NULL;
 	STAILQ_INIT(&lf->common);
 	TAILQ_INIT(&lf->modules);
-	mtx_lock(&kld_mtx);
 	TAILQ_INSERT_TAIL(&linker_files, lf, link);
-	mtx_unlock(&kld_mtx);
-out:
 	return (lf);
 }
 
@@ -482,66 +559,59 @@
 	struct common_symbol *cp;
 	int error, i;
 
-	error = 0;
-
 	/* Refuse to unload modules if securelevel raised. */
 	if (securelevel > 0)
 		return (EPERM);
-#ifdef MAC
-	error = mac_check_kld_unload(curthread->td_ucred);
-	if (error)
-		return (error);
-#endif
 
+	KLD_LOCK_ASSERT();
 	KLD_DPF(FILE, ("linker_file_unload: lf->refs=%d\n", file->refs));
-	if (file->refs == 1) {
-		KLD_DPF(FILE, ("linker_file_unload: file is unloading,"
-		    " informing modules\n"));
+
+	/* Easy case of just dropping a reference. */
+	if (file->refs > 1) {
+		file->refs--;
+		return (0);
+	}
+
+	KLD_DPF(FILE, ("linker_file_unload: file is unloading,"
+	    " informing modules\n"));
+
+	/*
+	 * Inform any modules associated with this file.
+	 */
+	MOD_XLOCK;
+	for (mod = TAILQ_FIRST(&file->modules); mod; mod = next) {
+		next = module_getfnext(mod);
+		MOD_XUNLOCK;
 
 		/*
-		 * Inform any modules associated with this file.
+		 * Give the module a chance to veto the unload.
 		 */
-		MOD_XLOCK;
-		for (mod = TAILQ_FIRST(&file->modules); mod; mod = next) {
-			next = module_getfnext(mod);
-			MOD_XUNLOCK;
-
-			/*
-			 * Give the module a chance to veto the unload.
-			 */
-			if ((error = module_unload(mod, flags)) != 0) {
-				KLD_DPF(FILE, ("linker_file_unload: module %p"
-				    " vetoes unload\n", mod));
-				goto out;
-			} else
-				MOD_XLOCK;
-			module_release(mod);
+		if ((error = module_unload(mod, flags)) != 0) {
+			KLD_DPF(FILE, ("linker_file_unload: module %p"
+			    " vetoes unload\n", mod));
+			return (error);
 		}
-		MOD_XUNLOCK;
-	}
-	file->refs--;
-	if (file->refs > 0) {
-		goto out;
+		MOD_XLOCK;
+		module_release(mod);
 	}
-	for (ml = TAILQ_FIRST(&found_modules); ml; ml = nextml) {
-		nextml = TAILQ_NEXT(ml, link);
+	MOD_XUNLOCK;
+
+	TAILQ_FOREACH_SAFE(ml, &found_modules, link, nextml) {
 		if (ml->container == file) {
 			TAILQ_REMOVE(&found_modules, ml, link);
 			free(ml, M_LINKER);
 		}
 	}
 
-	/* 
-	 * Don't try to run SYSUNINITs if we are unloaded due to a 
+	/*
+	 * Don't try to run SYSUNINITs if we are unloaded due to a
 	 * link error.
 	 */
 	if (file->flags & LINKER_FILE_LINKED) {
 		linker_file_sysuninit(file);
 		linker_file_unregister_sysctls(file);
 	}
-	mtx_lock(&kld_mtx);
 	TAILQ_REMOVE(&linker_files, file, link);
-	mtx_unlock(&kld_mtx);
 
 	if (file->deps) {
 		for (i = 0; i < file->ndeps; i++)
@@ -549,9 +619,8 @@
 		free(file->deps, M_LINKER);
 		file->deps = NULL;
 	}
-	for (cp = STAILQ_FIRST(&file->common); cp;
-	    cp = STAILQ_FIRST(&file->common)) {
-		STAILQ_REMOVE(&file->common, cp, common_symbol, link);
+	while ((cp = STAILQ_FIRST(&file->common)) != NULL) {
+		STAILQ_REMOVE_HEAD(&file->common, link);
 		free(cp, M_LINKER);
 	}
 
@@ -561,15 +630,15 @@
 		file->filename = NULL;
 	}
 	kobj_delete((kobj_t) file, M_LINKER);
-out:
-	return (error);
+	return (0);
 }
 
-int
+static int
 linker_file_add_dependency(linker_file_t file, linker_file_t dep)
 {
 	linker_file_t *newdeps;
 
+	KLD_LOCK_ASSERT();
 	newdeps = malloc((file->ndeps + 1) * sizeof(linker_file_t *),
 	    M_LINKER, M_WAITOK | M_ZERO);
 	if (newdeps == NULL)
@@ -588,25 +657,51 @@
 
 /*
  * Locate a linker set and its contents.  This is a helper function to avoid
- * linker_if.h exposure elsewhere.  Note: firstp and lastp are really void ***
+ * linker_if.h exposure elsewhere.  Note: firstp and lastp are really void **.
+ * This function is used in this file so we can avoid having lots of (void **)
+ * casts.
  */
 int
 linker_file_lookup_set(linker_file_t file, const char *name,
     void *firstp, void *lastp, int *countp)
 {
+	int error, locked;
 
-	return (LINKER_LOOKUP_SET(file, name, firstp, lastp, countp));
+	locked = KLD_LOCKED();
+	if (!locked)
+		KLD_LOCK();
+	error = LINKER_LOOKUP_SET(file, name, firstp, lastp, countp);
+	if (!locked)
+		KLD_UNLOCK();
+	return (error);
 }
 
 caddr_t
 linker_file_lookup_symbol(linker_file_t file, const char *name, int deps)
 {
+	caddr_t sym;
+	int locked;
+
+	locked = KLD_LOCKED();
+	if (!locked)
+		KLD_LOCK();
+	sym = linker_file_lookup_symbol_internal(file, name, deps);
+	if (!locked)
+		KLD_UNLOCK();
+	return (sym);
+}
+
+static caddr_t
+linker_file_lookup_symbol_internal(linker_file_t file, const char *name,
+    int deps)
+{
 	c_linker_sym_t sym;
 	linker_symval_t symval;
 	caddr_t address;
 	size_t common_size = 0;
 	int i;
 
+	KLD_LOCK_ASSERT();
 	KLD_DPF(SYM, ("linker_file_lookup_symbol: file=%p, name=%s, deps=%d\n",
 	    file, name, deps));
 
@@ -627,8 +722,8 @@
 	}
 	if (deps) {
 		for (i = 0; i < file->ndeps; i++) {
-			address = linker_file_lookup_symbol(file->deps[i],
-			    name, 0);
+			address = linker_file_lookup_symbol_internal(
+			    file->deps[i], name, 0);
 			if (address) {
 				KLD_DPF(SYM, ("linker_file_lookup_symbol:"
 				    " deps value=%p\n", address));
@@ -658,10 +753,6 @@
 		cp = malloc(sizeof(struct common_symbol)
 		    + common_size + strlen(name) + 1, M_LINKER,
 		    M_WAITOK | M_ZERO);
-		if (cp == NULL) {
-			KLD_DPF(SYM, ("linker_file_lookup_symbol: nomem\n"));
-			return (0);
-		}
 		cp->address = (caddr_t)(cp + 1);
 		cp->name = cp->address + common_size;
 		strcpy(cp->name, name);
@@ -680,7 +771,7 @@
 /*
  * DDB Helpers.  DDB has to look across multiple files with their own symbol
  * tables and string tables.
- * 
+ *
  * Note that we do not obey list locking protocols here.  We really don't need
  * DDB to hang because somebody's got the lock held.  We'll take the chance
  * that the files list is inconsistent instead.
@@ -745,73 +836,87 @@
 /*
  * Syscalls.
  */
-/*
- * MPSAFE
- */
 int
-kldload(struct thread *td, struct kldload_args *uap)
+kern_kldload(struct thread *td, const char *file, int *fileid)
 {
-	char *kldname, *modname;
-	char *pathname = NULL;
+#ifdef HWPMC_HOOKS
+	struct pmckern_map_in pkm;
+#endif
+	const char *kldname, *modname;
 	linker_file_t lf;
-	int error = 0;
-
-	td->td_retval[0] = -1;
-
-	mtx_lock(&Giant);
+	int error;
 
 	if ((error = securelevel_gt(td->td_ucred, 0)) != 0)
-		goto out;
-
-	if ((error = suser(td)) != 0)
-		goto out;
+		return (error);
 
-	pathname = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
-	if ((error = copyinstr(uap->file, pathname, MAXPATHLEN, NULL)) != 0)
-		goto out;
+	if ((error = priv_check(td, PRIV_KLD_LOAD)) != 0)
+		return (error);
 
 	/*
-	 * If path do not contain qualified name or any dot in it
-	 * (kldname.ko, or kldname.ver.ko) treat it as interface
+	 * If file does not contain a qualified name or any dot in it
+	 * (kldname.ko, or kldname.ver.ko) treat it as an interface
 	 * name.
 	 */
-	if (index(pathname, '/') || index(pathname, '.')) {
-		kldname = pathname;
+	if (index(file, '/') || index(file, '.')) {
+		kldname = file;
 		modname = NULL;
 	} else {
 		kldname = NULL;
-		modname = pathname;
+		modname = file;
 	}
+
+	KLD_LOCK();
 	error = linker_load_module(kldname, modname, NULL, NULL, &lf);
 	if (error)
-		goto out;
-
+		goto unlock;
+#ifdef HWPMC_HOOKS
+	pkm.pm_file = lf->filename;
+	pkm.pm_address = (uintptr_t) lf->address;
+	PMC_CALL_HOOK(td, PMC_FN_KLD_LOAD, (void *) &pkm);
+#endif
 	lf->userrefs++;
-	td->td_retval[0] = lf->id;
-out:
-	if (pathname)
-		free(pathname, M_TEMP);
-	mtx_unlock(&Giant);
+	if (fileid != NULL)
+		*fileid = lf->id;
+unlock:
+	KLD_UNLOCK();
 	return (error);
 }
 
-/*
- * MPSAFE
- */
-static int
+int
+kldload(struct thread *td, struct kldload_args *uap)
+{
+	char *pathname = NULL;
+	int error, fileid;
+
+	td->td_retval[0] = -1;
+
+	pathname = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
+	error = copyinstr(uap->file, pathname, MAXPATHLEN, NULL);
+	if (error == 0) {
+		error = kern_kldload(td, pathname, &fileid);
+		if (error == 0)
+			td->td_retval[0] = fileid;
+	}
+	free(pathname, M_TEMP);
+	return (error);
+}
+
+int
 kern_kldunload(struct thread *td, int fileid, int flags)
 {
+#ifdef HWPMC_HOOKS
+	struct pmckern_map_out pkm;
+#endif
 	linker_file_t lf;
 	int error = 0;
 
-	mtx_lock(&Giant);
-
 	if ((error = securelevel_gt(td->td_ucred, 0)) != 0)
-		goto out;
+		return (error);
 
-	if ((error = suser(td)) != 0)
-		goto out;
+	if ((error = priv_check(td, PRIV_KLD_UNLOAD)) != 0)
+		return (error);
 
+	KLD_LOCK();
 	lf = linker_find_file_by_id(fileid);
 	if (lf) {
 		KLD_DPF(FILE, ("kldunload: lf->userrefs=%d\n", lf->userrefs));
@@ -822,22 +927,28 @@
 			printf("kldunload: attempt to unload file that was"
 			    " loaded by the kernel\n");
 			error = EBUSY;
-			goto out;
+		} else {
+#ifdef HWPMC_HOOKS
+			/* Save data needed by hwpmc(4) before unloading. */
+			pkm.pm_address = (uintptr_t) lf->address;
+			pkm.pm_size = lf->size;
+#endif
+			lf->userrefs--;
+			error = linker_file_unload(lf, flags);
+			if (error)
+				lf->userrefs++;
 		}
-		lf->userrefs--;
-		error = linker_file_unload(lf, flags);
-		if (error)
-			lf->userrefs++;
 	} else
 		error = ENOENT;
-out:
-	mtx_unlock(&Giant);
+
+#ifdef HWPMC_HOOKS
+	if (error == 0)
+		PMC_CALL_HOOK(td, PMC_FN_KLD_UNLOAD, (void *) &pkm);
+#endif
+	KLD_UNLOCK();
 	return (error);
 }
 
-/*
- * MPSAFE
- */
 int
 kldunload(struct thread *td, struct kldunload_args *uap)
 {
@@ -845,9 +956,6 @@
 	return (kern_kldunload(td, uap->fileid, LINKER_UNLOAD_NORMAL));
 }
 
-/*
- * MPSAFE
- */
 int
 kldunloadf(struct thread *td, struct kldunloadf_args *uap)
 {
@@ -858,16 +966,13 @@
 	return (kern_kldunload(td, uap->fileid, uap->flags));
 }
 
-/*
- * MPSAFE
- */
 int
 kldfind(struct thread *td, struct kldfind_args *uap)
 {
 	char *pathname;
 	const char *filename;
 	linker_file_t lf;
-	int error = 0;
+	int error;
 
 #ifdef MAC
 	error = mac_check_kld_stat(td->td_ucred);
@@ -875,7 +980,6 @@
 		return (error);
 #endif
 
-	mtx_lock(&Giant);
 	td->td_retval[0] = -1;
 
 	pathname = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
@@ -883,21 +987,18 @@
 		goto out;
 
 	filename = linker_basename(pathname);
+	KLD_LOCK();
 	lf = linker_find_file_by_name(filename);
 	if (lf)
 		td->td_retval[0] = lf->id;
 	else
 		error = ENOENT;
+	KLD_UNLOCK();
 out:
-	if (pathname)
-		free(pathname, M_TEMP);
-	mtx_unlock(&Giant);
+	free(pathname, M_TEMP);
 	return (error);
 }
 
-/*
- * MPSAFE
- */
 int
 kldnext(struct thread *td, struct kldnext_args *uap)
 {
@@ -910,40 +1011,46 @@
 		return (error);
 #endif
 
-	mtx_lock(&Giant);
-
-	if (uap->fileid == 0) {
-		mtx_lock(&kld_mtx);
-		if (TAILQ_FIRST(&linker_files))
-			td->td_retval[0] = TAILQ_FIRST(&linker_files)->id;
-		else
-			td->td_retval[0] = 0;
-		mtx_unlock(&kld_mtx);
-		goto out;
+	KLD_LOCK();
+	if (uap->fileid == 0)
+		lf = TAILQ_FIRST(&linker_files);
+	else {
+		lf = linker_find_file_by_id(uap->fileid);
+		if (lf == NULL) {
+			error = ENOENT;
+			goto out;
+		}
+		lf = TAILQ_NEXT(lf, link);
 	}
-	lf = linker_find_file_by_id(uap->fileid);
-	if (lf) {
-		if (TAILQ_NEXT(lf, link))
-			td->td_retval[0] = TAILQ_NEXT(lf, link)->id;
-		else
-			td->td_retval[0] = 0;
-	} else
-		error = ENOENT;
+
+	/* Skip partially loaded files. */
+	while (lf != NULL && !(lf->flags & LINKER_FILE_LINKED))
+		lf = TAILQ_NEXT(lf, link);
+
+	if (lf)
+		td->td_retval[0] = lf->id;
+	else
+		td->td_retval[0] = 0;
 out:
-	mtx_unlock(&Giant);
+	KLD_UNLOCK();
 	return (error);
 }
 
-/*
- * MPSAFE
- */
 int
 kldstat(struct thread *td, struct kldstat_args *uap)
 {
+	struct kld_file_stat stat;
 	linker_file_t lf;
-	int error = 0;
-	int namelen, version;
-	struct kld_file_stat *stat;
+	int error, namelen;
+
+	/*
+	 * Check the version of the user's structure.
+	 */
+	error = copyin(uap->stat, &stat, sizeof(struct kld_file_stat));
+	if (error)
+		return (error);
+	if (stat.version != sizeof(struct kld_file_stat))
+		return (EINVAL);
 
 #ifdef MAC
 	error = mac_check_kld_stat(td->td_ucred);
@@ -951,48 +1058,28 @@
 		return (error);
 #endif
 
-	mtx_lock(&Giant);
-
+	KLD_LOCK();
 	lf = linker_find_file_by_id(uap->fileid);
 	if (lf == NULL) {
-		error = ENOENT;
-		goto out;
+		KLD_UNLOCK();
+		return (ENOENT);
 	}
-	stat = uap->stat;
 
-	/*
-	 * Check the version of the user's structure.
-	 */
-	if ((error = copyin(&stat->version, &version, sizeof(version))) != 0)
-		goto out;
-	if (version != sizeof(struct kld_file_stat)) {
-		error = EINVAL;
-		goto out;
-	}
 	namelen = strlen(lf->filename) + 1;
 	if (namelen > MAXPATHLEN)
 		namelen = MAXPATHLEN;
-	if ((error = copyout(lf->filename, &stat->name[0], namelen)) != 0)
-		goto out;
-	if ((error = copyout(&lf->refs, &stat->refs, sizeof(int))) != 0)
-		goto out;
-	if ((error = copyout(&lf->id, &stat->id, sizeof(int))) != 0)
-		goto out;
-	if ((error = copyout(&lf->address, &stat->address,
-	    sizeof(caddr_t))) != 0)
-		goto out;
-	if ((error = copyout(&lf->size, &stat->size, sizeof(size_t))) != 0)
-		goto out;
+	bcopy(lf->filename, &stat.name[0], namelen);
+	stat.refs = lf->refs;
+	stat.id = lf->id;
+	stat.address = lf->address;
+	stat.size = lf->size;
+	KLD_UNLOCK();
 
 	td->td_retval[0] = 0;
-out:
-	mtx_unlock(&Giant);
-	return (error);
+
+	return (copyout(&stat, uap->stat, sizeof(struct kld_file_stat)));
 }
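
kldstat() now copies the whole user structure in once to validate `version`,
fills a stack copy while holding KLD_LOCK, and performs a single copyout after
dropping the lock, replacing the chain of per-field copyouts done under Giant.
The userland contract is unchanged; a minimal usage sketch, with error
handling trimmed:

    #include <sys/param.h>
    #include <sys/linker.h>
    #include <stdio.h>

    /* Print the name, size and load address of a kld, given its file id. */
    static int
    print_kld(int fileid)
    {
    	struct kld_file_stat st;

    	st.version = sizeof(st);	/* mismatched sizes are rejected */
    	if (kldstat(fileid, &st) == -1)
    		return (-1);
    	printf("%s: %zu bytes at %p\n", st.name, st.size, (void *)st.address);
    	return (0);
    }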
 
-/*
- * MPSAFE
- */
 int
 kldfirstmod(struct thread *td, struct kldfirstmod_args *uap)
 {
@@ -1006,7 +1093,7 @@
 		return (error);
 #endif
 
-	mtx_lock(&Giant);
+	KLD_LOCK();
 	lf = linker_find_file_by_id(uap->fileid);
 	if (lf) {
 		MOD_SLOCK;
@@ -1018,13 +1105,10 @@
 		MOD_SUNLOCK;
 	} else
 		error = ENOENT;
-	mtx_unlock(&Giant);
+	KLD_UNLOCK();
 	return (error);
 }
 
-/*
- * MPSAFE
- */
 int
 kldsym(struct thread *td, struct kldsym_args *uap)
 {
@@ -1041,25 +1125,20 @@
 		return (error);
 #endif
 
-	mtx_lock(&Giant);
-
 	if ((error = copyin(uap->data, &lookup, sizeof(lookup))) != 0)
-		goto out;
+		return (error);
 	if (lookup.version != sizeof(lookup) ||
-	    uap->cmd != KLDSYM_LOOKUP) {
-		error = EINVAL;
-		goto out;
-	}
+	    uap->cmd != KLDSYM_LOOKUP)
+		return (EINVAL);
 	symstr = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
 	if ((error = copyinstr(lookup.symname, symstr, MAXPATHLEN, NULL)) != 0)
 		goto out;
+	KLD_LOCK();
 	if (uap->fileid != 0) {
 		lf = linker_find_file_by_id(uap->fileid);
-		if (lf == NULL) {
+		if (lf == NULL)
 			error = ENOENT;
-			goto out;
-		}
-		if (LINKER_LOOKUP_SYMBOL(lf, symstr, &sym) == 0 &&
+		else if (LINKER_LOOKUP_SYMBOL(lf, symstr, &sym) == 0 &&
 		    LINKER_SYMBOL_VALUES(lf, sym, &symval) == 0) {
 			lookup.symvalue = (uintptr_t) symval.value;
 			lookup.symsize = symval.size;
@@ -1067,7 +1146,6 @@
 		} else
 			error = ENOENT;
 	} else {
-		mtx_lock(&kld_mtx);
 		TAILQ_FOREACH(lf, &linker_files, link) {
 			if (LINKER_LOOKUP_SYMBOL(lf, symstr, &sym) == 0 &&
 			    LINKER_SYMBOL_VALUES(lf, sym, &symval) == 0) {
@@ -1078,14 +1156,12 @@
 				break;
 			}
 		}
-		mtx_unlock(&kld_mtx);
 		if (lf == NULL)
 			error = ENOENT;
 	}
+	KLD_UNLOCK();
 out:
-	if (symstr)
-		free(symstr, M_TEMP);
-	mtx_unlock(&Giant);
+	free(symstr, M_TEMP);
 	return (error);
 }
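
The kldsym() path now validates and copies the lookup structure before taking
KLD_LOCK, and holds the single linker lock (instead of Giant plus kld_mtx)
only around the symbol search itself.  For reference, a minimal userland
sketch of a global lookup (fileid 0 searches every linked file); the helper
name is made up:

    #include <sys/param.h>
    #include <sys/linker.h>
    #include <stdio.h>
    #include <string.h>

    static int
    lookup_symbol(const char *name)
    {
    	struct kld_sym_lookup lk;

    	memset(&lk, 0, sizeof(lk));
    	lk.version = sizeof(lk);
    	lk.symname = __DECONST(char *, name);
    	if (kldsym(0, KLDSYM_LOOKUP, &lk) == -1)
    		return (-1);
    	printf("%s = %#lx (%zu bytes)\n", name, lk.symvalue, lk.symsize);
    	return (0);
    }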
 
@@ -1115,8 +1191,7 @@
 	if (verinfo == NULL)
 		return (modlist_lookup(name, 0));
 	bestmod = NULL;
-	for (mod = TAILQ_FIRST(&found_modules); mod;
-	    mod = TAILQ_NEXT(mod, link)) {
+	TAILQ_FOREACH(mod, &found_modules, link) {
 		if (strcmp(mod->name, name) != 0)
 			continue;
 		ver = mod->version;
@@ -1174,7 +1249,7 @@
 	caddr_t modptr;
 	const char *modname, *nmodname;
 	char *modtype;
-	linker_file_t lf;
+	linker_file_t lf, nlf;
 	linker_class_t lc;
 	int error;
 	linker_file_list_t loaded_files;
@@ -1228,8 +1303,8 @@
 		linker_addmodules(linker_kernel_file, start, stop, 1);
 
 	/*
-	 * this is a once-off kinky bubble sort resolve relocation dependency
-	 * requirements
+	 * This is a once-off kinky bubble sort to resolve relocation
+	 * dependency requirements.
 	 */
 restart:
 	TAILQ_FOREACH(lf, &loaded_files, loaded) {
@@ -1257,7 +1332,7 @@
 				}
 				if (nmdp < stop)   /* it's a self reference */
 					continue;
-	
+
 				/*
 				 * ok, the module isn't here yet, we
 				 * are not finished
@@ -1284,10 +1359,10 @@
 					    nver) != NULL) {
 						printf("module %s already"
 						    " present!\n", modname);
-						linker_file_unload(lf,
-						    LINKER_UNLOAD_FORCE);
 						TAILQ_REMOVE(&loaded_files,
 						    lf, loaded);
+						linker_file_unload(lf,
+						    LINKER_UNLOAD_FORCE);
 						/* we changed tailq next ptr */
 						goto restart;
 					}
@@ -1309,16 +1384,16 @@
 	/*
 	 * At this point, we check to see what could not be resolved..
 	 */
-	TAILQ_FOREACH(lf, &loaded_files, loaded) {
+	while ((lf = TAILQ_FIRST(&loaded_files)) != NULL) {
+		TAILQ_REMOVE(&loaded_files, lf, loaded);
 		printf("KLD file %s is missing dependencies\n", lf->filename);
 		linker_file_unload(lf, LINKER_UNLOAD_FORCE);
-		TAILQ_REMOVE(&loaded_files, lf, loaded);
 	}
 
 	/*
 	 * We made it. Finish off the linking in the order we determined.
 	 */
-	TAILQ_FOREACH(lf, &depended_files, loaded) {
+	TAILQ_FOREACH_SAFE(lf, &depended_files, loaded, nlf) {
 		if (linker_kernel_file) {
 			linker_kernel_file->refs++;
 			error = linker_file_add_dependency(lf,
@@ -1353,6 +1428,7 @@
 		 */
 		error = LINKER_LINK_PRELOAD_FINISH(lf);
 		if (error) {
+			TAILQ_REMOVE(&depended_files, lf, loaded);
 			printf("KLD file %s - could not finalize loading\n",
 			    lf->filename);
 			linker_file_unload(lf, LINKER_UNLOAD_FORCE);
@@ -1372,15 +1448,15 @@
 
 /*
  * Search for a not-loaded module by name.
- * 
+ *
  * Modules may be found in the following locations:
- * 
+ *
 * - preloaded (result is just the module name)
 * - on disk (result is full path to module)
- * 
+ *
 * If the module name is qualified in any way (contains path, etc.) then we
  * simply return a copy of it.
- * 
+ *
  * The search path can be manipulated via sysctl.  Note that we use the ';'
  * character as a separator to be consistent with the bootloader.
  */
@@ -1411,7 +1487,7 @@
 	struct nameidata nd;
 	struct thread *td = curthread;	/* XXX */
 	char *result, **cpp, *sep;
-	int error, len, extlen, reclen, flags;
+	int error, len, extlen, reclen, flags, vfslocked;
 	enum vtype type;
 
 	extlen = 0;
@@ -1432,16 +1508,18 @@
 		 * Attempt to open the file, and return the path if
 		 * we succeed and it's a regular file.
 		 */
-		NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, result, td);
+		NDINIT(&nd, LOOKUP, FOLLOW | MPSAFE, UIO_SYSSPACE, result, td);
 		flags = FREAD;
-		error = vn_open(&nd, &flags, 0, -1);
+		error = vn_open(&nd, &flags, 0, NULL);
 		if (error == 0) {
+			vfslocked = NDHASGIANT(&nd);
 			NDFREE(&nd, NDF_ONLY_PNBUF);
 			type = nd.ni_vp->v_type;
 			if (vap)
 				VOP_GETATTR(nd.ni_vp, vap, td->td_ucred, td);
 			VOP_UNLOCK(nd.ni_vp, 0, td);
 			vn_close(nd.ni_vp, FREAD, td->td_ucred, td);
+			VFS_UNLOCK_GIANT(vfslocked);
 			if (type == VREG)
 				return (result);
 		}
@@ -1469,6 +1547,7 @@
 	u_char *hints = NULL;
 	u_char *cp, *recptr, *bufend, *result, *best, *pathbuf, *sep;
 	int error, ival, bestver, *intp, reclen, found, flags, clen, blen;
+	int vfslocked = 0;
 
 	result = NULL;
 	bestver = found = 0;
@@ -1480,11 +1559,12 @@
 	snprintf(pathbuf, reclen, "%.*s%s%s", pathlen, path, sep,
 	    linker_hintfile);
 
-	NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, pathbuf, td);
+	NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_SYSSPACE, pathbuf, td);
 	flags = FREAD;
-	error = vn_open(&nd, &flags, 0, -1);
+	error = vn_open(&nd, &flags, 0, NULL);
 	if (error)
 		goto bad;
+	vfslocked = NDHASGIANT(&nd);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
 	if (nd.ni_vp->v_type != VREG)
 		goto bad;
@@ -1508,6 +1588,7 @@
 		goto bad;
 	VOP_UNLOCK(nd.ni_vp, 0, td);
 	vn_close(nd.ni_vp, FREAD, cred, td);
+	VFS_UNLOCK_GIANT(vfslocked);
 	nd.ni_vp = NULL;
 	if (reclen != 0) {
 		printf("can't read %d\n", reclen);
@@ -1576,6 +1657,7 @@
 	if (nd.ni_vp != NULL) {
 		VOP_UNLOCK(nd.ni_vp, 0, td);
 		vn_close(nd.ni_vp, FREAD, cred, td);
+		VFS_UNLOCK_GIANT(vfslocked);
 	}
 	/*
 	 * If nothing found or hints is absent - fallback to the old
@@ -1618,21 +1700,13 @@
 static char *
 linker_search_kld(const char *name)
 {
-	char *cp, *ep, *result, **cpp;
-	int extlen, len;
+	char *cp, *ep, *result;
+	int len;
 
 	/* qualified at all? */
 	if (index(name, '/'))
 		return (linker_strdup(name));
 
-	extlen = 0;
-	for (cpp = linker_ext_list; *cpp; cpp++) {
-		len = strlen(*cpp);
-		if (len > extlen)
-			extlen = len;
-	}
-	extlen++;		/* trailing '\0' */
-
 	/* traverse the linker path */
 	len = strlen(name);
 	for (ep = linker_path; *ep; ep++) {
@@ -1659,11 +1733,71 @@
 	return (filename);
 }
 
+#ifdef HWPMC_HOOKS
+
+struct hwpmc_context {
+	int	nobjects;
+	int	nmappings;
+	struct pmckern_map_in *kobase;
+};
+
+static int
+linker_hwpmc_list_object(linker_file_t lf, void *arg)
+{
+	struct hwpmc_context *hc;
+
+	hc = arg;
+
+	/* If we run out of mappings, fail. */
+	if (hc->nobjects >= hc->nmappings)
+		return (1);
+
+	/* Save the info for this linker file. */
+	hc->kobase[hc->nobjects].pm_file = lf->filename;
+	hc->kobase[hc->nobjects].pm_address = (uintptr_t)lf->address;
+	hc->nobjects++;
+	return (0);
+}
+
+/*
+ * Inform hwpmc about the set of kernel modules currently loaded.
+ */
+void *
+linker_hwpmc_list_objects(void)
+{
+	struct hwpmc_context hc;
+
+	hc.nmappings = 15;	/* a reasonable default */
+
+ retry:
+	/* allocate nmappings+1 entries */
+	MALLOC(hc.kobase, struct pmckern_map_in *,
+	    (hc.nmappings + 1) * sizeof(struct pmckern_map_in), M_LINKER,
+	    M_WAITOK | M_ZERO);
+
+	hc.nobjects = 0;
+	if (linker_file_foreach(linker_hwpmc_list_object, &hc) != 0) {
+		hc.nmappings = hc.nobjects;
+		FREE(hc.kobase, M_LINKER);
+		goto retry;
+	}
+
+	KASSERT(hc.nobjects > 0, ("linker_hwpmc_list_objects: no kernel "
+		"objects?"));
+
+	/* The last entry of the malloced area consists of all zeros. */
+	KASSERT(hc.kobase[hc.nobjects].pm_file == NULL,
+	    ("linker_hwpmc_list_objects: last object not NULL"));
+
+	return ((void *)hc.kobase);
+}
+#endif
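
linker_hwpmc_list_objects() allocates nmappings+1 zeroed entries and retries
with a larger buffer if the callback runs out of room, so the array handed to
hwpmc(4) is always terminated by an all-zero entry.  A sketch of how a
consumer might walk that list; whether the consumer or the producer frees the
buffer, and with which malloc type, is an assumption here, not something this
diff establishes:

    /*
     * Illustrative walk of the NULL-terminated array returned by
     * linker_hwpmc_list_objects(); not part of this change.
     */
    static void
    example_dump_kernel_objects(void)
    {
    	struct pmckern_map_in *km, *kmbase;

    	kmbase = linker_hwpmc_list_objects();
    	for (km = kmbase; km->pm_file != NULL; km++)
    		printf("%s mapped at %p\n", km->pm_file,
    		    (void *)km->pm_address);
    	FREE(kmbase, M_LINKER);		/* assumed ownership convention */
    }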
+
 /*
  * Find a file which contains given module and load it, if "parent" is not
  * NULL, register a reference to it.
  */
-int
+static int
 linker_load_module(const char *kldname, const char *modname,
     struct linker_file *parent, struct mod_depend *verinfo,
     struct linker_file **lfpp)
@@ -1673,6 +1807,7 @@
 	char *pathname;
 	int error;
 
+	KLD_LOCK_ASSERT();
 	if (modname == NULL) {
 		/*
  		 * We have to load KLD
@@ -1704,11 +1839,9 @@
 	 * provide different versions of the same modules.
 	 */
 	filename = linker_basename(pathname);
-	if (linker_find_file_by_name(filename)) {
+	if (linker_find_file_by_name(filename))
 		error = EEXIST;
-		goto out;
-	}
-	do {
+	else do {
 		error = linker_load_file(pathname, &lfdep);
 		if (error)
 			break;
@@ -1726,9 +1859,7 @@
 		if (lfpp)
 			*lfpp = lfdep;
 	} while (0);
-out:
-	if (pathname)
-		free(pathname, M_LINKER);
+	free(pathname, M_LINKER);
 	return (error);
 }
 
@@ -1750,6 +1881,7 @@
 	/*
 	 * All files are dependent on /kernel.
 	 */
+	KLD_LOCK_ASSERT();
 	if (linker_kernel_file) {
 		linker_kernel_file->refs++;
 		error = linker_file_add_dependency(lf, linker_kernel_file);
@@ -1841,16 +1973,16 @@
 	error = sysctl_wire_old_buffer(req, 0);
 	if (error != 0)
 		return (error);
-	mtx_lock(&kld_mtx);
+	KLD_LOCK();
 	TAILQ_FOREACH(lf, &linker_files, link) {
 		error = LINKER_EACH_FUNCTION_NAME(lf,
 		    sysctl_kern_function_list_iterate, req);
 		if (error) {
-			mtx_unlock(&kld_mtx);
+			KLD_UNLOCK();
 			return (error);
 		}
 	}
-	mtx_unlock(&kld_mtx);
+	KLD_UNLOCK();
 	return (SYSCTL_OUT(req, "", 1));
 }
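
The pattern repeated throughout kern_linker.c in this sync is the removal of
mtx_lock(&Giant)/mtx_unlock(&Giant) and of the small kld_mtx in favour of
KLD_LOCK()/KLD_UNLOCK()/KLD_LOCK_ASSERT(), macros defined earlier in the file
around a single linker lock.  A minimal sketch of the shape those macros
presumably take, assuming an sx(9) lock named kld_sx (the exact definitions
are not shown in this hunk):

    static struct sx kld_sx;	/* assumed: the kernel linker lock */

    #define	KLD_LOCK()		sx_xlock(&kld_sx)
    #define	KLD_UNLOCK()		sx_xunlock(&kld_sx)
    #define	KLD_LOCK_ASSERT()	do {				\
    	if (!cold)						\
    		sx_assert(&kld_sx, SX_XLOCKED);			\
    } while (0)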
 
Index: vfs_aio.c
===================================================================
RCS file: /home/cvs/src/sys/kern/vfs_aio.c,v
retrieving revision 1.3
retrieving revision 1.4
diff -L sys/kern/vfs_aio.c -L sys/kern/vfs_aio.c -u -r1.3 -r1.4
--- sys/kern/vfs_aio.c
+++ sys/kern/vfs_aio.c
@@ -19,7 +19,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/vfs_aio.c,v 1.195.2.2 2005/11/08 16:08:40 jhb Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/vfs_aio.c,v 1.233.4.1 2008/01/28 10:43:10 dumbbell Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -38,43 +38,54 @@
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/unistd.h>
+#include <sys/posix4.h>
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
 #include <sys/signalvar.h>
 #include <sys/protosw.h>
+#include <sys/sema.h>
+#include <sys/socket.h>
 #include <sys/socketvar.h>
 #include <sys/syscall.h>
 #include <sys/sysent.h>
 #include <sys/sysctl.h>
 #include <sys/sx.h>
+#include <sys/taskqueue.h>
 #include <sys/vnode.h>
 #include <sys/conf.h>
 #include <sys/event.h>
+#include <sys/mount.h>
+
+#include <machine/atomic.h>
 
-#include <posix4/posix4.h>
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
+#include <vm/vm_object.h>
 #include <vm/uma.h>
 #include <sys/aio.h>
 
 #include "opt_vfs_aio.h"
 
-NET_NEEDS_GIANT("aio");
-
 /*
  * Counter for allocating reference ids to new jobs.  Wrapped to 1 on
- * overflow.
+ * overflow. (XXX will be removed soon.)
+ */
+static u_long jobrefid;
+
+/*
+ * Counter for aio_fsync.
  */
-static	long jobrefid;
+static uint64_t jobseqno;
 
-#define JOBST_NULL		0x0
-#define JOBST_JOBQGLOBAL	0x2
-#define JOBST_JOBRUNNING	0x3
-#define JOBST_JOBFINISHED	0x4
-#define	JOBST_JOBQBUF		0x5
-#define	JOBST_JOBBFINISHED	0x6
+#define JOBST_NULL		0
+#define JOBST_JOBQSOCK		1
+#define JOBST_JOBQGLOBAL	2
+#define JOBST_JOBRUNNING	3
+#define JOBST_JOBFINISHED	4
+#define JOBST_JOBQBUF		5
+#define JOBST_JOBQSYNC		6
 
 #ifndef MAX_AIO_PER_PROC
 #define MAX_AIO_PER_PROC	32
@@ -141,7 +152,7 @@
     "Number of aio requests presently handled by the buf subsystem");
 
 /* Number of async I/O thread in the process of being started */
-/* XXX This should be local to _aio_aqueue() */
+/* XXX This should be local to aio_aqueue() */
 static int num_aio_resv_start = 0;
 
 static int aiod_timeout;
@@ -170,26 +181,70 @@
 SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, CTLFLAG_RW, &max_buf_aio, 0,
     "Maximum buf aio requests per process (stored in the process)");
 
+typedef struct oaiocb {
+	int	aio_fildes;		/* File descriptor */
+	off_t	aio_offset;		/* File offset for I/O */
+	volatile void *aio_buf;         /* I/O buffer in process space */
+	size_t	aio_nbytes;		/* Number of bytes for I/O */
+	struct	osigevent aio_sigevent;	/* Signal to deliver */
+	int	aio_lio_opcode;		/* LIO opcode */
+	int	aio_reqprio;		/* Request priority -- ignored */
+	struct	__aiocb_private	_aiocb_private;
+} oaiocb_t;
+
+/*
+ * Below is a key to the locks used to protect each member of struct
+ * aiocblist, aioliojob, and kaioinfo, and of any backends.
+ *
+ * * - need not be protected
+ * a - locked by kaioinfo lock
+ * b - locked by the backend lock; the backend lock can be null in some
+ *     cases (for example, BIO is of this type), in which case the proc
+ *     lock is reused.
+ * c - locked by aio_job_mtx, the lock for the generic file I/O backend.
+ */
+
+/*
+ * Currently, there are only two backends: BIO and generic file I/O.
+ * Socket I/O is served by the generic file I/O backend, which is not a good
+ * idea: disk file I/O and any other type opened without O_NONBLOCK can block
+ * the daemon threads, so if no thread is left to serve socket I/O, that I/O
+ * may be delayed too long or starved.  We should create some threads
+ * dedicated to sockets to do non-blocking I/O, and the same goes for pipes
+ * and fifos; for these I/O systems we really need a non-blocking interface,
+ * as fiddling with O_NONBLOCK in the file structure is not safe because of
+ * the race between userland and the aio daemons.
+ */
+
 struct aiocblist {
-	TAILQ_ENTRY(aiocblist) list;	/* List of jobs */
-	TAILQ_ENTRY(aiocblist) plist;	/* List of jobs for proc */
-	int	jobflags;
-	int	jobstate;
-	int	inputcharge;
-	int	outputcharge;
-	struct	buf *bp;		/* Buffer pointer */
-	struct	proc *userproc;		/* User process */ /* Not td! */
-	struct  ucred *cred;		/* Active credential when created */
-	struct	file *fd_file;		/* Pointer to file structure */
-	struct	aio_liojob *lio;	/* Optional lio job */
-	struct	aiocb *uuaiocb;		/* Pointer in userspace of aiocb */
-	struct	knlist klist;		/* list of knotes */
-	struct	aiocb uaiocb;		/* Kernel I/O control block */
+	TAILQ_ENTRY(aiocblist) list;	/* (b) internal list of for backend */
+	TAILQ_ENTRY(aiocblist) plist;	/* (a) list of jobs for each backend */
+	TAILQ_ENTRY(aiocblist) allist;  /* (a) list of all jobs in proc */
+	int	jobflags;		/* (a) job flags */
+	int	jobstate;		/* (b) job state */
+	int	inputcharge;		/* (*) input blocks */
+	int	outputcharge;		/* (*) output blocks */
+	struct	buf *bp;		/* (*) private to BIO backend,
+				  	 * buffer pointer
+					 */
+	struct	proc *userproc;		/* (*) user process */
+	struct  ucred *cred;		/* (*) active credential when created */
+	struct	file *fd_file;		/* (*) pointer to file structure */
+	struct	aioliojob *lio;		/* (*) optional lio job */
+	struct	aiocb *uuaiocb;		/* (*) pointer in userspace of aiocb */
+	struct	knlist klist;		/* (a) list of knotes */
+	struct	aiocb uaiocb;		/* (*) kernel I/O control block */
+	ksiginfo_t ksi;			/* (a) realtime signal info */
+	struct	task biotask;		/* (*) private to BIO backend */
+	uint64_t seqno;			/* (*) job number */
+	int	pending;		/* (a) number of pending I/O, aio_fsync only */
 };
 
 /* jobflags */
-#define AIOCBLIST_RUNDOWN	0x4
-#define AIOCBLIST_DONE		0x10
+#define AIOCBLIST_DONE		0x01
+#define AIOCBLIST_BUFDONE	0x02
+#define AIOCBLIST_RUNDOWN	0x04
+#define AIOCBLIST_CHECKSYNC	0x08
 
 /*
  * AIO process info
@@ -197,71 +252,95 @@
 #define AIOP_FREE	0x1			/* proc on free queue */
 
 struct aiothreadlist {
-	int aiothreadflags;			/* AIO proc flags */
-	TAILQ_ENTRY(aiothreadlist) list;	/* List of processes */
-	struct thread *aiothread;		/* The AIO thread */
+	int aiothreadflags;			/* (c) AIO proc flags */
+	TAILQ_ENTRY(aiothreadlist) list;	/* (c) list of processes */
+	struct thread *aiothread;		/* (*) the AIO thread */
 };
 
 /*
  * data-structure for lio signal management
  */
-struct aio_liojob {
-	int	lioj_flags;
-	int	lioj_buffer_count;
-	int	lioj_buffer_finished_count;
-	int	lioj_queue_count;
-	int	lioj_queue_finished_count;
-	struct	sigevent lioj_signal;	/* signal on all I/O done */
-	TAILQ_ENTRY(aio_liojob) lioj_list;
+struct aioliojob {
+	int	lioj_flags;			/* (a) listio flags */
+	int	lioj_count;			/* (a) listio job count */
+	int	lioj_finished_count;		/* (a) listio finished job count */
+	struct	sigevent lioj_signal;		/* (a) signal on all I/O done */
+	TAILQ_ENTRY(aioliojob) lioj_list;	/* (a) lio list */
+	struct  knlist klist;			/* (a) list of knotes */
+	ksiginfo_t lioj_ksi;			/* (a) Realtime signal info */
 };
+
 #define	LIOJ_SIGNAL		0x1	/* signal on all done (lio) */
 #define	LIOJ_SIGNAL_POSTED	0x2	/* signal has been posted */
+#define LIOJ_KEVENT_POSTED	0x4	/* kevent triggered */
 
 /*
  * per process aio data structure
  */
 struct kaioinfo {
-	int	kaio_flags;		/* per process kaio flags */
-	int	kaio_maxactive_count;	/* maximum number of AIOs */
-	int	kaio_active_count;	/* number of currently used AIOs */
-	int	kaio_qallowed_count;	/* maxiumu size of AIO queue */
-	int	kaio_queue_count;	/* size of AIO queue */
-	int	kaio_ballowed_count;	/* maximum number of buffers */
-	int	kaio_queue_finished_count; /* number of daemon jobs finished */
-	int	kaio_buffer_count;	/* number of physio buffers */
-	int	kaio_buffer_finished_count; /* count of I/O done */
-	TAILQ_HEAD(,aio_liojob) kaio_liojoblist; /* list of lio jobs */
-	TAILQ_HEAD(,aiocblist) kaio_jobqueue;	/* job queue for process */
-	TAILQ_HEAD(,aiocblist) kaio_jobdone;	/* done queue for process */
-	TAILQ_HEAD(,aiocblist) kaio_bufqueue;	/* buffer job queue for process */
-	TAILQ_HEAD(,aiocblist) kaio_bufdone;	/* buffer done queue for process */
-	TAILQ_HEAD(,aiocblist) kaio_sockqueue;	/* queue for aios waiting on sockets */
+	struct mtx	kaio_mtx;	/* the lock to protect this struct */
+	int	kaio_flags;		/* (a) per process kaio flags */
+	int	kaio_maxactive_count;	/* (*) maximum number of AIOs */
+	int	kaio_active_count;	/* (c) number of currently used AIOs */
+	int	kaio_qallowed_count;	/* (*) maximum size of AIO queue */
+	int	kaio_count;		/* (a) size of AIO queue */
+	int	kaio_ballowed_count;	/* (*) maximum number of buffers */
+	int	kaio_buffer_count;	/* (a) number of physio buffers */
+	TAILQ_HEAD(,aiocblist) kaio_all;	/* (a) all AIOs in the process */
+	TAILQ_HEAD(,aiocblist) kaio_done;	/* (a) done queue for process */
+	TAILQ_HEAD(,aioliojob) kaio_liojoblist; /* (a) list of lio jobs */
+	TAILQ_HEAD(,aiocblist) kaio_jobqueue;	/* (a) job queue for process */
+	TAILQ_HEAD(,aiocblist) kaio_bufqueue;	/* (a) buffer job queue for process */
+	TAILQ_HEAD(,aiocblist) kaio_sockqueue;  /* (a) queue for aios waiting on sockets,
+						 *  NOT USED YET.
+						 */
+	TAILQ_HEAD(,aiocblist) kaio_syncqueue;	/* (a) queue for aio_fsync */
+	struct	task	kaio_task;	/* (*) task to kick aio threads */
 };
 
+#define AIO_LOCK(ki)		mtx_lock(&(ki)->kaio_mtx)
+#define AIO_UNLOCK(ki)		mtx_unlock(&(ki)->kaio_mtx)
+#define AIO_LOCK_ASSERT(ki, f)	mtx_assert(&(ki)->kaio_mtx, (f))
+#define AIO_MTX(ki)		(&(ki)->kaio_mtx)
+
 #define KAIO_RUNDOWN	0x1	/* process is being run down */
 #define KAIO_WAKEUP	0x2	/* wakeup process when there is a significant event */
 
-static TAILQ_HEAD(,aiothreadlist) aio_freeproc;		/* Idle daemons */
-static struct mtx aio_freeproc_mtx;
-
-static TAILQ_HEAD(,aiocblist) aio_jobs;			/* Async job list */
+static TAILQ_HEAD(,aiothreadlist) aio_freeproc;		/* (c) Idle daemons */
+static struct sema aio_newproc_sem;
+static struct mtx aio_job_mtx;
+static struct mtx aio_sock_mtx;
+static TAILQ_HEAD(,aiocblist) aio_jobs;			/* (c) Async job list */
+static struct unrhdr *aiod_unr;
 
-static void	aio_init_aioinfo(struct proc *p);
+void		aio_init_aioinfo(struct proc *p);
 static void	aio_onceonly(void);
 static int	aio_free_entry(struct aiocblist *aiocbe);
 static void	aio_process(struct aiocblist *aiocbe);
-static int	aio_newproc(void);
-static int	aio_aqueue(struct thread *td, struct aiocb *job, int type);
+static int	aio_newproc(int *);
+int		aio_aqueue(struct thread *td, struct aiocb *job,
+			struct aioliojob *lio, int type, int osigev);
 static void	aio_physwakeup(struct buf *bp);
 static void	aio_proc_rundown(void *arg, struct proc *p);
-static int	aio_fphysio(struct aiocblist *aiocbe);
+static void	aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp);
 static int	aio_qphysio(struct proc *p, struct aiocblist *iocb);
-static void	aio_daemon(void *uproc);
+static void	biohelper(void *, int);
+static void	aio_daemon(void *param);
 static void	aio_swake_cb(struct socket *, struct sockbuf *);
 static int	aio_unload(void);
+static void	aio_bio_done_notify(struct proc *userp, struct aiocblist *aiocbe, int type);
+#define DONE_BUF	1
+#define DONE_QUEUE	2
+static int	do_lio_listio(struct thread *td, struct lio_listio_args *uap, int oldsigev);
+static int	aio_kick(struct proc *userp);
+static void	aio_kick_nowait(struct proc *userp);
+static void	aio_kick_helper(void *context, int pending);
 static int	filt_aioattach(struct knote *kn);
 static void	filt_aiodetach(struct knote *kn);
 static int	filt_aio(struct knote *kn, long hint);
+static int	filt_lioattach(struct knote *kn);
+static void	filt_liodetach(struct knote *kn);
+static int	filt_lio(struct knote *kn, long hint);
 
 /*
  * Zones for:
@@ -276,9 +355,13 @@
 /* kqueue filters for aio */
 static struct filterops aio_filtops =
 	{ 0, filt_aioattach, filt_aiodetach, filt_aio };
+static struct filterops lio_filtops =
+	{ 0, filt_lioattach, filt_liodetach, filt_lio };
 
 static eventhandler_tag exit_tag, exec_tag;
 
+TASKQUEUE_DEFINE_THREAD(aiod_bio);
+
 /*
  * Main operations function for use as a kernel module.
  */
@@ -309,14 +392,18 @@
 	NULL
 };
 
-SYSCALL_MODULE_HELPER(aio_return);
-SYSCALL_MODULE_HELPER(aio_suspend);
 SYSCALL_MODULE_HELPER(aio_cancel);
 SYSCALL_MODULE_HELPER(aio_error);
+SYSCALL_MODULE_HELPER(aio_fsync);
 SYSCALL_MODULE_HELPER(aio_read);
-SYSCALL_MODULE_HELPER(aio_write);
+SYSCALL_MODULE_HELPER(aio_return);
+SYSCALL_MODULE_HELPER(aio_suspend);
 SYSCALL_MODULE_HELPER(aio_waitcomplete);
+SYSCALL_MODULE_HELPER(aio_write);
 SYSCALL_MODULE_HELPER(lio_listio);
+SYSCALL_MODULE_HELPER(oaio_read);
+SYSCALL_MODULE_HELPER(oaio_write);
+SYSCALL_MODULE_HELPER(olio_listio);
 
 DECLARE_MODULE(aio, aio_mod,
 	SI_SUB_VFS, SI_ORDER_ANY);
@@ -333,12 +420,16 @@
 	aio_swake = &aio_swake_cb;
 	exit_tag = EVENTHANDLER_REGISTER(process_exit, aio_proc_rundown, NULL,
 	    EVENTHANDLER_PRI_ANY);
-	exec_tag = EVENTHANDLER_REGISTER(process_exec, aio_proc_rundown, NULL,
+	exec_tag = EVENTHANDLER_REGISTER(process_exec, aio_proc_rundown_exec, NULL,
 	    EVENTHANDLER_PRI_ANY);
 	kqueue_add_filteropts(EVFILT_AIO, &aio_filtops);
+	kqueue_add_filteropts(EVFILT_LIO, &lio_filtops);
 	TAILQ_INIT(&aio_freeproc);
-	mtx_init(&aio_freeproc_mtx, "aio_freeproc", NULL, MTX_DEF);
+	sema_init(&aio_newproc_sem, 0, "aio_new_proc");
+	mtx_init(&aio_job_mtx, "aio_job", NULL, MTX_DEF);
+	mtx_init(&aio_sock_mtx, "aio_sock", NULL, MTX_DEF);
 	TAILQ_INIT(&aio_jobs);
+	aiod_unr = new_unrhdr(1, INT_MAX, NULL);
 	kaio_zone = uma_zcreate("AIO", sizeof(struct kaioinfo), NULL, NULL,
 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	aiop_zone = uma_zcreate("AIOP", sizeof(struct aiothreadlist), NULL,
@@ -347,7 +438,7 @@
 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	aiol_zone = uma_zcreate("AIOL", AIO_LISTIO_MAX*sizeof(intptr_t) , NULL,
 	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
-	aiolio_zone = uma_zcreate("AIOLIO", sizeof(struct aio_liojob), NULL,
+	aiolio_zone = uma_zcreate("AIOLIO", sizeof(struct aioliojob), NULL,
 	    NULL, NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	aiod_timeout = AIOD_TIMEOUT_DEFAULT;
 	aiod_lifetime = AIOD_LIFETIME_DEFAULT;
@@ -370,6 +461,9 @@
 	 * XXX: no unloads by default, it's too dangerous.
 	 * perhaps we could do it if locked out callers and then
 	 * did an aio_proc_rundown() on each process.
+	 *
+	 * jhb: aio_proc_rundown() needs to run on curproc though,
+	 * so I don't think that would fly.
 	 */
 	if (!unloadable)
 		return (EOPNOTSUPP);
@@ -377,11 +471,23 @@
 	error = kqueue_del_filteropts(EVFILT_AIO);
 	if (error)
 		return error;
-
+	error = kqueue_del_filteropts(EVFILT_LIO);
+	if (error)
+		return error;
 	async_io_version = 0;
 	aio_swake = NULL;
+	taskqueue_free(taskqueue_aiod_bio);
+	delete_unrhdr(aiod_unr);
+	uma_zdestroy(kaio_zone);
+	uma_zdestroy(aiop_zone);
+	uma_zdestroy(aiocb_zone);
+	uma_zdestroy(aiol_zone);
+	uma_zdestroy(aiolio_zone);
 	EVENTHANDLER_DEREGISTER(process_exit, exit_tag);
 	EVENTHANDLER_DEREGISTER(process_exec, exec_tag);
+	mtx_destroy(&aio_job_mtx);
+	mtx_destroy(&aio_sock_mtx);
+	sema_destroy(&aio_newproc_sem);
 	p31b_setcfg(CTL_P1003_1B_AIO_LISTIO_MAX, -1);
 	p31b_setcfg(CTL_P1003_1B_AIO_MAX, -1);
 	p31b_setcfg(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, -1);
@@ -392,37 +498,55 @@
  * Init the per-process aioinfo structure.  The aioinfo limits are set
  * per-process for user limit (resource) management.
  */
-static void
+void
 aio_init_aioinfo(struct proc *p)
 {
 	struct kaioinfo *ki;
 
 	ki = uma_zalloc(kaio_zone, M_WAITOK);
+	mtx_init(&ki->kaio_mtx, "aiomtx", NULL, MTX_DEF);
 	ki->kaio_flags = 0;
 	ki->kaio_maxactive_count = max_aio_per_proc;
 	ki->kaio_active_count = 0;
 	ki->kaio_qallowed_count = max_aio_queue_per_proc;
-	ki->kaio_queue_count = 0;
+	ki->kaio_count = 0;
 	ki->kaio_ballowed_count = max_buf_aio;
 	ki->kaio_buffer_count = 0;
-	ki->kaio_buffer_finished_count = 0;
-	TAILQ_INIT(&ki->kaio_jobdone);
+	TAILQ_INIT(&ki->kaio_all);
+	TAILQ_INIT(&ki->kaio_done);
 	TAILQ_INIT(&ki->kaio_jobqueue);
-	TAILQ_INIT(&ki->kaio_bufdone);
 	TAILQ_INIT(&ki->kaio_bufqueue);
 	TAILQ_INIT(&ki->kaio_liojoblist);
 	TAILQ_INIT(&ki->kaio_sockqueue);
+	TAILQ_INIT(&ki->kaio_syncqueue);
+	TASK_INIT(&ki->kaio_task, 0, aio_kick_helper, p);
 	PROC_LOCK(p);
 	if (p->p_aioinfo == NULL) {
 		p->p_aioinfo = ki;
 		PROC_UNLOCK(p);
 	} else {
 		PROC_UNLOCK(p);
+		mtx_destroy(&ki->kaio_mtx);
 		uma_zfree(kaio_zone, ki);
 	}
 
 	while (num_aio_procs < target_aio_procs)
-		aio_newproc();
+		aio_newproc(NULL);
+}
+
+static int
+aio_sendsig(struct proc *p, struct sigevent *sigev, ksiginfo_t *ksi)
+{
+	int ret = 0;
+
+	PROC_LOCK(p);
+	if (!KSI_ONQ(ksi)) {
+		ksi->ksi_code = SI_ASYNCIO;
+		ksi->ksi_flags |= KSI_EXT | KSI_INS;
+		ret = psignal_event(p, sigev, ksi);
+	}
+	PROC_UNLOCK(p);
+	return (ret);
 }
 
 /*
@@ -434,225 +558,170 @@
 aio_free_entry(struct aiocblist *aiocbe)
 {
 	struct kaioinfo *ki;
-	struct aio_liojob *lj;
+	struct aioliojob *lj;
 	struct proc *p;
-	int error;
-	int s;
-
-	if (aiocbe->jobstate == JOBST_NULL)
-		panic("aio_free_entry: freeing already free job");
 
 	p = aiocbe->userproc;
+	MPASS(curproc == p);
 	ki = p->p_aioinfo;
-	lj = aiocbe->lio;
-	if (ki == NULL)
-		panic("aio_free_entry: missing p->p_aioinfo");
+	MPASS(ki != NULL);
 
-	while (aiocbe->jobstate == JOBST_JOBRUNNING) {
-		aiocbe->jobflags |= AIOCBLIST_RUNDOWN;
-		tsleep(aiocbe, PRIBIO, "jobwai", 0);
-	}
-	if (aiocbe->bp == NULL) {
-		if (ki->kaio_queue_count <= 0)
-			panic("aio_free_entry: process queue size <= 0");
-		if (num_queue_count <= 0)
-			panic("aio_free_entry: system wide queue size <= 0");
-
-		if (lj) {
-			lj->lioj_queue_count--;
-			if (aiocbe->jobflags & AIOCBLIST_DONE)
-				lj->lioj_queue_finished_count--;
-		}
-		ki->kaio_queue_count--;
-		if (aiocbe->jobflags & AIOCBLIST_DONE)
-			ki->kaio_queue_finished_count--;
-		num_queue_count--;
-	} else {
-		if (lj) {
-			lj->lioj_buffer_count--;
-			if (aiocbe->jobflags & AIOCBLIST_DONE)
-				lj->lioj_buffer_finished_count--;
-		}
-		if (aiocbe->jobflags & AIOCBLIST_DONE)
-			ki->kaio_buffer_finished_count--;
-		ki->kaio_buffer_count--;
-		num_buf_aio--;
-	}
+	AIO_LOCK_ASSERT(ki, MA_OWNED);
+	MPASS(aiocbe->jobstate == JOBST_JOBFINISHED);
 
-	/* aiocbe is going away, we need to destroy any knotes */
-	/* XXXKSE Note the thread here is used to eventually find the
-	 * owning process again, but it is also used to do a fo_close
-	 * and that requires the thread. (but does it require the
-	 * OWNING thread? (or maybe the running thread?)
-	 * There is a semantic problem here...
-	 */
-	knlist_delete(&aiocbe->klist, FIRST_THREAD_IN_PROC(p), 0); /* XXXKSE */
+	atomic_subtract_int(&num_queue_count, 1);
 
-	if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags & KAIO_RUNDOWN)
-	    && ((ki->kaio_buffer_count == 0) && (ki->kaio_queue_count == 0)))) {
-		ki->kaio_flags &= ~KAIO_WAKEUP;
-		wakeup(p);
-	}
+	ki->kaio_count--;
+	MPASS(ki->kaio_count >= 0);
 
-	if (aiocbe->jobstate == JOBST_JOBQBUF) {
-		if ((error = aio_fphysio(aiocbe)) != 0)
-			return (error);
-		if (aiocbe->jobstate != JOBST_JOBBFINISHED)
-			panic("aio_free_entry: invalid physio finish-up state");
-		s = splbio();
-		TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
-		splx(s);
-	} else if (aiocbe->jobstate == JOBST_JOBQGLOBAL) {
-		s = splnet();
-		TAILQ_REMOVE(&aio_jobs, aiocbe, list);
-		TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
-		splx(s);
-	} else if (aiocbe->jobstate == JOBST_JOBFINISHED)
-		TAILQ_REMOVE(&ki->kaio_jobdone, aiocbe, plist);
-	else if (aiocbe->jobstate == JOBST_JOBBFINISHED) {
-		s = splbio();
-		TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist);
-		splx(s);
-		if (aiocbe->bp) {
-			vunmapbuf(aiocbe->bp);
-			relpbuf(aiocbe->bp, NULL);
-			aiocbe->bp = NULL;
+	TAILQ_REMOVE(&ki->kaio_done, aiocbe, plist);
+	TAILQ_REMOVE(&ki->kaio_all, aiocbe, allist);
+
+	lj = aiocbe->lio;
+	if (lj) {
+		lj->lioj_count--;
+		lj->lioj_finished_count--;
+
+		if (lj->lioj_count == 0) {
+			TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
+			/* lio is going away, we need to destroy any knotes */
+			knlist_delete(&lj->klist, curthread, 1);
+			PROC_LOCK(p);
+			sigqueue_take(&lj->lioj_ksi);
+			PROC_UNLOCK(p);
+			uma_zfree(aiolio_zone, lj);
 		}
 	}
-	if (lj && (lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) {
-		TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
-		uma_zfree(aiolio_zone, lj);
-	}
+
+	/* aiocbe is going away, we need to destroy any knotes */
+	knlist_delete(&aiocbe->klist, curthread, 1);
+	PROC_LOCK(p);
+	sigqueue_take(&aiocbe->ksi);
+	PROC_UNLOCK(p);
+
+	MPASS(aiocbe->bp == NULL);
 	aiocbe->jobstate = JOBST_NULL;
+	AIO_UNLOCK(ki);
+
+	/*
+	 * The thread argument here is used to find the owning process
+	 * and is also passed to fo_close() which may pass it to various
+	 * places such as devsw close() routines.  Because of that, we
+	 * need a thread pointer from the process owning the job that is
+	 * persistent and won't disappear out from under us or move to
+	 * another process.
+	 *
+	 * Currently, all the callers of this function call it to remove
+	 * an aiocblist from the current process' job list either via a
+	 * syscall or due to the current process calling exit() or
+	 * execve().  Thus, we know that p == curproc.  We also know that
+	 * curthread can't exit since we are curthread.
+	 *
+	 * Therefore, we use curthread as the thread to pass to
+	 * knlist_delete().  This does mean that it is possible for the
+	 * thread pointer at close time to differ from the thread pointer
+	 * at open time, but this is already true of file descriptors in
+	 * a multithreaded process.
+	 */
 	fdrop(aiocbe->fd_file, curthread);
 	crfree(aiocbe->cred);
 	uma_zfree(aiocb_zone, aiocbe);
+	AIO_LOCK(ki);
+
 	return (0);
 }
 
+static void
+aio_proc_rundown_exec(void *arg, struct proc *p, struct image_params *imgp __unused)
+{
+   	aio_proc_rundown(arg, p);
+}
+
 /*
  * Rundown the jobs for a given process.
  */
 static void
 aio_proc_rundown(void *arg, struct proc *p)
 {
-	int s;
 	struct kaioinfo *ki;
-	struct aio_liojob *lj, *ljn;
-	struct aiocblist *aiocbe, *aiocbn;
+	struct aioliojob *lj;
+	struct aiocblist *cbe, *cbn;
 	struct file *fp;
 	struct socket *so;
+	int remove;
 
+	KASSERT(curthread->td_proc == p,
+	    ("%s: called on non-curproc", __func__));
 	ki = p->p_aioinfo;
 	if (ki == NULL)
 		return;
 
-	mtx_lock(&Giant);
-	ki->kaio_flags |= LIOJ_SIGNAL_POSTED;
-	while ((ki->kaio_active_count > 0) || (ki->kaio_buffer_count >
-	    ki->kaio_buffer_finished_count)) {
-		ki->kaio_flags |= KAIO_RUNDOWN;
-		if (tsleep(p, PRIBIO, "kaiowt", aiod_timeout))
-			break;
-	}
+	AIO_LOCK(ki);
+	ki->kaio_flags |= KAIO_RUNDOWN;
+
+restart:
 
 	/*
-	 * Move any aio ops that are waiting on socket I/O to the normal job
-	 * queues so they are cleaned up with any others.
+	 * Try to cancel all pending requests. This code simulates
+	 * aio_cancel on all pending I/O requests.
 	 */
-	s = splnet();
-	for (aiocbe = TAILQ_FIRST(&ki->kaio_sockqueue); aiocbe; aiocbe =
-	    aiocbn) {
-		aiocbn = TAILQ_NEXT(aiocbe, plist);
-		fp = aiocbe->fd_file;
-		if (fp != NULL) {
+	TAILQ_FOREACH_SAFE(cbe, &ki->kaio_jobqueue, plist, cbn) {
+		remove = 0;
+		mtx_lock(&aio_job_mtx);
+		if (cbe->jobstate == JOBST_JOBQGLOBAL) {
+			TAILQ_REMOVE(&aio_jobs, cbe, list);
+			remove = 1;
+		} else if (cbe->jobstate == JOBST_JOBQSOCK) {
+			fp = cbe->fd_file;
+			MPASS(fp->f_type == DTYPE_SOCKET);
 			so = fp->f_data;
-			TAILQ_REMOVE(&so->so_aiojobq, aiocbe, list);
-			if (TAILQ_EMPTY(&so->so_aiojobq)) {
-				SOCKBUF_LOCK(&so->so_snd);
-				so->so_snd.sb_flags &= ~SB_AIO;
-				SOCKBUF_UNLOCK(&so->so_snd);
-				SOCKBUF_LOCK(&so->so_rcv);
-				so->so_rcv.sb_flags &= ~SB_AIO;
-				SOCKBUF_UNLOCK(&so->so_rcv);
-			}
+			TAILQ_REMOVE(&so->so_aiojobq, cbe, list);
+			remove = 1;
+		} else if (cbe->jobstate == JOBST_JOBQSYNC) {
+			TAILQ_REMOVE(&ki->kaio_syncqueue, cbe, list);
+			remove = 1;
+		}
+		mtx_unlock(&aio_job_mtx);
+
+		if (remove) {
+			cbe->jobstate = JOBST_JOBFINISHED;
+			cbe->uaiocb._aiocb_private.status = -1;
+			cbe->uaiocb._aiocb_private.error = ECANCELED;
+			TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist);
+			aio_bio_done_notify(p, cbe, DONE_QUEUE);
 		}
-		TAILQ_REMOVE(&ki->kaio_sockqueue, aiocbe, plist);
-		TAILQ_INSERT_HEAD(&aio_jobs, aiocbe, list);
-		TAILQ_INSERT_HEAD(&ki->kaio_jobqueue, aiocbe, plist);
-	}
-	splx(s);
-
-restart1:
-	for (aiocbe = TAILQ_FIRST(&ki->kaio_jobdone); aiocbe; aiocbe = aiocbn) {
-		aiocbn = TAILQ_NEXT(aiocbe, plist);
-		if (aio_free_entry(aiocbe))
-			goto restart1;
 	}
 
-restart2:
-	for (aiocbe = TAILQ_FIRST(&ki->kaio_jobqueue); aiocbe; aiocbe =
-	    aiocbn) {
-		aiocbn = TAILQ_NEXT(aiocbe, plist);
-		if (aio_free_entry(aiocbe))
-			goto restart2;
-	}
-
-/*
- * Note the use of lots of splbio here, trying to avoid splbio for long chains
- * of I/O.  Probably unnecessary.
- */
-restart3:
-	s = splbio();
-	while (TAILQ_FIRST(&ki->kaio_bufqueue)) {
+	/* Wait for all running I/O to be finished */
+	if (TAILQ_FIRST(&ki->kaio_bufqueue) ||
+	    TAILQ_FIRST(&ki->kaio_jobqueue)) {
 		ki->kaio_flags |= KAIO_WAKEUP;
-		tsleep(p, PRIBIO, "aioprn", 0);
-		splx(s);
-		goto restart3;
-	}
-	splx(s);
-
-restart4:
-	s = splbio();
-	for (aiocbe = TAILQ_FIRST(&ki->kaio_bufdone); aiocbe; aiocbe = aiocbn) {
-		aiocbn = TAILQ_NEXT(aiocbe, plist);
-		if (aio_free_entry(aiocbe)) {
-			splx(s);
-			goto restart4;
-		}
+		msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO, "aioprn", hz);
+		goto restart;
 	}
-	splx(s);
 
-	/*
-	 * If we've slept, jobs might have moved from one queue to another.
-	 * Retry rundown if we didn't manage to empty the queues.
-	 */
-	if (TAILQ_FIRST(&ki->kaio_jobdone) != NULL ||
-	    TAILQ_FIRST(&ki->kaio_jobqueue) != NULL ||
-	    TAILQ_FIRST(&ki->kaio_bufqueue) != NULL ||
-	    TAILQ_FIRST(&ki->kaio_bufdone) != NULL)
-		goto restart1;
-
-	for (lj = TAILQ_FIRST(&ki->kaio_liojoblist); lj; lj = ljn) {
-		ljn = TAILQ_NEXT(lj, lioj_list);
-		if ((lj->lioj_buffer_count == 0) && (lj->lioj_queue_count ==
-		    0)) {
+	/* Free all completed I/O requests. */
+	while ((cbe = TAILQ_FIRST(&ki->kaio_done)) != NULL)
+		aio_free_entry(cbe);
+
+	while ((lj = TAILQ_FIRST(&ki->kaio_liojoblist)) != NULL) {
+		if (lj->lioj_count == 0) {
 			TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
+			knlist_delete(&lj->klist, curthread, 1);
+			PROC_LOCK(p);
+			sigqueue_take(&lj->lioj_ksi);
+			PROC_UNLOCK(p);
 			uma_zfree(aiolio_zone, lj);
 		} else {
-#ifdef DIAGNOSTIC
-			printf("LIO job not cleaned up: B:%d, BF:%d, Q:%d, "
-			    "QF:%d\n", lj->lioj_buffer_count,
-			    lj->lioj_buffer_finished_count,
-			    lj->lioj_queue_count,
-			    lj->lioj_queue_finished_count);
-#endif
+			panic("LIO job not cleaned up: C:%d, FC:%d\n",
+			    lj->lioj_count, lj->lioj_finished_count);
 		}
 	}
-
+	AIO_UNLOCK(ki);
+	taskqueue_drain(taskqueue_aiod_bio, &ki->kaio_task);
+	mtx_destroy(&ki->kaio_mtx);
 	uma_zfree(kaio_zone, ki);
 	p->p_aioinfo = NULL;
-	mtx_unlock(&Giant);
 }
 
 /*
@@ -661,26 +730,53 @@
 static struct aiocblist *
 aio_selectjob(struct aiothreadlist *aiop)
 {
-	int s;
 	struct aiocblist *aiocbe;
 	struct kaioinfo *ki;
 	struct proc *userp;
 
-	s = splnet();
-	for (aiocbe = TAILQ_FIRST(&aio_jobs); aiocbe; aiocbe =
-	    TAILQ_NEXT(aiocbe, list)) {
+	mtx_assert(&aio_job_mtx, MA_OWNED);
+	TAILQ_FOREACH(aiocbe, &aio_jobs, list) {
 		userp = aiocbe->userproc;
 		ki = userp->p_aioinfo;
 
 		if (ki->kaio_active_count < ki->kaio_maxactive_count) {
 			TAILQ_REMOVE(&aio_jobs, aiocbe, list);
-			splx(s);
-			return (aiocbe);
+			/* Account for currently active jobs. */
+			ki->kaio_active_count++;
+			aiocbe->jobstate = JOBST_JOBRUNNING;
+			break;
 		}
 	}
-	splx(s);
+	return (aiocbe);
+}
+
+/*
+ *  Move all data to a permanent storage device; this code
+ *  simulates the fsync syscall.
+ */
+static int
+aio_fsync_vnode(struct thread *td, struct vnode *vp)
+{
+	struct mount *mp;
+	int vfslocked;
+	int error;
 
-	return (NULL);
+	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
+	if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0)
+		goto drop;
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	if (vp->v_object != NULL) {
+		VM_OBJECT_LOCK(vp->v_object);
+		vm_object_page_clean(vp->v_object, 0, 0, 0);
+		VM_OBJECT_UNLOCK(vp->v_object);
+	}
+	error = VOP_FSYNC(vp, MNT_WAIT, td);
+
+	VOP_UNLOCK(vp, 0, td);
+	vn_finished_write(mp);
+drop:
+	VFS_UNLOCK_GIANT(vfslocked);
+	return (error);
 }
 
 /*
@@ -688,15 +784,17 @@
  * the non-physio version of the operations.  The normal vn operations are used,
  * and this code should work in all instances for every type of file, including
  * pipes, sockets, fifos, and regular files.
+ *
+ * XXX I don't think it works well for sockets, pipes, and fifos.
  */
 static void
 aio_process(struct aiocblist *aiocbe)
 {
 	struct ucred *td_savedcred;
 	struct thread *td;
-	struct proc *mycp;
 	struct aiocb *cb;
 	struct file *fp;
+	struct socket *so;
 	struct uio auio;
 	struct iovec aiov;
 	int cnt;
@@ -707,10 +805,20 @@
 	td = curthread;
 	td_savedcred = td->td_ucred;
 	td->td_ucred = aiocbe->cred;
-	mycp = td->td_proc;
 	cb = &aiocbe->uaiocb;
 	fp = aiocbe->fd_file;
 
+	if (cb->aio_lio_opcode == LIO_SYNC) {
+		error = 0;
+		cnt = 0;
+		if (fp->f_vnode != NULL)
+			error = aio_fsync_vnode(td, fp->f_vnode);
+		cb->_aiocb_private.error = error;
+		cb->_aiocb_private.status = 0;
+		td->td_ucred = td_savedcred;
+		return;
+	}
+
 	aiov.iov_base = (void *)(uintptr_t)cb->aio_buf;
 	aiov.iov_len = cb->aio_nbytes;
 
@@ -722,21 +830,26 @@
 	auio.uio_segflg = UIO_USERSPACE;
 	auio.uio_td = td;
 
-	inblock_st = mycp->p_stats->p_ru.ru_inblock;
-	oublock_st = mycp->p_stats->p_ru.ru_oublock;
+	inblock_st = td->td_ru.ru_inblock;
+	oublock_st = td->td_ru.ru_oublock;
 	/*
-	 * _aio_aqueue() acquires a reference to the file that is
+	 * aio_aqueue() acquires a reference to the file that is
 	 * released in aio_free_entry().
 	 */
 	if (cb->aio_lio_opcode == LIO_READ) {
 		auio.uio_rw = UIO_READ;
-		error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, td);
+		if (auio.uio_resid == 0)
+			error = 0;
+		else
+			error = fo_read(fp, &auio, fp->f_cred, FOF_OFFSET, td);
 	} else {
+		if (fp->f_type == DTYPE_VNODE)
+			bwillwrite();
 		auio.uio_rw = UIO_WRITE;
 		error = fo_write(fp, &auio, fp->f_cred, FOF_OFFSET, td);
 	}
-	inblock_end = mycp->p_stats->p_ru.ru_inblock;
-	oublock_end = mycp->p_stats->p_ru.ru_oublock;
+	inblock_end = td->td_ru.ru_inblock;
+	oublock_end = td->td_ru.ru_oublock;
 
 	aiocbe->inputcharge = inblock_end - inblock_st;
 	aiocbe->outputcharge = oublock_end - oublock_st;
@@ -745,9 +858,17 @@
 		if (error == ERESTART || error == EINTR || error == EWOULDBLOCK)
 			error = 0;
 		if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) {
-			PROC_LOCK(aiocbe->userproc);
-			psignal(aiocbe->userproc, SIGPIPE);
-			PROC_UNLOCK(aiocbe->userproc);
+			int sigpipe = 1;
+			if (fp->f_type == DTYPE_SOCKET) {
+				so = fp->f_data;
+				if (so->so_options & SO_NOSIGPIPE)
+					sigpipe = 0;
+			}
+			if (sigpipe) {
+				PROC_LOCK(aiocbe->userproc);
+				psignal(aiocbe->userproc, SIGPIPE);
+				PROC_UNLOCK(aiocbe->userproc);
+			}
 		}
 	}
 
@@ -757,24 +878,90 @@
 	td->td_ucred = td_savedcred;
 }
 
+static void
+aio_bio_done_notify(struct proc *userp, struct aiocblist *aiocbe, int type)
+{
+	struct aioliojob *lj;
+	struct kaioinfo *ki;
+	struct aiocblist *scb, *scbn;
+	int lj_done;
+
+	ki = userp->p_aioinfo;
+	AIO_LOCK_ASSERT(ki, MA_OWNED);
+	lj = aiocbe->lio;
+	lj_done = 0;
+	if (lj) {
+		lj->lioj_finished_count++;
+		if (lj->lioj_count == lj->lioj_finished_count)
+			lj_done = 1;
+	}
+	if (type == DONE_QUEUE) {
+		aiocbe->jobflags |= AIOCBLIST_DONE;
+	} else {
+		aiocbe->jobflags |= AIOCBLIST_BUFDONE;
+	}
+	TAILQ_INSERT_TAIL(&ki->kaio_done, aiocbe, plist);
+	aiocbe->jobstate = JOBST_JOBFINISHED;
+
+	if (ki->kaio_flags & KAIO_RUNDOWN)
+		goto notification_done;
+
+	if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ||
+	    aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID)
+		aio_sendsig(userp, &aiocbe->uaiocb.aio_sigevent, &aiocbe->ksi);
+
+	KNOTE_LOCKED(&aiocbe->klist, 1);
+
+	if (lj_done) {
+		if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
+			lj->lioj_flags |= LIOJ_KEVENT_POSTED;
+			KNOTE_LOCKED(&lj->klist, 1);
+		}
+		if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED))
+		    == LIOJ_SIGNAL
+		    && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
+		        lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) {
+			aio_sendsig(userp, &lj->lioj_signal, &lj->lioj_ksi);
+			lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
+		}
+	}
+
+notification_done:
+	if (aiocbe->jobflags & AIOCBLIST_CHECKSYNC) {
+		TAILQ_FOREACH_SAFE(scb, &ki->kaio_syncqueue, list, scbn) {
+			if (aiocbe->fd_file == scb->fd_file &&
+			    aiocbe->seqno < scb->seqno) {
+				if (--scb->pending == 0) {
+					mtx_lock(&aio_job_mtx);
+					scb->jobstate = JOBST_JOBQGLOBAL;
+					TAILQ_REMOVE(&ki->kaio_syncqueue, scb, list);
+					TAILQ_INSERT_TAIL(&aio_jobs, scb, list);
+					aio_kick_nowait(userp);
+					mtx_unlock(&aio_job_mtx);
+				}
+			}
+		}
+	}
+	if (ki->kaio_flags & KAIO_WAKEUP) {
+		ki->kaio_flags &= ~KAIO_WAKEUP;
+		wakeup(&userp->p_aioinfo);
+	}
+}
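
aio_bio_done_notify() is now the single completion funnel: it moves the job
to kaio_done, posts the per-job knote, delivers SIGEV_SIGNAL/SIGEV_THREAD_ID
notifications via aio_sendsig(), completes lio batches, and releases
aio_fsync() jobs waiting on kaio_syncqueue.  The most direct way to observe
it from userland is kqueue delivery; a minimal sketch using the standard
EVFILT_AIO plumbing, with error handling omitted:

    #include <sys/types.h>
    #include <sys/event.h>
    #include <aio.h>
    #include <string.h>

    /* Issue one async read and block on its EVFILT_AIO completion event. */
    static ssize_t
    read_async(int fd, void *buf, size_t len, int kq)
    {
    	struct aiocb cb;
    	struct kevent ev;

    	memset(&cb, 0, sizeof(cb));
    	cb.aio_fildes = fd;
    	cb.aio_buf = buf;
    	cb.aio_nbytes = len;
    	cb.aio_sigevent.sigev_notify = SIGEV_KEVENT;
    	cb.aio_sigevent.sigev_notify_kqueue = kq;
    	cb.aio_sigevent.sigev_value.sival_ptr = &cb;

    	if (aio_read(&cb) == -1)
    		return (-1);
    	/* ev.udata carries sigev_value.sival_ptr, i.e. &cb. */
    	if (kevent(kq, NULL, 0, &ev, 1, NULL) == -1)
    		return (-1);
    	return (aio_return(&cb));
    }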
+
 /*
  * The AIO daemon, most of the actual work is done in aio_process,
  * but the setup (and address space mgmt) is done in this routine.
  */
 static void
-aio_daemon(void *uproc)
+aio_daemon(void *_id)
 {
-	int s;
-	struct aio_liojob *lj;
-	struct aiocb *cb;
 	struct aiocblist *aiocbe;
 	struct aiothreadlist *aiop;
 	struct kaioinfo *ki;
 	struct proc *curcp, *mycp, *userp;
 	struct vmspace *myvm, *tmpvm;
 	struct thread *td = curthread;
-	struct pgrp *newpgrp;
-	struct session *newsess;
+	int id = (intptr_t)_id;
 
 	/*
 	 * Local copies of curproc (cp) and vmspace (myvm)
@@ -790,32 +977,18 @@
 	 */
 	aiop = uma_zalloc(aiop_zone, M_WAITOK);
 	aiop->aiothread = td;
-	aiop->aiothreadflags |= AIOP_FREE;
-
-	/*
-	 * Place thread (lightweight process) onto the AIO free thread list.
-	 */
-	mtx_lock(&aio_freeproc_mtx);
-	TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
-	mtx_unlock(&aio_freeproc_mtx);
+	aiop->aiothreadflags = 0;
 
 	/* The daemon resides in its own pgrp. */
-	MALLOC(newpgrp, struct pgrp *, sizeof(struct pgrp), M_PGRP,
-		M_WAITOK | M_ZERO);
-	MALLOC(newsess, struct session *, sizeof(struct session), M_SESSION,
-		M_WAITOK | M_ZERO);
-
-	sx_xlock(&proctree_lock);
-	enterpgrp(mycp, mycp->p_pid, newpgrp, newsess);
-	sx_xunlock(&proctree_lock);
-	mtx_lock(&Giant);
+	setsid(td, NULL);
 
 	/*
 	 * Wakeup parent process.  (Parent sleeps to keep from blasting away
 	 * and creating too many daemons.)
 	 */
-	wakeup(mycp);
+	sema_post(&aio_newproc_sem);
 
+	mtx_lock(&aio_job_mtx);
 	for (;;) {
 		/*
 		 * curcp is the current daemon process context.
@@ -826,22 +999,18 @@
 		/*
 		 * Take daemon off of free queue
 		 */
-		mtx_lock(&aio_freeproc_mtx);
 		if (aiop->aiothreadflags & AIOP_FREE) {
 			TAILQ_REMOVE(&aio_freeproc, aiop, list);
 			aiop->aiothreadflags &= ~AIOP_FREE;
 		}
-		mtx_unlock(&aio_freeproc_mtx);
 
 		/*
 		 * Check for jobs.
 		 */
 		while ((aiocbe = aio_selectjob(aiop)) != NULL) {
-			cb = &aiocbe->uaiocb;
+			mtx_unlock(&aio_job_mtx);
 			userp = aiocbe->userproc;
 
-			aiocbe->jobstate = JOBST_JOBRUNNING;
-
 			/*
 			 * Connect to process address space for user program.
 			 */
@@ -875,71 +1044,30 @@
 			}
 
 			ki = userp->p_aioinfo;
-			lj = aiocbe->lio;
-
-			/* Account for currently active jobs. */
-			ki->kaio_active_count++;
 
 			/* Do the I/O function. */
 			aio_process(aiocbe);
 
+			mtx_lock(&aio_job_mtx);
 			/* Decrement the active job count. */
 			ki->kaio_active_count--;
+			mtx_unlock(&aio_job_mtx);
 
-			/*
-			 * Increment the completion count for wakeup/signal
-			 * comparisons.
-			 */
-			aiocbe->jobflags |= AIOCBLIST_DONE;
-			ki->kaio_queue_finished_count++;
-			if (lj)
-				lj->lioj_queue_finished_count++;
-			if ((ki->kaio_flags & KAIO_WAKEUP) || ((ki->kaio_flags
-			    & KAIO_RUNDOWN) && (ki->kaio_active_count == 0))) {
-				ki->kaio_flags &= ~KAIO_WAKEUP;
-				wakeup(userp);
-			}
-
-			s = splbio();
-			if (lj && (lj->lioj_flags &
-			    (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == LIOJ_SIGNAL) {
-				if ((lj->lioj_queue_finished_count ==
-				    lj->lioj_queue_count) &&
-				    (lj->lioj_buffer_finished_count ==
-				    lj->lioj_buffer_count)) {
-					PROC_LOCK(userp);
-					psignal(userp,
-					    lj->lioj_signal.sigev_signo);
-					PROC_UNLOCK(userp);
-					lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
-				}
-			}
-			splx(s);
-
-			aiocbe->jobstate = JOBST_JOBFINISHED;
-
-			s = splnet();
+			AIO_LOCK(ki);
 			TAILQ_REMOVE(&ki->kaio_jobqueue, aiocbe, plist);
-			TAILQ_INSERT_TAIL(&ki->kaio_jobdone, aiocbe, plist);
-			splx(s);
-			KNOTE_UNLOCKED(&aiocbe->klist, 0);
-
-			if (aiocbe->jobflags & AIOCBLIST_RUNDOWN) {
-				wakeup(aiocbe);
-				aiocbe->jobflags &= ~AIOCBLIST_RUNDOWN;
-			}
+			aio_bio_done_notify(userp, aiocbe, DONE_QUEUE);
+			AIO_UNLOCK(ki);
 
-			if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
-				PROC_LOCK(userp);
-				psignal(userp, cb->aio_sigevent.sigev_signo);
-				PROC_UNLOCK(userp);
-			}
+			mtx_lock(&aio_job_mtx);
 		}
 
 		/*
 		 * Disconnect from user address space.
 		 */
 		if (curcp != mycp) {
+
+			mtx_unlock(&aio_job_mtx);
+
 			/* Get the user address space to disconnect from. */
 			tmpvm = mycp->p_vmspace;
 
@@ -958,9 +1086,18 @@
 			vmspace_free(tmpvm);
 
 			curcp = mycp;
+
+			mtx_lock(&aio_job_mtx);
+			/*
+			 * We have to restart to avoid race, we only sleep if
+			 * no job can be selected, that should be
+			 * curcp == mycp.
+			 */
+			continue;
 		}
 
-		mtx_lock(&aio_freeproc_mtx);
+		mtx_assert(&aio_job_mtx, MA_OWNED);
+
 		TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list);
 		aiop->aiothreadflags |= AIOP_FREE;
 
@@ -968,18 +1105,16 @@
 		 * If daemon is inactive for a long time, allow it to exit,
 		 * thereby freeing resources.
 		 */
-		if (msleep(aiop->aiothread, &aio_freeproc_mtx, PDROP | PRIBIO,
-		    "aiordy", aiod_lifetime)) {
-			s = splnet();
+		if (msleep(aiop->aiothread, &aio_job_mtx, PRIBIO, "aiordy",
+		    aiod_lifetime)) {
 			if (TAILQ_EMPTY(&aio_jobs)) {
-				mtx_lock(&aio_freeproc_mtx);
 				if ((aiop->aiothreadflags & AIOP_FREE) &&
 				    (num_aio_procs > target_aio_procs)) {
 					TAILQ_REMOVE(&aio_freeproc, aiop, list);
-					mtx_unlock(&aio_freeproc_mtx);
-					splx(s);
-					uma_zfree(aiop_zone, aiop);
 					num_aio_procs--;
+					mtx_unlock(&aio_job_mtx);
+					uma_zfree(aiop_zone, aiop);
+					free_unr(aiod_unr, id);
 #ifdef DIAGNOSTIC
 					if (mycp->p_vmspace->vm_refcnt <= 1) {
 						printf("AIOD: bad vm refcnt for"
@@ -989,36 +1124,40 @@
 #endif
 					kthread_exit(0);
 				}
-				mtx_unlock(&aio_freeproc_mtx);
 			}
-			splx(s);
 		}
 	}
+	mtx_unlock(&aio_job_mtx);
+	panic("shouldn't be here\n");
 }
 
 /*
- * Create a new AIO daemon.  This is mostly a kernel-thread fork routine.  The
+ * Create a new AIO daemon. This is mostly a kernel-thread fork routine. The
  * AIO daemon modifies its environment itself.
  */
 static int
-aio_newproc(void)
+aio_newproc(int *start)
 {
 	int error;
 	struct proc *p;
+	int id;
 
-	error = kthread_create(aio_daemon, curproc, &p, RFNOWAIT, 0, "aiod%d",
-	    num_aio_procs);
-	if (error)
-		return (error);
-
-	/*
-	 * Wait until daemon is started, but continue on just in case to
-	 * handle error conditions.
-	 */
-	error = tsleep(p, PZERO, "aiosta", aiod_timeout);
-
-	num_aio_procs++;
-
+	id = alloc_unr(aiod_unr);
+	error = kthread_create(aio_daemon, (void *)(intptr_t)id, &p,
+		RFNOWAIT, 0, "aiod%d", id);
+	if (error == 0) {
+		/*
+		 * Wait until daemon is started.
+		 */
+		sema_wait(&aio_newproc_sem);
+		mtx_lock(&aio_job_mtx);
+		num_aio_procs++;
+		if (start != NULL)
+			(*start)--;
+		mtx_unlock(&aio_job_mtx);
+	} else {
+		free_unr(aiod_unr, id);
+	}
 	return (error);
 }
 
@@ -1027,22 +1166,20 @@
  * VCHR devices.  This method doesn't use an aio helper thread, and
  * thus has very low overhead.
  *
- * Assumes that the caller, _aio_aqueue(), has incremented the file
+ * Assumes that the caller, aio_aqueue(), has incremented the file
  * structure's reference count, preventing its deallocation for the
  * duration of this call.
  */
 static int
 aio_qphysio(struct proc *p, struct aiocblist *aiocbe)
 {
-	int error;
 	struct aiocb *cb;
 	struct file *fp;
 	struct buf *bp;
 	struct vnode *vp;
 	struct kaioinfo *ki;
-	struct aio_liojob *lj;
-	int s;
-	int notify;
+	struct aioliojob *lj;
+	int error;
 
 	cb = &aiocbe->uaiocb;
 	fp = aiocbe->fd_file;
@@ -1070,6 +1207,9 @@
  	if (cb->aio_nbytes % vp->v_bufobj.bo_bsize)
 		return (-1);
 
+	if (cb->aio_nbytes > vp->v_rdev->si_iosize_max)
+		return (-1);
+
 	if (cb->aio_nbytes >
 	    MAXPHYS - (((vm_offset_t) cb->aio_buf) & PAGE_MASK))
 		return (-1);
@@ -1078,16 +1218,18 @@
 	if (ki->kaio_buffer_count >= ki->kaio_ballowed_count)
 		return (-1);
 
-	ki->kaio_buffer_count++;
-
-	lj = aiocbe->lio;
-	if (lj)
-		lj->lioj_buffer_count++;
-
 	/* Create and build a buffer header for a transfer. */
 	bp = (struct buf *)getpbuf(NULL);
 	BUF_KERNPROC(bp);
 
+	AIO_LOCK(ki);
+	ki->kaio_count++;
+	ki->kaio_buffer_count++;
+	lj = aiocbe->lio;
+	if (lj)
+		lj->lioj_count++;
+	AIO_UNLOCK(ki);
+
 	/*
 	 * Get a copy of the kva from the physical buffer.
 	 */
@@ -1111,96 +1253,34 @@
 		goto doerror;
 	}
 
-	s = splbio();
+	AIO_LOCK(ki);
 	aiocbe->bp = bp;
 	bp->b_caller1 = (void *)aiocbe;
 	TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist);
+	TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist);
 	aiocbe->jobstate = JOBST_JOBQBUF;
 	cb->_aiocb_private.status = cb->aio_nbytes;
-	num_buf_aio++;
+	AIO_UNLOCK(ki);
+
+	atomic_add_int(&num_queue_count, 1);
+	atomic_add_int(&num_buf_aio, 1);
+
 	bp->b_error = 0;
 
-	splx(s);
+	TASK_INIT(&aiocbe->biotask, 0, biohelper, aiocbe);
 
 	/* Perform transfer. */
 	dev_strategy(vp->v_rdev, bp);
-
-	notify = 0;
-	s = splbio();
-
-	/*
-	 * If we had an error invoking the request, or an error in processing
-	 * the request before we have returned, we process it as an error in
-	 * transfer.  Note that such an I/O error is not indicated immediately,
-	 * but is returned using the aio_error mechanism.  In this case,
-	 * aio_suspend will return immediately.
-	 */
-	if (bp->b_error || (bp->b_ioflags & BIO_ERROR)) {
-		struct aiocb *job = aiocbe->uuaiocb;
-
-		aiocbe->uaiocb._aiocb_private.status = 0;
-		suword(&job->_aiocb_private.status, 0);
-		aiocbe->uaiocb._aiocb_private.error = bp->b_error;
-		suword(&job->_aiocb_private.error, bp->b_error);
-
-		ki->kaio_buffer_finished_count++;
-
-		if (aiocbe->jobstate != JOBST_JOBBFINISHED) {
-			aiocbe->jobstate = JOBST_JOBBFINISHED;
-			aiocbe->jobflags |= AIOCBLIST_DONE;
-			TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
-			TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
-			notify = 1;
-		}
-	}
-	splx(s);
-	if (notify)
-		KNOTE_UNLOCKED(&aiocbe->klist, 0);
 	return (0);
 
 doerror:
+	AIO_LOCK(ki);
+	ki->kaio_count--;
 	ki->kaio_buffer_count--;
 	if (lj)
-		lj->lioj_buffer_count--;
+		lj->lioj_count--;
 	aiocbe->bp = NULL;
-	relpbuf(bp, NULL);
-	return (error);
-}
-
-/*
- * This waits/tests physio completion.
- */
-static int
-aio_fphysio(struct aiocblist *iocb)
-{
-	int s;
-	struct buf *bp;
-	int error;
-
-	bp = iocb->bp;
-
-	s = splbio();
-	while ((bp->b_flags & B_DONE) == 0) {
-		if (tsleep(bp, PRIBIO, "physstr", aiod_timeout)) {
-			if ((bp->b_flags & B_DONE) == 0) {
-				splx(s);
-				return (EINPROGRESS);
-			} else
-				break;
-		}
-	}
-	splx(s);
-
-	/* Release mapping into kernel space. */
-	vunmapbuf(bp);
-	iocb->bp = 0;
-
-	error = 0;
-
-	/* Check for an error. */
-	if (bp->b_ioflags & BIO_ERROR)
-		error = bp->b_error;
-
+	AIO_UNLOCK(ki);
 	relpbuf(bp, NULL);
 	return (error);
 }
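
[Context note] Note the accounting split used here and throughout the new code: per-process counters (kaio_count, kaio_buffer_count, lioj_count) are only touched while holding the lock that AIO_LOCK()/AIO_UNLOCK() wrap, while the global totals (num_queue_count, num_buf_aio) are updated with atomic(9) operations, so no global mutex sits on the submission path. The idiom, reduced to a sketch with invented names:

	#include <sys/param.h>
	#include <sys/lock.h>
	#include <sys/mutex.h>
	#include <machine/atomic.h>

	static int global_jobs;			/* hypothetical global counter */

	struct per_proc_aio {
		struct mtx	lock;		/* role of the per-process kaio lock */
		int		jobs;		/* per-process counter */
	};

	static void
	account_job(struct per_proc_aio *pa)
	{
		mtx_lock(&pa->lock);
		pa->jobs++;			/* protected by the per-process lock */
		mtx_unlock(&pa->lock);
		atomic_add_int(&global_jobs, 1);	/* lock-free global total */
	}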
@@ -1211,94 +1291,106 @@
 static void
 aio_swake_cb(struct socket *so, struct sockbuf *sb)
 {
-	struct aiocblist *cb,*cbn;
-	struct proc *p;
-	struct kaioinfo *ki = NULL;
-	int opcode, wakecount = 0;
-	struct aiothreadlist *aiop;
+	struct aiocblist *cb, *cbn;
+	int opcode;
 
-	if (sb == &so->so_snd) {
+	if (sb == &so->so_snd)
 		opcode = LIO_WRITE;
-		SOCKBUF_LOCK(&so->so_snd);
-		so->so_snd.sb_flags &= ~SB_AIO;
-		SOCKBUF_UNLOCK(&so->so_snd);
-	} else {
+	else
 		opcode = LIO_READ;
-		SOCKBUF_LOCK(&so->so_rcv);
-		so->so_rcv.sb_flags &= ~SB_AIO;
-		SOCKBUF_UNLOCK(&so->so_rcv);
-	}
 
-	for (cb = TAILQ_FIRST(&so->so_aiojobq); cb; cb = cbn) {
-		cbn = TAILQ_NEXT(cb, list);
+	SOCKBUF_LOCK(sb);
+	sb->sb_flags &= ~SB_AIO;
+	mtx_lock(&aio_job_mtx);
+	TAILQ_FOREACH_SAFE(cb, &so->so_aiojobq, list, cbn) {
 		if (opcode == cb->uaiocb.aio_lio_opcode) {
-			p = cb->userproc;
-			ki = p->p_aioinfo;
+			if (cb->jobstate != JOBST_JOBQSOCK)
+				panic("invalid queue value");
+			/* XXX
+			 * We don't have actual sockets backend yet,
+			 * so we simply move the requests to the generic
+			 * file I/O backend.
+			 */
 			TAILQ_REMOVE(&so->so_aiojobq, cb, list);
-			TAILQ_REMOVE(&ki->kaio_sockqueue, cb, plist);
 			TAILQ_INSERT_TAIL(&aio_jobs, cb, list);
-			TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, cb, plist);
-			wakecount++;
-			if (cb->jobstate != JOBST_JOBQGLOBAL)
-				panic("invalid queue value");
-		}
-	}
-
-	while (wakecount--) {
-		mtx_lock(&aio_freeproc_mtx);
-		if ((aiop = TAILQ_FIRST(&aio_freeproc)) != 0) {
-			TAILQ_REMOVE(&aio_freeproc, aiop, list);
-			aiop->aiothreadflags &= ~AIOP_FREE;
-			wakeup(aiop->aiothread);
+			aio_kick_nowait(cb->userproc);
 		}
-		mtx_unlock(&aio_freeproc_mtx);
 	}
+	mtx_unlock(&aio_job_mtx);
+	SOCKBUF_UNLOCK(sb);
 }
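
[Context note] The rewritten aio_swake_cb() walks the socket's job list with TAILQ_FOREACH_SAFE(), which latches the next element before the loop body runs, so the current entry can be unlinked inside the loop. A tiny self-contained example of the macro from sys/queue.h, unrelated to the AIO structures:

	#include <sys/queue.h>

	struct job {
		TAILQ_ENTRY(job) link;
		int done;
	};
	TAILQ_HEAD(jobq, job);

	static void
	reap_done(struct jobq *q)
	{
		struct job *j, *jn;

		/* The _SAFE form: unlinking 'j' does not break the iteration. */
		TAILQ_FOREACH_SAFE(j, q, link, jn) {
			if (j->done)
				TAILQ_REMOVE(q, j, link);
		}
	}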
 
 /*
  * Queue a new AIO request.  Choosing either the threaded or direct physio VCHR
  * technique is done in this code.
  */
-static int
-_aio_aqueue(struct thread *td, struct aiocb *job, struct aio_liojob *lj, int type)
+int
+aio_aqueue(struct thread *td, struct aiocb *job, struct aioliojob *lj,
+	int type, int oldsigev)
 {
 	struct proc *p = td->td_proc;
-	struct filedesc *fdp;
 	struct file *fp;
-	unsigned int fd;
 	struct socket *so;
-	int s;
-	int error;
-	int opcode;
-	struct aiocblist *aiocbe;
-	struct aiothreadlist *aiop;
+	struct aiocblist *aiocbe, *cb;
 	struct kaioinfo *ki;
 	struct kevent kev;
-	struct kqueue *kq;
-	struct file *kq_fp;
 	struct sockbuf *sb;
+	int opcode;
+	int error;
+	int fd, kqfd;
+	int jid;
 
-	aiocbe = uma_zalloc(aiocb_zone, M_WAITOK);
-	aiocbe->inputcharge = 0;
-	aiocbe->outputcharge = 0;
-	/* XXX - need a lock */
-	knlist_init(&aiocbe->klist, NULL, NULL, NULL, NULL);
+	if (p->p_aioinfo == NULL)
+		aio_init_aioinfo(p);
+
+	ki = p->p_aioinfo;
 
 	suword(&job->_aiocb_private.status, -1);
 	suword(&job->_aiocb_private.error, 0);
 	suword(&job->_aiocb_private.kernelinfo, -1);
 
-	error = copyin(job, &aiocbe->uaiocb, sizeof(aiocbe->uaiocb));
-	if (error) {
-		suword(&job->_aiocb_private.error, error);
-		uma_zfree(aiocb_zone, aiocbe);
-		return (error);
+	if (num_queue_count >= max_queue_count ||
+	    ki->kaio_count >= ki->kaio_qallowed_count) {
+		suword(&job->_aiocb_private.error, EAGAIN);
+		return (EAGAIN);
 	}
-	if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL &&
-		!_SIG_VALID(aiocbe->uaiocb.aio_sigevent.sigev_signo)) {
-		uma_zfree(aiocb_zone, aiocbe);
+
+	aiocbe = uma_zalloc(aiocb_zone, M_WAITOK | M_ZERO);
+	aiocbe->inputcharge = 0;
+	aiocbe->outputcharge = 0;
+	knlist_init(&aiocbe->klist, AIO_MTX(ki), NULL, NULL, NULL);
+
+	if (oldsigev) {
+		bzero(&aiocbe->uaiocb, sizeof(struct aiocb));
+		error = copyin(job, &aiocbe->uaiocb, sizeof(struct oaiocb));
+		bcopy(&aiocbe->uaiocb.__spare__, &aiocbe->uaiocb.aio_sigevent,
+			sizeof(struct osigevent));
+	} else {
+		error = copyin(job, &aiocbe->uaiocb, sizeof(struct aiocb));
+	}
+	if (error) {
+		suword(&job->_aiocb_private.error, error);
+		uma_zfree(aiocb_zone, aiocbe);
+		return (error);
+	}
+
+	if (aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT &&
+	    aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_SIGNAL &&
+	    aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_THREAD_ID &&
+	    aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_NONE) {
+		suword(&job->_aiocb_private.error, EINVAL);
+		uma_zfree(aiocb_zone, aiocbe);
 		return (EINVAL);
 	}
+	
+	if ((aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL ||
+	     aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_THREAD_ID) &&
+		!_SIG_VALID(aiocbe->uaiocb.aio_sigevent.sigev_signo)) {
+		uma_zfree(aiocb_zone, aiocbe);
+		return (EINVAL);
+	}
+
+	ksiginfo_init(&aiocbe->ksi);
 
 	/* Save userspace address of the job info. */
 	aiocbe->uuaiocb = job;
@@ -1308,90 +1400,72 @@
 		aiocbe->uaiocb.aio_lio_opcode = type;
 	opcode = aiocbe->uaiocb.aio_lio_opcode;
 
-	/* Get the fd info for process. */
-	fdp = p->p_fd;
-
-	/*
-	 * Range check file descriptor.
-	 */
-	FILEDESC_LOCK(fdp);
+	/* Fetch the file object for the specified file descriptor. */
 	fd = aiocbe->uaiocb.aio_fildes;
-	if (fd >= fdp->fd_nfiles) {
-		FILEDESC_UNLOCK(fdp);
+	switch (opcode) {
+	case LIO_WRITE:
+		error = fget_write(td, fd, &fp);
+		break;
+	case LIO_READ:
+		error = fget_read(td, fd, &fp);
+		break;
+	default:
+		error = fget(td, fd, &fp);
+	}
+	if (error) {
 		uma_zfree(aiocb_zone, aiocbe);
-		if (type == 0)
-			suword(&job->_aiocb_private.error, EBADF);
-		return (EBADF);
+		suword(&job->_aiocb_private.error, error);
+		return (error);
 	}
 
-	fp = aiocbe->fd_file = fdp->fd_ofiles[fd];
-	if ((fp == NULL) ||
-	    ((opcode == LIO_WRITE) && ((fp->f_flag & FWRITE) == 0)) ||
-	    ((opcode == LIO_READ) && ((fp->f_flag & FREAD) == 0))) {
-		FILEDESC_UNLOCK(fdp);
-		uma_zfree(aiocb_zone, aiocbe);
-		if (type == 0)
-			suword(&job->_aiocb_private.error, EBADF);
-		return (EBADF);
+	if (opcode == LIO_SYNC && fp->f_vnode == NULL) {
+		error = EINVAL;
+		goto aqueue_fail;
 	}
-	fhold(fp);
-	FILEDESC_UNLOCK(fdp);
 
-	if (aiocbe->uaiocb.aio_offset == -1LL) {
+	if (opcode != LIO_SYNC && aiocbe->uaiocb.aio_offset == -1LL) {
 		error = EINVAL;
 		goto aqueue_fail;
 	}
-	error = suword(&job->_aiocb_private.kernelinfo, jobrefid);
+
+	aiocbe->fd_file = fp;
+
+	mtx_lock(&aio_job_mtx);
+	jid = jobrefid++;
+	aiocbe->seqno = jobseqno++;
+	mtx_unlock(&aio_job_mtx);
+	error = suword(&job->_aiocb_private.kernelinfo, jid);
 	if (error) {
 		error = EINVAL;
 		goto aqueue_fail;
 	}
-	aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jobrefid;
-	if (jobrefid == LONG_MAX)
-		jobrefid = 1;
-	else
-		jobrefid++;
+	aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jid;
 
 	if (opcode == LIO_NOP) {
 		fdrop(fp, td);
 		uma_zfree(aiocb_zone, aiocbe);
-		if (type == 0) {
-			suword(&job->_aiocb_private.error, 0);
-			suword(&job->_aiocb_private.status, 0);
-			suword(&job->_aiocb_private.kernelinfo, 0);
-		}
 		return (0);
 	}
-	if ((opcode != LIO_READ) && (opcode != LIO_WRITE)) {
-		if (type == 0)
-			suword(&job->_aiocb_private.status, 0);
+	if ((opcode != LIO_READ) && (opcode != LIO_WRITE) &&
+	    (opcode != LIO_SYNC)) {
 		error = EINVAL;
 		goto aqueue_fail;
 	}
 
-	if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_KEVENT) {
-		kev.ident = aiocbe->uaiocb.aio_sigevent.sigev_notify_kqueue;
-		kev.udata = aiocbe->uaiocb.aio_sigevent.sigev_value.sigval_ptr;
-	} else
+	if (aiocbe->uaiocb.aio_sigevent.sigev_notify != SIGEV_KEVENT)
 		goto no_kqueue;
-	if ((u_int)kev.ident >= fdp->fd_nfiles ||
-	    (kq_fp = fdp->fd_ofiles[kev.ident]) == NULL ||
-	    (kq_fp->f_type != DTYPE_KQUEUE)) {
-		error = EBADF;
-		goto aqueue_fail;
-	}
-	kq = kq_fp->f_data;
+	kqfd = aiocbe->uaiocb.aio_sigevent.sigev_notify_kqueue;
 	kev.ident = (uintptr_t)aiocbe->uuaiocb;
 	kev.filter = EVFILT_AIO;
 	kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
 	kev.data = (intptr_t)aiocbe;
-	error = kqueue_register(kq, &kev, td, 1);
+	kev.udata = aiocbe->uaiocb.aio_sigevent.sigev_value.sival_ptr;
+	error = kqfd_register(kqfd, &kev, td, 1);
 aqueue_fail:
 	if (error) {
 		fdrop(fp, td);
 		uma_zfree(aiocb_zone, aiocbe);
-		if (type == 0)
-			suword(&job->_aiocb_private.error, error);
+		suword(&job->_aiocb_private.error, error);
 		goto done;
 	}
 no_kqueue:
@@ -1402,7 +1476,9 @@
 	aiocbe->cred = crhold(td->td_ucred);
 	aiocbe->jobflags = 0;
 	aiocbe->lio = lj;
-	ki = p->p_aioinfo;
+
+	if (opcode == LIO_SYNC)
+		goto queueit;
 
 	if (fp->f_type == DTYPE_SOCKET) {
 		/*
@@ -1421,56 +1497,111 @@
 		so = fp->f_data;
 		sb = (opcode == LIO_READ) ? &so->so_rcv : &so->so_snd;
 		SOCKBUF_LOCK(sb);
-		s = splnet();
 		if (((opcode == LIO_READ) && (!soreadable(so))) || ((opcode ==
 		    LIO_WRITE) && (!sowriteable(so)))) {
-			TAILQ_INSERT_TAIL(&so->so_aiojobq, aiocbe, list);
-			TAILQ_INSERT_TAIL(&ki->kaio_sockqueue, aiocbe, plist);
 			sb->sb_flags |= SB_AIO;
-			aiocbe->jobstate = JOBST_JOBQGLOBAL; /* XXX */
-			ki->kaio_queue_count++;
-			num_queue_count++;
+
+			mtx_lock(&aio_job_mtx);
+			TAILQ_INSERT_TAIL(&so->so_aiojobq, aiocbe, list);
+			mtx_unlock(&aio_job_mtx);
+
+			AIO_LOCK(ki);
+			TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist);
+			TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
+			aiocbe->jobstate = JOBST_JOBQSOCK;
+			ki->kaio_count++;
+			if (lj)
+				lj->lioj_count++;
+			AIO_UNLOCK(ki);
 			SOCKBUF_UNLOCK(sb);
-			splx(s);
+			atomic_add_int(&num_queue_count, 1);
 			error = 0;
 			goto done;
 		}
 		SOCKBUF_UNLOCK(sb);
-		splx(s);
 	}
 
 	if ((error = aio_qphysio(p, aiocbe)) == 0)
 		goto done;
+#if 0
 	if (error > 0) {
-		suword(&job->_aiocb_private.status, 0);
 		aiocbe->uaiocb._aiocb_private.error = error;
 		suword(&job->_aiocb_private.error, error);
 		goto done;
 	}
-
+#endif
+queueit:
 	/* No buffer for daemon I/O. */
 	aiocbe->bp = NULL;
+	atomic_add_int(&num_queue_count, 1);
 
-	ki->kaio_queue_count++;
+	AIO_LOCK(ki);
+	ki->kaio_count++;
 	if (lj)
-		lj->lioj_queue_count++;
-	s = splnet();
+		lj->lioj_count++;
 	TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist);
+	TAILQ_INSERT_TAIL(&ki->kaio_all, aiocbe, allist);
+	if (opcode == LIO_SYNC) {
+		TAILQ_FOREACH(cb, &ki->kaio_jobqueue, plist) {
+			if (cb->fd_file == aiocbe->fd_file &&
+			    cb->uaiocb.aio_lio_opcode != LIO_SYNC &&
+			    cb->seqno < aiocbe->seqno) {
+				cb->jobflags |= AIOCBLIST_CHECKSYNC;
+				aiocbe->pending++;
+			}
+		}
+		TAILQ_FOREACH(cb, &ki->kaio_bufqueue, plist) {
+			if (cb->fd_file == aiocbe->fd_file &&
+			    cb->uaiocb.aio_lio_opcode != LIO_SYNC &&
+			    cb->seqno < aiocbe->seqno) {
+				cb->jobflags |= AIOCBLIST_CHECKSYNC;
+				aiocbe->pending++;
+			}
+		}
+		if (aiocbe->pending != 0) {
+			TAILQ_INSERT_TAIL(&ki->kaio_syncqueue, aiocbe, list);
+			aiocbe->jobstate = JOBST_JOBQSYNC;
+			AIO_UNLOCK(ki);
+			goto done;
+		}
+	}
+	mtx_lock(&aio_job_mtx);
 	TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list);
-	splx(s);
 	aiocbe->jobstate = JOBST_JOBQGLOBAL;
-
-	num_queue_count++;
+	aio_kick_nowait(p);
+	mtx_unlock(&aio_job_mtx);
+	AIO_UNLOCK(ki);
 	error = 0;
+done:
+	return (error);
+}
 
-	/*
-	 * If we don't have a free AIO process, and we are below our quota, then
-	 * start one.  Otherwise, depend on the subsequent I/O completions to
-	 * pick-up this job.  If we don't sucessfully create the new process
-	 * (thread) due to resource issues, we return an error for now (EAGAIN),
-	 * which is likely not the correct thing to do.
-	 */
-	mtx_lock(&aio_freeproc_mtx);
+static void
+aio_kick_nowait(struct proc *userp)
+{
+	struct kaioinfo *ki = userp->p_aioinfo;
+	struct aiothreadlist *aiop;
+
+	mtx_assert(&aio_job_mtx, MA_OWNED);
+	if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
+		TAILQ_REMOVE(&aio_freeproc, aiop, list);
+		aiop->aiothreadflags &= ~AIOP_FREE;
+		wakeup(aiop->aiothread);
+	} else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) &&
+	    ((ki->kaio_active_count + num_aio_resv_start) <
+	    ki->kaio_maxactive_count)) {
+		taskqueue_enqueue(taskqueue_aiod_bio, &ki->kaio_task);
+	}
+}
+
+static int
+aio_kick(struct proc *userp)
+{
+	struct kaioinfo *ki = userp->p_aioinfo;
+	struct aiothreadlist *aiop;
+	int error, ret = 0;
+
+	mtx_assert(&aio_job_mtx, MA_OWNED);
 retryproc:
 	if ((aiop = TAILQ_FIRST(&aio_freeproc)) != NULL) {
 		TAILQ_REMOVE(&aio_freeproc, aiop, list);
@@ -1480,40 +1611,30 @@
 	    ((ki->kaio_active_count + num_aio_resv_start) <
 	    ki->kaio_maxactive_count)) {
 		num_aio_resv_start++;
-		mtx_unlock(&aio_freeproc_mtx);
-		if ((error = aio_newproc()) == 0) {
-			mtx_lock(&aio_freeproc_mtx);
+		mtx_unlock(&aio_job_mtx);
+		error = aio_newproc(&num_aio_resv_start);
+		mtx_lock(&aio_job_mtx);
+		if (error) {
 			num_aio_resv_start--;
 			goto retryproc;
 		}
-		mtx_lock(&aio_freeproc_mtx);
-		num_aio_resv_start--;
+	} else {
+		ret = -1;
 	}
-	mtx_unlock(&aio_freeproc_mtx);
-done:
-	return (error);
+	return (ret);
 }
 
-/*
- * This routine queues an AIO request, checking for quotas.
- */
-static int
-aio_aqueue(struct thread *td, struct aiocb *job, int type)
+static void
+aio_kick_helper(void *context, int pending)
 {
-	struct proc *p = td->td_proc;
-	struct kaioinfo *ki;
-
-	if (p->p_aioinfo == NULL)
-		aio_init_aioinfo(p);
+	struct proc *userp = context;
 
-	if (num_queue_count >= max_queue_count)
-		return (EAGAIN);
-
-	ki = p->p_aioinfo;
-	if (ki->kaio_queue_count >= ki->kaio_qallowed_count)
-		return (EAGAIN);
-
-	return _aio_aqueue(td, job, NULL, type);
+	mtx_lock(&aio_job_mtx);
+	while (--pending >= 0) {
+		if (aio_kick(userp))
+			break;
+	}
+	mtx_unlock(&aio_job_mtx);
 }
 
 /*
@@ -1524,56 +1645,41 @@
 aio_return(struct thread *td, struct aio_return_args *uap)
 {
 	struct proc *p = td->td_proc;
-	int s;
-	long jobref;
-	struct aiocblist *cb, *ncb;
-	struct aiocb *ujob;
+	struct aiocblist *cb;
+	struct aiocb *uaiocb;
 	struct kaioinfo *ki;
-
-	ujob = uap->aiocbp;
-	jobref = fuword(&ujob->_aiocb_private.kernelinfo);
-	if (jobref == -1 || jobref == 0)
-		return (EINVAL);
+	int status, error;
 
 	ki = p->p_aioinfo;
 	if (ki == NULL)
 		return (EINVAL);
-	PROC_LOCK(p);
-	TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
-		if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) ==
-		    jobref) {
-			if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
-				p->p_stats->p_ru.ru_oublock +=
-				    cb->outputcharge;
-				cb->outputcharge = 0;
-			} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
-				p->p_stats->p_ru.ru_inblock += cb->inputcharge;
-				cb->inputcharge = 0;
-			}
-			goto done;
-		}
-	}
-	s = splbio();
-	for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = ncb) {
-		ncb = TAILQ_NEXT(cb, plist);
-		if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo)
-		    == jobref) {
+	uaiocb = uap->aiocbp;
+	AIO_LOCK(ki);
+	TAILQ_FOREACH(cb, &ki->kaio_done, plist) {
+		if (cb->uuaiocb == uaiocb)
 			break;
-		}
 	}
-	splx(s);
- done:
-	PROC_UNLOCK(p);
 	if (cb != NULL) {
-		if (ujob == cb->uuaiocb) {
-			td->td_retval[0] =
-			    cb->uaiocb._aiocb_private.status;
-		} else
-			td->td_retval[0] = EFAULT;
+		MPASS(cb->jobstate == JOBST_JOBFINISHED);
+		status = cb->uaiocb._aiocb_private.status;
+		error = cb->uaiocb._aiocb_private.error;
+		td->td_retval[0] = status;
+		if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
+			td->td_ru.ru_oublock += cb->outputcharge;
+			cb->outputcharge = 0;
+		} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
+			td->td_ru.ru_inblock += cb->inputcharge;
+			cb->inputcharge = 0;
+		}
 		aio_free_entry(cb);
-		return (0);
+		AIO_UNLOCK(ki);
+		suword(&uaiocb->_aiocb_private.error, error);
+		suword(&uaiocb->_aiocb_private.status, status);
+	} else {
+		error = EINVAL;
+		AIO_UNLOCK(ki);
 	}
-	return (EINVAL);
+	return (error);
 }
 
 /*
@@ -1587,12 +1693,12 @@
 	struct timespec ts;
 	struct aiocb *const *cbptr, *cbp;
 	struct kaioinfo *ki;
-	struct aiocblist *cb;
-	int i;
-	int njoblist;
-	int error, s, timo;
-	long *ijoblist;
+	struct aiocblist *cb, *cbfirst;
 	struct aiocb **ujoblist;
+	int njoblist;
+	int error;
+	int timo;
+	int i;
 
 	if (uap->nent < 0 || uap->nent > AIO_LISTIO_MAX)
 		return (EINVAL);
@@ -1617,7 +1723,6 @@
 		return (EAGAIN);
 
 	njoblist = 0;
-	ijoblist = uma_zalloc(aiol_zone, M_WAITOK);
 	ujoblist = uma_zalloc(aiol_zone, M_WAITOK);
 	cbptr = uap->aiocbp;
 
@@ -1626,70 +1731,44 @@
 		if (cbp == 0)
 			continue;
 		ujoblist[njoblist] = cbp;
-		ijoblist[njoblist] = fuword(&cbp->_aiocb_private.kernelinfo);
 		njoblist++;
 	}
 
 	if (njoblist == 0) {
-		uma_zfree(aiol_zone, ijoblist);
 		uma_zfree(aiol_zone, ujoblist);
 		return (0);
 	}
 
-	error = 0;
+	AIO_LOCK(ki);
 	for (;;) {
-		PROC_LOCK(p);
-		TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
+		cbfirst = NULL;
+		error = 0;
+		TAILQ_FOREACH(cb, &ki->kaio_all, allist) {
 			for (i = 0; i < njoblist; i++) {
-				if (((intptr_t)
-				    cb->uaiocb._aiocb_private.kernelinfo) ==
-				    ijoblist[i]) {
-					PROC_UNLOCK(p);
-					if (ujoblist[i] != cb->uuaiocb)
-						error = EINVAL;
-					uma_zfree(aiol_zone, ijoblist);
-					uma_zfree(aiol_zone, ujoblist);
-					return (error);
-				}
-			}
-		}
-
-		s = splbio();
-		for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb =
-		    TAILQ_NEXT(cb, plist)) {
-			for (i = 0; i < njoblist; i++) {
-				if (((intptr_t)
-				    cb->uaiocb._aiocb_private.kernelinfo) ==
-				    ijoblist[i]) {
-					PROC_UNLOCK(p);
-					splx(s);
-					if (ujoblist[i] != cb->uuaiocb)
-						error = EINVAL;
-					uma_zfree(aiol_zone, ijoblist);
-					uma_zfree(aiol_zone, ujoblist);
-					return (error);
+				if (cb->uuaiocb == ujoblist[i]) {
+					if (cbfirst == NULL)
+						cbfirst = cb;
+					if (cb->jobstate == JOBST_JOBFINISHED)
+						goto RETURN;
 				}
 			}
 		}
+		/* All tasks were finished. */
+		if (cbfirst == NULL)
+			break;
 
 		ki->kaio_flags |= KAIO_WAKEUP;
-		error = msleep(p, &p->p_mtx, PDROP | PRIBIO | PCATCH, "aiospn",
-		    timo);
-		splx(s);
-
-		if (error == ERESTART || error == EINTR) {
-			uma_zfree(aiol_zone, ijoblist);
-			uma_zfree(aiol_zone, ujoblist);
-			return (EINTR);
-		} else if (error == EWOULDBLOCK) {
-			uma_zfree(aiol_zone, ijoblist);
-			uma_zfree(aiol_zone, ujoblist);
-			return (EAGAIN);
-		}
+		error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH,
+		    "aiospn", timo);
+		if (error == ERESTART)
+			error = EINTR;
+		if (error)
+			break;
 	}
-
-/* NOTREACHED */
-	return (EINVAL);
+RETURN:
+	AIO_UNLOCK(ki);
+	uma_zfree(aiol_zone, ujoblist);
+	return (error);
 }
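
[Context note] The new wait loop in aio_suspend() holds the kaio mutex across both the scan and the msleep() call, with KAIO_WAKEUP telling the completion path that someone needs a wakeup on &p->p_aioinfo; that closes the window in which the old code could miss a completion, and ERESTART from a signal is reported as EINTR. Stripped of the AIO specifics it is the usual mutex/condition pattern:

	#include <sys/param.h>
	#include <sys/lock.h>
	#include <sys/mutex.h>
	#include <sys/systm.h>

	/*
	 * Sketch: 'lk' protects '*cond'; the producer does
	 * "*cond = 1; wakeup(cond);" while holding 'lk'.
	 */
	static int
	wait_for_cond(struct mtx *lk, int *cond, int timo)
	{
		int error = 0;

		mtx_lock(lk);
		while (*cond == 0) {
			error = msleep(cond, lk, PRIBIO | PCATCH, "waitcd", timo);
			if (error == ERESTART)
				error = EINTR;	/* restartable signal -> EINTR */
			if (error)
				break;
		}
		mtx_unlock(lk);
		return (error);
	}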
 
 /*
@@ -1703,237 +1782,195 @@
 	struct kaioinfo *ki;
 	struct aiocblist *cbe, *cbn;
 	struct file *fp;
-	struct filedesc *fdp;
 	struct socket *so;
-	struct proc *po;
-	int s,error;
-	int cancelled=0;
-	int notcancelled=0;
+	int error;
+	int remove;
+	int cancelled = 0;
+	int notcancelled = 0;
 	struct vnode *vp;
 
-	fdp = p->p_fd;
-	if ((u_int)uap->fd >= fdp->fd_nfiles ||
-	    (fp = fdp->fd_ofiles[uap->fd]) == NULL)
-		return (EBADF);
+	/* Lookup file object. */
+	error = fget(td, uap->fd, &fp);
+	if (error)
+		return (error);
+
+	ki = p->p_aioinfo;
+	if (ki == NULL)
+		goto done;
 
 	if (fp->f_type == DTYPE_VNODE) {
 		vp = fp->f_vnode;
-
-		if (vn_isdisk(vp,&error)) {
+		if (vn_isdisk(vp, &error)) {
+			fdrop(fp, td);
 			td->td_retval[0] = AIO_NOTCANCELED;
 			return (0);
 		}
-	} else if (fp->f_type == DTYPE_SOCKET) {
-		so = fp->f_data;
-
-		s = splnet();
-
-		for (cbe = TAILQ_FIRST(&so->so_aiojobq); cbe; cbe = cbn) {
-			cbn = TAILQ_NEXT(cbe, list);
-			if ((uap->aiocbp == NULL) ||
-				(uap->aiocbp == cbe->uuaiocb) ) {
-				po = cbe->userproc;
-				ki = po->p_aioinfo;
-				TAILQ_REMOVE(&so->so_aiojobq, cbe, list);
-				TAILQ_REMOVE(&ki->kaio_sockqueue, cbe, plist);
-				TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe, plist);
-				if (ki->kaio_flags & KAIO_WAKEUP) {
-					wakeup(po);
-				}
-				cbe->jobstate = JOBST_JOBFINISHED;
-				cbe->uaiocb._aiocb_private.status=-1;
-				cbe->uaiocb._aiocb_private.error=ECANCELED;
-				cancelled++;
-/* XXX cancelled, knote? */
-				if (cbe->uaiocb.aio_sigevent.sigev_notify ==
-				    SIGEV_SIGNAL) {
-					PROC_LOCK(cbe->userproc);
-					psignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo);
-					PROC_UNLOCK(cbe->userproc);
-				}
-				if (uap->aiocbp)
-					break;
-			}
-		}
-		splx(s);
-
-		if ((cancelled) && (uap->aiocbp)) {
-			td->td_retval[0] = AIO_CANCELED;
-			return (0);
-		}
 	}
-	ki=p->p_aioinfo;
-	if (ki == NULL)
-		goto done;
-	s = splnet();
-
-	for (cbe = TAILQ_FIRST(&ki->kaio_jobqueue); cbe; cbe = cbn) {
-		cbn = TAILQ_NEXT(cbe, plist);
 
+	AIO_LOCK(ki);
+	TAILQ_FOREACH_SAFE(cbe, &ki->kaio_jobqueue, plist, cbn) {
 		if ((uap->fd == cbe->uaiocb.aio_fildes) &&
-		    ((uap->aiocbp == NULL ) ||
+		    ((uap->aiocbp == NULL) ||
 		     (uap->aiocbp == cbe->uuaiocb))) {
+			remove = 0;
 
+			mtx_lock(&aio_job_mtx);
 			if (cbe->jobstate == JOBST_JOBQGLOBAL) {
 				TAILQ_REMOVE(&aio_jobs, cbe, list);
+				remove = 1;
+			} else if (cbe->jobstate == JOBST_JOBQSOCK) {
+				MPASS(fp->f_type == DTYPE_SOCKET);
+				so = fp->f_data;
+				TAILQ_REMOVE(&so->so_aiojobq, cbe, list);
+				remove = 1;
+			} else if (cbe->jobstate == JOBST_JOBQSYNC) {
+				TAILQ_REMOVE(&ki->kaio_syncqueue, cbe, list);
+				remove = 1;
+			}
+			mtx_unlock(&aio_job_mtx);
+
+			if (remove) {
 				TAILQ_REMOVE(&ki->kaio_jobqueue, cbe, plist);
-				TAILQ_INSERT_TAIL(&ki->kaio_jobdone, cbe,
-				    plist);
-				cancelled++;
-				ki->kaio_queue_finished_count++;
-				cbe->jobstate = JOBST_JOBFINISHED;
 				cbe->uaiocb._aiocb_private.status = -1;
 				cbe->uaiocb._aiocb_private.error = ECANCELED;
-/* XXX cancelled, knote? */
-				if (cbe->uaiocb.aio_sigevent.sigev_notify ==
-				    SIGEV_SIGNAL) {
-					PROC_LOCK(cbe->userproc);
-					psignal(cbe->userproc, cbe->uaiocb.aio_sigevent.sigev_signo);
-					PROC_UNLOCK(cbe->userproc);
-				}
+				aio_bio_done_notify(p, cbe, DONE_QUEUE);
+				cancelled++;
 			} else {
 				notcancelled++;
 			}
+			if (uap->aiocbp != NULL)
+				break;
 		}
 	}
-	splx(s);
+	AIO_UNLOCK(ki);
+
 done:
+	fdrop(fp, td);
+
+	if (uap->aiocbp != NULL) {
+		if (cancelled) {
+			td->td_retval[0] = AIO_CANCELED;
+			return (0);
+		}
+	}
+
 	if (notcancelled) {
 		td->td_retval[0] = AIO_NOTCANCELED;
 		return (0);
 	}
+
 	if (cancelled) {
 		td->td_retval[0] = AIO_CANCELED;
 		return (0);
 	}
+
 	td->td_retval[0] = AIO_ALLDONE;
 
 	return (0);
 }
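
[Context note] For reference, the userland contract of this syscall is unchanged: the return value is AIO_CANCELED, AIO_NOTCANCELED (the request is already in flight, e.g. on the raw-disk path the kernel declines to cancel above), or AIO_ALLDONE. A small usage example, not part of the patch, with error handling trimmed:

	#include <aio.h>
	#include <stdio.h>

	/* 'acb' was queued earlier with aio_read() or aio_write() on 'fd'. */
	static void
	try_cancel(int fd, struct aiocb *acb)
	{
		switch (aio_cancel(fd, acb)) {
		case AIO_CANCELED:
			printf("request cancelled\n");
			break;
		case AIO_NOTCANCELED:
			printf("still in flight; reap with aio_suspend()/aio_return()\n");
			break;
		case AIO_ALLDONE:
			printf("already completed; collect with aio_return()\n");
			break;
		default:
			perror("aio_cancel");
		}
	}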
 
 /*
- * aio_error is implemented in the kernel level for compatibility purposes only.
- * For a user mode async implementation, it would be best to do it in a userland
- * subroutine.
+ * aio_error is implemented in the kernel level for compatibility purposes
+ * only.  For a user mode async implementation, it would be best to do it in
+ * a userland subroutine.
  */
 int
 aio_error(struct thread *td, struct aio_error_args *uap)
 {
 	struct proc *p = td->td_proc;
-	int s;
 	struct aiocblist *cb;
 	struct kaioinfo *ki;
-	long jobref;
+	int status;
 
 	ki = p->p_aioinfo;
-	if (ki == NULL)
-		return (EINVAL);
-
-	jobref = fuword(&uap->aiocbp->_aiocb_private.kernelinfo);
-	if ((jobref == -1) || (jobref == 0))
-		return (EINVAL);
-
-	PROC_LOCK(p);
-	TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
-		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
-		    jobref) {
-			PROC_UNLOCK(p);
-			td->td_retval[0] = cb->uaiocb._aiocb_private.error;
-			return (0);
-		}
-	}
-
-	s = splnet();
-
-	for (cb = TAILQ_FIRST(&ki->kaio_jobqueue); cb; cb = TAILQ_NEXT(cb,
-	    plist)) {
-		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
-		    jobref) {
-			PROC_UNLOCK(p);
-			td->td_retval[0] = EINPROGRESS;
-			splx(s);
-			return (0);
-		}
-	}
-
-	for (cb = TAILQ_FIRST(&ki->kaio_sockqueue); cb; cb = TAILQ_NEXT(cb,
-	    plist)) {
-		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
-		    jobref) {
-			PROC_UNLOCK(p);
-			td->td_retval[0] = EINPROGRESS;
-			splx(s);
-			return (0);
-		}
-	}
-	splx(s);
-
-	s = splbio();
-	for (cb = TAILQ_FIRST(&ki->kaio_bufdone); cb; cb = TAILQ_NEXT(cb,
-	    plist)) {
-		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
-		    jobref) {
-			PROC_UNLOCK(p);
-			td->td_retval[0] = cb->uaiocb._aiocb_private.error;
-			splx(s);
-			return (0);
-		}
+	if (ki == NULL) {
+		td->td_retval[0] = EINVAL;
+		return (0);
 	}
 
-	for (cb = TAILQ_FIRST(&ki->kaio_bufqueue); cb; cb = TAILQ_NEXT(cb,
-	    plist)) {
-		if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo) ==
-		    jobref) {
-			PROC_UNLOCK(p);
-			td->td_retval[0] = EINPROGRESS;
-			splx(s);
+	AIO_LOCK(ki);
+	TAILQ_FOREACH(cb, &ki->kaio_all, allist) {
+		if (cb->uuaiocb == uap->aiocbp) {
+			if (cb->jobstate == JOBST_JOBFINISHED)
+				td->td_retval[0] =
+					cb->uaiocb._aiocb_private.error;
+			else
+				td->td_retval[0] = EINPROGRESS;
+			AIO_UNLOCK(ki);
 			return (0);
 		}
 	}
-	splx(s);
-	PROC_UNLOCK(p);
+	AIO_UNLOCK(ki);
 
-#if (0)
 	/*
-	 * Hack for lio.
+	 * Hack for failure of aio_aqueue.
 	 */
 	status = fuword(&uap->aiocbp->_aiocb_private.status);
-	if (status == -1)
-		return fuword(&uap->aiocbp->_aiocb_private.error);
-#endif
-	return (EINVAL);
+	if (status == -1) {
+		td->td_retval[0] = fuword(&uap->aiocbp->_aiocb_private.error);
+		return (0);
+	}
+
+	td->td_retval[0] = EINVAL;
+	return (0);
 }
 
 /* syscall - asynchronous read from a file (REALTIME) */
 int
+oaio_read(struct thread *td, struct oaio_read_args *uap)
+{
+
+	return aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_READ, 1);
+}
+
+int
 aio_read(struct thread *td, struct aio_read_args *uap)
 {
 
-	return aio_aqueue(td, uap->aiocbp, LIO_READ);
+	return aio_aqueue(td, uap->aiocbp, NULL, LIO_READ, 0);
 }
 
 /* syscall - asynchronous write to a file (REALTIME) */
 int
+oaio_write(struct thread *td, struct oaio_write_args *uap)
+{
+
+	return aio_aqueue(td, (struct aiocb *)uap->aiocbp, NULL, LIO_WRITE, 1);
+}
+
+int
 aio_write(struct thread *td, struct aio_write_args *uap)
 {
 
-	return aio_aqueue(td, uap->aiocbp, LIO_WRITE);
+	return aio_aqueue(td, uap->aiocbp, NULL, LIO_WRITE, 0);
+}
+
+/* syscall - list directed I/O (REALTIME) */
+int
+olio_listio(struct thread *td, struct olio_listio_args *uap)
+{
+	return do_lio_listio(td, (struct lio_listio_args *)uap, 1);
 }
 
 /* syscall - list directed I/O (REALTIME) */
 int
 lio_listio(struct thread *td, struct lio_listio_args *uap)
 {
+	return do_lio_listio(td, uap, 0);
+}
+
+static int
+do_lio_listio(struct thread *td, struct lio_listio_args *uap, int oldsigev)
+{
 	struct proc *p = td->td_proc;
-	int nent, nentqueued;
 	struct aiocb *iocb, * const *cbptr;
-	struct aiocblist *cb;
 	struct kaioinfo *ki;
-	struct aio_liojob *lj;
-	int error, runningcode;
+	struct aioliojob *lj;
+	struct kevent kev;
+	int nent;
+	int error;
 	int nerror;
 	int i;
-	int s;
 
 	if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT))
 		return (EINVAL);
@@ -1945,224 +1982,175 @@
 	if (p->p_aioinfo == NULL)
 		aio_init_aioinfo(p);
 
-	if ((nent + num_queue_count) > max_queue_count)
-		return (EAGAIN);
-
 	ki = p->p_aioinfo;
-	if ((nent + ki->kaio_queue_count) > ki->kaio_qallowed_count)
-		return (EAGAIN);
 
 	lj = uma_zalloc(aiolio_zone, M_WAITOK);
-	if (!lj)
-		return (EAGAIN);
-
 	lj->lioj_flags = 0;
-	lj->lioj_buffer_count = 0;
-	lj->lioj_buffer_finished_count = 0;
-	lj->lioj_queue_count = 0;
-	lj->lioj_queue_finished_count = 0;
+	lj->lioj_count = 0;
+	lj->lioj_finished_count = 0;
+	knlist_init(&lj->klist, AIO_MTX(ki), NULL, NULL, NULL);
+	ksiginfo_init(&lj->lioj_ksi);
 
 	/*
 	 * Setup signal.
 	 */
 	if (uap->sig && (uap->mode == LIO_NOWAIT)) {
+		bzero(&lj->lioj_signal, sizeof(&lj->lioj_signal));
 		error = copyin(uap->sig, &lj->lioj_signal,
-		    sizeof(lj->lioj_signal));
+				oldsigev ? sizeof(struct osigevent) :
+					   sizeof(struct sigevent));
 		if (error) {
 			uma_zfree(aiolio_zone, lj);
 			return (error);
 		}
-		if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) {
+
+		if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
+			/* Assume only new style KEVENT */
+			kev.filter = EVFILT_LIO;
+			kev.flags = EV_ADD | EV_ENABLE | EV_FLAG1;
+			kev.ident = (uintptr_t)uap->acb_list; /* something unique */
+			kev.data = (intptr_t)lj;
+			/* pass user defined sigval data */
+			kev.udata = lj->lioj_signal.sigev_value.sival_ptr;
+			error = kqfd_register(
+			    lj->lioj_signal.sigev_notify_kqueue, &kev, td, 1);
+			if (error) {
+				uma_zfree(aiolio_zone, lj);
+				return (error);
+			}
+		} else if (lj->lioj_signal.sigev_notify == SIGEV_NONE) {
+			;
+		} else if (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
+			   lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID) {
+				if (!_SIG_VALID(lj->lioj_signal.sigev_signo)) {
+					uma_zfree(aiolio_zone, lj);
+					return EINVAL;
+				}
+				lj->lioj_flags |= LIOJ_SIGNAL;
+		} else {
 			uma_zfree(aiolio_zone, lj);
-			return (EINVAL);
+			return EINVAL;
 		}
-		lj->lioj_flags |= LIOJ_SIGNAL;
 	}
+
+	AIO_LOCK(ki);
 	TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list);
 	/*
+	 * Add an extra aiocb count to keep the lio from being freed
+	 * by other threads doing aio_waitcomplete or aio_return,
+	 * and to prevent the event from being sent until we have queued
+	 * all tasks.
+	 */
+	lj->lioj_count = 1;
+	AIO_UNLOCK(ki);
+
+	/*
 	 * Get pointers to the list of I/O requests.
 	 */
 	nerror = 0;
-	nentqueued = 0;
 	cbptr = uap->acb_list;
 	for (i = 0; i < uap->nent; i++) {
 		iocb = (struct aiocb *)(intptr_t)fuword(&cbptr[i]);
 		if (((intptr_t)iocb != -1) && ((intptr_t)iocb != 0)) {
-			error = _aio_aqueue(td, iocb, lj, 0);
-			if (error == 0)
-				nentqueued++;
-			else
+			error = aio_aqueue(td, iocb, lj, LIO_NOP, oldsigev);
+			if (error != 0)
 				nerror++;
 		}
 	}
 
-	/*
-	 * If we haven't queued any, then just return error.
-	 */
-	if (nentqueued == 0)
-		return (0);
-
-	/*
-	 * Calculate the appropriate error return.
-	 */
-	runningcode = 0;
-	if (nerror)
-		runningcode = EIO;
-
+	error = 0;
+	AIO_LOCK(ki);
 	if (uap->mode == LIO_WAIT) {
-		int command, found;
-		long jobref;
-
-		for (;;) {
-			found = 0;
-			for (i = 0; i < uap->nent; i++) {
-				/*
-				 * Fetch address of the control buf pointer in
-				 * user space.
-				 */
-				iocb = (struct aiocb *)
-				    (intptr_t)fuword(&cbptr[i]);
-				if (((intptr_t)iocb == -1) || ((intptr_t)iocb
-				    == 0))
-					continue;
-
-				/*
-				 * Fetch the associated command from user space.
-				 */
-				command = fuword(&iocb->aio_lio_opcode);
-				if (command == LIO_NOP) {
-					found++;
-					continue;
-				}
-
-				jobref =
-				    fuword(&iocb->_aiocb_private.kernelinfo);
-
-				TAILQ_FOREACH(cb, &ki->kaio_jobdone, plist) {
-					if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo)
-					    == jobref) {
-						if (cb->uaiocb.aio_lio_opcode
-						    == LIO_WRITE) {
-							p->p_stats->p_ru.ru_oublock
-							    +=
-							    cb->outputcharge;
-							cb->outputcharge = 0;
-						} else if (cb->uaiocb.aio_lio_opcode
-						    == LIO_READ) {
-							p->p_stats->p_ru.ru_inblock
-							    += cb->inputcharge;
-							cb->inputcharge = 0;
-						}
-						found++;
-						break;
-					}
-				}
-
-				s = splbio();
-				TAILQ_FOREACH(cb, &ki->kaio_bufdone, plist) {
-					if (((intptr_t)cb->uaiocb._aiocb_private.kernelinfo)
-					    == jobref) {
-						found++;
-						break;
-					}
-				}
-				splx(s);
-			}
-
-			/*
-			 * If all I/Os have been disposed of, then we can
-			 * return.
-			 */
-			if (found == nentqueued)
-				return (runningcode);
-
+		while (lj->lioj_count - 1 != lj->lioj_finished_count) {
 			ki->kaio_flags |= KAIO_WAKEUP;
-			error = tsleep(p, PRIBIO | PCATCH, "aiospn", 0);
-
-			if (error == EINTR)
-				return (EINTR);
-			else if (error == EWOULDBLOCK)
-				return (EAGAIN);
+			error = msleep(&p->p_aioinfo, AIO_MTX(ki),
+			    PRIBIO | PCATCH, "aiospn", 0);
+			if (error == ERESTART)
+				error = EINTR;
+			if (error)
+				break;
+		}
+	} else {
+		if (lj->lioj_count - 1 == lj->lioj_finished_count) {
+			if (lj->lioj_signal.sigev_notify == SIGEV_KEVENT) {
+				lj->lioj_flags |= LIOJ_KEVENT_POSTED;
+				KNOTE_LOCKED(&lj->klist, 1);
+			}
+			if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED))
+			    == LIOJ_SIGNAL
+			    && (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL ||
+			    lj->lioj_signal.sigev_notify == SIGEV_THREAD_ID)) {
+				aio_sendsig(p, &lj->lioj_signal,
+					    &lj->lioj_ksi);
+				lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
+			}
 		}
 	}
+	lj->lioj_count--;
+	if (lj->lioj_count == 0) {
+		TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list);
+		knlist_delete(&lj->klist, curthread, 1);
+		PROC_LOCK(p);
+		sigqueue_take(&lj->lioj_ksi);
+		PROC_UNLOCK(p);
+		AIO_UNLOCK(ki);
+		uma_zfree(aiolio_zone, lj);
+	} else
+		AIO_UNLOCK(ki);
 
-	return (runningcode);
+	if (nerror)
+		return (EIO);
+	return (error);
 }
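
[Context note] The key detail in do_lio_listio() is the comment above: lioj_count starts at 1 so that aio_return()/aio_waitcomplete() running in other threads cannot free the lio job, or fire its event, while the submission loop is still queuing entries; the extra reference is dropped at the end, and whoever takes the count to zero does the knlist/ksiginfo teardown. That is a plain reference-count hand-off, sketched here with invented names:

	#include <sys/param.h>
	#include <sys/lock.h>
	#include <sys/mutex.h>

	struct shared_obj {
		struct mtx	lock;
		int		refs;
	};

	static void destroy(struct shared_obj *obj);	/* assumed to exist */

	static void
	publish_many(struct shared_obj *obj)
	{
		mtx_lock(&obj->lock);
		obj->refs = 1;			/* submitter's temporary reference */
		mtx_unlock(&obj->lock);

		/* ... queue the work items, each taking its own reference ... */

		mtx_lock(&obj->lock);
		if (--obj->refs == 0) {		/* drop the submitter's reference */
			mtx_unlock(&obj->lock);
			destroy(obj);		/* last one out tears the object down */
		} else
			mtx_unlock(&obj->lock);
	}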
 
 /*
- * Interrupt handler for physio, performs the necessary process wakeups, and
- * signals.
+ * Called from interrupt thread for physio, we should return as fast
+ * as possible, so we schedule a biohelper task.
  */
 static void
 aio_physwakeup(struct buf *bp)
 {
 	struct aiocblist *aiocbe;
-	struct proc *p;
-	struct kaioinfo *ki;
-	struct aio_liojob *lj;
-
-	mtx_lock(&Giant);
-	bp->b_flags |= B_DONE;
-	wakeup(bp);
 
 	aiocbe = (struct aiocblist *)bp->b_caller1;
-	if (aiocbe) {
-		p = aiocbe->userproc;
-
-		aiocbe->jobstate = JOBST_JOBBFINISHED;
-		aiocbe->uaiocb._aiocb_private.status -= bp->b_resid;
-		aiocbe->uaiocb._aiocb_private.error = 0;
-		aiocbe->jobflags |= AIOCBLIST_DONE;
-
-		if (bp->b_ioflags & BIO_ERROR)
-			aiocbe->uaiocb._aiocb_private.error = bp->b_error;
-
-		lj = aiocbe->lio;
-		if (lj) {
-			lj->lioj_buffer_finished_count++;
+	taskqueue_enqueue(taskqueue_aiod_bio, &aiocbe->biotask);
+}
 
-			/*
-			 * wakeup/signal if all of the interrupt jobs are done.
-			 */
-			if (lj->lioj_buffer_finished_count ==
-			    lj->lioj_buffer_count &&
-			    lj->lioj_queue_finished_count ==
-			    lj->lioj_queue_count) {
-				/*
-				 * Post a signal if it is called for.
-				 */
-				if ((lj->lioj_flags &
-				    (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) ==
-				    LIOJ_SIGNAL) {
-					PROC_LOCK(p);
-					psignal(p, lj->lioj_signal.sigev_signo);
-					PROC_UNLOCK(p);
-					lj->lioj_flags |= LIOJ_SIGNAL_POSTED;
-				}
-			}
-		}
+/*
+ * Task routine to perform heavy tasks, process wakeup, and signals.
+ */
+static void
+biohelper(void *context, int pending)
+{
+	struct aiocblist *aiocbe = context;
+	struct buf *bp;
+	struct proc *userp;
+	struct kaioinfo *ki;
+	int nblks;
 
-		ki = p->p_aioinfo;
-		if (ki) {
-			ki->kaio_buffer_finished_count++;
-			TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist);
-			TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist);
-
-			KNOTE_UNLOCKED(&aiocbe->klist, 0);
-			/* Do the wakeup. */
-			if (ki->kaio_flags & (KAIO_RUNDOWN|KAIO_WAKEUP)) {
-				ki->kaio_flags &= ~KAIO_WAKEUP;
-				wakeup(p);
-			}
-		}
+	bp = aiocbe->bp;
+	userp = aiocbe->userproc;
+	ki = userp->p_aioinfo;
+	AIO_LOCK(ki);
+	aiocbe->uaiocb._aiocb_private.status -= bp->b_resid;
+	aiocbe->uaiocb._aiocb_private.error = 0;
+	if (bp->b_ioflags & BIO_ERROR)
+		aiocbe->uaiocb._aiocb_private.error = bp->b_error;
+	nblks = btodb(aiocbe->uaiocb.aio_nbytes);
+	if (aiocbe->uaiocb.aio_lio_opcode == LIO_WRITE)
+		aiocbe->outputcharge += nblks;
+	else
+		aiocbe->inputcharge += nblks;
+	aiocbe->bp = NULL;
+	TAILQ_REMOVE(&userp->p_aioinfo->kaio_bufqueue, aiocbe, plist);
+	ki->kaio_buffer_count--;
+	aio_bio_done_notify(userp, aiocbe, DONE_BUF);
+	AIO_UNLOCK(ki);
 
-		if (aiocbe->uaiocb.aio_sigevent.sigev_notify == SIGEV_SIGNAL) {
-			PROC_LOCK(p);
-			psignal(p, aiocbe->uaiocb.aio_sigevent.sigev_signo);
-			PROC_UNLOCK(p);
-		}
-	}
-	mtx_unlock(&Giant);
+	/* Release mapping into kernel space. */
+	vunmapbuf(bp);
+	relpbuf(bp, NULL);
+	atomic_subtract_int(&num_buf_aio, 1);
 }
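
[Context note] aio_physwakeup() now runs directly in the bio completion path, so it does nothing but enqueue a task; the heavy lifting (resid accounting, queue moves, knote/signal delivery) happens later in biohelper() on taskqueue_aiod_bio's thread, where sleeping locks are fine. The TASK_INIT()/taskqueue_enqueue() pair is the standard taskqueue(9) deferral idiom; a minimal hedged sketch, where my_tq stands in for a queue created elsewhere with taskqueue_create():

	#include <sys/param.h>
	#include <sys/kernel.h>
	#include <sys/taskqueue.h>

	struct request {
		struct task	tk;
		/* ... request state ... */
	};

	static struct taskqueue *my_tq;		/* assumed created at init time */

	static void
	heavy_completion(void *ctx, int pending)
	{
		struct request *req = ctx;

		/* Thread context: may acquire sleepable locks, copy out, etc. */
		(void)req;
	}

	static void
	fast_completion_path(struct request *req)
	{
		TASK_INIT(&req->tk, 0, heavy_completion, req);
		taskqueue_enqueue(my_tq, &req->tk);	/* cheap, never sleeps */
	}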
 
 /* syscall - wait for the next completion of an aio request */
@@ -2173,10 +2161,11 @@
 	struct timeval atv;
 	struct timespec ts;
 	struct kaioinfo *ki;
-	struct aiocblist *cb = NULL;
-	int error, s, timo;
+	struct aiocblist *cb;
+	struct aiocb *uuaiocb;
+	int error, status, timo;
 
-	suword(uap->aiocbp, (int)NULL);
+	suword(uap->aiocbp, (long)NULL);
 
 	timo = 0;
 	if (uap->timeout) {
@@ -2194,54 +2183,59 @@
 		timo = tvtohz(&atv);
 	}
 
+	if (p->p_aioinfo == NULL)
+		aio_init_aioinfo(p);
 	ki = p->p_aioinfo;
-	if (ki == NULL)
-		return (EAGAIN);
 
-	for (;;) {
-		PROC_LOCK(p);
-		if ((cb = TAILQ_FIRST(&ki->kaio_jobdone)) != 0) {
-			PROC_UNLOCK(p);
-			suword(uap->aiocbp, (uintptr_t)cb->uuaiocb);
-			td->td_retval[0] = cb->uaiocb._aiocb_private.status;
-			if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
-				p->p_stats->p_ru.ru_oublock +=
-				    cb->outputcharge;
-				cb->outputcharge = 0;
-			} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
-				p->p_stats->p_ru.ru_inblock += cb->inputcharge;
-				cb->inputcharge = 0;
-			}
-			error = cb->uaiocb._aiocb_private.error;
-			aio_free_entry(cb);
-			return (error);
-		}
+	error = 0;
+	cb = NULL;
+	AIO_LOCK(ki);
+	while ((cb = TAILQ_FIRST(&ki->kaio_done)) == NULL) {
+		ki->kaio_flags |= KAIO_WAKEUP;
+		error = msleep(&p->p_aioinfo, AIO_MTX(ki), PRIBIO | PCATCH,
+		    "aiowc", timo);
+		if (timo && error == ERESTART)
+			error = EINTR;
+		if (error)
+			break;
+	}
 
-		s = splbio();
- 		if ((cb = TAILQ_FIRST(&ki->kaio_bufdone)) != 0 ) {
-			PROC_UNLOCK(p);
-			splx(s);
-			suword(uap->aiocbp, (uintptr_t)cb->uuaiocb);
-			error = cb->uaiocb._aiocb_private.error;
-			td->td_retval[0] = cb->uaiocb._aiocb_private.status;
-			aio_free_entry(cb);
-			return (error);
+	if (cb != NULL) {
+		MPASS(cb->jobstate == JOBST_JOBFINISHED);
+		uuaiocb = cb->uuaiocb;
+		status = cb->uaiocb._aiocb_private.status;
+		error = cb->uaiocb._aiocb_private.error;
+		td->td_retval[0] = status;
+		if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) {
+			td->td_ru.ru_oublock += cb->outputcharge;
+			cb->outputcharge = 0;
+		} else if (cb->uaiocb.aio_lio_opcode == LIO_READ) {
+			td->td_ru.ru_inblock += cb->inputcharge;
+			cb->inputcharge = 0;
 		}
+		aio_free_entry(cb);
+		AIO_UNLOCK(ki);
+		suword(uap->aiocbp, (long)uuaiocb);
+		suword(&uuaiocb->_aiocb_private.error, error);
+		suword(&uuaiocb->_aiocb_private.status, status);
+	} else
+		AIO_UNLOCK(ki);
 
-		ki->kaio_flags |= KAIO_WAKEUP;
-		error = msleep(p, &p->p_mtx, PDROP | PRIBIO | PCATCH, "aiowc",
-		    timo);
-		splx(s);
+	return (error);
+}
 
-		if (error == ERESTART)
-			return (EINTR);
-		else if (error < 0)
-			return (error);
-		else if (error == EINTR)
-			return (EINTR);
-		else if (error == EWOULDBLOCK)
-			return (EAGAIN);
-	}
+int
+aio_fsync(struct thread *td, struct aio_fsync_args *uap)
+{
+	struct proc *p = td->td_proc;
+	struct kaioinfo *ki;
+
+	if (uap->op != O_SYNC) /* XXX lack of O_DSYNC */
+		return (EINVAL);
+	ki = p->p_aioinfo;
+	if (ki == NULL)
+		aio_init_aioinfo(p);
+	return aio_aqueue(td, uap->aiocbp, NULL, LIO_SYNC, 0);
 }
 
 /* kqueue attach function */
@@ -2257,6 +2251,7 @@
 	 */
 	if ((kn->kn_flags & EV_FLAG1) == 0)
 		return (EPERM);
+	kn->kn_ptr.p_aio = aiocbe;
 	kn->kn_flags &= ~EV_FLAG1;
 
 	knlist_add(&aiocbe->klist, kn, 0);
@@ -2268,9 +2263,10 @@
 static void
 filt_aiodetach(struct knote *kn)
 {
-	struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata;
+	struct aiocblist *aiocbe = kn->kn_ptr.p_aio;
 
-	knlist_remove(&aiocbe->klist, kn, 0);
+	if (!knlist_empty(&aiocbe->klist))
+		knlist_remove(&aiocbe->klist, kn, 0);
 }
 
 /* kqueue filter function */
@@ -2278,12 +2274,52 @@
 static int
 filt_aio(struct knote *kn, long hint)
 {
-	struct aiocblist *aiocbe = (struct aiocblist *)kn->kn_sdata;
+	struct aiocblist *aiocbe = kn->kn_ptr.p_aio;
 
 	kn->kn_data = aiocbe->uaiocb._aiocb_private.error;
-	if (aiocbe->jobstate != JOBST_JOBFINISHED &&
-	    aiocbe->jobstate != JOBST_JOBBFINISHED)
+	if (aiocbe->jobstate != JOBST_JOBFINISHED)
 		return (0);
 	kn->kn_flags |= EV_EOF;
 	return (1);
 }
+
+/* kqueue attach function */
+static int
+filt_lioattach(struct knote *kn)
+{
+	struct aioliojob * lj = (struct aioliojob *)kn->kn_sdata;
+
+	/*
+	 * The aioliojob pointer must be validated before using it, so
+	 * registration is restricted to the kernel; the user cannot
+	 * set EV_FLAG1.
+	 */
+	if ((kn->kn_flags & EV_FLAG1) == 0)
+		return (EPERM);
+	kn->kn_ptr.p_lio = lj;
+	kn->kn_flags &= ~EV_FLAG1;
+
+	knlist_add(&lj->klist, kn, 0);
+
+	return (0);
+}
+
+/* kqueue detach function */
+static void
+filt_liodetach(struct knote *kn)
+{
+	struct aioliojob * lj = kn->kn_ptr.p_lio;
+
+	if (!knlist_empty(&lj->klist))
+		knlist_remove(&lj->klist, kn, 0);
+}
+
+/* kqueue filter function */
+/*ARGSUSED*/
+static int
+filt_lio(struct knote *kn, long hint)
+{
+	struct aioliojob * lj = kn->kn_ptr.p_lio;
+
+	return (lj->lioj_flags & LIOJ_KEVENT_POSTED);
+}
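
[Context note] With the EVFILT_AIO filter (and the new EVFILT_LIO filter for whole lio_listio() batches), a process can collect completions through a kqueue instead of signals by setting sigev_notify to SIGEV_KEVENT. A hedged userland example of the per-request form; error handling is trimmed for brevity:

	#include <sys/types.h>
	#include <sys/event.h>
	#include <aio.h>
	#include <fcntl.h>
	#include <signal.h>
	#include <string.h>
	#include <unistd.h>

	int
	main(void)
	{
		char buf[512];
		struct aiocb acb;
		struct kevent ev;
		int fd, kq;

		kq = kqueue();
		fd = open("/etc/motd", O_RDONLY);

		memset(&acb, 0, sizeof(acb));
		acb.aio_fildes = fd;
		acb.aio_buf = buf;
		acb.aio_nbytes = sizeof(buf);
		acb.aio_offset = 0;
		acb.aio_sigevent.sigev_notify = SIGEV_KEVENT;
		acb.aio_sigevent.sigev_notify_kqueue = kq;
		acb.aio_sigevent.sigev_value.sival_ptr = &acb;

		aio_read(&acb);

		/* The kernel posts an EVFILT_AIO event whose ident is the user aiocb. */
		kevent(kq, NULL, 0, &ev, 1, NULL);
		return (aio_return((struct aiocb *)ev.ident) >= 0 ? 0 : 1);
	}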
Index: kern_malloc.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_malloc.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/kern_malloc.c -L sys/kern/kern_malloc.c -u -r1.2 -r1.3
--- sys/kern/kern_malloc.c
+++ sys/kern/kern_malloc.c
@@ -1,7 +1,7 @@
 /*-
  * Copyright (c) 1987, 1991, 1993
  *	The Regents of the University of California.
- * Copyright (c) 2005 Robert N. M. Watson
+ * Copyright (c) 2005-2006 Robert N. M. Watson
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -31,8 +31,19 @@
  *	@(#)kern_malloc.c	8.3 (Berkeley) 1/4/94
  */
 
+/*
+ * Kernel malloc(9) implementation -- general purpose kernel memory allocator
+ * based on memory types.  Back end is implemented using the UMA(9) zone
+ * allocator.  A set of fixed-size buckets are used for smaller allocations,
+ * and a special UMA allocation interface is used for larger allocations.
+ * Callers declare memory types, and statistics are maintained independently
+ * for each memory type.  Statistics are maintained per-CPU for performance
+ * reasons.  See malloc(9) and comments in malloc.h for a detailed
+ * description.
+ */
+
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_malloc.c,v 1.142.2.7 2006/01/17 10:19:37 pjd Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_malloc.c,v 1.162 2007/06/27 13:39:38 rwatson Exp $");
 
 #include "opt_ddb.h"
 #include "opt_vm.h"
@@ -65,6 +76,9 @@
 #ifdef DEBUG_MEMGUARD
 #include <vm/memguard.h>
 #endif
+#ifdef DEBUG_REDZONE
+#include <vm/redzone.h>
+#endif
 
 #if defined(INVARIANTS) && defined(__i386__)
 #include <machine/cpu.h>
@@ -82,6 +96,9 @@
 #define	REALLOC_FRACTION	1	/* new block if <= half the size */
 #endif
 
+/*
+ * Centrally define some common malloc types.
+ */
 MALLOC_DEFINE(M_CACHE, "cache", "Various Dynamically allocated caches");
 MALLOC_DEFINE(M_DEVBUF, "devbuf", "device driver memory");
 MALLOC_DEFINE(M_TEMP, "temp", "misc temporary data buffers");
@@ -95,8 +112,8 @@
 static MALLOC_DEFINE(M_FREE, "free", "should be on free list");
 
 static struct malloc_type *kmemstatistics;
-static char *kmembase;
-static char *kmemlimit;
+static vm_offset_t kmembase;
+static vm_offset_t kmemlimit;
 static int kmemcount;
 
 #define KMEM_ZSHIFT	4
@@ -107,7 +124,14 @@
 #define KMEM_ZSIZE	(KMEM_ZMAX >> KMEM_ZSHIFT)
 static u_int8_t kmemsize[KMEM_ZSIZE + 1];
 
-/* These won't be powers of two for long */
+/*
+ * Small malloc(9) memory allocations are allocated from a set of UMA buckets
+ * of various sizes.
+ *
+ * XXX: The comment here used to read "These won't be powers of two for
+ * long."  It's possible that a significant amount of wasted memory could be
+ * recovered by tuning the sizes of these buckets.
+ */
 struct {
 	int kz_size;
 	char *kz_name;
@@ -140,18 +164,24 @@
 	{0, NULL},
 };
 
+/*
+ * Zone to allocate malloc type descriptions from.  For ABI reasons, memory
+ * types are described by a data structure passed by the declaring code, but
+ * the malloc(9) implementation has its own data structure describing the
+ * type and statistics.  This permits the malloc(9)-internal data structures
+ * to be modified without breaking binary-compiled kernel modules that
+ * declare malloc types.
+ */
 static uma_zone_t mt_zone;
 
-#ifdef DEBUG_MEMGUARD
-u_int vm_memguard_divisor;
-SYSCTL_UINT(_vm, OID_AUTO, memguard_divisor, CTLFLAG_RD, &vm_memguard_divisor,
-    0, "(kmem_size/memguard_divisor) == memguard submap size");
-#endif
-
 u_int vm_kmem_size;
 SYSCTL_UINT(_vm, OID_AUTO, kmem_size, CTLFLAG_RD, &vm_kmem_size, 0,
     "Size of kernel memory");
 
+u_int vm_kmem_size_min;
+SYSCTL_UINT(_vm, OID_AUTO, kmem_size_min, CTLFLAG_RD, &vm_kmem_size_min, 0,
+    "Minimum size of kernel memory");
+
 u_int vm_kmem_size_max;
 SYSCTL_UINT(_vm, OID_AUTO, kmem_size_max, CTLFLAG_RD, &vm_kmem_size_max, 0,
     "Maximum size of kernel memory");
@@ -163,7 +193,6 @@
 /*
  * The malloc_mtx protects the kmemstatistics linked list.
  */
-
 struct mtx malloc_mtx;
 
 #ifdef MALLOC_PROFILE
@@ -172,17 +201,18 @@
 static int sysctl_kern_mprof(SYSCTL_HANDLER_ARGS);
 #endif
 
-static int sysctl_kern_malloc(SYSCTL_HANDLER_ARGS);
 static int sysctl_kern_malloc_stats(SYSCTL_HANDLER_ARGS);
 
-/* time_uptime of last malloc(9) failure */
+/*
+ * time_uptime of the last malloc(9) failure (induced or real).
+ */
 static time_t t_malloc_fail;
 
-#ifdef MALLOC_MAKE_FAILURES
 /*
- * Causes malloc failures every (n) mallocs with M_NOWAIT.  If set to 0,
- * doesn't cause failures.
+ * malloc(9) fault injection -- cause malloc failures every (n) mallocs when
+ * the caller specifies M_NOWAIT.  If set to 0, no failures are caused.
  */
+#ifdef MALLOC_MAKE_FAILURES
 SYSCTL_NODE(_debug, OID_AUTO, malloc, CTLFLAG_RD, 0,
     "Kernel malloc debugging options");
 
@@ -204,7 +234,10 @@
 }
 
 /*
- * Add this to the informational malloc_type bucket.
+ * An allocation has succeeded -- update malloc type statistics for the
+ * amount of bucket size.  Occurs within a critical section so that the
+ * thread isn't preempted and doesn't migrate while updating per-CPU
+ * statistics.
  */
 static void
 malloc_type_zone_allocated(struct malloc_type *mtp, unsigned long size,
@@ -234,7 +267,10 @@
 }
 
 /*
- * Remove this allocation from the informational malloc_type bucket.
+ * A free operation has occurred -- update malloc type statistics for the
+ * amount of the bucket size.  Occurs within a critical section so that the
+ * thread isn't preempted and doesn't migrate while updating per-CPU
+ * statistics.
  */
 void
 malloc_type_freed(struct malloc_type *mtp, unsigned long size)
@@ -265,7 +301,7 @@
 	caddr_t va;
 	uma_zone_t zone;
 	uma_keg_t keg;
-#ifdef DIAGNOSTIC
+#if defined(DIAGNOSTIC) || defined(DEBUG_REDZONE)
 	unsigned long osize = size;
 #endif
 
@@ -285,10 +321,6 @@
 		}
 	}
 #endif
-#if 0
-	if (size == 0)
-		kdb_enter("zero size malloc");
-#endif
 #ifdef MALLOC_MAKE_FAILURES
 	if ((flags & M_NOWAIT) && (malloc_failure_rate != 0)) {
 		atomic_add_int(&malloc_nowait_count, 1);
@@ -304,11 +336,14 @@
 		   ("malloc(M_WAITOK) in interrupt context"));
 
 #ifdef DEBUG_MEMGUARD
-	/* XXX CHANGEME! */
-	if (mtp == M_SUBPROC)
+	if (memguard_cmp(mtp))
 		return memguard_alloc(size, flags);
 #endif
 
+#ifdef DEBUG_REDZONE
+	size = redzone_size_ntor(size);
+#endif
+
 	if (size <= KMEM_ZMAX) {
 		if (size & KMEM_ZMASK)
 			size = (size & ~KMEM_ZMASK) + KMEM_ZBASE;
@@ -338,6 +373,10 @@
 		memset(va, 0x70, osize);
 	}
 #endif
+#ifdef DEBUG_REDZONE
+	if (va != NULL)
+		va = redzone_setup(va, osize);
+#endif
 	return ((void *) va);
 }
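
[Context note] For small requests the allocator rounds the size up to the next multiple of KMEM_ZBASE (16 bytes, matching KMEM_ZSHIFT of 4) and then indexes kmemsize[] to find the UMA bucket; the DEBUG_REDZONE hook grows the request first so the guard bytes land inside the same bucket. The rounding is plain mask arithmetic, shown here with constants equivalent to the ones in this file:

	#define KMEM_ZBASE	16			/* matches KMEM_ZSHIFT of 4 */
	#define KMEM_ZMASK	(KMEM_ZBASE - 1)	/* 0x0f */

	/* Round a request up to the next 16-byte step, e.g. 17 -> 32, 33 -> 48. */
	static unsigned long
	round_to_step(unsigned long size)
	{
		if (size & KMEM_ZMASK)
			size = (size & ~KMEM_ZMASK) + KMEM_ZBASE;
		return (size);
	}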
 
@@ -359,13 +398,17 @@
 		return;
 
 #ifdef DEBUG_MEMGUARD
-	/* XXX CHANGEME! */
-	if (mtp == M_SUBPROC) {
+	if (memguard_cmp(mtp)) {
 		memguard_free(addr);
 		return;
 	}
 #endif
 
+#ifdef DEBUG_REDZONE
+	redzone_check(addr);
+	addr = redzone_addr_ntor(addr);
+#endif
+
 	size = 0;
 
 	slab = vtoslab((vm_offset_t)addr & (~UMA_SLAB_MASK));
@@ -423,13 +466,16 @@
 	 */
 
 #ifdef DEBUG_MEMGUARD
-/* XXX: CHANGEME! */
-if (mtp == M_SUBPROC) {
+if (memguard_cmp(mtp)) {
 	slab = NULL;
 	alloc = size;
 } else {
 #endif
 
+#ifdef DEBUG_REDZONE
+	slab = NULL;
+	alloc = redzone_get_size(addr);
+#else
 	slab = vtoslab((vm_offset_t)addr & ~(UMA_SLAB_MASK));
 
 	/* Sanity check */
@@ -446,6 +492,7 @@
 	if (size <= alloc
 	    && (size > (alloc >> REALLOC_FRACTION) || alloc == MINALLOCSIZE))
 		return (addr);
+#endif /* !DEBUG_REDZONE */
 
 #ifdef DEBUG_MEMGUARD
 }
@@ -510,6 +557,14 @@
 	    (mem_size / vm_kmem_size_scale) > (vm_kmem_size / PAGE_SIZE))
 		vm_kmem_size = (mem_size / vm_kmem_size_scale) * PAGE_SIZE;
 
+#if defined(VM_KMEM_SIZE_MIN)
+	vm_kmem_size_min = VM_KMEM_SIZE_MIN;
+#endif
+	TUNABLE_INT_FETCH("vm.kmem_size_min", &vm_kmem_size_min);
+	if (vm_kmem_size_min > 0 && vm_kmem_size < vm_kmem_size_min) {
+		vm_kmem_size = vm_kmem_size_min;
+	}
+
 #if defined(VM_KMEM_SIZE_MAX)
 	vm_kmem_size_max = VM_KMEM_SIZE_MAX;
 #endif
@@ -538,8 +593,8 @@
 	 */
 	init_param3(vm_kmem_size / PAGE_SIZE);
 
-	kmem_map = kmem_suballoc(kernel_map, (vm_offset_t *)&kmembase,
-		(vm_offset_t *)&kmemlimit, vm_kmem_size);
+	kmem_map = kmem_suballoc(kernel_map, &kmembase, &kmemlimit,
+	    vm_kmem_size);
 	kmem_map->system_map = 1;
 
 #ifdef DEBUG_MEMGUARD
@@ -549,7 +604,7 @@
 	 * scenarios as they occur.  It is only used for debugging.
 	 */
 	vm_memguard_divisor = 10;
-	TUNABLE_INT_FETCH("vm.memguard_divisor", &vm_memguard_divisor);
+	TUNABLE_INT_FETCH("vm.memguard.divisor", &vm_memguard_divisor);
 
 	/* Pick a conservative value if provided value sucks. */
 	if ((vm_memguard_divisor <= 0) ||
@@ -647,113 +702,23 @@
 		    temp_allocs, temp_bytes);
 	}
 
-        slab = vtoslab((vm_offset_t) mtip & (~UMA_SLAB_MASK));
+	slab = vtoslab((vm_offset_t) mtip & (~UMA_SLAB_MASK));
 	uma_zfree_arg(mt_zone, mtip, slab);
 }
 
-static int
-sysctl_kern_malloc(SYSCTL_HANDLER_ARGS)
+struct malloc_type *
+malloc_desc2type(const char *desc)
 {
-	struct malloc_type_stats mts_local, *mtsp;
-	struct malloc_type_internal *mtip;
 	struct malloc_type *mtp;
-	struct sbuf sbuf;
-	long temp_allocs, temp_bytes;
-	int linesize = 128;
-	int bufsize;
-	int first;
-	int error;
-	char *buf;
-	int cnt;
-	int i;
-
-	cnt = 0;
-
-	/* Guess at how much room is needed. */
-	mtx_lock(&malloc_mtx);
-	cnt = kmemcount;
-	mtx_unlock(&malloc_mtx);
-
-	bufsize = linesize * (cnt + 1);
-	buf = malloc(bufsize, M_TEMP, M_WAITOK|M_ZERO);
-	sbuf_new(&sbuf, buf, bufsize, SBUF_FIXEDLEN);
-
-	mtx_lock(&malloc_mtx);
-	sbuf_printf(&sbuf,
-	    "\n        Type  InUse MemUse HighUse Requests  Size(s)\n");
-	for (mtp = kmemstatistics; cnt != 0 && mtp != NULL;
-	    mtp = mtp->ks_next, cnt--) {
-		mtip = mtp->ks_handle;
-		bzero(&mts_local, sizeof(mts_local));
-		for (i = 0; i < MAXCPU; i++) {
-			mtsp = &mtip->mti_stats[i];
-			mts_local.mts_memalloced += mtsp->mts_memalloced;
-			mts_local.mts_memfreed += mtsp->mts_memfreed;
-			mts_local.mts_numallocs += mtsp->mts_numallocs;
-			mts_local.mts_numfrees += mtsp->mts_numfrees;
-			mts_local.mts_size |= mtsp->mts_size;
-		}
-		if (mts_local.mts_numallocs == 0)
-			continue;
-
-		/*
-		 * Due to races in per-CPU statistics gather, it's possible to
-		 * get a slightly negative number here.  If we do, approximate
-		 * with 0.
-		 */
-		if (mts_local.mts_numallocs > mts_local.mts_numfrees)
-			temp_allocs = mts_local.mts_numallocs -
-			    mts_local.mts_numfrees;
-		else
-			temp_allocs = 0;
 
-		/*
-		 * Ditto for bytes allocated.
-		 */
-		if (mts_local.mts_memalloced > mts_local.mts_memfreed)
-			temp_bytes = mts_local.mts_memalloced -
-			    mts_local.mts_memfreed;
-		else
-			temp_bytes = 0;
-
-		/*
-		 * High-waterwark is no longer easily available, so we just
-		 * print '-' for that column.
-		 */
-		sbuf_printf(&sbuf, "%13s%6lu%6luK       -%9llu",
-		    mtp->ks_shortdesc,
-		    temp_allocs,
-		    (temp_bytes + 1023) / 1024,
-		    (unsigned long long)mts_local.mts_numallocs);
-
-		first = 1;
-		for (i = 0; i < sizeof(kmemzones) / sizeof(kmemzones[0]) - 1;
-		    i++) {
-			if (mts_local.mts_size & (1 << i)) {
-				if (first)
-					sbuf_printf(&sbuf, "  ");
-				else
-					sbuf_printf(&sbuf, ",");
-				sbuf_printf(&sbuf, "%s",
-				    kmemzones[i].kz_name);
-				first = 0;
-			}
-		}
-		sbuf_printf(&sbuf, "\n");
+	mtx_assert(&malloc_mtx, MA_OWNED);
+	for (mtp = kmemstatistics; mtp != NULL; mtp = mtp->ks_next) {
+		if (strcmp(mtp->ks_shortdesc, desc) == 0)
+			return (mtp);
 	}
-	sbuf_finish(&sbuf);
-	mtx_unlock(&malloc_mtx);
-
-	error = SYSCTL_OUT(req, sbuf_data(&sbuf), sbuf_len(&sbuf));
-
-	sbuf_delete(&sbuf);
-	free(buf, M_TEMP);
-	return (error);
+	return (NULL);
 }
 
-SYSCTL_OID(_kern, OID_AUTO, malloc, CTLTYPE_STRING|CTLFLAG_RD,
-    NULL, 0, sysctl_kern_malloc, "A", "Malloc Stats");
-
 static int
 sysctl_kern_malloc_stats(SYSCTL_HANDLER_ARGS)
 {
@@ -845,20 +810,26 @@
 	struct malloc_type_internal *mtip;
 	struct malloc_type *mtp;
 	u_int64_t allocs, frees;
+	u_int64_t alloced, freed;
 	int i;
 
-	db_printf("%18s %12s %12s %12s\n", "Type", "Allocs", "Frees",
-	    "Used");
+	db_printf("%18s %12s  %12s %12s\n", "Type", "InUse", "MemUse",
+	    "Requests");
 	for (mtp = kmemstatistics; mtp != NULL; mtp = mtp->ks_next) {
 		mtip = (struct malloc_type_internal *)mtp->ks_handle;
 		allocs = 0;
 		frees = 0;
+		alloced = 0;
+		freed = 0;
 		for (i = 0; i < MAXCPU; i++) {
 			allocs += mtip->mti_stats[i].mts_numallocs;
 			frees += mtip->mti_stats[i].mts_numfrees;
+			alloced += mtip->mti_stats[i].mts_memalloced;
+			freed += mtip->mti_stats[i].mts_memfreed;
 		}
-		db_printf("%18s %12ju %12ju %12ju\n", mtp->ks_shortdesc,
-		    allocs, frees, allocs - frees);
+		db_printf("%18s %12ju %12juK %12ju\n",
+		    mtp->ks_shortdesc, allocs - frees,
+		    (alloced - freed + 1023) / 1024, allocs);
 	}
 }
 #endif
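
[Context note] The reworked DB_SHOW_COMMAND sums the per-CPU mts_* fields into InUse/MemUse/Requests columns, so "show malloc" in DDB now resembles the vmstat -m view built from the same statistics. The consumer side of all this bookkeeping is the ordinary malloc(9) triple; a minimal hedged example of a module declaring its own type:

	#include <sys/param.h>
	#include <sys/kernel.h>
	#include <sys/malloc.h>

	/* Declaring a type creates the per-CPU statistics this file maintains. */
	static MALLOC_DEFINE(M_EXAMPLE, "example", "example module buffers");

	static void *
	example_alloc(size_t len)
	{
		/* M_WAITOK may sleep until memory is available; M_ZERO zeroes it. */
		return (malloc(len, M_EXAMPLE, M_WAITOK | M_ZERO));
	}

	static void
	example_free(void *p)
	{
		free(p, M_EXAMPLE);
	}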
Index: vfs_init.c
===================================================================
RCS file: /home/cvs/src/sys/kern/vfs_init.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/vfs_init.c -L sys/kern/vfs_init.c -u -r1.1.1.1 -r1.2
--- sys/kern/vfs_init.c
+++ sys/kern/vfs_init.c
@@ -35,7 +35,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/vfs_init.c,v 1.81 2005/02/20 23:02:20 das Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/vfs_init.c,v 1.85 2007/02/16 17:32:41 pjd Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -43,6 +43,7 @@
 #include <sys/linker.h>
 #include <sys/mount.h>
 #include <sys/proc.h>
+#include <sys/syscallsubr.h>
 #include <sys/sysctl.h>
 #include <sys/vnode.h>
 #include <sys/malloc.h>
@@ -108,30 +109,21 @@
 vfs_byname_kld(const char *fstype, struct thread *td, int *error)
 {
 	struct vfsconf *vfsp;
-	linker_file_t lf;
+	int fileid;
 
 	vfsp = vfs_byname(fstype);
 	if (vfsp != NULL)
 		return (vfsp);
 
-	/* Only load modules for root (very important!). */
-	*error = suser(td);
+	/* Try to load the respective module. */
+	*error = kern_kldload(td, fstype, &fileid);
 	if (*error)
 		return (NULL);
-	*error = securelevel_gt(td->td_ucred, 0);
-	if (*error) 
-		return (NULL);
-	*error = linker_load_module(NULL, fstype, NULL, NULL, &lf);
-	if (lf == NULL)
-		*error = ENODEV;
-	if (*error)
-		return (NULL);
-	lf->userrefs++;
+
 	/* Look up again to see if the VFS was loaded. */
 	vfsp = vfs_byname(fstype);
 	if (vfsp == NULL) {
-		lf->userrefs--;
-		linker_file_unload(lf, LINKER_UNLOAD_FORCE);
+		(void)kern_kldunload(td, fileid, LINKER_UNLOAD_FORCE);
 		*error = ENODEV;
 		return (NULL);
 	}
@@ -223,9 +215,6 @@
 	if (vfsops->vfs_checkexp == NULL)
 		/* check if file system is exported */
 		vfsops->vfs_checkexp =	vfs_stdcheckexp;
-	if (vfsops->vfs_vptofh == NULL)
-		/* turn a vnode into an NFS file handle */
-		vfsops->vfs_vptofh =	vfs_stdvptofh;
 	if (vfsops->vfs_init == NULL)
 		/* file system specific initialisation */
 		vfsops->vfs_init =	vfs_stdinit;
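
[Context note] vfs_byname_kld() now goes through kern_kldload()/kern_kldunload(), which perform the privilege and securelevel checks the old open-coded linker_load_module() path duplicated, and track the module by file id instead of a linker_file_t. The lookup-or-load shape it implements is roughly the following sketch (lookup() is invented; the kern_kldload(td, name, &fileid) interface is the one this diff itself uses):

	#include <sys/param.h>
	#include <sys/systm.h>
	#include <sys/proc.h>
	#include <sys/linker.h>
	#include <sys/syscallsubr.h>

	static void *lookup(const char *name);		/* hypothetical registry lookup */

	static void *
	find_or_load(struct thread *td, const char *name, int *error)
	{
		void *obj;
		int fileid;

		if ((obj = lookup(name)) != NULL)
			return (obj);
		*error = kern_kldload(td, name, &fileid);	/* may load a module */
		if (*error)
			return (NULL);
		if ((obj = lookup(name)) == NULL) {
			/* Loaded something, but it did not register what we need. */
			(void)kern_kldunload(td, fileid, LINKER_UNLOAD_FORCE);
			*error = ENODEV;
		}
		return (obj);
	}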
Index: subr_bus.c
===================================================================
RCS file: /home/cvs/src/sys/kern/subr_bus.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/subr_bus.c -L sys/kern/subr_bus.c -u -r1.1.1.1 -r1.2
--- sys/kern/subr_bus.c
+++ sys/kern/subr_bus.c
@@ -25,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/subr_bus.c,v 1.184.2.1 2005/10/06 23:15:18 imp Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/subr_bus.c,v 1.201.4.1 2008/02/06 03:35:40 iwasaki Exp $");
 
 #include "opt_bus.h"
 
@@ -50,6 +50,7 @@
 #include <sys/systm.h>
 #include <sys/uio.h>
 #include <sys/bus.h>
+#include <sys/interrupt.h>
 
 #include <machine/stdarg.h>
 
@@ -417,7 +418,7 @@
  * userland in realtime.  We are required to free the data as well as
  * the n1 object because we allocate them separately.  Also note that
  * we return one record at a time.  If you try to read this device a
- * character at a time, you will loose the rest of the data.  Listening
+ * character at a time, you will lose the rest of the data.  Listening
  * programs are expected to cope.
  */
 static int
@@ -498,6 +499,15 @@
 }
 
 /**
+ * @brief Return whether the userland process is running
+ */
+boolean_t
+devctl_process_running(void)
+{
+	return (devsoftc.async_proc != NULL);
+}
+
+/**
  * @brief Queue data to be read from the devctl device
  *
  * Generic interface to queue data to the devctl device.  It is
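
The new devctl_process_running() predicate lets event producers skip building notification strings when no devd(8) listener is attached.  A hedged, driver-style sketch of a caller (kernel headers assumed; the subsystem and type strings are made up):

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>

static void
foo_report_link(device_t dev, int up)
{
	/* Cheap early exit: nobody is listening on /dev/devctl. */
	if (!devctl_process_running())
		return;
	devctl_notify("HYPOTHETICAL", device_get_name(dev),
	    up ? "LINK_UP" : "LINK_DOWN", NULL);
}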
@@ -781,8 +791,18 @@
 
 		bus_data_generation_update();
 	}
-	if (parentname && dc && !dc->parent) {
-		dc->parent = devclass_find_internal(parentname, 0, FALSE);
+
+	/*
+	 * If a parent class is specified, then set that as our parent so
+	 * that this devclass will support drivers for the parent class as
+ * well.  If the parent class has the same name, don't do this though
+	 * as it creates a cycle that can trigger an infinite loop in
+	 * device_probe_child() if a device exists for which there is no
+	 * suitable driver.
+	 */
+	if (parentname && dc && !dc->parent &&
+	    strcmp(classname, parentname) != 0) {
+		dc->parent = devclass_find_internal(parentname, NULL, FALSE);
 	}
 
 	return (dc);
@@ -799,7 +819,7 @@
 devclass_t
 devclass_create(const char *classname)
 {
-	return (devclass_find_internal(classname, 0, TRUE));
+	return (devclass_find_internal(classname, NULL, TRUE));
 }
 
 /**
@@ -813,7 +833,7 @@
 devclass_t
 devclass_find(const char *classname)
 {
-	return (devclass_find_internal(classname, 0, FALSE));
+	return (devclass_find_internal(classname, NULL, FALSE));
 }
 
 /**
@@ -850,7 +870,7 @@
 	/*
 	 * Make sure the devclass which the driver is implementing exists.
 	 */
-	devclass_find_internal(driver->name, 0, TRUE);
+	devclass_find_internal(driver->name, NULL, TRUE);
 
 	dl->driver = driver;
 	TAILQ_INSERT_TAIL(&dc->drivers, dl, link);
@@ -1427,7 +1447,7 @@
 	PDEBUG(("%s at %s as unit %d", name, DEVICENAME(parent), unit));
 
 	if (name) {
-		dc = devclass_find_internal(name, 0, TRUE);
+		dc = devclass_find_internal(name, NULL, TRUE);
 		if (!dc) {
 			printf("make_device: can't find device class %s\n",
 			    name);
@@ -1677,11 +1697,11 @@
 /**
  * @internal
  */
-static int
+int
 device_probe_child(device_t dev, device_t child)
 {
 	devclass_t dc;
-	driverlink_t best = 0;
+	driverlink_t best = NULL;
 	driverlink_t dl;
 	int result, pri = 0;
 	int hasclass = (child->devclass != 0);
@@ -1717,7 +1737,7 @@
 			/* Reset flags and devclass before the next probe. */
 			child->devflags = 0;
 			if (!hasclass)
-				device_set_devclass(child, 0);
+				device_set_devclass(child, NULL);
 
 			/*
 			 * If the driver returns SUCCESS, there can be
@@ -1734,7 +1754,7 @@
 			 * certainly doesn't match.
 			 */
 			if (result > 0) {
-				device_set_driver(child, 0);
+				device_set_driver(child, NULL);
 				continue;
 			}
 
@@ -1743,7 +1763,7 @@
 			 * best matching driver. Initialise the value
 			 * of pri for the first match.
 			 */
-			if (best == 0 || result > pri) {
+			if (best == NULL || result > pri) {
 				best = dl;
 				pri = result;
 				continue;
@@ -2230,7 +2250,7 @@
 		return (EINVAL);
 	}
 
-	dc = devclass_find_internal(classname, 0, TRUE);
+	dc = devclass_find_internal(classname, NULL, TRUE);
 	if (!dc)
 		return (ENOMEM);
 
@@ -2260,7 +2280,7 @@
 		free(dev->softc, M_BUS_SC);
 		dev->softc = NULL;
 	}
-	kobj_delete((kobj_t) dev, 0);
+	kobj_delete((kobj_t) dev, NULL);
 	dev->driver = driver;
 	if (driver) {
 		kobj_init((kobj_t) dev, (kobj_class_t) driver);
@@ -2268,7 +2288,7 @@
 			dev->softc = malloc(driver->size, M_BUS_SC,
 			    M_NOWAIT | M_ZERO);
 			if (!dev->softc) {
-				kobj_delete((kobj_t) dev, 0);
+				kobj_delete((kobj_t) dev, NULL);
 				kobj_init((kobj_t) dev, &null_class);
 				dev->driver = NULL;
 				return (ENOMEM);
@@ -2369,8 +2389,8 @@
 		printf("device_attach: %s%d attach returned %d\n",
 		    dev->driver->name, dev->unit, error);
 		/* Unset the class; set in device_probe_child */
-		if (dev->devclass == 0)
-			device_set_devclass(dev, 0);
+		if (dev->devclass == NULL)
+			device_set_devclass(dev, NULL);
 		device_set_driver(dev, NULL);
 		device_sysctl_fini(dev);
 		dev->state = DS_NOTPRESENT;
@@ -2681,7 +2701,7 @@
 resource_list_alloc(struct resource_list *rl, device_t bus, device_t child,
     int type, int *rid, u_long start, u_long end, u_long count, u_int flags)
 {
-	struct resource_list_entry *rle = 0;
+	struct resource_list_entry *rle = NULL;
 	int passthrough = (device_get_parent(child) != bus);
 	int isdefault = (start == 0UL && end == ~0UL);
 
@@ -2740,7 +2760,7 @@
 resource_list_release(struct resource_list *rl, device_t bus, device_t child,
     int type, int rid, struct resource *res)
 {
-	struct resource_list_entry *rle = 0;
+	struct resource_list_entry *rle = NULL;
 	int passthrough = (device_get_parent(child) != bus);
 	int error;
 
@@ -2820,7 +2840,7 @@
 {
 	struct resource_list_entry *rle;
 
-	STAILQ_FOREACH(rle, rl, link) {
+	while ((rle = STAILQ_FIRST(rl)) != NULL) {
 		if (rle->res)
 			bus_release_resource(rman_get_device(rle->res),
 			    rle->type, rle->rid, rle->res);
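
The loop above switches from STAILQ_FOREACH to taking STAILQ_FIRST on each pass because the body deletes (and frees) the entry it is visiting; FOREACH would then follow a next pointer inside freed memory.  A small userland illustration of the safe teardown idiom, with a hypothetical entry type:

#include <sys/queue.h>
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical entry, standing in for struct resource_list_entry. */
struct entry {
	int id;
	STAILQ_ENTRY(entry) link;
};
STAILQ_HEAD(entry_list, entry);

int
main(void)
{
	struct entry_list list = STAILQ_HEAD_INITIALIZER(list);
	struct entry *e;
	int i;

	for (i = 0; i < 3; i++) {
		e = malloc(sizeof(*e));
		e->id = i;
		STAILQ_INSERT_TAIL(&list, e, link);
	}

	/*
	 * Safe teardown: always take the current head, unlink it, then
	 * free it.  STAILQ_FOREACH would instead read e's next pointer
	 * after free(e) to find the following element.
	 */
	while ((e = STAILQ_FIRST(&list)) != NULL) {
		STAILQ_REMOVE_HEAD(&list, link);
		printf("freeing entry %d\n", e->id);
		free(e);
	}
	return (0);
}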
@@ -2829,6 +2849,13 @@
 	}
 }
 
+device_t
+bus_generic_add_child(device_t dev, int order, const char *name, int unit)
+{
+
+	return (device_add_child_ordered(dev, order, name, unit));
+}
+
 /**
  * @brief Helper function for implementing DEVICE_PROBE()
  *
@@ -3078,12 +3105,13 @@
  */
 int
 bus_generic_setup_intr(device_t dev, device_t child, struct resource *irq,
-    int flags, driver_intr_t *intr, void *arg, void **cookiep)
+    int flags, driver_filter_t *filter, driver_intr_t *intr, void *arg, 
+    void **cookiep)
 {
 	/* Propagate up the bus hierarchy until someone handles it. */
 	if (dev->parent)
 		return (BUS_SETUP_INTR(dev->parent, child, irq, flags,
-		    intr, arg, cookiep));
+		    filter, intr, arg, cookiep));
 	return (EINVAL);
 }
 
@@ -3189,6 +3217,22 @@
 }
 
 /**
+ * @brief Helper function for implementing BUS_GET_DMA_TAG().
+ *
+ * This simple implementation of BUS_GET_DMA_TAG() simply calls the
+ * BUS_GET_DMA_TAG() method of the parent of @p dev.
+ */
+bus_dma_tag_t
+bus_generic_get_dma_tag(device_t dev, device_t child)
+{
+
+	/* Propagate up the bus hierarchy until someone handles it. */
+	if (dev->parent != NULL)
+		return (BUS_GET_DMA_TAG(dev->parent, child));
+	return (NULL);
+}
+
+/**
  * @brief Helper function for implementing BUS_GET_RESOURCE().
  *
  * This implementation of BUS_GET_RESOURCE() uses the
@@ -3325,6 +3369,39 @@
  * to maintain some sort of a list of resources allocated by each device.
  */
 
+int
+bus_alloc_resources(device_t dev, struct resource_spec *rs,
+    struct resource **res)
+{
+	int i;
+
+	for (i = 0; rs[i].type != -1; i++)
+		res[i] = NULL;
+	for (i = 0; rs[i].type != -1; i++) {
+		res[i] = bus_alloc_resource_any(dev,
+		    rs[i].type, &rs[i].rid, rs[i].flags);
+		if (res[i] == NULL && !(rs[i].flags & RF_OPTIONAL)) {
+			bus_release_resources(dev, rs, res);
+			return (ENXIO);
+		}
+	}
+	return (0);
+}
+
+void
+bus_release_resources(device_t dev, const struct resource_spec *rs,
+    struct resource **res)
+{
+	int i;
+
+	for (i = 0; rs[i].type != -1; i++)
+		if (res[i] != NULL) {
+			bus_release_resource(
+			    dev, rs[i].type, rs[i].rid, res[i]);
+			res[i] = NULL;
+		}
+}
+
 /**
  * @brief Wrapper function for BUS_ALLOC_RESOURCE().
  *
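
The new bus_alloc_resources()/bus_release_resources() helpers let a driver describe all of its resources in one resource_spec table (terminated by a type of -1), allocate them in a single call, and have everything backed out automatically if a mandatory entry fails; entries marked RF_OPTIONAL may fail without aborting.  A hedged, driver-style sketch of typical use (kernel headers assumed; the table contents and names are made up):

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/bus.h>
#include <sys/rman.h>
#include <machine/bus.h>
#include <machine/resource.h>

/* Hypothetical resource table: one I/O port range and one IRQ. */
static struct resource_spec foo_res_spec[] = {
	{ SYS_RES_IOPORT,	0,	RF_ACTIVE },
	{ SYS_RES_IRQ,		0,	RF_ACTIVE | RF_SHAREABLE },
	{ -1, 0, 0 }			/* terminator: type == -1 */
};

struct foo_softc {
	struct resource *res[2];
};

static int
foo_attach(device_t dev)
{
	struct foo_softc *sc = device_get_softc(dev);

	/* Either every non-optional entry is filled in, or ENXIO. */
	if (bus_alloc_resources(dev, foo_res_spec, sc->res) != 0)
		return (ENXIO);
	return (0);
}

static int
foo_detach(device_t dev)
{
	struct foo_softc *sc = device_get_softc(dev);

	bus_release_resources(dev, foo_res_spec, sc->res);
	return (0);
}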
@@ -3335,8 +3412,8 @@
 bus_alloc_resource(device_t dev, int type, int *rid, u_long start, u_long end,
     u_long count, u_int flags)
 {
-	if (dev->parent == 0)
-		return (0);
+	if (dev->parent == NULL)
+		return (NULL);
 	return (BUS_ALLOC_RESOURCE(dev->parent, dev, type, rid, start, end,
 	    count, flags));
 }
@@ -3350,7 +3427,7 @@
 int
 bus_activate_resource(device_t dev, int type, int rid, struct resource *r)
 {
-	if (dev->parent == 0)
+	if (dev->parent == NULL)
 		return (EINVAL);
 	return (BUS_ACTIVATE_RESOURCE(dev->parent, dev, type, rid, r));
 }
@@ -3364,7 +3441,7 @@
 int
 bus_deactivate_resource(device_t dev, int type, int rid, struct resource *r)
 {
-	if (dev->parent == 0)
+	if (dev->parent == NULL)
 		return (EINVAL);
 	return (BUS_DEACTIVATE_RESOURCE(dev->parent, dev, type, rid, r));
 }
@@ -3378,7 +3455,7 @@
 int
 bus_release_resource(device_t dev, int type, int rid, struct resource *r)
 {
-	if (dev->parent == 0)
+	if (dev->parent == NULL)
 		return (EINVAL);
 	return (BUS_RELEASE_RESOURCE(dev->parent, dev, type, rid, r));
 }
@@ -3391,23 +3468,25 @@
  */
 int
 bus_setup_intr(device_t dev, struct resource *r, int flags,
-    driver_intr_t handler, void *arg, void **cookiep)
+    driver_filter_t filter, driver_intr_t handler, void *arg, void **cookiep)
 {
 	int error;
 
-	if (dev->parent != 0) {
-		if ((flags &~ INTR_ENTROPY) == (INTR_TYPE_NET | INTR_MPSAFE) &&
-		    !debug_mpsafenet)
-			flags &= ~INTR_MPSAFE;
+	if (dev->parent != NULL) {
 		error = BUS_SETUP_INTR(dev->parent, dev, r, flags,
-		    handler, arg, cookiep);
+		    filter, handler, arg, cookiep);
 		if (error == 0) {
-			if (!(flags & (INTR_MPSAFE | INTR_FAST)))
+			if (handler != NULL && !(flags & INTR_MPSAFE))
 				device_printf(dev, "[GIANT-LOCKED]\n");
 			if (bootverbose && (flags & INTR_MPSAFE))
 				device_printf(dev, "[MPSAFE]\n");
-			if (flags & INTR_FAST)
-				device_printf(dev, "[FAST]\n");
+			if (filter != NULL) {
+				if (handler == NULL)
+					device_printf(dev, "[FILTER]\n");
+				else 
+					device_printf(dev, "[FILTER+ITHREAD]\n");
+			} else 
+				device_printf(dev, "[ITHREAD]\n");
 		}
 	} else
 		error = EINVAL;
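
With the new signature, bus_setup_intr() takes both a filter (run in primary interrupt context) and a handler (run in an ithread); either may be NULL, and the console annotation ([FILTER], [FILTER+ITHREAD], [ITHREAD]) reflects which combination was registered.  A hedged, driver-style sketch of registering an ithread-only handler under the new API (kernel headers assumed; names are made up):

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/bus.h>
#include <sys/rman.h>
#include <machine/bus.h>
#include <machine/resource.h>

struct foo_softc {
	struct resource	*irq_res;
	void		*intr_cookie;
};

/* Ithread handler: may use sleepable locks, so no filter is needed here. */
static void
foo_intr(void *arg)
{
	struct foo_softc *sc = arg;

	(void)sc;			/* acknowledge and service the device */
}

static int
foo_setup_irq(device_t dev)
{
	struct foo_softc *sc = device_get_softc(dev);
	int rid = 0;

	sc->irq_res = bus_alloc_resource_any(dev, SYS_RES_IRQ, &rid,
	    RF_ACTIVE | RF_SHAREABLE);
	if (sc->irq_res == NULL)
		return (ENXIO);

	/* NULL filter + non-NULL handler => "[ITHREAD]" on the console. */
	return (bus_setup_intr(dev, sc->irq_res,
	    INTR_TYPE_MISC | INTR_MPSAFE, NULL, foo_intr, sc,
	    &sc->intr_cookie));
}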
@@ -3423,7 +3502,7 @@
 int
 bus_teardown_intr(device_t dev, struct resource *r, void *cookie)
 {
-	if (dev->parent == 0)
+	if (dev->parent == NULL)
 		return (EINVAL);
 	return (BUS_TEARDOWN_INTR(dev->parent, dev, r, cookie));
 }
@@ -3556,6 +3635,35 @@
 	return (BUS_CHILD_LOCATION_STR(parent, child, buf, buflen));
 }
 
+/**
+ * @brief Wrapper function for BUS_GET_DMA_TAG().
+ *
+ * This function simply calls the BUS_GET_DMA_TAG() method of the
+ * parent of @p dev.
+ */
+bus_dma_tag_t
+bus_get_dma_tag(device_t dev)
+{
+	device_t parent;
+
+	parent = device_get_parent(dev);
+	if (parent == NULL)
+		return (NULL);
+	return (BUS_GET_DMA_TAG(parent, dev));
+}
+
+/* Resume all devices and then notify userland that we're up again. */
+static int
+root_resume(device_t dev)
+{
+	int error;
+
+	error = bus_generic_resume(dev);
+	if (error == 0)
+		devctl_notify("kern", "power", "resume", NULL);
+	return (error);
+}
+
 static int
 root_print_child(device_t dev, device_t child)
 {
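
bus_get_dma_tag() (with bus_generic_get_dma_tag() as the pass-up default) lets a child inherit DMA restrictions from its parent bus instead of creating tags with a NULL parent.  A hedged, driver-style sketch of a child deriving its own tag from it (kernel headers assumed; the tag parameters are illustrative only):

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <machine/bus.h>

static int
foo_dma_init(device_t dev, bus_dma_tag_t *tagp)
{

	return (bus_dma_tag_create(
	    bus_get_dma_tag(dev),	/* inherit parent bus restrictions */
	    1, 0,			/* alignment, boundary */
	    BUS_SPACE_MAXADDR_32BIT,	/* lowaddr */
	    BUS_SPACE_MAXADDR,		/* highaddr */
	    NULL, NULL,			/* filter, filterarg */
	    MAXPHYS, 1, MAXPHYS,	/* maxsize, nsegments, maxsegsz */
	    0,				/* flags */
	    NULL, NULL,			/* no lockfunc: loads never deferred */
	    tagp));
}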
@@ -3594,7 +3702,7 @@
 	/* Device interface */
 	KOBJMETHOD(device_shutdown,	bus_generic_shutdown),
 	KOBJMETHOD(device_suspend,	bus_generic_suspend),
-	KOBJMETHOD(device_resume,	bus_generic_resume),
+	KOBJMETHOD(device_resume,	root_resume),
 
 	/* Bus interface */
 	KOBJMETHOD(bus_print_child,	root_print_child),
@@ -3627,7 +3735,7 @@
 		kobj_init((kobj_t) root_bus, (kobj_class_t) &root_driver);
 		root_bus->driver = &root_driver;
 		root_bus->state = DS_ATTACHED;
-		root_devclass = devclass_find_internal("root", 0, FALSE);
+		root_devclass = devclass_find_internal("root", NULL, FALSE);
 		devinit();
 		return (0);
 
@@ -3683,7 +3791,7 @@
 	kobj_class_t driver;
 
 	dmd = (struct driver_module_data *)arg;
-	bus_devclass = devclass_find_internal(dmd->dmd_busname, 0, TRUE);
+	bus_devclass = devclass_find_internal(dmd->dmd_busname, NULL, TRUE);
 	error = 0;
 
 	switch (what) {
@@ -3713,7 +3821,7 @@
 				    parentname, TRUE);
 		} else {
 			*dmd->dmd_devclass =
-				devclass_find_internal(driver->name, 0, TRUE);
+				devclass_find_internal(driver->name, NULL, TRUE);
 		}
 		break;
 
@@ -3745,6 +3853,40 @@
 	return (error);
 }
 
+/**
+ * @brief Enumerate all hinted devices for this bus.
+ *
+ * Walks through the hints for this bus and calls the bus_hinted_child
+ * routine for each one it finds.  It searches first for the specific
+ * bus that's being probed for hinted children (e.g. isa0), and then for
+ * generic children (e.g. isa).
+ *
+ * @param	dev	bus device to enumerate
+ */
+void
+bus_enumerate_hinted_children(device_t bus)
+{
+	int i;
+	const char *dname, *busname;
+	int dunit;
+
+	/*
+	 * enumerate all devices on the specific bus
+	 */
+	busname = device_get_nameunit(bus);
+	i = 0;
+	while (resource_find_match(&i, &dname, &dunit, "at", busname) == 0)
+		BUS_HINTED_CHILD(bus, dname, dunit);
+
+	/*
+	 * and all the generic ones.
+	 */
+	busname = device_get_name(bus);
+	i = 0;
+	while (resource_find_match(&i, &dname, &dunit, "at", busname) == 0)
+		BUS_HINTED_CHILD(bus, dname, dunit);
+}
+
 #ifdef BUS_DEBUG
 
 /* the _short versions avoid iteration by not calling anything that prints
Index: vnode_if.src
===================================================================
RCS file: /home/cvs/src/sys/kern/vnode_if.src,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/vnode_if.src -L sys/kern/vnode_if.src -u -r1.1.1.1 -r1.2
--- sys/kern/vnode_if.src
+++ sys/kern/vnode_if.src
@@ -27,16 +27,16 @@
 # SUCH DAMAGE.
 #
 #	@(#)vnode_if.src	8.12 (Berkeley) 5/14/95
-# $FreeBSD: src/sys/kern/vnode_if.src,v 1.78 2005/06/09 20:20:29 ssouhlal Exp $
+# $FreeBSD: src/sys/kern/vnode_if.src,v 1.87 2007/05/31 11:51:52 kib Exp $
 #
 
 #
-# Above each of the vop descriptors is a specification of the locking
-# protocol used by each vop call.  The first column is the name of
-# the variable, the remaining three columns are in, out and error
-# respectively.  The "in" column defines the lock state on input,
-# the "out" column defines the state on succesful return, and the
-# "error" column defines the locking state on error exit.
+# Above each of the vop descriptors in lines starting with %%
+# is a specification of the locking protocol used by each vop call.
+# The first column is the name of the variable, the remaining three
+# columns are in, out and error respectively.  The "in" column defines
+# the lock state on input, the "out" column defines the state on successful
+# return, and the "error" column defines the locking state on error exit.
 #
 # The locking value can take the following values:
 # L: locked; not converted to type of lock.
@@ -52,51 +52,48 @@
 # The parameter named "vpp" is assumed to be always used with double
 # indirection (**vpp) and that name is hard-coded in vnode_if.awk!
 #
+# Lines starting with %! specify a pre or post-condition function
+# to call before/after the vop call.
+#
 # If other such parameters are introduced, they have to be added to
 # the AWK script at the head of the definition of "add_debug_code()".
 #
 
-#
-# islocked	vp	= = =
-#
 vop_islocked {
 	IN struct vnode *vp;
 	IN struct thread *td;
 };
 
-#
-# lookup	dvp	L ? ?
-# lookup	vpp	- L -
-#! lookup	pre	vop_lookup_pre
-#! lookup	post	vop_lookup_post
-#
+%% lookup	dvp	L ? ?
+%% lookup	vpp	- L -
+%! lookup	pre	vop_lookup_pre
+%! lookup	post	vop_lookup_post
+
 # XXX - the lookup locking protocol defies simple description and depends
 #	on the flags and operation fields in the (cnp) structure.  Note
 #	especially that *vpp may equal dvp and both may be locked.
-#
+
 vop_lookup {
 	IN struct vnode *dvp;
 	INOUT struct vnode **vpp;
 	IN struct componentname *cnp;
 };
 
-#
-#% cachedlookup	dvp	L ? ?
-#% cachedlookup	vpp	- L -
-#
+%% cachedlookup	dvp	L ? ?
+%% cachedlookup	vpp	- L -
+
 # This must be an exact copy of lookup.  See kern/vfs_cache.c for details.
-#
+
 vop_cachedlookup {
 	IN struct vnode *dvp;
 	INOUT struct vnode **vpp;
 	IN struct componentname *cnp;
 };
 
-#
-#% create	dvp	E E E
-#% create	vpp	- L -
-#! create	post	vop_create_post
-#
+%% create	dvp	E E E
+%% create	vpp	- L -
+%! create	post	vop_create_post
+
 vop_create {
 	IN struct vnode *dvp;
 	OUT struct vnode **vpp;
@@ -104,20 +101,20 @@
 	IN struct vattr *vap;
 };
 
-#
-#% whiteout	dvp	E E E
-#
+
+%% whiteout	dvp	E E E
+
 vop_whiteout {
 	IN struct vnode *dvp;
 	IN struct componentname *cnp;
 	IN int flags;
 };
 
-#
-#% mknod	dvp	E E E
-#% mknod	vpp	- L -
-#! mknod	post	vop_mknod_post
-#
+
+%% mknod	dvp	E E E
+%% mknod	vpp	- L -
+%! mknod	post	vop_mknod_post
+
 vop_mknod {
 	IN struct vnode *dvp;
 	OUT struct vnode **vpp;
@@ -125,20 +122,20 @@
 	IN struct vattr *vap;
 };
 
-#
-#% open		vp	L L L
-#
+
+%% open		vp	L L L
+
 vop_open {
 	IN struct vnode *vp;
 	IN int mode;
 	IN struct ucred *cred;
 	IN struct thread *td;
-	IN int fdidx;
+	IN struct file *fp;
 };
 
-#
-#% close	vp	E E E
-#
+
+%% close	vp	E E E
+
 vop_close {
 	IN struct vnode *vp;
 	IN int fflag;
@@ -146,9 +143,9 @@
 	IN struct thread *td;
 };
 
-#
-#% access	vp	L L L
-#
+
+%% access	vp	L L L
+
 vop_access {
 	IN struct vnode *vp;
 	IN int mode;
@@ -156,9 +153,9 @@
 	IN struct thread *td;
 };
 
-#
-#% getattr	vp	L L L
-#
+
+%% getattr	vp	L L L
+
 vop_getattr {
 	IN struct vnode *vp;
 	OUT struct vattr *vap;
@@ -166,10 +163,10 @@
 	IN struct thread *td;
 };
 
-#
-#% setattr	vp	E E E
-#! setattr	post	vop_setattr_post
-#
+
+%% setattr	vp	E E E
+%! setattr	post	vop_setattr_post
+
 vop_setattr {
 	IN struct vnode *vp;
 	IN struct vattr *vap;
@@ -177,9 +174,9 @@
 	IN struct thread *td;
 };
 
-#
-#% read		vp	L L L
-#
+
+%% read		vp	L L L
+
 vop_read {
 	IN struct vnode *vp;
 	INOUT struct uio *uio;
@@ -187,11 +184,11 @@
 	IN struct ucred *cred;
 };
 
-#
-#% write	vp	E E E
-#! write	pre	VOP_WRITE_PRE
-#! write	post	VOP_WRITE_POST
-#
+
+%% write	vp	E E E
+%! write	pre	VOP_WRITE_PRE
+%! write	post	VOP_WRITE_POST
+
 vop_write {
 	IN struct vnode *vp;
 	INOUT struct uio *uio;
@@ -199,9 +196,9 @@
 	IN struct ucred *cred;
 };
 
-#
-#% lease	vp	= = =
-#
+
+%% lease	vp	= = =
+
 vop_lease {
 	IN struct vnode *vp;
 	IN struct thread *td;
@@ -209,21 +206,21 @@
 	IN int flag;
 };
 
-#
-#% ioctl	vp	U U U
-#
+
+%% ioctl	vp	U U U
+
 vop_ioctl {
 	IN struct vnode *vp;
 	IN u_long command;
-	IN caddr_t data;
+	IN void *data;
 	IN int fflag;
 	IN struct ucred *cred;
 	IN struct thread *td;
 };
 
-#
-#% poll	vp	U U U
-#
+
+%% poll	vp	U U U
+
 vop_poll {
 	IN struct vnode *vp;
 	IN int events;
@@ -231,61 +228,57 @@
 	IN struct thread *td;
 };
 
-#
-#% kqfilter	vp	U U U
-#
+
+%% kqfilter	vp	U U U
+
 vop_kqfilter {
 	IN struct vnode *vp;
 	IN struct knote *kn;
 };
 
-#
-#% revoke	vp	L L L
-#
+
+%% revoke	vp	L L L
+
 vop_revoke {
 	IN struct vnode *vp;
 	IN int flags;
 };
 
-#
-#% fsync	vp	E E E
-#
+
+%% fsync	vp	E E E
+
 vop_fsync {
 	IN struct vnode *vp;
 	IN int waitfor;
 	IN struct thread *td;
 };
 
-#
-#% remove	dvp	E E E
-#% remove	vp	E E E
-#! remove	post	vop_remove_post
-#
+
+%% remove	dvp	E E E
+%% remove	vp	E E E
+%! remove	post	vop_remove_post
+
 vop_remove {
 	IN struct vnode *dvp;
 	IN struct vnode *vp;
 	IN struct componentname *cnp;
 };
 
-#
-#% link		tdvp	E E E
-#% link		vp	E E E
-#! link		post	vop_link_post
-#
+
+%% link		tdvp	E E E
+%% link		vp	E E E
+%! link		post	vop_link_post
+
 vop_link {
 	IN struct vnode *tdvp;
 	IN struct vnode *vp;
 	IN struct componentname *cnp;
 };
 
-#
-# rename	fdvp	U U U
-# rename	fvp	U U U
-# rename	tdvp	E U U
-# rename	tvp	X U U
-#! rename	pre	vop_rename_pre
-#! rename	post	vop_rename_post
-#
+
+%! rename	pre	vop_rename_pre
+%! rename	post	vop_rename_post
+
 vop_rename {
 	IN WILLRELE struct vnode *fdvp;
 	IN WILLRELE struct vnode *fvp;
@@ -295,11 +288,11 @@
 	IN struct componentname *tcnp;
 };
 
-#
-#% mkdir	dvp	E E E
-#% mkdir	vpp	- E -
-#! mkdir	post	vop_mkdir_post
-#
+
+%% mkdir	dvp	E E E
+%% mkdir	vpp	- E -
+%! mkdir	post	vop_mkdir_post
+
 vop_mkdir {
 	IN struct vnode *dvp;
 	OUT struct vnode **vpp;
@@ -307,22 +300,22 @@
 	IN struct vattr *vap;
 };
 
-#
-#% rmdir	dvp	E E E
-#% rmdir	vp	E E E
-#! rmdir	post	vop_rmdir_post
-#
+
+%% rmdir	dvp	E E E
+%% rmdir	vp	E E E
+%! rmdir	post	vop_rmdir_post
+
 vop_rmdir {
 	IN struct vnode *dvp;
 	IN struct vnode *vp;
 	IN struct componentname *cnp;
 };
 
-#
-#% symlink	dvp	E E E
-#% symlink	vpp	- E -
-#! symlink	post	vop_symlink_post
-#
+
+%% symlink	dvp	E E E
+%% symlink	vpp	- E -
+%! symlink	post	vop_symlink_post
+
 vop_symlink {
 	IN struct vnode *dvp;
 	OUT struct vnode **vpp;
@@ -331,9 +324,9 @@
 	IN char *target;
 };
 
-#
-#% readdir	vp	L L L
-#
+
+%% readdir	vp	L L L
+
 vop_readdir {
 	IN struct vnode *vp;
 	INOUT struct uio *uio;
@@ -343,56 +336,56 @@
 	INOUT u_long **cookies;
 };
 
-#
-#% readlink	vp	L L L
-#
+
+%% readlink	vp	L L L
+
 vop_readlink {
 	IN struct vnode *vp;
 	INOUT struct uio *uio;
 	IN struct ucred *cred;
 };
 
-#
-#% inactive	vp	E E E
-#
+
+%% inactive	vp	E E E
+
 vop_inactive {
 	IN struct vnode *vp;
 	IN struct thread *td;
 };
 
-#
-#% reclaim	vp	E E E
-#
+
+%% reclaim	vp	E E E
+
 vop_reclaim {
 	IN struct vnode *vp;
 	IN struct thread *td;
 };
 
-#
-#lock		vp	? ? ?
-#! lock		pre	vop_lock_pre
-#! lock		post	vop_lock_post
-#
-vop_lock {
+
+%! lock1	pre	vop_lock_pre
+%! lock1	post	vop_lock_post
+
+vop_lock1 {
 	IN struct vnode *vp;
 	IN int flags;
 	IN struct thread *td;
+	IN char *file;
+	IN int line;
 };
 
-#
-#unlock		vp	L ? L
-#! unlock	pre	vop_unlock_pre
-#! unlock	post	vop_unlock_post
-#
+
+%! unlock	pre	vop_unlock_pre
+%! unlock	post	vop_unlock_post
+
 vop_unlock {
 	IN struct vnode *vp;
 	IN int flags;
 	IN struct thread *td;
 };
 
-#
-#% bmap		vp	L L L
-#
+
+%% bmap		vp	L L L
+
 vop_bmap {
 	IN struct vnode *vp;
 	IN daddr_t bn;
@@ -402,61 +395,61 @@
 	OUT int *runb;
 };
 
-#
-# strategy	vp	L L L
-#! strategy	pre	vop_strategy_pre
-#
+
+%% strategy	vp	L L L
+%! strategy	pre	vop_strategy_pre
+
 vop_strategy {
 	IN struct vnode *vp;
 	IN struct buf *bp;
 };
 
-#
-#% getwritemount vp	= = =
-#
+
+%% getwritemount vp	= = =
+
 vop_getwritemount {
 	IN struct vnode *vp;
 	OUT struct mount **mpp;
 };
 
-#
-#% print	vp	= = =
-#
+
+%% print	vp	= = =
+
 vop_print {
 	IN struct vnode *vp;
 };
 
-#
-#% pathconf	vp	L L L
-#
+
+%% pathconf	vp	L L L
+
 vop_pathconf {
 	IN struct vnode *vp;
 	IN int name;
 	OUT register_t *retval;
 };
 
-#
-#% advlock	vp	U U U
-#
+
+%% advlock	vp	U U U
+
 vop_advlock {
 	IN struct vnode *vp;
-	IN caddr_t id;
+	IN void *id;
 	IN int op;
 	IN struct flock *fl;
 	IN int flags;
 };
 
-#
-#% reallocblks	vp	E E E
-#
+
+%% reallocblks	vp	E E E
+
 vop_reallocblks {
 	IN struct vnode *vp;
 	IN struct cluster_save *buflist;
 };
 
-#
-#% getpages	vp	L L L
-#
+
+%% getpages	vp	L L L
+
 vop_getpages {
 	IN struct vnode *vp;
 	IN vm_page_t *m;
@@ -465,9 +458,9 @@
 	IN vm_ooffset_t offset;
 };
 
-#
-#% putpages	vp	E E E
-#
+
+%% putpages	vp	E E E
+
 vop_putpages {
 	IN struct vnode *vp;
 	IN vm_page_t *m;
@@ -477,9 +470,9 @@
 	IN vm_ooffset_t offset;
 };
 
-#
-#% getacl	vp	L L L
-#
+
+%% getacl	vp	L L L
+
 vop_getacl {
 	IN struct vnode *vp;
 	IN acl_type_t type;
@@ -488,9 +481,9 @@
 	IN struct thread *td;
 };
 
-#
-#% setacl	vp	E E E
-#
+
+%% setacl	vp	E E E
+
 vop_setacl {
 	IN struct vnode *vp;
 	IN acl_type_t type;
@@ -499,9 +492,9 @@
 	IN struct thread *td;
 };
 
-#
-#% aclcheck	vp	= = =
-#
+
+%% aclcheck	vp	= = =
+
 vop_aclcheck {
 	IN struct vnode *vp;
 	IN acl_type_t type;
@@ -510,9 +503,9 @@
 	IN struct thread *td;
 };
 
-#
-#% closeextattr	vp	L L L
-#
+
+%% closeextattr	vp	L L L
+
 vop_closeextattr {
 	IN struct vnode *vp;
 	IN int commit;
@@ -520,9 +513,9 @@
 	IN struct thread *td;
 };
 
-#
-#% getextattr	vp	L L L
-#
+
+%% getextattr	vp	L L L
+
 vop_getextattr {
 	IN struct vnode *vp;
 	IN int attrnamespace;
@@ -533,9 +526,9 @@
 	IN struct thread *td;
 };
 
-#
-#% listextattr	vp	L L L
-#
+
+%% listextattr	vp	L L L
+
 vop_listextattr {
 	IN struct vnode *vp;
 	IN int attrnamespace;
@@ -545,18 +538,18 @@
 	IN struct thread *td;
 };
 
-#
-#% openextattr	vp	L L L
-#
+
+%% openextattr	vp	L L L
+
 vop_openextattr {
 	IN struct vnode *vp;
 	IN struct ucred *cred;
 	IN struct thread *td;
 };
 
-#
-#% deleteextattr	vp	E E E
-#
+
+%% deleteextattr	vp	E E E
+
 vop_deleteextattr {
 	IN struct vnode *vp;
 	IN int attrnamespace;
@@ -565,9 +558,9 @@
 	IN struct thread *td;
 };
 
-#
-#% setextattr	vp	E E E
-#
+
+%% setextattr	vp	E E E
+
 vop_setextattr {
 	IN struct vnode *vp;
 	IN int attrnamespace;
@@ -577,12 +570,20 @@
 	IN struct thread *td;
 };
 
-#
-#% setlabel	vp	E E E
-#
+
+%% setlabel	vp	E E E
+
 vop_setlabel {
 	IN struct vnode *vp;
 	IN struct label *label;
 	IN struct ucred *cred;
 	IN struct thread *td;
 };
+
+
+%% vptofh	vp	= = =
+
+vop_vptofh {
+	IN struct vnode *vp;
+	IN struct fid *fhp;
+};
Index: kern_idle.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_idle.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/kern_idle.c -L sys/kern/kern_idle.c -u -r1.1.1.1 -r1.2
--- sys/kern/kern_idle.c
+++ sys/kern/kern_idle.c
@@ -24,7 +24,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_idle.c,v 1.43 2005/04/04 21:53:54 jhb Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_idle.c,v 1.48 2007/06/05 00:00:53 jeff Exp $");
 
 #include <sys/param.h>
 #include <sys/systm.h>
@@ -43,8 +43,6 @@
 static void idle_setup(void *dummy);
 SYSINIT(idle_setup, SI_SUB_SCHED_IDLE, SI_ORDER_FIRST, idle_setup, NULL)
 
-static void idle_proc(void *dummy);
-
 /*
  * Set up per-cpu idle process contexts.  The AP's shouldn't be running or
  * accessing their idle processes at this point, so don't bother with
@@ -62,11 +60,11 @@
 
 #ifdef SMP
 	SLIST_FOREACH(pc, &cpuhead, pc_allcpu) {
-		error = kthread_create(idle_proc, NULL, &p,
+		error = kthread_create(sched_idletd, NULL, &p,
 		    RFSTOPPED | RFHIGHPID, 0, "idle: cpu%d", pc->pc_cpuid);
 		pc->pc_idlethread = FIRST_THREAD_IN_PROC(p);
 #else
-		error = kthread_create(idle_proc, NULL, &p,
+		error = kthread_create(sched_idletd, NULL, &p,
 		    RFSTOPPED | RFHIGHPID, 0, "idle");
 		PCPU_SET(idlethread, FIRST_THREAD_IN_PROC(p));
 #endif
@@ -75,53 +73,15 @@
 
 		PROC_LOCK(p);
 		p->p_flag |= P_NOLOAD;
-		mtx_lock_spin(&sched_lock);
 		td = FIRST_THREAD_IN_PROC(p);
+		thread_lock(td);
 		TD_SET_CAN_RUN(td);
 		td->td_flags |= TDF_IDLETD;
-		sched_class(td->td_ksegrp, PRI_IDLE);
+		sched_class(td, PRI_IDLE);
 		sched_prio(td, PRI_MAX_IDLE);
-		mtx_unlock_spin(&sched_lock);
+		thread_unlock(td);
 		PROC_UNLOCK(p);
 #ifdef SMP
 	}
 #endif
 }
-
-/*
- * The actual idle process.
- */
-static void
-idle_proc(void *dummy)
-{
-	struct proc *p;
-	struct thread *td;
-#ifdef SMP
-	cpumask_t mycpu;
-#endif
-
-	td = curthread;
-	p = td->td_proc;
-#ifdef SMP
-	mycpu = PCPU_GET(cpumask);
-	mtx_lock_spin(&sched_lock);
-	idle_cpus_mask |= mycpu;
-	mtx_unlock_spin(&sched_lock);
-#endif
-	for (;;) {
-		mtx_assert(&Giant, MA_NOTOWNED);
-
-		while (sched_runnable() == 0)
-			cpu_idle();
-
-		mtx_lock_spin(&sched_lock);
-#ifdef SMP
-		idle_cpus_mask &= ~mycpu;
-#endif
-		mi_switch(SW_VOL, NULL);
-#ifdef SMP
-		idle_cpus_mask |= mycpu;
-#endif
-		mtx_unlock_spin(&sched_lock);
-	}
-}
Index: uipc_domain.c
===================================================================
RCS file: /home/cvs/src/sys/kern/uipc_domain.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/uipc_domain.c -L sys/kern/uipc_domain.c -u -r1.2 -r1.3
--- sys/kern/uipc_domain.c
+++ sys/kern/uipc_domain.c
@@ -30,12 +30,13 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/uipc_domain.c,v 1.44.2.2 2006/03/01 20:58:36 andre Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/uipc_domain.c,v 1.51 2007/08/06 14:26:00 rwatson Exp $");
 
 #include <sys/param.h>
 #include <sys/socket.h>
 #include <sys/protosw.h>
 #include <sys/domain.h>
+#include <sys/eventhandler.h>
 #include <sys/mbuf.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
@@ -79,14 +80,12 @@
  * All functions return EOPNOTSUPP.
  */
 struct pr_usrreqs nousrreqs = {
-	.pru_abort =		pru_abort_notsupp,
 	.pru_accept =		pru_accept_notsupp,
 	.pru_attach =		pru_attach_notsupp,
 	.pru_bind =		pru_bind_notsupp,
 	.pru_connect =		pru_connect_notsupp,
 	.pru_connect2 =		pru_connect2_notsupp,
 	.pru_control =		pru_control_notsupp,
-	.pru_detach =		pru_detach_notsupp,
 	.pru_disconnect	=	pru_disconnect_notsupp,
 	.pru_listen =		pru_listen_notsupp,
 	.pru_peeraddr =		pru_peeraddr_notsupp,
@@ -99,7 +98,6 @@
 	.pru_sosend =		pru_sosend_notsupp,
 	.pru_soreceive =	pru_soreceive_notsupp,
 	.pru_sopoll =		pru_sopoll_notsupp,
-	.pru_sosetlabel =	pru_sosetlabel_null
 };
 
 static void
@@ -121,10 +119,9 @@
 	DEFAULT(pu->pru_rcvd, pru_rcvd_notsupp);
 	DEFAULT(pu->pru_rcvoob, pru_rcvoob_notsupp);
 	DEFAULT(pu->pru_sense, pru_sense_null);
-	DEFAULT(pu->pru_sosend, sosend);
-	DEFAULT(pu->pru_soreceive, soreceive);
-	DEFAULT(pu->pru_sopoll, sopoll);
-	DEFAULT(pu->pru_sosetlabel, pru_sosetlabel_null);
+	DEFAULT(pu->pru_sosend, sosend_generic);
+	DEFAULT(pu->pru_soreceive, soreceive_generic);
+	DEFAULT(pu->pru_sopoll, sopoll_generic);
 #undef DEFAULT
 	if (pr->pr_init)
 		(*pr->pr_init)();
@@ -181,39 +178,41 @@
 	    ("attempt to net_add_domain(%s) after domainfinalize()",
 	    dp->dom_name));
 #else
-#ifdef DIAGNOSTIC
 	if (domain_init_status >= 2)
 		printf("WARNING: attempt to net_add_domain(%s) after "
 		    "domainfinalize()\n", dp->dom_name);
 #endif
-#endif
 	mtx_unlock(&dom_mtx);
 	net_init_domain(dp);
 }
 
+static void
+socket_zone_change(void *tag)
+{
+
+	uma_zone_set_max(socket_zone, maxsockets);
+}
+
 /* ARGSUSED*/
 static void
 domaininit(void *dummy)
 {
+
 	/*
 	 * Before we do any setup, make sure to initialize the
 	 * zone allocator we get struct sockets from.
 	 */
-
 	socket_zone = uma_zcreate("socket", sizeof(struct socket), NULL, NULL,
 	    NULL, NULL, UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	uma_zone_set_max(socket_zone, maxsockets);
+	EVENTHANDLER_REGISTER(maxsockets_change, socket_zone_change, NULL,
+		EVENTHANDLER_PRI_FIRST);
 
 	if (max_linkhdr < 16)		/* XXX */
 		max_linkhdr = 16;
 
-	if (debug_mpsafenet) {
-		callout_init(&pffast_callout, CALLOUT_MPSAFE);
-		callout_init(&pfslow_callout, CALLOUT_MPSAFE);
-	} else {
-		callout_init(&pffast_callout, 0);
-		callout_init(&pfslow_callout, 0);
-	}
+	callout_init(&pffast_callout, CALLOUT_MPSAFE);
+	callout_init(&pfslow_callout, CALLOUT_MPSAFE);
 
 	mtx_lock(&dom_mtx);
 	KASSERT(domain_init_status == 0, ("domaininit called too late!"));
@@ -225,6 +224,7 @@
 static void
 domainfinalize(void *dummy)
 {
+
 	mtx_lock(&dom_mtx);
 	KASSERT(domain_init_status == 1, ("domainfinalize called too late!"));
 	domain_init_status = 2;
@@ -235,12 +235,10 @@
 }
 
 struct protosw *
-pffindtype(family, type)
-	int family;
-	int type;
+pffindtype(int family, int type)
 {
-	register struct domain *dp;
-	register struct protosw *pr;
+	struct domain *dp;
+	struct protosw *pr;
 
 	for (dp = domains; dp; dp = dp->dom_next)
 		if (dp->dom_family == family)
@@ -254,13 +252,10 @@
 }
 
 struct protosw *
-pffindproto(family, protocol, type)
-	int family;
-	int protocol;
-	int type;
+pffindproto(int family, int protocol, int type)
 {
-	register struct domain *dp;
-	register struct protosw *pr;
+	struct domain *dp;
+	struct protosw *pr;
 	struct protosw *maybe = 0;
 
 	if (family == 0)
@@ -286,9 +281,7 @@
  * accept requests before it is registered.
  */
 int
-pf_proto_register(family, npr)
-	int family;
-	struct protosw *npr;
+pf_proto_register(int family, struct protosw *npr)
 {
 	struct domain *dp;
 	struct protosw *pr, *fpr;
@@ -355,10 +348,7 @@
  * all sockets and release all locks and memory references.
  */
 int
-pf_proto_unregister(family, protocol, type)
-	int family;
-	int protocol;
-	int type;
+pf_proto_unregister(int family, int protocol, int type)
 {
 	struct domain *dp;
 	struct protosw *pr, *dpr;
@@ -423,12 +413,10 @@
 }
 
 void
-pfctlinput(cmd, sa)
-	int cmd;
-	struct sockaddr *sa;
+pfctlinput(int cmd, struct sockaddr *sa)
 {
-	register struct domain *dp;
-	register struct protosw *pr;
+	struct domain *dp;
+	struct protosw *pr;
 
 	for (dp = domains; dp; dp = dp->dom_next)
 		for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
@@ -437,10 +425,7 @@
 }
 
 void
-pfctlinput2(cmd, sa, ctlparam)
-	int cmd;
-	struct sockaddr *sa;
-	void *ctlparam;
+pfctlinput2(int cmd, struct sockaddr *sa, void *ctlparam)
 {
 	struct domain *dp;
 	struct protosw *pr;
@@ -463,13 +448,10 @@
 }
 
 static void
-pfslowtimo(arg)
-	void *arg;
+pfslowtimo(void *arg)
 {
-	register struct domain *dp;
-	register struct protosw *pr;
-
-	NET_ASSERT_GIANT();
+	struct domain *dp;
+	struct protosw *pr;
 
 	for (dp = domains; dp; dp = dp->dom_next)
 		for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
@@ -479,13 +461,10 @@
 }
 
 static void
-pffasttimo(arg)
-	void *arg;
+pffasttimo(void *arg)
 {
-	register struct domain *dp;
-	register struct protosw *pr;
-
-	NET_ASSERT_GIANT();
+	struct domain *dp;
+	struct protosw *pr;
 
 	for (dp = domains; dp; dp = dp->dom_next)
 		for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
Index: init_main.c
===================================================================
RCS file: /home/cvs/src/sys/kern/init_main.c,v
retrieving revision 1.3
retrieving revision 1.4
diff -L sys/kern/init_main.c -L sys/kern/init_main.c -u -r1.3 -r1.4
--- sys/kern/init_main.c
+++ sys/kern/init_main.c
@@ -39,12 +39,12 @@
  * SUCH DAMAGE.
  *
  *	@(#)init_main.c	8.9 (Berkeley) 1/21/94
- * $FreeBSD: src/sys/kern/init_main.c,v 1.256.2.2 2005/10/05 10:31:03 rwatson Exp $
  */
 
 #include <sys/cdefs.h>
-__MBSDID("$MidnightBSD$");
+__FBSDID("$FreeBSD: src/sys/kern/init_main.c,v 1.283.2.2 2007/12/14 13:41:08 rrs Exp $");
 
+#include "opt_ddb.h"
 #include "opt_init_path.h"
 #include "opt_mac.h"
 
@@ -55,7 +55,6 @@
 #include <sys/filedesc.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
-#include <sys/mac.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/syscallsubr.h>
@@ -77,20 +76,25 @@
 
 #include <machine/cpu.h>
 
+#include <security/audit/audit.h>
+#include <security/mac/mac_framework.h>
+
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <sys/copyright.h>
 
+#include <ddb/ddb.h>
+#include <ddb/db_sym.h>
+
 void mi_startup(void);				/* Should be elsewhere */
 
 /* Components of the first process -- never freed. */
 static struct session session0;
 static struct pgrp pgrp0;
 struct	proc proc0;
-struct	thread thread0 __aligned(8);
-struct	ksegrp ksegrp0;
+struct	thread thread0 __aligned(16);
 struct	vmspace vmspace0;
 struct	proc *initproc;
 
@@ -168,6 +172,11 @@
 	register struct sysinit **xipp;		/* interior loop of sort*/
 	register struct sysinit *save;		/* bubble*/
 
+#if defined(VERBOSE_SYSINIT)
+	int last;
+	int verbose;
+#endif
+
 	if (sysinit == NULL) {
 		sysinit = SET_BEGIN(sysinit_set);
 		sysinit_end = SET_LIMIT(sysinit_set);
@@ -190,6 +199,14 @@
 		}
 	}
 
+#if defined(VERBOSE_SYSINIT)
+	last = SI_SUB_COPYRIGHT;
+	verbose = 0;
+#if !defined(DDB)
+	printf("VERBOSE_SYSINIT: DDB not enabled, symbol lookups disabled.\n");
+#endif
+#endif
+
 	/*
 	 * Traverse the (now) ordered list of system initialization tasks.
 	 * Perform each task, and continue on to the next task.
@@ -205,9 +222,38 @@
 		if ((*sipp)->subsystem == SI_SUB_DONE)
 			continue;
 
+#if defined(VERBOSE_SYSINIT)
+		if ((*sipp)->subsystem > last) {
+			verbose = 1;
+			last = (*sipp)->subsystem;
+			printf("subsystem %x\n", last);
+		}
+		if (verbose) {
+#if defined(DDB)
+			const char *name;
+			c_db_sym_t sym;
+			db_expr_t  offset;
+
+			sym = db_search_symbol((vm_offset_t)(*sipp)->func,
+			    DB_STGY_PROC, &offset);
+			db_symbol_values(sym, &name, NULL);
+			if (name != NULL)
+				printf("   %s(%p)... ", name, (*sipp)->udata);
+			else
+#endif
+				printf("   %p(%p)... ", (*sipp)->func,
+				    (*sipp)->udata);
+		}
+#endif
+
 		/* Call function */
 		(*((*sipp)->func))((*sipp)->udata);
 
+#if defined(VERBOSE_SYSINIT)
+		if (verbose)
+			printf("done.\n");
+#endif
+
 		/* Check off the one we're just done */
 		(*sipp)->subsystem = SI_SUB_DONE;
 
@@ -242,19 +288,24 @@
 	printf("%s", (char *)data);
 }
 SYSINIT(announce, SI_SUB_COPYRIGHT, SI_ORDER_FIRST, print_caddr_t, copyright)
-SYSINIT(version, SI_SUB_COPYRIGHT, SI_ORDER_SECOND, print_caddr_t, version)
+SYSINIT(trademark, SI_SUB_COPYRIGHT, SI_ORDER_SECOND, print_caddr_t, trademark)
+SYSINIT(version, SI_SUB_COPYRIGHT, SI_ORDER_THIRD, print_caddr_t, version)
 
 #ifdef WITNESS
 static char wit_warn[] =
      "WARNING: WITNESS option enabled, expect reduced performance.\n";
-SYSINIT(witwarn, SI_SUB_COPYRIGHT, SI_ORDER_SECOND + 1,
+SYSINIT(witwarn, SI_SUB_COPYRIGHT, SI_ORDER_THIRD + 1,
+   print_caddr_t, wit_warn)
+SYSINIT(witwarn2, SI_SUB_RUN_SCHEDULER, SI_ORDER_THIRD + 1,
    print_caddr_t, wit_warn)
 #endif
 
 #ifdef DIAGNOSTIC
 static char diag_warn[] =
      "WARNING: DIAGNOSTIC option enabled, expect reduced performance.\n";
-SYSINIT(diagwarn, SI_SUB_COPYRIGHT, SI_ORDER_SECOND + 2,
+SYSINIT(diagwarn, SI_SUB_COPYRIGHT, SI_ORDER_THIRD + 2,
+    print_caddr_t, diag_warn)
+SYSINIT(diagwarn2, SI_SUB_RUN_SCHEDULER, SI_ORDER_THIRD + 2,
     print_caddr_t, diag_warn)
 #endif
 
@@ -316,27 +367,26 @@
 	struct proc *p;
 	unsigned i;
 	struct thread *td;
-	struct ksegrp *kg;
 
 	GIANT_REQUIRED;
 	p = &proc0;
 	td = &thread0;
-	kg = &ksegrp0;
 
 	/*
-	 * Initialize magic number.
+	 * Initialize magic number and osrel.
 	 */
 	p->p_magic = P_MAGIC;
+	p->p_osrel = osreldate;
 
 	/*
-	 * Initialize thread, process and ksegrp structures.
+	 * Initialize thread and process structures.
 	 */
 	procinit();	/* set up proc zone */
-	threadinit();	/* set up thead, upcall and KSEGRP zones */
+	threadinit();	/* set up UMA zones */
 
 	/*
 	 * Initialise scheduler resources.
-	 * Add scheduler specific parts to proc, ksegrp, thread as needed.
+	 * Add scheduler specific parts to proc, thread as needed.
 	 */
 	schedinit();	/* scheduler gets its house in order */
 	/*
@@ -366,17 +416,19 @@
 	session0.s_leader = p;
 
 	p->p_sysent = &null_sysvec;
-	p->p_flag = P_SYSTEM;
-	p->p_sflag = PS_INMEM;
+	p->p_flag = P_SYSTEM | P_INMEM;
 	p->p_state = PRS_NORMAL;
 	knlist_init(&p->p_klist, &p->p_mtx, NULL, NULL, NULL);
+	STAILQ_INIT(&p->p_ktr);
 	p->p_nice = NZERO;
 	td->td_state = TDS_RUNNING;
-	kg->kg_pri_class = PRI_TIMESHARE;
-	kg->kg_user_pri = PUSER;
+	td->td_pri_class = PRI_TIMESHARE;
+	td->td_user_pri = PUSER;
+	td->td_base_user_pri = PUSER;
 	td->td_priority = PVM;
 	td->td_base_pri = PUSER;
 	td->td_oncpu = 0;
+	td->td_flags = TDF_INMEM;
 	p->p_peers = 0;
 	p->p_leader = p;
 
@@ -384,6 +436,7 @@
 	bcopy("swapper", p->p_comm, sizeof ("swapper"));
 
 	callout_init(&p->p_itcallout, CALLOUT_MPSAFE);
+	callout_init_mtx(&p->p_limco, &p->p_mtx, 0);
 	callout_init(&td->td_slpcallout, CALLOUT_MPSAFE);
 
 	/* Create credentials. */
@@ -392,6 +445,9 @@
 	p->p_ucred->cr_uidinfo = uifind(0);
 	p->p_ucred->cr_ruidinfo = uifind(0);
 	p->p_ucred->cr_prison = NULL;	/* Don't jail it. */
+#ifdef AUDIT
+	audit_cred_kproc0(p->p_ucred);
+#endif
 #ifdef MAC
 	mac_create_proc0(p->p_ucred);
 #endif
@@ -431,6 +487,15 @@
 	vm_map_init(&vmspace0.vm_map, p->p_sysent->sv_minuser,
 	    p->p_sysent->sv_maxuser);
 	vmspace0.vm_map.pmap = vmspace_pmap(&vmspace0);
+	/*-
+	 * call the init and ctor for the new thread and proc
+	 * we wait to do this until all other structures
+	 * are fairly sane.
+	 */
+	EVENTHANDLER_INVOKE(process_init, p);
+	EVENTHANDLER_INVOKE(thread_init, td);
+	EVENTHANDLER_INVOKE(process_ctor, p);
+	EVENTHANDLER_INVOKE(thread_ctor, td);
 
 	/*
 	 * Charge root for one process.
@@ -445,19 +510,25 @@
 {
 	struct timespec ts;
 	struct proc *p;
+	struct rusage ru;
 
 	/*
 	 * Now we can look at the time, having had a chance to verify the
 	 * time from the filesystem.  Pretend that proc0 started now.
 	 */
 	sx_slock(&allproc_lock);
-	LIST_FOREACH(p, &allproc, p_list) {
+	FOREACH_PROC_IN_SYSTEM(p) {
 		microuptime(&p->p_stats->p_start);
-		p->p_rux.rux_runtime.sec = 0;
-		p->p_rux.rux_runtime.frac = 0;
+		PROC_SLOCK(p);
+		rufetch(p, &ru);	/* Clears thread stats */
+		PROC_SUNLOCK(p);
+		p->p_rux.rux_runtime = 0;
+		p->p_rux.rux_uticks = 0;
+		p->p_rux.rux_sticks = 0;
+		p->p_rux.rux_iticks = 0;
 	}
 	sx_sunlock(&allproc_lock);
-	binuptime(PCPU_PTR(switchtime));
+	PCPU_SET(switchtime, cpu_ticks());
 	PCPU_SET(switchticks, ticks);
 
 	/*
@@ -649,19 +720,19 @@
 	/* divorce init's credentials from the kernel's */
 	newcred = crget();
 	PROC_LOCK(initproc);
-	initproc->p_flag |= P_SYSTEM;
+	initproc->p_flag |= P_SYSTEM | P_INMEM;
 	oldcred = initproc->p_ucred;
 	crcopy(newcred, oldcred);
 #ifdef MAC
 	mac_create_proc1(newcred);
 #endif
+#ifdef AUDIT
+	audit_cred_proc1(newcred);
+#endif
 	initproc->p_ucred = newcred;
 	PROC_UNLOCK(initproc);
 	crfree(oldcred);
 	cred_update_thread(FIRST_THREAD_IN_PROC(initproc));
-	mtx_lock_spin(&sched_lock);
-	initproc->p_sflag |= PS_INMEM;
-	mtx_unlock_spin(&sched_lock);
 	cpu_set_fork_handler(FIRST_THREAD_IN_PROC(initproc), start_init, NULL);
 }
 SYSINIT(init, SI_SUB_CREATE_INIT, SI_ORDER_FIRST, create_init, NULL)
@@ -675,9 +746,9 @@
 	struct thread *td;
 
 	td = FIRST_THREAD_IN_PROC(initproc);
-	mtx_lock_spin(&sched_lock);
+	thread_lock(td);
 	TD_SET_CAN_RUN(td);
-	setrunqueue(td, SRQ_BORING);	/* XXXKSE */
-	mtx_unlock_spin(&sched_lock);
+	sched_add(td, SRQ_BORING);
+	thread_unlock(td);
 }
 SYSINIT(kickinit, SI_SUB_KTHREAD_INIT, SI_ORDER_FIRST, kick_init, NULL)
Index: kern_ktrace.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_ktrace.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/kern_ktrace.c -L sys/kern/kern_ktrace.c -u -r1.2 -r1.3
--- sys/kern/kern_ktrace.c
+++ sys/kern/kern_ktrace.c
@@ -1,6 +1,8 @@
 /*-
  * Copyright (c) 1989, 1993
- *	The Regents of the University of California.  All rights reserved.
+ *	The Regents of the University of California.
+ * Copyright (c) 2005 Robert N. M. Watson
+ * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -30,7 +32,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_ktrace.c,v 1.101.2.3 2006/03/13 03:05:47 jeff Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_ktrace.c,v 1.121 2007/08/29 21:17:11 jhb Exp $");
 
 #include "opt_ktrace.h"
 #include "opt_mac.h"
@@ -42,10 +44,10 @@
 #include <sys/kthread.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
-#include <sys/mac.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/namei.h>
+#include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/unistd.h>
 #include <sys/vnode.h>
@@ -55,6 +57,27 @@
 #include <sys/syslog.h>
 #include <sys/sysproto.h>
 
+#include <security/mac/mac_framework.h>
+
+/*
+ * The ktrace facility allows the tracing of certain key events in user space
+ * processes, such as system calls, signal delivery, context switches, and
+ * user generated events using utrace(2).  It works by streaming event
+ * records and data to a vnode associated with the process using the
+ * ktrace(2) system call.  In general, records can be written directly from
+ * the context that generates the event.  One important exception to this is
+ * during a context switch, where sleeping is not permitted.  To handle this
+ * case, trace events are generated using in-kernel ktr_request records, and
+ * then delivered to disk at a convenient moment -- either immediately, the
+ * next traceable event, at system call return, or at process exit.
+ *
+ * When dealing with multiple threads or processes writing to the same event
+ * log, ordering guarantees are weak: specifically, if an event has multiple
+ * records (i.e., system call enter and return), they may be interlaced with
+ * records from another event.  Process and thread ID information is provided
+ * in the record, and user applications can de-interlace events if required.
+ */
+
 static MALLOC_DEFINE(M_KTRACE, "KTRACE", "KTRACE");
 
 #ifdef KTRACE
@@ -66,8 +89,6 @@
 struct ktr_request {
 	struct	ktr_header ktr_header;
 	void	*ktr_buffer;
-	struct	ucred *ktr_cred;
-	struct	vnode *ktr_vp;
 	union {
 		struct	ktr_syscall ktr_syscall;
 		struct	ktr_sysret ktr_sysret;
@@ -89,7 +110,6 @@
 	0					/* KTR_USER */
 };
 
-static STAILQ_HEAD(, ktr_request) ktr_todo;
 static STAILQ_HEAD(, ktr_request) ktr_free;
 
 static SYSCTL_NODE(_kern, OID_AUTO, ktrace, CTLFLAG_RD, 0, "KTRACE options");
@@ -104,20 +124,48 @@
 
 static int print_message = 1;
 struct mtx ktrace_mtx;
-static struct cv ktrace_cv;
+static struct sx ktrace_sx;
 
 static void ktrace_init(void *dummy);
 static int sysctl_kern_ktrace_request_pool(SYSCTL_HANDLER_ARGS);
 static u_int ktrace_resize_pool(u_int newsize);
 static struct ktr_request *ktr_getrequest(int type);
-static void ktr_submitrequest(struct ktr_request *req);
+static void ktr_submitrequest(struct thread *td, struct ktr_request *req);
 static void ktr_freerequest(struct ktr_request *req);
-static void ktr_loop(void *dummy);
-static void ktr_writerequest(struct ktr_request *req);
+static void ktr_writerequest(struct thread *td, struct ktr_request *req);
 static int ktrcanset(struct thread *,struct proc *);
 static int ktrsetchildren(struct thread *,struct proc *,int,int,struct vnode *);
 static int ktrops(struct thread *,struct proc *,int,int,struct vnode *);
 
+/*
+ * ktrace itself generates events, such as context switches, which we do not
+ * wish to trace.  Maintain a flag, TDP_INKTRACE, on each thread to determine
+ * whether or not it is in a region where tracing of events should be
+ * suppressed.
+ */
+static void
+ktrace_enter(struct thread *td)
+{
+
+	KASSERT(!(td->td_pflags & TDP_INKTRACE), ("ktrace_enter: flag set"));
+	td->td_pflags |= TDP_INKTRACE;
+}
+
+static void
+ktrace_exit(struct thread *td)
+{
+
+	KASSERT(td->td_pflags & TDP_INKTRACE, ("ktrace_exit: flag not set"));
+	td->td_pflags &= ~TDP_INKTRACE;
+}
+
+static void
+ktrace_assert(struct thread *td)
+{
+
+	KASSERT(td->td_pflags & TDP_INKTRACE, ("ktrace_assert: flag not set"));
+}
+
 static void
 ktrace_init(void *dummy)
 {
@@ -125,14 +173,12 @@
 	int i;
 
 	mtx_init(&ktrace_mtx, "ktrace", NULL, MTX_DEF | MTX_QUIET);
-	cv_init(&ktrace_cv, "ktrace");
-	STAILQ_INIT(&ktr_todo);
+	sx_init(&ktrace_sx, "ktrace_sx");
 	STAILQ_INIT(&ktr_free);
 	for (i = 0; i < ktr_requestpool; i++) {
 		req = malloc(sizeof(struct ktr_request), M_KTRACE, M_WAITOK);
 		STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list);
 	}
-	kthread_create(ktr_loop, NULL, NULL, RFHIGHPID, 0, "ktrace");
 }
 SYSINIT(ktrace_init, SI_SUB_KTRACE, SI_ORDER_ANY, ktrace_init, NULL);
 
@@ -155,12 +201,12 @@
 	if (error)
 		return (error);
 	td = curthread;
-	td->td_pflags |= TDP_INKTRACE;
+	ktrace_enter(td);
 	mtx_lock(&ktrace_mtx);
 	oldsize = ktr_requestpool;
 	newsize = ktrace_resize_pool(wantsize);
 	mtx_unlock(&ktrace_mtx);
-	td->td_pflags &= ~TDP_INKTRACE;
+	ktrace_exit(td);
 	error = SYSCTL_OUT(req, &oldsize, sizeof(u_int));
 	if (error)
 		return (error);
@@ -215,11 +261,11 @@
 	struct proc *p = td->td_proc;
 	int pm;
 
-	td->td_pflags |= TDP_INKTRACE;
+	ktrace_enter(td);	/* XXX: In caller instead? */
 	mtx_lock(&ktrace_mtx);
 	if (!KTRCHECK(td, type)) {
 		mtx_unlock(&ktrace_mtx);
-		td->td_pflags &= ~TDP_INKTRACE;
+		ktrace_exit(td);
 		return (NULL);
 	}
 	req = STAILQ_FIRST(&ktr_free);
@@ -230,11 +276,6 @@
 			req->ktr_header.ktr_type |= KTR_DROP;
 			p->p_traceflag &= ~KTRFAC_DROP;
 		}
-		KASSERT(p->p_tracevp != NULL, ("ktrace: no trace vnode"));
-		KASSERT(p->p_tracecred != NULL, ("ktrace: no trace cred"));
-		req->ktr_vp = p->p_tracevp;
-		VREF(p->p_tracevp);
-		req->ktr_cred = crhold(p->p_tracecred);
 		mtx_unlock(&ktrace_mtx);
 		microtime(&req->ktr_header.ktr_time);
 		req->ktr_header.ktr_pid = p->p_pid;
@@ -249,74 +290,89 @@
 		mtx_unlock(&ktrace_mtx);
 		if (pm)
 			printf("Out of ktrace request objects.\n");
-		td->td_pflags &= ~TDP_INKTRACE;
+		ktrace_exit(td);
 	}
 	return (req);
 }
 
+/*
+ * Some trace generation environments don't permit direct access to VFS,
+ * such as during a context switch where sleeping is not allowed.  Under these
+ * circumstances, queue a request to the thread to be written asynchronously
+ * later.
+ */
 static void
-ktr_submitrequest(struct ktr_request *req)
+ktr_enqueuerequest(struct thread *td, struct ktr_request *req)
 {
 
 	mtx_lock(&ktrace_mtx);
-	STAILQ_INSERT_TAIL(&ktr_todo, req, ktr_list);
-	cv_signal(&ktrace_cv);
+	STAILQ_INSERT_TAIL(&td->td_proc->p_ktr, req, ktr_list);
 	mtx_unlock(&ktrace_mtx);
-	curthread->td_pflags &= ~TDP_INKTRACE;
+	ktrace_exit(td);
 }
 
+/*
+ * Drain any pending ktrace records from the per-thread queue to disk.  This
+ * is used both internally before committing other records, and also on
+ * system call return.  We drain all the ones we can find at the time when
+ * drain is requested, but don't keep draining after that as those events
+ * may be approximately "after" the current event.
+ */
 static void
-ktr_freerequest(struct ktr_request *req)
+ktr_drain(struct thread *td)
 {
+	struct ktr_request *queued_req;
+	STAILQ_HEAD(, ktr_request) local_queue;
 
-	crfree(req->ktr_cred);
-	if (req->ktr_vp != NULL) {
-		mtx_lock(&Giant);
-		vrele(req->ktr_vp);
-		mtx_unlock(&Giant);
-	}
-	if (req->ktr_buffer != NULL)
-		free(req->ktr_buffer, M_KTRACE);
-	mtx_lock(&ktrace_mtx);
-	STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list);
-	mtx_unlock(&ktrace_mtx);
-}
+	ktrace_assert(td);
+	sx_assert(&ktrace_sx, SX_XLOCKED);
 
-static void
-ktr_loop(void *dummy)
-{
-	struct ktr_request *req;
-	struct thread *td;
-	struct ucred *cred;
+	STAILQ_INIT(&local_queue);	/* XXXRW: needed? */
 
-	/* Only cache these values once. */
-	td = curthread;
-	cred = td->td_ucred;
-	for (;;) {
+	if (!STAILQ_EMPTY(&td->td_proc->p_ktr)) {
 		mtx_lock(&ktrace_mtx);
-		while (STAILQ_EMPTY(&ktr_todo))
-			cv_wait(&ktrace_cv, &ktrace_mtx);
-		req = STAILQ_FIRST(&ktr_todo);
-		STAILQ_REMOVE_HEAD(&ktr_todo, ktr_list);
-		KASSERT(req != NULL, ("got a NULL request"));
+		STAILQ_CONCAT(&local_queue, &td->td_proc->p_ktr);
 		mtx_unlock(&ktrace_mtx);
-		/*
-		 * It is not enough just to pass the cached cred
-		 * to the VOP's in ktr_writerequest().  Some VFS
-		 * operations use curthread->td_ucred, so we need
-		 * to modify our thread's credentials as well.
-		 * Evil.
-		 */
-		td->td_ucred = req->ktr_cred;
-		ktr_writerequest(req);
-		td->td_ucred = cred;
-		ktr_freerequest(req);
+
+		while ((queued_req = STAILQ_FIRST(&local_queue))) {
+			STAILQ_REMOVE_HEAD(&local_queue, ktr_list);
+			ktr_writerequest(td, queued_req);
+			ktr_freerequest(queued_req);
+		}
 	}
 }
 
 /*
- * MPSAFE
+ * Submit a trace record for immediate commit to disk -- to be used only
+ * where entering VFS is OK.  First drain any pending records that may have
+ * been cached in the thread.
  */
+static void
+ktr_submitrequest(struct thread *td, struct ktr_request *req)
+{
+
+	ktrace_assert(td);
+
+	sx_xlock(&ktrace_sx);
+	ktr_drain(td);
+	ktr_writerequest(td, req);
+	ktr_freerequest(req);
+	sx_xunlock(&ktrace_sx);
+
+	ktrace_exit(td);
+}
+
+static void
+ktr_freerequest(struct ktr_request *req)
+{
+
+	if (req->ktr_buffer != NULL)
+		free(req->ktr_buffer, M_KTRACE);
+	mtx_lock(&ktrace_mtx);
+	STAILQ_INSERT_HEAD(&ktr_free, req, ktr_list);
+	mtx_unlock(&ktrace_mtx);
+}
+
 void
 ktrsyscall(code, narg, args)
 	int code, narg;
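
ktr_drain() empties the per-process queue by splicing it onto a local list while holding ktrace_mtx (STAILQ_CONCAT is O(1)) and only then performs the slow VFS writes with the mutex dropped.  A small userland illustration of that hand-off pattern, with a hypothetical record type and the locking elided:

#include <sys/queue.h>
#include <stdio.h>
#include <stdlib.h>

struct record {
	int seq;
	STAILQ_ENTRY(record) link;
};
STAILQ_HEAD(record_queue, record);

/* Pending queue, normally filled asynchronously and guarded by a mutex. */
static struct record_queue pending = STAILQ_HEAD_INITIALIZER(pending);

static void
drain(void)
{
	struct record_queue local = STAILQ_HEAD_INITIALIZER(local);
	struct record *r;

	/*
	 * Critical section in the real code: move everything queued so
	 * far onto a local list in one O(1) step, then drop the lock.
	 */
	STAILQ_CONCAT(&local, &pending);

	/* Slow work (VFS writes in ktrace) happens with no lock held. */
	while ((r = STAILQ_FIRST(&local)) != NULL) {
		STAILQ_REMOVE_HEAD(&local, link);
		printf("writing record %d\n", r->seq);
		free(r);
	}
}

int
main(void)
{
	struct record *r;
	int i;

	for (i = 0; i < 4; i++) {
		r = malloc(sizeof(*r));
		r->seq = i;
		STAILQ_INSERT_TAIL(&pending, r, link);
	}
	drain();
	return (0);
}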
@@ -345,12 +401,9 @@
 		req->ktr_header.ktr_len = buflen;
 		req->ktr_buffer = buf;
 	}
-	ktr_submitrequest(req);
+	ktr_submitrequest(curthread, req);
 }
 
-/*
- * MPSAFE
- */
 void
 ktrsysret(code, error, retval)
 	int code, error;
@@ -366,7 +419,36 @@
 	ktp->ktr_code = code;
 	ktp->ktr_error = error;
 	ktp->ktr_retval = retval;		/* what about val2 ? */
-	ktr_submitrequest(req);
+	ktr_submitrequest(curthread, req);
+}
+
+/*
+ * When a process exits, drain per-process asynchronous trace records.
+ */
+void
+ktrprocexit(struct thread *td)
+{
+
+	ktrace_enter(td);
+	sx_xlock(&ktrace_sx);
+	ktr_drain(td);
+	sx_xunlock(&ktrace_sx);
+	ktrace_exit(td);
+}
+
+/*
+ * When a thread returns, drain any asynchronous records generated by the
+ * system call.
+ */
+void
+ktruserret(struct thread *td)
+{
+
+	ktrace_enter(td);
+	sx_xlock(&ktrace_sx);
+	ktr_drain(td);
+	sx_xunlock(&ktrace_sx);
+	ktrace_exit(td);
 }
 
 void
@@ -392,18 +474,9 @@
 		req->ktr_header.ktr_len = namelen;
 		req->ktr_buffer = buf;
 	}
-	ktr_submitrequest(req);
+	ktr_submitrequest(curthread, req);
 }
 
-/*
- * Since the uio may not stay valid, we can not hand off this request to
- * the thread and need to process it synchronously.  However, we wish to
- * keep the relative order of records in a trace file correct, so we
- * do put this request on the queue (if it isn't empty) and then block.
- * The ktrace thread waks us back up when it is time for this event to
- * be posted and blocks until we have completed writing out the event
- * and woken it back up.
- */
 void
 ktrgenio(fd, rw, uio, error)
 	int fd;
@@ -440,7 +513,7 @@
 	ktg->ktr_rw = rw;
 	req->ktr_header.ktr_len = datalen;
 	req->ktr_buffer = buf;
-	ktr_submitrequest(req);
+	ktr_submitrequest(curthread, req);
 }
 
 void
@@ -461,7 +534,7 @@
 	kp->action = action;
 	kp->mask = *mask;
 	kp->code = code;
-	ktr_submitrequest(req);
+	ktr_enqueuerequest(curthread, req);
 }
 
 void
@@ -477,17 +550,12 @@
 	kc = &req->ktr_data.ktr_csw;
 	kc->out = out;
 	kc->user = user;
-	ktr_submitrequest(req);
+	ktr_enqueuerequest(curthread, req);
 }
 #endif /* KTRACE */
 
 /* Interface and common routines */
 
-/*
- * ktrace system call
- *
- * MPSAFE
- */
 #ifndef _SYS_SYSPROTO_H_
 struct ktrace_args {
 	char	*fname;
@@ -510,7 +578,7 @@
 	int ops = KTROP(uap->ops);
 	int descend = uap->ops & KTRFLAG_DESCEND;
 	int nfound, ret = 0;
-	int flags, error = 0;
+	int flags, error = 0, vfslocked;
 	struct nameidata nd;
 	struct ucred *cred;
 
@@ -520,37 +588,40 @@
 	if (ops != KTROP_CLEARFILE && facs == 0)
 		return (EINVAL);
 
-	td->td_pflags |= TDP_INKTRACE;
+	ktrace_enter(td);
 	if (ops != KTROP_CLEAR) {
 		/*
 		 * an operation which requires a file argument.
 		 */
-		NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, uap->fname, td);
+		NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE, UIO_USERSPACE,
+		    uap->fname, td);
 		flags = FREAD | FWRITE | O_NOFOLLOW;
-		mtx_lock(&Giant);
-		error = vn_open(&nd, &flags, 0, -1);
+		error = vn_open(&nd, &flags, 0, NULL);
 		if (error) {
-			mtx_unlock(&Giant);
-			td->td_pflags &= ~TDP_INKTRACE;
+			ktrace_exit(td);
 			return (error);
 		}
+		vfslocked = NDHASGIANT(&nd);
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 		vp = nd.ni_vp;
 		VOP_UNLOCK(vp, 0, td);
 		if (vp->v_type != VREG) {
 			(void) vn_close(vp, FREAD|FWRITE, td->td_ucred, td);
-			mtx_unlock(&Giant);
-			td->td_pflags &= ~TDP_INKTRACE;
+			VFS_UNLOCK_GIANT(vfslocked);
+			ktrace_exit(td);
 			return (EACCES);
 		}
-		mtx_unlock(&Giant);
+		VFS_UNLOCK_GIANT(vfslocked);
 	}
 	/*
 	 * Clear all uses of the tracefile.
 	 */
 	if (ops == KTROP_CLEARFILE) {
+		int vrele_count;
+
+		vrele_count = 0;
 		sx_slock(&allproc_lock);
-		LIST_FOREACH(p, &allproc, p_list) {
+		FOREACH_PROC_IN_SYSTEM(p) {
 			PROC_LOCK(p);
 			if (p->p_tracevp == vp) {
 				if (ktrcanset(td, p)) {
@@ -560,20 +631,20 @@
 					p->p_tracevp = NULL;
 					p->p_traceflag = 0;
 					mtx_unlock(&ktrace_mtx);
-					PROC_UNLOCK(p);
-					mtx_lock(&Giant);
-					(void) vn_close(vp, FREAD|FWRITE,
-						cred, td);
-					mtx_unlock(&Giant);
+					vrele_count++;
 					crfree(cred);
-				} else {
-					PROC_UNLOCK(p);
+				} else
 					error = EPERM;
-				}
-			} else
-				PROC_UNLOCK(p);
+			}
+			PROC_UNLOCK(p);
 		}
 		sx_sunlock(&allproc_lock);
+		if (vrele_count > 0) {
+			vfslocked = VFS_LOCK_GIANT(vp->v_mount);
+			while (vrele_count-- > 0)
+				vrele(vp);
+			VFS_UNLOCK_GIANT(vfslocked);
+		}
 		goto done;
 	}
 	/*
@@ -644,22 +715,17 @@
 		error = EPERM;
 done:
 	if (vp != NULL) {
-		mtx_lock(&Giant);
+		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 		(void) vn_close(vp, FWRITE, td->td_ucred, td);
-		mtx_unlock(&Giant);
+		VFS_UNLOCK_GIANT(vfslocked);
 	}
-	td->td_pflags &= ~TDP_INKTRACE;
+	ktrace_exit(td);
 	return (error);
 #else /* !KTRACE */
 	return (ENOSYS);
 #endif /* KTRACE */
 }
 
-/*
- * utrace system call
- *
- * MPSAFE
- */
 /* ARGSUSED */
 int
 utrace(td, uap)
@@ -689,7 +755,7 @@
 	}
 	req->ktr_buffer = cp;
 	req->ktr_header.ktr_len = uap->len;
-	ktr_submitrequest(req);
+	ktr_submitrequest(td, req);
 	return (0);
 #else /* !KTRACE */
 	return (ENOSYS);
@@ -727,7 +793,7 @@
 			p->p_tracecred = crhold(td->td_ucred);
 		}
 		p->p_traceflag |= facs;
-		if (td->td_ucred->cr_uid == 0)
+		if (priv_check(td, PRIV_KTRACE) == 0)
 			p->p_traceflag |= KTRFAC_ROOT;
 	} else {
 		/* KTROP_CLEAR */
@@ -790,31 +856,48 @@
 }
 
 static void
-ktr_writerequest(struct ktr_request *req)
+ktr_writerequest(struct thread *td, struct ktr_request *req)
 {
 	struct ktr_header *kth;
 	struct vnode *vp;
 	struct proc *p;
-	struct thread *td;
 	struct ucred *cred;
 	struct uio auio;
 	struct iovec aiov[3];
 	struct mount *mp;
 	int datalen, buflen, vrele_count;
-	int error;
+	int error, vfslocked;
+
+	/*
+	 * We hold the vnode and credential for use in I/O in case ktrace is
+	 * disabled on the process as we write out the request.
+	 *
+	 * XXXRW: This is not ideal: we could end up performing a write after
+	 * the vnode has been closed.
+	 */
+	mtx_lock(&ktrace_mtx);
+	vp = td->td_proc->p_tracevp;
+	if (vp != NULL)
+		VREF(vp);
+	cred = td->td_proc->p_tracecred;
+	if (cred != NULL)
+		crhold(cred);
+	mtx_unlock(&ktrace_mtx);
 
-	vp = req->ktr_vp;
 	/*
 	 * If vp is NULL, the vp has been cleared out from under this
-	 * request, so just drop it.
+	 * request, so just drop it.  Make sure the credential and vnode are
+	 * in sync: we should have both or neither.
 	 */
-	if (vp == NULL)
+	if (vp == NULL) {
+		KASSERT(cred == NULL, ("ktr_writerequest: cred != NULL"));
 		return;
+	}
+	KASSERT(cred != NULL, ("ktr_writerequest: cred == NULL"));
+
 	kth = &req->ktr_header;
 	datalen = data_lengths[(u_short)kth->ktr_type & ~KTR_DROP];
 	buflen = kth->ktr_len;
-	cred = req->ktr_cred;
-	td = curthread;
 	auio.uio_iov = &aiov[0];
 	auio.uio_offset = 0;
 	auio.uio_segflg = UIO_SYSSPACE;
@@ -838,7 +921,8 @@
 		auio.uio_resid += buflen;
 		auio.uio_iovcnt++;
 	}
-	mtx_lock(&Giant);
+
+	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	vn_start_write(vp, &mp, V_WAIT);
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 	(void)VOP_LEASE(vp, td, cred, LEASE_WRITE);
@@ -849,7 +933,8 @@
 		error = VOP_WRITE(vp, &auio, IO_UNIT | IO_APPEND, cred);
 	VOP_UNLOCK(vp, 0, td);
 	vn_finished_write(mp);
-	mtx_unlock(&Giant);
+	vrele(vp);
+	VFS_UNLOCK_GIANT(vfslocked);
 	if (!error)
 		return;
 	/*
@@ -869,7 +954,7 @@
 	 */
 	cred = NULL;
 	sx_slock(&allproc_lock);
-	LIST_FOREACH(p, &allproc, p_list) {
+	FOREACH_PROC_IN_SYSTEM(p) {
 		PROC_LOCK(p);
 		if (p->p_tracevp == vp) {
 			mtx_lock(&ktrace_mtx);
@@ -887,21 +972,16 @@
 		}
 	}
 	sx_sunlock(&allproc_lock);
+
 	/*
-	 * Second, clear this vnode from any pending requests.
+	 * We can't clear any pending requests in threads that have cached
+	 * them but not yet committed them, as those are per-thread.  The
+	 * thread will have to clear it itself on system call return.
 	 */
-	mtx_lock(&ktrace_mtx);
-	STAILQ_FOREACH(req, &ktr_todo, ktr_list) {
-		if (req->ktr_vp == vp) {
-			req->ktr_vp = NULL;
-			vrele_count++;
-		}
-	}
-	mtx_unlock(&ktrace_mtx);
-	mtx_lock(&Giant);
+	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
 	while (vrele_count-- > 0)
 		vrele(vp);
-	mtx_unlock(&Giant);
+	VFS_UNLOCK_GIANT(vfslocked);
 }
 
 /*
@@ -919,7 +999,7 @@
 
 	PROC_LOCK_ASSERT(targetp, MA_OWNED);
 	if (targetp->p_traceflag & KTRFAC_ROOT &&
-	    suser_cred(td->td_ucred, SUSER_ALLOWJAIL))
+	    priv_check(td, PRIV_KTRACE))
 		return (0);
 
 	if (p_candebug(td, targetp) != 0)
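
A recurring change in this file is that unconditional Giant acquisition around
vnode I/O is replaced by the VFS_LOCK_GIANT()/VFS_UNLOCK_GIANT() pair, which
only takes Giant when the vnode's filesystem is not MPSAFE.  A minimal
illustrative sketch of the pairing (the wrapper function is invented for the
example, not part of this diff):

	#include <sys/param.h>
	#include <sys/fcntl.h>
	#include <sys/mount.h>
	#include <sys/proc.h>
	#include <sys/vnode.h>

	/* Close a trace vnode, taking Giant only if its filesystem needs it. */
	static int
	close_trace_vnode(struct vnode *vp, struct thread *td)
	{
		int error, vfslocked;

		vfslocked = VFS_LOCK_GIANT(vp->v_mount);
		error = vn_close(vp, FWRITE, td->td_ucred, td);
		VFS_UNLOCK_GIANT(vfslocked);
		return (error);
	}

The same conditional-Giant idiom appears in the ktr_writerequest() rework
above and again in vfs_mount.c below.
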
Index: vfs_mount.c
===================================================================
RCS file: /home/cvs/src/sys/kern/vfs_mount.c,v
retrieving revision 1.4
retrieving revision 1.5
diff -L sys/kern/vfs_mount.c -L sys/kern/vfs_mount.c -u -r1.4 -r1.5
--- sys/kern/vfs_mount.c
+++ sys/kern/vfs_mount.c
@@ -35,18 +35,19 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/vfs_mount.c,v 1.196.2.8 2006/03/13 03:06:27 jeff Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/vfs_mount.c,v 1.265.2.1.2.1 2008/01/20 02:38:42 rodrigc Exp $");
 
 #include <sys/param.h>
 #include <sys/conf.h>
+#include <sys/clock.h>
 #include <sys/jail.h>
 #include <sys/kernel.h>
 #include <sys/libkern.h>
-#include <sys/mac.h>
 #include <sys/malloc.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
+#include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/filedesc.h>
 #include <sys/reboot.h>
@@ -57,11 +58,15 @@
 #include <sys/sysent.h>
 #include <sys/systm.h>
 #include <sys/vnode.h>
+#include <vm/uma.h>
 
 #include <geom/geom.h>
 
 #include <machine/stdarg.h>
 
+#include <security/audit/audit.h>
+#include <security/mac/mac_framework.h>
+
 #include "opt_rootdevname.h"
 #include "opt_ddb.h"
 #include "opt_mac.h"
@@ -75,14 +80,12 @@
 
 static int	vfs_domount(struct thread *td, const char *fstype,
 		    char *fspath, int fsflags, void *fsdata);
-static int	vfs_mount_alloc(struct vnode *dvp, struct vfsconf *vfsp,
-		    const char *fspath, struct thread *td, struct mount **mpp);
 static int	vfs_mountroot_ask(void);
 static int	vfs_mountroot_try(const char *mountfrom);
 static int	vfs_donmount(struct thread *td, int fsflags,
 		    struct uio *fsoptions);
 static void	free_mntarg(struct mntarg *ma);
-static void	vfs_mount_destroy(struct mount *, struct thread *);
+static int	vfs_getopt_pos(struct vfsoptlist *opts, const char *name);
 
 static int	usermount = 0;
 SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0,
@@ -90,6 +93,7 @@
 
 MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount structure");
 MALLOC_DEFINE(M_VNODE_MARKER, "vnodemarker", "vnode marker");
+static uma_zone_t mount_zone;
 
 /* List of mounted filesystems. */
 struct mntlist mountlist = TAILQ_HEAD_INITIALIZER(mountlist);
@@ -127,13 +131,14 @@
  * Global opts, taken by all filesystems
  */
 static const char *global_opts[] = {
+	"errmsg",
 	"fstype",
 	"fspath",
-	"rdonly",
 	"ro",
 	"rw",
-	"suid",
-	"exec",
+	"nosuid",
+	"noexec",
+	"update",
 	NULL
 };
 
@@ -176,7 +181,7 @@
 }
 
 /* Release all resources related to the mount options. */
-static void
+void
 vfs_freeopts(struct vfsoptlist *opts)
 {
 	struct vfsopt *opt;
@@ -188,6 +193,17 @@
 	free(opts, M_MOUNT);
 }
 
+void
+vfs_deleteopt(struct vfsoptlist *opts, const char *name)
+{
+	struct vfsopt *opt, *temp;
+
+	TAILQ_FOREACH_SAFE(opt, opts, link, temp)  {
+		if (strcmp(opt->name, name) == 0)
+			vfs_freeopt(opts, opt);
+	}
+}
+
 /*
  * Check if options are equal (with or without the "no" prefix).
  */
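
vfs_deleteopt() above removes list entries while it is still walking the list,
which is only safe with the _SAFE iterator because it latches the next element
before the body frees the current one.  A stand-alone illustration of the same
queue(3) idiom (the item structure and function are invented for the example):

	#include <sys/param.h>
	#include <sys/queue.h>
	#include <sys/malloc.h>

	struct item {
		TAILQ_ENTRY(item)	link;
		int			stale;
	};
	TAILQ_HEAD(itemq, item);

	static void
	purge_stale_items(struct itemq *q)
	{
		struct item *it, *tmp;

		/* "tmp" keeps the walk valid across TAILQ_REMOVE()/free(). */
		TAILQ_FOREACH_SAFE(it, q, link, tmp) {
			if (it->stale) {
				TAILQ_REMOVE(q, it, link);
				free(it, M_TEMP);
			}
		}
	}
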
@@ -351,8 +367,7 @@
 }
 
 /*
- * ---------------------------------------------------------------------
- * Mount a filesystem
+ * Mount a filesystem.
  */
 int
 nmount(td, uap)
@@ -369,9 +384,15 @@
 	int error;
 	u_int iovcnt;
 
-	/* Kick out MNT_ROOTFS early as it is legal internally */
-	if (uap->flags & MNT_ROOTFS)
-		return (EINVAL);
+	AUDIT_ARG(fflags, uap->flags);
+
+	/*
+	 * Filter out MNT_ROOTFS.  We do not want clients of nmount() in
+	 * userspace to set this flag, but we must filter it out if we want
+	 * MNT_UPDATE on the root file system to work.
+	 * MNT_ROOTFS should only be set in the kernel in vfs_mountroot_try().
+	 */
+	uap->flags &= ~MNT_ROOTFS;
 
 	iovcnt = uap->iovcnt;
 	/*
@@ -393,6 +414,7 @@
 		iov++;
 	}
 	error = vfs_donmount(td, uap->flags, auio);
+
 	free(auio, M_IOV);
 	return (error);
 }
@@ -420,27 +442,48 @@
 	MNT_IUNLOCK(mp);
 }
 
+static int
+mount_init(void *mem, int size, int flags)
+{
+	struct mount *mp;
+
+	mp = (struct mount *)mem;
+	mtx_init(&mp->mnt_mtx, "struct mount mtx", NULL, MTX_DEF);
+	lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0);
+	return (0);
+}
+
+static void
+mount_fini(void *mem, int size)
+{
+	struct mount *mp;
+
+	mp = (struct mount *)mem;
+	lockdestroy(&mp->mnt_lock);
+	mtx_destroy(&mp->mnt_mtx);
+}
+
 /*
  * Allocate and initialize the mount point struct.
  */
-static int
+struct mount *
 vfs_mount_alloc(struct vnode *vp, struct vfsconf *vfsp,
-    const char *fspath, struct thread *td, struct mount **mpp)
+    const char *fspath, struct thread *td)
 {
 	struct mount *mp;
 
-	mp = malloc(sizeof(struct mount), M_MOUNT, M_WAITOK | M_ZERO);
+	mp = uma_zalloc(mount_zone, M_WAITOK);
+	bzero(&mp->mnt_startzero,
+	    __rangeof(struct mount, mnt_startzero, mnt_endzero));
 	TAILQ_INIT(&mp->mnt_nvnodelist);
 	mp->mnt_nvnodelistsize = 0;
-	mtx_init(&mp->mnt_mtx, "struct mount mtx", NULL, MTX_DEF);
-	lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0);
-	(void) vfs_busy(mp, LK_NOWAIT, 0, td);
 	mp->mnt_ref = 0;
+	(void) vfs_busy(mp, LK_NOWAIT, 0, td);
 	mp->mnt_op = vfsp->vfc_vfsops;
 	mp->mnt_vfc = vfsp;
 	vfsp->vfc_refcount++;	/* XXX Unlocked */
 	mp->mnt_stat.f_type = vfsp->vfc_typenum;
-	mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
+	mp->mnt_gen++;
 	strlcpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
 	mp->mnt_vnodecovered = vp;
 	mp->mnt_cred = crdup(td->td_ucred);
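
struct mount now comes from a uma(9) zone, so mnt_mtx and mnt_lock are set up
once per backing item in mount_init()/mount_fini() instead of on every
allocation, and vfs_mount_alloc() only bzero()s the region between
mnt_startzero and mnt_endzero.  A reduced sketch of the zone init/fini
pattern, with invented names:

	#include <sys/param.h>
	#include <sys/lock.h>
	#include <sys/mutex.h>
	#include <vm/uma.h>

	struct my_obj {
		struct mtx	o_mtx;		/* survives alloc/free cycles */
		int		o_state;
	};

	static uma_zone_t my_zone;

	static int
	my_obj_init(void *mem, int size, int flags)
	{
		struct my_obj *o = mem;

		mtx_init(&o->o_mtx, "my_obj mtx", NULL, MTX_DEF);
		return (0);
	}

	static void
	my_obj_fini(void *mem, int size)
	{
		struct my_obj *o = mem;

		mtx_destroy(&o->o_mtx);
	}

	static void
	my_zone_setup(void)
	{
		/* No ctor/dtor; init/fini run as items enter/leave the zone. */
		my_zone = uma_zcreate("my_obj", sizeof(struct my_obj),
		    NULL, NULL, my_obj_init, my_obj_fini, UMA_ALIGN_PTR, 0);
	}

Items returned by uma_zalloc(my_zone, M_WAITOK) then arrive with the mutex
already initialized, which is why mount_fini() can defer lockdestroy() and
mtx_destroy() until the zone itself releases the memory.
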
@@ -452,19 +495,17 @@
 	mac_create_mount(td->td_ucred, mp);
 #endif
 	arc4rand(&mp->mnt_hashseed, sizeof mp->mnt_hashseed, 0);
-	*mpp = mp;
-	return (0);
+	return (mp);
 }
 
 /*
  * Destroy the mount struct previously allocated by vfs_mount_alloc().
  */
-static void
-vfs_mount_destroy(struct mount *mp, struct thread *td)
+void
+vfs_mount_destroy(struct mount *mp)
 {
 	int i;
 
-	vfs_unbusy(mp, td);
 	MNT_ILOCK(mp);
 	for (i = 0; mp->mnt_ref && i < 3; i++)
 		msleep(mp, MNT_MTX(mp), PVFS, "mntref", hz);
@@ -508,9 +549,13 @@
 	}
 	MNT_IUNLOCK(mp);
 	mp->mnt_vfc->vfc_refcount--;
-	if (!TAILQ_EMPTY(&mp->mnt_nvnodelist))
+	if (!TAILQ_EMPTY(&mp->mnt_nvnodelist)) {
+		struct vnode *vp;
+
+		TAILQ_FOREACH(vp, &mp->mnt_nvnodelist, v_nmntvnodes)
+			vprint("", vp);
 		panic("unmount: dangling vnode");
-	lockdestroy(&mp->mnt_lock);
+	}
 	MNT_ILOCK(mp);
 	if (mp->mnt_kern_flag & MNTK_MWAIT)
 		wakeup(mp);
@@ -524,27 +569,37 @@
 	mp->mnt_nvnodelistsize = -1000;
 	mp->mnt_secondary_writes = -1000;
 	MNT_IUNLOCK(mp);
-	mtx_destroy(&mp->mnt_mtx);
 #ifdef MAC
 	mac_destroy_mount(mp);
 #endif
 	if (mp->mnt_opt != NULL)
 		vfs_freeopts(mp->mnt_opt);
 	crfree(mp->mnt_cred);
-	free(mp, M_MOUNT);
+	uma_zfree(mount_zone, mp);
 }
 
 static int
 vfs_donmount(struct thread *td, int fsflags, struct uio *fsoptions)
 {
 	struct vfsoptlist *optlist;
-	char *fstype, *fspath;
-	int error, fstypelen, fspathlen;
+	struct vfsopt *opt, *noro_opt;
+	char *fstype, *fspath, *errmsg;
+	int error, fstypelen, fspathlen, errmsg_len, errmsg_pos;
+	int has_rw, has_noro;
+
+	errmsg = NULL;
+	errmsg_len = 0;
+	errmsg_pos = -1;
+	has_rw = 0;
+	has_noro = 0;
 
 	error = vfs_buildopts(fsoptions, &optlist);
 	if (error)
 		return (error);
 
+	if (vfs_getopt(optlist, "errmsg", (void **)&errmsg, &errmsg_len) == 0)
+		errmsg_pos = vfs_getopt_pos(optlist, "errmsg");
+
 	/*
 	 * We need these two options before the others,
 	 * and they are mandatory for any filesystem.
@@ -554,12 +609,16 @@
 	error = vfs_getopt(optlist, "fstype", (void **)&fstype, &fstypelen);
 	if (error || fstype[fstypelen - 1] != '\0') {
 		error = EINVAL;
+		if (errmsg != NULL)
+			strncpy(errmsg, "Invalid fstype", errmsg_len);
 		goto bail;
 	}
 	fspathlen = 0;
 	error = vfs_getopt(optlist, "fspath", (void **)&fspath, &fspathlen);
 	if (error || fspath[fspathlen - 1] != '\0') {
 		error = EINVAL;
+		if (errmsg != NULL)
+			strncpy(errmsg, "Invalid fspath", errmsg_len);
 		goto bail;
 	}
 
@@ -568,63 +627,92 @@
 	 * before we call vfs_domount(), since vfs_domount() has special
 	 * logic based on MNT_UPDATE.  This is very important
 	 * when we want to update the root filesystem.
-	 */ 
-	if (vfs_getopt(optlist, "update", NULL, NULL) == 0)
-		fsflags |= MNT_UPDATE;
-
-	if (vfs_getopt(optlist, "async", NULL, NULL) == 0)
-		fsflags |= MNT_ASYNC;
-
-	if (vfs_getopt(optlist, "force", NULL, NULL) == 0)
-		fsflags |= MNT_FORCE;
-
-	if (vfs_getopt(optlist, "multilabel", NULL, NULL) == 0)
-		fsflags |= MNT_MULTILABEL;
-
-	if (vfs_getopt(optlist, "noasync", NULL, NULL) == 0)
-		fsflags &= ~MNT_ASYNC;
-
-	if (vfs_getopt(optlist, "noatime", NULL, NULL) == 0)
-		fsflags |= MNT_NOATIME;
-
-	if (vfs_getopt(optlist, "noclusterr", NULL, NULL) == 0)
-		fsflags |= MNT_NOCLUSTERR;
-
-	if (vfs_getopt(optlist, "noclusterw", NULL, NULL) == 0)
-		fsflags |= MNT_NOCLUSTERW;
-
-	if (vfs_getopt(optlist, "noexec", NULL, NULL) == 0)
-		fsflags |= MNT_NOEXEC;
-
-	if (vfs_getopt(optlist, "nosuid", NULL, NULL) == 0)
-		fsflags |= MNT_NOSUID;
-
-	if (vfs_getopt(optlist, "nosymfollow", NULL, NULL) == 0)
-		fsflags |= MNT_NOSYMFOLLOW;
-
-	if (vfs_getopt(optlist, "noro", NULL, NULL) == 0)
-		fsflags &= ~MNT_RDONLY;
-
-	if (vfs_getopt(optlist, "ro", NULL, NULL) == 0)
-		fsflags |= MNT_RDONLY;
-
-	if (vfs_getopt(optlist, "rdonly", NULL, NULL) == 0)
-		fsflags |= MNT_RDONLY;
-
-	if (vfs_getopt(optlist, "rw", NULL, NULL) == 0)
-		fsflags &= ~MNT_RDONLY;
-
-	if (vfs_getopt(optlist, "snapshot", NULL, NULL) == 0)
-		fsflags |= MNT_SNAPSHOT;
-
-	if (vfs_getopt(optlist, "suiddir", NULL, NULL) == 0)
-		fsflags |= MNT_SUIDDIR;
-
-	if (vfs_getopt(optlist, "sync", NULL, NULL) == 0)
-		fsflags |= MNT_SYNCHRONOUS;
-
-	if (vfs_getopt(optlist, "union", NULL, NULL) == 0)
-		fsflags |= MNT_UNION;
+	 */
+	TAILQ_FOREACH(opt, optlist, link) {
+		if (strcmp(opt->name, "update") == 0)
+			fsflags |= MNT_UPDATE;
+		else if (strcmp(opt->name, "async") == 0)
+			fsflags |= MNT_ASYNC;
+		else if (strcmp(opt->name, "force") == 0)
+			fsflags |= MNT_FORCE;
+		else if (strcmp(opt->name, "multilabel") == 0)
+			fsflags |= MNT_MULTILABEL;
+		else if (strcmp(opt->name, "noasync") == 0)
+			fsflags &= ~MNT_ASYNC;
+		else if (strcmp(opt->name, "noatime") == 0)
+			fsflags |= MNT_NOATIME;
+		else if (strcmp(opt->name, "atime") == 0) {
+			free(opt->name, M_MOUNT);
+			opt->name = strdup("nonoatime", M_MOUNT);
+		}
+		else if (strcmp(opt->name, "noclusterr") == 0)
+			fsflags |= MNT_NOCLUSTERR;
+		else if (strcmp(opt->name, "clusterr") == 0) {
+			free(opt->name, M_MOUNT);
+			opt->name = strdup("nonoclusterr", M_MOUNT);
+		}
+		else if (strcmp(opt->name, "noclusterw") == 0)
+			fsflags |= MNT_NOCLUSTERW;
+		else if (strcmp(opt->name, "clusterw") == 0) {
+			free(opt->name, M_MOUNT);
+			opt->name = strdup("nonoclusterw", M_MOUNT);
+		}
+		else if (strcmp(opt->name, "noexec") == 0)
+			fsflags |= MNT_NOEXEC;
+		else if (strcmp(opt->name, "exec") == 0) {
+			free(opt->name, M_MOUNT);
+			opt->name = strdup("nonoexec", M_MOUNT);
+		}
+		else if (strcmp(opt->name, "nosuid") == 0)
+			fsflags |= MNT_NOSUID;
+		else if (strcmp(opt->name, "suid") == 0) {
+			free(opt->name, M_MOUNT);
+			opt->name = strdup("nonosuid", M_MOUNT);
+		}
+		else if (strcmp(opt->name, "nosymfollow") == 0)
+			fsflags |= MNT_NOSYMFOLLOW;
+		else if (strcmp(opt->name, "symfollow") == 0) {
+			free(opt->name, M_MOUNT);
+			opt->name = strdup("nonosymfollow", M_MOUNT);
+		}
+		else if (strcmp(opt->name, "noro") == 0) {
+			fsflags &= ~MNT_RDONLY;
+			has_noro = 1;
+		}
+		else if (strcmp(opt->name, "rw") == 0) {
+			fsflags &= ~MNT_RDONLY;
+			has_rw = 1;
+		}
+		else if (strcmp(opt->name, "ro") == 0)
+			fsflags |= MNT_RDONLY;
+		else if (strcmp(opt->name, "rdonly") == 0) {
+			free(opt->name, M_MOUNT);
+			opt->name = strdup("ro", M_MOUNT);
+			fsflags |= MNT_RDONLY;
+		}
+		else if (strcmp(opt->name, "snapshot") == 0)
+			fsflags |= MNT_SNAPSHOT;
+		else if (strcmp(opt->name, "suiddir") == 0)
+			fsflags |= MNT_SUIDDIR;
+		else if (strcmp(opt->name, "sync") == 0)
+			fsflags |= MNT_SYNCHRONOUS;
+		else if (strcmp(opt->name, "union") == 0)
+			fsflags |= MNT_UNION;
+	}
+
+	/*
+	 * If "rw" was specified as a mount option, and we
+	 * are trying to update a mount-point from "ro" to "rw",
+	 * we need a mount option "noro", since in vfs_mergeopts(),
+	 * "noro" will cancel "ro", but "rw" will not do anything.
+	 */
+	if (has_rw && !has_noro) {
+		noro_opt = malloc(sizeof(struct vfsopt), M_MOUNT, M_WAITOK);
+		noro_opt->name = strdup("noro", M_MOUNT);
+		noro_opt->value = NULL;
+		noro_opt->len = 0;
+		TAILQ_INSERT_TAIL(optlist, noro_opt, link);
+	}
 
 	/*
 	 * Be ultra-paranoid about making sure the type and fspath
@@ -640,13 +728,26 @@
 	error = vfs_domount(td, fstype, fspath, fsflags, optlist);
 	mtx_unlock(&Giant);
 bail:
-	if (error)
+	/* copyout the errmsg */
+	if (errmsg_pos != -1 && ((2 * errmsg_pos + 1) < fsoptions->uio_iovcnt)
+	    && errmsg_len > 0 && errmsg != NULL) {
+		if (fsoptions->uio_segflg == UIO_SYSSPACE) {
+			bcopy(errmsg,
+			    fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base,
+			    fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len);
+		} else {
+			copyout(errmsg,
+			    fsoptions->uio_iov[2 * errmsg_pos + 1].iov_base,
+			    fsoptions->uio_iov[2 * errmsg_pos + 1].iov_len);
+		}
+	}
+
+	if (error != 0)
 		vfs_freeopts(optlist);
 	return (error);
 }
 
 /*
- * ---------------------------------------------------------------------
  * Old mount API.
  */
 #ifndef _SYS_SYSPROTO_H_
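
The new "errmsg" handling above copies a human-readable failure reason back
into the caller's iovec when vfs_donmount() or the filesystem (via
vfs_mount_error(), added further down) reports an error.  A hedged userland
sketch of how a mount(8)-style caller might use it; the device and mount
point are placeholders:

	#include <sys/uio.h>
	#include <sys/mount.h>
	#include <errno.h>
	#include <stdio.h>
	#include <string.h>

	int
	mount_ufs_with_errmsg(void)
	{
		char errmsg[255] = "";
		struct iovec iov[8];

		/* Options travel as name/value iovec pairs. */
		iov[0].iov_base = "fstype";	iov[0].iov_len = sizeof("fstype");
		iov[1].iov_base = "ufs";	iov[1].iov_len = sizeof("ufs");
		iov[2].iov_base = "fspath";	iov[2].iov_len = sizeof("fspath");
		iov[3].iov_base = "/mnt";	iov[3].iov_len = sizeof("/mnt");
		iov[4].iov_base = "from";	iov[4].iov_len = sizeof("from");
		iov[5].iov_base = "/dev/da0s1a"; iov[5].iov_len = sizeof("/dev/da0s1a");
		iov[6].iov_base = "errmsg";	iov[6].iov_len = sizeof("errmsg");
		iov[7].iov_base = errmsg;	iov[7].iov_len = sizeof(errmsg);

		if (nmount(iov, 8, 0) == -1) {
			fprintf(stderr, "nmount: %s\n",
			    errmsg[0] != '\0' ? errmsg : strerror(errno));
			return (-1);
		}
		return (0);
	}
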
@@ -673,26 +774,35 @@
 	struct mntarg *ma = NULL;
 	int error;
 
-	/* Kick out MNT_ROOTFS early as it is legal internally */
-	uap->flags &= ~MNT_ROOTFS;
+	AUDIT_ARG(fflags, uap->flags);
 
-	if (uap->data == NULL)
-		return (EINVAL);
+	/*
+	 * Filter out MNT_ROOTFS.  We do not want clients of mount() in
+	 * userspace to set this flag, but we must filter it out if we want
+	 * MNT_UPDATE on the root file system to work.
+	 * MNT_ROOTFS should only be set in the kernel in vfs_mountroot_try().
+	 */
+	uap->flags &= ~MNT_ROOTFS;
 
 	fstype = malloc(MFSNAMELEN, M_TEMP, M_WAITOK);
 	error = copyinstr(uap->type, fstype, MFSNAMELEN, NULL);
-	if (!error) {
-		mtx_lock(&Giant);	/* XXX ? */
-		vfsp = vfs_byname_kld(fstype, td, &error);
-		mtx_unlock(&Giant);
+	if (error) {
+		free(fstype, M_TEMP);
+		return (error);
 	}
+
+	AUDIT_ARG(text, fstype);
+	mtx_lock(&Giant);
+	vfsp = vfs_byname_kld(fstype, td, &error);
 	free(fstype, M_TEMP);
-	if (error)
-		return (error);
-	if (vfsp == NULL)
+	if (vfsp == NULL) {
+		mtx_unlock(&Giant);
 		return (ENOENT);
-	if (vfsp->vfc_vfsops->vfs_cmount == NULL)
+	}
+	if (vfsp->vfc_vfsops->vfs_cmount == NULL) {
+		mtx_unlock(&Giant);
 		return (EOPNOTSUPP);
+	}
 
 	ma = mount_argsu(ma, "fstype", uap->type, MNAMELEN);
 	ma = mount_argsu(ma, "fspath", uap->path, MNAMELEN);
@@ -701,6 +811,7 @@
 	ma = mount_argb(ma, !(uap->flags & MNT_NOEXEC), "noexec");
 
 	error = vfsp->vfc_vfsops->vfs_cmount(ma, uap->data, uap->flags, td);
+	mtx_unlock(&Giant);
 	return (error);
 }
 
@@ -710,7 +821,7 @@
  */
 static int
 vfs_domount(
-	struct thread *td,	/* Flags common to all filesystems. */
+	struct thread *td,	/* Calling thread. */
 	const char *fstype,	/* Filesystem type. */
 	char *fspath,		/* Mount path. */
 	int fsflags,		/* Flags common to all filesystems. */
@@ -720,7 +831,8 @@
 	struct vnode *vp;
 	struct mount *mp;
 	struct vfsconf *vfsp;
-	int error, flag = 0, kern_flag = 0;
+	struct export_args export;
+	int error, flag = 0;
 	struct vattr va;
 	struct nameidata nd;
 
@@ -733,26 +845,31 @@
 	if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN)
 		return (ENAMETOOLONG);
 
-	if (jailed(td->td_ucred))
-		return (EPERM);
-	if (usermount == 0) {
-		if ((error = suser(td)) != 0)
+	if (jailed(td->td_ucred) || usermount == 0) {
+		if ((error = priv_check(td, PRIV_VFS_MOUNT)) != 0)
 			return (error);
 	}
 
 	/*
 	 * Do not allow NFS export or MNT_SUIDDIR by unprivileged users.
 	 */
-	if (fsflags & (MNT_EXPORTED | MNT_SUIDDIR)) {
-		if ((error = suser(td)) != 0)
+	if (fsflags & MNT_EXPORTED) {
+		error = priv_check(td, PRIV_VFS_MOUNT_EXPORTED);
+		if (error)
+			return (error);
+	}
+	if (fsflags & MNT_SUIDDIR) {
+		error = priv_check(td, PRIV_VFS_MOUNT_SUIDDIR);
+		if (error)
 			return (error);
 	}
 	/*
-	 * Silently enforce MNT_NOSUID and MNT_USER for
-	 * unprivileged users.
+	 * Silently enforce MNT_NOSUID and MNT_USER for unprivileged users.
 	 */
-	if (suser(td) != 0)
-		fsflags |= MNT_NOSUID | MNT_USER;
+	if ((fsflags & (MNT_NOSUID | MNT_USER)) != (MNT_NOSUID | MNT_USER)) {
+		if (priv_check(td, PRIV_VFS_MOUNT_NONUSER) != 0)
+			fsflags |= MNT_NOSUID | MNT_USER;
+	}
 
 	/* Load KLDs before we lock the covered vnode to avoid reversals. */
 	vfsp = NULL;
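
Throughout this function the blanket suser() checks become fine-grained
priv(9) checks, so each privileged operation names the specific privilege it
needs (PRIV_VFS_MOUNT, PRIV_VFS_MOUNT_EXPORTED, and so on) and jail policy can
be applied per privilege.  A minimal sketch of the pattern; the wrapper
function is invented:

	#include <sys/param.h>
	#include <sys/priv.h>
	#include <sys/proc.h>

	static int
	require_mount_privilege(struct thread *td)
	{
		int error;

		/* Returns 0 for privileged callers, an errno otherwise. */
		error = priv_check(td, PRIV_VFS_MOUNT);
		if (error != 0)
			return (error);
		/* ... privileged work goes here ... */
		return (0);
	}
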
@@ -764,11 +881,14 @@
 			vfsp = vfs_byname_kld(fstype, td, &error);
 		if (vfsp == NULL)
 			return (ENODEV);
+		if (jailed(td->td_ucred) && !(vfsp->vfc_flags & VFCF_JAIL))
+			return (EPERM);
 	}
 	/*
 	 * Get vnode to be covered
 	 */
-	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, fspath, td);
+	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | AUDITVNODE1, UIO_SYSSPACE,
+	    fspath, td);
 	if ((error = namei(&nd)) != 0)
 		return (error);
 	NDFREE(&nd, NDF_ONLY_PNBUF);
@@ -779,17 +899,19 @@
 			return (EINVAL);
 		}
 		mp = vp->v_mount;
+		MNT_ILOCK(mp);
 		flag = mp->mnt_flag;
-		kern_flag = mp->mnt_kern_flag;
 		/*
 		 * We only allow the filesystem to be reloaded if it
 		 * is currently mounted read-only.
 		 */
 		if ((fsflags & MNT_RELOAD) &&
 		    ((mp->mnt_flag & MNT_RDONLY) == 0)) {
+			MNT_IUNLOCK(mp);
 			vput(vp);
 			return (EOPNOTSUPP);	/* Needs translation */
 		}
+		MNT_IUNLOCK(mp);
 		/*
 		 * Only privileged root, or (if MNT_USER is set) the user that
 		 * did the original mount is permitted to update it.
@@ -813,8 +935,10 @@
 		}
 		vp->v_iflag |= VI_MOUNT;
 		VI_UNLOCK(vp);
+		MNT_ILOCK(mp);
 		mp->mnt_flag |= fsflags &
 		    (MNT_RELOAD | MNT_FORCE | MNT_UPDATE | MNT_SNAPSHOT | MNT_ROOTFS);
+		MNT_IUNLOCK(mp);
 		VOP_UNLOCK(vp, 0, td);
 		mp->mnt_optnew = fsdata;
 		vfs_mergeopts(mp->mnt_optnew, mp->mnt_opt);
@@ -829,7 +953,9 @@
 			return (error);
 		}
 		if (va.va_uid != td->td_ucred->cr_uid) {
-			if ((error = suser(td)) != 0) {
+			error = priv_check_cred(td->td_ucred, PRIV_VFS_ADMIN,
+			    0);
+			if (error) {
 				vput(vp);
 				return (error);
 			}
@@ -856,11 +982,7 @@
 		/*
 		 * Allocate and initialize the filesystem.
 		 */
-		error = vfs_mount_alloc(vp, vfsp, fspath, td, &mp);
-		if (error) {
-			vput(vp);
-			return (error);
-		}
+		mp = vfs_mount_alloc(vp, vfsp, fspath, td);
 		VOP_UNLOCK(vp, 0, td);
 
 		/* XXXMAC: pass to vfs_mount_alloc? */
@@ -870,16 +992,30 @@
 	/*
 	 * Set the mount level flags.
 	 */
-	if (fsflags & MNT_RDONLY)
-		mp->mnt_flag |= MNT_RDONLY;
-	mp->mnt_flag &=~ MNT_UPDATEMASK;
-	mp->mnt_flag |= fsflags & (MNT_UPDATEMASK | MNT_FORCE | MNT_ROOTFS);
+	MNT_ILOCK(mp);
+	mp->mnt_flag = (mp->mnt_flag & ~MNT_UPDATEMASK) |
+		(fsflags & (MNT_UPDATEMASK | MNT_FORCE | MNT_ROOTFS |
+			    MNT_RDONLY));
+	if ((mp->mnt_flag & MNT_ASYNC) == 0)
+		mp->mnt_kern_flag &= ~MNTK_ASYNC;
+	MNT_IUNLOCK(mp);
 	/*
 	 * Mount the filesystem.
 	 * XXX The final recipients of VFS_MOUNT just overwrite the ndp they
 	 * get.  No freeing of cn_pnbuf.
 	 */
         error = VFS_MOUNT(mp, td);
+
+	/*
+	 * Process the export option only if we are
+	 * updating mount options.
+	 */
+	if (!error && (fsflags & MNT_UPDATE)) {
+		if (vfs_copyopt(mp->mnt_optnew, "export", &export,
+		    sizeof(export)) == 0)
+			error = vfs_export(mp, &export);
+	}
+
 	if (!error) {
 		if (mp->mnt_opt != NULL)
 			vfs_freeopts(mp->mnt_opt);
@@ -892,12 +1028,18 @@
 	*/
 	mp->mnt_optnew = NULL;
 	if (mp->mnt_flag & MNT_UPDATE) {
-		mp->mnt_flag &=
-		    ~(MNT_UPDATE | MNT_RELOAD | MNT_FORCE | MNT_SNAPSHOT);
-		if (error) {
-			mp->mnt_flag = flag;
-			mp->mnt_kern_flag = kern_flag;
-		}
+		MNT_ILOCK(mp);
+		if (error)
+			mp->mnt_flag = (mp->mnt_flag & MNT_QUOTA) |
+				(flag & ~MNT_QUOTA);
+		else
+			mp->mnt_flag &=	~(MNT_UPDATE | MNT_RELOAD |
+					  MNT_FORCE | MNT_SNAPSHOT);
+		if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0)
+			mp->mnt_kern_flag |= MNTK_ASYNC;
+		else
+			mp->mnt_kern_flag &= ~MNTK_ASYNC;
+		MNT_IUNLOCK(mp);
 		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
 			if (mp->mnt_syncer == NULL)
 				error = vfs_allocate_syncvnode(mp);
@@ -913,6 +1055,12 @@
 		vrele(vp);
 		return (error);
 	}
+	MNT_ILOCK(mp);
+	if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0)
+		mp->mnt_kern_flag |= MNTK_ASYNC;
+	else
+		mp->mnt_kern_flag &= ~MNTK_ASYNC;
+	MNT_IUNLOCK(mp);
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
 	/*
 	 * Put the new filesystem on the mount list after root.
@@ -943,18 +1091,18 @@
 		VI_LOCK(vp);
 		vp->v_iflag &= ~VI_MOUNT;
 		VI_UNLOCK(vp);
-		vfs_mount_destroy(mp, td);
+		vfs_unbusy(mp, td);
+		vfs_mount_destroy(mp);
 		vput(vp);
 	}
 	return (error);
 }
 
 /*
- * ---------------------------------------------------------------------
  * Unmount a filesystem.
  *
- * Note: unmount takes a path to the vnode mounted on as argument,
- * not special file (as before).
+ * Note: unmount takes a path to the vnode mounted on as argument, not
+ * special file (as before).
  */
 #ifndef _SYS_SYSPROTO_H_
 struct unmount_args {
@@ -975,10 +1123,9 @@
 	char *pathbuf;
 	int error, id0, id1;
 
-	if (jailed(td->td_ucred))
-		return (EPERM);
-	if (usermount == 0) {
-		if ((error = suser(td)) != 0)
+	if (jailed(td->td_ucred) || usermount == 0) {
+		error = priv_check(td, PRIV_VFS_UNMOUNT);
+		if (error)
 			return (error);
 	}
 
@@ -988,9 +1135,12 @@
 		free(pathbuf, M_TEMP);
 		return (error);
 	}
+	AUDIT_ARG(upath, td, pathbuf, ARG_UPATH1);
+	mtx_lock(&Giant);
 	if (uap->flags & MNT_BYFSID) {
 		/* Decode the filesystem ID. */
 		if (sscanf(pathbuf, "FSID:%d:%d", &id0, &id1) != 2) {
+			mtx_unlock(&Giant);
 			free(pathbuf, M_TEMP);
 			return (EINVAL);
 		}
@@ -1018,23 +1168,17 @@
 		 * now, so in the !MNT_BYFSID case return the more likely
 		 * EINVAL for compatibility.
 		 */
+		mtx_unlock(&Giant);
 		return ((uap->flags & MNT_BYFSID) ? ENOENT : EINVAL);
 	}
 
 	/*
-	 * Only privileged root, or (if MNT_USER is set) the user that did the
-	 * original mount is permitted to unmount this filesystem.
-	 */
-	error = vfs_suser(mp, td);
-	if (error)
-		return (error);
-
-	/*
 	 * Don't allow unmounting the root filesystem.
 	 */
-	if (mp->mnt_flag & MNT_ROOTFS)
+	if (mp->mnt_flag & MNT_ROOTFS) {
+		mtx_unlock(&Giant);
 		return (EINVAL);
-	mtx_lock(&Giant);
+	}
 	error = dounmount(mp, uap->flags, td);
 	mtx_unlock(&Giant);
 	return (error);
@@ -1052,11 +1196,37 @@
 	struct vnode *coveredvp, *fsrootvp;
 	int error;
 	int async_flag;
+	int mnt_gen_r;
 
 	mtx_assert(&Giant, MA_OWNED);
 
-	if ((coveredvp = mp->mnt_vnodecovered) != NULL)
-		vn_lock(coveredvp, LK_EXCLUSIVE | LK_RETRY, td);
+	if ((coveredvp = mp->mnt_vnodecovered) != NULL) {
+		mnt_gen_r = mp->mnt_gen;
+		VI_LOCK(coveredvp);
+		vholdl(coveredvp);
+		vn_lock(coveredvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_RETRY, td);
+		vdrop(coveredvp);
+		/*
+		 * Check for mp being unmounted while waiting for the
+		 * covered vnode lock.
+		 */
+		if (coveredvp->v_mountedhere != mp ||
+		    coveredvp->v_mountedhere->mnt_gen != mnt_gen_r) {
+			VOP_UNLOCK(coveredvp, 0, td);
+			return (EBUSY);
+		}
+	}
+	/*
+	 * Only privileged root, or (if MNT_USER is set) the user that did the
+	 * original mount is permitted to unmount this filesystem.
+	 */
+	error = vfs_suser(mp, td);
+	if (error) {
+		if (coveredvp)
+			VOP_UNLOCK(coveredvp, 0, td);
+		return (error);
+	}
+
 	MNT_ILOCK(mp);
 	if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
 		MNT_IUNLOCK(mp);
@@ -1064,7 +1234,7 @@
 			VOP_UNLOCK(coveredvp, 0, td);
 		return (EBUSY);
 	}
-	mp->mnt_kern_flag |= MNTK_UNMOUNT;
+	mp->mnt_kern_flag |= MNTK_UNMOUNT | MNTK_NOINSMNTQ;
 	/* Allow filesystems to detect that a forced unmount is in progress. */
 	if (flags & MNT_FORCE)
 		mp->mnt_kern_flag |= MNTK_UNMOUNTF;
@@ -1072,7 +1242,8 @@
 	    ((flags & MNT_FORCE) ? 0 : LK_NOWAIT), MNT_MTX(mp), td);
 	if (error) {
 		MNT_ILOCK(mp);
-		mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF);
+		mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_NOINSMNTQ |
+		    MNTK_UNMOUNTF);
 		if (mp->mnt_kern_flag & MNTK_MWAIT)
 			wakeup(mp);
 		MNT_IUNLOCK(mp);
@@ -1086,8 +1257,11 @@
 		vfs_setpublicfs(NULL, NULL, NULL);
 
 	vfs_msync(mp, MNT_WAIT);
+	MNT_ILOCK(mp);
 	async_flag = mp->mnt_flag & MNT_ASYNC;
 	mp->mnt_flag &= ~MNT_ASYNC;
+	mp->mnt_kern_flag &= ~MNTK_ASYNC;
+	MNT_IUNLOCK(mp);
 	cache_purgevfs(mp);	/* remove cache entries for this file sys */
 	if (mp->mnt_syncer != NULL)
 		vrele(mp->mnt_syncer);
@@ -1124,11 +1298,17 @@
 			}
 			vput(fsrootvp);
 		}
-		if ((mp->mnt_flag & MNT_RDONLY) == 0 && mp->mnt_syncer == NULL)
-			(void) vfs_allocate_syncvnode(mp);
 		MNT_ILOCK(mp);
+		mp->mnt_kern_flag &= ~MNTK_NOINSMNTQ;
+		if ((mp->mnt_flag & MNT_RDONLY) == 0 && mp->mnt_syncer == NULL) {
+			MNT_IUNLOCK(mp);
+			(void) vfs_allocate_syncvnode(mp);
+			MNT_ILOCK(mp);
+		}
 		mp->mnt_kern_flag &= ~(MNTK_UNMOUNT | MNTK_UNMOUNTF);
 		mp->mnt_flag |= async_flag;
+		if ((mp->mnt_flag & MNT_ASYNC) != 0 && mp->mnt_noasync == 0)
+			mp->mnt_kern_flag |= MNTK_ASYNC;
 		lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, td);
 		if (mp->mnt_kern_flag & MNTK_MWAIT)
 			wakeup(mp);
@@ -1145,7 +1325,8 @@
 		vput(coveredvp);
 	}
 	vfs_event_signal(NULL, VQ_UNMOUNT, 0);
-	vfs_mount_destroy(mp, td);
+	lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, td);
+	vfs_mount_destroy(mp);
 	return (0);
 }
 
@@ -1156,13 +1337,18 @@
  */
 
 struct root_hold_token {
-	const char 			*who;
+	const char			*who;
 	LIST_ENTRY(root_hold_token)	list;
 };
 
 static LIST_HEAD(, root_hold_token)	root_holds =
     LIST_HEAD_INITIALIZER(&root_holds);
 
+static int root_mount_complete;
+
+/*
+ * Hold root mount.
+ */
 struct root_hold_token *
 root_mount_hold(const char *identifier)
 {
@@ -1176,6 +1362,9 @@
 	return (h);
 }
 
+/*
+ * Release root mount.
+ */
 void
 root_mount_rel(struct root_hold_token *h)
 {
@@ -1187,8 +1376,11 @@
 	free(h, M_DEVBUF);
 }
 
+/*
+ * Wait for all subsystems to release root mount.
+ */
 static void
-root_mount_wait(void)
+root_mount_prepare(void)
 {
 	struct root_hold_token *h;
 
@@ -1210,6 +1402,55 @@
 	}
 }
 
+/*
+ * Root was mounted, share the good news.
+ */
+static void
+root_mount_done(void)
+{
+
+	/*
+	 * Use a mutex to prevent the wakeup being missed and waiting for
+	 * an extra 1 second sleep.
+	 */
+	mtx_lock(&mountlist_mtx);
+	root_mount_complete = 1;
+	wakeup(&root_mount_complete);
+	mtx_unlock(&mountlist_mtx);
+}
+
+/*
+ * Return true if root is already mounted.
+ */
+int
+root_mounted(void)
+{
+
+	/* No mutex is acquired here because int stores are atomic. */
+	return (root_mount_complete);
+}
+
+/*
+ * Wait until root is mounted.
+ */
+void
+root_mount_wait(void)
+{
+
+	/*
+	 * Panic on an obvious deadlock - the function can't be called from
+	 * a thread which is doing the whole SYSINIT stuff.
+	 */
+	KASSERT(curthread->td_proc->p_pid != 0,
+	    ("root_mount_wait: cannot be called from the swapper thread"));
+	mtx_lock(&mountlist_mtx);
+	while (!root_mount_complete) {
+		msleep(&root_mount_complete, &mountlist_mtx, PZERO, "rootwait",
+		    hz);
+	}
+	mtx_unlock(&mountlist_mtx);
+}
+
 static void
 set_rootvnode(struct thread *td)
 {
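
The root_mount_done()/root_mount_wait() pair added above is a flag-plus-sleep
handshake: the flag is set and the wakeup posted while holding mountlist_mtx,
and the waiter re-tests the flag under the same mutex before each msleep(), so
the wakeup cannot slip in between the test and the sleep.  A hedged sketch of
a consumer, assuming the prototypes are reachable through <sys/mount.h>; the
worker function is invented:

	#include <sys/param.h>
	#include <sys/systm.h>
	#include <sys/mount.h>

	static void
	late_config_worker(void *arg)
	{
		/*
		 * Never call root_mount_wait() from SYSINIT context; it
		 * asserts against the swapper.  A separate kthread is fine.
		 */
		if (!root_mounted())
			root_mount_wait();
		/* Root is mounted; files under "/" can be opened now. */
	}
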
@@ -1219,7 +1460,7 @@
 		panic("Cannot find root vnode");
 
 	p = td->td_proc;
-	FILEDESC_LOCK(p->p_fd);
+	FILEDESC_SLOCK(p->p_fd);
 
 	if (p->p_fd->fd_cdir != NULL)
 		vrele(p->p_fd->fd_cdir);
@@ -1231,7 +1472,7 @@
 	p->p_fd->fd_rdir = rootvnode;
 	VREF(rootvnode);
 
-	FILEDESC_UNLOCK(p->p_fd);
+	FILEDESC_SUNLOCK(p->p_fd);
 
 	VOP_UNLOCK(rootvnode, 0, td);
 }
@@ -1245,25 +1486,27 @@
 devfs_first(void)
 {
 	struct thread *td = curthread;
+	struct vfsoptlist *opts;
 	struct vfsconf *vfsp;
 	struct mount *mp = NULL;
 	int error;
 
 	vfsp = vfs_byname("devfs");
 	KASSERT(vfsp != NULL, ("Could not find devfs by name"));
-	if (vfsp == NULL) 
+	if (vfsp == NULL)
 		return;
 
-	error = vfs_mount_alloc(NULLVP, vfsp, "/dev", td, &mp);
-	KASSERT(error == 0, ("vfs_mount_alloc failed %d", error));
-	if (error)
-		return;
+	mp = vfs_mount_alloc(NULLVP, vfsp, "/dev", td);
 
-	error = VFS_MOUNT(mp, curthread);
+	error = VFS_MOUNT(mp, td);
 	KASSERT(error == 0, ("VFS_MOUNT(devfs) failed %d", error));
 	if (error)
 		return;
 
+	opts = malloc(sizeof(struct vfsoptlist), M_MOUNT, M_WAITOK);
+	TAILQ_INIT(opts);
+	mp->mnt_opt = opts;
+
 	mtx_lock(&mountlist_mtx);
 	TAILQ_INSERT_HEAD(&mountlist, mp, mnt_list);
 	mtx_unlock(&mountlist_mtx);
@@ -1297,8 +1540,8 @@
 	VFS_ROOT(mp, LK_EXCLUSIVE, &dvp, td);
 	VI_LOCK(dvp);
 	dvp->v_iflag &= ~VI_MOUNT;
-	dvp->v_mountedhere = NULL;
 	VI_UNLOCK(dvp);
+	dvp->v_mountedhere = NULL;
 
 	/* Set up the real rootvnode, and purge the cache */
 	TAILQ_FIRST(&mountlist)->mnt_vnodecovered = NULL;
@@ -1335,6 +1578,26 @@
 }
 
 /*
+ * Report errors during filesystem mounting.
+ */
+void
+vfs_mount_error(struct mount *mp, const char *fmt, ...)
+{
+	struct vfsoptlist *moptlist = mp->mnt_optnew;
+	va_list ap;
+	int error, len;
+	char *errmsg;
+
+	error = vfs_getopt(moptlist, "errmsg", (void **)&errmsg, &len);
+	if (error || errmsg == NULL || len <= 0)
+		return;
+
+	va_start(ap, fmt);
+	vsnprintf(errmsg, (size_t)len, fmt, ap);
+	va_end(ap);
+}
+
+/*
  * Find and mount the root filesystem
  */
 void
@@ -1343,8 +1606,11 @@
 	char *cp;
 	int error, i, asked = 0;
 
-	root_mount_wait();
+	root_mount_prepare();
 
+	mount_zone = uma_zcreate("Mountpoints", sizeof(struct mount),
+	    NULL, NULL, mount_init, mount_fini,
+	    UMA_ALIGN_PTR, UMA_ZONE_NOFREE);
 	devfs_first();
 
 	/*
@@ -1352,7 +1618,7 @@
 	 */
 	if (boothowto & RB_ASKNAME) {
 		if (!vfs_mountroot_ask())
-			return;
+			goto mounted;
 		asked = 1;
 	}
 
@@ -1362,7 +1628,7 @@
 	 */
 	if (ctrootdevname != NULL && (boothowto & RB_DFLTROOT)) {
 		if (!vfs_mountroot_try(ctrootdevname))
-			return;
+			goto mounted;
 		ctrootdevname = NULL;
 	}
 
@@ -1374,7 +1640,7 @@
 	if (boothowto & RB_CDROM) {
 		for (i = 0; cdrom_rootdevnames[i] != NULL; i++) {
 			if (!vfs_mountroot_try(cdrom_rootdevnames[i]))
-				return;
+				goto mounted;
 		}
 	}
 
@@ -1388,32 +1654,35 @@
 		error = vfs_mountroot_try(cp);
 		freeenv(cp);
 		if (!error)
-			return;
+			goto mounted;
 	}
 
 	/*
 	 * Try values that may have been computed by code during boot
 	 */
 	if (!vfs_mountroot_try(rootdevnames[0]))
-		return;
+		goto mounted;
 	if (!vfs_mountroot_try(rootdevnames[1]))
-		return;
+		goto mounted;
 
 	/*
 	 * If we (still) have a compiled-in default, try it.
 	 */
 	if (ctrootdevname != NULL)
 		if (!vfs_mountroot_try(ctrootdevname))
-			return;
+			goto mounted;
 	/*
 	 * Everything so far has failed, prompt on the console if we haven't
 	 * already tried that.
 	 */
 	if (!asked)
 		if (!vfs_mountroot_ask())
-			return;
+			goto mounted;
 
 	panic("Root mount failed, startup aborted.");
+
+mounted:
+	root_mount_done();
 }
 
 /*
@@ -1422,7 +1691,7 @@
 static int
 vfs_mountroot_try(const char *mountfrom)
 {
-        struct mount	*mp;
+	struct mount	*mp;
 	char		*vfsname, *path;
 	time_t		timebase;
 	int		error;
@@ -1499,7 +1768,7 @@
 	for(;;) {
 		printf("\nManual root filesystem specification:\n");
 		printf("  <fstype>:<device>  Mount <device> using filesystem <fstype>\n");
-#if defined(__i386__) || defined(__ia64__)
+#if defined(__amd64__) || defined(__i386__) || defined(__ia64__)
 		printf("                       eg. ufs:da0s1a\n");
 #else
 		printf("                       eg. ufs:/dev/da0a\n");
@@ -1532,27 +1801,47 @@
 vfs_filteropt(struct vfsoptlist *opts, const char **legal)
 {
 	struct vfsopt *opt;
-	const char **t, *p;
-	
+	char errmsg[255];
+	const char **t, *p, *q;
+	int ret = 0;
 
 	TAILQ_FOREACH(opt, opts, link) {
 		p = opt->name;
+		q = NULL;
 		if (p[0] == 'n' && p[1] == 'o')
-			p += 2;
-		for(t = global_opts; *t != NULL; t++)
-			if (!strcmp(*t, p))
+			q = p + 2;
+		for(t = global_opts; *t != NULL; t++) {
+			if (strcmp(*t, p) == 0)
 				break;
+			if (q != NULL) {
+				if (strcmp(*t, q) == 0)
+					break;
+			}
+		}
 		if (*t != NULL)
 			continue;
-		for(t = legal; *t != NULL; t++)
-			if (!strcmp(*t, p))
+		for(t = legal; *t != NULL; t++) {
+			if (strcmp(*t, p) == 0)
 				break;
+			if (q != NULL) {
+				if (strcmp(*t, q) == 0)
+					break;
+			}
+		}
 		if (*t != NULL)
 			continue;
-		printf("mount option <%s> is unknown\n", p);
-		return (EINVAL);
+		sprintf(errmsg, "mount option <%s> is unknown", p);
+		printf("%s\n", errmsg);
+		ret = EINVAL;
+	}
+	if (ret != 0) {
+		TAILQ_FOREACH(opt, opts, link) {
+			if (strcmp(opt->name, "errmsg") == 0) {
+				strncpy((char *)opt->value, errmsg, opt->len);
+			}
+		}
 	}
-	return (0);
+	return (ret);
 }
 
 /*
@@ -1586,6 +1875,24 @@
 	return (ENOENT);
 }
 
+static int
+vfs_getopt_pos(struct vfsoptlist *opts, const char *name)
+{
+	struct vfsopt *opt;
+	int i;
+
+	if (opts == NULL)
+		return (-1);
+
+	i = 0;
+	TAILQ_FOREACH(opt, opts, link) {
+		if (strcmp(name, opt->name) == 0)
+			return (i);
+		++i;
+	}
+	return (-1);
+}
+
 char *
 vfs_getopts(struct vfsoptlist *opts, const char *name, int *error)
 {
@@ -1601,6 +1908,7 @@
 		}
 		return (opt->value);
 	}
+	*error = ENOENT;
 	return (NULL);
 }
 
@@ -1633,6 +1941,8 @@
 	TAILQ_FOREACH(opt, opts, link) {
 		if (strcmp(name, opt->name) != 0)
 			continue;
+		if (opt->len == 0 || opt->value == NULL)
+			return (0);
 		if (((char *)opt->value)[opt->len - 1] != '\0')
 			return (0);
 		va_start(ap, fmt);
@@ -1687,9 +1997,9 @@
 
 	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
 	vp = TAILQ_NEXT(*mvp, v_nmntvnodes);
-	while (vp != NULL && vp->v_type == VMARKER) 
+	while (vp != NULL && vp->v_type == VMARKER)
 		vp = TAILQ_NEXT(vp, v_nmntvnodes);
-	
+
 	/* Check if we are done */
 	if (vp == NULL) {
 		__mnt_vnode_markerfree(mvp, mp);
@@ -1708,9 +2018,9 @@
 	mtx_assert(MNT_MTX(mp), MA_OWNED);
 
 	vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
-	while (vp != NULL && vp->v_type == VMARKER) 
+	while (vp != NULL && vp->v_type == VMARKER)
 		vp = TAILQ_NEXT(vp, v_nmntvnodes);
-	
+
 	/* Check if we are done */
 	if (vp == NULL) {
 		*mvp = NULL;
@@ -1725,9 +2035,9 @@
 	(*mvp)->v_type = VMARKER;
 
 	vp = TAILQ_FIRST(&mp->mnt_nvnodelist);
-	while (vp != NULL && vp->v_type == VMARKER) 
+	while (vp != NULL && vp->v_type == VMARKER)
 		vp = TAILQ_NEXT(vp, v_nmntvnodes);
-	
+
 	/* Check if we are done */
 	if (vp == NULL) {
 		MNT_IUNLOCK(mp);
@@ -1752,7 +2062,7 @@
 
 	if (*mvp == NULL)
 		return;
-	
+
 	mtx_assert(MNT_MTX(mp), MA_OWNED);
 
 	KASSERT((*mvp)->v_mount == mp, ("marker vnode mount list mismatch"));
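
The __mnt_vnode_first()/__mnt_vnode_next()/__mnt_vnode_markerfree() helpers
above implement the marker-vnode walk that the MNT_VNODE_FOREACH() macro
wraps; the marker keeps the iteration position stable even if the mount
interlock is dropped and reacquired inside the loop body.  A small
illustrative consumer (the counting function is invented, and
mnt_nvnodelistsize already holds this number):

	#include <sys/param.h>
	#include <sys/lock.h>
	#include <sys/mutex.h>
	#include <sys/mount.h>
	#include <sys/vnode.h>

	static int
	count_mount_vnodes(struct mount *mp)
	{
		struct vnode *vp, *mvp;
		int count;

		count = 0;
		MNT_ILOCK(mp);
		MNT_VNODE_FOREACH(vp, mp, mvp)
			count++;
		MNT_IUNLOCK(mp);
		return (count);
	}
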
Index: subr_trap.c
===================================================================
RCS file: /home/cvs/src/sys/kern/subr_trap.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/subr_trap.c -L sys/kern/subr_trap.c -u -r1.1.1.1 -r1.2
--- sys/kern/subr_trap.c
+++ sys/kern/subr_trap.c
@@ -38,19 +38,19 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/subr_trap.c,v 1.281 2005/03/28 12:52:46 jeff Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/subr_trap.c,v 1.299 2007/09/17 05:27:20 jeff Exp $");
 
 #include "opt_ktrace.h"
 #include "opt_mac.h"
 #ifdef __i386__
 #include "opt_npx.h"
 #endif
+#include "opt_sched.h"
 
 #include <sys/param.h>
 #include <sys/bus.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
-#include <sys/mac.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/ktr.h>
@@ -67,17 +67,14 @@
 #include <machine/cpu.h>
 #include <machine/pcb.h>
 
+#include <security/mac/mac_framework.h>
+
 /*
- * Define the code needed before returning to user mode, for
- * trap and syscall.
- *
- * MPSAFE
+ * Define the code needed before returning to user mode, for trap and
+ * syscall.
  */
 void
-userret(td, frame, oticks)
-	struct thread *td;
-	struct trapframe *frame;
-	u_int oticks;
+userret(struct thread *td, struct trapframe *frame)
 {
 	struct proc *p = td->td_proc;
 
@@ -86,14 +83,18 @@
 #ifdef DIAGNOSTIC
 	/* Check that we called signotify() enough. */
 	PROC_LOCK(p);
-	mtx_lock_spin(&sched_lock);
+	thread_lock(td);
 	if (SIGPENDING(td) && ((td->td_flags & TDF_NEEDSIGCHK) == 0 ||
 	    (td->td_flags & TDF_ASTPENDING) == 0))
 		printf("failed to set signal flags properly for ast()\n");
-	mtx_unlock_spin(&sched_lock);
+	thread_unlock(td);
 	PROC_UNLOCK(p);
 #endif
 
+#ifdef KTRACE
+	KTRUSERRET(td);
+#endif
+
 	/*
 	 * If this thread tickled GEOM, we need to wait for the giggling to
 	 * stop before we return to userland
@@ -113,20 +114,20 @@
 		PROC_UNLOCK(p);
 	}
 
+#ifdef KSE
 	/*
 	 * Do special thread processing, e.g. upcall tweaking and such.
 	 */
 	if (p->p_flag & P_SA)
 		thread_userret(td, frame);
+#endif
 
 	/*
 	 * Charge system time if profiling.
 	 */
 	if (p->p_flag & P_PROFIL) {
-		quad_t ticks;
 
-		ticks = td->td_sticks - oticks;
-		addupc_task(td, TRAPF_PC(frame), (u_int)ticks * psratio);
+		addupc_task(td, TRAPF_PC(frame), td->td_pticks * psratio);
 	}
 
 	/*
@@ -147,54 +148,48 @@
 {
 	struct thread *td;
 	struct proc *p;
-	struct ksegrp *kg;
-	struct rlimit rlim;
-	u_int sticks;
-	int sflag;
 	int flags;
 	int sig;
 #if defined(DEV_NPX) && !defined(SMP)
 	int ucode;
+	ksiginfo_t ksi;
 #endif
 
 	td = curthread;
 	p = td->td_proc;
-	kg = td->td_ksegrp;
 
 	CTR3(KTR_SYSC, "ast: thread %p (pid %d, %s)", td, p->p_pid,
             p->p_comm);
 	KASSERT(TRAPF_USERMODE(framep), ("ast in kernel mode"));
 	WITNESS_WARN(WARN_PANIC, NULL, "Returning to user mode");
 	mtx_assert(&Giant, MA_NOTOWNED);
-	mtx_assert(&sched_lock, MA_NOTOWNED);
+	THREAD_LOCK_ASSERT(td, MA_NOTOWNED);
 	td->td_frame = framep;
-	sticks = td->td_sticks;
+	td->td_pticks = 0;
 
+#ifdef KSE
 	if ((p->p_flag & P_SA) && (td->td_mailbox == NULL))
 		thread_user_enter(td);
+#endif
 
 	/*
-	 * This updates the p_sflag's for the checks below in one
+	 * This updates the td_flag's for the checks below in one
 	 * "atomic" operation with turning off the astpending flag.
 	 * If another AST is triggered while we are handling the
-	 * AST's saved in sflag, the astpending flag will be set and
+	 * AST's saved in flags, the astpending flag will be set and
 	 * ast() will be called again.
 	 */
-	mtx_lock_spin(&sched_lock);
+	thread_lock(td);
 	flags = td->td_flags;
-	sflag = p->p_sflag;
-	p->p_sflag &= ~(PS_ALRMPEND | PS_PROFPEND | PS_XCPU);
-#ifdef MAC
-	p->p_sflag &= ~PS_MACPEND;
-#endif
 	td->td_flags &= ~(TDF_ASTPENDING | TDF_NEEDSIGCHK |
-	    TDF_NEEDRESCHED | TDF_INTERRUPT);
-	cnt.v_soft++;
-	mtx_unlock_spin(&sched_lock);
+	    TDF_NEEDRESCHED | TDF_INTERRUPT | TDF_ALRMPEND | TDF_PROFPEND |
+	    TDF_MACPEND);
+	thread_unlock(td);
+	PCPU_INC(cnt.v_trap);
 
 	/*
 	 * XXXKSE While the fact that we owe a user profiling
-	 * tick is stored per KSE in this code, the statistics
+	 * tick is stored per thread in this code, the statistics
 	 * themselves are still stored per process.
 	 * This should probably change, by which I mean that
 	 * possibly the location of both might change.
@@ -206,7 +201,7 @@
 		td->td_profil_ticks = 0;
 		td->td_pflags &= ~TDP_OWEUPC;
 	}
-	if (sflag & PS_ALRMPEND) {
+	if (flags & TDF_ALRMPEND) {
 		PROC_LOCK(p);
 		psignal(p, SIGVTALRM);
 		PROC_UNLOCK(p);
@@ -217,32 +212,20 @@
 		    PCB_NPXTRAP);
 		ucode = npxtrap();
 		if (ucode != -1) {
-			trapsignal(td, SIGFPE, ucode);
+			ksiginfo_init_trap(&ksi);
+			ksi.ksi_signo = SIGFPE;
+			ksi.ksi_code = ucode;
+			trapsignal(td, &ksi);
 		}
 	}
 #endif
-	if (sflag & PS_PROFPEND) {
+	if (flags & TDF_PROFPEND) {
 		PROC_LOCK(p);
 		psignal(p, SIGPROF);
 		PROC_UNLOCK(p);
 	}
-	if (sflag & PS_XCPU) {
-		PROC_LOCK(p);
-		lim_rlimit(p, RLIMIT_CPU, &rlim);
-		mtx_lock_spin(&sched_lock);
-		if (p->p_rux.rux_runtime.sec >= rlim.rlim_max) {
-			mtx_unlock_spin(&sched_lock);
-			killproc(p, "exceeded maximum CPU limit");
-		} else {
-			if (p->p_cpulimit < rlim.rlim_max)
-				p->p_cpulimit += 5;
-			mtx_unlock_spin(&sched_lock);
-			psignal(p, SIGXCPU);
-		}
-		PROC_UNLOCK(p);
-	}
 #ifdef MAC
-	if (sflag & PS_MACPEND)
+	if (flags & TDF_MACPEND)
 		mac_thread_userret(td);
 #endif
 	if (flags & TDF_NEEDRESCHED) {
@@ -250,10 +233,11 @@
 		if (KTRPOINT(td, KTR_CSW))
 			ktrcsw(1, 1);
 #endif
-		mtx_lock_spin(&sched_lock);
-		sched_prio(td, kg->kg_user_pri);
+		thread_lock(td);
+		sched_prio(td, td->td_user_pri);
+		SCHED_STAT_INC(switch_needresched);
 		mi_switch(SW_INVOL, NULL);
-		mtx_unlock_spin(&sched_lock);
+		thread_unlock(td);
 #ifdef KTRACE
 		if (KTRPOINT(td, KTR_CSW))
 			ktrcsw(0, 1);
@@ -268,6 +252,6 @@
 		PROC_UNLOCK(p);
 	}
 
-	userret(td, framep, sticks);
+	userret(td, framep);
 	mtx_assert(&Giant, MA_NOTOWNED);
 }
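
After this change userret() takes only the thread and the trap frame; the
profiling delta that used to arrive in the third "oticks" argument is now
accumulated in td_pticks, which ast() zeroes on entry (and the
machine-dependent syscall path is expected to do the same).  A hedged sketch
of what an MD return path looks like against the new interface; the function
name is invented:

	#include <sys/param.h>
	#include <sys/proc.h>
	#include <sys/systm.h>
	#include <machine/frame.h>

	static void
	syscall_return_md(struct trapframe *framep)
	{
		struct thread *td = curthread;

		td->td_pticks = 0;	/* statclock ticks charged below */
		/* ... dispatch and run the system call here ... */
		userret(td, framep);	/* note: no "oticks" argument */
	}
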
Index: kern_clock.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_clock.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/kern_clock.c -L sys/kern/kern_clock.c -u -r1.2 -r1.3
--- sys/kern/kern_clock.c
+++ sys/kern/kern_clock.c
@@ -35,8 +35,9 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_clock.c,v 1.178.2.3 2006/03/10 19:37:33 jhb Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_clock.c,v 1.202 2007/09/17 05:27:20 jeff Exp $");
 
+#include "opt_kdb.h"
 #include "opt_device_polling.h"
 #include "opt_hwpmc_hooks.h"
 #include "opt_ntp.h"
@@ -65,8 +66,6 @@
 #include <sys/limits.h>
 #include <sys/timetc.h>
 
-#include <machine/cpu.h>
-
 #ifdef GPROF
 #include <sys/gmon.h>
 #endif
@@ -85,6 +84,9 @@
 /* Some of these don't belong here, but it's easiest to concentrate them. */
 long cp_time[CPUSTATES];
 
+/* Spin-lock protecting profiling statistics. */
+static struct mtx time_lock;
+
 static int
 sysctl_kern_cp_time(SYSCTL_HANDLER_ARGS)
 {
@@ -92,7 +94,7 @@
 #ifdef SCTL_MASK32
 	int i;
 	unsigned int cp_time32[CPUSTATES];
-	
+
 	if (req->flags & SCTL_MASK32) {
 		if (!req->oldptr)
 			return SYSCTL_OUT(req, 0, sizeof(cp_time32));
@@ -109,7 +111,7 @@
 	return error;
 }
 
-SYSCTL_PROC(_kern, OID_AUTO, cp_time, CTLTYPE_LONG|CTLFLAG_RD, 
+SYSCTL_PROC(_kern, OID_AUTO, cp_time, CTLTYPE_LONG|CTLFLAG_RD,
     0,0, sysctl_kern_cp_time, "LU", "CPU time statistics");
 
 #ifdef SW_WATCHDOG
@@ -173,6 +175,7 @@
 	 * Set divisors to 1 (normal case) and let the machine-specific
 	 * code do its bit.
 	 */
+	mtx_init(&time_lock, "time lock", NULL, MTX_SPIN);
 	cpu_initclocks();
 
 	/*
@@ -189,38 +192,39 @@
 
 /*
  * Each time the real-time timer fires, this function is called on all CPUs.
- * Note that hardclock() calls hardclock_process() for the boot CPU, so only
+ * Note that hardclock() calls hardclock_cpu() for the boot CPU, so only
  * the other CPUs in the system need to call this function.
  */
 void
-hardclock_process(frame)
-	register struct clockframe *frame;
+hardclock_cpu(int usermode)
 {
 	struct pstats *pstats;
 	struct thread *td = curthread;
 	struct proc *p = td->td_proc;
+	int flags;
 
 	/*
 	 * Run current process's virtual and profile time, as needed.
 	 */
-	mtx_lock_spin_flags(&sched_lock, MTX_QUIET);
-	if (p->p_flag & P_SA) {
-		/* XXXKSE What to do? */
-	} else {
-		pstats = p->p_stats;
-		if (CLKF_USERMODE(frame) &&
-		    timevalisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value) &&
-		    itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0) {
-			p->p_sflag |= PS_ALRMPEND;
-			td->td_flags |= TDF_ASTPENDING;
-		}
-		if (timevalisset(&pstats->p_timer[ITIMER_PROF].it_value) &&
-		    itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0) {
-			p->p_sflag |= PS_PROFPEND;
-			td->td_flags |= TDF_ASTPENDING;
-		}
-	}
-	mtx_unlock_spin_flags(&sched_lock, MTX_QUIET);
+	pstats = p->p_stats;
+	flags = 0;
+	if (usermode &&
+	    timevalisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value)) {
+		PROC_SLOCK(p);
+		if (itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0)
+			flags |= TDF_ALRMPEND | TDF_ASTPENDING;
+		PROC_SUNLOCK(p);
+	}
+	if (timevalisset(&pstats->p_timer[ITIMER_PROF].it_value)) {
+		PROC_SLOCK(p);
+		if (itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0)
+			flags |= TDF_PROFPEND | TDF_ASTPENDING;
+		PROC_SUNLOCK(p);
+	}
+	thread_lock(td);
+	sched_tick();
+	td->td_flags |= flags;
+	thread_unlock(td);
 
 #ifdef	HWPMC_HOOKS
 	if (PMC_CPU_HAS_SAMPLES(PCPU_GET(cpuid)))
@@ -232,13 +236,11 @@
  * The real-time timer, interrupting hz times per second.
  */
 void
-hardclock(frame)
-	register struct clockframe *frame;
+hardclock(int usermode, uintfptr_t pc)
 {
 	int need_softclock = 0;
 
-	CTR0(KTR_CLK, "hardclock fired");
-	hardclock_process(frame);
+	hardclock_cpu(usermode);
 
 	tc_ticktock();
 	/*
@@ -247,8 +249,8 @@
 	 * XXX: this only works for UP
 	 */
 	if (stathz == 0) {
-		profclock(frame);
-		statclock(frame);
+		profclock(usermode, pc);
+		statclock(usermode);
 	}
 
 #ifdef DEVICE_POLLING
@@ -261,15 +263,15 @@
 	 */
 	mtx_lock_spin_flags(&callout_lock, MTX_QUIET);
 	ticks++;
-	if (TAILQ_FIRST(&callwheel[ticks & callwheelmask]) != NULL) {
+	if (!TAILQ_EMPTY(&callwheel[ticks & callwheelmask])) {
 		need_softclock = 1;
 	} else if (softticks + 1 == ticks)
 		++softticks;
 	mtx_unlock_spin_flags(&callout_lock, MTX_QUIET);
 
 	/*
-	 * swi_sched acquires sched_lock, so we don't want to call it with
-	 * callout_lock held; incorrect locking order.
+	 * swi_sched acquires the thread lock, so we don't want to call it
+	 * with callout_lock held; incorrect locking order.
 	 */
 	if (need_softclock)
 		swi_sched(softclock_ih, 0);
@@ -350,20 +352,15 @@
 	register struct proc *p;
 {
 
-	/*
-	 * XXX; Right now sched_lock protects statclock(), but perhaps
-	 * it should be protected later on by a time_lock, which would
-	 * cover psdiv, etc. as well.
-	 */
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	if (p->p_flag & P_STOPPROF)
 		return;
 	if ((p->p_flag & P_PROFIL) == 0) {
-		mtx_lock_spin(&sched_lock);
 		p->p_flag |= P_PROFIL;
+		mtx_lock_spin(&time_lock);
 		if (++profprocs == 1)
 			cpu_startprofclock();
-		mtx_unlock_spin(&sched_lock);
+		mtx_unlock_spin(&time_lock);
 	}
 }
 
@@ -386,24 +383,22 @@
 		}
 		if ((p->p_flag & P_PROFIL) == 0)
 			return;
-		mtx_lock_spin(&sched_lock);
 		p->p_flag &= ~P_PROFIL;
+		mtx_lock_spin(&time_lock);
 		if (--profprocs == 0)
 			cpu_stopprofclock();
-		mtx_unlock_spin(&sched_lock);
+		mtx_unlock_spin(&time_lock);
 	}
 }
 
 /*
- * Statistics clock.  Grab profile sample, and if divider reaches 0,
- * do process and kernel statistics.  Most of the statistics are only
- * used by user-level statistics programs.  The main exceptions are
- * ke->ke_uticks, p->p_rux.rux_sticks, p->p_rux.rux_iticks, and p->p_estcpu.
+ * Statistics clock.  Updates rusage information and calls the scheduler
+ * to adjust priorities of the active thread.
+ *
  * This should be called by all active processors.
  */
 void
-statclock(frame)
-	register struct clockframe *frame;
+statclock(int usermode)
 {
 	struct rusage *ru;
 	struct vmspace *vm;
@@ -414,18 +409,20 @@
 	td = curthread;
 	p = td->td_proc;
 
-	mtx_lock_spin_flags(&sched_lock, MTX_QUIET);
-	if (CLKF_USERMODE(frame)) {
+	thread_lock_flags(td, MTX_QUIET);
+	if (usermode) {
 		/*
 		 * Charge the time as appropriate.
 		 */
+#ifdef KSE
 		if (p->p_flag & P_SA)
 			thread_statclock(1);
-		p->p_rux.rux_uticks++;
+#endif
+		td->td_uticks++;
 		if (p->p_nice > NZERO)
-			cp_time[CP_NICE]++;
+			atomic_add_long(&cp_time[CP_NICE], 1);
 		else
-			cp_time[CP_USER]++;
+			atomic_add_long(&cp_time[CP_USER], 1);
 	} else {
 		/*
 		 * Came from kernel mode, so we were:
@@ -441,50 +438,49 @@
 		 */
 		if ((td->td_pflags & TDP_ITHREAD) ||
 		    td->td_intr_nesting_level >= 2) {
-			p->p_rux.rux_iticks++;
-			cp_time[CP_INTR]++;
+			td->td_iticks++;
+			atomic_add_long(&cp_time[CP_INTR], 1);
 		} else {
+#ifdef KSE
 			if (p->p_flag & P_SA)
 				thread_statclock(0);
+#endif
+			td->td_pticks++;
 			td->td_sticks++;
-			p->p_rux.rux_sticks++;
-			if (td != PCPU_GET(idlethread))
-				cp_time[CP_SYS]++;
+			if (!TD_IS_IDLETHREAD(td))
+				atomic_add_long(&cp_time[CP_SYS], 1);
 			else
-				cp_time[CP_IDLE]++;
+				atomic_add_long(&cp_time[CP_IDLE], 1);
 		}
 	}
-	CTR4(KTR_SCHED, "statclock: %p(%s) prio %d stathz %d",
-	    td, td->td_proc->p_comm, td->td_priority, (stathz)?stathz:hz);
-
-	sched_clock(td);
 
 	/* Update resource usage integrals and maximums. */
-	MPASS(p->p_stats != NULL);
 	MPASS(p->p_vmspace != NULL);
 	vm = p->p_vmspace;
-	ru = &p->p_stats->p_ru;
+	ru = &td->td_ru;
 	ru->ru_ixrss += pgtok(vm->vm_tsize);
 	ru->ru_idrss += pgtok(vm->vm_dsize);
 	ru->ru_isrss += pgtok(vm->vm_ssize);
 	rss = pgtok(vmspace_resident_count(vm));
 	if (ru->ru_maxrss < rss)
 		ru->ru_maxrss = rss;
-	mtx_unlock_spin_flags(&sched_lock, MTX_QUIET);
+	CTR4(KTR_SCHED, "statclock: %p(%s) prio %d stathz %d",
+	    td, td->td_proc->p_comm, td->td_priority, (stathz)?stathz:hz);
+	sched_clock(td);
+	thread_unlock(td);
 }
 
 void
-profclock(frame)
-	register struct clockframe *frame;
+profclock(int usermode, uintfptr_t pc)
 {
 	struct thread *td;
 #ifdef GPROF
 	struct gmonparam *g;
-	int i;
+	uintfptr_t i;
 #endif
 
 	td = curthread;
-	if (CLKF_USERMODE(frame)) {
+	if (usermode) {
 		/*
 		 * Came from user mode; CPU was in user state.
 		 * If this process is being profiled, record the tick.
@@ -492,7 +488,7 @@
 		 * bother trying to count it.
 		 */
 		if (td->td_proc->p_flag & P_PROFIL)
-			addupc_intr(td, CLKF_PC(frame), 1);
+			addupc_intr(td, pc, 1);
 	}
 #ifdef GPROF
 	else {
@@ -500,11 +496,10 @@
 		 * Kernel statistics are just like addupc_intr, only easier.
 		 */
 		g = &_gmonparam;
-		if (g->state == GMON_PROF_ON) {
-			i = CLKF_PC(frame) - g->lowpc;
+		if (g->state == GMON_PROF_ON && pc >= g->lowpc) {
+			i = PC_TO_I(g, pc);
 			if (i < g->textsize) {
-				i /= HISTFRACTION * sizeof(*g->kcount);
-				g->kcount[i]++;
+				KCOUNT(g, i)++;
 			}
 		}
 	}
@@ -536,15 +531,15 @@
 #ifdef SW_WATCHDOG
 
 static void
-watchdog_config(void *unused __unused, u_int cmd, int *err)
+watchdog_config(void *unused __unused, u_int cmd, int *error)
 {
 	u_int u;
 
 	u = cmd & WD_INTERVAL;
-	if ((cmd & WD_ACTIVE) && u >= WD_TO_1SEC) {
+	if (u >= WD_TO_1SEC) {
 		watchdog_ticks = (1 << (u - WD_TO_1SEC)) * hz;
 		watchdog_enabled = 1;
-		*err = 0;
+		*error = 0;
 	} else {
 		watchdog_enabled = 0;
 	}
@@ -552,7 +547,7 @@
 
 /*
  * Handle a watchdog timeout by dumping interrupt information and
- * then either dropping to DDB or panicing.
+ * then either dropping to DDB or panicking.
  */
 static void
 watchdog_fire(void)
@@ -566,7 +561,7 @@
 	curname = intrnames;
 	inttotal = 0;
 	nintr = eintrcnt - intrcnt;
-	
+
 	printf("interrupt                   total\n");
 	while (--nintr >= 0) {
 		if (*curintr)
@@ -576,12 +571,12 @@
 	}
 	printf("Total        %20ju\n", (uintmax_t)inttotal);
 
-#ifdef KDB
+#if defined(KDB) && !defined(KDB_UNATTENDED)
 	kdb_backtrace();
 	kdb_enter("watchdog timeout");
 #else
 	panic("watchdog timeout");
-#endif /* KDB */
+#endif
 }
 
 #endif /* SW_WATCHDOG */
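
statclock() now charges the global cp_time[] buckets with atomic_add_long()
instead of serializing every tick through sched_lock, while the per-process
tick counts move into per-thread fields (td_uticks, td_sticks, td_iticks)
that are folded into the process later.  A small sketch of the lock-free
counter idiom; the helper is invented:

	#include <sys/param.h>
	#include <sys/resource.h>
	#include <machine/atomic.h>

	extern long cp_time[CPUSTATES];

	/* Charge one statclock tick to a bucket without taking a lock. */
	static __inline void
	account_tick(int bucket)
	{

		atomic_add_long((volatile u_long *)&cp_time[bucket], 1);
	}
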
Index: kern_mutex.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_mutex.c,v
retrieving revision 1.1.1.2
retrieving revision 1.2
diff -L sys/kern/kern_mutex.c -L sys/kern/kern_mutex.c -u -r1.1.1.2 -r1.2
--- sys/kern/kern_mutex.c
+++ sys/kern/kern_mutex.c
@@ -34,12 +34,11 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_mutex.c,v 1.154.2.5 2005/12/20 19:28:23 jhb Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_mutex.c,v 1.198.2.1 2007/12/01 11:28:37 attilio Exp $");
 
 #include "opt_adaptive_mutexes.h"
 #include "opt_ddb.h"
-#include "opt_mprof.h"
-#include "opt_mutex_wake_all.h"
+#include "opt_global.h"
 #include "opt_sched.h"
 
 #include <sys/param.h>
@@ -59,10 +58,10 @@
 #include <sys/sysctl.h>
 #include <sys/turnstile.h>
 #include <sys/vmmeter.h>
+#include <sys/lock_profile.h>
 
 #include <machine/atomic.h>
 #include <machine/bus.h>
-#include <machine/clock.h>
 #include <machine/cpu.h>
 
 #include <ddb/ddb.h>
@@ -72,13 +71,8 @@
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 
-/* 
- * Force MUTEX_WAKE_ALL for now.
- * single thread wakeup needs fixes to avoid race conditions with 
- * priority inheritance.
- */
-#ifndef MUTEX_WAKE_ALL
-#define MUTEX_WAKE_ALL
+#if defined(SMP) && !defined(NO_ADAPTIVE_MUTEXES)
+#define	ADAPTIVE_MUTEXES
 #endif
 
 /*
@@ -86,188 +80,91 @@
  */
 #define mtx_unowned(m)	((m)->mtx_lock == MTX_UNOWNED)
 
-#define mtx_owner(m)	(mtx_unowned((m)) ? NULL \
-	: (struct thread *)((m)->mtx_lock & MTX_FLAGMASK))
+#define	mtx_destroyed(m) ((m)->mtx_lock == MTX_DESTROYED)
+
+#define	mtx_owner(m)	((struct thread *)((m)->mtx_lock & ~MTX_FLAGMASK))
 
 #ifdef DDB
 static void	db_show_mtx(struct lock_object *lock);
 #endif
+static void	lock_mtx(struct lock_object *lock, int how);
+static void	lock_spin(struct lock_object *lock, int how);
+static int	unlock_mtx(struct lock_object *lock);
+static int	unlock_spin(struct lock_object *lock);
 
 /*
  * Lock classes for sleep and spin mutexes.
  */
 struct lock_class lock_class_mtx_sleep = {
-	"sleep mutex",
-	LC_SLEEPLOCK | LC_RECURSABLE,
+	.lc_name = "sleep mutex",
+	.lc_flags = LC_SLEEPLOCK | LC_RECURSABLE,
 #ifdef DDB
-	db_show_mtx
+	.lc_ddb_show = db_show_mtx,
 #endif
+	.lc_lock = lock_mtx,
+	.lc_unlock = unlock_mtx,
 };
 struct lock_class lock_class_mtx_spin = {
-	"spin mutex",
-	LC_SPINLOCK | LC_RECURSABLE,
+	.lc_name = "spin mutex",
+	.lc_flags = LC_SPINLOCK | LC_RECURSABLE,
 #ifdef DDB
-	db_show_mtx
+	.lc_ddb_show = db_show_mtx,
 #endif
+	.lc_lock = lock_spin,
+	.lc_unlock = unlock_spin,
 };
 
 /*
  * System-wide mutexes
  */
-struct mtx sched_lock;
+struct mtx blocked_lock;
 struct mtx Giant;
 
-#ifdef MUTEX_PROFILING
-SYSCTL_NODE(_debug, OID_AUTO, mutex, CTLFLAG_RD, NULL, "mutex debugging");
-SYSCTL_NODE(_debug_mutex, OID_AUTO, prof, CTLFLAG_RD, NULL, "mutex profiling");
-static int mutex_prof_enable = 0;
-SYSCTL_INT(_debug_mutex_prof, OID_AUTO, enable, CTLFLAG_RW,
-    &mutex_prof_enable, 0, "Enable tracing of mutex holdtime");
-
-struct mutex_prof {
-	const char	*name;
-	const char	*file;
-	int		line;
-	uintmax_t	cnt_max;
-	uintmax_t	cnt_tot;
-	uintmax_t	cnt_cur;
-	uintmax_t	cnt_contest_holding;
-	uintmax_t	cnt_contest_locking;
-	struct mutex_prof *next;
-};
-
-/*
- * mprof_buf is a static pool of profiling records to avoid possible
- * reentrance of the memory allocation functions.
- *
- * Note: NUM_MPROF_BUFFERS must be smaller than MPROF_HASH_SIZE.
- */
-#ifdef MPROF_BUFFERS
-#define NUM_MPROF_BUFFERS	MPROF_BUFFERS
+#ifdef LOCK_PROFILING
+static inline void lock_profile_init(void)
+{
+        int i;
+        /* Initialize the mutex profiling locks */
+        for (i = 0; i < LPROF_LOCK_SIZE; i++) {
+                mtx_init(&lprof_locks[i], "mprof lock",
+                    NULL, MTX_SPIN|MTX_QUIET|MTX_NOPROFILE);
+        }
+}
 #else
-#define	NUM_MPROF_BUFFERS	1000
+static inline void lock_profile_init(void) {;}
 #endif
-static struct mutex_prof mprof_buf[NUM_MPROF_BUFFERS];
-static int first_free_mprof_buf;
-#ifndef MPROF_HASH_SIZE
-#define	MPROF_HASH_SIZE		1009
-#endif
-#if NUM_MPROF_BUFFERS >= MPROF_HASH_SIZE
-#error MPROF_BUFFERS must be larger than MPROF_HASH_SIZE
-#endif
-static struct mutex_prof *mprof_hash[MPROF_HASH_SIZE];
-/* SWAG: sbuf size = avg stat. line size * number of locks */
-#define MPROF_SBUF_SIZE		256 * 400
-
-static int mutex_prof_acquisitions;
-SYSCTL_INT(_debug_mutex_prof, OID_AUTO, acquisitions, CTLFLAG_RD,
-    &mutex_prof_acquisitions, 0, "Number of mutex acquistions recorded");
-static int mutex_prof_records;
-SYSCTL_INT(_debug_mutex_prof, OID_AUTO, records, CTLFLAG_RD,
-    &mutex_prof_records, 0, "Number of profiling records");
-static int mutex_prof_maxrecords = NUM_MPROF_BUFFERS;
-SYSCTL_INT(_debug_mutex_prof, OID_AUTO, maxrecords, CTLFLAG_RD,
-    &mutex_prof_maxrecords, 0, "Maximum number of profiling records");
-static int mutex_prof_rejected;
-SYSCTL_INT(_debug_mutex_prof, OID_AUTO, rejected, CTLFLAG_RD,
-    &mutex_prof_rejected, 0, "Number of rejected profiling records");
-static int mutex_prof_hashsize = MPROF_HASH_SIZE;
-SYSCTL_INT(_debug_mutex_prof, OID_AUTO, hashsize, CTLFLAG_RD,
-    &mutex_prof_hashsize, 0, "Hash size");
-static int mutex_prof_collisions = 0;
-SYSCTL_INT(_debug_mutex_prof, OID_AUTO, collisions, CTLFLAG_RD,
-    &mutex_prof_collisions, 0, "Number of hash collisions");
-
-/*
- * mprof_mtx protects the profiling buffers and the hash.
- */
-static struct mtx mprof_mtx;
-MTX_SYSINIT(mprof, &mprof_mtx, "mutex profiling lock", MTX_SPIN | MTX_QUIET);
-
-static u_int64_t
-nanoseconds(void)
-{
-	struct timespec tv;
-
-	nanotime(&tv);
-	return (tv.tv_sec * (u_int64_t)1000000000 + tv.tv_nsec);
-}
-
-static int
-dump_mutex_prof_stats(SYSCTL_HANDLER_ARGS)
-{
-	struct sbuf *sb;
-	int error, i;
-	static int multiplier = 1;
-
-	if (first_free_mprof_buf == 0)
-		return (SYSCTL_OUT(req, "No locking recorded",
-		    sizeof("No locking recorded")));
-
-retry_sbufops:
-	sb = sbuf_new(NULL, NULL, MPROF_SBUF_SIZE * multiplier, SBUF_FIXEDLEN);
-	sbuf_printf(sb, "\n%6s %12s %11s %5s %12s %12s %s\n",
-	    "max", "total", "count", "avg", "cnt_hold", "cnt_lock", "name");
-	/*
-	 * XXX this spinlock seems to be by far the largest perpetrator
-	 * of spinlock latency (1.6 msec on an Athlon1600 was recorded
-	 * even before I pessimized it further by moving the average
-	 * computation here).
-	 */
-	mtx_lock_spin(&mprof_mtx);
-	for (i = 0; i < first_free_mprof_buf; ++i) {
-		sbuf_printf(sb, "%6ju %12ju %11ju %5ju %12ju %12ju %s:%d (%s)\n",
-		    mprof_buf[i].cnt_max / 1000,
-		    mprof_buf[i].cnt_tot / 1000,
-		    mprof_buf[i].cnt_cur,
-		    mprof_buf[i].cnt_cur == 0 ? (uintmax_t)0 :
-			mprof_buf[i].cnt_tot / (mprof_buf[i].cnt_cur * 1000),
-		    mprof_buf[i].cnt_contest_holding,
-		    mprof_buf[i].cnt_contest_locking,
-		    mprof_buf[i].file, mprof_buf[i].line, mprof_buf[i].name);
-		if (sbuf_overflowed(sb)) {
-			mtx_unlock_spin(&mprof_mtx);
-			sbuf_delete(sb);
-			multiplier++;
-			goto retry_sbufops;
-		}
-	}
-	mtx_unlock_spin(&mprof_mtx);
-	sbuf_finish(sb);
-	error = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb) + 1);
-	sbuf_delete(sb);
-	return (error);
-}
-SYSCTL_PROC(_debug_mutex_prof, OID_AUTO, stats, CTLTYPE_STRING | CTLFLAG_RD,
-    NULL, 0, dump_mutex_prof_stats, "A", "Mutex profiling statistics");
-
-static int
-reset_mutex_prof_stats(SYSCTL_HANDLER_ARGS)
-{
-	int error, v;
-
-	if (first_free_mprof_buf == 0)
-		return (0);
-
-	v = 0;
-	error = sysctl_handle_int(oidp, &v, 0, req);
-	if (error)
-		return (error);
-	if (req->newptr == NULL)
-		return (error);
-	if (v == 0)
-		return (0);
-
-	mtx_lock_spin(&mprof_mtx);
-	bzero(mprof_buf, sizeof(*mprof_buf) * first_free_mprof_buf);
-	bzero(mprof_hash, sizeof(struct mtx *) * MPROF_HASH_SIZE);
-	first_free_mprof_buf = 0;
-	mtx_unlock_spin(&mprof_mtx);
+
+void
+lock_mtx(struct lock_object *lock, int how)
+{
+
+	mtx_lock((struct mtx *)lock);
+}
+
+void
+lock_spin(struct lock_object *lock, int how)
+{
+
+	panic("spin locks can only use msleep_spin");
+}
+
+int
+unlock_mtx(struct lock_object *lock)
+{
+	struct mtx *m;
+
+	m = (struct mtx *)lock;
+	mtx_assert(m, MA_OWNED | MA_NOTRECURSED);
+	mtx_unlock(m);
 	return (0);
 }
-SYSCTL_PROC(_debug_mutex_prof, OID_AUTO, reset, CTLTYPE_INT | CTLFLAG_RW,
-    NULL, 0, reset_mutex_prof_stats, "I", "Reset mutex profiling statistics");
-#endif
+
+int
+unlock_spin(struct lock_object *lock)
+{
+
+	panic("spin locks can only use msleep_spin");
+}
 
 /*
  * Function versions of the inlined __mtx_* macros.  These are used by
@@ -278,119 +175,57 @@
 {
 
 	MPASS(curthread != NULL);
-	KASSERT(m->mtx_object.lo_class == &lock_class_mtx_sleep,
-	    ("mtx_lock() of spin mutex %s @ %s:%d", m->mtx_object.lo_name,
+	KASSERT(m->mtx_lock != MTX_DESTROYED,
+	    ("mtx_lock() of destroyed mutex @ %s:%d", file, line));
+	KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_sleep,
+	    ("mtx_lock() of spin mutex %s @ %s:%d", m->lock_object.lo_name,
 	    file, line));
-	WITNESS_CHECKORDER(&m->mtx_object, opts | LOP_NEWORDER | LOP_EXCLUSIVE,
+	WITNESS_CHECKORDER(&m->lock_object, opts | LOP_NEWORDER | LOP_EXCLUSIVE,
 	    file, line);
+
 	_get_sleep_lock(m, curthread, opts, file, line);
-	LOCK_LOG_LOCK("LOCK", &m->mtx_object, opts, m->mtx_recurse, file,
+	LOCK_LOG_LOCK("LOCK", &m->lock_object, opts, m->mtx_recurse, file,
 	    line);
-	WITNESS_LOCK(&m->mtx_object, opts | LOP_EXCLUSIVE, file, line);
-#ifdef MUTEX_PROFILING
-	/* don't reset the timer when/if recursing */
-	if (m->mtx_acqtime == 0) {
-		m->mtx_filename = file;
-		m->mtx_lineno = line;
-		m->mtx_acqtime = mutex_prof_enable ? nanoseconds() : 0;
-		++mutex_prof_acquisitions;
-	}
-#endif
+	WITNESS_LOCK(&m->lock_object, opts | LOP_EXCLUSIVE, file, line);
+	curthread->td_locks++;
 }
 
 void
 _mtx_unlock_flags(struct mtx *m, int opts, const char *file, int line)
 {
-
 	MPASS(curthread != NULL);
-	KASSERT(m->mtx_object.lo_class == &lock_class_mtx_sleep,
-	    ("mtx_unlock() of spin mutex %s @ %s:%d", m->mtx_object.lo_name,
+	KASSERT(m->mtx_lock != MTX_DESTROYED,
+	    ("mtx_unlock() of destroyed mutex @ %s:%d", file, line));
+	KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_sleep,
+	    ("mtx_unlock() of spin mutex %s @ %s:%d", m->lock_object.lo_name,
 	    file, line));
-	WITNESS_UNLOCK(&m->mtx_object, opts | LOP_EXCLUSIVE, file, line);
-	LOCK_LOG_LOCK("UNLOCK", &m->mtx_object, opts, m->mtx_recurse, file,
+	curthread->td_locks--;
+	WITNESS_UNLOCK(&m->lock_object, opts | LOP_EXCLUSIVE, file, line);
+	LOCK_LOG_LOCK("UNLOCK", &m->lock_object, opts, m->mtx_recurse, file,
 	    line);
 	mtx_assert(m, MA_OWNED);
-#ifdef MUTEX_PROFILING
-	if (m->mtx_acqtime != 0) {
-		static const char *unknown = "(unknown)";
-		struct mutex_prof *mpp;
-		u_int64_t acqtime, now;
-		const char *p, *q;
-		volatile u_int hash;
-
-		now = nanoseconds();
-		acqtime = m->mtx_acqtime;
-		m->mtx_acqtime = 0;
-		if (now <= acqtime)
-			goto out;
-		for (p = m->mtx_filename;
-		    p != NULL && strncmp(p, "../", 3) == 0; p += 3)
-			/* nothing */ ;
-		if (p == NULL || *p == '\0')
-			p = unknown;
-		for (hash = m->mtx_lineno, q = p; *q != '\0'; ++q)
-			hash = (hash * 2 + *q) % MPROF_HASH_SIZE;
-		mtx_lock_spin(&mprof_mtx);
-		for (mpp = mprof_hash[hash]; mpp != NULL; mpp = mpp->next)
-			if (mpp->line == m->mtx_lineno &&
-			    strcmp(mpp->file, p) == 0)
-				break;
-		if (mpp == NULL) {
-			/* Just exit if we cannot get a trace buffer */
-			if (first_free_mprof_buf >= NUM_MPROF_BUFFERS) {
-				++mutex_prof_rejected;
-				goto unlock;
-			}
-			mpp = &mprof_buf[first_free_mprof_buf++];
-			mpp->name = mtx_name(m);
-			mpp->file = p;
-			mpp->line = m->mtx_lineno;
-			mpp->next = mprof_hash[hash];
-			if (mprof_hash[hash] != NULL)
-				++mutex_prof_collisions;
-			mprof_hash[hash] = mpp;
-			++mutex_prof_records;
-		}
-		/*
-		 * Record if the mutex has been held longer now than ever
-		 * before.
-		 */
-		if (now - acqtime > mpp->cnt_max)
-			mpp->cnt_max = now - acqtime;
-		mpp->cnt_tot += now - acqtime;
-		mpp->cnt_cur++;
-		/*
-		 * There's a small race, really we should cmpxchg
-		 * 0 with the current value, but that would bill
-		 * the contention to the wrong lock instance if
-		 * it followed this also.
-		 */
-		mpp->cnt_contest_holding += m->mtx_contest_holding;
-		m->mtx_contest_holding = 0;
-		mpp->cnt_contest_locking += m->mtx_contest_locking;
-		m->mtx_contest_locking = 0;
-unlock:
-		mtx_unlock_spin(&mprof_mtx);
-	}
-out:
-#endif
+
+	if (m->mtx_recurse == 0)
+		lock_profile_release_lock(&m->lock_object);
 	_rel_sleep_lock(m, curthread, opts, file, line);
 }
 
 void
 _mtx_lock_spin_flags(struct mtx *m, int opts, const char *file, int line)
 {
-
+	
 	MPASS(curthread != NULL);
-	KASSERT(m->mtx_object.lo_class == &lock_class_mtx_spin,
+	KASSERT(m->mtx_lock != MTX_DESTROYED,
+	    ("mtx_lock_spin() of destroyed mutex @ %s:%d", file, line));
+	KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_spin,
 	    ("mtx_lock_spin() of sleep mutex %s @ %s:%d",
-	    m->mtx_object.lo_name, file, line));
-	WITNESS_CHECKORDER(&m->mtx_object, opts | LOP_NEWORDER | LOP_EXCLUSIVE,
+	    m->lock_object.lo_name, file, line));
+	WITNESS_CHECKORDER(&m->lock_object, opts | LOP_NEWORDER | LOP_EXCLUSIVE,
 	    file, line);
 	_get_spin_lock(m, curthread, opts, file, line);
-	LOCK_LOG_LOCK("LOCK", &m->mtx_object, opts, m->mtx_recurse, file,
+	LOCK_LOG_LOCK("LOCK", &m->lock_object, opts, m->mtx_recurse, file,
 	    line);
-	WITNESS_LOCK(&m->mtx_object, opts | LOP_EXCLUSIVE, file, line);
+	WITNESS_LOCK(&m->lock_object, opts | LOP_EXCLUSIVE, file, line);
 }
 
 void
@@ -398,13 +233,16 @@
 {
 
 	MPASS(curthread != NULL);
-	KASSERT(m->mtx_object.lo_class == &lock_class_mtx_spin,
+	KASSERT(m->mtx_lock != MTX_DESTROYED,
+	    ("mtx_unlock_spin() of destroyed mutex @ %s:%d", file, line));
+	KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_spin,
 	    ("mtx_unlock_spin() of sleep mutex %s @ %s:%d",
-	    m->mtx_object.lo_name, file, line));
-	WITNESS_UNLOCK(&m->mtx_object, opts | LOP_EXCLUSIVE, file, line);
-	LOCK_LOG_LOCK("UNLOCK", &m->mtx_object, opts, m->mtx_recurse, file,
+	    m->lock_object.lo_name, file, line));
+	WITNESS_UNLOCK(&m->lock_object, opts | LOP_EXCLUSIVE, file, line);
+	LOCK_LOG_LOCK("UNLOCK", &m->lock_object, opts, m->mtx_recurse, file,
 	    line);
 	mtx_assert(m, MA_OWNED);
+
 	_rel_spin_lock(m);
 }
 
@@ -416,24 +254,33 @@
 int
 _mtx_trylock(struct mtx *m, int opts, const char *file, int line)
 {
-	int rval;
-
+	int rval, contested = 0;
+	uint64_t waittime = 0;
+	
 	MPASS(curthread != NULL);
-	KASSERT(m->mtx_object.lo_class == &lock_class_mtx_sleep,
-	    ("mtx_trylock() of spin mutex %s @ %s:%d", m->mtx_object.lo_name,
+	KASSERT(m->mtx_lock != MTX_DESTROYED,
+	    ("mtx_trylock() of destroyed mutex @ %s:%d", file, line));
+	KASSERT(LOCK_CLASS(&m->lock_object) == &lock_class_mtx_sleep,
+	    ("mtx_trylock() of spin mutex %s @ %s:%d", m->lock_object.lo_name,
 	    file, line));
 
-	if (mtx_owned(m) && (m->mtx_object.lo_flags & LO_RECURSABLE) != 0) {
+	if (mtx_owned(m) && (m->lock_object.lo_flags & LO_RECURSABLE) != 0) {
 		m->mtx_recurse++;
 		atomic_set_ptr(&m->mtx_lock, MTX_RECURSED);
 		rval = 1;
 	} else
 		rval = _obtain_lock(m, (uintptr_t)curthread);
 
-	LOCK_LOG_TRY("LOCK", &m->mtx_object, opts, rval, file, line);
-	if (rval)
-		WITNESS_LOCK(&m->mtx_object, opts | LOP_EXCLUSIVE | LOP_TRYLOCK,
+	LOCK_LOG_TRY("LOCK", &m->lock_object, opts, rval, file, line);
+	if (rval) {
+		WITNESS_LOCK(&m->lock_object, opts | LOP_EXCLUSIVE | LOP_TRYLOCK,
 		    file, line);
+		curthread->td_locks++;
+		if (m->mtx_recurse == 0)
+			lock_profile_obtain_lock_success(&m->lock_object, contested,
+			    waittime, file, line);
+
+	}
 
 	return (rval);
 }
@@ -448,42 +295,62 @@
 _mtx_lock_sleep(struct mtx *m, uintptr_t tid, int opts, const char *file,
     int line)
 {
-#if defined(SMP) && !defined(NO_ADAPTIVE_MUTEXES)
-	struct thread *owner;
+	struct turnstile *ts;
+#ifdef ADAPTIVE_MUTEXES
+	volatile struct thread *owner;
 #endif
-	uintptr_t v;
 #ifdef KTR
 	int cont_logged = 0;
 #endif
-#ifdef MUTEX_PROFILING
-	int contested;
-#endif
-
+	int contested = 0;
+	uint64_t waittime = 0;
+	uintptr_t v;
+	
 	if (mtx_owned(m)) {
-		KASSERT((m->mtx_object.lo_flags & LO_RECURSABLE) != 0,
+		KASSERT((m->lock_object.lo_flags & LO_RECURSABLE) != 0,
 	    ("_mtx_lock_sleep: recursed on non-recursive mutex %s @ %s:%d\n",
-		    m->mtx_object.lo_name, file, line));
+		    m->lock_object.lo_name, file, line));
 		m->mtx_recurse++;
 		atomic_set_ptr(&m->mtx_lock, MTX_RECURSED);
-		if (LOCK_LOG_TEST(&m->mtx_object, opts))
+		if (LOCK_LOG_TEST(&m->lock_object, opts))
 			CTR1(KTR_LOCK, "_mtx_lock_sleep: %p recursing", m);
 		return;
 	}
 
-	if (LOCK_LOG_TEST(&m->mtx_object, opts))
+	lock_profile_obtain_lock_failed(&m->lock_object,
+		    &contested, &waittime);
+	if (LOCK_LOG_TEST(&m->lock_object, opts))
 		CTR4(KTR_LOCK,
 		    "_mtx_lock_sleep: %s contested (lock=%p) at %s:%d",
-		    m->mtx_object.lo_name, (void *)m->mtx_lock, file, line);
+		    m->lock_object.lo_name, (void *)m->mtx_lock, file, line);
 
-#ifdef MUTEX_PROFILING
-	contested = 0;
+	while (!_obtain_lock(m, tid)) { 
+#ifdef ADAPTIVE_MUTEXES
+		/*
+		 * If the owner is running on another CPU, spin until the
+		 * owner stops running or the state of the lock changes.
+		 */
+		v = m->mtx_lock;
+		if (v != MTX_UNOWNED) {
+			owner = (struct thread *)(v & ~MTX_FLAGMASK);
+#ifdef ADAPTIVE_GIANT
+			if (TD_IS_RUNNING(owner)) {
+#else
+			if (m != &Giant && TD_IS_RUNNING(owner)) {
 #endif
-	while (!_obtain_lock(m, tid)) {
-#ifdef MUTEX_PROFILING
-		contested = 1;
-		atomic_add_int(&m->mtx_contest_holding, 1);
+				if (LOCK_LOG_TEST(&m->lock_object, 0))
+					CTR3(KTR_LOCK,
+					    "%s: spinning on %p held by %p",
+					    __func__, m, owner);
+				while (mtx_owner(m) == owner &&
+				    TD_IS_RUNNING(owner))
+					cpu_spinwait();
+				continue;
+			}
+		}
 #endif
-		turnstile_lock(&m->mtx_object);
+
+		ts = turnstile_trywait(&m->lock_object);
 		v = m->mtx_lock;
 
 		/*
@@ -491,24 +358,27 @@
 		 * the turnstile chain lock.
 		 */
 		if (v == MTX_UNOWNED) {
-			turnstile_release(&m->mtx_object);
+			turnstile_cancel(ts);
 			cpu_spinwait();
 			continue;
 		}
 
-#ifdef MUTEX_WAKE_ALL
 		MPASS(v != MTX_CONTESTED);
-#else
+
+#ifdef ADAPTIVE_MUTEXES
 		/*
-		 * The mutex was marked contested on release. This means that
-		 * there are other threads blocked on it.  Grab ownership of
-		 * it and propagate its priority to the current thread if
-		 * necessary.
+		 * If the current owner of the lock is executing on another
+		 * CPU, quit the hard path and try to spin.
 		 */
-		if (v == MTX_CONTESTED) {
-			m->mtx_lock = tid | MTX_CONTESTED;
-			turnstile_claim(&m->mtx_object);
-			break;
+		owner = (struct thread *)(v & ~MTX_FLAGMASK);
+#ifdef ADAPTIVE_GIANT
+		if (TD_IS_RUNNING(owner)) {
+#else
+		if (m != &Giant && TD_IS_RUNNING(owner)) {
+#endif
+			turnstile_cancel(ts);
+			cpu_spinwait();
+			continue;
 		}
 #endif
 
@@ -519,30 +389,11 @@
 		 */
 		if ((v & MTX_CONTESTED) == 0 &&
 		    !atomic_cmpset_ptr(&m->mtx_lock, v, v | MTX_CONTESTED)) {
-			turnstile_release(&m->mtx_object);
+			turnstile_cancel(ts);
 			cpu_spinwait();
 			continue;
 		}
 
-#if defined(SMP) && !defined(NO_ADAPTIVE_MUTEXES)
-		/*
-		 * If the current owner of the lock is executing on another
-		 * CPU, spin instead of blocking.
-		 */
-		owner = (struct thread *)(v & MTX_FLAGMASK);
-#ifdef ADAPTIVE_GIANT
-		if (TD_IS_RUNNING(owner)) {
-#else
-		if (m != &Giant && TD_IS_RUNNING(owner)) {
-#endif
-			turnstile_release(&m->mtx_object);
-			while (mtx_owner(m) == owner && TD_IS_RUNNING(owner)) {
-				cpu_spinwait();
-			}
-			continue;
-		}
-#endif	/* SMP && !NO_ADAPTIVE_MUTEXES */
-
 		/*
 		 * We definitely must sleep for this lock.
 		 */
@@ -552,9 +403,9 @@
 		if (!cont_logged) {
 			CTR6(KTR_CONTENTION,
 			    "contention: %p at %s:%d wants %s, taken by %s:%d",
-			    (void *)tid, file, line, m->mtx_object.lo_name,
-			    WITNESS_FILE(&m->mtx_object),
-			    WITNESS_LINE(&m->mtx_object));
+			    (void *)tid, file, line, m->lock_object.lo_name,
+			    WITNESS_FILE(&m->lock_object),
+			    WITNESS_LINE(&m->lock_object));
 			cont_logged = 1;
 		}
 #endif
@@ -562,22 +413,36 @@
 		/*
 		 * Block on the turnstile.
 		 */
-		turnstile_wait(&m->mtx_object, mtx_owner(m));
+		turnstile_wait(ts, mtx_owner(m), TS_EXCLUSIVE_QUEUE);
 	}
-
 #ifdef KTR
 	if (cont_logged) {
 		CTR4(KTR_CONTENTION,
 		    "contention end: %s acquired by %p at %s:%d",
-		    m->mtx_object.lo_name, (void *)tid, file, line);
+		    m->lock_object.lo_name, (void *)tid, file, line);
 	}
 #endif
-#ifdef MUTEX_PROFILING
-	if (contested)
-		m->mtx_contest_locking++;
-	m->mtx_contest_holding = 0;
+	lock_profile_obtain_lock_success(&m->lock_object, contested,	
+	    waittime, (file), (line));					
+}
+
+static void
+_mtx_lock_spin_failed(struct mtx *m)
+{
+	struct thread *td;
+
+	td = mtx_owner(m);
+
+	/* If the mutex is unlocked, try again. */
+	if (td == NULL)
+		return;
+
+	printf( "spin lock %p (%s) held by %p (tid %d) too long\n",
+	    m, m->lock_object.lo_name, td, td->td_tid);
+#ifdef WITNESS
+	witness_display_spinlock(&m->lock_object, td);
 #endif
-	return;
+	panic("spin lock held too long");
 }
 
 #ifdef SMP
@@ -591,14 +456,14 @@
 _mtx_lock_spin(struct mtx *m, uintptr_t tid, int opts, const char *file,
     int line)
 {
-	int i = 0;
-
-	if (LOCK_LOG_TEST(&m->mtx_object, opts))
+	int i = 0, contested = 0;
+	uint64_t waittime = 0;
+	
+	if (LOCK_LOG_TEST(&m->lock_object, opts))
 		CTR1(KTR_LOCK, "_mtx_lock_spin: %p spinning", m);
 
-	for (;;) {
-		if (_obtain_lock(m, tid))
-			break;
+	lock_profile_obtain_lock_failed(&m->lock_object, &contested, &waittime);
+	while (!_obtain_lock(m, tid)) {
 
 		/* Give interrupts a chance while we spin. */
 		spinlock_exit();
@@ -607,29 +472,107 @@
 				cpu_spinwait();
 				continue;
 			}
-			if (i < 60000000)
+			if (i < 60000000 || kdb_active || panicstr != NULL)
 				DELAY(1);
-			else if (!kdb_active && !panicstr) {
-				printf("spin lock %s held by %p for > 5 seconds\n",
-				    m->mtx_object.lo_name, (void *)m->mtx_lock);
-#ifdef WITNESS
-				witness_display_spinlock(&m->mtx_object,
-				    mtx_owner(m));
-#endif
-				panic("spin lock held too long");
-			}
+			else
+				_mtx_lock_spin_failed(m);
 			cpu_spinwait();
 		}
 		spinlock_enter();
 	}
 
-	if (LOCK_LOG_TEST(&m->mtx_object, opts))
+	if (LOCK_LOG_TEST(&m->lock_object, opts))
 		CTR1(KTR_LOCK, "_mtx_lock_spin: %p spin done", m);
 
-	return;
+	lock_profile_obtain_lock_success(&m->lock_object, contested,	
+	    waittime, (file), (line));
 }
 #endif /* SMP */
 
+void
+_thread_lock_flags(struct thread *td, int opts, const char *file, int line)
+{
+	struct mtx *m;
+	uintptr_t tid;
+	int i, contested;
+	uint64_t waittime;
+
+	
+	contested = i = 0;
+	waittime = 0;
+	tid = (uintptr_t)curthread;
+	for (;;) {
+retry:
+		spinlock_enter();
+		m = td->td_lock;
+		WITNESS_CHECKORDER(&m->lock_object,
+		    opts | LOP_NEWORDER | LOP_EXCLUSIVE, file, line);
+		while (!_obtain_lock(m, tid)) {
+			if (m->mtx_lock == tid) {
+				m->mtx_recurse++;
+				break;
+			}
+			lock_profile_obtain_lock_failed(&m->lock_object, &contested, &waittime);
+			/* Give interrupts a chance while we spin. */
+			spinlock_exit();
+			while (m->mtx_lock != MTX_UNOWNED) {
+				if (i++ < 10000000)
+					cpu_spinwait();
+				else if (i < 60000000 ||
+				    kdb_active || panicstr != NULL)
+					DELAY(1);
+				else
+					_mtx_lock_spin_failed(m);
+				cpu_spinwait();
+				if (m != td->td_lock)
+					goto retry;
+			}
+			spinlock_enter();
+		}
+		if (m == td->td_lock)
+			break;
+		_rel_spin_lock(m);	/* does spinlock_exit() */
+	}
+	lock_profile_obtain_lock_success(&m->lock_object, contested,	
+	    waittime, (file), (line));
+	WITNESS_LOCK(&m->lock_object, opts | LOP_EXCLUSIVE, file, line);
+}
+
+struct mtx *
+thread_lock_block(struct thread *td)
+{
+	struct mtx *lock;
+
+	spinlock_enter();
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
+	lock = td->td_lock;
+	td->td_lock = &blocked_lock;
+	mtx_unlock_spin(lock);
+
+	return (lock);
+}
+
+void
+thread_lock_unblock(struct thread *td, struct mtx *new)
+{
+	mtx_assert(new, MA_OWNED);
+	MPASS(td->td_lock == &blocked_lock);
+	atomic_store_rel_ptr((volatile void *)&td->td_lock, (uintptr_t)new);
+	spinlock_exit();
+}
+
+void
+thread_lock_set(struct thread *td, struct mtx *new)
+{
+	struct mtx *lock;
+
+	mtx_assert(new, MA_OWNED);
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
+	lock = td->td_lock;
+	td->td_lock = new;
+	mtx_unlock_spin(lock);
+}
+
 /*
  * _mtx_unlock_sleep: the tougher part of releasing an MTX_DEF lock.
  *
@@ -640,95 +583,33 @@
 _mtx_unlock_sleep(struct mtx *m, int opts, const char *file, int line)
 {
 	struct turnstile *ts;
-#ifndef PREEMPTION
-	struct thread *td, *td1;
-#endif
 
 	if (mtx_recursed(m)) {
 		if (--(m->mtx_recurse) == 0)
 			atomic_clear_ptr(&m->mtx_lock, MTX_RECURSED);
-		if (LOCK_LOG_TEST(&m->mtx_object, opts))
+		if (LOCK_LOG_TEST(&m->lock_object, opts))
 			CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p unrecurse", m);
 		return;
 	}
 
-	turnstile_lock(&m->mtx_object);
-	ts = turnstile_lookup(&m->mtx_object);
-	if (LOCK_LOG_TEST(&m->mtx_object, opts))
+	/*
+	 * We have to lock the chain before the turnstile so this turnstile
+	 * can be removed from the hash list if it is empty.
+	 */
+	turnstile_chain_lock(&m->lock_object);
+	ts = turnstile_lookup(&m->lock_object);
+	if (LOCK_LOG_TEST(&m->lock_object, opts))
 		CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p contested", m);
 
-#if defined(SMP) && !defined(NO_ADAPTIVE_MUTEXES)
-	if (ts == NULL) {
-		_release_lock_quick(m);
-		if (LOCK_LOG_TEST(&m->mtx_object, opts))
-			CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p no sleepers", m);
-		turnstile_release(&m->mtx_object);
-		return;
-	}
-#else
 	MPASS(ts != NULL);
-#endif
-#ifndef PREEMPTION
-	/* XXX */
-	td1 = turnstile_head(ts);
-#endif
-#ifdef MUTEX_WAKE_ALL
-	turnstile_broadcast(ts);
+	turnstile_broadcast(ts, TS_EXCLUSIVE_QUEUE);
 	_release_lock_quick(m);
-#else
-	if (turnstile_signal(ts)) {
-		_release_lock_quick(m);
-		if (LOCK_LOG_TEST(&m->mtx_object, opts))
-			CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p not held", m);
-	} else {
-		m->mtx_lock = MTX_CONTESTED;
-		if (LOCK_LOG_TEST(&m->mtx_object, opts))
-			CTR1(KTR_LOCK, "_mtx_unlock_sleep: %p still contested",
-			    m);
-	}
-#endif
-	turnstile_unpend(ts);
-
-#ifndef PREEMPTION
 	/*
-	 * XXX: This is just a hack until preemption is done.  However,
-	 * once preemption is done we need to either wrap the
-	 * turnstile_signal() and release of the actual lock in an
-	 * extra critical section or change the preemption code to
-	 * always just set a flag and never do instant-preempts.
+	 * This turnstile is now no longer associated with the mutex.  We can
+	 * unlock the chain lock so a new turnstile may take its place.
 	 */
-	td = curthread;
-	if (td->td_critnest > 0 || td1->td_priority >= td->td_priority)
-		return;
-	mtx_lock_spin(&sched_lock);
-	if (!TD_IS_RUNNING(td1)) {
-#ifdef notyet
-		if (td->td_ithd != NULL) {
-			struct ithd *it = td->td_ithd;
-
-			if (it->it_interrupted) {
-				if (LOCK_LOG_TEST(&m->mtx_object, opts))
-					CTR2(KTR_LOCK,
-				    "_mtx_unlock_sleep: %p interrupted %p",
-					    it, it->it_interrupted);
-				intr_thd_fixup(it);
-			}
-		}
-#endif
-		if (LOCK_LOG_TEST(&m->mtx_object, opts))
-			CTR2(KTR_LOCK,
-			    "_mtx_unlock_sleep: %p switching out lock=%p", m,
-			    (void *)m->mtx_lock);
-
-		mi_switch(SW_INVOL, NULL);
-		if (LOCK_LOG_TEST(&m->mtx_object, opts))
-			CTR2(KTR_LOCK, "_mtx_unlock_sleep: %p resuming lock=%p",
-			    m, (void *)m->mtx_lock);
-	}
-	mtx_unlock_spin(&sched_lock);
-#endif
-
-	return;
+	turnstile_unpend(ts, TS_EXCLUSIVE_LOCK);
+	turnstile_chain_unlock(&m->lock_object);
 }
 
 /*
@@ -752,20 +633,20 @@
 	case MA_OWNED | MA_NOTRECURSED:
 		if (!mtx_owned(m))
 			panic("mutex %s not owned at %s:%d",
-			    m->mtx_object.lo_name, file, line);
+			    m->lock_object.lo_name, file, line);
 		if (mtx_recursed(m)) {
 			if ((what & MA_NOTRECURSED) != 0)
 				panic("mutex %s recursed at %s:%d",
-				    m->mtx_object.lo_name, file, line);
+				    m->lock_object.lo_name, file, line);
 		} else if ((what & MA_RECURSED) != 0) {
 			panic("mutex %s unrecursed at %s:%d",
-			    m->mtx_object.lo_name, file, line);
+			    m->lock_object.lo_name, file, line);
 		}
 		break;
 	case MA_NOTOWNED:
 		if (mtx_owned(m))
 			panic("mutex %s owned at %s:%d",
-			    m->mtx_object.lo_name, file, line);
+			    m->lock_object.lo_name, file, line);
 		break;
 	default:
 		panic("unknown mtx_assert at %s:%d", file, line);
@@ -791,11 +672,6 @@
  * XXX: When kernacc() does not require Giant we can reenable this check
  */
 #ifdef notyet
-/*
- * XXX - When kernacc() is fixed on the alpha to handle K0_SEG memory properly
- * we can re-enable the kernacc() checks.
- */
-#ifndef __alpha__
 	/*
 	 * Can't call kernacc() from early init386(), especially when
 	 * initializing Giant mutex, because some stuff in kernacc()
@@ -806,7 +682,6 @@
 		    VM_PROT_READ | VM_PROT_WRITE))
 			panic("Can't read and write to mutex %p", m);
 #endif
-#endif
 }
 #endif
 
@@ -830,40 +705,39 @@
 void
 mtx_init(struct mtx *m, const char *name, const char *type, int opts)
 {
-	struct lock_object *lock;
+	struct lock_class *class;
+	int flags;
 
 	MPASS((opts & ~(MTX_SPIN | MTX_QUIET | MTX_RECURSE |
-	    MTX_NOWITNESS | MTX_DUPOK)) == 0);
+		MTX_NOWITNESS | MTX_DUPOK | MTX_NOPROFILE)) == 0);
 
 #ifdef MUTEX_DEBUG
 	/* Diagnostic and error correction */
 	mtx_validate(m);
 #endif
 
-	lock = &m->mtx_object;
-	KASSERT((lock->lo_flags & LO_INITIALIZED) == 0,
-	    ("mutex \"%s\" %p already initialized", name, m));
-	bzero(m, sizeof(*m));
+	/* Determine lock class and lock flags. */
 	if (opts & MTX_SPIN)
-		lock->lo_class = &lock_class_mtx_spin;
+		class = &lock_class_mtx_spin;
 	else
-		lock->lo_class = &lock_class_mtx_sleep;
-	lock->lo_name = name;
-	lock->lo_type = type != NULL ? type : name;
+		class = &lock_class_mtx_sleep;
+	flags = 0;
 	if (opts & MTX_QUIET)
-		lock->lo_flags = LO_QUIET;
+		flags |= LO_QUIET;
 	if (opts & MTX_RECURSE)
-		lock->lo_flags |= LO_RECURSABLE;
+		flags |= LO_RECURSABLE;
 	if ((opts & MTX_NOWITNESS) == 0)
-		lock->lo_flags |= LO_WITNESS;
+		flags |= LO_WITNESS;
 	if (opts & MTX_DUPOK)
-		lock->lo_flags |= LO_DUPOK;
+		flags |= LO_DUPOK;
+	if (opts & MTX_NOPROFILE)
+		flags |= LO_NOPROFILE;
 
+	/* Initialize mutex. */
 	m->mtx_lock = MTX_UNOWNED;
+	m->mtx_recurse = 0;
 
-	LOCK_LOG_INIT(lock, opts);
-
-	WITNESS_INIT(lock);
+	lock_init(&m->lock_object, class, name, type, flags);
 }
 
 /*
@@ -876,19 +750,24 @@
 mtx_destroy(struct mtx *m)
 {
 
-	LOCK_LOG_DESTROY(&m->mtx_object, 0);
-
 	if (!mtx_owned(m))
 		MPASS(mtx_unowned(m));
 	else {
 		MPASS((m->mtx_lock & (MTX_RECURSED|MTX_CONTESTED)) == 0);
 
+		/* Perform the non-mtx related part of mtx_unlock_spin(). */
+		if (LOCK_CLASS(&m->lock_object) == &lock_class_mtx_spin)
+			spinlock_exit();
+		else
+			curthread->td_locks--;
+
 		/* Tell witness this isn't locked to make it happy. */
-		WITNESS_UNLOCK(&m->mtx_object, LOP_EXCLUSIVE, __FILE__,
+		WITNESS_UNLOCK(&m->lock_object, LOP_EXCLUSIVE, __FILE__,
 		    __LINE__);
 	}
 
-	WITNESS_DESTROY(&m->mtx_object);
+	m->mtx_lock = MTX_DESTROYED;
+	lock_destroy(&m->lock_object);
 }
 
 /*
@@ -900,9 +779,6 @@
 mutex_init(void)
 {
 
-	/* Setup thread0 so that mutexes work. */
-	LIST_INIT(&thread0.td_contested);
-
 	/* Setup turnstiles so that sleep mutexes work. */
 	init_turnstiles();
 
@@ -910,34 +786,17 @@
 	 * Initialize mutexes.
 	 */
 	mtx_init(&Giant, "Giant", NULL, MTX_DEF | MTX_RECURSE);
-	mtx_init(&sched_lock, "sched lock", NULL, MTX_SPIN | MTX_RECURSE);
+	mtx_init(&blocked_lock, "blocked lock", NULL, MTX_SPIN);
+	blocked_lock.mtx_lock = 0xdeadc0de;	/* Always blocked. */
 	mtx_init(&proc0.p_mtx, "process lock", NULL, MTX_DEF | MTX_DUPOK);
+	mtx_init(&proc0.p_slock, "process slock", NULL, MTX_SPIN | MTX_RECURSE);
 	mtx_init(&devmtx, "cdev", NULL, MTX_DEF);
 	mtx_lock(&Giant);
+	
+	lock_profile_init();
 }
 
 #ifdef DDB
-/* XXX: This function is not mutex-specific. */
-DB_SHOW_COMMAND(lock, db_show_lock)
-{
-	struct lock_object *lock;
-
-	if (!have_addr)
-		return;
-	lock = (struct lock_object *)addr;
-	if (lock->lo_class != &lock_class_mtx_sleep &&
-	    lock->lo_class != &lock_class_mtx_spin &&
-	    lock->lo_class != &lock_class_sx) {
-		db_printf("Unknown lock class\n");
-		return;
-	}
-	db_printf(" class: %s\n", lock->lo_class->lc_name);
-	db_printf(" name: %s\n", lock->lo_name);
-	if (lock->lo_type && lock->lo_type != lock->lo_name)
-		db_printf(" type: %s\n", lock->lo_type);
-	lock->lo_class->lc_ddb_show(lock);
-}
-
 void
 db_show_mtx(struct lock_object *lock)
 {
@@ -947,18 +806,20 @@
 	m = (struct mtx *)lock;
 
 	db_printf(" flags: {");
-	if (m->mtx_object.lo_class == &lock_class_mtx_spin)
+	if (LOCK_CLASS(lock) == &lock_class_mtx_spin)
 		db_printf("SPIN");
 	else
 		db_printf("DEF");
-	if (m->mtx_object.lo_flags & LO_RECURSABLE)
+	if (m->lock_object.lo_flags & LO_RECURSABLE)
 		db_printf(", RECURSE");
-	if (m->mtx_object.lo_flags & LO_DUPOK)
+	if (m->lock_object.lo_flags & LO_DUPOK)
 		db_printf(", DUPOK");
 	db_printf("}\n");
 	db_printf(" state: {");
 	if (mtx_unowned(m))
 		db_printf("UNOWNED");
+	else if (mtx_destroyed(m))
+		db_printf("DESTROYED");
 	else {
 		db_printf("OWNED");
 		if (m->mtx_lock & MTX_CONTESTED)
@@ -967,7 +828,7 @@
 			db_printf(", RECURSED");
 	}
 	db_printf("}\n");
-	if (!mtx_unowned(m)) {
+	if (!mtx_unowned(m) && !mtx_destroyed(m)) {
 		td = mtx_owner(m);
 		db_printf(" owner: %p (tid %d, pid %d, \"%s\")\n", td,
 		    td->td_tid, td->td_proc->p_pid, td->td_proc->p_comm);
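
The struct lock_class tables above are converted from positional initializers to C99 designated initializers so the DDB hook and the new lc_lock/lc_unlock members can be set by name while everything unmentioned defaults to zero. A minimal standalone illustration of that initializer style follows; the struct, flag value and helper functions here are stand-ins for this sketch, not the kernel's real lock_class interface.

#include <stdio.h>

struct toy_lock_class {
	const char *lc_name;
	unsigned    lc_flags;
	void      (*lc_lock)(void *, int);
	int       (*lc_unlock)(void *);
};

static void toy_lock(void *lock, int how)   { (void)lock; (void)how; }
static int  toy_unlock(void *lock)          { (void)lock; return (0); }

/* Old style: positional, so every member up to the last one used must be
 * spelled out in declaration order. */
static struct toy_lock_class positional = {
	"sleep mutex", 0x3, toy_lock, toy_unlock
};

/* New style: members are named; anything not mentioned is zero-filled,
 * which is what lets the DDB hook stay inside an #ifdef in the diff. */
static struct toy_lock_class designated = {
	.lc_name   = "sleep mutex",
	.lc_flags  = 0x3,
	.lc_lock   = toy_lock,
	.lc_unlock = toy_unlock,
};

int
main(void)
{
	printf("%s / %s\n", positional.lc_name, designated.lc_name);
	return (0);
}

With designated initializers, adding or reordering members in the struct no longer silently misassigns fields, which is why the conversion accompanies the new lc_lock/lc_unlock members.
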
Index: kern_acct.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_acct.c,v
retrieving revision 1.1.1.2
retrieving revision 1.2
diff -L sys/kern/kern_acct.c -L sys/kern/kern_acct.c -u -r1.1.1.2 -r1.2
--- sys/kern/kern_acct.c
+++ sys/kern/kern_acct.c
@@ -2,13 +2,39 @@
  * Copyright (c) 1982, 1986, 1989, 1993
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
+ * Copyright (c) 2005 Robert N. M. Watson
+ * All rights reserved.
+ *
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
  * Copyright (c) 1994 Christopher G. Demetriou
- * Copyright (c) 2005 Robert N. M. Watson
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -42,7 +68,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_acct.c,v 1.74.2.3 2006/02/14 23:13:17 jhb Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_acct.c,v 1.95 2007/08/31 13:56:26 dds Exp $");
 
 #include "opt_mac.h"
 
@@ -52,11 +78,12 @@
 #include <sys/fcntl.h>
 #include <sys/kernel.h>
 #include <sys/kthread.h>
+#include <sys/limits.h>
 #include <sys/lock.h>
-#include <sys/mac.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
 #include <sys/namei.h>
+#include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/resourcevar.h>
 #include <sys/sched.h>
@@ -68,23 +95,33 @@
 #include <sys/tty.h>
 #include <sys/vnode.h>
 
+#include <security/mac/mac_framework.h>
+
 /*
  * The routines implemented in this file are described in:
  *      Leffler, et al.: The Design and Implementation of the 4.3BSD
 *	    UNIX Operating System (Addison Wesley, 1989)
  * on pages 62-63.
+ * In May 2007 the historic 3-bit base-8 exponent, 13-bit fraction
+ * comp_t representation described in the above reference was replaced
+ * with that of IEEE-754 floats.
  *
  * Arguably, to simplify accounting operations, this mechanism should
  * be replaced by one in which an accounting log file (similar to /dev/klog)
  * is read by a user process, etc.  However, that has its own problems.
  */
 
+/* Floating point definitions from <float.h>. */
+#define FLT_MANT_DIG    24              /* p */
+#define FLT_MAX_EXP     128             /* emax */
+
 /*
  * Internal accounting functions.
  * The former's operation is described in Leffler, et al., and the latter
  * was provided by UCB with the 4.4BSD-Lite release
  */
-static comp_t	encode_comp_t(u_long, u_long);
+static uint32_t	encode_timeval(struct timeval);
+static uint32_t	encode_long(long);
 static void	acctwatch(void);
 static void	acct_thread(void *);
 static int	acct_disable(struct thread *);
@@ -94,6 +131,7 @@
  * acct_sx protects against changes to the active vnode and credentials
  * while accounting records are being committed to disk.
  */
+static int		 acct_configured;
 static int		 acct_suspended;
 static struct vnode	*acct_vp;
 static struct ucred	*acct_cred;
@@ -146,60 +184,60 @@
     &acctchkfreq, 0, sysctl_acct_chkfreq, "I",
     "frequency for checking the free space");
 
+SYSCTL_INT(_kern, OID_AUTO, acct_configured, CTLFLAG_RD, &acct_configured, 0,
+	"Accounting configured or not");
+
 SYSCTL_INT(_kern, OID_AUTO, acct_suspended, CTLFLAG_RD, &acct_suspended, 0,
 	"Accounting suspended or not");
 
 /*
- * Accounting system call.  Written based on the specification and
- * previous implementation done by Mark Tinguely.
- *
- * MPSAFE
+ * Accounting system call.  Written based on the specification and previous
+ * implementation done by Mark Tinguely.
  */
 int
 acct(struct thread *td, struct acct_args *uap)
 {
 	struct nameidata nd;
-	int error, flags;
+	int error, flags, vfslocked;
 
-	/* Make sure that the caller is root. */
-	error = suser(td);
+	error = priv_check(td, PRIV_ACCT);
 	if (error)
 		return (error);
 
 	/*
 	 * If accounting is to be started to a file, open that file for
-	 * appending and make sure it's a 'normal'.  While we could
-	 * conditionally acquire Giant here, we're actually interacting with
-	 * vnodes from possibly two file systems, making the logic a bit
-	 * complicated.  For now, use Giant unconditionally.
+	 * appending and make sure it's a 'normal' file.
 	 */
-	mtx_lock(&Giant);
 	if (uap->path != NULL) {
-		NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, uap->path, td);
+		NDINIT(&nd, LOOKUP, NOFOLLOW | MPSAFE | AUDITVNODE1,
+		    UIO_USERSPACE, uap->path, td);
 		flags = FWRITE | O_APPEND;
-		error = vn_open(&nd, &flags, 0, -1);
+		error = vn_open(&nd, &flags, 0, NULL);
 		if (error)
-			goto done;
+			return (error);
+		vfslocked = NDHASGIANT(&nd);
 		NDFREE(&nd, NDF_ONLY_PNBUF);
 #ifdef MAC
 		error = mac_check_system_acct(td->td_ucred, nd.ni_vp);
 		if (error) {
 			VOP_UNLOCK(nd.ni_vp, 0, td);
 			vn_close(nd.ni_vp, flags, td->td_ucred, td);
-			goto done;
+			VFS_UNLOCK_GIANT(vfslocked);
+			return (error);
 		}
 #endif
 		VOP_UNLOCK(nd.ni_vp, 0, td);
 		if (nd.ni_vp->v_type != VREG) {
 			vn_close(nd.ni_vp, flags, td->td_ucred, td);
-			error = EACCES;
-			goto done;
+			VFS_UNLOCK_GIANT(vfslocked);
+			return (EACCES);
 		}
+		VFS_UNLOCK_GIANT(vfslocked);
 #ifdef MAC
 	} else {
 		error = mac_check_system_acct(td->td_ucred, NULL);
 		if (error)
-			goto done;
+			return (error);
 #endif
 	}
 
@@ -216,15 +254,18 @@
 	 * enabled.
 	 */
 	acct_suspended = 0;
-	if (acct_vp != NULL)
+	if (acct_vp != NULL) {
+		vfslocked = VFS_LOCK_GIANT(acct_vp->v_mount);
 		error = acct_disable(td);
+		VFS_UNLOCK_GIANT(vfslocked);
+	}
 	if (uap->path == NULL) {
 		if (acct_state & ACCT_RUNNING) {
 			acct_state |= ACCT_EXITREQ;
 			wakeup(&acct_state);
 		}
 		sx_xunlock(&acct_sx);
-		goto done;
+		return (error);
 	}
 
 	/*
@@ -245,20 +286,22 @@
 		error = kthread_create(acct_thread, NULL, NULL, 0, 0,
 		    "accounting");
 		if (error) {
+			vfslocked = VFS_LOCK_GIANT(acct_vp->v_mount);
 			(void) vn_close(acct_vp, acct_flags, acct_cred, td);
+			VFS_UNLOCK_GIANT(vfslocked);
 			crfree(acct_cred);
+			acct_configured = 0;
 			acct_vp = NULL;
 			acct_cred = NULL;
 			acct_flags = 0;
 			sx_xunlock(&acct_sx);
 			log(LOG_NOTICE, "Unable to start accounting thread\n");
-			goto done;
+			return (error);
 		}
 	}
+	acct_configured = 1;
 	sx_xunlock(&acct_sx);
 	log(LOG_NOTICE, "Accounting enabled\n");
-done:
-	mtx_unlock(&Giant);
 	return (error);
 }
 
@@ -274,6 +317,7 @@
 	sx_assert(&acct_sx, SX_XLOCKED);
 	error = vn_close(acct_vp, acct_flags, acct_cred, td);
 	crfree(acct_cred);
+	acct_configured = 0;
 	acct_vp = NULL;
 	acct_cred = NULL;
 	acct_flags = 0;
@@ -290,11 +334,11 @@
 int
 acct_process(struct thread *td)
 {
-	struct acct acct;
+	struct acctv2 acct;
 	struct timeval ut, st, tmp;
 	struct plimit *newlim, *oldlim;
 	struct proc *p;
-	struct rusage *r;
+	struct rusage ru;
 	int t, ret, vfslocked;
 
 	/*
@@ -327,9 +371,9 @@
 	bcopy(p->p_comm, acct.ac_comm, sizeof acct.ac_comm);
 
 	/* (2) The amount of user and system time that was used */
-	calcru(p, &ut, &st);
-	acct.ac_utime = encode_comp_t(ut.tv_sec, ut.tv_usec);
-	acct.ac_stime = encode_comp_t(st.tv_sec, st.tv_usec);
+	rufetchcalc(p, &ru, &ut, &st);
+	acct.ac_utime = encode_timeval(ut);
+	acct.ac_stime = encode_timeval(st);
 
 	/* (3) The elapsed time the command ran (and its starting time) */
 	tmp = boottime;
@@ -337,20 +381,21 @@
 	acct.ac_btime = tmp.tv_sec;
 	microuptime(&tmp);
 	timevalsub(&tmp, &p->p_stats->p_start);
-	acct.ac_etime = encode_comp_t(tmp.tv_sec, tmp.tv_usec);
+	acct.ac_etime = encode_timeval(tmp);
 
 	/* (4) The average amount of memory used */
-	r = &p->p_stats->p_ru;
 	tmp = ut;
 	timevaladd(&tmp, &st);
+	/* Convert tmp (i.e. u + s) into hz units to match ru_i*. */
 	t = tmp.tv_sec * hz + tmp.tv_usec / tick;
 	if (t)
-		acct.ac_mem = (r->ru_ixrss + r->ru_idrss + r->ru_isrss) / t;
+		acct.ac_mem = encode_long((ru.ru_ixrss + ru.ru_idrss +
+		    ru.ru_isrss) / t);
 	else
 		acct.ac_mem = 0;
 
 	/* (5) The number of disk I/O operations done */
-	acct.ac_io = encode_comp_t(r->ru_inblock + r->ru_oublock, 0);
+	acct.ac_io = encode_long(ru.ru_inblock + ru.ru_oublock);
 
 	/* (6) The UID and GID of the process */
 	acct.ac_uid = p->p_ucred->cr_ruid;
@@ -365,9 +410,15 @@
 	SESS_UNLOCK(p->p_session);
 
 	/* (8) The boolean flags that tell how the process terminated, etc. */
-	acct.ac_flag = p->p_acflag;
+	acct.ac_flagx = p->p_acflag;
 	PROC_UNLOCK(p);
 
+	/* Setup ancillary structure fields. */
+	acct.ac_flagx |= ANVER;
+	acct.ac_zero = 0;
+	acct.ac_version = 2;
+	acct.ac_len = acct.ac_len2 = sizeof(acct);
+
 	/*
 	 * Eliminate any file size rlimit.
 	 */
@@ -393,44 +444,107 @@
 	return (ret);
 }
 
+/* FLOAT_CONVERSION_START (Regression testing; don't remove this line.) */
+
+/* Convert timevals and longs into IEEE-754 bit patterns. */
+
+/* Mantissa mask (MSB is implied, so subtract 1). */
+#define MANT_MASK ((1 << (FLT_MANT_DIG - 1)) - 1)
+
 /*
- * Encode_comp_t converts from ticks in seconds and microseconds
- * to ticks in 1/AHZ seconds.  The encoding is described in
- * Leffler, et al., on page 63.
+ * We calculate integer values to a precision of approximately
+ * 28 bits.
+ * This is high enough to fill the 24 mantissa bits of a float
+ * and low enough to avoid overflowing a 32-bit int.
  */
+#define CALC_BITS 28
 
-#define	MANTSIZE	13			/* 13 bit mantissa. */
-#define	EXPSIZE		3			/* Base 8 (3 bit) exponent. */
-#define	MAXFRACT	((1 << MANTSIZE) - 1)	/* Maximum fractional value. */
+/* log_2(1000000). */
+#define LOG2_1M 20
 
-static comp_t
-encode_comp_t(u_long s, u_long us)
+/*
+ * Convert the elements of a timeval into a 32-bit word holding
+ * the bits of a IEEE-754 float.
+ * The float value represents the timeval's value in microsecond units.
+ */
+static uint32_t
+encode_timeval(struct timeval tv)
 {
-	int exp, rnd;
-
-	exp = 0;
-	rnd = 0;
-	s *= AHZ;
-	s += us / (1000000 / AHZ);	/* Maximize precision. */
+	int log2_s;
+	int val, exp;	/* Unnormalized value and exponent */
+	int norm_exp;	/* Normalized exponent */
+	int shift;
 
-	while (s > MAXFRACT) {
-	rnd = s & (1 << (EXPSIZE - 1));	/* Round up? */
-		s >>= EXPSIZE;		/* Base 8 exponent == 3 bit shift. */
-		exp++;
+	/*
+	 * First calculate value and exponent to about CALC_BITS precision.
+	 * Note that the following conditionals have been ordered so that
+	 * the most common cases appear first.
+	 */
+	if (tv.tv_sec == 0) {
+		if (tv.tv_usec == 0)
+			return (0);
+		exp = 0;
+		val = tv.tv_usec;
+	} else {
+		/*
+		 * Calculate the value to a precision of approximately
+		 * CALC_BITS.
+		 */
+		log2_s = fls(tv.tv_sec) - 1;
+		if (log2_s + LOG2_1M < CALC_BITS) {
+			exp = 0;
+			val = 1000000 * tv.tv_sec + tv.tv_usec;
+		} else {
+			exp = log2_s + LOG2_1M - CALC_BITS;
+			val = (unsigned int)(((u_int64_t)1000000 * tv.tv_sec +
+			    tv.tv_usec) >> exp);
+		}
 	}
+	/* Now normalize and pack the value into an IEEE-754 float. */
+	norm_exp = fls(val) - 1;
+	shift = FLT_MANT_DIG - norm_exp - 1;
+#ifdef ACCT_DEBUG
+	printf("val=%d exp=%d shift=%d log2(val)=%d\n",
+	    val, exp, shift, norm_exp);
+	printf("exp=%x mant=%x\n", FLT_MAX_EXP - 1 + exp + norm_exp,
+	    ((shift > 0 ? (val << shift) : (val >> -shift)) & MANT_MASK));
+#endif
+	return (((FLT_MAX_EXP - 1 + exp + norm_exp) << (FLT_MANT_DIG - 1)) |
+	    ((shift > 0 ? val << shift : val >> -shift) & MANT_MASK));
+}
 
-	/* If we need to round up, do it (and handle overflow correctly). */
-	if (rnd && (++s > MAXFRACT)) {
-		s >>= EXPSIZE;
-		exp++;
-	}
+/*
+ * Convert a non-negative long value into the bit pattern of
+ * an IEEE-754 float value.
+ */
+static uint32_t
+encode_long(long val)
+{
+	int norm_exp;	/* Normalized exponent */
+	int shift;
 
-	/* Clean it up and polish it off. */
-	exp <<= MANTSIZE;		/* Shift the exponent into place */
-	exp += s;			/* and add on the mantissa. */
-	return (exp);
+	if (val == 0)
+		return (0);
+	if (val < 0) {
+		log(LOG_NOTICE,
+		    "encode_long: negative value %ld in accounting record\n",
+		    val);
+		val = LONG_MAX;
+	}
+	norm_exp = fls(val) - 1;
+	shift = FLT_MANT_DIG - norm_exp - 1;
+#ifdef ACCT_DEBUG
+	printf("val=%d shift=%d log2(val)=%d\n",
+	    val, shift, norm_exp);
+	printf("exp=%x mant=%x\n", FLT_MAX_EXP - 1 + exp + norm_exp,
+	    ((shift > 0 ? (val << shift) : (val >> -shift)) & MANT_MASK));
+#endif
+	return (((FLT_MAX_EXP - 1 + norm_exp) << (FLT_MANT_DIG - 1)) |
+	    ((shift > 0 ? val << shift : val >> -shift) & MANT_MASK));
 }
 
+/* FLOAT_CONVERSION_END (Regression testing; don't remove this line.) */
+
 /*
  * Periodically check the filesystem to see if accounting
  * should be turned on or off.  Beware the case where the vnode
@@ -503,9 +617,9 @@
 
 	/* This is a low-priority kernel thread. */
 	pri = PRI_MAX_KERN;
-	mtx_lock_spin(&sched_lock);
+	thread_lock(curthread);
 	sched_prio(curthread, pri);
-	mtx_unlock_spin(&sched_lock);
+	thread_unlock(curthread);
 
 	/* If another accounting kthread is already running, just die. */
 	sx_xlock(&acct_sx);
@@ -527,9 +641,8 @@
 		 * to exit.
 		 */
 		if (!(acct_state & ACCT_EXITREQ)) {
-			sx_xunlock(&acct_sx);
-			tsleep(&acct_state, pri, "-", acctchkfreq * hz);
-			sx_xlock(&acct_sx);
+			sx_sleep(&acct_state, &acct_sx, 0, "-",
+			    acctchkfreq * hz);
 		}
 	}
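
The replacement encode_timeval() and encode_long() above build the bit pattern of an IEEE-754 single by hand: locate the most significant bit with fls(), shift the value so that bit sits just above the 23 stored mantissa bits, and prepend the biased exponent. Below is a small userland sketch of the same packing for the encode_long() case, assuming inputs of at most 24 significant bits so truncation and round-to-nearest agree; the names and the comparison against a native float cast exist only for this sketch.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define SK_FLT_MANT_DIG	24	/* p for an IEEE-754 single */
#define SK_FLT_MAX_EXP	128	/* emax */
#define SK_MANT_MASK	((1 << (SK_FLT_MANT_DIG - 1)) - 1)

/* Local stand-in for fls(): 1-based index of the most significant set bit. */
static int
sk_fls(long v)
{
	int i = 0;

	while (v != 0) {
		v >>= 1;
		i++;
	}
	return (i);
}

/* Same packing idea as encode_long() in the diff, for non-negative input. */
static uint32_t
encode_long_sketch(long val)
{
	int norm_exp, shift;

	if (val == 0)
		return (0);
	norm_exp = sk_fls(val) - 1;		/* exponent of the MSB */
	shift = SK_FLT_MANT_DIG - norm_exp - 1;	/* align MSB with bit 23 */
	return (((SK_FLT_MAX_EXP - 1 + norm_exp) << (SK_FLT_MANT_DIG - 1)) |
	    ((shift > 0 ? val << shift : val >> -shift) & SK_MANT_MASK));
}

int
main(void)
{
	long samples[] = { 1, 5, 1000000, 16777215 };
	size_t i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		float f = (float)samples[i];
		uint32_t native;

		memcpy(&native, &f, sizeof(native));
		printf("%9ld  encoded=0x%08x  native=0x%08x\n", samples[i],
		    (unsigned)encode_long_sketch(samples[i]), (unsigned)native);
	}
	return (0);
}

Doing the packing with integer operations is the point: the accounting record gains float-style range and precision without the kernel ever executing a floating-point instruction.
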
 
Index: kern_uuid.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_uuid.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/kern_uuid.c -L sys/kern/kern_uuid.c -u -r1.1.1.1 -r1.2
--- sys/kern/kern_uuid.c
+++ sys/kern/kern_uuid.c
@@ -25,7 +25,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_uuid.c,v 1.8 2005/01/06 23:35:39 imp Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_uuid.c,v 1.13 2007/04/23 12:53:00 pjd Exp $");
 
 #include <sys/param.h>
 #include <sys/endian.h>
@@ -116,7 +116,7 @@
 /*
  * Get the current time as a 60 bit count of 100-nanosecond intervals
  * since 00:00:00.00, October 15,1582. We apply a magic offset to convert
- * the Unix time since 00:00:00.00, Januari 1, 1970 to the date of the
+ * the Unix time since 00:00:00.00, January 1, 1970 to the date of the
  * Gregorian reform to the Christian calendar.
  */
 static uint64_t
@@ -131,30 +131,12 @@
 	return (time & ((1LL << 60) - 1LL));
 }
 
-#ifndef _SYS_SYSPROTO_H_
-struct uuidgen_args {
-	struct uuid *store;
-	int	count;
-};
-#endif
-
-int
-uuidgen(struct thread *td, struct uuidgen_args *uap)
+struct uuid *
+kern_uuidgen(struct uuid *store, size_t count)
 {
 	struct uuid_private uuid;
 	uint64_t time;
-	int error;
-
-	/*
-	 * Limit the number of UUIDs that can be created at the same time
-	 * to some arbitrary number. This isn't really necessary, but I
-	 * like to have some sort of upper-bound that's less than 2G :-)
-	 * XXX needs to be tunable.
-	 */
-	if (uap->count < 1 || uap->count > 2048)
-		return (EINVAL);
-
-	/* XXX: pre-validate accessibility to the whole of the UUID store? */
+	size_t n;
 
 	mtx_lock(&uuid_mutex);
 
@@ -171,25 +153,52 @@
 		uuid.seq = uuid_last.seq;
 
 	uuid_last = uuid;
-	uuid_last.time.ll = (time + uap->count - 1) & ((1LL << 60) - 1LL);
+	uuid_last.time.ll = (time + count - 1) & ((1LL << 60) - 1LL);
 
 	mtx_unlock(&uuid_mutex);
 
 	/* Set sequence and variant and deal with byte order. */
 	uuid.seq = htobe16(uuid.seq | 0x8000);
 
-	/* XXX: this should copyout larger chunks at a time. */
-	do {
-		/* Set time and version (=1) and deal with byte order. */
+	for (n = 0; n < count; n++) {
+		/* Set time and version (=1). */
 		uuid.time.x.low = (uint32_t)time;
 		uuid.time.x.mid = (uint16_t)(time >> 32);
 		uuid.time.x.hi = ((uint16_t)(time >> 48) & 0xfff) | (1 << 12);
-		error = copyout(&uuid, uap->store, sizeof(uuid));
-		uap->store++;
-		uap->count--;
+		store[n] = *(struct uuid *)&uuid;
 		time++;
-	} while (uap->count > 0 && !error);
+	}
+
+	return (store);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct uuidgen_args {
+	struct uuid *store;
+	int	count;
+};
+#endif
+int
+uuidgen(struct thread *td, struct uuidgen_args *uap)
+{
+	struct uuid *store;
+	size_t count;
+	int error;
 
+	/*
+	 * Limit the number of UUIDs that can be created at the same time
+	 * to some arbitrary number. This isn't really necessary, but I
+	 * like to have some sort of upper-bound that's less than 2G :-)
+	 * XXX probably needs to be tunable.
+	 */
+	if (uap->count < 1 || uap->count > 2048)
+		return (EINVAL);
+
+	count = uap->count;
+	store = malloc(count * sizeof(struct uuid), M_TEMP, M_WAITOK);
+	kern_uuidgen(store, count);
+	error = copyout(store, uap->store, count * sizeof(struct uuid));
+	free(store, M_TEMP);
 	return (error);
 }
 
@@ -272,6 +281,7 @@
 	for (i = 0; i < _UUID_NODE_LEN; i++)
 		uuid->node[i] = p[10 + i];
 }
+
 void
 be_uuid_enc(void *buf, struct uuid const *uuid)
 {
@@ -303,3 +313,49 @@
 	for (i = 0; i < _UUID_NODE_LEN; i++)
 		uuid->node[i] = p[10 + i];
 }
+
+int
+parse_uuid(const char *str, struct uuid *uuid)
+{
+	u_int c[11];
+	int n;
+
+	/* An empty string represents a nil UUID. */
+	if (*str == '\0') {
+		bzero(uuid, sizeof(*uuid));
+		return (0);
+	}
+
+	/* The UUID string representation has a fixed length. */
+	if (strlen(str) != 36)
+		return (EINVAL);
+
+	/*
+	 * We only work with "new" UUIDs. New UUIDs have the form:
+	 *      01234567-89ab-cdef-0123-456789abcdef
+	 * The so called "old" UUIDs, which we don't support, have the form:
+	 *      0123456789ab.cd.ef.01.23.45.67.89.ab
+	 */
+	if (str[8] != '-')
+		return (EINVAL);
+
+	n = sscanf(str, "%8x-%4x-%4x-%2x%2x-%2x%2x%2x%2x%2x%2x", c + 0, c + 1,
+	    c + 2, c + 3, c + 4, c + 5, c + 6, c + 7, c + 8, c + 9, c + 10);
+	/* Make sure we have all conversions. */
+	if (n != 11)
+		return (EINVAL);
+
+	/* Successful scan. Build the UUID. */
+	uuid->time_low = c[0];
+	uuid->time_mid = c[1];
+	uuid->time_hi_and_version = c[2];
+	uuid->clock_seq_hi_and_reserved = c[3];
+	uuid->clock_seq_low = c[4];
+	for (n = 0; n < 6; n++)
+		uuid->node[n] = c[n + 5];
+
+	/* Check semantics... */
+	return (((c[3] & 0x80) != 0x00 &&		/* variant 0? */
+	    (c[3] & 0xc0) != 0x80 &&			/* variant 1? */
+	    (c[3] & 0xe0) != 0xc0) ? EINVAL : 0);	/* variant 2? */
+}
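
The new parse_uuid() accepts only the 36-character "new" text form, splits it with a fixed-width sscanf() and then sanity-checks the variant bits. A standalone userland sketch of the same approach is below; the struct and its field names are illustrative stand-ins, not the kernel's struct uuid.

#include <stdio.h>
#include <string.h>

struct toy_uuid {
	unsigned time_low, time_mid, time_hi_and_version;
	unsigned clock_seq_hi, clock_seq_low;
	unsigned char node[6];
};

static int
parse_uuid_sketch(const char *str, struct toy_uuid *u)
{
	unsigned c[11];
	int n;

	/* Fixed-length "new" form only: 01234567-89ab-cdef-0123-456789abcdef */
	if (strlen(str) != 36 || str[8] != '-')
		return (-1);
	n = sscanf(str, "%8x-%4x-%4x-%2x%2x-%2x%2x%2x%2x%2x%2x", &c[0], &c[1],
	    &c[2], &c[3], &c[4], &c[5], &c[6], &c[7], &c[8], &c[9], &c[10]);
	if (n != 11)
		return (-1);
	/* Variant check as in the diff: variants 0, 1 and 2 are accepted. */
	if ((c[3] & 0x80) != 0x00 && (c[3] & 0xc0) != 0x80 &&
	    (c[3] & 0xe0) != 0xc0)
		return (-1);
	u->time_low = c[0];
	u->time_mid = c[1];
	u->time_hi_and_version = c[2];
	u->clock_seq_hi = c[3];
	u->clock_seq_low = c[4];
	for (n = 0; n < 6; n++)
		u->node[n] = (unsigned char)c[n + 5];
	return (0);
}

int
main(void)
{
	struct toy_uuid u;

	if (parse_uuid_sketch("01234567-89ab-cdef-0123-456789abcdef", &u) == 0)
		printf("time_low=%08x version=%x\n", u.time_low,
		    u.time_hi_and_version >> 12);
	return (0);
}
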
Index: kern_exit.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_exit.c,v
retrieving revision 1.2
retrieving revision 1.3
diff -L sys/kern/kern_exit.c -L sys/kern/kern_exit.c -u -r1.2 -r1.3
--- sys/kern/kern_exit.c
+++ sys/kern/kern_exit.c
@@ -35,7 +35,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_exit.c,v 1.263.2.7 2006/03/18 23:36:21 davidxu Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_exit.c,v 1.304 2007/06/13 20:01:42 jhb Exp $");
 
 #include "opt_compat.h"
 #include "opt_ktrace.h"
@@ -56,20 +56,24 @@
 #include <sys/vmmeter.h>
 #include <sys/vnode.h>
 #include <sys/resourcevar.h>
+#include <sys/sbuf.h>
 #include <sys/signalvar.h>
 #include <sys/sched.h>
 #include <sys/sx.h>
 #include <sys/syscallsubr.h>
+#include <sys/syslog.h>
 #include <sys/ptrace.h>
 #include <sys/acct.h>		/* for acct_process() function prototype */
 #include <sys/filedesc.h>
-#include <sys/mac.h>
 #include <sys/shm.h>
 #include <sys/sem.h>
 #ifdef KTRACE
 #include <sys/ktrace.h>
 #endif
 
+#include <security/audit/audit.h>
+#include <security/mac/mac_framework.h>
+
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_param.h>
@@ -85,10 +89,7 @@
 void (*nlminfo_release_p)(struct proc *p);
 
 /*
- * exit --
- *	Death of process.
- *
- * MPSAFE
+ * exit -- death of process.
  */
 void
 sys_exit(struct thread *td, struct sys_exit_args *uap)
@@ -99,9 +100,9 @@
 }
 
 /*
- * Exit: deallocate address space and other resources, change proc state
- * to zombie, and unlink proc from allproc and parent's lists.  Save exit
- * status and rusage for wait().  Check for child processes and orphan them.
+ * Exit: deallocate address space and other resources, change proc state to
+ * zombie, and unlink proc from allproc and parent's lists.  Save exit status
+ * and rusage for wait().  Check for child processes and orphan them.
  */
 void
 exit1(struct thread *td, int rv)
@@ -109,14 +110,13 @@
 	struct proc *p, *nq, *q;
 	struct tty *tp;
 	struct vnode *ttyvp;
-	struct vmspace *vm;
 	struct vnode *vtmp;
 #ifdef KTRACE
 	struct vnode *tracevp;
 	struct ucred *tracecred;
 #endif
 	struct plimit *plim;
-	int locked, refcnt;
+	int locked;
 
 	/*
 	 * Drop Giant if caller has it.  Eventually we should warn about
@@ -169,7 +169,8 @@
 		 * Threading support has been turned off.
 		 */
 	}
-
+	KASSERT(p->p_numthreads == 1,
+	    ("exit1: proc %p exiting with %d threads", p, p->p_numthreads));
 	/*
 	 * Wakeup anyone in procfs' PIOCWAIT.  They should have a hold
 	 * on our vmspace, so we should block below until they have
@@ -193,7 +194,21 @@
 	 */
 	while (p->p_lock > 0)
 		msleep(&p->p_lock, &p->p_mtx, PWAIT, "exithold", 0);
+
 	PROC_UNLOCK(p);
+	/* Drain the limit callout while we don't have the proc locked */
+	callout_drain(&p->p_limco);
+
+#ifdef AUDIT
+	/*
+	 * The Sun BSM exit token contains two components: an exit status as
+	 * passed to exit(), and a return value to indicate what sort of exit
+	 * it was.  The exit status is WEXITSTATUS(rv), but it's not clear
+	 * what the return value is.
+	 */
+	AUDIT_ARG(exit, WEXITSTATUS(rv), 0);
+	AUDIT_SYSCALL_EXIT(0, td);
+#endif
 
 	/* Are we a task leader? */
 	if (p == p->p_leader) {
@@ -217,8 +232,6 @@
 	 */
 	EVENTHANDLER_INVOKE(process_exit, p);
 
-	MALLOC(p->p_ru, struct rusage *, sizeof(struct rusage),
-		M_ZOMBIE, M_WAITOK);
 	/*
 	 * If parent is waiting for us to exit or exec,
 	 * P_PPWAIT is set; we will wakeup the parent below.
@@ -226,8 +239,6 @@
 	PROC_LOCK(p);
 	stopprofclock(p);
 	p->p_flag &= ~(P_TRACED | P_PPWAIT);
-	SIGEMPTYSET(p->p_siglist);
-	SIGEMPTYSET(td->td_siglist);
 
 	/*
 	 * Stop the real interval timer.  If the handler is currently
@@ -246,9 +257,7 @@
 	 * Reset any sigio structures pointing to us as a result of
 	 * F_SETOWN with our pid.
 	 */
-	mtx_lock(&Giant);	/* XXX: not sure if needed */
 	funsetownlst(&p->p_sigiolst);
-	mtx_unlock(&Giant);	
 
 	/*
 	 * If this process has an nlminfo data area (for lockd), release it
@@ -282,42 +291,15 @@
 	}
 	mtx_unlock(&ppeers_lock);
 
-	/* The next two chunks should probably be moved to vmspace_exit. */
-	vm = p->p_vmspace;
-	/*
-	 * Release user portion of address space.
-	 * This releases references to vnodes,
-	 * which could cause I/O if the file has been unlinked.
-	 * Need to do this early enough that we can still sleep.
-	 * Can't free the entire vmspace as the kernel stack
-	 * may be mapped within that space also.
-	 *
-	 * Processes sharing the same vmspace may exit in one order, and
-	 * get cleaned up by vmspace_exit() in a different order.  The
-	 * last exiting process to reach this point releases as much of
-	 * the environment as it can, and the last process cleaned up
-	 * by vmspace_exit() (which decrements exitingcnt) cleans up the
-	 * remainder.
-	 */
-	atomic_add_int(&vm->vm_exitingcnt, 1);
-	do
-		refcnt = vm->vm_refcnt;
-	while (!atomic_cmpset_int(&vm->vm_refcnt, refcnt, refcnt - 1));
-	if (refcnt == 1) {
-		shmexit(vm);
-		pmap_remove_pages(vmspace_pmap(vm), vm_map_min(&vm->vm_map),
-		    vm_map_max(&vm->vm_map));
-		(void) vm_map_remove(&vm->vm_map, vm_map_min(&vm->vm_map),
-		    vm_map_max(&vm->vm_map));
-	}
+	vmspace_exit(td);
 
+	mtx_lock(&Giant);	/* XXX TTY */
 	sx_xlock(&proctree_lock);
 	if (SESS_LEADER(p)) {
 		struct session *sp;
 
 		sp = p->p_session;
 		if (sp->s_ttyvp) {
-			locked = VFS_LOCK_GIANT(sp->s_ttyvp->v_mount);
 			/*
 			 * Controlling process.
 			 * Signal foreground pgrp,
@@ -363,7 +345,6 @@
 			 * that the session once had a controlling terminal.
 			 * (for logging and informational purposes)
 			 */
-			VFS_UNLOCK_GIANT(locked);
 		}
 		SESS_LOCK(p->p_session);
 		sp->s_leader = NULL;
@@ -372,26 +353,35 @@
 	fixjobc(p, p->p_pgrp, 0);
 	sx_xunlock(&proctree_lock);
 	(void)acct_process(td);
+	mtx_unlock(&Giant);	
 #ifdef KTRACE
 	/*
-	 * release trace file
+	 * Disable tracing, then drain any pending records and release
+	 * the trace file.
 	 */
-	PROC_LOCK(p);
-	mtx_lock(&ktrace_mtx);
-	p->p_traceflag = 0;	/* don't trace the vrele() */
-	tracevp = p->p_tracevp;
-	p->p_tracevp = NULL;
-	tracecred = p->p_tracecred;
-	p->p_tracecred = NULL;
-	mtx_unlock(&ktrace_mtx);
-	PROC_UNLOCK(p);
-	if (tracevp != NULL) {
-		locked = VFS_LOCK_GIANT(tracevp->v_mount);
-		vrele(tracevp);
-		VFS_UNLOCK_GIANT(locked);
+	if (p->p_traceflag != 0) {
+		PROC_LOCK(p);
+		mtx_lock(&ktrace_mtx);
+		p->p_traceflag = 0;
+		mtx_unlock(&ktrace_mtx);
+		PROC_UNLOCK(p);
+		ktrprocexit(td);
+		PROC_LOCK(p);
+		mtx_lock(&ktrace_mtx);
+		tracevp = p->p_tracevp;
+		p->p_tracevp = NULL;
+		tracecred = p->p_tracecred;
+		p->p_tracecred = NULL;
+		mtx_unlock(&ktrace_mtx);
+		PROC_UNLOCK(p);
+		if (tracevp != NULL) {
+			locked = VFS_LOCK_GIANT(tracevp->v_mount);
+			vrele(tracevp);
+			VFS_UNLOCK_GIANT(locked);
+		}
+		if (tracecred != NULL)
+			crfree(tracecred);
 	}
-	if (tracecred != NULL)
-		crfree(tracecred);
 #endif
 	/*
 	 * Release reference to text vnode
@@ -422,6 +412,19 @@
 	LIST_REMOVE(p, p_hash);
 	sx_xunlock(&allproc_lock);
 
+	/*
+	 * Call machine-dependent code to release any
+	 * machine-dependent resources other than the address space.
+	 * The address space is released by "vmspace_exitfree(p)" in
+	 * vm_waitproc().
+	 */
+	cpu_exit(td);
+
+	WITNESS_WARN(WARN_PANIC, NULL, "process (pid %d) exiting", p->p_pid);
+
+	/*
+	 * Reparent all of our children to init.
+	 */
 	sx_xlock(&proctree_lock);
 	q = LIST_FIRST(&p->p_children);
 	if (q != NULL)		/* only need this if any child is S_ZOMB */
@@ -442,16 +445,10 @@
 		PROC_UNLOCK(q);
 	}
 
-	/*
-	 * Save exit status and finalize rusage info except for times,
-	 * adding in child rusage info.
-	 */
+	/* Save exit status. */
 	PROC_LOCK(p);
 	p->p_xstat = rv;
 	p->p_xthread = td;
-	p->p_stats->p_ru.ru_nvcsw++;
-	*p->p_ru = p->p_stats->p_ru;
-
 	/*
 	 * Notify interested parties of our demise.
 	 */
@@ -492,31 +489,21 @@
 
 	if (p->p_pptr == initproc)
 		psignal(p->p_pptr, SIGCHLD);
-	else if (p->p_sigparent != 0)
-		psignal(p->p_pptr, p->p_sigparent);
-	PROC_UNLOCK(p->p_pptr);
+	else if (p->p_sigparent != 0) {
+		if (p->p_sigparent == SIGCHLD)
+			childproc_exited(p);
+		else	/* LINUX thread */
+			psignal(p->p_pptr, p->p_sigparent);
+	}
+	sx_xunlock(&proctree_lock);
 
 	/*
-	 * If this is a kthread, then wakeup anyone waiting for it to exit.
+	 * The state PRS_ZOMBIE prevents other processes from sending
+	 * signals to the process; to avoid a memory leak, we free the
+	 * signal queue memory at the time when the state is set.
 	 */
-	if (p->p_flag & P_KTHREAD)
-		wakeup(p);
-	PROC_UNLOCK(p);
-
-	/*
-	 * Finally, call machine-dependent code to release the remaining
-	 * resources including address space.
-	 * The address space is released by "vmspace_exitfree(p)" in
-	 * vm_waitproc().
-	 */
-	cpu_exit(td);
-
-	WITNESS_WARN(WARN_PANIC, &proctree_lock.sx_object,
-	    "process (pid %d) exiting", p->p_pid);
-
-	PROC_LOCK(p);
-	PROC_LOCK(p->p_pptr);
-	sx_xunlock(&proctree_lock);
+	sigqueue_flush(&p->p_sigqueue);
+	sigqueue_flush(&td->td_sigqueue);
 
 	/*
 	 * We have to wait until after acquiring all locks before
@@ -529,12 +516,13 @@
 	 * proc lock.
 	 */
 	wakeup(p->p_pptr);
-	mtx_lock_spin(&sched_lock);
+	PROC_SLOCK(p->p_pptr);
+	sched_exit(p->p_pptr, td);
+	PROC_SUNLOCK(p->p_pptr);
+	PROC_SLOCK(p);
 	p->p_state = PRS_ZOMBIE;
 	PROC_UNLOCK(p->p_pptr);
 
-	sched_exit(p->p_pptr, td);
-
 	/*
 	 * Hopefully no one will try to deliver a signal to the process this
 	 * late in the game.
@@ -542,6 +530,11 @@
 	knlist_destroy(&p->p_klist);
 
 	/*
+	 * Save our children's rusage information in our exit rusage.
+	 */
+	ruadd(&p->p_ru, &p->p_rux, &p->p_stats->p_cru, &p->p_crux);
+
+	/*
 	 * Make sure the scheduler takes this thread out of its tables etc.
 	 * This will also release this thread's reference to the ucred.
 	 * Other thread parts to release include pcb bits and such.
@@ -549,11 +542,87 @@
 	thread_exit();
 }
 
+
+#ifndef _SYS_SYSPROTO_H_
+struct abort2_args {
+	char *why;
+	int nargs;
+	void **args;
+};
+#endif
+
+int
+abort2(struct thread *td, struct abort2_args *uap)
+{
+	struct proc *p = td->td_proc;
+	struct sbuf *sb;
+	void *uargs[16];
+	int error, i, sig;
+
+	error = 0;	/* satisfy compiler */
+
+	/*
+	 * Do it right now so we can log either a proper call of abort2() or
+	 * a note that an invalid argument was passed.  512 bytes is enough
+	 * to hold the descriptions of 16 arguments plus additional comments.
+	 */
+	sb = sbuf_new(NULL, NULL, 512, SBUF_FIXEDLEN);
+	sbuf_clear(sb);
+	sbuf_printf(sb, "%s(pid %d uid %d) aborted: ",
+	    p->p_comm, p->p_pid, td->td_ucred->cr_uid);
+	/* 
+	 * Since we can't return from abort2(), send SIGKILL in cases where
+	 * abort2() was called improperly.
+	 */
+	sig = SIGKILL;
+	/* Prevent DoS attacks from user space. */
+	if (uap->nargs < 0 || uap->nargs > 16)
+		goto out;
+	if (uap->args == NULL)
+		goto out;
+	error = copyin(uap->args, uargs, uap->nargs * sizeof(void *));
+	if (error != 0)
+		goto out;
+	/*
+	 * Limit the size of the 'reason' string to 128 bytes; it still fits
+	 * even when the maximum number of arguments is logged.
+	 */
+	if (uap->why != NULL) {
+		error = sbuf_copyin(sb, uap->why, 128);
+		if (error < 0)
+			goto out;
+	} else {
+		sbuf_printf(sb, "(null)");
+	}
+	if (uap->nargs) {
+		sbuf_printf(sb, "(");
+		for (i = 0; i < uap->nargs; i++)
+			sbuf_printf(sb, "%s%p", i == 0 ? "" : ", ", uargs[i]);
+		sbuf_printf(sb, ")");
+	}
+	/*
+	 * Final stage: the arguments were valid, the reason string was
+	 * copied in from userspace successfully, and copying the pointers
+	 * from user space succeeded.
+	 */
+	sig = SIGABRT;
+out:
+	if (sig == SIGKILL) {
+		sbuf_trim(sb);
+		sbuf_printf(sb, " (Reason text inaccessible)");
+	}
+	sbuf_cat(sb, "\n");
+	sbuf_finish(sb);
+	log(LOG_INFO, "%s", sbuf_data(sb));
+	sbuf_delete(sb);
+	exit1(td, W_EXITCODE(0, sig));
+	return (0);
+}
+
+
 #ifdef COMPAT_43
 /*
  * The dirty work is handled by kern_wait().
- *
- * MPSAFE.
  */
 int
 owait(struct thread *td, struct owait_args *uap __unused)
@@ -569,8 +638,6 @@
 
 /*
  * The dirty work is handled by kern_wait().
- *
- * MPSAFE.
  */
 int
 wait4(struct thread *td, struct wait_args *uap)
@@ -597,6 +664,8 @@
 	struct proc *p, *q, *t;
 	int error, nfound;
 
+	AUDIT_ARG(pid, pid);
+
 	q = td->td_proc;
 	if (pid == 0) {
 		PROC_LOCK(q);
@@ -640,28 +709,19 @@
 		}
 
 		nfound++;
+		PROC_SLOCK(p);
 		if (p->p_state == PRS_ZOMBIE) {
-
-			/*
-			 * It is possible that the last thread of this
-			 * process is still running on another CPU
-			 * in thread_exit() after having dropped the process
-			 * lock via PROC_UNLOCK() but before it has completed
-			 * cpu_throw().  In that case, the other thread must
-			 * still hold sched_lock, so simply by acquiring
-			 * sched_lock once we will wait long enough for the
-			 * thread to exit in that case.
-			 */
-			mtx_lock_spin(&sched_lock);
-			mtx_unlock_spin(&sched_lock);
-			
-			td->td_retval[0] = p->p_pid;
-			if (status)
-				*status = p->p_xstat;	/* convert to int */
 			if (rusage) {
-				*rusage = *p->p_ru;
+				*rusage = p->p_ru;
 				calcru(p, &rusage->ru_utime, &rusage->ru_stime);
 			}
+			PROC_SUNLOCK(p);
+			td->td_retval[0] = p->p_pid;
+			if (status)
+				*status = p->p_xstat;	/* convert to int */
+			PROC_LOCK(q);
+			sigqueue_take(p->p_ksi);
+			PROC_UNLOCK(q);
 
 			/*
 			 * If we got the child via a ptrace 'attach',
@@ -673,7 +733,7 @@
 				p->p_oppid = 0;
 				proc_reparent(p, t);
 				PROC_UNLOCK(p);
-				psignal(t, SIGCHLD);
+				tdsignal(t, NULL, SIGCHLD, p->p_ksi);
 				wakeup(t);
 				PROC_UNLOCK(t);
 				sx_xunlock(&proctree_lock);
@@ -700,11 +760,9 @@
 			p->p_xstat = 0;		/* XXX: why? */
 			PROC_UNLOCK(p);
 			PROC_LOCK(q);
-			ruadd(&q->p_stats->p_cru, &q->p_crux, p->p_ru,
+			ruadd(&q->p_stats->p_cru, &q->p_crux, &p->p_ru,
 			    &p->p_rux);
 			PROC_UNLOCK(q);
-			FREE(p->p_ru, M_ZOMBIE);
-			p->p_ru = NULL;
 
 			/*
 			 * Decrement the count of procs running with this uid.
@@ -743,25 +801,33 @@
 			sx_xunlock(&allproc_lock);
 			return (0);
 		}
-		mtx_lock_spin(&sched_lock);
 		if ((p->p_flag & P_STOPPED_SIG) &&
 		    (p->p_suspcount == p->p_numthreads) &&
 		    (p->p_flag & P_WAITED) == 0 &&
 		    (p->p_flag & P_TRACED || options & WUNTRACED)) {
-			mtx_unlock_spin(&sched_lock);
+			PROC_SUNLOCK(p);
 			p->p_flag |= P_WAITED;
 			sx_xunlock(&proctree_lock);
 			td->td_retval[0] = p->p_pid;
 			if (status)
 				*status = W_STOPCODE(p->p_xstat);
+
+			PROC_LOCK(q);
+			sigqueue_take(p->p_ksi);
+			PROC_UNLOCK(q);
 			PROC_UNLOCK(p);
+
 			return (0);
 		}
-		mtx_unlock_spin(&sched_lock);
+		PROC_SUNLOCK(p);
 		if (options & WCONTINUED && (p->p_flag & P_CONTINUED)) {
 			sx_xunlock(&proctree_lock);
 			td->td_retval[0] = p->p_pid;
 			p->p_flag &= ~P_CONTINUED;
+
+			PROC_LOCK(q);
+			sigqueue_take(p->p_ksi);
+			PROC_UNLOCK(q);
 			PROC_UNLOCK(p);
 
 			if (status)
@@ -805,6 +871,9 @@
 	if (child->p_pptr == parent)
 		return;
 
+	PROC_LOCK(child->p_pptr);
+	sigqueue_take(child->p_ksi);
+	PROC_UNLOCK(child->p_pptr);
 	LIST_REMOVE(child, p_sibling);
 	LIST_INSERT_HEAD(&parent->p_children, child, p_sibling);
 	child->p_pptr = parent;
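
For context on the kern_exit.c changes above: the new abort2() system call lets a process terminate itself with a short reason string and up to 16 pointer arguments logged to the kernel message buffer.  A minimal userland sketch follows, assuming the <stdlib.h> prototype (void abort2(const char *why, int nargs, void **args)); the surrounding function and its arguments are purely illustrative.

	#include <stdlib.h>

	static void
	fatal_state(void *ctx, void *detail)
	{
		void *args[2];

		args[0] = ctx;
		args[1] = detail;
		/* Logged as: prog(pid N uid M) aborted: invariant violated(ctx, detail) */
		abort2("invariant violated", 2, args);
		/* NOTREACHED: abort2() delivers SIGABRT (or SIGKILL on misuse). */
	}
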
Index: subr_disk.c
===================================================================
RCS file: /home/cvs/src/sys/kern/subr_disk.c,v
retrieving revision 1.1.1.2
retrieving revision 1.2
diff -L sys/kern/subr_disk.c -L sys/kern/subr_disk.c -u -r1.1.1.2 -r1.2
--- sys/kern/subr_disk.c
+++ sys/kern/subr_disk.c
@@ -8,7 +8,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/subr_disk.c,v 1.85.2.1 2006/02/14 03:29:31 rwatson Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/subr_disk.c,v 1.88 2006/10/31 21:11:21 pjd Exp $");
 
 #include "opt_geom.h"
 
@@ -43,6 +43,7 @@
 	case BIO_WRITE:		printf("cmd=write "); break;
 	case BIO_DELETE:	printf("cmd=delete "); break;
 	case BIO_GETATTR:	printf("cmd=getattr "); break;
+	case BIO_FLUSH:		printf("cmd=flush "); break;
 	default:		printf("cmd=%x ", bp->bio_cmd); break;
 	}
 	sn = bp->bio_pblkno;
@@ -99,7 +100,7 @@
 bioq_insert_head(struct bio_queue_head *head, struct bio *bp)
 {
 
-	if (TAILQ_FIRST(&head->queue) == NULL)
+	if (TAILQ_EMPTY(&head->queue))
 		head->insert_point = bp;
 	TAILQ_INSERT_HEAD(&head->queue, bp, bio_queue);
 }
@@ -108,7 +109,7 @@
 bioq_insert_tail(struct bio_queue_head *head, struct bio *bp)
 {
 
-	if (TAILQ_FIRST(&head->queue) == NULL)
+	if (TAILQ_EMPTY(&head->queue))
 		head->insert_point = bp;
 	TAILQ_INSERT_TAIL(&head->queue, bp, bio_queue);
 }
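
The TAILQ_EMPTY() change above is purely cosmetic, but for context, disk drivers normally drive these bioq_* helpers from their strategy routine and a worker loop.  A minimal sketch under the usual bioq API (bioq_insert_tail/bioq_first/bioq_remove); the foo_softc layout and its queue mutex are hypothetical.

	static void
	foo_strategy(struct bio *bp)
	{
		struct foo_softc *sc = bp->bio_disk->d_drv1;	/* hypothetical softc */

		mtx_lock(&sc->sc_queue_mtx);
		bioq_insert_tail(&sc->sc_queue, bp);	/* FIFO; insert_head would jump the queue */
		mtx_unlock(&sc->sc_queue_mtx);
		wakeup(sc);		/* wake the worker that msleep()s on sc */
	}

	/* Worker side: pull the oldest request and complete it. */
	mtx_lock(&sc->sc_queue_mtx);
	while ((bp = bioq_first(&sc->sc_queue)) != NULL) {
		bioq_remove(&sc->sc_queue, bp);
		mtx_unlock(&sc->sc_queue_mtx);
		/* ... perform the transfer, then biodone(bp) ... */
		mtx_lock(&sc->sc_queue_mtx);
	}
	mtx_unlock(&sc->sc_queue_mtx);
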
--- /dev/null
+++ sys/kern/subr_rtc.c
@@ -0,0 +1,166 @@
+/*-
+ * Copyright (c) 1988 University of Utah.
+ * Copyright (c) 1982, 1990, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * the Systems Programming Group of the University of Utah Computer
+ * Science Department.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *	from: Utah $Hdr: clock.c 1.18 91/01/21$
+ *	from: @(#)clock.c	8.2 (Berkeley) 1/12/94
+ *	from: NetBSD: clock_subr.c,v 1.6 2001/07/07 17:04:02 thorpej Exp
+ *	and
+ *	from: src/sys/i386/isa/clock.c,v 1.176 2001/09/04
+ */
+
+/*
+ * Helpers for time-of-day clocks. This is useful for architectures that need
+ * to support multiple models of such clocks, and generally serves to make the
+ * code more machine-independent.
+ * If the clock in question can also be used as a time counter, the driver
+ * needs to initiate this.
+ * This code is not yet used by all architectures.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: src/sys/kern/subr_rtc.c,v 1.9 2006/10/02 18:23:37 phk Exp $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/bus.h>
+#include <sys/clock.h>
+#include <sys/sysctl.h>
+#include <sys/timetc.h>
+
+#include "clock_if.h"
+
+static device_t clock_dev = NULL;
+static long clock_res;
+
+void
+clock_register(device_t dev, long res)	/* res has units of microseconds */
+{
+
+	if (clock_dev != NULL) {
+		if (clock_res > res) {
+			if (bootverbose) {
+				device_printf(dev, "not installed as "
+				    "time-of-day clock: clock %s has higher "
+				    "resolution\n", device_get_name(clock_dev));
+			}
+			return;
+		} else {
+			if (bootverbose) {
+				device_printf(clock_dev, "removed as "
+				    "time-of-day clock: clock %s has higher "
+				    "resolution\n", device_get_name(dev));
+			}
+		}
+	}
+	clock_dev = dev;
+	clock_res = res;
+	if (bootverbose) {
+		device_printf(dev, "registered as a time-of-day clock "
+		    "(resolution %ldus)\n", res);
+	}
+}
+
+/*
+ * inittodr and settodr derived from the i386 versions written
+ * by Christoph Robitschko <chmr at edvz.tu-graz.ac.at>,  reintroduced and
+ * updated by Chris Stenton <chris at gnome.co.uk> 8/10/94
+ */
+
+/*
+ * Initialize the time of day, based on a reference time base (e.g. a
+ * timestamp taken from a filesystem).
+ */
+void
+inittodr(time_t base)
+{
+	struct timespec diff, ref, ts;
+	int error;
+
+	if (base) {
+		ref.tv_sec = base;
+		ref.tv_nsec = 0;
+		tc_setclock(&ref);
+	}
+
+	if (clock_dev == NULL) {
+		printf("warning: no time-of-day clock registered, system time "
+		    "will not be set accurately\n");
+		return;
+	}
+	error = CLOCK_GETTIME(clock_dev, &ts);
+	if (error != 0 && error != EINVAL) {
+		printf("warning: clock_gettime failed (%d), the system time "
+		    "will not be set accurately\n", error);
+		return;
+	}
+	if (error == EINVAL || ts.tv_sec < 0) {
+		printf("Invalid time in real time clock.\n");
+		printf("Check and reset the date immediately!\n");
+	}
+
+	ts.tv_sec += utc_offset();
+
+	if (timespeccmp(&ref, &ts, >)) {
+		diff = ref;
+		timespecsub(&diff, &ts);
+	} else {
+		diff = ts;
+		timespecsub(&diff, &ref);
+	}
+	if (ts.tv_sec >= 2) {
+		/* badly off, adjust it */
+		tc_setclock(&ts);
+	}
+}
+
+/*
+ * Write system time back to RTC
+ */
+void
+resettodr()
+{
+	struct timespec ts;
+	int error;
+
+	if (disable_rtc_set || clock_dev == NULL)
+		return;
+
+	getnanotime(&ts);
+	ts.tv_sec -= utc_offset();
+	if ((error = CLOCK_SETTIME(clock_dev, &ts)) != 0) {
+		printf("warning: clock_settime failed (%d), time-of-day clock "
+		    "not adjusted to system time\n", error);
+		return;
+	}
+}
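
The new subr_rtc.c expects battery-backed clock drivers to implement the clock_if methods and announce themselves via clock_register().  A minimal sketch of the driver side, assuming a hypothetical "foo_rtc" device whose hardware counts whole seconds (so the registered resolution is 1000000 us); foo_rtc_read_seconds() and foo_rtc_write_seconds() are illustrative helpers, not real APIs.

	static int
	foo_rtc_attach(device_t dev)
	{
		/* ... map and initialize the hardware ... */
		clock_register(dev, 1000000);	/* resolution in microseconds */
		return (0);
	}

	static int
	foo_rtc_gettime(device_t dev, struct timespec *ts)
	{
		ts->tv_sec = foo_rtc_read_seconds(dev);	/* hypothetical hardware read */
		ts->tv_nsec = 0;
		return (0);	/* return EINVAL if the clock has not been set */
	}

	static int
	foo_rtc_settime(device_t dev, struct timespec *ts)
	{
		return (foo_rtc_write_seconds(dev, ts->tv_sec));	/* hypothetical */
	}

inittodr() then picks the registered device with the best resolution at boot, and resettodr() writes the system time back on shutdown or clock adjustment.
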
Index: kern_subr.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_subr.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/kern_subr.c -L sys/kern/kern_subr.c -u -r1.1.1.1 -r1.2
--- sys/kern/kern_subr.c
+++ sys/kern/kern_subr.c
@@ -35,7 +35,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_subr.c,v 1.96 2005/01/06 23:35:39 imp Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_subr.c,v 1.103 2007/06/05 00:00:54 jeff Exp $");
 
 #include "opt_zero.h"
 
@@ -105,9 +105,9 @@
 	VM_OBJECT_LOCK(uobject);
 retry:
 	if ((user_pg = vm_page_lookup(uobject, upindex)) != NULL) {
-		vm_page_lock_queues();
-		if (vm_page_sleep_if_busy(user_pg, 1, "vm_pgmoveco"))
+		if (vm_page_sleep_if_busy(user_pg, TRUE, "vm_pgmoveco"))
 			goto retry;
+		vm_page_lock_queues();
 		pmap_remove_all(user_pg);
 		vm_page_free(user_pg);
 	} else {
@@ -358,10 +358,11 @@
 }
 
 /*
- * General routine to allocate a hash table.
+ * General routine to allocate a hash table with control of memory flags.
  */
 void *
-hashinit(int elements, struct malloc_type *type, u_long *hashmask)
+hashinit_flags(int elements, struct malloc_type *type, u_long *hashmask,
+    int flags)
 {
 	long hashsize;
 	LIST_HEAD(generic, generic) *hashtbl;
@@ -369,16 +370,40 @@
 
 	if (elements <= 0)
 		panic("hashinit: bad elements");
+
+	/* Exactly one of HASH_WAITOK and HASH_NOWAIT must be set. */
+	KASSERT((flags & HASH_WAITOK) ^ (flags & HASH_NOWAIT),
+	    ("Bad flags (0x%x) passed to hashinit_flags", flags));
+
 	for (hashsize = 1; hashsize <= elements; hashsize <<= 1)
 		continue;
 	hashsize >>= 1;
-	hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), type, M_WAITOK);
-	for (i = 0; i < hashsize; i++)
-		LIST_INIT(&hashtbl[i]);
-	*hashmask = hashsize - 1;
+
+	if (flags & HASH_NOWAIT)
+		hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl),
+		    type, M_NOWAIT);
+	else
+		hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl),
+		    type, M_WAITOK);
+
+	if (hashtbl != NULL) {
+		for (i = 0; i < hashsize; i++)
+			LIST_INIT(&hashtbl[i]);
+		*hashmask = hashsize - 1;
+	}
 	return (hashtbl);
 }
 
+/*
+ * Allocate and initialize a hash table with default flag: may sleep.
+ */
+void *
+hashinit(int elements, struct malloc_type *type, u_long *hashmask)
+{
+
+	return (hashinit_flags(elements, type, hashmask, HASH_WAITOK));
+}
+
 void
 hashdestroy(void *vhashtbl, struct malloc_type *type, u_long hashmask)
 {
@@ -428,11 +453,11 @@
 	struct thread *td;
 
 	td = curthread;
-	mtx_lock_spin(&sched_lock);
 	DROP_GIANT();
-	sched_prio(td, td->td_ksegrp->kg_user_pri); /* XXXKSE */
+	thread_lock(td);
+	sched_prio(td, td->td_user_pri);
 	mi_switch(SW_INVOL, NULL);
-	mtx_unlock_spin(&sched_lock);
+	thread_unlock(td);
 	PICKUP_GIANT();
 }
 
--- /dev/null
+++ sys/kern/kern_priv.c
@@ -0,0 +1,155 @@
+/*-
+ * Copyright (c) 2006 nCircle Network Security, Inc.
+ * All rights reserved.
+ *
+ * This software was developed by Robert N. M. Watson for the TrustedBSD
+ * Project under contract to nCircle Network Security, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR, NCIRCLE NETWORK SECURITY,
+ * INC., OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD: src/sys/kern/kern_priv.c,v 1.4 2007/07/02 14:03:29 rwatson Exp $
+ */
+
+#include "opt_mac.h"
+
+#include <sys/param.h>
+#include <sys/jail.h>
+#include <sys/kernel.h>
+#include <sys/priv.h>
+#include <sys/proc.h>
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+
+#include <security/mac/mac_framework.h>
+
+/*
+ * `suser_enabled' (which can be set by the security.bsd.suser_enabled
+ * sysctl) determines whether the system 'super-user' policy is in effect.  If
+ * it is nonzero, an effective uid of 0 connotes special privilege,
+ * overriding many mandatory and discretionary protections.  If it is zero,
+ * uid 0 is offered no special privilege in the kernel security policy.
+ * Setting it to zero may seriously impact the functionality of many existing
+ * userland programs, and should not be done without careful consideration of
+ * the consequences. 
+ */
+static int	suser_enabled = 1;
+SYSCTL_INT(_security_bsd, OID_AUTO, suser_enabled, CTLFLAG_RW,
+    &suser_enabled, 0, "processes with uid 0 have privilege");
+TUNABLE_INT("security.bsd.suser_enabled", &suser_enabled);
+
+/*
+ * Check a credential for privilege.  Lots of good reasons to deny privilege;
+ * only a few to grant it.
+ */
+int
+priv_check_cred(struct ucred *cred, int priv, int flags)
+{
+	int error;
+
+	KASSERT(PRIV_VALID(priv), ("priv_check_cred: invalid privilege %d",
+	    priv));
+
+	/*
+	 * We first evaluate policies that may deny the granting of
+	 * privilege unilaterally.
+	 */
+#ifdef MAC
+	error = mac_priv_check(cred, priv);
+	if (error)
+		return (error);
+#endif
+
+	/*
+	 * Jail policy will restrict certain privileges that may otherwise
+	 * be granted.
+	 */
+	error = prison_priv_check(cred, priv);
+	if (error)
+		return (error);
+
+	/*
+	 * Having determined if privilege is restricted by various policies,
+	 * now determine if privilege is granted.  At this point, any policy
+	 * may grant privilege.  For now, we allow short-circuit boolean
+	 * evaluation, so we may not call all policies.  Perhaps we should.
+	 *
+	 * Superuser policy grants privilege based on the effective (or in
+	 * the case of specific privileges, real) uid being 0.  We allow the
+	 * superuser policy to be globally disabled, although this is
+	 * currently of limited utility.
+	 */
+	if (suser_enabled) {
+		switch (priv) {
+		case PRIV_MAXFILES:
+		case PRIV_MAXPROC:
+		case PRIV_PROC_LIMIT:
+			if (cred->cr_ruid == 0)
+				return (0);
+			break;
+
+		default:
+			if (cred->cr_uid == 0)
+				return (0);
+			break;
+		}
+	}
+
+	/*
+	 * Now check with MAC, if enabled, to see if a policy module grants
+	 * privilege.
+	 */
+#ifdef MAC
+	if (mac_priv_grant(cred, priv) == 0)
+		return (0);
+#endif
+	return (EPERM);
+}
+
+int
+priv_check(struct thread *td, int priv)
+{
+
+	KASSERT(td == curthread, ("priv_check: td != curthread"));
+
+	return (priv_check_cred(td->td_ucred, priv, 0));
+}
+
+/*
+ * Historical suser() wrapper functions, which now simply request PRIV_ROOT.
+ * These will be removed in the near future, and exist solely because
+ * the kernel and modules are not yet fully adapted to the new model.
+ */
+int
+suser_cred(struct ucred *cred, int flags)
+{
+
+	return (priv_check_cred(cred, PRIV_ROOT, flags));
+}
+
+int
+suser(struct thread *td)
+{
+
+	KASSERT(td == curthread, ("suser: td != curthread"));
+
+	return (suser_cred(td->td_ucred, 0));
+}
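
The new kern_priv.c replaces blanket suser() checks with named privileges.  A typical consumer now asks for a specific privilege rather than "is this uid 0", letting jail and MAC policies veto or grant it per privilege.  A minimal sketch; PRIV_VFS_MOUNT is one of the privileges defined in sys/priv.h, while the enclosing function is hypothetical.

	static int
	foo_mount_something(struct thread *td)
	{
		int error;

		/* Named privilege check; MAC and jail policies are consulted first. */
		error = priv_check(td, PRIV_VFS_MOUNT);
		if (error != 0)
			return (error);
		/* ... privileged work ... */
		return (0);
	}
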
Index: vfs_export.c
===================================================================
RCS file: /home/cvs/src/sys/kern/vfs_export.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/vfs_export.c -L sys/kern/vfs_export.c -u -r1.1.1.1 -r1.2
--- sys/kern/vfs_export.c
+++ sys/kern/vfs_export.c
@@ -35,7 +35,7 @@
  */
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/vfs_export.c,v 1.333 2005/05/11 18:25:42 kan Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/vfs_export.c,v 1.341 2007/02/15 22:08:35 pjd Exp $");
 
 #include <sys/param.h>
 #include <sys/dirent.h>
@@ -46,13 +46,14 @@
 #include <sys/mbuf.h>
 #include <sys/mount.h>
 #include <sys/mutex.h>
+#include <sys/refcount.h>
 #include <sys/socket.h>
 #include <sys/systm.h>
 #include <sys/vnode.h>
 
 #include <net/radix.h>
 
-static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure");
+static MALLOC_DEFINE(M_NETADDR, "export_host", "Export host address structure");
 
 static void	vfs_free_addrlist(struct netexport *nep);
 static int	vfs_free_netcred(struct radix_node *rn, void *w);
@@ -82,10 +83,8 @@
  * Called by ufs_mount() to set up the lists of export addresses.
  */
 static int
-vfs_hang_addrlist(mp, nep, argp)
-	struct mount *mp;
-	struct netexport *nep;
-	struct export_args *argp;
+vfs_hang_addrlist(struct mount *mp, struct netexport *nep,
+    struct export_args *argp)
 {
 	register struct netcred *np;
 	register struct radix_node_head *rnh;
@@ -102,12 +101,18 @@
 	 * with fields like cr_uidinfo and cr_prison?  Currently, this
 	 * routine does not touch them (leaves them as NULL).
 	 */
-	if (argp->ex_anon.cr_version != XUCRED_VERSION)
+	if (argp->ex_anon.cr_version != XUCRED_VERSION) {
+		vfs_mount_error(mp, "ex_anon.cr_version: %d != %d",
+		    argp->ex_anon.cr_version, XUCRED_VERSION);
 		return (EINVAL);
+	}
 
 	if (argp->ex_addrlen == 0) {
-		if (mp->mnt_flag & MNT_DEFEXPORTED)
+		if (mp->mnt_flag & MNT_DEFEXPORTED) {
+			vfs_mount_error(mp,
+			    "MNT_DEFEXPORTED already set for mount %p", mp);
 			return (EPERM);
+		}
 		np = &nep->ne_defexported;
 		np->netc_exflags = argp->ex_flags;
 		bzero(&np->netc_anon, sizeof(np->netc_anon));
@@ -115,14 +120,19 @@
 		np->netc_anon.cr_ngroups = argp->ex_anon.cr_ngroups;
 		bcopy(argp->ex_anon.cr_groups, np->netc_anon.cr_groups,
 		    sizeof(np->netc_anon.cr_groups));
-		np->netc_anon.cr_ref = 1;
+		refcount_init(&np->netc_anon.cr_ref, 1);
+		MNT_ILOCK(mp);
 		mp->mnt_flag |= MNT_DEFEXPORTED;
+		MNT_IUNLOCK(mp);
 		return (0);
 	}
 
 #if MSIZE <= 256
-	if (argp->ex_addrlen > MLEN)
+	if (argp->ex_addrlen > MLEN) {
+		vfs_mount_error(mp, "ex_addrlen %d is greater than %d",
+		    argp->ex_addrlen, MLEN);
 		return (EINVAL);
+	}
 #endif
 
 	i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
@@ -130,8 +140,9 @@
 	saddr = (struct sockaddr *) (np + 1);
 	if ((error = copyin(argp->ex_addr, saddr, argp->ex_addrlen)))
 		goto out;
-	if (saddr->sa_family > AF_MAX) {
+	if (saddr->sa_family == AF_UNSPEC || saddr->sa_family > AF_MAX) {
 		error = EINVAL;
+		vfs_mount_error(mp, "Invalid saddr->sa_family: %d");
 		goto out;
 	}
 	if (saddr->sa_len > argp->ex_addrlen)
@@ -158,6 +169,9 @@
 			}
 		if ((rnh = nep->ne_rtable[i]) == NULL) {
 			error = ENOBUFS;
+			vfs_mount_error(mp, "%s %s %d",
+			    "Unable to initialize radix node head ",
+			    "for address family", i);
 			goto out;
 		}
 	}
@@ -166,6 +180,8 @@
 	RADIX_NODE_HEAD_UNLOCK(rnh);
 	if (rn == NULL || np != (struct netcred *)rn) {	/* already exists */
 		error = EPERM;
+		vfs_mount_error(mp, "Invalid radix node head, rn: %p %p",
+		    rn, np);
 		goto out;
 	}
 	np->netc_exflags = argp->ex_flags;
@@ -174,7 +190,7 @@
 	np->netc_anon.cr_ngroups = argp->ex_anon.cr_ngroups;
 	bcopy(argp->ex_anon.cr_groups, np->netc_anon.cr_groups,
 	    sizeof(np->netc_anon.cr_groups));
-	np->netc_anon.cr_ref = 1;
+	refcount_init(&np->netc_anon.cr_ref, 1);
 	return (0);
 out:
 	free(np, M_NETADDR);
@@ -184,9 +200,7 @@
 /* Helper for vfs_free_addrlist. */
 /* ARGSUSED */
 static int
-vfs_free_netcred(rn, w)
-	struct radix_node *rn;
-	void *w;
+vfs_free_netcred(struct radix_node *rn, void *w)
 {
 	register struct radix_node_head *rnh = (struct radix_node_head *) w;
 
@@ -199,8 +213,7 @@
  * Free the net address hash lists that are hanging off the mount points.
  */
 static void
-vfs_free_addrlist(nep)
-	struct netexport *nep;
+vfs_free_addrlist(struct netexport *nep)
 {
 	register int i;
 	register struct radix_node_head *rnh;
@@ -222,26 +235,31 @@
  * the structure is described in sys/mount.h
  */
 int
-vfs_export(mp, argp)
-	struct mount *mp;
-	struct export_args *argp;
+vfs_export(struct mount *mp, struct export_args *argp)
 {
 	struct netexport *nep;
 	int error;
 
 	nep = mp->mnt_export;
+	error = 0;
 	if (argp->ex_flags & MNT_DELEXPORT) {
-		if (nep == NULL)
-			return (ENOENT);
+		if (nep == NULL) {
+			error = ENOENT;
+			goto out;
+		}
 		if (mp->mnt_flag & MNT_EXPUBLIC) {
 			vfs_setpublicfs(NULL, NULL, NULL);
+			MNT_ILOCK(mp);
 			mp->mnt_flag &= ~MNT_EXPUBLIC;
+			MNT_IUNLOCK(mp);
 		}
 		vfs_free_addrlist(nep);
 		mp->mnt_export = NULL;
 		free(nep, M_MOUNT);
 		nep = NULL;
+		MNT_ILOCK(mp);
 		mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
+		MNT_IUNLOCK(mp);
 	}
 	if (argp->ex_flags & MNT_EXPORTED) {
 		if (nep == NULL) {
@@ -250,14 +268,30 @@
 		}
 		if (argp->ex_flags & MNT_EXPUBLIC) {
 			if ((error = vfs_setpublicfs(mp, nep, argp)) != 0)
-				return (error);
+				goto out;
+			MNT_ILOCK(mp);
 			mp->mnt_flag |= MNT_EXPUBLIC;
+			MNT_IUNLOCK(mp);
 		}
 		if ((error = vfs_hang_addrlist(mp, nep, argp)))
-			return (error);
+			goto out;
+		MNT_ILOCK(mp);
 		mp->mnt_flag |= MNT_EXPORTED;
+		MNT_IUNLOCK(mp);
 	}
-	return (0);
+
+out:
+	/*
+	 * Once we have executed the vfs_export() command, we do
+	 * not want to keep the "export" option around in the
+	 * options list, since that will cause subsequent MNT_UPDATE
+	 * calls to fail.  The export information is saved in
+	 * mp->mnt_export, so we can safely delete the "export" mount option
+	 * here.
+	 */
+	vfs_deleteopt(mp->mnt_optnew, "export");
+	vfs_deleteopt(mp->mnt_opt, "export");
+	return (error);
 }
 
 /*
@@ -265,10 +299,8 @@
  * one public filesystem is possible in the spec (RFC 2054 and 2055)
  */
 int
-vfs_setpublicfs(mp, nep, argp)
-	struct mount *mp;
-	struct netexport *nep;
-	struct export_args *argp;
+vfs_setpublicfs(struct mount *mp, struct netexport *nep,
+    struct export_args *argp)
 {
 	int error;
 	struct vnode *rvp;
@@ -305,7 +337,7 @@
 	if ((error = VFS_ROOT(mp, LK_EXCLUSIVE, &rvp, curthread /* XXX */)))
 		return (error);
 
-	if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid)))
+	if ((error = VOP_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid)))
 		return (error);
 
 	vput(rvp);
@@ -393,11 +425,8 @@
  */
 
 int 
-vfs_stdcheckexp(mp, nam, extflagsp, credanonp)
-	struct mount *mp;
-	struct sockaddr *nam;
-	int *extflagsp;
-	struct ucred **credanonp;
+vfs_stdcheckexp(struct mount *mp, struct sockaddr *nam, int *extflagsp,
+    struct ucred **credanonp)
 {
 	struct netcred *np;
 
Index: kern_switch.c
===================================================================
RCS file: /home/cvs/src/sys/kern/kern_switch.c,v
retrieving revision 1.1.1.1
retrieving revision 1.2
diff -L sys/kern/kern_switch.c -L sys/kern/kern_switch.c -u -r1.1.1.1 -r1.2
--- sys/kern/kern_switch.c
+++ sys/kern/kern_switch.c
@@ -24,69 +24,9 @@
  * SUCH DAMAGE.
  */
 
-/***
-Here is the logic..
-
-If there are N processors, then there are at most N KSEs (kernel
-schedulable entities) working to process threads that belong to a
-KSEGROUP (kg). If there are X of these KSEs actually running at the
-moment in question, then there are at most M (N-X) of these KSEs on
-the run queue, as running KSEs are not on the queue.
-
-Runnable threads are queued off the KSEGROUP in priority order.
-If there are M or more threads runnable, the top M threads
-(by priority) are 'preassigned' to the M KSEs not running. The KSEs take
-their priority from those threads and are put on the run queue.
-
-The last thread that had a priority high enough to have a KSE associated
-with it, AND IS ON THE RUN QUEUE is pointed to by
-kg->kg_last_assigned. If no threads queued off the KSEGROUP have KSEs
-assigned as all the available KSEs are activly running, or because there
-are no threads queued, that pointer is NULL.
-
-When a KSE is removed from the run queue to become runnable, we know
-it was associated with the highest priority thread in the queue (at the head
-of the queue). If it is also the last assigned we know M was 1 and must
-now be 0. Since the thread is no longer queued that pointer must be
-removed from it. Since we know there were no more KSEs available,
-(M was 1 and is now 0) and since we are not FREEING our KSE
-but using it, we know there are STILL no more KSEs available, we can prove
-that the next thread in the ksegrp list will not have a KSE to assign to
-it, so we can show that the pointer must be made 'invalid' (NULL).
-
-The pointer exists so that when a new thread is made runnable, it can
-have its priority compared with the last assigned thread to see if
-it should 'steal' its KSE or not.. i.e. is it 'earlier'
-on the list than that thread or later.. If it's earlier, then the KSE is
-removed from the last assigned (which is now not assigned a KSE)
-and reassigned to the new thread, which is placed earlier in the list.
-The pointer is then backed up to the previous thread (which may or may not
-be the new thread).
-
-When a thread sleeps or is removed, the KSE becomes available and if there 
-are queued threads that are not assigned KSEs, the highest priority one of
-them is assigned the KSE, which is then placed back on the run queue at
-the approipriate place, and the kg->kg_last_assigned pointer is adjusted down
-to point to it.
-
-The following diagram shows 2 KSEs and 3 threads from a single process.
-
- RUNQ: --->KSE---KSE--...    (KSEs queued at priorities from threads)
-              \    \____   
-               \        \
-    KSEGROUP---thread--thread--thread    (queued in priority order)
-        \                 / 
-         \_______________/
-          (last_assigned)
-
-The result of this scheme is that the M available KSEs are always
-queued at the priorities they have inherrited from the M highest priority
-threads for that KSEGROUP. If this situation changes, the KSEs are 
-reassigned to keep this true.
-***/
 
 #include <sys/cdefs.h>
-__FBSDID("$FreeBSD: src/sys/kern/kern_switch.c,v 1.116.2.1 2005/08/06 03:06:25 davidxu Exp $");
+__FBSDID("$FreeBSD: src/sys/kern/kern_switch.c,v 1.137 2007/10/08 23:37:28 jeff Exp $");
 
 #include "opt_sched.h"
 
@@ -109,6 +49,15 @@
 #include <sys/sysctl.h>
 #endif
 
+#include <machine/cpu.h>
+
+/* Uncomment this to enable logging of critical_enter/exit. */
+#if 0
+#define	KTR_CRITICAL	KTR_SCHED
+#else
+#define	KTR_CRITICAL	0
+#endif
+
 #ifdef FULL_PREEMPTION
 #ifndef PREEMPTION
 #error "The FULL_PREEMPTION option requires the PREEMPTION option"
@@ -117,8 +66,6 @@
 
 CTASSERT((RQB_BPW * RQB_LEN) == RQ_NQS);
 
-#define td_kse td_sched
-
 /*
  * kern.sched.preemption allows user space to determine if preemption support
  * is compiled in or not.  It is not currently a boot or runtime flag that
@@ -132,55 +79,62 @@
 SYSCTL_INT(_kern_sched, OID_AUTO, preemption, CTLFLAG_RD,
     &kern_sched_preemption, 0, "Kernel preemption enabled");
 
+#ifdef SCHED_STATS
+long switch_preempt;
+long switch_owepreempt;
+long switch_turnstile;
+long switch_sleepq;
+long switch_sleepqtimo;
+long switch_relinquish;
+long switch_needresched;
+static SYSCTL_NODE(_kern_sched, OID_AUTO, stats, CTLFLAG_RW, 0, "switch stats");
+SYSCTL_INT(_kern_sched_stats, OID_AUTO, preempt, CTLFLAG_RD, &switch_preempt, 0, "");
+SYSCTL_INT(_kern_sched_stats, OID_AUTO, owepreempt, CTLFLAG_RD, &switch_owepreempt, 0, "");
+SYSCTL_INT(_kern_sched_stats, OID_AUTO, turnstile, CTLFLAG_RD, &switch_turnstile, 0, "");
+SYSCTL_INT(_kern_sched_stats, OID_AUTO, sleepq, CTLFLAG_RD, &switch_sleepq, 0, "");
+SYSCTL_INT(_kern_sched_stats, OID_AUTO, sleepqtimo, CTLFLAG_RD, &switch_sleepqtimo, 0, "");
+SYSCTL_INT(_kern_sched_stats, OID_AUTO, relinquish, CTLFLAG_RD, &switch_relinquish, 0, "");
+SYSCTL_INT(_kern_sched_stats, OID_AUTO, needresched, CTLFLAG_RD, &switch_needresched, 0, "");
+static int
+sysctl_stats_reset(SYSCTL_HANDLER_ARGS)
+{
+        int error;
+	int val;
+
+        val = 0;
+        error = sysctl_handle_int(oidp, &val, 0, req);
+        if (error != 0 || req->newptr == NULL)
+                return (error);
+        if (val == 0)
+                return (0);
+	switch_preempt = 0;
+	switch_owepreempt = 0;
+	switch_turnstile = 0;
+	switch_sleepq = 0;
+	switch_sleepqtimo = 0;
+	switch_relinquish = 0;
+	switch_needresched = 0;
+
+	return (0);
+}
+
+SYSCTL_PROC(_kern_sched_stats, OID_AUTO, reset, CTLTYPE_INT | CTLFLAG_WR, NULL,
+    0, sysctl_stats_reset, "I", "Reset scheduler statistics");
+#endif
+
 /************************************************************************
  * Functions that manipulate runnability from a thread perspective.	*
  ************************************************************************/
 /*
- * Select the KSE that will be run next.  From that find the thread, and
- * remove it from the KSEGRP's run queue.  If there is thread clustering,
- * this will be what does it.
+ * Select the thread that will be run next.
  */
 struct thread *
 choosethread(void)
 {
-	struct kse *ke;
 	struct thread *td;
-	struct ksegrp *kg;
-
-#if defined(SMP) && (defined(__i386__) || defined(__amd64__))
-	if (smp_active == 0 && PCPU_GET(cpuid) != 0) {
-		/* Shutting down, run idlethread on AP's */
-		td = PCPU_GET(idlethread);
-		ke = td->td_kse;
-		CTR1(KTR_RUNQ, "choosethread: td=%p (idle)", td);
-		ke->ke_flags |= KEF_DIDRUN;
-		TD_SET_RUNNING(td);
-		return (td);
-	}
-#endif
 
 retry:
-	ke = sched_choose();
-	if (ke) {
-		td = ke->ke_thread;
-		KASSERT((td->td_kse == ke), ("kse/thread mismatch"));
-		kg = ke->ke_ksegrp;
-		if (td->td_proc->p_flag & P_HADTHREADS) {
-			if (kg->kg_last_assigned == td) {
-				kg->kg_last_assigned = TAILQ_PREV(td,
-				    threadqueue, td_runq);
-			}
-			TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
-		}
-		CTR2(KTR_RUNQ, "choosethread: td=%p pri=%d",
-		    td, td->td_priority);
-	} else {
-		/* Simulate runq_choose() having returned the idle thread */
-		td = PCPU_GET(idlethread);
-		ke = td->td_kse;
-		CTR1(KTR_RUNQ, "choosethread: td=%p (idle)", td);
-	}
-	ke->ke_flags |= KEF_DIDRUN;
+	td = sched_choose();
 
 	/*
 	 * If we are in panic, only allow system threads,
@@ -198,395 +152,6 @@
 }
 
 /*
- * Given a surplus system slot, try assign a new runnable thread to it.
- * Called from:
- *  sched_thread_exit()  (local)
- *  sched_switch()  (local)
- *  sched_thread_exit()  (local)
- *  remrunqueue()  (local)  (not at the moment)
- */
-static void
-slot_fill(struct ksegrp *kg)
-{
-	struct thread *td;
-
-	mtx_assert(&sched_lock, MA_OWNED);
-	while (kg->kg_avail_opennings > 0) {
-		/*
-		 * Find the first unassigned thread
-		 */
-		if ((td = kg->kg_last_assigned) != NULL)
-			td = TAILQ_NEXT(td, td_runq);
-		else
-			td = TAILQ_FIRST(&kg->kg_runq);
-
-		/*
-		 * If we found one, send it to the system scheduler.
-		 */
-		if (td) {
-			kg->kg_last_assigned = td;
-			sched_add(td, SRQ_YIELDING);
-			CTR2(KTR_RUNQ, "slot_fill: td%p -> kg%p", td, kg);
-		} else {
-			/* no threads to use up the slots. quit now */
-			break;
-		}
-	}
-}
-
-#ifdef	SCHED_4BSD
-/*
- * Remove a thread from its KSEGRP's run queue.
- * This in turn may remove it from a KSE if it was already assigned
- * to one, possibly causing a new thread to be assigned to the KSE
- * and the KSE getting a new priority.
- */
-static void
-remrunqueue(struct thread *td)
-{
-	struct thread *td2, *td3;
-	struct ksegrp *kg;
-	struct kse *ke;
-
-	mtx_assert(&sched_lock, MA_OWNED);
-	KASSERT((TD_ON_RUNQ(td)), ("remrunqueue: Bad state on run queue"));
-	kg = td->td_ksegrp;
-	ke = td->td_kse;
-	CTR1(KTR_RUNQ, "remrunqueue: td%p", td);
-	TD_SET_CAN_RUN(td);
-	/*
-	 * If it is not a threaded process, take the shortcut.
-	 */
-	if ((td->td_proc->p_flag & P_HADTHREADS) == 0) {
-		/* remve from sys run queue and free up a slot */
-		sched_rem(td);
-		ke->ke_state = KES_THREAD; 
-		return;
-	}
-   	td3 = TAILQ_PREV(td, threadqueue, td_runq);
-	TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
-	if (ke->ke_state == KES_ONRUNQ) {
-		/*
-		 * This thread has been assigned to the system run queue.
-		 * We need to dissociate it and try assign the
-		 * KSE to the next available thread. Then, we should
-		 * see if we need to move the KSE in the run queues.
-		 */
-		sched_rem(td);
-		ke->ke_state = KES_THREAD; 
-		td2 = kg->kg_last_assigned;
-		KASSERT((td2 != NULL), ("last assigned has wrong value"));
-		if (td2 == td) 
-			kg->kg_last_assigned = td3;
-		/* slot_fill(kg); */ /* will replace it with another */
-	}
-}
-#endif
-
-/*
- * Change the priority of a thread that is on the run queue.
- */
-void
-adjustrunqueue( struct thread *td, int newpri) 
-{
-	struct ksegrp *kg;
-	struct kse *ke;
-
-	mtx_assert(&sched_lock, MA_OWNED);
-	KASSERT((TD_ON_RUNQ(td)), ("adjustrunqueue: Bad state on run queue"));
-
-	ke = td->td_kse;
-	CTR1(KTR_RUNQ, "adjustrunqueue: td%p", td);
-	/*
-	 * If it is not a threaded process, take the shortcut.
-	 */
-	if ((td->td_proc->p_flag & P_HADTHREADS) == 0) {
-		/* We only care about the kse in the run queue. */
-		td->td_priority = newpri;
-		if (ke->ke_rqindex != (newpri / RQ_PPQ)) {
-			sched_rem(td);
-			sched_add(td, SRQ_BORING);
-		}
-		return;
-	}
-
-	/* It is a threaded process */
-	kg = td->td_ksegrp;
-	if (ke->ke_state == KES_ONRUNQ
-#ifdef SCHED_ULE
-	 || ((ke->ke_flags & KEF_ASSIGNED) != 0 &&
-	     (ke->ke_flags & KEF_REMOVED) == 0)
-#endif
-	   ) {
-		if (kg->kg_last_assigned == td) {
-			kg->kg_last_assigned =
-			    TAILQ_PREV(td, threadqueue, td_runq);
-		}
-		sched_rem(td);
-	}
-	TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
-	TD_SET_CAN_RUN(td);
-	td->td_priority = newpri;
-	setrunqueue(td, SRQ_BORING);
-}
-
-/*
- * This function is called when a thread is about to be put on a
- * ksegrp run queue because it has been made runnable or its 
- * priority has been adjusted and the ksegrp does not have a 
- * free kse slot.  It determines if a thread from the same ksegrp
- * should be preempted.  If so, it tries to switch threads
- * if the thread is on the same cpu or notifies another cpu that
- * it should switch threads. 
- */
-
-static void
-maybe_preempt_in_ksegrp(struct thread *td)
-#if  !defined(SMP)
-{
-	struct thread *running_thread;
-
-	mtx_assert(&sched_lock, MA_OWNED);
-	running_thread = curthread;
-
-	if (running_thread->td_ksegrp != td->td_ksegrp)
-		return;
-
-	if (td->td_priority >= running_thread->td_priority)
-		return;
-#ifdef PREEMPTION
-#ifndef FULL_PREEMPTION
-	if (td->td_priority > PRI_MAX_ITHD) {
-		running_thread->td_flags |= TDF_NEEDRESCHED;
-		return;
-	}
-#endif /* FULL_PREEMPTION */
-
-	if (running_thread->td_critnest > 1) 
-		running_thread->td_owepreempt = 1;
-	 else 		
-		 mi_switch(SW_INVOL, NULL);
-	
-#else /* PREEMPTION */
-	running_thread->td_flags |= TDF_NEEDRESCHED;
-#endif /* PREEMPTION */
-	return;
-}
-
-#else /* SMP */
-{
-	struct thread *running_thread;
-	int worst_pri;
-	struct ksegrp *kg;
-	cpumask_t cpumask,dontuse;
-	struct pcpu *pc;
-	struct pcpu *best_pcpu;
-	struct thread *cputhread;
-
-	mtx_assert(&sched_lock, MA_OWNED);
-
-	running_thread = curthread;
-
-#if !defined(KSEG_PEEMPT_BEST_CPU)
-	if (running_thread->td_ksegrp != td->td_ksegrp) {
-#endif
-		kg = td->td_ksegrp;
-
-		/* if someone is ahead of this thread, wait our turn */
-		if (td != TAILQ_FIRST(&kg->kg_runq))  
-			return;
-		
-		worst_pri = td->td_priority;
-		best_pcpu = NULL;
-		dontuse   = stopped_cpus | idle_cpus_mask;
-		
-		/* 
-		 * Find a cpu with the worst priority that runs at thread from
-		 * the same  ksegrp - if multiple exist give first the last run
-		 * cpu and then the current cpu priority 
-		 */
-		
-		SLIST_FOREACH(pc, &cpuhead, pc_allcpu) {
-			cpumask   = pc->pc_cpumask;
-			cputhread = pc->pc_curthread;
-
-			if ((cpumask & dontuse)  ||	 
-			    cputhread->td_ksegrp != kg)
-				continue;	
-
-			if (cputhread->td_priority > worst_pri) {
-				worst_pri = cputhread->td_priority;
-				best_pcpu = pc;	
-				continue;
-			}
-			
-			if (cputhread->td_priority == worst_pri &&
-			    best_pcpu != NULL &&			
-			    (td->td_lastcpu == pc->pc_cpuid ||
-				(PCPU_GET(cpumask) == cpumask &&
-				    td->td_lastcpu != best_pcpu->pc_cpuid))) 
-			    best_pcpu = pc;
-		}		
-		
-		/* Check if we need to preempt someone */
-		if (best_pcpu == NULL) 
-			return;
-
-#if defined(IPI_PREEMPTION) && defined(PREEMPTION)
-#if !defined(FULL_PREEMPTION)
-		if (td->td_priority <= PRI_MAX_ITHD)
-#endif /* ! FULL_PREEMPTION */
-			{
-				ipi_selected(best_pcpu->pc_cpumask, IPI_PREEMPT);
-				return;
-			}
-#endif /* defined(IPI_PREEMPTION) && defined(PREEMPTION) */
-
-		if (PCPU_GET(cpuid) != best_pcpu->pc_cpuid) {
-			best_pcpu->pc_curthread->td_flags |= TDF_NEEDRESCHED;
-			ipi_selected(best_pcpu->pc_cpumask, IPI_AST);
-			return;
-		}
-#if !defined(KSEG_PEEMPT_BEST_CPU)
-	}	
-#endif
-
-	if (td->td_priority >= running_thread->td_priority)
-		return;
-#ifdef PREEMPTION
-
-#if !defined(FULL_PREEMPTION)
-	if (td->td_priority > PRI_MAX_ITHD) {
-		running_thread->td_flags |= TDF_NEEDRESCHED;
-	}
-#endif /* ! FULL_PREEMPTION */
-	
-	if (running_thread->td_critnest > 1) 
-		running_thread->td_owepreempt = 1;
-	 else 		
-		 mi_switch(SW_INVOL, NULL);
-	
-#else /* PREEMPTION */
-	running_thread->td_flags |= TDF_NEEDRESCHED;
-#endif /* PREEMPTION */
-	return;
-}
-#endif /* !SMP */
-
-
-int limitcount;
-void
-setrunqueue(struct thread *td, int flags)
-{
-	struct ksegrp *kg;
-	struct thread *td2;
-	struct thread *tda;
-
-	CTR3(KTR_RUNQ, "setrunqueue: td:%p kg:%p pid:%d",
-	    td, td->td_ksegrp, td->td_proc->p_pid);
-	CTR5(KTR_SCHED, "setrunqueue: %p(%s) prio %d by %p(%s)",
-            td, td->td_proc->p_comm, td->td_priority, curthread,
-            curthread->td_proc->p_comm);
-	mtx_assert(&sched_lock, MA_OWNED);
-	KASSERT((td->td_inhibitors == 0),
-			("setrunqueue: trying to run inhibitted thread"));
-	KASSERT((TD_CAN_RUN(td) || TD_IS_RUNNING(td)),
-	    ("setrunqueue: bad thread state"));
-	TD_SET_RUNQ(td);
-	kg = td->td_ksegrp;
-	if ((td->td_proc->p_flag & P_HADTHREADS) == 0) {
-		/*
-		 * Common path optimisation: Only one of everything
-		 * and the KSE is always already attached.
-		 * Totally ignore the ksegrp run queue.
-		 */
-		if (kg->kg_avail_opennings != 1) {
-			if (limitcount < 1) {
-				limitcount++;
-				printf("pid %d: corrected slot count (%d->1)\n",
-				    td->td_proc->p_pid, kg->kg_avail_opennings);
-
-			}
-			kg->kg_avail_opennings = 1;
-		}
-		sched_add(td, flags);
-		return;
-	}
-
-	/* 
-	 * If the concurrency has reduced, and we would go in the 
-	 * assigned section, then keep removing entries from the 
-	 * system run queue, until we are not in that section 
-	 * or there is room for us to be put in that section.
-	 * What we MUST avoid is the case where there are threads of less
-	 * priority than the new one scheduled, but it can not
-	 * be scheduled itself. That would lead to a non contiguous set
-	 * of scheduled threads, and everything would break.
-	 */ 
-	tda = kg->kg_last_assigned;
-	while ((kg->kg_avail_opennings <= 0) &&
-	    (tda && (tda->td_priority > td->td_priority))) {
-		/*
-		 * None free, but there is one we can commandeer.
-		 */
-		CTR2(KTR_RUNQ,
-		    "setrunqueue: kg:%p: take slot from td: %p", kg, tda);
-		sched_rem(tda);
-		tda = kg->kg_last_assigned =
-		    TAILQ_PREV(tda, threadqueue, td_runq);
-	}
-
-	/*
-	 * Add the thread to the ksegrp's run queue at
-	 * the appropriate place.
-	 */
-	TAILQ_FOREACH(td2, &kg->kg_runq, td_runq) {
-		if (td2->td_priority > td->td_priority) {
-			TAILQ_INSERT_BEFORE(td2, td, td_runq);
-			break;
-		}
-	}
-	if (td2 == NULL) {
-		/* We ran off the end of the TAILQ or it was empty. */
-		TAILQ_INSERT_TAIL(&kg->kg_runq, td, td_runq);
-	}
-
-	/*
-	 * If we have a slot to use, then put the thread on the system
-	 * run queue and if needed, readjust the last_assigned pointer.
-	 * it may be that we need to schedule something anyhow
-	 * even if the availabel slots are -ve so that
-	 * all the items < last_assigned are scheduled.
-	 */
-	if (kg->kg_avail_opennings > 0) {
-		if (tda == NULL) {
-			/*
-			 * No pre-existing last assigned so whoever is first
-			 * gets the slot.. (maybe us)
-			 */
-			td2 = TAILQ_FIRST(&kg->kg_runq);
-			kg->kg_last_assigned = td2;
-		} else if (tda->td_priority > td->td_priority) {
-			td2 = td;
-		} else {
-			/* 
-			 * We are past last_assigned, so 
-			 * give the next slot to whatever is next,
-			 * which may or may not be us.
-			 */
-			td2 = TAILQ_NEXT(tda, td_runq);
-			kg->kg_last_assigned = td2;
-		}
-		sched_add(td2, flags);
-	} else {
-		CTR3(KTR_RUNQ, "setrunqueue: held: td%p kg%p pid%d",
-			td, td->td_ksegrp, td->td_proc->p_pid);
-		if ((flags & SRQ_YIELDING) == 0)
-			maybe_preempt_in_ksegrp(td);
-	}
-}
-
-/*
  * Kernel thread preemption implementation.  Critical sections mark
  * regions of code in which preemptions are not allowed.
  */
@@ -609,22 +174,20 @@
 	td = curthread;
 	KASSERT(td->td_critnest != 0,
 	    ("critical_exit: td_critnest == 0"));
-#ifdef PREEMPTION
+
 	if (td->td_critnest == 1) {
 		td->td_critnest = 0;
-		mtx_assert(&sched_lock, MA_NOTOWNED);
 		if (td->td_owepreempt) {
 			td->td_critnest = 1;
-			mtx_lock_spin(&sched_lock);
+			thread_lock(td);
 			td->td_critnest--;
-			mi_switch(SW_INVOL, NULL);
-			mtx_unlock_spin(&sched_lock);
+			SCHED_STAT_INC(switch_owepreempt);
+			mi_switch(SW_INVOL|SW_PREEMPT, NULL);
+			thread_unlock(td);
 		}
-	} else 
-#endif
+	} else
 		td->td_critnest--;
-	
-	
+
 	CTR4(KTR_CRITICAL, "critical_exit by thread %p (%ld, %s) to %d", td,
 	    (long)td->td_proc->p_pid, td->td_proc->p_comm, td->td_critnest);
 }
@@ -644,7 +207,6 @@
 	int cpri, pri;
 #endif
 
-	mtx_assert(&sched_lock, MA_OWNED);
 #ifdef PREEMPTION
 	/*
 	 * The new thread should not preempt the current thread if any of the
@@ -670,14 +232,15 @@
 	 * to the new thread.
 	 */
 	ctd = curthread;
-	KASSERT ((ctd->td_kse != NULL && ctd->td_kse->ke_thread == ctd),
+	THREAD_LOCK_ASSERT(td, MA_OWNED);
+	KASSERT ((ctd->td_sched != NULL && ctd->td_sched->ts_thread == ctd),
 	  ("thread has no (or wrong) sched-private part."));
 	KASSERT((td->td_inhibitors == 0),
-			("maybe_preempt: trying to run inhibitted thread"));
+			("maybe_preempt: trying to run inhibited thread"));
 	pri = td->td_priority;
 	cpri = ctd->td_priority;
 	if (panicstr != NULL || pri >= cpri || cold /* || dumping */ ||
-	    TD_IS_INHIBITED(ctd) || td->td_kse->ke_state != KES_THREAD)
+	    TD_IS_INHIBITED(ctd))
 		return (0);
 #ifndef FULL_PREEMPTION
 	if (pri > PRI_MAX_ITHD && cpri < PRI_MIN_IDLE)
@@ -690,32 +253,24 @@
 		ctd->td_owepreempt = 1;
 		return (0);
 	}
-
 	/*
 	 * Thread is runnable but not yet put on system run queue.
 	 */
+	MPASS(ctd->td_lock == td->td_lock);
 	MPASS(TD_ON_RUNQ(td));
-	MPASS(td->td_sched->ke_state != KES_ONRUNQ);
-	if (td->td_proc->p_flag & P_HADTHREADS) {
-		/*
-		 * If this is a threaded process we actually ARE on the
-		 * ksegrp run queue so take it off that first.
-		 * Also undo any damage done to the last_assigned pointer.
-		 * XXX Fix setrunqueue so this isn't needed
-		 */
-		struct ksegrp *kg;
-
-		kg = td->td_ksegrp;
-		if (kg->kg_last_assigned == td)
-			kg->kg_last_assigned =
-			    TAILQ_PREV(td, threadqueue, td_runq);
-		TAILQ_REMOVE(&kg->kg_runq, td, td_runq);
-	}
-		
 	TD_SET_RUNNING(td);
 	CTR3(KTR_PROC, "preempting to thread %p (pid %d, %s)\n", td,
 	    td->td_proc->p_pid, td->td_proc->p_comm);
+	SCHED_STAT_INC(switch_preempt);
 	mi_switch(SW_INVOL|SW_PREEMPT, td);
+	/*
+	 * td's lock pointer may have changed.  We have to return with it
+	 * locked.
+	 */
+	spinlock_enter();
+	thread_unlock(ctd);
+	thread_lock(td);
+	spinlock_exit();
 	return (1);
 #else
 	return (0);
@@ -793,6 +348,38 @@
 	return (-1);
 }
 
+static __inline int
+runq_findbit_from(struct runq *rq, u_char pri)
+{
+	struct rqbits *rqb;
+	rqb_word_t mask;
+	int i;
+
+	/*
+	 * Set the mask for the first word so we ignore priorities before 'pri'.
+	 */
+	mask = (rqb_word_t)-1 << (pri & (RQB_BPW - 1));
+	rqb = &rq->rq_status;
+again:
+	for (i = RQB_WORD(pri); i < RQB_LEN; mask = -1, i++) {
+		mask = rqb->rqb_bits[i] & mask;
+		if (mask == 0)
+			continue;
+		pri = RQB_FFS(mask) + (i << RQB_L2BPW);
+		CTR3(KTR_RUNQ, "runq_findbit_from: bits=%#x i=%d pri=%d",
+		    mask, i, pri);
+		return (pri);
+	}
+	if (pri == 0)
+		return (-1);
+	/*
+	 * Wrap back around to the beginning of the list just once so we
+	 * scan the whole thing.
+	 */
+	pri = 0;
+	goto again;
+}
+
 /*
  * Set the status bit of the queue corresponding to priority level pri,
  * indicating that it is non-empty.
@@ -811,28 +398,45 @@
 }
 
 /*
- * Add the KSE to the queue specified by its priority, and set the
+ * Add the thread to the queue specified by its priority, and set the
  * corresponding status bit.
  */
 void
-runq_add(struct runq *rq, struct kse *ke, int flags)
+runq_add(struct runq *rq, struct td_sched *ts, int flags)
 {
 	struct rqhead *rqh;
 	int pri;
 
-	pri = ke->ke_thread->td_priority / RQ_PPQ;
-	ke->ke_rqindex = pri;
+	pri = ts->ts_thread->td_priority / RQ_PPQ;
+	ts->ts_rqindex = pri;
 	runq_setbit(rq, pri);
 	rqh = &rq->rq_queues[pri];
-	CTR5(KTR_RUNQ, "runq_add: td=%p ke=%p pri=%d %d rqh=%p",
-	    ke->ke_thread, ke, ke->ke_thread->td_priority, pri, rqh);
+	CTR5(KTR_RUNQ, "runq_add: td=%p ts=%p pri=%d %d rqh=%p",
+	    ts->ts_thread, ts, ts->ts_thread->td_priority, pri, rqh);
 	if (flags & SRQ_PREEMPTED) {
-		TAILQ_INSERT_HEAD(rqh, ke, ke_procq);
+		TAILQ_INSERT_HEAD(rqh, ts, ts_procq);
 	} else {
-		TAILQ_INSERT_TAIL(rqh, ke, ke_procq);
+		TAILQ_INSERT_TAIL(rqh, ts, ts_procq);
 	}
 }
 
+void
+runq_add_pri(struct runq *rq, struct td_sched *ts, u_char pri, int flags)
+{
+	struct rqhead *rqh;
+
+	KASSERT(pri < RQ_NQS, ("runq_add_pri: %d out of range", pri));
+	ts->ts_rqindex = pri;
+	runq_setbit(rq, pri);
+	rqh = &rq->rq_queues[pri];
+	CTR5(KTR_RUNQ, "runq_add_pri: td=%p ke=%p pri=%d idx=%d rqh=%p",
+	    ts->ts_thread, ts, ts->ts_thread->td_priority, pri, rqh);
+	if (flags & SRQ_PREEMPTED) {
+		TAILQ_INSERT_HEAD(rqh, ts, ts_procq);
+	} else {
+		TAILQ_INSERT_TAIL(rqh, ts, ts_procq);
+	}
+}
 /*
  * Return true if there are runnable processes of any priority on the run
  * queue, false otherwise.  Has no side effects, does not modify the run
@@ -864,14 +468,13 @@
 /*
  * Find the highest priority process on the run queue.
  */
-struct kse *
+struct td_sched *
 runq_choose(struct runq *rq)
 {
 	struct rqhead *rqh;
-	struct kse *ke;
+	struct td_sched *ts;
 	int pri;
 
-	mtx_assert(&sched_lock, MA_OWNED);
 	while ((pri = runq_findbit(rq)) != -1) {
 		rqh = &rq->rq_queues[pri];
 #if defined(SMP) && defined(SCHED_4BSD)
@@ -883,72 +486,106 @@
 			 */
 			int count = runq_fuzz;
 			int cpu = PCPU_GET(cpuid);
-			struct kse *ke2;
-			ke2 = ke = TAILQ_FIRST(rqh);
+			struct td_sched *ts2;
+			ts2 = ts = TAILQ_FIRST(rqh);
 
-			while (count-- && ke2) {
-				if (ke->ke_thread->td_lastcpu == cpu) {
-					ke = ke2;
+			while (count-- && ts2) {
+				if (ts->ts_thread->td_lastcpu == cpu) {
+					ts = ts2;
 					break;
 				}
-				ke2 = TAILQ_NEXT(ke2, ke_procq);
+				ts2 = TAILQ_NEXT(ts2, ts_procq);
 			}
-		} else 
+		} else
 #endif
-			ke = TAILQ_FIRST(rqh);
-		KASSERT(ke != NULL, ("runq_choose: no proc on busy queue"));
+			ts = TAILQ_FIRST(rqh);
+		KASSERT(ts != NULL, ("runq_choose: no proc on busy queue"));
 		CTR3(KTR_RUNQ,
-		    "runq_choose: pri=%d kse=%p rqh=%p", pri, ke, rqh);
-		return (ke);
+		    "runq_choose: pri=%d td_sched=%p rqh=%p", pri, ts, rqh);
+		return (ts);
 	}
 	CTR1(KTR_RUNQ, "runq_choose: idleproc pri=%d", pri);
 
 	return (NULL);
 }
 
+struct td_sched *
+runq_choose_from(struct runq *rq, u_char idx)
+{
+	struct rqhead *rqh;
+	struct td_sched *ts;
+	int pri;
+
+	if ((pri = runq_findbit_from(rq, idx)) != -1) {
+		rqh = &rq->rq_queues[pri];
+		ts = TAILQ_FIRST(rqh);
+		KASSERT(ts != NULL, ("runq_choose: no proc on busy queue"));
+		CTR4(KTR_RUNQ,
+		    "runq_choose_from: pri=%d kse=%p idx=%d rqh=%p",
+		    pri, ts, ts->ts_rqindex, rqh);
+		return (ts);
+	}
+	CTR1(KTR_RUNQ, "runq_choose_from: idleproc pri=%d", pri);
+
+	return (NULL);
+}
 /*
- * Remove the KSE from the queue specified by its priority, and clear the
+ * Remove the thread from the queue specified by its priority, and clear the
  * corresponding status bit if the queue becomes empty.
- * Caller must set ke->ke_state afterwards.
+ * Caller must set state afterwards.
  */
 void
-runq_remove(struct runq *rq, struct kse *ke)
+runq_remove(struct runq *rq, struct td_sched *ts)
+{
+
+	runq_remove_idx(rq, ts, NULL);
+}
+
+void
+runq_remove_idx(struct runq *rq, struct td_sched *ts, u_char *idx)
 {
 	struct rqhead *rqh;
-	int pri;
+	u_char pri;
 
-	KASSERT(ke->ke_proc->p_sflag & PS_INMEM,
-		("runq_remove: process swapped out"));
-	pri = ke->ke_rqindex;
+	KASSERT(ts->ts_thread->td_flags & TDF_INMEM,
+		("runq_remove_idx: thread swapped out"));
+	pri = ts->ts_rqindex;
+	KASSERT(pri < RQ_NQS, ("runq_remove_idx: Invalid index %d\n", pri));
 	rqh = &rq->rq_queues[pri];
-	CTR5(KTR_RUNQ, "runq_remove: td=%p, ke=%p pri=%d %d rqh=%p",
-	    ke->ke_thread, ke, ke->ke_thread->td_priority, pri, rqh);
-	KASSERT(ke != NULL, ("runq_remove: no proc on busy queue"));
-	TAILQ_REMOVE(rqh, ke, ke_procq);
+	CTR5(KTR_RUNQ, "runq_remove_idx: td=%p, ts=%p pri=%d %d rqh=%p",
+	    ts->ts_thread, ts, ts->ts_thread->td_priority, pri, rqh);
+	{
+		struct td_sched *nts;
+
+		TAILQ_FOREACH(nts, rqh, ts_procq)
+			if (nts == ts)
+				break;
+		if (ts != nts)
+			panic("runq_remove_idx: ts %p not on rqindex %d",
+			    ts, pri);
+	}
+	TAILQ_REMOVE(rqh, ts, ts_procq);
 	if (TAILQ_EMPTY(rqh)) {
-		CTR0(KTR_RUNQ, "runq_remove: empty");
+		CTR0(KTR_RUNQ, "runq_remove_idx: empty");
 		runq_clrbit(rq, pri);
+		if (idx != NULL && *idx == pri)
+			*idx = (pri + 1) % RQ_NQS;
 	}
 }
 
 /****** functions that are temporarily here ***********/
 #include <vm/uma.h>
-extern struct mtx kse_zombie_lock;
 
 /*
  *  Allocate scheduler specific per-process resources.
- * The thread and ksegrp have already been linked in.
- * In this case just set the default concurrency value.
+ * The thread and proc have already been linked in.
  *
  * Called from:
  *  proc_init() (UMA init method)
  */
 void
-sched_newproc(struct proc *p, struct ksegrp *kg, struct thread *td)
+sched_newproc(struct proc *p, struct thread *td)
 {
-
-	/* This can go in sched_fork */
-	sched_init_concurrency(kg);
 }
 
 /*
@@ -962,70 +599,12 @@
 void
 sched_newthread(struct thread *td)
 {
-	struct td_sched *ke;
-
-	ke = (struct td_sched *) (td + 1);
-	bzero(ke, sizeof(*ke));
-	td->td_sched     = ke;
-	ke->ke_thread	= td;
-	ke->ke_state	= KES_THREAD;
-}
-
-/*
- * Set up an initial concurrency of 1
- * and set the given thread (if given) to be using that
- * concurrency slot.
- * May be used "offline"..before the ksegrp is attached to the world
- * and thus wouldn't need schedlock in that case.
- * Called from:
- *  thr_create()
- *  proc_init() (UMA) via sched_newproc()
- */
-void
-sched_init_concurrency(struct ksegrp *kg)
-{
-
-	CTR1(KTR_RUNQ,"kg %p init slots and concurrency to 1", kg);
-	kg->kg_concurrency = 1;
-	kg->kg_avail_opennings = 1;
-}
-
-/*
- * Change the concurrency of an existing ksegrp to N
- * Called from:
- *  kse_create()
- *  kse_exit()
- *  thread_exit()
- *  thread_single()
- */
-void
-sched_set_concurrency(struct ksegrp *kg, int concurrency)
-{
-
-	CTR4(KTR_RUNQ,"kg %p set concurrency to %d, slots %d -> %d",
-	    kg,
-	    concurrency,
-	    kg->kg_avail_opennings,
-	    kg->kg_avail_opennings + (concurrency - kg->kg_concurrency));
-	kg->kg_avail_opennings += (concurrency - kg->kg_concurrency);
-	kg->kg_concurrency = concurrency;
-}
-
-/*
- * Called from thread_exit() for all exiting thread
- *
- * Not to be confused with sched_exit_thread()
- * that is only called from thread_exit() for threads exiting
- * without the rest of the process exiting because it is also called from
- * sched_exit() and we wouldn't want to call it twice.
- * XXX This can probably be fixed.
- */
-void
-sched_thread_exit(struct thread *td)
-{
+	struct td_sched *ts;
 
-	SLOT_RELEASE(td->td_ksegrp);
-	slot_fill(td->td_ksegrp);
+	ts = (struct td_sched *) (td + 1);
+	bzero(ts, sizeof(*ts));
+	td->td_sched     = ts;
+	ts->ts_thread	= td;
 }
 
 #endif /* KERN_SWITCH_INCLUDE */
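
The reworked critical_exit() above still pairs with critical_enter() in the usual way; what changed is only how a deferred preemption is dispatched (thread_lock() on curthread instead of sched_lock).  The consumer-side idiom is unchanged; a minimal sketch:

	critical_enter();
	/*
	 * Code here will not be preempted on this CPU.  Keep it short:
	 * if a preemption becomes due meanwhile, td_owepreempt is set
	 * and the switch happens in critical_exit() once the nesting
	 * count drops back to zero.
	 */
	critical_exit();
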

